diff --git a/.gitmodules b/.gitmodules
index 170c105a6f48..42f0027505fd 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -22,3 +22,7 @@
 [submodule "3rdparty/googletest"]
 	path = 3rdparty/googletest
 	url = https://github.com/google/googletest.git
+[submodule "3rdparty/mkldnn"]
+	path = 3rdparty/mkldnn
+	url = https://github.com/intel/mkl-dnn.git
+	branch = master
diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn
new file mode 160000
index 000000000000..3e1f8f53f684
--- /dev/null
+++ b/3rdparty/mkldnn
@@ -0,0 +1 @@
+Subproject commit 3e1f8f53f6845dce23abf8089501c2eb45420b9e
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 14b40e4f7be4..dfa9834ffbab 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,8 +33,8 @@ mxnet_option(USE_OPENMP           "Build with Openmp support" ON)
 mxnet_option(USE_CUDNN            "Build with cudnn support"  ON) # one could set CUDNN_ROOT for search path
 mxnet_option(USE_LAPACK           "Build with lapack support" ON IF NOT MSVC)
 mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
-mxnet_option(USE_MKLML_MKL        "Use MKLML variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE))
-mxnet_option(USE_MKL_EXPERIMENTAL "Use experimental MKL (if MKL enabled and found)" OFF)
+mxnet_option(USE_MKLDNN           "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE))
+mxnet_option(USE_MKLML_MKL        "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE))
 mxnet_option(USE_OPERATOR_TUNING  "Enable auto-tuning of operators" ON AND NOT MSVC)
 mxnet_option(USE_GPERFTOOLS       "Build with GPerfTools support (if found)" ON)
 mxnet_option(USE_JEMALLOC         "Build with Jemalloc support"   ON)
@@ -138,14 +138,11 @@ if(USE_VTUNE)
 endif()
 
 if(USE_MKL_IF_AVAILABLE)
-  if(USE_MKL_EXPERIMENTAL AND NOT USE_MKLML_MKL)
-    message(ERROR " USE_MKL_EXPERIMENTAL can only be used when USE_MKL_EXPERIMENTAL is enabled")
-  endif()
   find_package(MKL)
   if(MKL_FOUND)
     include_directories(${MKL_INCLUDE_DIR})
     include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/operator/mkl)
-    add_definitions(-DMXNET_USE_MKL2017=1)
+	add_definitions(-DMXNET_USE_MKLDNN=1)
     add_definitions(-DUSE_MKL=1)
     add_definitions(-DCUB_MKL=1)
     list(APPEND mxnet_LINKER_LIBS ${MKL_LIBRARIES})
@@ -154,11 +151,6 @@ if(USE_MKL_IF_AVAILABLE)
     endif()
     # If using MKL, use the Intel OMP libraries
     list(APPEND mxnet_LINKER_LIBS iomp5)
-    if(USE_MKL_EXPERIMENTAL)
-      add_definitions(-DMKL_EXPERIMENTAL=1)
-    else()
-      add_definitions(-DMKL_EXPERIMENTAL=0)
-    endif()
   else()
     message(STATUS " MKL not found")
   endif()
diff --git a/Jenkinsfile b/Jenkinsfile
index 05cda74066f9..80f9424d6812 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -24,6 +24,7 @@
 mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/dmlc-core/libdmlc.a'
+mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_gnu.so, lib/libmkldnn.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a'
 // command to start a docker container
 docker_run = 'tests/ci_build/ci_build.sh'
 // timeout in minutes
@@ -161,18 +162,18 @@ def python3_gpu_ut(docker_type) {
 }
 
 // Python 2
-def python2_mklml_ut(docker_type) {
+def python2_mkldnn_ut(docker_type) {
   timeout(time: max_time, unit: 'MINUTES') {
     sh "${docker_run} ${docker_type} find . -name '*.pyc' -type f -delete"
-    sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/cpu"
+    sh "${docker_run} ${docker_type} PYTHONPATH=./python/ MXNET_MKLDNN_DEBUG=1 nosetests-2.7 --with-timer --verbose tests/python/cpu"
   }
 }
 
 // Python 3
-def python3_mklml_ut(docker_type) {
+def python3_mkldnn_ut(docker_type) {
   timeout(time: max_time, unit: 'MINUTES') {
     sh "${docker_run} ${docker_type} find . -name '*.pyc' -type f -delete"
-    sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/cpu"
+    sh "${docker_run} ${docker_type} PYTHONPATH=./python/ MXNET_MKLDNN_DEBUG=1 nosetests-3.4 --with-timer --verbose tests/python/cpu"
   }
 }
 
@@ -243,21 +244,20 @@ try {
         }
       }
     },
-    'CPU: MKLML': {
+    'CPU: MKLDNN': {
       node('mxnetlinux-cpu') {
-        ws('workspace/build-mklml-cpu') {
+        ws('workspace/build-mkldnn-cpu') {
           init_git()
           def flag = """ \
             DEV=1                         \
             USE_PROFILER=1                \
             USE_CPP_PACKAGE=1             \
             USE_BLAS=openblas             \
-            USE_MKL2017=1                 \
-            USE_MKL2017_EXPERIMENTAL=1    \
+            USE_MKLDNN=1                  \
             -j\$(nproc)
             """
           make("cpu_mklml", flag)
-          pack_lib('mklml_cpu')
+          pack_lib('mkldnn_cpu', mx_mkldnn_lib)
         }
       }
     },
@@ -278,24 +278,23 @@ try {
         }
       }
     },
-    'GPU: MKLML': {
+    'GPU: MKLDNN': {
       node('mxnetlinux-cpu') {
-        ws('workspace/build-mklml-gpu') {
+        ws('workspace/build-mkldnn-gpu') {
           init_git()
           def flag = """ \
             DEV=1                         \
             USE_PROFILER=1                \
             USE_CPP_PACKAGE=1             \
             USE_BLAS=openblas             \
-            USE_MKL2017=1                 \
-            USE_MKL2017_EXPERIMENTAL=1    \
+            USE_MKLDNN=1                  \
             USE_CUDA=1                    \
             USE_CUDA_PATH=/usr/local/cuda \
             USE_CUDNN=1                   \
             -j\$(nproc)
             """
           make("build_cuda", flag)
-          pack_lib('mklml_gpu')
+          pack_lib('mkldnn_gpu', mx_mkldnn_lib)
         }
       }
     },
@@ -442,43 +441,43 @@ try {
         }
       }
     },
-    'Python2: MKLML-CPU': {
+    'Python2: MKLDNN-CPU': {
       node('mxnetlinux-cpu') {
-        ws('workspace/ut-python2-mklml-cpu') {
+        ws('workspace/ut-python2-mkldnn-cpu') {
           init_git()
-          unpack_lib('mklml_cpu')
+          unpack_lib('mkldnn_cpu', mx_mkldnn_lib)
           python2_ut('cpu_mklml')
-          python2_mklml_ut('cpu_mklml')
+          python2_mkldnn_ut('cpu_mklml')
         }
       }
     },
-    'Python2: MKLML-GPU': {
+    'Python2: MKLDNN-GPU': {
       node('mxnetlinux-gpu') {
-        ws('workspace/ut-python2-mklml-gpu') {
+        ws('workspace/ut-python2-mkldnn-gpu') {
           init_git()
-          unpack_lib('mklml_gpu')
+          unpack_lib('mkldnn_gpu', mx_mkldnn_lib)
           python2_gpu_ut('gpu_mklml')
-          python2_mklml_ut('gpu_mklml')
+          python2_mkldnn_ut('gpu_mklml')
         }
       }
     },
-    'Python3: MKLML-CPU': {
+    'Python3: MKLDNN-CPU': {
       node('mxnetlinux-cpu') {
-        ws('workspace/ut-python3-mklml-cpu') {
+        ws('workspace/ut-python3-mkldnn-cpu') {
           init_git()
-          unpack_lib('mklml_cpu')
+          unpack_lib('mkldnn_cpu', mx_mkldnn_lib)
           python3_ut('cpu_mklml')
-          python3_mklml_ut('cpu_mklml')
+          python3_mkldnn_ut('cpu_mklml')
         }
       }
     },
-    'Python3: MKLML-GPU': {
+    'Python3: MKLDNN-GPU': {
       node('mxnetlinux-gpu') {
-        ws('workspace/ut-python3-mklml-gpu') {
+        ws('workspace/ut-python3-mkldnn-gpu') {
           init_git()
-          unpack_lib('mklml_gpu')
+          unpack_lib('mkldnn_gpu', mx_mkldnn_lib)
           python3_gpu_ut('gpu_mklml')
-          python3_mklml_ut('gpu_mklml')
+          python3_mkldnn_ut('gpu_mklml')
         }
       }
     },
diff --git a/Makefile b/Makefile
index 976035b1087c..d325aa65ab01 100644
--- a/Makefile
+++ b/Makefile
@@ -59,11 +59,11 @@ endif
 # use customized config file
 include $(config)
 
-ifeq ($(USE_MKL2017), 1)
-# must run ./prepare_mkl before including mshadow.mk
-	RETURN_STRING := $(shell ./prepare_mkl.sh $(MKLML_ROOT))
-	MKLROOT := $(firstword $(RETURN_STRING))
-	export USE_MKLML = $(lastword $(RETURN_STRING))
+ifeq ($(USE_MKLDNN), 1)
+	RETURN_STRING := $(shell ./prepare_mkldnn.sh $(MKLDNN_ROOT))
+	MKLDNNROOT := $(firstword $(RETURN_STRING))
+	MKLROOT := $(lastword $(RETURN_STRING))
+	export USE_MKLML = 1
 endif
 
 include mshadow/make/mshadow.mk
@@ -131,23 +131,16 @@ ifeq ($(USE_NNPACK), 1)
 	LDFLAGS += -lnnpack
 endif
 
-ifeq ($(USE_MKL2017), 1)
-	CFLAGS += -DMXNET_USE_MKL2017=1
+ifeq ($(USE_MKLDNN), 1)
+	CFLAGS += -DMXNET_USE_MKLDNN=1
 	CFLAGS += -DUSE_MKL=1
-	CFLAGS += -I$(ROOTDIR)/src/operator/mkl/
-	CFLAGS += -I$(MKLML_ROOT)/include
-	LDFLAGS += -L$(MKLML_ROOT)/lib
-	ifeq ($(USE_MKL2017_EXPERIMENTAL), 1)
-		CFLAGS += -DMKL_EXPERIMENTAL=1
-	else
-		CFLAGS += -DMKL_EXPERIMENTAL=0
-	endif
-	ifeq ($(UNAME_S), Darwin)
-		LDFLAGS += -lmklml
-	else
-		LDFLAGS += -Wl,--as-needed -lmklml_intel -lmklml_gnu
+	CFLAGS += -I$(ROOTDIR)/src/operator/nn/mkldnn/
+	ifneq ($(MKLDNNROOT), $(MKLROOT))
+		CFLAGS += -I$(MKLROOT)/include
+		LDFLAGS += -L$(MKLROOT)/lib
 	endif
-	LDFLAGS +=  -liomp5
+	CFLAGS += -I$(MKLDNNROOT)/include
+	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
 endif
 
 ifeq ($(USE_OPERATOR_TUNING), 1)
@@ -161,7 +154,7 @@ endif
 #   -  for Ubuntu, installing atlas will not automatically install the atlas provided lapack library
 # silently switching lapack off instead of letting the build fail because of backward compatibility
 ifeq ($(USE_LAPACK), 1)
-ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas))
+ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl))
 ifeq (,$(wildcard /lib/liblapack.a))
 ifeq (,$(wildcard /usr/lib/liblapack.a))
 ifeq (,$(wildcard /usr/lib64/liblapack.a))
@@ -179,7 +172,7 @@ ifeq ($(USE_LAPACK), 1)
 	ifneq ($(USE_LAPACK_PATH), )
 		LDFLAGS += -L$(USE_LAPACK_PATH)
 	endif
-	ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas))
+	ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl))
 		LDFLAGS += -llapack
 	endif
 	CFLAGS += -DMXNET_USE_LAPACK
@@ -569,7 +562,8 @@ clean: cyclean $(EXTRA_PACKAGES_CLEAN)
 else
 clean: cyclean testclean $(EXTRA_PACKAGES_CLEAN)
 	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \
-		R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz
+		R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz \
+		external/mkldnn/install/*
 	cd $(DMLC_CORE); $(MAKE) clean; cd -
 	cd $(PS_PATH); $(MAKE) clean; cd -
 	cd $(NNVM_PATH); $(MAKE) clean; cd -
diff --git a/amalgamation/mxnet_predict0.cc b/amalgamation/mxnet_predict0.cc
index f35591d82b22..cfee60559501 100644
--- a/amalgamation/mxnet_predict0.cc
+++ b/amalgamation/mxnet_predict0.cc
@@ -66,7 +66,7 @@
 #include "src/operator/operator_util.cc"
 #include "src/operator/nn/activation.cc"
 #include "src/operator/nn/batch_norm.cc"
-#include "src/operator/concat.cc"
+#include "src/operator/nn/concat.cc"
 #include "src/operator/nn/convolution.cc"
 #include "src/operator/nn/deconvolution.cc"
 #include "src/operator/nn/dropout.cc"
diff --git a/cmake/ChooseBlas.cmake b/cmake/ChooseBlas.cmake
index 3a8723a5dd5e..13d7083f3d12 100644
--- a/cmake/ChooseBlas.cmake
+++ b/cmake/ChooseBlas.cmake
@@ -23,7 +23,7 @@ if(USE_MKL_IF_AVAILABLE)
     find_package(MKL)
   endif()
   if(MKL_FOUND)
-    if(USE_MKLML_MKL)
+	if(USE_MKLDNN)
       set(BLAS "open")
     else()
       set(BLAS "MKL")
@@ -55,4 +55,4 @@ elseif(BLAS STREQUAL "apple")
   list(APPEND mshadow_LINKER_LIBS ${Accelerate_LIBRARIES})
   add_definitions(-DMSHADOW_USE_MKL=0)
   add_definitions(-DMSHADOW_USE_CBLAS=1)
-endif()
\ No newline at end of file
+endif()
diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake
index 743a871ee7cd..70405566d8ae 100644
--- a/cmake/Modules/FindMKL.cmake
+++ b/cmake/Modules/FindMKL.cmake
@@ -19,7 +19,7 @@
 #
 # Options:
 #
-#   USE_MKLML_MKL                   : Search for MKL:ML library variant
+#   USE_MKLDNN                    : Search for MKL:ML library variant
 #
 #   MKL_USE_SINGLE_DYNAMIC_LIBRARY  : use single dynamic library interface
 #   MKL_USE_STATIC_LIBS             : use static libraries
@@ -33,7 +33,7 @@
 #   MKL_INCLUDE_DIR      : unclude directory
 #   MKL_LIBRARIES        : the libraries to link against.
 #
-# cjolivier01: Changed to also look for MKLML library (subset of mkl) instead of standard MKL package
+# cjolivier01: Changed to also look for MKLDNN library (subset of mkl) instead of standard MKL package
 #
 
 if(MKL_FOUND)
@@ -43,7 +43,7 @@ endif()
 # ---[ Root folders
 set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
 
-if(USE_MKLML_MKL)
+if(USE_MKLDNN)
 
   find_path(MKL_ROOT include/mkl_blas.h
     PATHS $ENV{MKL_ROOT}
@@ -66,13 +66,14 @@ if(USE_MKLML_MKL)
   set(__mkl_libs "")
 
   if(WIN32)
-    list(APPEND __mkl_libs intel)
+    list(APPEND __mkl_libs mklml_intel)
   else()
-    list(APPEND __mkl_libs gnu)
+    list(APPEND __mkl_libs mklml_gnu)
   endif()
+  list(APPEND __mkl_libs mkldnn)
 
   foreach (__lib ${__mkl_libs})
-    set(__mkl_lib "mklml_${__lib}")
+    set(__mkl_lib "${__lib}")
     string(TOUPPER ${__mkl_lib} __mkl_lib_upper)
 
     if(MKL_USE_STATIC_LIBS)
@@ -90,8 +91,7 @@ if(USE_MKLML_MKL)
     list(APPEND MKL_LIBRARIES ${${__mkl_lib_upper}_LIBRARY})
   endforeach()
 
-
-else(USE_MKLML_MKL)
+else(USE_MKLDNN)
 
   # ---[ Options
   mxnet_option(MKL_USE_SINGLE_DYNAMIC_LIBRARY "Use single dynamic library interface" ON)
@@ -193,7 +193,7 @@ else(USE_MKLML_MKL)
     list(APPEND MKL_LIBRARIES ${MKL_RTL_LIBRARY})
   endif()
 
-endif(USE_MKLML_MKL)
+endif(USE_MKLDNN)
 
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(MKL DEFAULT_MSG ${__looked_for})
diff --git a/example/image-classification/common/data.py b/example/image-classification/common/data.py
index dc8915cda4c8..05f5ddc4506e 100755
--- a/example/image-classification/common/data.py
+++ b/example/image-classification/common/data.py
@@ -112,7 +112,8 @@ def get_rec_iter(args, kv=None):
     image_shape = tuple([int(l) for l in args.image_shape.split(',')])
     if 'benchmark' in args and args.benchmark:
         data_shape = (args.batch_size,) + image_shape
-        train = SyntheticDataIter(args.num_classes, data_shape, 500, np.float32)
+        train = SyntheticDataIter(args.num_classes, data_shape,
+                args.num_examples / args.batch_size, np.float32)
         return (train, None)
     if kv:
         (rank, nworker) = (kv.rank, kv.num_workers)
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index a18d2daec8c3..43bc205944e2 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -35,12 +35,13 @@
 #include <map>
 #include <string>
 #include <memory>
+#include <algorithm>
+#if MXNET_USE_MKLDNN == 1
+#include <mkldnn.hpp>
+#endif
 #include "./base.h"
 #include "./storage.h"
 #include "./engine.h"
-#if MKL_EXPERIMENTAL == 1
-#include <mkl_memory.h>
-#endif
 // check c++11
 #if DMLC_USE_CXX11 == 0
 #error "cxx11 was required for ndarray module"
@@ -72,6 +73,7 @@ enum NDArrayFormatErr {
   kRSPIdxErr,     // indices error for row sparse
 };
 
+class MKLDNNMemory;
 
 /*!
  * \brief ndarray interface
@@ -80,9 +82,6 @@ class NDArray {
  public:
   /*! \brief default constructor */
   NDArray() {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = MKLMemHolder::create();
-#endif
   }
   /*!
    * \brief constructs a new dynamic NDArray
@@ -96,56 +95,14 @@ class NDArray {
       : ptr_(std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype)),
         shape_(shape), dtype_(dtype), storage_type_(kDefaultStorage),
         entry_({nullptr, 0, 0}) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = std::make_shared<MKLMemHolder>();
-#endif
   }
   /*! \brief constructor for NDArray with storage type
    */
   NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx,
           bool delay_alloc = true, int dtype = mshadow::default_type_flag,
           std::vector<int> aux_types = {}, std::vector<TShape> aux_shapes = {},
-          TShape storage_shape = TShape(mshadow::Shape1(0)))
-      : shape_(shape), dtype_(dtype), storage_type_(stype),
-        entry_({nullptr, 0, 0}) {
-      // Assign default aux types if not given
-      if (aux_types.size() == 0) {
-        if (stype == kRowSparseStorage) {
-          aux_types = {mshadow::kInt64};
-        } else if (stype == kCSRStorage) {
-          aux_types = {mshadow::kInt64, mshadow::kInt64};
-        } else {
-          LOG(FATAL) << "Unknown storage type " << stype;
-        }
-      }
-      // Assign default shapes if not given
-      // unknown shapes are intialized as {0} such that Size() would return 0
-      if (aux_shapes.size() == 0) {
-        if (stype == kRowSparseStorage) {
-          aux_shapes = {TShape(mshadow::Shape1(0))};
-        } else if (stype == kCSRStorage) {
-          // aux shapes for indptr and indices
-          aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))};
-        } else {
-          LOG(FATAL) << "Unknown storage type " << stype;
-        }
-      }
-      if (storage_shape.Size() == 0) {
-        if (stype == kRowSparseStorage) {
-          storage_shape = shape;
-          storage_shape[0] = aux_shapes[rowsparse::kIdx][0];
-        } else if (stype == kCSRStorage) {
-          storage_shape = aux_shapes[csr::kIdx];
-        } else {
-          LOG(FATAL) << "Unknown storage type " << stype;
-        }
-      }
-      ptr_ = std::make_shared<Chunk>(stype, storage_shape, ctx, delay_alloc,
-                                     dtype, aux_types, aux_shapes);
-#if MKL_EXPERIMENTAL == 1
-      Mkl_mem_ = std::make_shared<MKLMemHolder>();
-#endif
-  }
+          TShape storage_shape = TShape(mshadow::Shape1(0)));
+
   /*!
    * \brief constructing a static NDArray that shares data with TBlob
    *  Use with caution: allocate ONLY ONE NDArray for each TBlob,
@@ -157,17 +114,11 @@ class NDArray {
       : ptr_(std::make_shared<Chunk>(data, dev_id)), shape_(data.shape_),
         dtype_(data.type_flag_), storage_type_(kDefaultStorage),
         entry_({nullptr, 0, 0}) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = std::make_shared<MKLMemHolder>();
-#endif
   }
   /*! \brief create ndarray from shared memory */
   NDArray(int shared_pid, int shared_id, const TShape& shape, int dtype)
       : ptr_(std::make_shared<Chunk>(shared_pid, shared_id, shape, dtype)), shape_(shape),
         dtype_(dtype), storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = std::make_shared<MKLMemHolder>();
-#endif
   }
 
   /*!
@@ -184,11 +135,24 @@ class NDArray {
           const TBlob &data, const std::vector<TBlob> &aux_data, int dev_id)
       : ptr_(std::make_shared<Chunk>(stype, data, aux_data, dev_id)), shape_(shape),
         dtype_(data.type_flag_), storage_type_(stype), entry_({nullptr, 0, 0}) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = std::make_shared<MKLMemHolder>();
-#endif
   }
 
+  /*
+   * This indicates whether an array is a view of another array (created by
+   * reshape or slice). If an array is a view and the the data is stored in
+   * MKLDNN format, we need to convert the data to the default format when
+   * data in the view is accessed.
+   */
+  inline bool IsView() const {
+    // View only works on the default storage
+    if (storage_type() != kDefaultStorage)
+      return false;
+    // If the array reuses memory, its shape may be different from the storage
+    // shape. However, we shouldn't consider it as a view.
+    if (reuse_)
+      return false;
+    return byte_offset_ > 0 || shape() != ptr_->storage_shape;
+  }
 
   /*!
    * \return the shape of current NDArray.
@@ -271,9 +235,6 @@ class NDArray {
             << "Unexpected storage type: " << stype;
       res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type);
     });
-#if MKL_EXPERIMENTAL == 1
-    res.Mkl_mem_ = Mkl_mem_;
-#endif
     return res;
   }
   /*!
@@ -534,15 +495,12 @@ class NDArray {
     CHECK_GE(ptr_->shandle.size,
              shape.Size() * mshadow::mshadow_sizeof(dtype))
         << "NDArray.AsArray: target memory size is bigger";
-#if MKL_EXPERIMENTAL == 1
-    if (Mkl_mem_ != nullptr) {
-      // convert prv to cpu
-      Mkl_mem_->check_and_prv_to_cpu(ptr_->shandle.dptr);
-    }
-#endif
+    // We can't reuse memory in a view.
+    CHECK(!IsView());
     NDArray ret = *this;
     ret.shape_ = shape;
     ret.dtype_ = dtype;
+    ret.reuse_ = true;
     return ret;
   }
   /*!
@@ -611,6 +569,83 @@ class NDArray {
              << "CheckAndAllocAuxData is not intended for kDefaultStorage";
     ptr_->CheckAndAllocAuxData(i, aux_shape);
   }
+
+#if MXNET_USE_MKLDNN == 1
+  /*
+   * Test if the data is stored in one of special MKLDNN format.
+   */
+  bool IsMKLDNNData() const {
+    return ptr_->IsMKLDNN();
+  }
+  /*
+   * Test if the data is stored in one of default MXNet formats.
+   */
+  bool IsDefaultData() const {
+    return ptr_->IsDefault();
+  }
+  /*
+   * All functions below return a raw pointer to mkldnn memory. Actually there
+   * is a shared pointer that hold the memory either in NDArray or in MKLDNN
+   * stream. As long as we call these functions inside an operator, the return
+   * memory is always valid.
+   */
+
+  /*
+   * This function returns mkldnn::memory with the default primitive_desc.
+   */
+  const mkldnn::memory *GetMKLDNNData() const;
+  /*
+   * This function returns mkldnn::memory with the given primitive_desc
+   * as long as the array size meets the required size in the given primitive_desc.
+   */
+  const mkldnn::memory *GetMKLDNNData(
+      const mkldnn::memory::primitive_desc &desc) const;
+  /*
+   * This function returns mkldnn::memory with the given primitive_desc.
+   * The returned mkldnn::memory will have the same physical layout as
+   * the given primitive_desc.
+   */
+  const mkldnn::memory *GetMKLDNNDataReorder(
+      const mkldnn::memory::primitive_desc &desc) const;
+
+  /*
+   * This function copies data from mkldnn memory.
+   */
+  void CopyFrom(const mkldnn::memory &mem);
+  /*
+   * This function allocates memory for array and creates mkldnn memory
+   * with the specified format.
+   */
+  mkldnn::memory *CreateMKLDNNData(
+      const mkldnn::memory::primitive_desc &desc);
+
+  /*
+   * Reorder the memory to the specified layout.
+   */
+  void MKLDNNDataReorder(const mkldnn::memory::primitive_desc &desc);
+  void Reorder2Default() {
+    CHECK_EQ(storage_type(), kDefaultStorage);
+    ptr_->Reorder2Default();
+  }
+
+  void InvalidateMKLDNNData() {
+    // Removing mkl_mem_ means the NDArray will store data in the default format.
+    ptr_->mkl_mem_ = nullptr;
+  }
+
+  /*
+   * This function is used inside operators to reshape an array.
+   * It doesn't change the layout of the original array and allocate memory from
+   * the temporary buffer. The returned array is only valid inside the current
+   * invocation of this operator.
+   * This is different from Reshape. Reshape will cause data in the array to be
+   * converted to the default layout and allocate memory from malloc directly,
+   * which can be expensive.
+   * It's used by FullyConnected right now.
+   */
+  NDArray MKLDNNDataReshape(const TShape &shape) const;
+#endif
+
   /*!
    * \brief Save list of ndarray into the Stream.x
    * \param fo The stream of output.
@@ -645,6 +680,12 @@ class NDArray {
                for csr, aux_handles[0] = indptr, aux_handles[1] = indices
     */
     std::vector<Storage::Handle> aux_handles;
+
+#if MXNET_USE_MKLDNN == 1
+    /*! This is created when data is stored in MKLDNN format.
+     */
+    std::shared_ptr<mkldnn::memory> mkl_mem_;
+#endif
     /*! \brief variable from engine */
     Engine::VarHandle var;
     /*!
@@ -706,7 +747,7 @@ class NDArray {
         : static_data(false), delay_alloc(false) {
       var = Engine::Get()->NewVariable();
       ctx = Context::CPUShared(0);
-      shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);;
+      shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
       shandle.ctx = ctx;
       shandle.shared_pid = shared_pid;
       shandle.shared_id = shared_id;
@@ -781,6 +822,9 @@ class NDArray {
     inline void CheckAndAlloc(void) {
       if (delay_alloc) {
         shandle = Storage::Get()->Alloc(shandle.size, shandle.ctx);
+#if MXNET_USE_MKLDNN == 1
+        mkl_mem_ = nullptr;
+#endif
         delay_alloc = false;
       }
     }
@@ -789,15 +833,22 @@ class NDArray {
     // size is the number of bytes
     void CheckAndAlloc(uint64_t dbytes) {
       CHECK_EQ(kDefaultStorage, storage_type)
-              << "CheckAndAlloc(dbytes) is not intended for kDefaultStorage";
+          << "CheckAndAlloc(dbytes) is only intended for kDefaultStorage";
+      dbytes = std::max(dbytes, shandle.size);
       if (delay_alloc) {
         shandle = Storage::Get()->Alloc(dbytes, shandle.ctx);
+#if MXNET_USE_MKLDNN == 1
+        mkl_mem_ = nullptr;
+#endif
         delay_alloc = false;
       } else if (shandle.size < dbytes) {
         // free storage if necessary and alloc again
         if (shandle.size > 0) Storage::Get()->Free(shandle);
         // init storage
         shandle = Storage::Get()->Alloc(dbytes, shandle.ctx);
+#if MXNET_USE_MKLDNN == 1
+        mkl_mem_ = nullptr;
+#endif
       }
     }
 
@@ -823,20 +874,19 @@ class NDArray {
     // storage shape is also updated
     // if data is already allocated, try reuse the storage. Otherwise, free the current one
     // and allocate new storage
-    inline void CheckAndAllocData(const TShape &shape, int dtype) {
-      CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data";
-      auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype);
-      if (shandle.size < dbytes) {
-        // free storage if necessary and alloc again
-        if (shandle.size > 0) Storage::Get()->Free(shandle);
-        // init storage
-        shandle = Storage::Get()->Alloc(dbytes, ctx);
-      }
-      // init shape
-      storage_shape = shape;
-      // delay_alloc is only set when data storage handle is present
-      delay_alloc = false;
-    }
+    void CheckAndAllocData(const TShape &shape, int dtype);
+
+#if MXNET_USE_MKLDNN == 1
+    // Have MKL memory reference to the data in the default storage
+    // or create memory for MKLDNN.
+    void SetMKLMem(const TShape &shape, int dtype);
+    // In the data is stored in MKLDNN layout, we reorder data in mkl_mem_ and
+    // save the result in shandle.
+    void Reorder2Default();
+    bool IsMKLDNN() const;
+    bool IsDefault() const;
+#endif
+
     // create storage handle for aux data based on shape
     // this function assumes ctx, aux shapes and aux types are set
     // aux shape is also updated
@@ -862,45 +912,11 @@ class NDArray {
       set_aux_shape(i, shape);
     }
     /*! \brief destructor */
-    ~Chunk() {
-      bool skip_free = static_data || delay_alloc;
-      Storage::Handle h = this->shandle;
-      std::vector<Storage::Handle> aux_h = this->aux_handles;
-      Engine::Get()->DeleteVariable([h, aux_h, skip_free](RunContext s) {
-        if (skip_free == false) {
-          Storage::Get()->Free(h);
-          for (size_t i = 0; i < aux_h.size(); i++) {
-            if (aux_h[i].size > 0) Storage::Get()->Free(aux_h[i]);
-          }
-        }
-      }, shandle.ctx, var);
-    }
+    ~Chunk();
   };  // struct Chunk
 
-  void SetTBlob() const {
-    CHECK(ptr_ != nullptr);
-    TShape shape = shape_;
-    char *dptr = static_cast<char*>(ptr_->shandle.dptr);
-    auto stype = storage_type();
-    if (stype == kDefaultStorage) {
-      dptr += byte_offset_;
-    } else if (stype == kCSRStorage || stype == kRowSparseStorage) {
-      shape = storage_shape();
-    } else {
-      LOG(FATAL) << "unknown storage type " << stype;
-    }
-    tblob_.dptr_ = dptr;
-    tblob_.shape_ = shape;
-    tblob_.type_flag_ = dtype_;
-    tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id);
-#if MKL_EXPERIMENTAL == 1
-    tblob_.Mkl_mem_ = Mkl_mem_;
-#endif
-  }
+  void SetTBlob() const;
 
-#if MKL_EXPERIMENTAL == 1
-  std::shared_ptr<MKLMemHolder> Mkl_mem_;
-#endif
   /*! \brief internal data of NDArray */
   std::shared_ptr<Chunk> ptr_{nullptr};
   /*! \brief shape of current NDArray */
@@ -909,6 +925,8 @@ class NDArray {
   size_t byte_offset_ = 0;
   /*! \brief type of data */
   int dtype_ = -1;
+  /*! \brief whether the NDArray uses memory of another NDArray. */
+  bool reuse_ = false;
   /*! \brief storage type of data */
   NDArrayStorageType storage_type_ = kUndefinedStorage;
   /*! \brief node entry for autograd */
diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h
index b65cd2b434e4..168ddcca24b7 100755
--- a/include/mxnet/tensor_blob.h
+++ b/include/mxnet/tensor_blob.h
@@ -36,9 +36,6 @@
 #include <utility>
 #include <algorithm>
 #include "./base.h"
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#endif
 namespace mxnet {
 
 /* Forward declaration for friend declaration in TBlob */
@@ -66,17 +63,10 @@ class TBlob {
   /*! \brief type flag of the tensor blob */
   int type_flag_;
 
-  /*! \brief storing mkl chunk buffer blob, use for experimental only */
-#if MKL_EXPERIMENTAL == 1
-  std::shared_ptr<MKLMemHolder> Mkl_mem_;
-#endif
   /*! \brief default constructor, default copy assign will work */
   TBlob(void)
       : dptr_(NULL),
         type_flag_(mshadow::DataType<real_t>::kFlag) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = NULL;
-#endif
     SetDLTensor(cpu::kDevMask, 0);
   }
   /*!
@@ -90,9 +80,6 @@ class TBlob {
   TBlob(DType *dptr, const TShape &shape, int dev_mask, int dev_id = -1)
       : dptr_(dptr), shape_(shape),
         type_flag_(mshadow::DataType<DType>::kFlag) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = NULL;
-#endif
     SetDLTensor(dev_mask, dev_id);
   }
   /*!
@@ -105,9 +92,6 @@ class TBlob {
    */
   TBlob(void *dptr, const TShape &shape, int dev_mask, int type_flag, int dev_id = -1)
       : dptr_(dptr), shape_(shape), type_flag_(type_flag) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = NULL;
-#endif
     SetDLTensor(dev_mask, dev_id);
   }
   /*!
@@ -135,9 +119,6 @@ class TBlob {
     shape_ = src.shape_;
     type_flag_ = mshadow::DataType<DType>::kFlag;
     SetDLTensor(Device::kDevMask, -1);
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = NULL;
-#endif
     return *this;
   }
   /*!
@@ -172,11 +153,6 @@ class TBlob {
     CHECK(mshadow::DataType<DType>::kFlag == type_flag_)
       << "TBlob.get_with_shape: data type do not match specified type."
       << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType<DType>::kFlag;
-#if MKL_EXPERIMENTAL == 1
-    if (Mkl_mem_ != nullptr) {
-      Mkl_mem_->check_and_prv_to_cpu(dptr_);
-    }
-#endif
     return mshadow::Tensor<Device, 2, DType>(static_cast<DType*>(dptr_),
                                              shape_.FlatTo2D(),
                                              shape_[shape_.ndim() - 1],
@@ -217,11 +193,6 @@ class TBlob {
     CHECK(mshadow::DataType<DType>::kFlag == type_flag_)
       << "TBlob.get_with_shape: data type do not match specified type."
       << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType<DType>::kFlag;
-#if MKL_EXPERIMENTAL == 1
-    if (Mkl_mem_ != nullptr) {
-      Mkl_mem_->check_and_prv_to_cpu(dptr_);
-    }
-#endif
     return static_cast<DType*>(dptr_);
   }
   /*! \brief device mask of the corresponding device */
diff --git a/prepare_mkldnn.sh b/prepare_mkldnn.sh
new file mode 100755
index 000000000000..7cd7d6af0609
--- /dev/null
+++ b/prepare_mkldnn.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# set -ex
+#
+# All modification made by Intel Corporation: © 2016 Intel Corporation
+#
+# All contributions by the University of California:
+# Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+# All rights reserved.
+#
+# All other contributions:
+# Copyright (c) 2014, 2015, the respective contributors
+# All rights reserved.
+# For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
+#
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+MXNET_ROOTDIR="$(pwd)"
+MKLDNN_ROOTDIR="$MXNET_ROOTDIR/3rdparty/mkldnn/"
+MKLDNN_SRCDIR="$MKLDNN_ROOTDIR/src"
+MKLDNN_BUILDDIR="$MKLDNN_ROOTDIR/build"
+MKLDNN_INSTALLDIR="$MKLDNN_ROOTDIR/install"
+MKLDNN_LIBDIR="$MXNET_ROOTDIR/lib"
+
+# MKLDNN install destination
+HOME_MKLDNN=$1
+if [ ! -z "$HOME_MKLDNN" ]; then
+  mkdir -p $HOME_MKLDNN
+  if [ ! -w $HOME_MKLDNN ]; then
+    echo "MKLDNN install to $HOME_MKLDNN failed, please try with sudo" >&2
+    exit 1
+  fi
+fi
+
+if [ -z $MKLDNNROOT ]; then
+if [ ! -f "$MKLDNN_INSTALLDIR/lib/libmkldnn.so" ]; then
+    mkdir -p $MKLDNN_INSTALLDIR
+	cd $MKLDNN_ROOTDIR
+    if [ -z $MKLROOT ] && [ ! -f $MKLDNN_INSTALLDIR/include/mkl_cblas.h ]; then
+        rm -rf external && cd scripts && ./prepare_mkl.sh && cd ..
+        cp -a external/*/* $MKLDNN_INSTALLDIR/.
+    fi 
+    echo "Building MKLDNN ..." >&2
+    cd $MXNET_ROOTDIR
+	g++ --version >&2
+    if [ -z $ARCH_OPT ]; then
+        cmake $MKLDNN_ROOTDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR
+    else
+        cmake $MKLDNN_ROOTDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR -DARCH_OPT_FLAGS=$ARCH_OPT
+    fi
+    make -C $MKLDNN_BUILDDIR -j$(cat /proc/cpuinfo | grep processor | wc -l) VERBOSE=1 >&2
+    make -C $MKLDNN_BUILDDIR install
+    rm -rf $MKLDNN_BUILDDIR
+    mkdir -p $MKLDNN_LIBDIR
+    cp $MKLDNN_INSTALLDIR/lib/* $MKLDNN_LIBDIR
+fi
+MKLDNNROOT=$MKLDNN_INSTALLDIR
+fi
+
+if [ -z $MKLROOT ] && [ -f $MKLDNNROOT/include/mkl_cblas.h ]; then 
+  MKLROOT=$MKLDNNROOT;
+fi
+
+# user specified MKLDNN install folder
+if [ -d "$HOME_MKLDNN" ]; then
+  # skip if user specificed MKLDNNROOT
+  [ "$MKLDNNROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLDNNROOT/include $MKLDNNROOT/lib $HOME_MKLDNN/.
+  [ "$MKLROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLROOT/include $MKLROOT/lib $HOME_MKLDNN/.
+  # update ldconfig if possible
+  if [ -w /etc/ld.so.conf.d ]; then
+    echo "$HOME_MKLDNN/lib" > /etc/ld.so.conf.d/mxnmkldnn.conf && ldconfig
+  fi
+# return value to calling script (Makefile,cmake)
+  echo $HOME_MKLDNN $HOME_MKLDNN
+else
+  echo $MKLDNNROOT $MKLROOT
+fi
+
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index 64619044862b..56f4b9c83e77 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -1287,6 +1287,10 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
             arr[:] = arg_params[name]
         for name, arr in exe.aux_dict.items():
             arr[:] = aux_params[name]
+        # We need to initialize the gradient arrays if it's add.
+        if (grad_req == "add"):
+            for arr in exe.grad_arrays:
+                arr[:] = np.zeros(arr.shape, dtype=arr.dtype)
 
     dtypes = [np.dtype(exe.outputs[0].dtype) for exe in exe_list]
     max_idx = np.argmax(dtypes)
diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h
index dcd1504fb88e..5fd1a9b1d1b9 100644
--- a/src/common/exec_utils.h
+++ b/src/common/exec_utils.h
@@ -43,19 +43,61 @@ namespace common {
           indices are not recorded
  * \return true if any source NDArray need to cast storage
  */
-inline bool SetupDefaultBlobs(const std::vector<NDArray>& src,
-                              std::vector<TBlob> *blobs,
-                              std::vector<NDArray> *temp_src,
-                              std::vector<NDArray> *temp_dst,
-                              std::unordered_map<uint32_t, uint32_t> *idx_map = nullptr) {
+inline bool SetupDefaultBlobsIn(const std::vector<NDArray>& src,
+                                const std::vector<NDArray> *bufs,
+                                std::vector<TBlob> *blobs,
+                                std::vector<NDArray> *temp_src,
+                                std::vector<NDArray> *temp_dst,
+                                std::unordered_map<uint32_t, uint32_t> *idx_map) {
   bool require_cast = false;
   for (size_t i = 0; i < src.size(); i++) {
     auto& nd = src[i];
-    if (nd.storage_type() != kDefaultStorage) {
-      if (idx_map != nullptr) {
-        (*idx_map)[i] = temp_dst->size();
-      }
-      NDArray temp(nd.shape(), nd.ctx(), false, nd.dtype());
+    bool is_default = nd.storage_type() == kDefaultStorage;
+#if MXNET_USE_MKLDNN == 1
+    // We have to make sure it's default storage and default layout.
+    is_default = nd.IsDefaultData();
+#endif
+    if (!is_default) {
+      (*idx_map)[i] = temp_dst->size();
+      NDArray temp = bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(),
+                                                             true, nd.dtype());
+#if MXNET_USE_MKLDNN == 1
+      CHECK(temp.IsDefaultData());
+#endif
+      temp_src->emplace_back(nd);
+      temp_dst->emplace_back(temp);
+      blobs->emplace_back(temp.data());
+      require_cast = true;
+    } else {
+      blobs->push_back(nd.data());
+    }
+  }
+  return require_cast;
+}
+
+inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
+                                 const std::vector<OpReqType> &req,
+                                 const std::vector<NDArray> *bufs,
+                                 std::vector<TBlob> *blobs,
+                                 std::vector<NDArray> *temp_src,
+                                 std::vector<NDArray> *temp_dst) {
+  bool require_cast = false;
+  for (size_t i = 0; i < src.size(); i++) {
+    auto& nd = src[i];
+    bool is_default = nd.storage_type() == kDefaultStorage;
+#if MXNET_USE_MKLDNN == 1
+    // If it's writeTo, we don't need to worry whether it contains valid data.
+    if (req[i] == kWriteTo && is_default)
+      const_cast<NDArray &>(nd).InvalidateMKLDNNData();
+    // We have to make sure it's default storage and default layout.
+    is_default = nd.IsDefaultData();
+#endif
+    if (!is_default) {
+      NDArray temp = bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(),
+                                                             true, nd.dtype());
+#if MXNET_USE_MKLDNN == 1
+      CHECK(temp.IsDefaultData());
+#endif
       temp_src->emplace_back(nd);
       temp_dst->emplace_back(temp);
       blobs->emplace_back(temp.data());
@@ -76,6 +118,9 @@ inline bool SetupDefaultBlobs(const std::vector<NDArray>& src,
  */
 inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
                                    const std::vector<NDArray> &ndoutputs,
+                                   const std::vector<OpReqType> &req,
+                                   const std::vector<NDArray> *in_bufs,
+                                   const std::vector<NDArray> *out_bufs,
                                    std::vector<TBlob> *input_blobs,
                                    std::vector<TBlob> *output_blobs,
                                    std::vector<NDArray> *pre_temp_src,
@@ -85,9 +130,11 @@ inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
                                    std::unordered_map<uint32_t, uint32_t> *in_temp_idx_map,
                                    const std::vector<uint32_t> &mutate_idx) {
   // populate input blobs
-  SetupDefaultBlobs(ndinputs, input_blobs, pre_temp_src, pre_temp_dst, in_temp_idx_map);
+  SetupDefaultBlobsIn(ndinputs, in_bufs, input_blobs, pre_temp_src, pre_temp_dst,
+                      in_temp_idx_map);
   // populate output blobs
-  SetupDefaultBlobs(ndoutputs, output_blobs, post_temp_dst, post_temp_src);
+  SetupDefaultBlobsOut(ndoutputs, req, out_bufs, output_blobs, post_temp_dst,
+                       post_temp_src);
   // add mutable inputs to post temp list
   for (const auto idx : mutate_idx) {
     auto map_iter = in_temp_idx_map->find(idx);
diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc
index 1bcc40a894dd..e4d49554620f 100644
--- a/src/executor/attach_op_execs_pass.cc
+++ b/src/executor/attach_op_execs_pass.cc
@@ -30,11 +30,8 @@
 #include "../common/utils.h"
 #include "../common/exec_utils.h"
 #include "./exec_pass.h"
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "../operator/mkl/mkl_memory-inl.h"
-#include "../operator/mkl/mkl_util-inl.h"
-#endif
+#include "../operator/nn/mkldnn/mkldnn_base-inl.h"
+
 namespace mxnet {
 
 namespace op {
@@ -58,23 +55,34 @@ class StorageFallbackOpExecutor : public OpExecutor {
  protected:
   // initialize the data blobs
   void InitBlobs() {
-    using namespace common;
     if (!init_) {
-      in_data_.clear(); out_data_.clear();
-      pre_temp_src_.clear(); pre_temp_dst_.clear();
-      post_temp_src_.clear(); post_temp_dst_.clear();
-      in_temp_idx_map_.clear();
-      SetupDefaultBlobsInOut(in_array, out_array, &in_data_, &out_data_,
-                             &pre_temp_src_, &pre_temp_dst_,
-                             &post_temp_src_, &post_temp_dst_,
-                             &in_temp_idx_map_, mutate_idx_);
+      pre_temp_buf_.clear();
+      post_temp_buf_.clear();
+      for (size_t i = 0; i < in_array.size(); i++) {
+        auto &nd = in_array[i];
+        pre_temp_buf_.emplace_back(nd.shape(), nd.ctx(), true, nd.dtype());
+      }
+      for (size_t i = 0; i < out_array.size(); i++) {
+        auto &nd = out_array[i];
+        post_temp_buf_.emplace_back(nd.shape(), nd.ctx(), true, nd.dtype());
+      }
       init_ = true;
     }
   }
 
   // storage fallback before fcompute is launched
   void PreFCompute(bool is_gpu) {
+    using namespace common;
     InitBlobs();
+    in_data_.clear(); out_data_.clear();
+    pre_temp_src_.clear(); pre_temp_dst_.clear();
+    post_temp_src_.clear(); post_temp_dst_.clear();
+    in_temp_idx_map_.clear();
+    SetupDefaultBlobsInOut(in_array, out_array, req, &pre_temp_buf_, &post_temp_buf_,
+                           &in_data_, &out_data_,
+                           &pre_temp_src_, &pre_temp_dst_,
+                           &post_temp_src_, &post_temp_dst_,
+                           &in_temp_idx_map_, mutate_idx_);
     common::CastNonDefaultStorage(pre_temp_src_, pre_temp_dst_, op_ctx, is_gpu);
   }
 
@@ -85,6 +93,8 @@ class StorageFallbackOpExecutor : public OpExecutor {
 
   // default storage tensor blobs for fcompute
   std::vector<TBlob> in_data_, out_data_;
+  // These are NDArray buffers for cast storage.
+  std::vector<NDArray> pre_temp_buf_, post_temp_buf_;
   // source NDArray for cast storage
   std::vector<NDArray> pre_temp_src_, post_temp_src_;
   // destination NDArray for cast storage
@@ -106,10 +116,6 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor {
     PreFCompute(is_gpu);
     fcompute_(state_, op_ctx, in_data_, req, out_data_);
     PostFCompute(is_gpu);
-#if MKL_EXPERIMENTAL == 1
-    mkl_tblobs_prv_to_cpu(in_data_);
-    mkl_tblobs_prv_to_cpu(out_data_);
-#endif
   }
 
   ExecType exec_type() const override {
@@ -175,10 +181,6 @@ class FComputeExecutor : public StorageFallbackOpExecutor {
     PreFCompute(is_gpu);
     fcompute_(attrs_, op_ctx, in_data_, req, out_data_);
     PostFCompute(is_gpu);
-#if MKL_EXPERIMENTAL == 1
-    mkl_tblobs_prv_to_cpu(in_data_);
-    mkl_tblobs_prv_to_cpu(out_data_);
-#endif
   }
 
   ExecType exec_type() const override {
@@ -202,6 +204,9 @@ class FComputeExExecutor : public OpExecutor {
  public:
   void Run(RunContext rctx, bool is_gpu) override {
     op_ctx.run_ctx = rctx;
+#if MXNET_USE_MKLDNN == 1
+    InvalidateOutputs(out_array, req);
+#endif
     fcompute_(attrs_, op_ctx, in_array, req, out_array);
   }
 
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index 2a7d2b906684..f685370619f2 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -1209,7 +1209,8 @@ void GraphExecutor::InitDataEntryMemory(std::vector<NDArray>* shared_pool) {
       const NDArray& src = data_pool_.at(storage_id);
       data_entry_[i] = src.AsArray(vshape[i], vdtype[i]);
     } else {
-      data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i]);
+      data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i],
+                               true, vdtype[i]);
     }
     if (log_verbose_) {
       LOG(INFO) << "\tinit data entry\t" << i << "\tas " << common::stype_string(storage_type);
diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc
index 73a34c8b0f0d..01fab2240952 100644
--- a/src/executor/infer_graph_attr_pass.cc
+++ b/src/executor/infer_graph_attr_pass.cc
@@ -423,11 +423,6 @@ nnvm::Graph InferStorageType(nnvm::Graph&& graph,
     DispatchModeVector dispatch_modes(graph.indexed_graph().num_nodes(), DispatchMode::kUndefined);
     graph.attrs["dispatch_mode"] = std::make_shared<any>(std::move(dispatch_modes));
   }
-  // initialize unknown values for dispatch modes
-  if (graph.attrs.count("dispatch_mode") == 0) {
-    DispatchModeVector dispatch_modes(graph.indexed_graph().num_nodes(), DispatchMode::kUndefined);
-    graph.attrs["dispatch_mode"] = std::make_shared<any>(std::move(dispatch_modes));
-  }
   // initialize the dev_mask vector from the context vector
   if (graph.attrs.count("dev_mask") == 0) {
     CHECK_GT(graph.attrs.count("context"), 0);
diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc
index eaa95a5f2418..93a8bc6c54b2 100644
--- a/src/imperative/cached_op.cc
+++ b/src/imperative/cached_op.cc
@@ -214,6 +214,12 @@ nnvm::Graph Imperative::CachedOp::GetForwardGraph(
 
   StorageVector storage(idx.num_node_entries(), exec::kBadStorageID);
   for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID;
+  const auto& stypes = g.GetAttr<StorageTypeVector>("storage_type");
+  CHECK_EQ(stypes.size(), storage.size());
+  for (size_t i = 0; i < stypes.size(); i++) {
+    if (stypes[i] != kDefaultStorage)
+      storage[i] = exec::kDynamicStorageID;
+  }
 
   auto mem_plan = PlanMemory(
       &g, std::move(storage), g.GetAttr<std::vector<uint32_t> >(
@@ -320,6 +326,10 @@ nnvm::Graph Imperative::CachedOp::GetBackwardGraph(
   for (size_t i = 0; i < num_forward_entries; ++i) storage[i] = exec::kExternalStorageID;
   for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID;
   for (const auto i : idx.outputs()) storage[idx.entry_id(i)] = exec::kExternalStorageID;
+  for (size_t i = 0; i < stypes.size(); i++) {
+    if (stypes[i] != kDefaultStorage)
+      storage[i] = exec::kDynamicStorageID;
+  }
 
   auto mem_plan = PlanMemory(
       &g, std::move(storage), g.GetAttr<std::vector<uint32_t> >("backward_ref_count"),
diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
index fc28f50103b0..966a753dc120 100644
--- a/src/imperative/imperative_utils.h
+++ b/src/imperative/imperative_utils.h
@@ -362,9 +362,9 @@ inline void PushFCompute(const FCompute& fn,
       // mapping from index in input_blobs to index in pre_temp_dst
       std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
       // setup blobs
-      SetupDefaultBlobsInOut(inputs, outputs, &input_blobs, &output_blobs,
-                             &pre_temp_src, &pre_temp_dst, &post_temp_src,
-                             &post_temp_dst, &in_temp_idx_map, mutate_idx);
+      SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
+                             &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
+                             &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
       // setup context
       OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested};
       bool is_gpu = ctx.dev_mask() == gpu::kDevMask;
@@ -460,9 +460,9 @@ inline void PushOperator(const OpStatePtr& state,
         // mapping from index in input_blobs to index in pre_temp_dst
         std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
         // populate input blobs and output blobs
-        SetupDefaultBlobsInOut(inputs, outputs, &input_blobs, &output_blobs,
-                               &pre_temp_src, &pre_temp_dst, &post_temp_src, &post_temp_dst,
-                               &in_temp_idx_map, mutate_idx);
+        SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
+                               &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
+                               &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
         // setup contexts
         bool is_gpu = rctx.get_ctx().dev_mask() == gpu::kDevMask;
         // pre-fcompute fallback
@@ -607,6 +607,7 @@ inline bool CheckAndInferStorageType(nnvm::Graph* p_g, exec::DevMaskVector&& dev
     }
     if (match) return true;
   }
+  g.attrs.erase("dispatch_mode");
   g.attrs.erase("storage_type");
   g.attrs.erase("storage_type_inputs");
   if (node_range.second > node_range.first) {
diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h
index e98102b6b0a3..e01cc4206b37 100644
--- a/src/kvstore/kvstore_dist.h
+++ b/src/kvstore/kvstore_dist.h
@@ -32,11 +32,6 @@
 #include "mxnet/engine.h"
 #include "ps/ps.h"
 #include "./kvstore_dist_server.h"
-#if MKL_EXPERIMENTAL == 1
-#include <mkl_memory.h>
-#include "../operator/mkl/mkl_memory-inl.h"
-#include "../operator/mkl/mkl_util-inl.h"
-#endif
 namespace mxnet {
 namespace kvstore {
 
@@ -237,9 +232,6 @@ class KVStoreDist : public KVStoreLocal {
         PSKV& pskv = (gradient_compression_->get_type() == CompressionType::kNone) ?
                       EncodeDefaultKey(key, size, false) :
                       EncodeCompressedKey(key, size, false);
-#if MKL_EXPERIMENTAL == 1
-        mkl_set_tblob_eager_mode(recv_buf.data());
-#endif
         real_t* data = recv_buf.data().dptr<real_t>();
         // false means not to delete data when SArray is deleted
         auto vals = new ps::SArray<real_t>(data, size, false);
@@ -389,9 +381,6 @@ class KVStoreDist : public KVStoreLocal {
       [this, key, pskv, small_buf](RunContext rctx, Engine::CallbackOnComplete cb) {
         size_t size = small_buf.shape().Size();
         real_t* data = small_buf.data().dptr<real_t>();
-#if MKL_EXPERIMENTAL == 1
-        mkl_set_tblob_eager_mode(small_buf.data());
-#endif
         // do push. false means no delete
         ps::SArray<real_t> vals(data, size, false);
         CHECK_NOTNULL(ps_worker_)->ZPush(
@@ -416,9 +405,6 @@ class KVStoreDist : public KVStoreLocal {
           // convert to ps keys
           size_t size = send_buf.shape().Size();
           real_t* data = send_buf.data().dptr<real_t>();
-#if MKL_EXPERIMENTAL == 1
-          mkl_set_tblob_eager_mode(send_buf.data());
-#endif
           // do push. false means no delete
           ps::SArray<real_t> vals(data, size, false);
           CHECK_NOTNULL(ps_worker_)->ZPush(
@@ -440,9 +426,6 @@ class KVStoreDist : public KVStoreLocal {
     using namespace rowsparse;
     auto push_to_servers = [this, key, send_buf]
                            (RunContext rctx, Engine::CallbackOnComplete cb) {
-#if MKL_EXPERIMENTAL == 1
-      mkl_set_tblob_eager_mode(send_buf.data());
-#endif
       real_t* data = send_buf.data().dptr<real_t>();
       const int64_t num_rows = send_buf.aux_shape(kIdx)[0];
       const auto offsets = send_buf.aux_data(kIdx).dptr<int64_t>();
@@ -481,9 +464,6 @@ class KVStoreDist : public KVStoreLocal {
       // allocate memory for the buffer
       size_t num_rows = indices.shape().Size();
       recv_buf.CheckAndAlloc({mshadow::Shape1(num_rows)});
-#if MKL_EXPERIMENTAL == 1
-      mkl_set_tblob_eager_mode(recv_buf.data());
-#endif
       real_t* data = recv_buf.data().dptr<real_t>();
       const auto offsets = indices.data().dptr<int64_t>();
       const auto unit_len = recv_buf.shape().ProdShape(1, recv_buf.shape().ndim());
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 4db314f9cf4b..ae7209e272b0 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -31,10 +31,14 @@
 #include <mxnet/resource.h>
 #include <mxnet/imperative.h>
 #include <mshadow/tensor.h>
+#if MXNET_USE_MKLDNN == 1
+#include <mkldnn.hpp>
+#endif
 #include "./ndarray_function.h"
 #include "../common/utils.h"
 #include "../operator/tensor/matrix_op-inl.h"
 #include "../operator/tensor/init_op.h"
+#include "../operator/nn/mkldnn/mkldnn_base-inl.h"
 
 #if MXNET_USE_OPENCV
 #include <opencv2/opencv.hpp>
@@ -46,6 +50,104 @@ DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg);
 
 namespace mxnet {
 
+NDArray::NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx,
+    bool delay_alloc, int dtype, std::vector<int> aux_types,
+    std::vector<TShape> aux_shapes, TShape storage_shape) : shape_(shape),
+  dtype_(dtype), storage_type_(stype), entry_({nullptr, 0, 0}) {
+  // Assign default aux types if not given
+  if (aux_types.size() == 0
+      && stype != kDefaultStorage) {
+    if (stype == kRowSparseStorage) {
+      aux_types = {mshadow::kInt64};
+    } else if (stype == kCSRStorage) {
+      aux_types = {mshadow::kInt64, mshadow::kInt64};
+    } else {
+      LOG(FATAL) << "Unknown storage type " << stype;
+    }
+  }
+  // Assign default shapes if not given
+  // unknown shapes are intialized as {0} such that Size() would return 0
+  if (aux_shapes.size() == 0
+      && stype != kDefaultStorage) {
+    if (stype == kRowSparseStorage) {
+      aux_shapes = {TShape(mshadow::Shape1(0))};
+    } else if (stype == kCSRStorage) {
+      // aux shapes for indptr and indices
+      aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))};
+    } else {
+      LOG(FATAL) << "Unknown storage type " << stype;
+    }
+  }
+  if (storage_shape.Size() == 0
+      && stype != kDefaultStorage) {
+    if (stype == kRowSparseStorage) {
+      storage_shape = shape;
+      storage_shape[0] = aux_shapes[rowsparse::kIdx][0];
+    } else if (stype == kCSRStorage) {
+      storage_shape = aux_shapes[csr::kIdx];
+    } else {
+      LOG(FATAL) << "Unknown storage type " << stype;
+    }
+  }
+  if (stype == kDefaultStorage)
+    ptr_ = std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype);
+  else
+    ptr_ = std::make_shared<Chunk>(stype, storage_shape, ctx, delay_alloc,
+        dtype, aux_types, aux_shapes);
+}
+
+struct ChunkMem {
+  Storage::Handle h;
+  std::vector<Storage::Handle> aux_h;
+#if MXNET_USE_MKLDNN == 1
+  std::shared_ptr<mkldnn::memory> mem;
+#endif
+};
+
+NDArray::Chunk::~Chunk() {
+  bool skip_free = static_data || delay_alloc;
+  ChunkMem mem;
+  mem.h = this->shandle;
+  mem.aux_h = this->aux_handles;
+#if MXNET_USE_MKLDNN == 1
+  // We want to delete mkldnn memory after deleting the variable.
+  mem.mem = this->mkl_mem_;
+#endif
+  Engine::Get()->DeleteVariable([mem, skip_free](RunContext s) {
+    if (skip_free == false) {
+#if MXNET_USE_MKLDNN == 1
+      if (mem.mem) {
+        CHECK_LE(mem.mem->get_primitive_desc().get_size(), mem.h.size);
+        CHECK_EQ(mem.mem->get_data_handle(), mem.h.dptr);
+      }
+#endif
+      if (mem.h.size > 0) Storage::Get()->Free(mem.h);
+      for (size_t i = 0; i < mem.aux_h.size(); i++) {
+        if (mem.aux_h[i].size > 0) Storage::Get()->Free(mem.aux_h[i]);
+      }
+    }
+  }, shandle.ctx, var);
+}
+
+void NDArray::Chunk::CheckAndAllocData(const TShape &shape, int dtype) {
+  CHECK_NE(aux_shapes.size(), 0)
+      << "data is expected to be allocated after aux_data";
+  auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype);
+  if (shandle.size < dbytes) {
+    // free storage if necessary and alloc again
+    if (shandle.size > 0) Storage::Get()->Free(shandle);
+    // init storage
+    shandle = Storage::Get()->Alloc(dbytes, ctx);
+#if MXNET_USE_MKLDNN == 1
+    mkl_mem_ = nullptr;
+#endif
+  }
+  // init shape
+  storage_shape = shape;
+  // delay_alloc is only set when data storage handle is present
+  delay_alloc = false;
+}
+
 NDArray NDArray::grad() const {
   if (Imperative::AGInfo::IsNone(*this)) return NDArray();
   Imperative::AGInfo& info = Imperative::AGInfo::Get(entry_.node);
@@ -64,15 +166,55 @@ nnvm::Symbol NDArray::get_autograd_symbol() const {
   return ret;
 }
 
+#if MXNET_USE_MKLDNN == 1
+
+NDArray NDArray::MKLDNNDataReshape(const TShape &shape) const {
+  CHECK(!is_none()) << "NDArray is not initialized";
+  CHECK_GE(shape_.Size(), shape.Size())
+    << "NDArray.Reshape: target shape size is larger current shape";
+  CHECK_EQ(storage_type(), kDefaultStorage);
+  if (!IsMKLDNNData()) {
+    NDArray ret = this->Detach();
+    ret.shape_ = shape;
+    return ret;
+  } else {
+    NDArray ret(shape, ctx(), true, dtype());
+    // We shouldn't submit the reorder primitive here because submit will
+    // be called in operators.
+    auto format = GetDefaultFormat(ptr_->mkl_mem_->get_primitive_desc().desc());
+    CHECK_NE(format, ptr_->mkl_mem_->get_primitive_desc().desc().data.format);
+    auto def_pd = GetPrimitiveDesc(ptr_->mkl_mem_->get_primitive_desc(), format);
+    auto def_mem = TmpMemMgr::Get()->Alloc(def_pd);
+    MKLDNNStream *stream = MKLDNNStream::Get();
+    stream->RegisterMem(ptr_->mkl_mem_);
+    stream->RegisterPrim(mkldnn::reorder(*ptr_->mkl_mem_, *def_mem));
+    // def_mem points to a memory region in the temp space. It's only valid
+    // inside an operator. As such, the returned NDArray can only be valid
+    // inside an operator and the shared point doesn't need to do anything
+    // when it's destroyed.
+    ret.ptr_->mkl_mem_ = std::shared_ptr<mkldnn::memory>(def_mem,
+                                                         [](mkldnn::memory *mem){});
+    ret.ptr_->shandle.dptr = def_mem->get_data_handle();
+    ret.ptr_->shandle.size = def_mem->get_primitive_desc().get_size();
+    ret.ptr_->delay_alloc = false;
+    ret.ptr_->static_data = true;
+    ret.byte_offset_ = byte_offset_;
+    return ret;
+  }
+}
+
+#endif
+
 NDArray NDArray::Reshape(const TShape &shape) const {
   CHECK(!is_none()) << "NDArray is not initialized";
-  auto stype = storage_type();
-  // reshape is not supported for non-default ndarray with dismatching shapes
-  CHECK((shape_ == shape) || stype == kDefaultStorage)
-    << "Reshape for storage type " << stype << " is not implemented yet";
   CHECK_GE(shape_.Size(), shape.Size())
     << "NDArray.Reshape: target shape size is larger current shape";
   NDArray ret = this->Detach();
+  // If the shape doesn't change, we can just return it now.
+  if (ret.shape_ == shape)
+    return ret;
+  // Otherwise, reshape only works on the default layout.
+  CHECK_EQ(storage_type(), kDefaultStorage);
   ret.shape_ = shape;
   return ret;
 }
@@ -95,7 +237,6 @@ NDArray NDArray::ReshapeWithRecord(const TShape &shape) {
   return ret;
 }
 
-
 NDArray NDArray::Slice(index_t begin, index_t end) const {
   CHECK(!is_none()) << "NDArray is empty";
   CHECK_LE(begin, end)
@@ -127,8 +268,8 @@ NDArray NDArray::SliceWithRecord(index_t begin, index_t end) {
 }
 
 NDArray NDArray::At(index_t idx) const {
-  CHECK(storage_type() == kDefaultStorage) << "Storage type "
-                                           << storage_type() << " doesn't support At()";
+  CHECK(storage_type() == kDefaultStorage)
+      << "Storage type " << storage_type() << " doesn't support At()";
   NDArray ret = this->Slice(idx, idx+1);
   if (shape_.ndim() > 1) {
     return ret.Reshape(TShape(shape_.data()+1, shape_.data()+shape_.ndim()));
@@ -181,6 +322,400 @@ void NDArray::set_fresh_out_grad(bool state) const {
   info.fresh_out_grad = state;
 }
 
+#if MXNET_USE_MKLDNN == 1
+static inline bool same_shape(const TShape &shape, mkldnn_dims_t dims, int ndims) {
+  if (shape.ndim() != (size_t)ndims)
+    return false;
+  for (int i = 0; i < ndims; i++)
+    if (shape[i] != dims[i])
+      return false;
+  return true;
+}
+
+static inline bool same_shape(const TShape &shape, int dtype, mkldnn::memory::desc desc) {
+  return same_shape(shape, desc.data.dims, desc.data.ndims)
+      && get_mkldnn_type(dtype) == desc.data.data_type;
+}
+
+bool NDArray::Chunk::IsMKLDNN() const {
+  if (storage_type != kDefaultStorage)
+    return false;
+  if (mkl_mem_ == nullptr)
+    return false;
+  auto desc = mkl_mem_->get_primitive_desc().desc();
+  return desc.data.format != GetDefaultFormat(desc);
+}
+
+bool NDArray::Chunk::IsDefault() const {
+  if (storage_type != kDefaultStorage)
+    return false;
+  // If we don't have mkldnn memory yet, we just assume it's not the default
+  // format.
+  if (mkl_mem_ == nullptr)
+    return true;
+  auto desc = mkl_mem_->get_primitive_desc().desc();
+  return desc.data.format == GetDefaultFormat(desc);
+}
+
+void NDArray::Chunk::Reorder2Default() {
+  if (mkl_mem_ == nullptr)
+    return;
+
+  auto format = GetDefaultFormat(mkl_mem_->get_primitive_desc().desc());
+  CHECK(format != mkl_mem_->get_primitive_desc().desc().data.format);
+
+  auto def_pd = GetPrimitiveDesc(mkl_mem_->get_primitive_desc(), format);
+  mkldnn_mem_ptr def_mem(new mkldnn::memory(def_pd));
+  // This may be called in MKLDNN operators. We can't use MKLDNNStream here.
+  std::vector<mkldnn::primitive> net;
+  net.push_back(mkldnn::reorder(*mkl_mem_, *def_mem));
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+
+  CHECK(shandle.size >= def_pd.get_size());
+  CheckAndAlloc(def_pd.get_size());
+  // TODO(zhengda) We need to avoid memory copy here.
+  memcpy(shandle.dptr, def_mem->get_data_handle(), def_pd.get_size());
+  mkl_mem_.reset(new mkldnn::memory(def_pd, shandle.dptr));
+}
+
+void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) {
+  // The shape of the array and the one of the MKL memory may mismatch.
+  // For example, if the array stores parameters, the MKL memory may store data
+  // in 5 dimensions while the NDArray stores data in 4 dimensions.
+  if (mkl_mem_ && mkl_mem_->get_data_handle() == shandle.dptr
+      && same_shape(shape, dtype, mkl_mem_->get_primitive_desc().desc())) {
+    return;
+  }
+
+  mkldnn::memory::dims dims;
+  // These are shapes supprted by MKLDNN.
+  if (shape.ndim() == 1 || shape.ndim() == 2 || shape.ndim() == 4
+      || shape.ndim() == 5) {
+    dims.resize(shape.ndim());
+    for (size_t i = 0; i < dims.size(); i++)
+      dims[i] = shape[i];
+  } else if (shape.ndim() == 3) {
+    // If there are 3 dimensions, we'll force it to 4 dimensions.
+    dims.resize(shape.ndim() + 1);
+    dims[0] = 1;
+    for (size_t i = 0; i < shape.ndim(); i++)
+      dims[i + 1] = shape[i];
+  } else {
+    LOG(FATAL) << "MKLDNN doesn't support " << shape.ndim() << " dimensions";
+  }
+  mkldnn::memory::format layout = mkldnn::memory::format::format_undef;
+  switch (dims.size()) {
+    case 1: layout = mkldnn::memory::format::x; break;
+    case 2: layout = mkldnn::memory::format::nc; break;
+    case 4: layout = mkldnn::memory::format::nchw; break;
+    // This isn't the right layout when the data has 5 dimensions in MXNet.
+    // MXNet interprets 5 dimensions as ncdhw, but MKLDNN doesn't have
+    // a corresponding format.
+    case 5: layout = mkldnn::memory::format::goihw; break;
+  }
+  mkldnn::memory::desc data_md{dims, get_mkldnn_type(dtype), layout};
+  auto cpu_engine = CpuEngine::Get()->get_engine();
+  if (shandle.dptr == nullptr) {
+    CHECK(delay_alloc);
+    CheckAndAlloc();
+  }
+  mkldnn::memory::primitive_desc pd(data_md, cpu_engine);
+  CHECK(shandle.size >= pd.get_size());
+  mkl_mem_.reset(new mkldnn::memory(pd, shandle.dptr));
+}
+
+/*
+ * Here we want to get MKLDNN memory whose primitive desc is exactly the same as
+ * the given one. operator== can't guarantee that. == can return true even if
+ * the formats are different. I need to double check its format.
+ */
+static inline mkldnn::memory *GetMKLDNNExact(
+    const mkldnn::memory *mem, mkldnn::memory::primitive_desc desc) {
+  auto src_desc = mem->get_primitive_desc();
+  if (desc == src_desc && desc.desc().data.format == src_desc.desc().data.format) {
+    return const_cast<mkldnn::memory *>(mem);
+  } else {
+    std::shared_ptr<mkldnn::memory> ret(new mkldnn::memory(
+            desc, mem->get_data_handle()));
+    MKLDNNStream::Get()->RegisterMem(ret);
+    return ret.get();
+  }
+}
+
+const mkldnn::memory *NDArray::GetMKLDNNData(
+    const mkldnn::memory::primitive_desc &desc) const {
+  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
+    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
+    return nullptr;
+  }
+  auto mem = GetMKLDNNData();
+  mkldnn::memory::primitive_desc _desc = desc;
+  auto desc1 = mem->get_primitive_desc().desc();
+  auto desc2 = _desc.desc();
+  // The MKL memory has the same format and shape as required,
+  // or both use the default format, we can return the MKL memory.
+  if (mem->get_primitive_desc() == desc
+      || (desc1.data.format == GetDefaultFormat(desc1)
+        && desc2.data.format == GetDefaultFormat(desc2))) {
+    return GetMKLDNNExact(ptr_->mkl_mem_.get(), desc);
+  } else {
+    return nullptr;
+  }
+}
+
+const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
+    const mkldnn::memory::primitive_desc &desc) const {
+  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
+    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
+    return nullptr;
+  }
+  CHECK(storage_type() == kDefaultStorage);
+
+  auto mem = GetMKLDNNData();
+  // If the memory descriptor matches, it's easy.
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  if (mem->get_primitive_desc() == desc) {
+    return GetMKLDNNExact(mem, desc);
+  }
+
+  mkldnn::memory::primitive_desc _desc = desc;
+  // Now we need to determine if we should reorder the memory.
+  // If both use the default formats, we think we don't need to reorder.
+  auto desc1 = mem->get_primitive_desc().desc();
+  auto desc2 = _desc.desc();
+  if (desc1.data.format == GetDefaultFormat(desc1) &&
+      desc2.data.format == GetDefaultFormat(desc2)) {
+    mkldnn_mem_ptr ret(new mkldnn::memory(desc, mem->get_data_handle()));
+    stream->RegisterMem(ret);
+    return ret.get();
+  } else {
+    auto ret = TmpMemMgr::Get()->Alloc(desc);
+    stream->RegisterPrim(mkldnn::reorder(*mem, *ret));
+    return ret;
+  }
+}
+
+const mkldnn::memory *NDArray::GetMKLDNNData() const {
+  CHECK(storage_type() == kDefaultStorage);
+  // If this array uses MKLDNN layout and it's a view, we have to change its
+  // layout to the default layout.
+  if (IsMKLDNNData() && IsView())
+    ptr_->Reorder2Default();
+  ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, dtype_);
+  // If shandle has data, the data in shandle and mkl_mem_ should match.
+  if (ptr_->shandle.dptr)
+    CHECK(ptr_->shandle.dptr == ptr_->mkl_mem_->get_data_handle());
+  MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_);
+  auto pd = ptr_->mkl_mem_->get_primitive_desc();
+  if (IsView()) {
+    // Sliced array must use the default layout.
+    CHECK_EQ(GetDefaultFormat(pd.desc()), pd.desc().data.format);
+  }
+  if (IsView()) {
+    void *off_addr = static_cast<char *>(ptr_->mkl_mem_->get_data_handle())
+        + byte_offset_;
+
+    // Create the primitive desc for the new mkldnn memory.
+    mkldnn::memory::dims dims(shape().ndim());
+    for (size_t i = 0; i < dims.size(); i++)
+      dims[i] = shape()[i];
+    mkldnn::memory::format cpp_format = static_cast<mkldnn::memory::format>(
+        GetDefaultFormat(shape().ndim()));
+    mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(
+        pd.desc().data.data_type);
+    mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
+    mkldnn::memory::primitive_desc new_pd(data_md, pd.get_engine());
+
+    std::shared_ptr<mkldnn::memory> ret(new mkldnn::memory(new_pd, off_addr));
+    MKLDNNStream::Get()->RegisterMem(ret);
+    return ret.get();
+  } else {
+    return ptr_->mkl_mem_.get();
+  }
+}
+
+void NDArray::MKLDNNDataReorder(const mkldnn::memory::primitive_desc &pd) {
+  CHECK_EQ(storage_type(), kDefaultStorage);
+  // If the memory already uses the specified layout, don't do anything.
+  if (ptr_->mkl_mem_ != nullptr && ptr_->mkl_mem_->get_primitive_desc() == pd)
+    return;
+  auto _pd = pd;
+  auto _desc = _pd.desc();
+  auto def_format = GetDefaultFormat(_desc);
+  // If the memory is default, don't do anything.
+  if (def_format == _desc.data.format && ptr_->IsDefault())
+    return;
+  // If the specified layout is default, we should use Reorder2Default.
+  if (def_format == _desc.data.format) {
+    ptr_->Reorder2Default();
+    return;
+  }
+
+  std::shared_ptr<mkldnn::memory> new_mem(new mkldnn::memory(pd));
+  ptr_->SetMKLMem(shape_, dtype_);
+  auto old_mem = ptr_->mkl_mem_;
+  // It's possible that the specified layout has a different number of dimensions.
+  if (old_mem->get_primitive_desc().desc().data.ndims != _desc.data.ndims) {
+    // For now, we only support reorder from the default layout.
+    CHECK(ptr_->IsDefault());
+    auto def_pd = GetPrimitiveDesc(pd, def_format);
+    old_mem.reset(new mkldnn::memory(def_pd, old_mem->get_data_handle()));
+  }
+  // This may be called in MKLDNN operators. We can't use MKLDNNStream here.
+  std::vector<mkldnn::primitive> net;
+  net.push_back(mkldnn::reorder(*old_mem, *new_mem));
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+
+  CHECK(ptr_->shandle.size >= pd.get_size());
+  ptr_->CheckAndAlloc(pd.get_size());
+  // TODO(zhengda) We need to avoid memory copy here.
+  memcpy(ptr_->shandle.dptr, new_mem->get_data_handle(), pd.get_size());
+  ptr_->mkl_mem_.reset(new mkldnn::memory(pd, ptr_->shandle.dptr));
+}
+
+void NDArray::CopyFrom(const mkldnn::memory &mem) {
+  CHECK(ptr_ != nullptr) << "The NDArray hasn't been initialized";
+  if (ptr_->mkl_mem_.get() == &mem)
+    return;
+
+  CHECK(mem.get_primitive_desc().get_size() == shape().Size() * GetTypeSize(dtype_))
+      << "The size of NDArray doesn't match the requested MKLDNN memory desc";
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  // If this array uses MKLDNN layout and it's a view, we have to change its
+  // layout to the default layout.
+  if (IsMKLDNNData() && IsView())
+    ptr_->Reorder2Default();
+  ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_,
+                  dtype_);
+  stream->RegisterMem(ptr_->mkl_mem_);
+  auto from_desc = mem.get_primitive_desc().desc();
+  auto this_desc = ptr_->mkl_mem_->get_primitive_desc().desc();
+  auto from_def_format = GetDefaultFormat(from_desc);
+  if (IsView()) {
+    // Sliced array must use the default layout.
+    CHECK_EQ(GetDefaultFormat(this_desc), this_desc.data.format);
+  }
+  // It's possible that the memory and the NDArray don't have the same shape.
+  if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)
+      // If the source memory uses the default layout, we can reshape directly.
+      && from_def_format == from_desc.data.format) {
+    // In this case, we can simply create a new MKLDNN memory for the required
+    // shape.
+    mkldnn::memory::dims dims(this_desc.data.dims,
+                              this_desc.data.dims + this_desc.data.ndims);
+    auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
+    auto this_format = static_cast<mkldnn::memory::format>(GetDefaultFormat(this_desc));
+    mkldnn::memory::desc data_md(dims, this_dtype, this_format);
+    mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine());
+    mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle()));
+    stream->RegisterMem(tmp_mem);
+    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_));
+  } else if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)) {
+    // In this case, the source memory stores data in a customized layout. We
+    // need to reorganize the data in memory before we can reshape.
+    auto def_pd = GetPrimitiveDesc(mem.get_primitive_desc(), from_def_format);
+    auto def_mem = TmpMemMgr::Get()->Alloc(def_pd);
+    stream->RegisterPrim(mkldnn::reorder(mem, *def_mem));
+    // Now we can reshape it
+    mkldnn::memory::dims dims(this_desc.data.dims,
+                              this_desc.data.dims + this_desc.data.ndims);
+    auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
+    auto this_format = static_cast<mkldnn::memory::format>(GetDefaultFormat(this_desc));
+    mkldnn::memory::desc data_md(dims, this_dtype, this_format);
+    mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine());
+    mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, def_mem->get_data_handle()));
+    stream->RegisterMem(tmp_mem);
+    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_));
+  } else if (mem.get_primitive_desc() == ptr_->mkl_mem_->get_primitive_desc()) {
+    // If the layout is the same, we can just copy data.
+    stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->mkl_mem_));
+  } else {
+    auto src_def = GetDefaultFormat(mem.get_primitive_desc().desc());
+    auto dst_def = GetDefaultFormat(ptr_->mkl_mem_->get_primitive_desc().desc());
+    // If both are not using the default layouts. There isn't much we can do,
+    // other than reorder data layout directly.
+    if (dst_def != ptr_->mkl_mem_->get_primitive_desc().desc().data.format
+        && src_def != mem.get_primitive_desc().desc().data.format) {
+      stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->mkl_mem_));
+    } else if (dst_def == ptr_->mkl_mem_->get_primitive_desc().desc().data.format) {
+      // If the dest mem uses the default memory layout, we can simply use
+      // the default format of the source memory to improve perf of reorder.
+      auto pd = GetPrimitiveDesc(ptr_->mkl_mem_->get_primitive_desc(), src_def);
+      mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, ptr_->mkl_mem_->get_data_handle()));
+      stream->RegisterMem(tmp_mem);
+      stream->RegisterPrim(mkldnn::reorder(mem, *tmp_mem));
+    } else {
+      // If the src mem uses the default memory layout, we can use
+      // the default format of the source memory to improve perf.
+      auto pd = GetPrimitiveDesc(mem.get_primitive_desc(), dst_def);
+      mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle()));
+      stream->RegisterMem(tmp_mem);
+      stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_));
+    }
+  }
+}
+mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd,
+                                                mkldnn_memory_format_t format);
+
+mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc &desc) {
+  // This array shouldn't be a view.
+  CHECK(!IsView());
+
+  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
+    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
+    return nullptr;
+  }
+
+  mkldnn::memory::primitive_desc _desc = desc;
+  auto required_format = _desc.desc().data.format;
+  auto def_format = GetDefaultFormat(_desc.desc());
+  // If the required format is a default format, we don't need to worry about the shape.
+  // If the shape isn't the same, it actually implicitly reshapes data.
+  if (required_format == def_format) {
+    ptr_->SetMKLMem(shape_, dtype_);
+    MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_);
+    return GetMKLDNNExact(ptr_->mkl_mem_.get(), desc);
+  }
+
+  if (ptr_->mkl_mem_)
+    CHECK(ptr_->mkl_mem_->get_data_handle() == ptr_->shandle.dptr);
+  if (ptr_->mkl_mem_ && ptr_->mkl_mem_->get_primitive_desc() == desc) {
+    MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_);
+    return GetMKLDNNExact(ptr_->mkl_mem_.get(), desc);
+  }
+
+  CHECK(ptr_->shandle.size >= desc.get_size());
+  ptr_->CheckAndAlloc(desc.get_size());
+  ptr_->mkl_mem_.reset(new mkldnn::memory(desc, ptr_->shandle.dptr));
+  MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_);
+  return ptr_->mkl_mem_.get();
+}
+#endif
+
+void NDArray::SetTBlob() const {
+  CHECK(ptr_ != nullptr);
+  TShape shape = shape_;
+  char *dptr = static_cast<char*>(ptr_->shandle.dptr);
+  auto stype = storage_type();
+  if (stype == kDefaultStorage) {
+#if MXNET_USE_MKLDNN == 1
+    if (IsMKLDNNData()) {
+      ptr_->Reorder2Default();
+      dptr = static_cast<char*>(ptr_->shandle.dptr);
+    }
+#endif
+    dptr += byte_offset_;
+  } else if (stype == kCSRStorage || stype == kRowSparseStorage) {
+    CHECK_EQ(byte_offset_, 0);
+    shape = storage_shape();
+  } else {
+    LOG(FATAL) << "unknown storage type " << stype;
+  }
+  tblob_.dptr_ = dptr;
+  tblob_.shape_ = shape;
+  tblob_.type_flag_ = dtype_;
+  tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id);
+}
 
 /*!
 * \brief run a ternary operation
@@ -449,11 +984,51 @@ inline void CopyFromToRspImpl(const NDArray& from, const NDArray& to, RunContext
 // Make a copy of a dense NDArray
 template<typename from_xpu, typename to_xpu>
 inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext ctx) {
-  using namespace mshadow;
-  CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
-  TBlob tmp = to.data();
-  ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp,
-                                  from.ctx(), to.ctx(), ctx);
+#if MXNET_USE_MKLDNN == 1
+  // If neither is MKLDNN, we can copy data normally.
+  if (!from.IsMKLDNNData() && !to.IsMKLDNNData()) {
+#endif
+    using namespace mshadow;
+    CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
+    TBlob tmp = to.data();
+    ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp,
+                                    from.ctx(), to.ctx(), ctx);
+#if MXNET_USE_MKLDNN == 1
+  } else if (SupportMKLDNN(from.dtype(), from.shape())
+             && SupportMKLDNN(to.dtype(), to.shape())
+             && from.ctx().dev_mask() == cpu::kDevMask
+             && to.ctx().dev_mask() == cpu::kDevMask) {
+    // If we copy data directly, we need to make sure both NDArrays are supported
+    // by MKLDNN.
+    auto from_mem = from.GetMKLDNNData();
+    auto to_mem = to.GetMKLDNNData();
+    if (from_mem->get_primitive_desc() == to_mem->get_primitive_desc()) {
+      size_t size = std::min(from_mem->get_primitive_desc().get_size(),
+                             to_mem->get_primitive_desc().get_size());
+      memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size);
+    } else {
+      std::vector<mkldnn::primitive> net;
+      net.push_back(mkldnn::reorder(*from_mem, *to_mem));
+      mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+    }
+  } else {
+    // In this case, one of the NDArray isn't supported by MKLDNN, we need
+    // to convert the MKLDNN array to the default format first and copy data
+    // with Copy().
+    NDArray tmp_from = from;
+    if (tmp_from.IsMKLDNNData()) {
+      tmp_from = NDArray(from.shape(), from.ctx(), false, from.dtype());
+      auto tmp_mem = from.GetMKLDNNData();
+      tmp_from.CopyFrom(*tmp_mem);
+      MKLDNNStream::Get()->Submit();
+    }
+    CHECK(tmp_from.IsDefaultData());
+    CHECK(to.IsDefaultData());
+    TBlob tmp = to.data();
+    ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp,
+                                    from.ctx(), to.ctx(), ctx);
+  }
+#endif
 }
 
 // Make a copy of an NDArray based on storage type
diff --git a/src/operator/concat-inl.h b/src/operator/concat-inl.h
deleted file mode 100644
index 4225ddf4eac0..000000000000
--- a/src/operator/concat-inl.h
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file concat-inl.h
- * \brief
- * \author Bing Xu
-*/
-#ifndef MXNET_OPERATOR_CONCAT_INL_H_
-#define MXNET_OPERATOR_CONCAT_INL_H_
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <cstring>
-#include <map>
-#include <string>
-#include <vector>
-#include <utility>
-#include "./operator_common.h"
-#include "./channel_op_common.h"
-#include "./tensor/broadcast_reduce_op.h"
-
-namespace mxnet {
-namespace op {
-
-namespace concat_enum {
-enum ConcatOpInputs {kData0, kData1, kData2, kData3, kData4};
-enum ConcatOpOutputs {kOut};
-}  // namespace concat_enum
-
-struct ConcatParam : public dmlc::Parameter<ConcatParam> {
-  int num_args;
-  int dim;
-  DMLC_DECLARE_PARAMETER(ConcatParam) {
-    DMLC_DECLARE_FIELD(num_args).set_lower_bound(1)
-    .describe("Number of inputs to be concated.");
-    DMLC_DECLARE_FIELD(dim).set_default(1)
-    .describe("the dimension to be concated.");
-  }
-};  // struct ConcatParam
-
-template<typename xpu, typename DType>
-class ConcatOp : public Operator {
- public:
-  explicit ConcatOp(ConcatParam param)
-    : size_(param.num_args), dimension_(param.dim) {}
-
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(static_cast<int>(in_data.size()), size_);
-    CHECK_EQ(out_data.size(), 1U);
-    int axis = CheckAxis(dimension_, in_data[concat_enum::kData0].ndim());
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    std::vector<Tensor<xpu, 3, DType> > data(size_);
-    Tensor<xpu, 3, DType> out;
-    size_t leading = 1, trailing = 1;
-    for (int i = 0; i < axis; ++i) {
-      leading *= out_data[concat_enum::kOut].shape_[i];
-    }
-    for (int i = axis + 1; i < out_data[concat_enum::kOut].ndim(); ++i) {
-      trailing *= out_data[concat_enum::kOut].shape_[i];
-    }
-    size_t mid = out_data[concat_enum::kOut].shape_[axis];
-    Shape<3> oshape = Shape3(leading, mid, trailing);
-    out = out_data[concat_enum::kOut].get_with_shape<xpu, 3, DType>(oshape, s);
-
-    for (int i = 0; i < size_; ++i) {
-      Shape<3> dshape = Shape3(leading, in_data[i].shape_[axis], trailing);
-      data[i] = in_data[i].get_with_shape<xpu, 3, DType>(dshape, s);
-    }
-    Concatenate(data, &out, 1, req[concat_enum::kOut]);
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_grad.size(), static_cast<size_t>(size_));
-    int axis = CheckAxis(dimension_, out_grad[concat_enum::kData0].ndim());
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    std::vector<Tensor<xpu, 3, DType> > grad_in(size_);
-    Tensor<xpu, 3, DType> grad;
-    size_t leading = 1, trailing = 1;
-    for (int i = 0; i < axis; ++i) {
-      leading *= out_grad[concat_enum::kOut].shape_[i];
-    }
-    for (int i = axis + 1; i < out_grad[concat_enum::kOut].ndim(); ++i) {
-      trailing *= out_grad[concat_enum::kOut].shape_[i];
-    }
-    size_t mid = out_grad[concat_enum::kOut].shape_[axis];
-    Shape<3> oshape = Shape3(leading, mid, trailing);
-    grad = out_grad[concat_enum::kOut].get_with_shape<xpu, 3, DType>(oshape, s);
-
-    for (int i = 0; i < size_; ++i) {
-      Shape<3> dshape = Shape3(leading, in_grad[i].shape_[axis], trailing);
-      grad_in[i] = in_grad[i].get_with_shape<xpu, 3, DType>(dshape, s);
-    }
-    Split(grad, &grad_in, 1, req);
-  }
-
- private:
-  int size_;
-  int dimension_;
-};  // class ConcatOp
-
-template<typename xpu>
-Operator *CreateOp(ConcatParam param, int dtype, std::vector<TShape> *in_shape);
-
-#if DMLC_USE_CXX11
-class ConcatProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  std::vector<std::string> ListArguments() const override {
-    std::vector<std::string> ret;
-    for (int i = 0; i < param_.num_args; ++i) {
-      ret.push_back(std::string("arg") + std::to_string(i));
-    }
-    return ret;
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), static_cast<size_t>(param_.num_args));
-    TShape dshape;
-    index_t size = 0;
-    bool has_zero = false;
-    int axis = -1;
-    for (int i = 0; i < param_.num_args; ++i) {
-      TShape tmp = (*in_shape)[i];
-      if (tmp.ndim()) {
-        axis = CheckAxis(param_.dim, tmp.ndim());
-        has_zero = tmp[axis] == 0 || has_zero;
-        size += tmp[axis];
-        tmp[axis] = 0;
-        shape_assign(&dshape, tmp);
-      }
-    }
-
-    TShape tmp = (*out_shape)[0];
-    if (tmp.ndim()) {
-      axis = CheckAxis(param_.dim, tmp.ndim());
-      tmp[axis] = 0;
-      shape_assign(&dshape, tmp);
-    }
-
-    if (dshape.ndim() == 0) return false;
-
-    for (int i = 0; i < param_.num_args; ++i) {
-      CHECK(shape_assign(&(*in_shape)[i], dshape))
-        << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i];
-    }
-
-    if (!has_zero) dshape[axis] = size;
-    CHECK(shape_assign(&(*out_shape)[0], dshape))
-      << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0];
-
-    return dshape.Size() != 0;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    int dtype = -1;
-
-    for (size_t i = 0; i < in_type->size(); ++i) {
-      if (dtype == -1) {
-        dtype = in_type->at(i);
-      } else {
-        CHECK(in_type->at(i) == dtype ||
-              in_type->at(i) == -1) <<
-              "Non-uniform data type in Concat";
-      }
-    }
-
-    if (dtype == -1) {
-      LOG(FATAL) << "Not enough information to infer type in Concat.";
-      return false;
-    }
-
-    size_t nin = this->ListArguments().size();
-    in_type->clear();
-    for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype);
-
-    size_t naux = this->ListAuxiliaryStates().size();
-    aux_type->clear();
-    for (size_t i = 0; i < naux; ++i) aux_type->push_back(dtype);
-
-    size_t nout = this->ListOutputs().size();
-    out_type->clear();
-    for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype);
-
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new ConcatProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "Concat";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return out_grad;
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not implemented";
-    return NULL;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
-
- private:
-  ConcatParam param_;
-};  // class ConcatProp
-#endif  // DMLC_USE_CXX11
-}  // namespace op
-}  // namespace mxnet
-
-#endif  // MXNET_OPERATOR_CONCAT_INL_H_
diff --git a/src/operator/concat.cc b/src/operator/concat.cc
deleted file mode 100644
index 4d3c2fa1661f..000000000000
--- a/src/operator/concat.cc
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file concat.cc
- * \brief
- * \author Bing Xu
-*/
-
-#include "./concat-inl.h"
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "./mkl/mkl_memory-inl.h"
-#include "./mkl/mkl_concat-inl.h"
-#endif  // MXNET_USE_MKL2017
-
-namespace mxnet {
-namespace op {
-template<>
-Operator* CreateOp<cpu>(ConcatParam param, int dtype, std::vector<TShape> *in_shape) {
-  Operator *op = NULL;
-#if MXNET_USE_MKL2017 == 1
-  // MKL supports 4D input tensors only for concat operation
-  // 2D/3D input tensors are reshaped to 4D in mkl_concat-inl.h
-  // hence MKL supports 2D/3D/4D input tensors for concat operation
-  size_t dims = (*in_shape)[0].ndim();
-  bool supportedDim = (dims >= 2 && dims <= 4);
-  if ((1 == param.dim) && supportedDim &&
-    (param.num_args < (dnnResourceMultipleDst - dnnResourceMultipleSrc))) {
-    switch (dtype) {
-      case mshadow::kFloat32:
-      return new MKLConcatOp<cpu, float>(param);
-    case mshadow::kFloat64:
-      return new MKLConcatOp<cpu, double>(param);
-    default:
-      break;
-    }
-  }
-  if (enableMKLWarnGenerated())
-    LOG(INFO) << MKLConcatOp<cpu, float>::getName() << " Skip MKL optimization";
-#endif
-  MSHADOW_TYPE_SWITCH(dtype, DType, {
-    op = new ConcatOp<cpu, DType>(param);
-  });
-  return op;
-}
-
-Operator* ConcatProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                       std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0), in_shape);
-}
-
-DMLC_REGISTER_PARAMETER(ConcatParam);
-
-MXNET_REGISTER_OP_PROPERTY(Concat, ConcatProp)
-.describe(R"code(Joins input arrays along a given axis.
-
-.. note:: `Concat` is deprecated. Use `concat` instead.
-
-The dimensions of the input arrays should be the same except the axis along
-which they will be concatenated.
-The dimension of the output array along the concatenated axis will be equal
-to the sum of the corresponding dimensions of the input arrays.
-
-Example::
-
-   x = [[1,1],[2,2]]
-   y = [[3,3],[4,4],[5,5]]
-   z = [[6,6], [7,7],[8,8]]
-
-   concat(x,y,z,dim=0) = [[ 1.,  1.],
-                          [ 2.,  2.],
-                          [ 3.,  3.],
-                          [ 4.,  4.],
-                          [ 5.,  5.],
-                          [ 6.,  6.],
-                          [ 7.,  7.],
-                          [ 8.,  8.]]
-
-   Note that you cannot concat x,y,z along dimension 1 since dimension
-   0 is not the same for all the input arrays.
-
-   concat(y,z,dim=1) = [[ 3.,  3.,  6.,  6.],
-                         [ 4.,  4.,  7.,  7.],
-                         [ 5.,  5.,  8.,  8.]]
-
-)code" ADD_FILELINE)
-.add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate")
-.add_arguments(ConcatParam::__FIELDS__())
-.set_key_var_num_args("num_args");
-
-NNVM_REGISTER_OP(Concat).add_alias("concat");
-
-}  // namespace op
-}  // namespace mxnet
diff --git a/src/operator/convolution_v1.cc b/src/operator/convolution_v1.cc
index 7de6a34425f5..86c0fbb33291 100644
--- a/src/operator/convolution_v1.cc
+++ b/src/operator/convolution_v1.cc
@@ -25,11 +25,6 @@
 */
 
 #include "./convolution_v1-inl.h"
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "./mkl/mkl_memory-inl.h"
-#include "./mkl/mkl_convolution-inl.h"
-#endif  // MXNET_USE_MKL2017
 #if MXNET_USE_NNPACK == 1
 #include "./nnpack/nnpack_convolution-inl.h"
 #endif  // MXNET_USE_NNPACK
diff --git a/src/operator/lrn-inl.h b/src/operator/lrn-inl.h
deleted file mode 100644
index adfe4676702d..000000000000
--- a/src/operator/lrn-inl.h
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file lrn-inl.h
- * \brief
- * \author Bing Xu
-*/
-#ifndef MXNET_OPERATOR_LRN_INL_H_
-#define MXNET_OPERATOR_LRN_INL_H_
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "./operator_common.h"
-#include "./mshadow_op.h"
-
-namespace mxnet {
-namespace op {
-
-namespace lrn_enum {
-enum LRNInputs {kData};
-enum LRNOutputs {kOut, kTmpNorm};
-}  // namespace lrn_enum
-
-struct LRNParam : public dmlc::Parameter<LRNParam> {
-  float alpha;
-  float beta;
-  float knorm;
-  uint32_t nsize;
-  DMLC_DECLARE_PARAMETER(LRNParam) {
-    DMLC_DECLARE_FIELD(alpha).set_default(1e-4f)
-    .describe("The variance scaling parameter :math:`\alpha` in the LRN expression.");
-    DMLC_DECLARE_FIELD(beta).set_default(0.75f)
-    .describe("The power parameter :math:`\beta` in the LRN expression.");
-    DMLC_DECLARE_FIELD(knorm).set_default(2.0f)
-    .describe("The parameter :math:`k` in the LRN expression.");
-    DMLC_DECLARE_FIELD(nsize)
-    .describe("normalization window width in elements.");
-  }
-};  // struct LRNParam
-
-template<typename xpu>
-class LocalResponseNormOp : public Operator {
- public:
-  explicit LocalResponseNormOp(LRNParam param) {
-    param_ = param;
-  }
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    // TODO(xxx): Test with gradient chceker
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 2U);
-    // CHECK_EQ(req.size(), 2);
-    CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size";
-    const real_t salpha = param_.alpha / param_.nsize;
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4> data = in_data[lrn_enum::kData].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> out = out_data[lrn_enum::kOut].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> tmp_norm = out_data[lrn_enum::kTmpNorm].get<xpu, 4, real_t>(s);
-    tmp_norm = chpool<red::sum>(F<mshadow_op::square>(data) , param_.nsize) * salpha + param_.knorm;
-    Assign(out, req[lrn_enum::kOut], data *  F<mshadow_op::power>(tmp_norm, -param_.beta));
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 2U);
-    const real_t salpha = param_.alpha / param_.nsize;
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4> grad = out_grad[lrn_enum::kOut].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> tmp_norm = out_data[lrn_enum::kTmpNorm].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> data = in_data[lrn_enum::kData].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> grad_in = in_grad[lrn_enum::kData].get<xpu, 4, real_t>(s);
-    grad_in = grad * F<mshadow_op::power>(tmp_norm, -param_.beta);
-    grad_in += (- 2.0f * param_.beta * salpha) *
-               chpool<red::sum>(grad * data *
-                                F<mshadow_op::power>(tmp_norm, -param_.beta - 1.0f),
-                                param_.nsize)  * data;
-  }
-
- private:
-  LRNParam param_;
-};  // class LocalResponseNormOp
-
-template<typename xpu>
-Operator *CreateOp(LRNParam param, int dtype);
-
-#if DMLC_USE_CXX11
-class LocalResponseNormProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 1U) << "Input:[data]";
-    const TShape &dshape = in_shape->at(0);
-    if (dshape.ndim() == 0) return false;
-    out_shape->clear();
-    out_shape->push_back(dshape);
-    out_shape->push_back(dshape);
-    return true;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_GE(in_type->size(), 1U);
-    int dtype = (*in_type)[0];
-    CHECK_NE(dtype, -1) << "First input must have specified type";
-    for (index_t i = 0; i < in_type->size(); ++i) {
-      if ((*in_type)[i] == -1) {
-        (*in_type)[i] = dtype;
-      } else {
-        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
-      }
-    }
-    int n_out = this->ListOutputs().size();
-    out_type->clear();
-    for (int i = 0; i < n_out; ++i ) out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new LocalResponseNormProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "LRN";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {
-      out_grad[lrn_enum::kOut], in_data[lrn_enum::kData],
-      out_data[lrn_enum::kTmpNorm], out_data[lrn_enum::kOut]
-    };
-  }
-
-  int NumVisibleOutputs() const override {
-    return 1;
-  }
-
-  int NumOutputs() const override {
-    return 2;
-  }
-
-  std::vector<std::string> ListArguments() const override {
-    return {"data"};
-  }
-
-  std::vector<std::string> ListOutputs() const override {
-    return {"output", "tmp_norm"};
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return NULL;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
-
- private:
-  LRNParam param_;
-};  // LocalResponseNormProp
-#endif  // DMLC_USE_CXX11
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_LRN_INL_H_
diff --git a/src/operator/lrn.cc b/src/operator/lrn.cc
deleted file mode 100644
index 9b3afd80cd18..000000000000
--- a/src/operator/lrn.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file lrn.cc
- * \brief
- * \author Bing Xu
-*/
-
-#include "./lrn-inl.h"
-#if MXNET_USE_CUDNN == 1
-#include "./cudnn_lrn-inl.h"
-#endif
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "./mkl/mkl_memory-inl.h"
-#include "./mkl/mkl_lrn-inl.h"
-#endif
-
-namespace mxnet {
-namespace op {
-template<>
-Operator* CreateOp<cpu>(LRNParam param, int dtype) {
-#if MXNET_USE_MKL2017 == 1
-  return new MKLLRNOp<cpu, float>(param);
-#endif
-  return new LocalResponseNormOp<cpu>(param);
-}
-
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator* LocalResponseNormProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-    std::vector<int> *in_type) const {
-    DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
-}
-
-DMLC_REGISTER_PARAMETER(LRNParam);
-
-MXNET_REGISTER_OP_PROPERTY(LRN, LocalResponseNormProp)
-.add_argument("data", "NDArray-or-Symbol", "Input data.")
-.add_arguments(LRNParam::__FIELDS__())
-.describe(R"code(Applies local response normalization to the input.
-
-The local response normalization layer performs "lateral inhibition" by normalizing
-over local input regions.
-
-If :math:`a_{x,y}^{i}` is the activity of a neuron computed by applying kernel :math:`i` at position
-:math:`(x, y)` and then applying the ReLU nonlinearity, the response-normalized
-activity :math:`b_{x,y}^{i}` is given by the expression:
-
-.. math::
-   b_{x,y}^{i} = \frac{a_{x,y}^{i}}{\Bigg({k + \alpha \sum_{j=max(0, i-\frac{n}{2})}^{min(N-1, i+\frac{n}{2})} (a_{x,y}^{j})^{2}}\Bigg)^{\beta}}
-
-where the sum runs over :math:`n` "adjacent" kernel maps at the same spatial position, and :math:`N` is the total
-number of kernels in the layer.
-
-)code" ADD_FILELINE);
-
-}  // namespace op
-}  // namespace mxnet
diff --git a/src/operator/mkl/mkl_batch_norm-inl.h b/src/operator/mkl/mkl_batch_norm-inl.h
deleted file mode 100644
index b5967f4de294..000000000000
--- a/src/operator/mkl/mkl_batch_norm-inl.h
+++ /dev/null
@@ -1,391 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_batch_norm-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_
-#include <mxnet/storage.h>
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "../operator_common.h"
-#include "../mshadow_op.h"
-#include "./mkl_util-inl.h"
-
-namespace mxnet {
-namespace op {
-
-template<typename xpu, typename DType>
-class MKLBatchNormOp : public Operator {
- public:
-  explicit MKLBatchNormOp(BatchNormParam param) {
-    this->param_ = param;
-    fwd_top_data = MKLData<DType>::create();
-    fwd_bottom_data = MKLData<DType>::create();
-    bwd_top_diff = MKLData<DType>::create();
-    bwd_bottom_diff = MKLData<DType>::create();
-    scaleShift_space.dptr = NULL;
-    scaleShiftDiff_space.dptr = NULL;
-  }
-  virtual ~MKLBatchNormOp() {
-    if (batchNormFwdInference != NULL) dnnDelete<DType>(batchNormFwdInference);
-    if (batchNormFwdTraining != NULL) dnnDelete<DType>(batchNormFwdTraining);
-    if (batchNormBwdScaleShift != NULL) dnnDelete<DType>(batchNormBwdScaleShift);
-    dnnLayoutDelete<DType>(layout_usr_);
-    if (scaleShift_space.dptr)
-      Storage::Get()->Free(scaleShift_space);
-    if (scaleShiftDiff_space.dptr)
-      Storage::Get()->Free(scaleShiftDiff_space);
-  }
-  static std::string getName() {
-    return "MKLBatchNormOp";
-  }
-
- private:
-  void LayerSetUp(const mshadow::Tensor<xpu, 4, DType> &data,
-                  const mshadow::Tensor<xpu, 4, DType> &out) {
-    eps_ = param_.eps;
-    size_t dim = 4, sizes[4], strides[4];
-    channels_ = data.shape_[1];
-    height_ = data.shape_[2];
-    width_ = data.shape_[3];
-    num_ = data.shape_[0];
-
-    sizes[0] = width_;
-    sizes[1] = height_;
-    sizes[2] = channels_;
-    sizes[3] = num_;
-
-    strides[0] = 1;
-    strides[1] = sizes[0];
-    strides[2] = sizes[0] * sizes[1];
-    strides[3] = sizes[0] * sizes[1] * sizes[2];
-
-    // Names are for debugging only
-    fwd_bottom_data->name = "fwd_bottom_data   @ " + getName();
-    fwd_top_data->name = "fwd_top_data      @ " + getName();
-    bwd_bottom_diff->name = "bwd_bottom_diff   @ " + getName();
-    bwd_top_diff->name = "bwd_top_diff      @ " + getName();
-
-    dnnError_t e;
-    e = dnnLayoutCreate<DType>(&layout_usr_, dim, sizes, strides);
-    CHECK_EQ(e, E_SUCCESS);
-
-    fwd_bottom_data->create_user_layout(dim, sizes, strides);
-    fwd_top_data->create_user_layout(dim, sizes, strides);
-    bwd_bottom_diff->create_user_layout(dim, sizes, strides);
-    bwd_top_diff->create_user_layout(dim, sizes, strides);
-
-    // Primitives will be allocated during the first fwd pass
-    batchNormFwdInference = NULL;
-    batchNormFwdTraining = NULL;
-    batchNormBwdScaleShift = NULL;
-    int scaleShift_size = channels_*2*sizeof(DType);
-    scaleShift_space = Storage::Get()->Alloc(scaleShift_size, Context::CPU());
-    scaleShiftDiff_space = Storage::Get()->Alloc(scaleShift_size, Context::CPU());
-    DType * scaleShift_buf = reinterpret_cast<DType*>(scaleShift_space.dptr);
-    /*!use_weight_bias_*/
-    for (int i = 0; i < channels_; i++) {
-        scaleShift_buf[i] = 1.0;
-        scaleShift_buf[channels_ + i] = 0;
-    }
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 3);
-    CHECK_EQ(aux_states.size(), 2);
-    if (ctx.is_train) {
-      CHECK_EQ(out_data.size(), 3);
-      CHECK_EQ(req.size(), 3);
-    } else {
-      CHECK_GE(out_data.size(), 1);
-      CHECK_GE(req.size(), 1);
-      CHECK_EQ(req[batchnorm::kOut], kWriteTo);
-    }
-
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType>  data;
-    Tensor<xpu, 4, DType>  out;
-    if (in_data[batchnorm::kData].ndim() == 2) {
-      Shape<4> dshape = Shape4(in_data[batchnorm::kData].shape_[0],
-                               in_data[batchnorm::kData].shape_[1], 1, 1);
-      data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_data[batchnorm::kData], dshape, s);
-      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[batchnorm::kOut], dshape, s);
-    } else {
-      data = mkl_experimental_direct_get<xpu, 4, DType>(in_data[batchnorm::kData], s);
-      out = mkl_experimental_direct_get<xpu, 4, DType>(out_data[batchnorm::kOut], s);
-    }
-
-    // const real_t scale = static_cast<real_t>(in_data[batchnorm::kData].shape_[1]) /
-    //   static_cast<real_t>(in_data[batchnorm::kData].shape_.Size());
-
-    Tensor<xpu, 1, DType> slope = in_data[batchnorm::kGamma].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> bias = in_data[batchnorm::kBeta].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> moving_mean = aux_states[batchnorm::kMovingMean].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> moving_var = aux_states[batchnorm::kMovingVar].get<xpu, 1, DType>(s);
-
-    if (param_.fix_gamma)
-      slope = 1.f;
-
-    dnnError_t e;
-    if (!init_mkldnn_) {
-      LayerSetUp(data, out);
-      init_mkldnn_ = true;
-    }
-    void* bottom_data = NULL;
-#if MKL_EXPERIMENTAL == 1
-    bottom_data =
-          reinterpret_cast<void *>(mkl_prv_data<DType>(in_data[batchnorm::kData]));
-#endif
-    int bwd_flags = dnnUseScaleShift;
-    if (param_.use_global_stats)
-      bwd_flags = dnnUseScaleShift | dnnUseInputMeanVariance;
-#if MKL_EXPERIMENTAL == 1
-    if (NULL != bottom_data) {
-      // Is it the first pass? Create a primitive.
-      if (batchNormFwdInference == NULL) {
-        std::shared_ptr<MKLMemHolder> bottom_data_mem = in_data[batchnorm::kData].Mkl_mem_;
-        std::shared_ptr<PrvMemDescr> bottom_prv_desc = bottom_data_mem->get_prv_descriptor();
-        CHECK(bottom_prv_desc->get_descr_type() == PrvMemDescr::PRV_DESCR_MKL2017);
-        std::shared_ptr<MKLData<DType> > mem_descr
-          = std::static_pointer_cast<MKLData<DType>>(bottom_prv_desc);
-        CHECK(mem_descr != NULL);
-        fwd_bottom_data = mem_descr;
-
-        e = dnnBatchNormalizationCreateForward_v2<DType>(
-             &batchNormFwdInference, NULL, mem_descr->layout_int, eps_,
-             dnnUseInputMeanVariance | dnnUseScaleShift);
-        CHECK_EQ(e, E_SUCCESS);
-
-        e = dnnBatchNormalizationCreateForward_v2<DType>(
-              &batchNormFwdTraining, NULL, mem_descr->layout_int, eps_,
-              dnnUseScaleShift);
-        CHECK_EQ(e, E_SUCCESS);
-
-        fwd_top_data->create_internal_layout(batchNormFwdInference, dnnResourceDst);
-        bwd_top_diff->create_internal_layout(batchNormFwdInference, dnnResourceDst);
-        bwd_bottom_diff->create_internal_layout(batchNormFwdInference, dnnResourceSrc);
-
-        e = dnnBatchNormalizationCreateBackward_v2<DType>(
-                &batchNormBwdScaleShift, NULL, mem_descr->layout_int, eps_, bwd_flags);
-        CHECK_EQ(e, E_SUCCESS);
-      }
-    }
-#endif
-    if (NULL == bottom_data) {
-      if (batchNormFwdInference == NULL) {
-        e = dnnBatchNormalizationCreateForward_v2<DType>(
-          &batchNormFwdInference, NULL, layout_usr_, eps_,
-          dnnUseInputMeanVariance | dnnUseScaleShift);
-        CHECK_EQ(e, E_SUCCESS);
-
-        e = dnnBatchNormalizationCreateForward_v2<DType>(
-              &batchNormFwdTraining, NULL, layout_usr_, eps_, dnnUseScaleShift);
-        CHECK_EQ(e, E_SUCCESS);
-
-        e = dnnBatchNormalizationCreateBackward_v2<DType>(
-              &batchNormBwdScaleShift, NULL, layout_usr_, eps_, bwd_flags);
-        CHECK_EQ(e, E_SUCCESS);
-      }
-      bottom_data = reinterpret_cast<void *>(data.dptr_);
-    }
-
-    DType * scaleShift_buf = reinterpret_cast<DType*>(scaleShift_space.dptr);
-     // use_weight_bias_
-    for (int i = 0; i < channels_; i++) {
-        scaleShift_buf[i] = (slope.dptr_)[i];
-    }
-    for (int i = 0; i < channels_; i++) {
-      scaleShift_buf[channels_ + i] = (bias.dptr_)[i];
-    }
-
-    void* BatchNorm_res[dnnResourceNumber];
-    BatchNorm_res[dnnResourceSrc] = bottom_data;
-    BatchNorm_res[dnnResourceScaleShift] = scaleShift_space.dptr;
-
-    BatchNorm_res[dnnResourceDst] = fwd_top_data->get_output_ptr(out.dptr_,
-      fwd_top_data, out_data[batchnorm::kOut]);
-    if (ctx.is_train && !param_.use_global_stats) {
-      Tensor<xpu, 1, DType> mean = out_data[batchnorm::kMean].get<xpu, 1, DType>(s);
-      Tensor<xpu, 1, DType> var = out_data[batchnorm::kVar].get<xpu, 1, DType>(s);
-      CHECK(req[batchnorm::kMean] == kNullOp || req[batchnorm::kMean] == kWriteTo);
-      CHECK(req[batchnorm::kVar] == kNullOp || req[batchnorm::kVar] == kWriteTo);
-      BatchNorm_res[dnnResourceMean] = mean.dptr_;
-      BatchNorm_res[dnnResourceVariance] = var.dptr_;
-      e = dnnExecute<DType>(batchNormFwdTraining, BatchNorm_res);
-      CHECK_EQ(e, E_SUCCESS);
-    } else {
-      BatchNorm_res[dnnResourceMean] = moving_mean.dptr_;
-      BatchNorm_res[dnnResourceVariance] = moving_var.dptr_;
-      e = dnnExecute<DType>(batchNormFwdInference, BatchNorm_res);
-      CHECK_EQ(e, E_SUCCESS);
-    }
-
-#if MKL_EXPERIMENTAL == 0
-    if (fwd_top_data->conversion_needed()) {
-      fwd_top_data->convert_from_prv(out.dptr_);
-    }
-#endif
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1);
-    CHECK_EQ(in_data.size(), 3);
-    CHECK_EQ(out_data.size(), 3);
-    CHECK_EQ(in_grad.size(), 3);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> data, grad, grad_in;
-
-    if (in_data[batchnorm::kData].ndim() == 2) {
-      Shape<4> dshape = Shape4(out_grad[batchnorm::kOut].shape_[0],
-                               out_grad[batchnorm::kOut].shape_[1], 1, 1);
-      data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_data[batchnorm::kData], dshape, s);
-      grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_grad[batchnorm::kOut], dshape, s);
-      grad_in = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_grad[batchnorm::kData], dshape, s);
-    } else {
-      data = mkl_experimental_direct_get<xpu, 4, DType>(in_data[batchnorm::kData], s);
-      grad = mkl_experimental_direct_get<xpu, 4, DType>(out_grad[batchnorm::kOut], s);
-      grad_in = mkl_experimental_direct_get<xpu, 4, DType>(in_grad[batchnorm::kData], s);
-    }
-
-    Tensor<xpu, 1, DType> slope = in_data[batchnorm::kGamma].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> gslope = in_grad[batchnorm::kGamma].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> gbias = in_grad[batchnorm::kBeta].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> mean = out_data[batchnorm::kMean].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> var = out_data[batchnorm::kVar].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> moving_mean = aux_states[batchnorm::kMovingMean].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> moving_var = aux_states[batchnorm::kMovingVar].get<xpu, 1, DType>(s);
-
-    if (param_.fix_gamma)  slope = 1.f;
-
-    void* bottom_data = NULL;
-#if MKL_EXPERIMENTAL == 1
-    bottom_data = reinterpret_cast<void *>(mkl_prv_data<DType>(in_data[batchnorm::kData]));
-#endif
-    if (NULL == bottom_data)
-      bottom_data = reinterpret_cast<void *>(data.dptr_);
-
-    dnnError_t e;
-    void* BatchNorm_res[dnnResourceNumber];
-    BatchNorm_res[dnnResourceSrc] = bottom_data;
-    BatchNorm_res[dnnResourceScaleShift] = scaleShift_space.dptr;
-    if (ctx.is_train && !param_.use_global_stats) {
-      int size = mean.size(0);  // Tensor<xpu, 1, DType>
-      float * moving_mean_ptr = reinterpret_cast<float*>(moving_mean.dptr_);
-      float * mean_ptr = reinterpret_cast<float*>(mean.dptr_);
-      float * moving_var_ptr = reinterpret_cast<float*>(moving_var.dptr_);
-      float * var_ptr = reinterpret_cast<float*>(var.dptr_);
-      float minus_mom = (1 - param_.momentum);
-      for (int i = 0; i < size; i++) {
-        moving_mean_ptr[i] = moving_mean_ptr[i] * param_.momentum
-          + mean_ptr[i] * minus_mom;
-      }
-      for (int i = 0; i < size; i++) {
-        moving_var_ptr[i] = moving_var_ptr[i] * param_.momentum
-          + var_ptr[i] * minus_mom;
-      }
-      BatchNorm_res[dnnResourceMean] = mean.dptr_;
-      BatchNorm_res[dnnResourceVariance] = var.dptr_;
-    } else {
-      BatchNorm_res[dnnResourceMean] = moving_mean.dptr_;
-      BatchNorm_res[dnnResourceVariance] = moving_var.dptr_;
-    }
-
-
-    BatchNorm_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr(grad_in.dptr_,
-      bwd_bottom_diff, in_grad[batchnorm::kData]);
-    BatchNorm_res[dnnResourceDiffDst] = bwd_top_diff->get_converted_prv(grad.dptr_,
-             true, out_grad[batchnorm::kOut]);
-    BatchNorm_res[dnnResourceDiffScaleShift] = scaleShiftDiff_space.dptr;
-    e = dnnExecute<DType>(batchNormBwdScaleShift, BatchNorm_res);
-    CHECK_EQ(e, E_SUCCESS);
-#if MKL_EXPERIMENTAL == 0
-    if (bwd_bottom_diff->conversion_needed()) {
-      bwd_bottom_diff->convert_from_prv(grad_in.dptr_);
-    }
-#endif
-    DType * scaleShiftDiff_buf = reinterpret_cast<DType*>(scaleShiftDiff_space.dptr);
-    if (!param_.fix_gamma) {
-      // Store ScaleShift blobs
-      DType* diff_scale = gslope.dptr_;
-      for (int i = 0; i < channels_; i++) {
-        diff_scale[i] = scaleShiftDiff_buf[i];
-      }
-    } else {
-      int gslope_size = gslope.size(0);
-      float * gslope_ptr = reinterpret_cast<float*>(gslope.dptr_);
-      for (int i = 0; i < gslope_size; i++) {
-        *gslope_ptr++ = 0.0f;
-      }
-    }
-    DType* diff_shift = gbias.dptr_;
-    for (int i = 0; i < channels_; i++) {
-      diff_shift[i] = scaleShiftDiff_buf[channels_ + i];
-    }
-  }
-
- private:
-  BatchNormParam param_;
-  DType eps_;
-  bool use_weight_bias_;
-
-  int num_;
-  int channels_;
-  int height_;
-  int width_;
-  bool init_mkldnn_ = false;
-  std::shared_ptr<MKLData<DType> > fwd_top_data;
-  std::shared_ptr<MKLData<DType> > fwd_bottom_data;
-  std::shared_ptr<MKLData<DType> > bwd_top_diff;
-  std::shared_ptr<MKLData<DType> > bwd_bottom_diff;
-  dnnPrimitive_t batchNormFwdInference = NULL;
-  dnnPrimitive_t batchNormFwdTraining = NULL;
-  dnnPrimitive_t batchNormBwdScaleShift = NULL;
-  Storage::Handle scaleShift_space;
-  Storage::Handle scaleShiftDiff_space;
-  dnnLayout_t layout_usr_ = NULL;
-};  // class BatchNormOp
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_
diff --git a/src/operator/mkl/mkl_concat-inl.h b/src/operator/mkl/mkl_concat-inl.h
deleted file mode 100644
index 1ed1e81d1303..000000000000
--- a/src/operator/mkl/mkl_concat-inl.h
+++ /dev/null
@@ -1,314 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_concat-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <cstring>
-#include <map>
-#include <string>
-#include <vector>
-#include <utility>
-#include "../operator_common.h"
-#include "../channel_op_common.h"
-#include "./mkl_util-inl.h"
-namespace mxnet {
-namespace op {
-
-
-template<typename xpu, typename DType>
-class MKLConcatOp : public Operator {
- public:
-  static std::string getName() {
-    return "MKLConcatOp";
-  }
-  explicit MKLConcatOp(ConcatParam param)
-    : size_(param.num_args), dimension_(param.dim), init_mkldnn_(false) {
-    concatFwd_ = static_cast<dnnPrimitive_t>(NULL);
-    concatBwd_ = static_cast<dnnPrimitive_t>(NULL);
-    fwd_top_data_ = MKLData<DType>::create();
-    bwd_top_diff_ = MKLData<DType>::create();
-
-    num_concats_ = param.num_args;
-  }
-  virtual ~MKLConcatOp() {
-    dnnDelete<DType>(concatFwd_);
-    dnnDelete<DType>(concatBwd_);
-  }
-
- private:
-  void LayerSetUp(const std::vector<mshadow::Tensor<xpu, 4, DType> > &data,
-                  const mshadow::Tensor<xpu, 4, DType> &out,
-                  size_t data_shape_size, size_t *split_channels_) {
-    size_t dim_src = data_shape_size;
-    size_t dim_dst = dim_src;
-    num_concats_ = size_;
-    channels_ = 0;
-
-    for (size_t i = 1; i < num_concats_; ++i) {
-      for (size_t j = 1; j < data_shape_size; ++j) {
-        if (j == dimension_) continue;
-        CHECK_EQ(data[0].shape_[j], data[i].shape_[j]);
-      }
-    }
-
-    for (size_t i = 0; i < num_concats_; ++i) {
-      CHECK_EQ((int)dim_src, data[i].shape_.kDimension);
-
-      fwd_bottom_data_.push_back(MKLData<DType>::create());
-      bwd_bottom_diff_.push_back(MKLData<DType>::create());
-      fwd_bottom_data_[i]->name = "fwd_bottom_data_[i]";
-      bwd_bottom_diff_[i]->name = "bwd_bottom_data[i]";
-
-      size_t *sizes_src = new size_t[dim_src];
-      size_t *strides_src = new size_t[dim_src];
-      for (size_t d = 0; d < dim_src; ++d) {
-        sizes_src[d] = data[i].shape_[dim_src - d - 1];
-        strides_src[d] = (d == 0) ? 1 : strides_src[d - 1] * sizes_src[d - 1];
-      }
-
-      split_channels_[i] = data[i].shape_[1];
-      channels_ += split_channels_[i];
-      fwd_bottom_data_[i]->create_user_layout(dim_src, sizes_src, strides_src);
-      bwd_bottom_diff_[i]->create_user_layout(dim_src, sizes_src, strides_src);
-      delete[] sizes_src;
-      delete[] strides_src;
-    }
-    size_t *sizes_dst = new size_t[dim_dst];
-    size_t *strides_dst = new size_t[dim_dst];
-    for (size_t d = 0; d < dim_dst; ++d) {
-      if (d == 2)
-        sizes_dst[d] = channels_;
-      else
-        sizes_dst[d] = data[0].shape_[dim_dst - 1 - d];
-      strides_dst[d] = (d == 0) ? 1 : strides_dst[d - 1] * sizes_dst[d - 1];
-    }
-    bwd_top_diff_->create_user_layout(dim_dst, sizes_dst, strides_dst);
-    fwd_top_data_->create_user_layout(dim_dst, sizes_dst, strides_dst);
-    delete[] sizes_dst;
-    delete[] strides_dst;
-    concatFwd_ = NULL;
-    concatBwd_ = NULL;
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(static_cast<int>(in_data.size()), size_);
-    CHECK_EQ(out_data.size(), 1);
-    CHECK_LT(dimension_, (size_t)in_data[concat_enum::kData0].ndim());
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    std::vector<Tensor<xpu, 4, DType> > data(size_);
-    Tensor<xpu, 4, DType> out;
-    if (in_data[0].ndim() == 2) {
-      for (int i = 0; i < size_; ++i) {
-        Shape<4> dshape = Shape4(in_data[i].shape_[0],
-                                 in_data[i].shape_[1], 1, 1);
-        data[i] = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-          in_data[i], dshape, s);
-      }
-      Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0],
-                               out_data[concat_enum::kOut].shape_[1], 1, 1);
-      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[concat_enum::kOut], dshape, s);
-    } else if (in_data[0].ndim() == 3) {
-      for (int i = 0; i < size_; ++i) {
-        Shape<4> dshape = Shape4(in_data[i].shape_[0],
-          in_data[i].shape_[1], in_data[i].shape_[2], 1);
-        data[i] = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-          in_data[i], dshape, s);
-      }
-      Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0],
-        out_data[concat_enum::kOut].shape_[1],
-        out_data[concat_enum::kOut].shape_[2], 1);
-      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[concat_enum::kOut], dshape, s);
-    } else {
-      for (int i = 0; i < size_; ++i) {
-        data[i] = mkl_experimental_direct_get<xpu, 4, DType>(in_data[i], s);
-      }
-      out = mkl_experimental_direct_get<xpu, 4, DType>(out_data[concat_enum::kOut], s);
-    }
-    size_t *split_channels_ = new size_t[num_concats_];
-    if (!init_mkldnn_) {
-      init_mkldnn_ = true;
-      LayerSetUp(data, out, 4, split_channels_);
-    }
-
-    dnnError_t e;
-    std::vector<void*> bottom_data;
-    bool isFirstPass = (concatFwd_ == NULL);
-    dnnLayout_t *layouts = NULL;
-    if (isFirstPass) {
-      layouts = new dnnLayout_t[num_concats_];
-    }
-
-    for (size_t i = 0; i < num_concats_; i++) {
-      void * bottom_i = NULL;
-#if MKL_EXPERIMENTAL == 1
-      bottom_i = mkl_prv_data<DType>(in_data[i]);
-      if (bottom_i != NULL) {
-        if (isFirstPass) {
-          std::shared_ptr<MKLData<DType> > mem_descr =
-            mkl_get_mem_desc<DType>(in_data[i].Mkl_mem_);
-          fwd_bottom_data_[i] = mem_descr;
-          layouts[i] = mem_descr->layout_int;
-        }
-      }
-#endif
-      if (bottom_i == NULL) {
-        bottom_i = data[i].dptr_;
-        if (isFirstPass) {
-          layouts[i] = fwd_bottom_data_[i]->layout_usr;
-        }
-      }
-
-      bottom_data.push_back(reinterpret_cast<void *>(bottom_i));
-    }
-
-    if (isFirstPass) {
-      e = dnnConcatCreate<DType>(&concatFwd_, NULL, num_concats_, layouts);
-      CHECK_EQ(e, E_SUCCESS);
-
-      fwd_top_data_->create_internal_layout(concatFwd_, dnnResourceDst);
-      bwd_top_diff_->create_internal_layout(concatFwd_, dnnResourceDst);
-
-      e = dnnSplitCreate<DType>(&concatBwd_, NULL, num_concats_,
-            bwd_top_diff_->layout_int, split_channels_);
-      CHECK_EQ(e, E_SUCCESS);
-
-      for (size_t n = 0; n < num_concats_; ++n) {
-        fwd_bottom_data_[n]->create_internal_layout(concatFwd_,
-          (dnnResourceType_t)(dnnResourceMultipleSrc + n));
-        bwd_bottom_diff_[n]->create_internal_layout(concatBwd_,
-          (dnnResourceType_t)(dnnResourceMultipleDst + n));
-      }
-    }
-    delete[] layouts;
-
-    void *concat_res[dnnResourceNumber];
-    for (size_t i = 0; i < num_concats_; ++i) {
-      concat_res[dnnResourceMultipleSrc + i]
-        = reinterpret_cast<void*>(bottom_data[i]);
-    }
-
-    concat_res[dnnResourceDst] = fwd_top_data_->get_output_ptr(out.dptr_,
-      fwd_top_data_, out_data[concat_enum::kOut]);
-    e = dnnExecute<DType>(concatFwd_, concat_res);
-    CHECK_EQ(e, E_SUCCESS);
-    delete[] split_channels_;
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1);
-    CHECK_EQ(in_grad.size(), static_cast<size_t>(size_));
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    std::vector<Tensor<xpu, 4, DType> > grad_in(size_);
-    Tensor<xpu, 4, DType> grad;
-    if (in_grad[0].ndim() == 2) {
-      Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0],
-        out_grad[concat_enum::kOut].shape_[1], 1, 1);
-      grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_grad[concat_enum::kOut], dshape, s);
-      for (int i = 0; i < size_; ++i) {
-        dshape = Shape4(in_grad[i].shape_[0],
-          in_grad[i].shape_[1], 1, 1);
-        grad_in[i] = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-          in_grad[i], dshape, s);
-      }
-    } else if (in_grad[0].ndim() == 3) {
-      Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0],
-        out_grad[concat_enum::kOut].shape_[1],
-        out_grad[concat_enum::kOut].shape_[2], 1);
-      grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_grad[concat_enum::kOut], dshape, s);
-      for (int i = 0; i < size_; ++i) {
-        dshape = Shape4(in_grad[i].shape_[0],
-          in_grad[i].shape_[1], in_grad[i].shape_[2], 1);
-        grad_in[i] = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-          in_grad[i], dshape, s);
-      }
-    } else {
-      grad = mkl_experimental_direct_get<xpu, 4, DType>(out_grad[concat_enum::kOut], s);
-      for (int i = 0; i < size_; ++i) {
-        grad_in[i] = mkl_experimental_direct_get<xpu, 4, DType>(in_grad[i], s);
-      }
-    }
-
-    int need_bwd = 0;
-    for (size_t n = 0; n < num_concats_; n++) {
-      need_bwd += req[n];
-    }
-    if (!need_bwd) {
-      return;
-    }
-
-    dnnError_t e;
-    void *concat_res[dnnResourceNumber];
-    concat_res[dnnResourceSrc] = bwd_top_diff_->get_converted_prv(grad.dptr_, true,
-      out_grad[concat_enum::kOut]);
-    for (size_t i = 0; i < num_concats_; ++i) {
-      concat_res[dnnResourceMultipleDst + i] = bwd_bottom_diff_[i]->get_output_ptr(
-        grad_in[i].dptr_, bwd_bottom_diff_[i], in_grad[i]);
-    }
-    e = dnnExecute<DType>(concatBwd_, concat_res);
-    CHECK_EQ(e, E_SUCCESS);
-  }
-
- private:
-  int size_;
-  size_t dimension_;
-
-  bool init_mkldnn_;
-
-  dnnPrimitive_t concatFwd_;
-  dnnPrimitive_t concatBwd_;
-  std::shared_ptr<MKLData<DType> > fwd_top_data_;
-  std::vector< std::shared_ptr<MKLData<DType> > > fwd_bottom_data_;
-  std::shared_ptr<MKLData<DType> > bwd_top_diff_;
-  std::vector< std::shared_ptr<MKLData<DType> > > bwd_bottom_diff_;
-
-
-  size_t width_;
-  size_t height_;
-  size_t channels_;
-  size_t num_;
-  size_t num_concats_;
-};  // class MKLConcatOp
-}  // namespace op
-}  // namespace mxnet
-
-#endif  // MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_
diff --git a/src/operator/mkl/mkl_convolution-inl.h b/src/operator/mkl/mkl_convolution-inl.h
deleted file mode 100644
index 813d061f172b..000000000000
--- a/src/operator/mkl/mkl_convolution-inl.h
+++ /dev/null
@@ -1,490 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_convolution-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_
-#include <mxnet/storage.h>
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "../operator_common.h"
-#include "../nn/convolution-inl.h"
-#include "./mkl_util-inl.h"
-
-namespace mxnet {
-namespace op {
-
-template<typename xpu, typename DType>
-class MKLConvolutionOp : public Operator {
- public:
-  static std::string getName() {
-    return "MKLConvolutionOp";
-  }
-  void SetupBuffer() {
-    convolutionBwdBias = static_cast<dnnPrimitive_t>(NULL);
-    convolutionBwdFilter = static_cast<dnnPrimitive_t>(NULL);
-    convolutionBwdData = static_cast<dnnPrimitive_t>(NULL);
-    convolutionFwd = static_cast<dnnPrimitive_t>(NULL);
-    fwd_bottom_data = MKLData<DType>::create();
-    fwd_top_data = MKLData<DType>::create();
-    fwd_filter_data = MKLData<DType>::create();
-    fwd_bias_data = MKLData<DType>::create();
-    bwdd_top_diff = MKLData<DType>::create();
-    bwdd_bottom_diff = MKLData<DType>::create();
-    bwdd_filter_data = MKLData<DType>::create();
-    bwdf_top_diff = MKLData<DType>::create();
-    bwdf_filter_diff = MKLData<DType>::create();
-    bwdf_bottom_data = MKLData<DType>::create();
-    bwdb_top_diff = MKLData<DType>::create();
-    bwdb_bias_diff = MKLData<DType>::create();
-    // Names are for debugging purposes only.
-    fwd_bottom_data->name = "fwd_bottom_data   @ " + this->getName();
-    fwd_top_data->name = "fwd_top_data      @ " + this->getName();
-    fwd_filter_data->name = "fwd_filter_data   @ " + this->getName();
-    fwd_bias_data->name = "fwd_bias_data     @ " + this->getName();
-    bwdd_top_diff->name = "bwdd_top_diff     @ " + this->getName();
-    bwdd_bottom_diff->name = "bwdd_bottom_diff  @ " + this->getName();
-    bwdd_filter_data->name = "bwdd_filter_data  @ " + this->getName();
-    bwdf_top_diff->name = "bwdf_top_diff     @ " + this->getName();
-    bwdf_bottom_data->name = "bwdf_bottom_data  @ " + this->getName();
-    bwdf_filter_diff->name = "bwdf_filter_diff  @ " + this->getName();
-    bwdb_top_diff->name = "bwdb_top_diff     @ " + this->getName();
-    bwdb_bias_diff->name = "bwdb_bias_diff    @ " + this->getName();
-  }
-
-  explicit MKLConvolutionOp(ConvolutionParam p):
-                            convolutionFwd(NULL),
-                            convolutionBwdData(static_cast<dnnPrimitive_t>(NULL)),
-                            convolutionBwdFilter(static_cast<dnnPrimitive_t>(NULL)),
-                            convolutionBwdBias(static_cast<dnnPrimitive_t>(NULL)) {
-    this->param_ = p;
-    init_mkldnn_ = false;
-    // convert MBytes first to Bytes and then to elements.
-    param_.workspace = (param_.workspace << 20) / sizeof(DType);
-    SetupBuffer();
-  }
-  void ReleaseBuffer() {
-    if (convolutionFwd != NULL) {
-     dnnDelete<DType>(convolutionFwd);
-     convolutionFwd = NULL;
-    }
-    if (convolutionBwdData != NULL) {
-     dnnDelete<DType>(convolutionBwdData);
-     convolutionBwdData = NULL;
-    }
-    if (convolutionBwdFilter != NULL) {
-     dnnDelete<DType>(convolutionBwdFilter);
-     convolutionBwdFilter = NULL;
-    }
-    if (!param_.no_bias && convolutionBwdBias != NULL) {
-     dnnDelete<DType>(convolutionBwdBias);
-     convolutionBwdBias = NULL;
-    }
-  }
-  virtual ~MKLConvolutionOp() {
-    ReleaseBuffer();
-  }
-
- private:
-  void LayerSetUp(const mshadow::Tensor<xpu, 4, DType> &data,
-                  const mshadow::Tensor<xpu, 4, DType> &out) {
-    this->width_ = data.shape_[3];
-    this->height_ = data.shape_[2];
-    this->channels_ = data.shape_[1];
-    this->num_ = data.shape_[0];
-    this->group_ = param_.num_group;
-    this->width_out_ = out.shape_[3];
-    this->height_out_ = out.shape_[2];
-    int channel_out_ = out.shape_[1];
-    this->num_output_ = channel_out_;
-    kernel_w_ = param_.kernel[1];
-    kernel_h_ = param_.kernel[0];
-    stride_w_ = param_.stride[1];
-    stride_h_ = param_.stride[0];
-    pad_w_ = param_.pad[1];
-    pad_h_ = param_.pad[0];
-    int status;
-    size_t n, g;
-    size_t iw, ih, ic;
-    size_t ow, oh, oc;
-    size_t kw, kh;
-    size_t dimension = 4;
-    g = std::max(this->group_, 1);
-    n = this->num_;
-    iw = this->width_;
-    ih = this->height_;
-    ic = this->channels_;
-    ow = this->width_out_;
-    oh = this->height_out_;
-    oc = this->num_output_;
-    kw = this->kernel_w_;
-    kh = this->kernel_h_;
-    oc = this->num_output_;
-    size_t bdata_sizes[4] = { iw, ih, ic, n };
-    size_t bdata_strides[4] = { 1, iw, iw*ih, iw*ih*ic };
-    /* starting with MKL 2017 Gold in case of groups filter layout
-    * becomes 5D, i.e. groups become a separate dimension */
-    size_t g_mkl2017 = g;
-    size_t f_dimension = dimension + (g != 1);
-    if (getMKLBuildDate() < 20160701) {
-     g_mkl2017 = 1;
-     f_dimension = dimension;
-    }
-    size_t fdata_sizes[5] = { kw, kh, ic / g, oc / g_mkl2017, g_mkl2017 };
-    size_t fdata_strides[5] = { 1, kw, kw*kh, kw*kh*ic / g, kw*kh*ic / g*oc / g };
-    size_t bias_sizes[1] = { oc };
-    size_t bias_strides[1] = { 1 };
-    size_t tdata_sizes[4] = { ow, oh, oc, n };
-    size_t tdata_strides[4] = { 1, ow, ow*oh, ow*oh*oc };
-    size_t convolutionStrides[2] = { this->stride_w_, this->stride_h_ };
-    int    inputOffset[2] = { -this->pad_w_, -this->pad_h_ };
-    // Names are for debugging purposes only.
-    /*** convolution section ***/
-    if (!param_.no_bias) {
-      status = dnnGroupsConvolutionCreateForwardBias<DType>(&convolutionFwd,
-                                                            NULL,
-                                                            dnnAlgorithmConvolutionDirect,
-                                                            g,
-                                                            dimension,
-                                                            bdata_sizes,
-                                                            tdata_sizes,
-                                                            fdata_sizes,
-                                                            convolutionStrides,
-                                                            inputOffset,
-                                                            dnnBorderZeros);
-    } else {
-      status = dnnGroupsConvolutionCreateForward<DType>(&convolutionFwd,
-                                                        NULL,
-                                                        dnnAlgorithmConvolutionDirect,
-                                                        g,
-                                                        dimension,
-                                                        bdata_sizes,
-                                                        tdata_sizes,
-                                                        fdata_sizes,
-                                                        convolutionStrides,
-                                                        inputOffset,
-                                                        dnnBorderZeros);
-    }
-    CHECK_EQ(status, 0)
-     << "Failed dnnCreateConvolution<DType>(dnnForward) with status "
-     << status << "\n";
-    fwd_bottom_data->create_layouts(convolutionFwd, dnnResourceSrc, dimension,
-                                    bdata_sizes, bdata_strides);
-    fwd_top_data->create_layouts(convolutionFwd, dnnResourceDst, dimension,
-                                 tdata_sizes, tdata_strides);
-    fwd_filter_data->create_layouts(convolutionFwd, dnnResourceFilter,
-                                    f_dimension, fdata_sizes, fdata_strides);
-    if (!param_.no_bias)
-      fwd_bias_data->create_layouts(convolutionFwd, dnnResourceBias, 1,
-                                    bias_sizes, bias_strides);
-    /*
-    * Backward by data layer setup
-    */
-    status = dnnGroupsConvolutionCreateBackwardData<DType>(&convolutionBwdData,
-                                                           NULL,
-                                                           dnnAlgorithmConvolutionDirect,
-                                                           g,
-                                                           dimension,
-                                                           bdata_sizes,
-                                                           tdata_sizes,
-                                                           fdata_sizes,
-                                                           convolutionStrides,
-                                                           inputOffset,
-                                                           dnnBorderZeros);
-    CHECK_EQ(status, 0)
-     << "Failed dnnConvolutionCreateBackwardData with status "
-     << status << "\n";
-    bwdd_bottom_diff->create_layouts(convolutionBwdData, dnnResourceDiffSrc,
-                                     dimension, bdata_sizes, bdata_strides);
-    bwdd_top_diff->create_layouts(convolutionBwdData, dnnResourceDiffDst,
-                                  dimension, tdata_sizes, tdata_strides);
-    bwdd_filter_data->create_layouts(convolutionBwdData, dnnResourceFilter,
-                                     f_dimension, fdata_sizes, fdata_strides);
-    /*
-    * Backward by filter layer setup
-    */
-    status = dnnGroupsConvolutionCreateBackwardFilter<DType>(&convolutionBwdFilter,
-                                                             NULL,
-                                                             dnnAlgorithmConvolutionDirect,
-                                                             g,
-                                                             dimension,
-                                                             bdata_sizes,
-                                                             tdata_sizes,
-                                                             fdata_sizes,
-                                                             convolutionStrides,
-                                                             inputOffset,
-                                                             dnnBorderZeros);
-    CHECK_EQ(status, 0)
-     << "Failed dnnConvolutionCreateBackwardFilter with status "
-     << status << "\n";
-    bwdf_bottom_data->create_layouts(convolutionBwdFilter, dnnResourceSrc,
-                                     dimension, bdata_sizes, bdata_strides);
-    bwdf_top_diff->create_layouts(convolutionBwdFilter, dnnResourceDiffDst,
-                                  dimension, tdata_sizes, tdata_strides);
-    bwdf_filter_diff->create_layouts(convolutionBwdFilter, dnnResourceDiffFilter,
-                                     f_dimension, fdata_sizes, fdata_strides);
-    /*
-    * Backward by bias layer setup
-    */
-    if (!param_.no_bias) {
-      status = dnnGroupsConvolutionCreateBackwardBias<DType>(&convolutionBwdBias,
-                                                             NULL,
-                                                             dnnAlgorithmConvolutionDirect,
-                                                             g,
-                                                             dimension,
-                                                             tdata_sizes);
-     CHECK_EQ(status, 0)
-      << "Failed dnnConvolutionCreateBackwardBias with status "
-      << status << "\n";
-     bwdb_top_diff->create_layouts(convolutionBwdBias, dnnResourceDiffDst,
-                                   dimension, tdata_sizes, tdata_strides);
-     bwdb_bias_diff->create_layouts(convolutionBwdBias, dnnResourceDiffBias, 1,
-                                    bias_sizes, bias_strides);
-    }
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    DType *data_ptr = NULL;
-    DType *wmat_ptr = NULL;
-    DType *out_ptr = NULL;
-    Tensor<xpu, 4, DType> data =
-      mkl_experimental_direct_get<xpu, 4, DType>(in_data[conv::kData], s);
-    Tensor<xpu, 4, DType> out =
-      mkl_experimental_direct_get<xpu, 4, DType>(out_data[conv::kOut], s);
-    Tensor<xpu, 4, DType> wmat =
-      mkl_experimental_direct_get<xpu, 4, DType>(in_data[conv::kWeight], s);
-    if (!init_mkldnn_) {
-      LayerSetUp(data, out);
-      init_mkldnn_ = true;
-    }
-    CHECK_EQ(data.CheckContiguous(), true);
-    CHECK_EQ(wmat.CheckContiguous(), true);
-    CHECK_EQ(out.CheckContiguous(), true);
-    data_ptr = data.dptr_;
-    wmat_ptr = wmat.dptr_;
-    out_ptr = out.dptr_;
-    int status;
-    void *res_convolutionFwd[dnnResourceNumber];
-    res_convolutionFwd[dnnResourceSrc] =
-      fwd_bottom_data->get_converted_prv(data_ptr, false, in_data[conv::kData]);
-    res_convolutionFwd[dnnResourceFilter] =
-      fwd_filter_data->get_converted_prv(wmat_ptr, true, in_data[conv::kWeight]);
-    if (!param_.no_bias) {
-      Tensor<xpu, 1, DType> bias =
-        mkl_experimental_direct_get<xpu, 1, DType>(in_data[conv::kBias], s);
-      res_convolutionFwd[dnnResourceBias] =
-        fwd_bias_data->get_converted_prv(bias.dptr_, true, in_data[conv::kBias]);
-    }
-
-    res_convolutionFwd[dnnResourceDst] = fwd_top_data->get_output_ptr(out_ptr,
-      fwd_top_data, out_data[conv::kOut]);
-    status = dnnExecute<DType>(convolutionFwd, res_convolutionFwd);
-    CHECK_EQ(status, 0) << "Forward convolution failed with status " << status;
-#if MKL_EXPERIMENTAL == 0
-    if (fwd_top_data->conversion_needed()) {
-        fwd_top_data->convert_from_prv(out_ptr);
-    }
-#endif
-  }
-  void AddToModeAllocAndStoreBuffer(void *src, int blob_size, Storage::Handle *pws) {
-    int blob_byte_size = blob_size * sizeof(DType);
-    *pws = Storage::Get()->Alloc(blob_byte_size, Context::CPU());
-    memcpy(pws->dptr, src, blob_byte_size);
-  }
-  void AddToModeAddAndReleaseBuffer(Storage::Handle *pws, void *dst_, int blob_size) {
-    DType *dst = reinterpret_cast<DType*>(dst_);
-    DType *src = reinterpret_cast<DType*>(pws->dptr);
-#pragma omp parallel for
-    for (int i = 0; i < blob_size; i++) {
-      dst[i] += src[i];
-    }
-    if (pws->dptr)
-      Storage::Get()->Free(*pws);
-    pws->dptr = NULL;
-  }
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    if (param_.kernel.ndim() > 2) {
-      LOG(FATAL) << "Volume convolution is not implmented in mshadow";
-    }
-    CHECK_EQ(out_grad.size(), 1);
-    size_t expected = param_.no_bias == 0 ? 3 : 2;
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
-    CHECK_EQ(req.size(), expected);
-    CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> data =
-      mkl_experimental_direct_get<xpu, 4, DType>(in_data[conv::kData], s);
-    Shape<3> wmat_shape =
-      Shape3(param_.num_group,
-             param_.num_filter / param_.num_group,
-             data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]);
-    Tensor<xpu, 3, DType> wmat =
-      mkl_experimental_direct_get_with_shape<xpu, 3, DType>(
-      in_data[conv::kWeight], wmat_shape, s);
-    Tensor<xpu, 4, DType> grad =
-      mkl_experimental_direct_get<xpu, 4, DType>(out_grad[conv::kOut], s);
-    Tensor<xpu, 4, DType> gdata =
-      mkl_experimental_direct_get<xpu, 4, DType>(in_grad[conv::kData], s);
-    Tensor<xpu, 3, DType> gwmat =
-      mkl_experimental_direct_get_with_shape<xpu, 3, DType>(
-      in_grad[conv::kWeight], wmat_shape, s);
-
-    if (!init_mkldnn_) {
-      init_mkldnn_ = true;
-      LayerSetUp(data, grad);
-    }
-    int status;
-    if (req[0]) {
-      void *res_convolutionBwdData[dnnResourceNumber];
-      res_convolutionBwdData[dnnResourceDiffDst] =
-        bwdd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]);
-
-      res_convolutionBwdData[dnnResourceFilter] =
-        bwdd_filter_data->get_converted_prv(wmat.dptr_, false, in_data[conv::kWeight]);
-     Storage::Handle addtoWorkspace;
-     if (req[0] == kAddTo) {
-       // wait mkl support addto mode
-       AddToModeAllocAndStoreBuffer(gdata.dptr_, in_grad[conv::kData].Size(), &addtoWorkspace);
-     }
-
-     res_convolutionBwdData[dnnResourceDiffSrc] = bwdd_bottom_diff->get_output_ptr(gdata.dptr_,
-       bwdd_bottom_diff, in_grad[conv::kData]);
-     status = dnnExecute<DType>(convolutionBwdData, res_convolutionBwdData);
-     CHECK_EQ(status, 0) << "Backward Data conv failed with status " << status;
-#if MKL_EXPERIMENTAL == 0
-     if (bwdd_bottom_diff->conversion_needed()) {
-       bwdd_bottom_diff->convert_from_prv(gdata.dptr_);
-     }
-#endif
-     if (req[0] == kAddTo) {
-       if (bwdd_bottom_diff->conversion_needed()) {
-         bwdd_bottom_diff->convert_from_prv(gdata.dptr_);
-       }
-      AddToModeAddAndReleaseBuffer(&addtoWorkspace, gdata.dptr_, in_grad[conv::kData].Size());
-     }
-    }
-    if (req[1]) {
-      void *res_convolutionBwdFilter[dnnResourceNumber];
-
-      res_convolutionBwdFilter[dnnResourceDiffDst] =
-        bwdf_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]);
-
-      res_convolutionBwdFilter[dnnResourceSrc] =
-        bwdf_bottom_data->get_converted_prv(data.dptr_, false,
-          in_data[conv::kData]);
-     Storage::Handle addtoWorkspace;
-     if (req[1] == kAddTo) {
-       // wait mkl support addto mode
-       AddToModeAllocAndStoreBuffer(gwmat.dptr_, in_grad[conv::kWeight].Size(), &addtoWorkspace);
-     }
-
-     res_convolutionBwdFilter[dnnResourceDiffFilter] = bwdf_filter_diff->get_output_ptr(
-       gwmat.dptr_, bwdf_filter_diff, in_grad[conv::kWeight]);
-     status = dnnExecute<DType>(convolutionBwdFilter, res_convolutionBwdFilter);
-     CHECK_EQ(status, 0) << "Backward Filter conv failed with status " << status;
-#if MKL_EXPERIMENTAL == 0
-     if (bwdf_filter_diff->conversion_needed()) {
-       bwdf_filter_diff->convert_from_prv(gwmat.dptr_);
-     }
-#endif
-     if (req[1] == kAddTo) {
-       if (bwdf_filter_diff->conversion_needed()) {
-         bwdf_filter_diff->convert_from_prv(gwmat.dptr_);
-       }
-       AddToModeAddAndReleaseBuffer(&addtoWorkspace, gwmat.dptr_, in_grad[conv::kWeight].Size());
-     }
-    }
-    if (!param_.no_bias) {
-      Tensor<xpu, 1, DType> gbias =
-        mkl_experimental_direct_get<xpu, 1, DType>(in_grad[conv::kBias], s);
-      void *res_convolutionBwdBias[dnnResourceNumber];
-      res_convolutionBwdBias[dnnResourceDiffDst] =
-        bwdb_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]);
-
-      res_convolutionBwdBias[dnnResourceDiffBias] = bwdb_bias_diff->get_output_ptr(gbias.dptr_,
-        bwdb_bias_diff, in_grad[conv::kBias]);
-      status = dnnExecute<DType>(convolutionBwdBias, res_convolutionBwdBias);
-      CHECK_EQ(status, 0) << "Backward Bias failed with status " << status;
-#if MKL_EXPERIMENTAL == 0
-      if (bwdb_bias_diff->conversion_needed()) {
-        bwdb_bias_diff->convert_from_prv(gbias.dptr_);
-      }
-#endif
-    }
-  }
-
- private:
-  ConvolutionParam param_;
-  size_t width_,
-         height_,
-         width_out_,
-         height_out_,
-         kernel_w_,
-         kernel_h_,
-         stride_w_,
-         stride_h_;
-  int group_,
-      num_,
-      num_output_;
-  size_t channels_;
-  int pad_w_,
-      pad_h_;
-  bool init_mkldnn_;
-  dnnPrimitive_t convolutionFwd;
-  dnnPrimitive_t convolutionBwdData;
-  dnnPrimitive_t convolutionBwdFilter;
-  dnnPrimitive_t convolutionBwdBias;
-  /* Fwd step */
-  std::shared_ptr<MKLData<DType> > fwd_bottom_data, fwd_top_data, fwd_filter_data,
-                                   fwd_bias_data;
-  /* Bwd data step */
-  std::shared_ptr<MKLData<DType> > bwdd_top_diff, bwdd_bottom_diff;
-  std::shared_ptr<MKLData<DType> > bwdd_filter_data;
-  /* Bwd filter step */
-  std::shared_ptr<MKLData<DType> > bwdf_top_diff, bwdf_filter_diff;
-  std::shared_ptr<MKLData<DType> > bwdf_bottom_data;
-  std::shared_ptr<MKLData<DType> > bwdf_filter_diff_iter, bwdf2fwd_filter_diff,
-                                   bwdb_bias_diff_iter;
-  /* Bwd bias step */
-  std::shared_ptr<MKLData<DType> > bwdb_top_diff, bwdb_bias_diff;
-};  // class ConvolutionOp
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_
diff --git a/src/operator/mkl/mkl_cppwrapper.cc b/src/operator/mkl/mkl_cppwrapper.cc
deleted file mode 100644
index 507e5498c85b..000000000000
--- a/src/operator/mkl/mkl_cppwrapper.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_cppwrapper.cc
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-
-
-
-#include "mkl_cppwrapper.h"
-#include <stdio.h>
-#if MXNET_USE_MKL2017 == 1
-#include "mkl_service.h"
-
-int getMKLBuildDate() {
-    static int build = 0;
-    if (build == 0) {
-        MKLVersion v;
-        mkl_get_version(&v);
-        build = atoi(v.Build);
-        printf("MKL Build:%d\n", build);
-    }
-    return build;
-}
-
-bool enableMKLWarnGenerated() {
-  return false;
-}
-#endif  // MSHADOW_USE_MKL2017
diff --git a/src/operator/mkl/mkl_cppwrapper.h b/src/operator/mkl/mkl_cppwrapper.h
deleted file mode 100644
index 7d66f20ad308..000000000000
--- a/src/operator/mkl/mkl_cppwrapper.h
+++ /dev/null
@@ -1,1020 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_cppwrapper.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_
-#define MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_
-
-
-#include <stdarg.h>
-#include <stddef.h>
-#if MXNET_USE_MKL2017 == 1
-#include "mkl_dnn_types.h"
-#include "mkl_dnn.h"
-#include "mkl_version.h"
-
-
-extern int getMKLBuildDate();
-extern bool enableMKLWarnGenerated();
-
-
-template <typename Dtype> inline dnnError_t dnnLayoutCreate(
-    dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]);
-template <> inline dnnError_t dnnLayoutCreate<float>(
-    dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]) {
-    return dnnLayoutCreate_F32(pLayout, dimension, size, strides);
-}
-template <> inline dnnError_t dnnLayoutCreate<double>(
-    dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]) {
-    return dnnLayoutCreate_F64(pLayout, dimension, size, strides);
-}
-
-template <typename Dtype> inline dnnError_t dnnLayoutCreateFromPrimitive(
-    dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type);
-template <> inline dnnError_t dnnLayoutCreateFromPrimitive<float>(
-    dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type) {
-    return dnnLayoutCreateFromPrimitive_F32(pLayout, primitive, type);
-}
-template <> inline dnnError_t dnnLayoutCreateFromPrimitive<double>(
-    dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type) {
-    return dnnLayoutCreateFromPrimitive_F64(pLayout, primitive, type);
-}
-
-template <typename Dtype> inline size_t dnnLayoutGetMemorySize(
-    const dnnLayout_t layout);
-template <> inline size_t dnnLayoutGetMemorySize<float>(
-    const dnnLayout_t layout) {
-    return dnnLayoutGetMemorySize_F32(layout);
-}
-template <> inline size_t dnnLayoutGetMemorySize<double>(
-    const dnnLayout_t layout) {
-    return dnnLayoutGetMemorySize_F64(layout);
-}
-
-template <typename Dtype> inline int dnnLayoutCompare(
-    const dnnLayout_t l1, const dnnLayout_t l2);
-template <> inline int dnnLayoutCompare<float>(
-    const dnnLayout_t l1, const dnnLayout_t l2) {
-    return dnnLayoutCompare_F32(l1, l2);
-}
-template <> inline int dnnLayoutCompare<double>(
-    const dnnLayout_t l1, const dnnLayout_t l2) {
-    return dnnLayoutCompare_F64(l1, l2);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnAllocateBuffer(
-    void **pPtr, dnnLayout_t layout);
-template <> inline dnnError_t dnnAllocateBuffer<float>(
-    void **pPtr, dnnLayout_t layout) {
-    return dnnAllocateBuffer_F32(pPtr, layout);
-}
-template <> inline dnnError_t dnnAllocateBuffer<double>(
-    void **pPtr, dnnLayout_t layout) {
-    return dnnAllocateBuffer_F64(pPtr, layout);
-}
-
-template <typename Dtype> inline dnnError_t dnnReleaseBuffer(
-    void *ptr);
-template <> inline dnnError_t dnnReleaseBuffer<float>(
-    void *ptr) {
-    return dnnReleaseBuffer_F32(ptr);
-}
-template <> inline dnnError_t dnnReleaseBuffer<double>(
-    void *ptr) {
-    return dnnReleaseBuffer_F64(ptr);
-}
-
-template <typename Dtype> inline dnnError_t dnnLayoutDelete(
-    dnnLayout_t layout);
-template <> inline dnnError_t dnnLayoutDelete<float>(
-    dnnLayout_t layout) {
-    return dnnLayoutDelete_F32(layout);
-}
-template <> inline dnnError_t dnnLayoutDelete<double>(
-    dnnLayout_t layout) {
-    return dnnLayoutDelete_F64(layout);
-}
-
-template <typename Dtype> inline dnnError_t dnnPrimitiveAttributesCreate(
-    dnnPrimitiveAttributes_t *attributes);
-template <> inline dnnError_t dnnPrimitiveAttributesCreate<float>(
-    dnnPrimitiveAttributes_t *attributes) {
-    return dnnPrimitiveAttributesCreate_F32(attributes);
-}
-template <> inline dnnError_t dnnPrimitiveAttributesCreate<double>(
-    dnnPrimitiveAttributes_t *attributes) {
-    return dnnPrimitiveAttributesCreate_F64(attributes);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnPrimitiveAttributesDestroy(
-    dnnPrimitiveAttributes_t attributes);
-template <> inline dnnError_t dnnPrimitiveAttributesDestroy<float>(
-    dnnPrimitiveAttributes_t attributes) {
-    return dnnPrimitiveAttributesDestroy_F32(attributes);
-}
-template <> inline dnnError_t dnnPrimitiveAttributesDestroy<double>(
-    dnnPrimitiveAttributes_t attributes) {
-    return dnnPrimitiveAttributesDestroy_F64(attributes);
-}
-
-template <typename Dtype> inline dnnError_t dnnPrimitiveGetAttributes(
-    dnnPrimitive_t primitive,
-    dnnPrimitiveAttributes_t *attributes);
-template <> inline dnnError_t dnnPrimitiveGetAttributes<float>(
-    dnnPrimitive_t primitive,
-    dnnPrimitiveAttributes_t *attributes) {
-    return dnnPrimitiveGetAttributes_F32(primitive, attributes);
-}
-template <> inline dnnError_t dnnPrimitiveGetAttributes<double>(
-    dnnPrimitive_t primitive,
-    dnnPrimitiveAttributes_t *attributes) {
-    return dnnPrimitiveGetAttributes_F64(primitive, attributes);
-}
-
-template <typename Dtype> inline dnnError_t dnnExecute(
-    dnnPrimitive_t primitive, void *resources[]);
-template <> inline dnnError_t dnnExecute<float>(
-    dnnPrimitive_t primitive, void *resources[]) {
-    return dnnExecute_F32(primitive, resources);
-}
-template <> inline dnnError_t dnnExecute<double>(
-    dnnPrimitive_t primitive, void *resources[]) {
-    return dnnExecute_F64(primitive, resources);
-}
-
-template <typename Dtype> inline dnnError_t dnnExecuteAsync(
-    dnnPrimitive_t primitive, void *resources[]);
-template <> inline dnnError_t dnnExecuteAsync<float>(
-    dnnPrimitive_t primitive, void *resources[]) {
-    return dnnExecuteAsync_F32(primitive, resources);
-}
-template <> inline dnnError_t dnnExecuteAsync<double>(
-    dnnPrimitive_t primitive, void *resources[]) {
-    return dnnExecuteAsync_F64(primitive, resources);
-}
-
-template <typename Dtype> inline dnnError_t dnnWaitFor(
-    dnnPrimitive_t primitive);
-template <> inline dnnError_t dnnWaitFor<float>(
-    dnnPrimitive_t primitive) {
-    return dnnWaitFor_F32(primitive);
-}
-template <> inline dnnError_t dnnWaitFor<double>(
-    dnnPrimitive_t primitive) {
-    return dnnWaitFor_F64(primitive);
-}
-
-template <typename Dtype> inline dnnError_t dnnDelete(
-    dnnPrimitive_t primitive);
-template <> inline dnnError_t dnnDelete<float>(
-    dnnPrimitive_t primitive) {
-    return dnnDelete_F32(primitive);
-}
-template <> inline dnnError_t dnnDelete<double>(
-    dnnPrimitive_t primitive) {
-    return dnnDelete_F64(primitive);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnConversionCreate(
-    dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to);
-template <> inline dnnError_t dnnConversionCreate<float>(
-    dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to) {
-    return dnnConversionCreate_F32(pConversion, from, to);
-}
-template <> inline dnnError_t dnnConversionCreate<double>(
-    dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to) {
-    return dnnConversionCreate_F64(pConversion, from, to);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnConversionExecute(
-    dnnPrimitive_t conversion, void *from, void *to);
-template <> inline dnnError_t dnnConversionExecute<float>(
-    dnnPrimitive_t conversion, void *from, void *to) {
-    return dnnConversionExecute_F32(conversion, from, to);
-}
-template <> inline dnnError_t dnnConversionExecute<double>(
-    dnnPrimitive_t conversion, void *from, void *to) {
-    return dnnConversionExecute_F64(conversion, from, to);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnConvolutionCreateForward(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnConvolutionCreateForward<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateForward_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-template <> inline dnnError_t dnnConvolutionCreateForward<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateForward_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnConvolutionCreateForwardBias(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnConvolutionCreateForwardBias<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateForwardBias_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnConvolutionCreateForwardBias<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateForwardBias_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnConvolutionCreateBackwardData(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnConvolutionCreateBackwardData<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateBackwardData_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnConvolutionCreateBackwardData<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateBackwardData_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnConvolutionCreateBackwardFilter(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnConvolutionCreateBackwardFilter<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateBackwardFilter_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnConvolutionCreateBackwardFilter<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateBackwardFilter_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnConvolutionCreateBackwardBias(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t dstSize[]);
-template <> inline dnnError_t dnnConvolutionCreateBackwardBias<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t dstSize[]) {
-    return dnnConvolutionCreateBackwardBias_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, dstSize);
-}
-template <> inline dnnError_t dnnConvolutionCreateBackwardBias<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t dstSize[]) {
-    return dnnConvolutionCreateBackwardBias_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, dstSize);
-}
-
-template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateForward(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnGroupsConvolutionCreateForward<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateForward_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnGroupsConvolutionCreateForward<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateForward_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateForwardBias(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnGroupsConvolutionCreateForwardBias<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateForwardBias_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnGroupsConvolutionCreateForwardBias<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateForwardBias_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateBackwardData(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardData<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateBackwardData_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardData<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateBackwardData_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateBackwardFilter_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateBackwardFilter_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t dstSize[]);
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t dstSize[]) {
-    return dnnGroupsConvolutionCreateBackwardBias_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, dstSize);
-}
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t dstSize[]) {
-    return dnnGroupsConvolutionCreateBackwardBias_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, dstSize);
-}
-
-template <typename Dtype> inline dnnError_t dnnReLUCreateForward(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float negativeSlope);
-template <> inline dnnError_t dnnReLUCreateForward<float>(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float negativeSlope) {
-    return dnnReLUCreateForward_F32(
-               pRelu,
-               attributes,
-               dataLayout, negativeSlope);
-}
-template <> inline dnnError_t dnnReLUCreateForward<double>(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float negativeSlope) {
-    return dnnReLUCreateForward_F64(
-               pRelu,
-               attributes,
-               dataLayout, negativeSlope);
-}
-
-template <typename Dtype> inline dnnError_t dnnReLUCreateBackward(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope);
-template <> inline dnnError_t dnnReLUCreateBackward<float>(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope) {
-    return dnnReLUCreateBackward_F32(
-               pRelu,
-               attributes,
-               diffLayout, dataLayout, negativeSlope);
-}
-template <> inline dnnError_t dnnReLUCreateBackward<double>(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope) {
-    return dnnReLUCreateBackward_F64(
-               pRelu,
-               attributes,
-               diffLayout, dataLayout, negativeSlope);
-}
-
-template <typename Dtype> inline dnnError_t dnnLRNCreateForward(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k);
-template <> inline dnnError_t dnnLRNCreateForward<float>(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k) {
-    return dnnLRNCreateForward_F32(
-               pLrn,
-               attributes,
-               dataLayout, kernel_size, alpha, beta, k);
-}
-template <> inline dnnError_t dnnLRNCreateForward<double>(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k) {
-    return dnnLRNCreateForward_F64(
-               pLrn,
-               attributes,
-               dataLayout, kernel_size, alpha, beta, k);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnLRNCreateBackward(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout,
-    size_t kernel_size, float alpha, float beta, float k);
-template <> inline dnnError_t dnnLRNCreateBackward<float>(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout,
-    size_t kernel_size, float alpha, float beta, float k) {
-    return dnnLRNCreateBackward_F32(
-               pLrn,
-               attributes,
-               diffLayout, dataLayout, kernel_size, alpha, beta, k);
-}
-template <> inline dnnError_t dnnLRNCreateBackward<double>(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout,
-    size_t kernel_size, float alpha, float beta, float k) {
-    return dnnLRNCreateBackward_F64(
-               pLrn,
-               attributes,
-               diffLayout, dataLayout, kernel_size, alpha, beta, k);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnPoolingCreateForward(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnPoolingCreateForward<float>(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnPoolingCreateForward_F32(
-               pPooling,
-               attributes,
-               op,
-               srcLayout,
-               kernelSize, kernelStride,
-               inputOffset, border_type);
-}
-template <> inline dnnError_t dnnPoolingCreateForward<double>(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnPoolingCreateForward_F64(
-               pPooling,
-               attributes,
-               op,
-               srcLayout,
-               kernelSize, kernelStride,
-               inputOffset, border_type);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnPoolingCreateBackward(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnPoolingCreateBackward<float>(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnPoolingCreateBackward_F32(
-               pPooling,
-               attributes,
-               op,
-               srcLayout,
-               kernelSize, kernelStride,
-               inputOffset, border_type);
-}
-template <> inline dnnError_t dnnPoolingCreateBackward<double>(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnPoolingCreateBackward_F64(
-               pPooling,
-               attributes,
-               op,
-               srcLayout,
-               kernelSize, kernelStride,
-               inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnConcatCreate(
-    dnnPrimitive_t *pConcat,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src[]);
-template <> inline dnnError_t dnnConcatCreate<float>(
-    dnnPrimitive_t *pConcat,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src[]) {
-    return dnnConcatCreate_F32(
-               pConcat,
-               attributes,
-               N,
-               src);
-}
-template <> inline dnnError_t dnnConcatCreate<double>(
-    dnnPrimitive_t *pConcat,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src[]) {
-    return dnnConcatCreate_F64(
-               pConcat,
-               attributes,
-               N,
-               src);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnSplitCreate(
-    dnnPrimitive_t *pSplit,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src,
-    size_t dst[]);
-template <> inline dnnError_t dnnSplitCreate<float>(
-    dnnPrimitive_t *pSplit,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src,
-    size_t dst[]) {
-    return dnnSplitCreate_F32(
-               pSplit,
-               attributes,
-               N,
-               src,
-               dst);
-}
-template <> inline dnnError_t dnnSplitCreate<double>(
-    dnnPrimitive_t *pSplit,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src,
-    size_t dst[]) {
-    return dnnSplitCreate_F64(
-               pSplit,
-               attributes,
-               N,
-               src,
-               dst);
-}
-
-template <typename Dtype> inline dnnError_t dnnSumCreate(
-    dnnPrimitive_t *pSum,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t nSummands, dnnLayout_t layout, Dtype *coefficients);
-template <> inline dnnError_t dnnSumCreate<float>(
-    dnnPrimitive_t *pSum,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t nSummands, dnnLayout_t layout, float *coefficients) {
-    return dnnSumCreate_F32(
-               pSum,
-               attributes,
-               nSummands,
-               layout, coefficients);
-}
-template <> inline dnnError_t dnnSumCreate<double>(
-    dnnPrimitive_t *pSum,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t nSummands, dnnLayout_t layout, double *coefficients) {
-    return dnnSumCreate_F64(
-               pSum,
-               attributes,
-               nSummands,
-               layout, coefficients);
-}
-
-template <typename Dtype> inline dnnError_t dnnBatchNormalizationCreateForward_v2(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags);
-
-template <> inline dnnError_t dnnBatchNormalizationCreateForward_v2<float>(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags) {
-    return dnnBatchNormalizationCreateForward_v2_F32(
-               pBatchNormalization,
-               attributes,
-               dataLayout, eps, flags);
-}
-template <> inline dnnError_t dnnBatchNormalizationCreateForward_v2<double>(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags) {
-    return dnnBatchNormalizationCreateForward_v2_F64(
-               pBatchNormalization,
-               attributes,
-               dataLayout, eps, flags);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnBatchNormalizationCreateBackward_v2(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags);
-
-template <> inline  dnnError_t dnnBatchNormalizationCreateBackward_v2<float>(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags) {
-    return dnnBatchNormalizationCreateBackward_v2_F32(
-               pBatchNormalization,
-               attributes,
-               dataLayout, eps, flags);
-}
-
-template <> inline dnnError_t dnnBatchNormalizationCreateBackward_v2<double>(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags) {
-    return dnnBatchNormalizationCreateBackward_v2_F64(
-               pBatchNormalization,
-               attributes,
-               dataLayout, eps, flags);
-}
-
-template <typename Dtype> inline dnnError_t dnnInnerProductCreateForward(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels);
-template <> inline dnnError_t dnnInnerProductCreateForward<float>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateForward_F32(pInnerProduct,
-                                            attributes, dimensions,
-                                            srcSize, outputChannels);
-}
-template <> inline dnnError_t dnnInnerProductCreateForward<double>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateForward_F64(pInnerProduct,
-                                            attributes, dimensions,
-                                            srcSize, outputChannels);
-}
-
-template <typename Dtype> inline dnnError_t dnnInnerProductCreateForwardBias(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels);
-
-template <> inline dnnError_t dnnInnerProductCreateForwardBias<float>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateForwardBias_F32(pInnerProduct,
-            attributes, dimensions,
-            srcSize, outputChannels);
-}
-template <> inline dnnError_t dnnInnerProductCreateForwardBias<double>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateForwardBias_F64(pInnerProduct,
-            attributes, dimensions,
-            srcSize, outputChannels);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnInnerProductCreateBackwardData(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels);
-
-template <> inline dnnError_t dnnInnerProductCreateBackwardData<float>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateBackwardData_F32(pInnerProduct,
-            attributes, dimensions,
-            srcSize, outputChannels);
-}
-template <> inline dnnError_t dnnInnerProductCreateBackwardData<double>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateBackwardData_F64(pInnerProduct,
-            attributes, dimensions,
-            srcSize, outputChannels);
-}
-
-
-
-
-template <typename Dtype> inline dnnError_t dnnInnerProductCreateBackwardFilter(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels);
-
-template <> inline dnnError_t dnnInnerProductCreateBackwardFilter<float>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateBackwardFilter_F32(pInnerProduct,
-            attributes, dimensions,
-            srcSize, outputChannels);
-}
-template <> inline dnnError_t dnnInnerProductCreateBackwardFilter<double>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateBackwardFilter_F64(pInnerProduct,
-            attributes, dimensions,
-            srcSize, outputChannels);
-}
-
-
-
-template <typename Dtype> inline dnnError_t dnnInnerProductCreateBackwardBias(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t dstSize[]);
-
-template <> inline dnnError_t dnnInnerProductCreateBackwardBias<float>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t dstSize[]) {
-    return dnnInnerProductCreateBackwardBias_F32(pInnerProduct,
-            attributes, dimensions,
-            dstSize);
-}
-template <> inline dnnError_t dnnInnerProductCreateBackwardBias<double>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t dstSize[]) {
-    return dnnInnerProductCreateBackwardBias_F64(pInnerProduct,
-            attributes, dimensions,
-            dstSize);
-}
-#endif  // #MXNET_USE_MKL2017 == 1
-#endif  // MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_
diff --git a/src/operator/mkl/mkl_elementwise_copy-inl.h b/src/operator/mkl/mkl_elementwise_copy-inl.h
deleted file mode 100644
index 48c931291150..000000000000
--- a/src/operator/mkl/mkl_elementwise_copy-inl.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_elementwise-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <cstring>
-#include <map>
-#include <string>
-#include <vector>
-#include <utility>
-#include "../operator_common.h"
-#include "../mshadow_op.h"
-#include "./mkl_util-inl.h"
-
-
-namespace mxnet {
-namespace op {
-
-template<typename xpu, typename DType>
-void MKLIdentityCompute(const nnvm::NodeAttrs& attrs,
-  const OpContext& ctx,
-  const std::vector<TBlob>& inputs,
-  const std::vector<OpReqType>& req,
-  const std::vector<TBlob>& outputs) {
-  if (!req[0]) return;
-#if MKL_EXPERIMENTAL == 1
-  if (op::mkl_prv_data<DType>(inputs[0])) {
-    std::shared_ptr<MKLMemHolder> in_data_mem = inputs[0].Mkl_mem_;
-    // User copy to avoid potential problem
-    std::shared_ptr<MKLData<DType> > top_data = MKLData<DType>::create();
-    std::shared_ptr<MKLMemHolder> top_mem = outputs[0].Mkl_mem_;
-    top_data->copy_from(in_data_mem);
-    top_mem->set_prv_descriptor(top_data);
-    return;
-  }
-#endif
-  int in_blob_size = inputs[0].Size();
-  int out_blob_size = outputs[0].Size();
-  CHECK_EQ(in_blob_size, out_blob_size) << "MKLIdentityCompute CPU Size not Match ";
-  memcpy(outputs[0].dptr_, inputs[0].dptr_, in_blob_size * sizeof(DType));
-}
-
-
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_
diff --git a/src/operator/mkl/mkl_elementwise_sum-inl.h b/src/operator/mkl/mkl_elementwise_sum-inl.h
deleted file mode 100644
index d313fd15a5be..000000000000
--- a/src/operator/mkl/mkl_elementwise_sum-inl.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_elementwise-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <cstring>
-#include <map>
-#include <string>
-#include <vector>
-#include <utility>
-#include "../operator_common.h"
-#include "../mshadow_op.h"
-#include "./mkl_util-inl.h"
-
-
-namespace mxnet {
-namespace op {
-template<typename xpu, typename DType>
-static void LayerSetUp(const std::vector<mshadow::Tensor<xpu, 1, DType> > &data,
-  size_t data_shape_size,
-  std::shared_ptr<MKLData<DType> > fwd_top_data) {
-  // Whether to use an asymptotically slower (for >2 inputs) but stabler method
-  // of computing the gradient for the PROD operation. (No effect for SUM op.)
-  // stable_prod_grad_ = 1;
-  size_t dim_src = data_shape_size;
-  size_t *sizes_src = new size_t[dim_src];
-  size_t *strides_src = new size_t[dim_src];
-  for (size_t d = 0; d < dim_src; ++d) {
-    sizes_src[d] = data[0].shape_[dim_src - d - 1];
-    strides_src[d] = (d == 0) ? 1 : strides_src[d - 1] * sizes_src[d - 1];
-  }
-
-  fwd_top_data->create_user_layout(dim_src, sizes_src, strides_src);
-  delete[] sizes_src;
-  delete[] strides_src;
-}
-
-template<typename xpu, typename DType>
-void MKLElementWiseSumCompute_(const nnvm::NodeAttrs& attrs,
-  const OpContext& ctx,
-  const std::vector<TBlob>& in_data,
-  const std::vector<OpReqType>& req,
-  const std::vector<TBlob>& out_data) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  if (req[0] == kNullOp) return;
-  size_t size = in_data.size();
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  std::vector<Tensor<xpu, 1, DType> > data(size);
-  Tensor<xpu, 1, DType> out = out_data[0].FlatTo1D<xpu, DType>(s);
-  bool in_place_flag = false;
-  int in_place_idx = 0;
-
-  for (size_t i = 0; i < size; ++i) {
-    data[i]  = in_data[i].FlatTo1D<xpu, DType>(s);
-    if (data[i].dptr_ == out.dptr_) {
-      in_place_idx = i;
-      in_place_flag = true;
-    }
-  }
-  std::shared_ptr<MKLData<DType> > fwd_top_data = MKLData<DType>::create();
-  std::vector<DType> coeffs_  = std::vector<DType>(data.size(), 1);
-  LayerSetUp(data, 1, fwd_top_data);
-
-
-  dnnError_t e;
-  void *eltwise_res[dnnResourceNumber];
-  dnnPrimitive_t sumPrimitive = NULL;
-  e = dnnSumCreate<DType>(&sumPrimitive, NULL, size, fwd_top_data->layout_usr,
-    &coeffs_[0]);
-  CHECK_EQ(e, E_SUCCESS);
-
-  eltwise_res[dnnResourceDst] = reinterpret_cast<void*>(const_cast<DType*>(out.dptr_));
-  eltwise_res[dnnResourceMultipleSrc] =
-    reinterpret_cast<void *>(reinterpret_cast<void *>(in_data[in_place_idx].dptr_));
-  for (size_t i = 1; i < size; ++i) {
-    if (i == in_place_idx) continue;
-    eltwise_res[dnnResourceMultipleSrc + i] =
-      reinterpret_cast<void *>(reinterpret_cast<void *>(in_data[i].dptr_));
-  }
-
-  e = dnnExecute<DType>(sumPrimitive, eltwise_res);
-  CHECK_EQ(e, E_SUCCESS);
-
-  if (sumPrimitive != NULL) {
-    dnnDelete<DType>(sumPrimitive);
-    sumPrimitive = NULL;
-  }
-}
-
-
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_
diff --git a/src/operator/mkl/mkl_fully_connected-inl.h b/src/operator/mkl/mkl_fully_connected-inl.h
deleted file mode 100644
index 5e296704b6dd..000000000000
--- a/src/operator/mkl/mkl_fully_connected-inl.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_fully_connected-inl.h
-* \brief
-* \author zhenlin.luo@intel.com
-*          lingyan.guo@intel.com
-*         
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_
-#include <string>
-#include <algorithm>
-#include <vector>
-#include "../activation-inl.h"
-#include "./mkl_util-inl.h"
-
-namespace mxnet {
-namespace op {
-
-template<typename xpu, typename DType>
-class MKLFullyConnectedOp : public Operator {
- public:
-  explicit MKLFullyConnectedOp(const FullyConnectedParam& p,
-                               const std::vector<TShape>& in_shapes,
-                               const std::vector<TShape>& out_shapes):
-    param_(p) {
-    LayerSetUp(in_shapes, out_shapes);
-  }
-
-  ~MKLFullyConnectedOp() {
-    dnnDelete<DType>(fullyConnectedFwd);
-    dnnDelete<DType>(fullyConnectedBwdData);
-    dnnDelete<DType>(fullyConnectedBwdFilter);
-    dnnDelete<DType>(fullyConnectedBwdBias);
-  }
-  static std::string getName() {
-    return "MKLFullyConnectedOp";
-  }
-
- private:
-  void LayerSetUp(const std::vector<TShape>& in_shapes,
-                  const std::vector<TShape>& out_shapes) {
-    const TShape& ishape = in_shapes[fullc::kData];
-
-    const size_t dim = 4;
-    const size_t src_sizes[4] = {1, 1, ishape.ProdShape(1, ishape.ndim()), ishape[0]};
-    const size_t dst_sizes[2] = {param_.num_hidden, ishape[0]};
-    const size_t output_channels = param_.num_hidden;
-
-    dnnPrimitiveAttributes_t attributes = NULL;
-    MKLDNN_CALL(dnnPrimitiveAttributesCreate<DType>(&attributes));
-    if (!param_.no_bias) {
-      MKLDNN_CALL(dnnInnerProductCreateForwardBias<DType>(
-            &fullyConnectedFwd,
-            attributes,
-            dim,
-            src_sizes,
-            output_channels));
-    } else {
-      MKLDNN_CALL(dnnInnerProductCreateForward<DType>(
-            &fullyConnectedFwd,
-            attributes,
-            dim,
-            src_sizes,
-            output_channels));
-    }
-    MKLDNN_CALL(dnnInnerProductCreateBackwardData<DType>(
-          &fullyConnectedBwdData,
-          attributes,
-          dim,
-          src_sizes,
-          output_channels));
-    MKLDNN_CALL(dnnInnerProductCreateBackwardFilter<DType>(
-          &fullyConnectedBwdFilter,
-          attributes,
-          dim,
-          src_sizes,
-          output_channels));
-    if (!param_.no_bias) {
-      MKLDNN_CALL(dnnInnerProductCreateBackwardBias<DType>(
-            &fullyConnectedBwdBias,
-            attributes,
-            2,
-            dst_sizes));
-    }
-    // TODO(minjie): Shouldn't `attributes` be destroyed?
-  }
-
-
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-
-    void* res_fullyConnected[dnnResourceNumber];
-    if (req[fullc::kOut] == kNullOp) return;
-    CHECK_EQ(req[fullc::kOut], kWriteTo);
-    CHECK_EQ(in_data.size(), param_.no_bias ? 2 : 3);
-    CHECK_EQ(out_data.size(), 1);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-
-    const TShape& ishape = in_data[fullc::kData].shape_;
-    const TShape& oshape = out_data[fullc::kOut].shape_;
-
-    Tensor<xpu, 4, DType> data;
-    Tensor<xpu, 4, DType> out;
-
-    Shape4(in_data[fullc::kData].shape_[0], in_data[fullc::kData].shape_[1], 1, 1);
-
-    Shape<4> dshape = Shape4(ishape[0], ishape.ProdShape(1, ishape.ndim()), 1, 1);
-    Shape<4> odshape = Shape4(oshape[0], oshape.ProdShape(1, oshape.ndim()), 1, 1);
-
-    data = in_data[fullc::kData].get_with_shape<xpu, 4, DType>(dshape, s);
-    out = out_data[fullc::kOut].get_with_shape<xpu, 4, DType>(odshape, s);
-    res_fullyConnected[dnnResourceSrc] =
-      reinterpret_cast<void *>(in_data[fullc::kData].dptr_);
-    res_fullyConnected[dnnResourceDst] =
-      reinterpret_cast<void *>(out_data[fullc::kOut].dptr_);
-    res_fullyConnected[dnnResourceFilter] =
-      reinterpret_cast<void *>(in_data[fullc::kWeight].dptr_);
-    if (!param_.no_bias) {
-      res_fullyConnected[dnnResourceBias] = reinterpret_cast<void *>(in_data[fullc::kBias].dptr_);
-    }
-
-    MKLDNN_CALL(dnnExecute<DType>(fullyConnectedFwd, res_fullyConnected));
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-
-    void* res_fullyConnected[dnnResourceNumber];
-    CHECK_EQ(out_grad.size(), 1);
-    const size_t expected = param_.no_bias ? 2 : 3;
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
-    CHECK_EQ(req.size(), expected);
-    res_fullyConnected[dnnResourceSrc] =
-      reinterpret_cast<void *>(in_data[fullc::kData].dptr_);
-    res_fullyConnected[dnnResourceFilter] =
-      reinterpret_cast<void *>(in_data[fullc::kWeight].dptr_);
-
-    res_fullyConnected[dnnResourceDiffDst] =
-      reinterpret_cast<void *>(out_grad[fullc::kOut].dptr_);
-    res_fullyConnected[dnnResourceDiffSrc] =
-      reinterpret_cast<void *>(in_grad[fullc::kData].dptr_);
-    res_fullyConnected[dnnResourceDiffFilter] =
-      reinterpret_cast<void *>(in_grad[fullc::kWeight].dptr_);
-    if (!param_.no_bias) {
-      res_fullyConnected[dnnResourceDiffBias] =
-        reinterpret_cast<void *>(in_grad[fullc::kBias].dptr_);
-    }
-    MKLDNN_CALL(dnnExecute<DType>(fullyConnectedBwdFilter, res_fullyConnected));
-    if (!param_.no_bias) {
-      MKLDNN_CALL(dnnExecute<DType>(fullyConnectedBwdBias, res_fullyConnected));
-    }
-    MKLDNN_CALL(dnnExecute<DType>(fullyConnectedBwdData, res_fullyConnected));
-  }
-
- private:
-  dnnPrimitive_t fullyConnectedFwd{nullptr};
-  dnnPrimitive_t fullyConnectedBwdData{nullptr};
-  dnnPrimitive_t fullyConnectedBwdFilter{nullptr};
-  dnnPrimitive_t fullyConnectedBwdBias{nullptr};
-  const FullyConnectedParam param_;
-};  // class MKLFullyConnectedOp
-}  // namespace op
-}  // namespace mxnet
-
-#endif  // MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_
diff --git a/src/operator/mkl/mkl_lrn-inl.h b/src/operator/mkl/mkl_lrn-inl.h
deleted file mode 100644
index 90dfad50fa62..000000000000
--- a/src/operator/mkl/mkl_lrn-inl.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_lrn-inl.h
-* \brief
-* \author zhenlin.luo@intel.com
-*         lingyan.guo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_LRN_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_LRN_INL_H_
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "../operator_common.h"
-#include "../mshadow_op.h"
-#include "./mkl_util-inl.h"
-
-namespace mxnet {
-namespace op {
-
-template<typename xpu, typename DType>
-class MKLLRNOp : public Operator {
- public:
-  static std::string getName() {
-    return "MKLLRNOp";
-  }
-
-  explicit MKLLRNOp(LRNParam param) :
-    lrnFwd(static_cast<dnnPrimitive_t>(NULL)),
-    lrnBwd(static_cast<dnnPrimitive_t>(NULL)),
-    lrn_buffer_(NULL) {
-    this->param_ = param;
-    fwd_top_data_ = MKLData<DType>::create();
-    fwd_bottom_data_ = MKLData<DType>::create();
-    bwd_top_diff_ = MKLData<DType>::create();
-    bwd_bottom_diff_ = MKLData<DType>::create();
-    init_mkldnn_ = false;
-  }
-
-  virtual ~MKLLRNOp() {
-    if (lrnFwd != NULL) {
-      dnnDelete<DType>(lrnFwd);
-      lrnFwd = NULL;
-    }
-    if (lrnBwd != NULL) {
-      dnnDelete<DType>(lrnBwd);
-      lrnBwd = NULL;
-    }
-    dnnReleaseBuffer<DType>(lrn_buffer_);
-  }
-
- private:
-  void LayerSetup(const mshadow::Tensor<xpu, 4, DType> &data,
-                  const mshadow::Tensor<xpu, 4, DType> &out) {
-    size_ = param_.nsize;
-    CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local size";
-
-    alpha_ = param_.alpha;
-    beta_ = param_.beta;
-    k_ = param_.knorm;
-    size_t dim = 4, sizes[4], strides[4];
-    channels_ = data.shape_[1];
-    height_ = data.shape_[2];
-    width_ = data.shape_[3];
-    num_ = data.shape_[0];
-    sizes[0] = width_;
-    sizes[1] = height_;
-    sizes[2] = channels_;
-    sizes[3] = num_;
-
-    strides[0] = 1;
-    strides[1] = sizes[0];
-    strides[2] = sizes[0] * sizes[1];
-    strides[3] = sizes[0] * sizes[1] * sizes[2];
-
-    fwd_bottom_data_->name = "fwd_bottom_data_   @ " + getName();
-    fwd_top_data_->name = "fwd_top_data_      @ " + getName();
-    bwd_top_diff_->name = "bwd_top_diff_      @ " + getName();
-    bwd_bottom_diff_->name = "bwd_bottom_diff_   @ " + getName();
-
-    fwd_bottom_data_->create_user_layout(dim, sizes, strides);
-    fwd_top_data_->create_user_layout(dim, sizes, strides);
-    bwd_bottom_diff_->create_user_layout(dim, sizes, strides);
-    bwd_top_diff_->create_user_layout(dim, sizes, strides);
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 2U);
-    CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size";
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> data = mkl_experimental_direct_get<xpu, 4, DType>(
-      in_data[lrn_enum::kData], s);
-    Tensor<xpu, 4, DType> out = mkl_experimental_direct_get<xpu, 4, DType>(
-      out_data[lrn_enum::kOut], s);
-    if (!init_mkldnn_) {
-      LayerSetup(data, out);
-      init_mkldnn_ = true;
-    }
-
-    const void* bottom_data = NULL;
-#if MKL_EXPERIMENTAL == 1
-    bottom_data =
-          reinterpret_cast<void*>(mkl_prv_data<DType>(in_data[lrn_enum::kData]));
-#endif
-#if MKL_EXPERIMENTAL == 1
-    if (NULL != bottom_data) {
-      if (lrnFwd == NULL) {
-        std::shared_ptr<MKLMemHolder> bottom_data_mem =
-          in_data[lrn_enum::kData].Mkl_mem_;
-        std::shared_ptr<PrvMemDescr> bottom_prv_descriptor =
-          bottom_data_mem->get_prv_descriptor();
-        CHECK_EQ(bottom_prv_descriptor->get_descr_type(),
-            PrvMemDescr::PRV_DESCR_MKL2017);
-        std::shared_ptr<MKLData<DType> > mem_descr
-          = std::static_pointer_cast<MKLData<DType>>(bottom_prv_descriptor);
-        CHECK(mem_descr != nullptr);
-        fwd_bottom_data_ = mem_descr;
-
-        dnnError_t e;
-        dnnLayout_t lrn_buffer_l = NULL;
-
-        e = dnnLRNCreateForward<DType>(&lrnFwd, NULL, fwd_bottom_data_->layout_int,
-                                       size_, alpha_, beta_, k_);
-        CHECK_EQ(e, E_SUCCESS);
-
-        fwd_top_data_->create_internal_layout(lrnFwd, dnnResourceDst);
-
-        e = dnnLRNCreateBackward<DType>(&lrnBwd, NULL,
-                                        fwd_bottom_data_->layout_int, fwd_bottom_data_->layout_int,
-                                        size_, alpha_, beta_, k_);
-        CHECK_EQ(e, E_SUCCESS);
-
-        e = dnnLayoutCreateFromPrimitive<DType>(
-              &lrn_buffer_l, lrnFwd, dnnResourceWorkspace);
-        CHECK_EQ(e, E_SUCCESS);
-        e = dnnAllocateBuffer<DType>(
-              reinterpret_cast<void **>(&lrn_buffer_), lrn_buffer_l);
-        CHECK_EQ(e, E_SUCCESS);
-        dnnLayoutDelete<DType>(lrn_buffer_l);
-
-        bwd_top_diff_->create_internal_layout(lrnBwd, dnnResourceDiffDst);
-        bwd_bottom_diff_->create_internal_layout(lrnBwd, dnnResourceDiffSrc);
-      }
-    }
-#endif
-    if (bottom_data == NULL) {
-      if (lrnFwd == NULL) {
-        dnnError_t e;
-        dnnLayout_t lrn_buffer_l = NULL;
-        e = dnnLRNCreateForward<DType>(&lrnFwd, NULL, fwd_bottom_data_->layout_usr,
-                                       size_, alpha_, beta_, k_);
-        CHECK_EQ(e, E_SUCCESS);
-
-        e = dnnLayoutCreateFromPrimitive<DType>(
-              &lrn_buffer_l, lrnFwd, dnnResourceWorkspace);
-        CHECK_EQ(e, E_SUCCESS);
-        e = dnnAllocateBuffer<DType>(
-              reinterpret_cast<void **>(&lrn_buffer_), lrn_buffer_l);
-        CHECK_EQ(e, E_SUCCESS);
-        dnnLayoutDelete<DType>(lrn_buffer_l);
-
-        e = dnnLRNCreateBackward<DType>(&lrnBwd, NULL,
-                                        fwd_bottom_data_->layout_usr, fwd_bottom_data_->layout_usr,
-                                        size_, alpha_, beta_, k_);
-        CHECK_EQ(e, E_SUCCESS);
-      }
-      bottom_data = data.dptr_;
-    }
-
-    dnnError_t e;
-    void* lrn_res[dnnResourceNumber];
-    lrn_res[dnnResourceSrc] = const_cast<void*>(bottom_data);
-
-    lrn_res[dnnResourceDst] = fwd_top_data_->get_output_ptr(
-      out.dptr_, fwd_top_data_, out_data[lrn_enum::kOut]);
-    lrn_res[dnnResourceWorkspace] = lrn_buffer_;
-    e = dnnExecute<DType>(lrnFwd, lrn_res);
-    CHECK_EQ(e, E_SUCCESS);
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1);
-    CHECK_EQ(in_data.size(), 1);
-    CHECK_EQ(out_data.size(), 2);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> grad = mkl_experimental_direct_get<xpu, 4, DType>(
-      out_grad[lrn_enum::kOut], s);
-    Tensor<xpu, 4, DType> data = mkl_experimental_direct_get<xpu, 4, DType>(
-      in_data[lrn_enum::kData], s);
-    Tensor<xpu, 4, DType> grad_in = mkl_experimental_direct_get<xpu, 4, DType>(
-      in_grad[lrn_enum::kData], s);
-    dnnError_t e;
-    void* lrn_res[dnnResourceNumber];
-    lrn_res[dnnResourceDiffDst] =
-      bwd_top_diff_->get_converted_prv(grad.dptr_, true, out_grad[lrn_enum::kOut]);
-    lrn_res[dnnResourceWorkspace] = lrn_buffer_;
-    lrn_res[dnnResourceSrc] =
-      fwd_bottom_data_->get_converted_prv(data.dptr_, false, in_data[lrn_enum::kData]);
-
-    lrn_res[dnnResourceDiffSrc] = bwd_bottom_diff_->get_output_ptr(
-      grad_in.dptr_, bwd_bottom_diff_, in_grad[lrn_enum::kData]);
-    e = dnnExecute<DType>(lrnBwd, lrn_res);
-    CHECK_EQ(e, E_SUCCESS);
-  }
-
- private:
-  LRNParam param_;
-  int size_;
-  int pre_pad_;
-  DType alpha_;
-  DType beta_;
-  DType k_;
-  int num_;
-  int channels_;
-  int height_;
-  int width_;
-  bool init_mkldnn_;
-
- private:
-  dnnPrimitive_t lrnFwd, lrnBwd;
-  std::shared_ptr<MKLData<DType> > fwd_top_data_;
-  std::shared_ptr<MKLData<DType> > fwd_bottom_data_;
-
-  std::shared_ptr<MKLData<DType> > bwd_top_diff_;
-  std::shared_ptr<MKLData<DType> > bwd_bottom_diff_;
-
-  DType *lrn_buffer_;
-};  // class LocalResponseNormOp
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_LRN_INL_H_
-
diff --git a/src/operator/mkl/mkl_memory-inl.h b/src/operator/mkl/mkl_memory-inl.h
deleted file mode 100644
index 71af10254b2a..000000000000
--- a/src/operator/mkl/mkl_memory-inl.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_memory-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_
-
-
-#include <string>
-#include <vector>
-#include <memory>
-#include "mkl_cppwrapper.h"
-
-namespace mxnet {
-
-template <typename DType>
-struct MKLMemoryDescriptorBase : public PrvMemDescr,
- public std::enable_shared_from_this<MKLMemoryDescriptorBase<DType> > {
-    MKLMemoryDescriptorBase() : layout_usr(NULL), layout_int(NULL),
-    convert_to_int(NULL), convert_from_int(NULL), convert_prv2prv(NULL),
-    name("UNKNOWN"), internal_ptr(NULL) {}
-  virtual ~MKLMemoryDescriptorBase() {
-    dnnLayoutDelete<DType>(layout_usr);
-    dnnLayoutDelete<DType>(layout_int);
-    if (internal_ptr != NULL) {
-      dnnReleaseBuffer<DType>(internal_ptr);
-      internal_ptr = NULL;
-    }
-    if (convert_to_int != NULL) {
-      dnnDelete<DType>(convert_to_int);
-      convert_to_int = NULL;
-    }
-    if (convert_from_int != NULL) {
-      dnnDelete<DType>(convert_from_int);
-      convert_from_int = NULL;
-    }
-    if (convert_prv2prv != NULL) {
-      dnnDelete<DType>(convert_prv2prv);
-      convert_prv2prv = NULL;
-    }
-  }
-  std::shared_ptr<MKLMemoryDescriptorBase<DType> > get_shared_ptr() {
-    return this->shared_from_this();
-  }
-
-  dnnLayout_t layout_usr;
-  dnnLayout_t layout_int;
-  dnnPrimitive_t convert_to_int;
-  dnnPrimitive_t convert_from_int;
-  dnnPrimitive_t convert_prv2prv;
-  std::shared_ptr<MKLMemoryDescriptorBase<DType> > descr_prv2prv_conversion;
-
-
-  std::string name;  // for debugging purposes
-  void allocate() {
-    if (internal_ptr == NULL) {
-      int status = dnnAllocateBuffer<DType>(
-              reinterpret_cast<void **>(&internal_ptr), layout_int);
-      CHECK_EQ(status, E_SUCCESS)
-          << "Failed internal_ptr memory allocation with status "
-          << status << "\n";
-    }
-  }
-  virtual void* prv_ptr(bool allocate_when_uninit = true) {
-    if (internal_ptr == NULL && allocate_when_uninit)
-      allocate();
-    return internal_ptr;
-  }
-  inline bool conversion_needed() {
-    return (convert_to_int != NULL);
-  }
-  void create_conversions();
-  void create_internal_layout(const dnnPrimitive_t primitive,
-                dnnResourceType_t type);
-  void create_user_layout(size_t dimension, const size_t size[],
-              const size_t strides[]);
-  void create_layouts(
-    const dnnPrimitive_t primitive, dnnResourceType_t type,
-    size_t dimension, const size_t size[], const size_t strides[]);
-
-  virtual PrvDescrType get_descr_type() {
-    return PRV_DESCR_MKL2017;
-  }
-  virtual size_t prv_size() {
-    return dnnLayoutGetMemorySize<DType>(layout_int);
-  }
-  virtual size_t prv_count() {
-    return dnnLayoutGetMemorySize<DType>(layout_int) / sizeof(DType);
-  }
-  virtual void convert_from_prv(void* cpu_ptr);
-  virtual void convert_to_prv(void* cpu_ptr);
-  virtual bool layout_compare(std::shared_ptr<PrvMemDescr> other);
-  virtual void convert_from_other(std::shared_ptr<PrvMemDescr> other);
- protected:
-  DType* internal_ptr;
-};
-
-template <typename DType>
-struct MKLMemoryDescriptor : MKLMemoryDescriptorBase<DType> {
-  // The last get_converted_prv() argument is a hack for reusing
-  // in backward a conversion done already in the forward direction.
-  DType* get_converted_prv(DType *data_ptr, bool set_prv_ptr,
-      const TBlob &blob);
-  void* get_output_ptr(DType *data_ptr, std::shared_ptr<MKLMemoryDescriptor<DType> > self_ptr,
-    const TBlob &blob, bool in_place = false);
-  bool copy_from(std::shared_ptr<MKLMemHolder> dnn_chunk);
-  MKLMemoryDescriptor() {}
-};
-
-template <typename DType> struct MKLData : MKLMemoryDescriptor<DType> {
-  static std::shared_ptr<MKLData<DType> > create() {
-    return std::make_shared<MKLData<DType> >();
-  }
-};
-
-template struct MKLData<float>;
-template struct MKLData<double>;
-
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_
diff --git a/src/operator/mkl/mkl_memory.cc b/src/operator/mkl/mkl_memory.cc
deleted file mode 100644
index 7682fe1c1f37..000000000000
--- a/src/operator/mkl/mkl_memory.cc
+++ /dev/null
@@ -1,291 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_memory.cc
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#include "../operator_common.h"
-
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "mkl_memory-inl.h"
-#include "mkl_util-inl.h"
-
-namespace mxnet {
-
-template <typename Dtype>
-void MKLMemoryDescriptorBase<Dtype>::create_conversions() {
-  int status;
-  if (this->convert_from_int) {
-    status = dnnDelete<Dtype>(this->convert_from_int);
-    CHECK_EQ(status, E_SUCCESS);
-    this->convert_from_int = NULL;
-  }
-  if (this->convert_to_int) {
-    status = dnnDelete<Dtype>(this->convert_to_int);
-    CHECK_EQ(status, E_SUCCESS);
-    this->convert_to_int = NULL;
-  }
-  if (layout_int
-      && !dnnLayoutCompare<Dtype>(layout_usr, layout_int)) {
-    CHECK(layout_usr);
-    status = dnnConversionCreate<Dtype>(&convert_to_int, layout_usr,
-            layout_int);
-    CHECK_EQ(status, E_SUCCESS)
-            << "Failed creation convert_to_int with status "
-            << status << " for buffer: " << this->name << "\n";
-    status = dnnConversionCreate<Dtype>(&convert_from_int, layout_int,
-            layout_usr);
-    CHECK_EQ(status, E_SUCCESS)
-            << "Failed creation convert_from_int with status "
-            << status << " for buffer: " << this->name << "\n";
-  }
-}
-
-template <typename Dtype>
-void MKLMemoryDescriptorBase<Dtype>::create_internal_layout(
-    const dnnPrimitive_t primitive, dnnResourceType_t type) {
-  int status;
-  if (this->layout_int) {
-    status = dnnLayoutDelete<Dtype>(this->layout_int);
-    CHECK_EQ(status, E_SUCCESS);
-  }
-  status = dnnLayoutCreateFromPrimitive<Dtype>(
-      &this->layout_int, primitive, type);
-  CHECK_EQ(status, E_SUCCESS)
-      << "Failed dnnLayoutCreateFromPrimitive with status "
-      << status << " for buffer: " << this->name << "\n";
-
-  if (this->layout_usr)
-    this->create_conversions();
-}
-
-template <typename Dtype>
-void MKLMemoryDescriptorBase<Dtype>::create_user_layout(
-    size_t dimension, const size_t size[], const size_t strides[]) {
-  int status;
-  if (this->layout_usr) {
-    status = dnnLayoutDelete<Dtype>(this->layout_usr);
-    CHECK_EQ(status, E_SUCCESS);
-  }
-
-  status = dnnLayoutCreate<Dtype>(
-      &this->layout_usr, dimension, size, strides);
-  CHECK_EQ(status, E_SUCCESS) << "Failed dnnLayoutCreate with status "
-      << status << " for buffer: " << this->name << "\n";
-
-  if (this->layout_int)
-    this->create_conversions();
-}
-
-template <typename Dtype>
-void MKLMemoryDescriptorBase<Dtype>::create_layouts(
-    const dnnPrimitive_t primitive, dnnResourceType_t type,
-    size_t dimension, const size_t size[], const size_t strides[]) {
-  this->create_internal_layout(primitive, type);
-  this->create_user_layout(dimension, size, strides);
-}
-
-
-template <typename Dtype>
-void MKLMemoryDescriptorBase<Dtype>::convert_from_prv(void* cpu_ptr) {
-  CHECK(cpu_ptr);
-  CHECK(this->convert_from_int);
-  int status;
-  void *convert_resources[dnnResourceNumber];
-
-  convert_resources[dnnResourceFrom] = this->prv_ptr();
-  convert_resources[dnnResourceTo]   = cpu_ptr;
-  status = dnnExecute<Dtype>(this->convert_from_int, convert_resources);
-  CHECK_EQ(status, 0) << "Conversion from prv failed with status " << status;
-}
-
-template <typename Dtype>
-void MKLMemoryDescriptorBase<Dtype>::convert_to_prv(void* cpu_ptr) {
-  CHECK(cpu_ptr);
-  CHECK(this->convert_to_int);
-  int status;
-  void *convert_resources[dnnResourceNumber];
-
-  convert_resources[dnnResourceFrom] = cpu_ptr;
-  convert_resources[dnnResourceTo]   = this->prv_ptr();
-  status = dnnExecute<Dtype>(this->convert_to_int, convert_resources);
-  CHECK_EQ(status, 0) << "Conversion from prv failed with status " << status;
-}
-
-
-template <typename Dtype>
-bool MKLMemoryDescriptorBase<Dtype>::layout_compare(
-  std::shared_ptr<PrvMemDescr> other) {
-  CHECK_EQ(other->get_descr_type(),
-              PrvMemDescr::PRV_DESCR_MKL2017);
-  std::shared_ptr<MKLMemoryDescriptorBase<Dtype> >other_descr =
-    std::static_pointer_cast<MKLMemoryDescriptorBase<Dtype> >
-    (other);
-
-  if (dnnLayoutCompare<Dtype>(other_descr->layout_int,
-      this->layout_int))
-    return true;
-  else
-    return false;
-}
-
-template <typename Dtype>
-void MKLMemoryDescriptorBase<Dtype>::convert_from_other(
-  std::shared_ptr<PrvMemDescr> other) {
-    std::shared_ptr<MKLMemoryDescriptorBase<Dtype> > other_descr =
-        std::static_pointer_cast<MKLMemoryDescriptorBase<Dtype> >
-            (other);
-
-  int status;
-  dnnPrimitive_t convert;
-  status = dnnConversionCreate<Dtype>(&convert,
-    other_descr->layout_int, this->layout_int);
-
-  void *convert_resources[dnnResourceNumber];
-  convert_resources[dnnResourceFrom] = other_descr->prv_ptr();
-  convert_resources[dnnResourceTo]   = this->prv_ptr();
-  status = dnnExecute<Dtype>(convert, convert_resources);
-  CHECK_EQ(status, 0) << "Conversion from other failed with status "
-                      << status;
-
-  dnnDelete<Dtype>(convert);
-}
-
-
-template <typename Dtype>
-Dtype* MKLMemoryDescriptor<Dtype>::get_converted_prv(
-    Dtype *cpu_ptr, bool set_prv_ptr, const TBlob &blob) {
-  Dtype* prv_ptr = NULL;
-  std::shared_ptr<MKLMemHolder> dnn_chunk = NULL;
-#if MKL_EXPERIMENTAL == 1
-  dnn_chunk = blob.Mkl_mem_;
-#endif
-#if MKL_EXPERIMENTAL == 1
-  if (dnn_chunk != NULL)
-    prv_ptr = static_cast<Dtype*>(dnn_chunk->prv_data());
-#endif
-
-  if (this->convert_to_int != NULL) {
-#if MKL_EXPERIMENTAL == 1
-    int status;
-    void *convert_resources[dnnResourceNumber];
-#endif
-    if (prv_ptr == NULL) {
-      this->allocate();
-      this->convert_to_prv(cpu_ptr);
-#if MKL_EXPERIMENTAL == 1
-      if (set_prv_ptr) {
-        dnn_chunk->set_prv_descriptor(this->get_shared_ptr(), true);
-      }
-#endif
-      return this->internal_ptr;
-    }
-#if MKL_EXPERIMENTAL == 1
-    if (prv_ptr != NULL)  {
-      std::shared_ptr<MKLData<Dtype> > current_descr =
-        op::mkl_get_mem_desc<Dtype>(dnn_chunk);
-      if (!dnnLayoutCompare<Dtype>(current_descr->layout_int,
-        this->layout_int)) {
-        if (this->convert_prv2prv) {
-          CHECK_EQ(dnnLayoutCompare<Dtype>(
-            this->descr_prv2prv_conversion->layout_int,
-            this->layout_int), 0);
-          status = 0;
-        } else {
-          status = dnnConversionCreate<Dtype>(&this->convert_prv2prv,
-            current_descr->layout_int, this->layout_int);
-          if (status == 0)
-            this->descr_prv2prv_conversion = current_descr;
-        }
-        if (status != 0) {
-          this->allocate();
-          convert_resources[dnnResourceFrom] = cpu_ptr;
-          convert_resources[dnnResourceTo] =
-            reinterpret_cast<void*>(this->internal_ptr);
-          status = dnnExecute<Dtype>(this->convert_to_int, convert_resources);
-          CHECK_EQ(status, 0) << "Conversion failed with status " << status;
-        } else {
-          this->allocate();
-          convert_resources[dnnResourceFrom] = reinterpret_cast<void*>(prv_ptr);
-          convert_resources[dnnResourceTo] =
-            reinterpret_cast<void*>(this->internal_ptr);
-          status = dnnExecute<Dtype>(this->convert_prv2prv, convert_resources);
-          CHECK_EQ(status, 0) << "Conversion failed with status " << status;
-        }
-        if (set_prv_ptr) {
-          dnn_chunk->set_prv_descriptor(this->get_shared_ptr(), true);
-        }
-        return this->internal_ptr;
-      } else if (current_descr.get() != this) {
-        // MKL_DLOG(INFO) << "layout OK                 "
-        //  << current_descr->name << " == " << this->name;
-      }
-    }
-#endif
-    return const_cast<Dtype *>(prv_ptr);
-  } else {
-    if (prv_ptr != NULL) {
-#if MKL_EXPERIMENTAL == 1
-      std::shared_ptr<MKLMemoryDescriptorBase<float> > other_descr =
-        std::static_pointer_cast<MKLMemoryDescriptorBase<float> >
-        (dnn_chunk->prv_descriptor_);
-      dnn_chunk->check_and_prv_to_cpu(cpu_ptr);
-#endif
-      // printf("get_converted_prv release %s\n", other_descr->name.c_str());
-    }
-  }
-  return cpu_ptr;
-}
-
-template <typename Dtype>
-void* MKLMemoryDescriptor<Dtype>::get_output_ptr(Dtype *data_ptr,
-  std::shared_ptr<MKLMemoryDescriptor<Dtype> > self_ptr, const TBlob &blob, bool in_place) {
-#if MKL_EXPERIMENTAL == 1
-  std::shared_ptr<MKLMemHolder> dnn_chunk = blob.Mkl_mem_;
-#endif
-  if (this->conversion_needed()) {
-    void * prv_ptr =  this->prv_ptr();
-#if MKL_EXPERIMENTAL == 1
-    if (!in_place) {
-      dnn_chunk->set_prv_descriptor(self_ptr);
-    } else {
-      Dtype * blob_prv = op::mkl_prv_data<Dtype>(blob);
-      if (blob_prv != NULL)
-        return blob_prv;
-    }
-#endif
-    return prv_ptr;
-  } else {
-#if MKL_EXPERIMENTAL == 1
-    std::shared_ptr<MKLMemoryDescriptorBase<float> > other_descr =
-      std::static_pointer_cast<MKLMemoryDescriptorBase<float> >
-      (dnn_chunk->prv_descriptor_);
-    dnn_chunk->check_and_prv_to_cpu(data_ptr);
-#endif
-    return data_ptr;
-  }
-}
-
-template class MKLMemoryDescriptor<double>;
-template class MKLMemoryDescriptor<float>;
-
-template class MKLMemoryDescriptorBase<float>;
-template class MKLMemoryDescriptorBase<double>;
-}  // namespace mxnet
-#endif
diff --git a/src/operator/mkl/mkl_memory.h b/src/operator/mkl/mkl_memory.h
deleted file mode 100644
index 13f1fd27b12b..000000000000
--- a/src/operator/mkl/mkl_memory.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_memory.cc
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_MEMORY_H_
-#define MXNET_OPERATOR_MKL_MKL_MEMORY_H_
-
-#include <string>
-#include <vector>
-#include <memory>
-
-
-namespace mxnet {
-// Base class
-struct PrvMemDescr {
-  virtual void convert_from_prv(void* cpu_ptr) = 0;
-  virtual void convert_to_prv(void* cpu_ptr) = 0;
-  virtual void convert_from_other(std::shared_ptr<PrvMemDescr> other) = 0;
-  virtual void* prv_ptr(bool allocate_when_uninit = true) = 0;
-  // returns true for matching layouts
-  virtual bool layout_compare(std::shared_ptr<PrvMemDescr> other) = 0;
-  virtual size_t prv_count() = 0;
-  virtual size_t prv_size() = 0;
-  // This might help using prv_ptr_ by different accelerators/engines
-  enum PrvDescrType {
-    PRV_DESCR_MKL2017,
-    PRV_DESCR_MKLDNN
-  };
-  virtual PrvDescrType get_descr_type() = 0;
-};
-
-#if MKL_EXPERIMENTAL == 1
-// Currently HEAD_AT_PRV do not free CPU data
-enum SyncedHead {
-  HEAD_AT_CPU,
-  HEAD_AT_PRV,
-};
-struct MKLMemHolder {
-  SyncedHead head_;
-  std::shared_ptr<PrvMemDescr> prv_descriptor_;
-  bool  b_disable_prv_2_cpu;
-  bool  b_eager_mode;
-  void disable_prv_2_cpu(bool flag) {
-    b_disable_prv_2_cpu = flag;
-  }
-  void set_eager_mode(bool eager_mode) {
-    b_eager_mode = eager_mode;
-  }
-  void set_prv_descriptor(std::shared_ptr<PrvMemDescr> descriptor, bool same_data = false) {
-    head_ = HEAD_AT_PRV;
-    prv_descriptor_ = descriptor;
-  }
-  std::shared_ptr<PrvMemDescr> get_prv_descriptor() {
-    return  prv_descriptor_;
-  }
-  bool head_at_prv() {
-    return (head_ == HEAD_AT_PRV) ? true : false;
-  }
-  void* prv_data(bool allocate_when_uninit = true) {
-    if (head_ != HEAD_AT_PRV) {
-      return NULL;
-    }
-    if (prv_descriptor_ == NULL) {
-      LOG(FATAL) << " prv_descriptor_  is NULL";
-    }
-    CHECK(prv_descriptor_.get());
-    return reinterpret_cast<void*>(prv_descriptor_->prv_ptr(allocate_when_uninit));
-  }
-
-  int prv_count() {
-    if (head_ != HEAD_AT_PRV) {
-      return 0;
-    }
-    if (prv_descriptor_ == NULL) {
-      LOG(FATAL) << " prv_descriptor_  is NULL";
-    }
-    CHECK(prv_descriptor_.get());
-    return prv_descriptor_->prv_count();
-  }
-  static std::shared_ptr<MKLMemHolder> create() {
-    return std::make_shared<MKLMemHolder>();
-  }
-  void  check_and_prv_to_cpu(void *dptr_) {
-    if (!b_disable_prv_2_cpu && head_ == HEAD_AT_PRV) {
-      CHECK(prv_descriptor_ != nullptr);
-      prv_descriptor_->convert_from_prv(dptr_);
-      // Because operator use CPU & maybe change it, change to CPU Flag
-      head_ = HEAD_AT_CPU;
-    }
-    if (b_disable_prv_2_cpu) {
-      b_disable_prv_2_cpu = false;
-    }
-  }
-  MKLMemHolder() :
-    head_(HEAD_AT_CPU), prv_descriptor_(nullptr),
-    b_disable_prv_2_cpu(false), b_eager_mode(false) {}
-};
-#else
-struct MKLMemHolder {
- public:
-  virtual std::shared_ptr<PrvMemDescr> get_prv_descriptor() = 0;
-};
-#endif
-
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_MEMORY_H_
diff --git a/src/operator/mkl/mkl_pooling-inl.h b/src/operator/mkl/mkl_pooling-inl.h
deleted file mode 100644
index 5662a61aebd3..000000000000
--- a/src/operator/mkl/mkl_pooling-inl.h
+++ /dev/null
@@ -1,357 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_pooling-inl.h
-* \brief
-* \author zhenlin.luo@intel.com
-*         lingyan.guo@intel.com
-*
-*******************************************************************************/
-
-#ifndef MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_
-#include <vector>
-#include <string>
-#include <utility>
-#include "../operator_common.h"
-#include "../nn/pooling-inl.h"
-#include "./mkl_util-inl.h"
-
-namespace mxnet {
-namespace op {
-
-
-template<typename xpu, typename DType>
-class MKLPoolingOp : public Operator {
- public:
-  static std::string getName() {
-    return "MKLPoolingOp";
-  }
-  explicit MKLPoolingOp(PoolingParam p) {
-    poolingFwd = static_cast<dnnPrimitive_t>(NULL);
-    poolingBwd = static_cast<dnnPrimitive_t>(NULL);
-    max_idx_data = static_cast<DType*>(NULL);
-    fwd_top_data = MKLData<DType>::create();
-    fwd_bottom_data = MKLData<DType>::create();
-    bwd_top_diff = MKLData<DType>::create();
-    bwd_bottom_diff = MKLData<DType>::create();
-    this->param_ = p;
-    init_mkldnn_ = false;
-  }
-  virtual ~MKLPoolingOp() {
-    if (poolingFwd != NULL) {
-      dnnDelete<DType>(poolingFwd);
-      poolingFwd = NULL;
-    }
-    if (poolingBwd != NULL) {
-      dnnDelete<DType>(poolingBwd);
-      poolingBwd = NULL;
-    }
-    if (max_idx_data != NULL) {
-      dnnReleaseBuffer<DType>(max_idx_data);
-      max_idx_data = NULL;
-    }
-  }
-
- private:
-  void LayerSetUp(const mshadow::Tensor<xpu, 4, DType> &data,
-                  const mshadow::Tensor<xpu, 4, DType> &out) {
-    channels_ = data.shape_[1];
-    height_ = data.shape_[2];
-    width_ = data.shape_[3];
-    num_ = data.shape_[0];
-    global_pooling_ = param_.global_pool;
-    if (global_pooling_) {
-      kernel_h_ = height_;
-      kernel_w_ = width_;
-    } else {
-      kernel_h_ = param_.kernel[0];
-      kernel_w_ = param_.kernel[1];
-    }
-    CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
-    CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
-    pad_h_ = param_.pad[0];
-    pad_w_ = param_.pad[1];
-    if (global_pooling_) {
-      stride_h_ = stride_w_ = 1;
-    } else {
-      stride_h_ = param_.stride[0];
-      stride_w_ = param_.stride[1];
-    }
-    if (global_pooling_) {
-      CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1)
-        << "With Global_pooling: true; only pad = 0 and stride = 1";
-    }
-    if (pad_h_ != 0 || pad_w_ != 0) {
-      CHECK(param_.pool_type == pool_enum::kAvgPooling
-          || param_.pool_type == pool_enum::kMaxPooling)
-        << "Padding implemented only for average and max pooling.";
-      CHECK_LT(pad_h_, kernel_h_);
-      CHECK_LT(pad_w_, kernel_w_);
-    }
-    pooled_height_ = out.shape_[2];
-    pooled_width_ = out.shape_[3];
-
-    size_t dim = 4;
-    size_t src_sizes[4], src_strides[4];
-    size_t dst_sizes[4], dst_strides[4];
-    src_sizes[0] = width_;
-    src_sizes[1] = height_;
-    src_sizes[2] = channels_;
-    src_sizes[3] = num_;
-    src_strides[0] = 1;
-    src_strides[1] = src_sizes[0];
-    src_strides[2] = src_sizes[0] * src_sizes[1];
-    src_strides[3] = src_sizes[0] * src_sizes[1] * src_sizes[2];
-    dst_sizes[0] = pooled_width_;
-    dst_sizes[1] = pooled_height_;
-    dst_sizes[2] = src_sizes[2];
-    dst_sizes[3] = src_sizes[3];
-    dst_strides[0] = 1;
-    dst_strides[1] = dst_sizes[0];
-    dst_strides[2] = dst_sizes[0] * dst_sizes[1];
-    dst_strides[3] = dst_sizes[0] * dst_sizes[1] * dst_sizes[2];
-    src_offset[0] = -pad_w_;
-    src_offset[1] = -pad_h_;
-    src_offset[2] = -pad_w_;
-    src_offset[3] = -pad_h_;
-    kernel_stride[0] = stride_w_;
-    kernel_stride[1] = stride_h_;
-    kernel_size[0] = kernel_w_;
-    kernel_size[1] = kernel_h_;
-
-    // Names are for debugging only
-    fwd_bottom_data->name = "fwd_bottom_data   @ " + getName();
-    fwd_top_data->name = "fwd_top_data      @ " + getName();
-    bwd_top_diff->name = "bwd_top_diff      @ " + getName();
-    bwd_bottom_diff->name = "bwd_bottom_diff   @ " + getName();
-
-    fwd_bottom_data->create_user_layout(dim, src_sizes, src_strides);
-    fwd_top_data->create_user_layout(dim, dst_sizes, dst_strides);
-    bwd_bottom_diff->create_user_layout(dim, src_sizes, src_strides);
-    bwd_top_diff->create_user_layout(dim, dst_sizes, dst_strides);
-
-    // Primitives will be allocated during the first fwd pass
-    poolingFwd = NULL;
-    poolingBwd = NULL;
-    max_idx_data = NULL;
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1);
-    CHECK_EQ(out_data.size(), 1);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    if (param_.kernel.ndim() >= 3) {
-      LOG(FATAL) << "Not implmented";
-    }
-    Tensor<xpu, 4, DType> data = mkl_experimental_direct_get<xpu, 4, DType>(
-      in_data[pool_enum::kData], s);
-    Tensor<xpu, 4, DType> out = mkl_experimental_direct_get<xpu, 4, DType>(
-      out_data[pool_enum::kOut], s);
-    if (!init_mkldnn_) {
-      LayerSetUp(data, out);
-      init_mkldnn_ = true;
-    }
-    auto first_pass = false;
-    if (poolingFwd == NULL) first_pass = true;
-
-    dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax;
-
-    switch (param_.pool_type) {
-    case pool_enum::kMaxPooling:
-      algorithm = dnnAlgorithmPoolingMax;
-      break;
-    case pool_enum::kAvgPooling:
-      algorithm = dnnAlgorithmPoolingAvgIncludePadding;
-
-      break;
-    default:
-      LOG(FATAL) << "Unknown pooling method.";
-    }
-
-    dnnError_t status;
-    void* pooling_res[dnnResourceNumber];
-
-    void* bottom_data = NULL;
-#if MKL_EXPERIMENTAL == 1
-    bottom_data =
-          reinterpret_cast<void *>(mkl_prv_data<DType>(in_data[pool_enum::kData]));
-#endif
-    dnnBorder_t border_type = dnnBorderZerosAsymm;
-    switch (param_.pooling_convention) {
-    case pool_enum::kFull:
-      border_type = dnnBorderZeros;
-      break;
-    case pool_enum::kValid:
-      border_type = dnnBorderZerosAsymm;
-      break;
-    default:
-      border_type = dnnBorderZerosAsymm;
-      break;
-    }
-    if (NULL == bottom_data) {
-      bottom_data = data.dptr_;
-      if (NULL == poolingFwd) {
-        status = dnnPoolingCreateForward<DType>(&poolingFwd, NULL,
-                                                algorithm, fwd_bottom_data->layout_usr,
-                                                kernel_size, kernel_stride,
-                                                src_offset, border_type);
-      CHECK_EQ(status, E_SUCCESS);
-      // Now create poolingBwd
-      status = dnnPoolingCreateBackward<DType>(&poolingBwd, NULL,
-                                               algorithm, fwd_bottom_data->layout_usr,
-                                               kernel_size, kernel_stride,
-                                               src_offset, border_type);
-      CHECK_EQ(status, E_SUCCESS);
-      }
-    }
-#if MKL_EXPERIMENTAL == 1
-    if (NULL != bottom_data) {
-       if (NULL == poolingFwd) {
-          std::shared_ptr<MKLMemHolder> bottom_data_mem = in_data[pool_enum::kData].Mkl_mem_;
-          std::shared_ptr<PrvMemDescr> bottom_prv_descriptor =
-            bottom_data_mem->get_prv_descriptor();
-          CHECK_EQ(bottom_prv_descriptor->get_descr_type(),
-                   PrvMemDescr::PRV_DESCR_MKL2017);
-          std::shared_ptr<MKLData<DType> > mem_descr
-            = std::static_pointer_cast<MKLData<DType>>(bottom_prv_descriptor);
-          CHECK(mem_descr != nullptr);
-          fwd_bottom_data = mem_descr;
-
-          status = dnnPoolingCreateForward<DType>(&poolingFwd, NULL,
-                                                  algorithm, fwd_bottom_data->layout_int,
-                                                  kernel_size, kernel_stride,
-                                                  src_offset, border_type);
-          CHECK_EQ(status, E_SUCCESS);
-          fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst);
-
-          // Now create poolingBwd
-          status = dnnPoolingCreateBackward<DType>(&poolingBwd, NULL,
-                                                   algorithm, fwd_bottom_data->layout_int,
-                                                   kernel_size, kernel_stride,
-                                                   src_offset, border_type);
-          CHECK_EQ(status, E_SUCCESS);
-          bwd_top_diff->create_internal_layout(poolingFwd, dnnResourceDst);
-          bwd_bottom_diff->create_internal_layout(poolingFwd, dnnResourceSrc);
-        }
-    }
-#endif
-
-    if (first_pass) {
-      dnnLayout_t max_idx_datal = NULL;
-      status = dnnLayoutCreateFromPrimitive<DType>(
-          &max_idx_datal, poolingFwd, dnnResourceWorkspace);
-      CHECK_EQ(status, E_SUCCESS);
-      status = dnnAllocateBuffer<DType>(reinterpret_cast<void**>(&max_idx_data), max_idx_datal);
-      CHECK_EQ(status, E_SUCCESS);
-#if MKL_EXPERIMENTAL == 0
-      fwd_bottom_data->create_internal_layout(poolingFwd, dnnResourceSrc);
-      fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst);
-      bwd_top_diff->create_internal_layout(poolingBwd, dnnResourceDiffDst);
-      bwd_bottom_diff->create_internal_layout(poolingBwd, dnnResourceDiffSrc);
-#endif
-      dnnLayoutDelete<DType>(max_idx_datal);
-      first_pass = false;
-    }
-    pooling_res[dnnResourceSrc] = bottom_data;
-    pooling_res[dnnResourceWorkspace] = max_idx_data;
-
-    pooling_res[dnnResourceDst] = fwd_top_data->get_output_ptr(
-      out.dptr_, fwd_top_data, out_data[pool_enum::kOut]);
-    status = dnnExecute<DType>(poolingFwd, pooling_res);
-    CHECK_EQ(status, E_SUCCESS);
-#if MKL_EXPERIMENTAL == 0
-    if (fwd_top_data->conversion_needed()) {
-      fwd_top_data->convert_from_prv(out.dptr_);
-    }
-#endif
-  }
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    if (!req[0]) {
-      return;
-    }
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1);
-    CHECK_EQ(in_data.size(), 1);
-    CHECK_EQ(out_data.size(), 1);
-    CHECK_EQ(req.size(), 1);
-    CHECK_EQ(in_grad.size(), 1);
-    if (param_.kernel.ndim() >= 3) {
-      LOG(FATAL) << "Not implmented";
-    }
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> grad = mkl_experimental_direct_get<xpu, 4, DType>(
-      out_grad[pool_enum::kOut], s);
-    Tensor<xpu, 4, DType> input_grad = mkl_experimental_direct_get<xpu, 4, DType>(
-      in_grad[pool_enum::kData], s);
-    dnnError_t e;
-    void* pooling_res[dnnResourceNumber];
-    pooling_res[dnnResourceWorkspace] = reinterpret_cast<void *>(max_idx_data);
-
-    pooling_res[dnnResourceDiffDst] =
-      bwd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[pool_enum::kOut]);
-
-    pooling_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr(
-      input_grad.dptr_, bwd_bottom_diff, in_grad[pool_enum::kData]);
-    e = dnnExecute<DType>(poolingBwd, pooling_res);
-    CHECK_EQ(e, E_SUCCESS);
-#if MKL_EXPERIMENTAL == 0
-    if (bwd_bottom_diff->conversion_needed()) {
-      bwd_bottom_diff->convert_from_prv(input_grad.dptr_);
-    }
-#endif
-  }
-
- private:
-  PoolingParam param_;
-  int kernel_h_, kernel_w_;
-  int stride_h_, stride_w_;
-  int pad_h_, pad_w_;
-  int channels_, num_;
-  int height_, width_;
-  int pooled_height_, pooled_width_;
-  bool global_pooling_;
-
- private:
-  size_t kernel_size[2],
-         kernel_stride[4];
-  int src_offset[4];  // 2*(dimension-2)
-  dnnPrimitive_t poolingFwd, poolingBwd;
-  DType *max_idx_data;
-
-  std::shared_ptr<MKLData<DType> > fwd_top_data;
-  std::shared_ptr<MKLData<DType> > fwd_bottom_data;
-  std::shared_ptr<MKLData<DType> > bwd_top_diff;
-  std::shared_ptr<MKLData<DType> > bwd_bottom_diff;
-  bool init_mkldnn_;
-};  // class MKLPoolingOp
-}   // namespace op
-}   // namespace mxnet
-
-#endif  // MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_
diff --git a/src/operator/mkl/mkl_relu-inl.h b/src/operator/mkl/mkl_relu-inl.h
deleted file mode 100644
index 8d7ab5e1e2db..000000000000
--- a/src/operator/mkl/mkl_relu-inl.h
+++ /dev/null
@@ -1,272 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_relu-inl.h
-* \brief
-* \author zhenlin.luo@intel.com
-*         lingyan.guo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_RELU_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_RELU_INL_H_
-
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "../operator_common.h"
-#include "./mkl_util-inl.h"
-
-namespace mxnet {
-namespace op {
-
-template<typename xpu, typename DType>
-class MKLReluOp : public Operator {
- public:
-  static std::string getName() {
-    return "MKLReluOp";
-  }
-  MKLReluOp():
-      reluFwd_(NULL),
-      reluBwd_(NULL) {
-    init_mkldnn_ = false;
-    fwd_top_data_ = MKLData<DType>::create();
-    fwd_bottom_data_ = MKLData<DType>::create();
-    bwd_top_diff_ = MKLData<DType>::create();
-    bwd_bottom_diff_ = MKLData<DType>::create();
-  }
-
-  ~MKLReluOp() {
-    if (reluFwd_ != NULL) {
-      dnnDelete<DType>(reluFwd_);
-      reluFwd_ = NULL;
-    }
-    if (reluBwd_ != NULL) {
-      dnnDelete<DType>(reluBwd_);
-      reluBwd_ = NULL;
-    }
-  }
-
- private:
-  void LayerSetUp(const mshadow::Tensor<xpu, 4, DType> &data,
-                  const mshadow::Tensor<xpu, 4, DType> &out) {
-    size_t dim = 4;
-    size_t *sizes = new size_t[dim];
-    size_t *strides = new size_t[dim];
-    for (size_t d = 0; d < dim; ++d) {
-      (sizes)[d] = data.shape_[dim - 1 - d];
-      (strides)[d] = (d == 0) ? 1 : (strides)[d - 1] * (sizes)[d - 1];
-    }
-    // Names are for debugging only
-    fwd_bottom_data_->name = "fwd_bottom_data   @ " + getName();
-    fwd_top_data_->name = "fwd_top_data      @ " + getName();
-    bwd_bottom_diff_->name = "bwd_bottom_diff   @ " + getName();
-    bwd_top_diff_->name = "bwd_top_diff      @ " + getName();
-    fwd_bottom_data_->create_user_layout(dim, (sizes), (strides));
-    fwd_top_data_->create_user_layout(dim, (sizes), (strides));
-    bwd_bottom_diff_->create_user_layout(dim, (sizes), (strides));
-    bwd_top_diff_->create_user_layout(dim, (sizes), (strides));
-    delete[] sizes;
-    delete[] strides;
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1);
-    CHECK_EQ(out_data.size(), 1);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> data;
-    Tensor<xpu, 4, DType> out;
-    if (in_data[activation::kData].ndim() == 1) {
-      Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], 1, 1, 1);
-      data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_data[activation::kData], dshape, s);
-      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[activation::kOut], dshape, s);
-    } else if (in_data[activation::kData].ndim() == 2) {
-      Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0],
-      in_data[activation::kData].shape_[1], 1, 1);
-      data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_data[activation::kData], dshape, s);
-      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[activation::kOut], dshape, s);
-    } else if (in_data[activation::kData].ndim() == 3) {
-      Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0],
-        in_data[activation::kData].shape_[1],
-        in_data[activation::kData].shape_[2], 1);
-      data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_data[activation::kData], dshape, s);
-      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[activation::kOut], dshape, s);
-    } else {
-      data = mkl_experimental_direct_get<xpu, 4, DType>(in_data[activation::kData], s);
-      out = mkl_experimental_direct_get<xpu, 4, DType>(out_data[activation::kOut], s);
-    }
-    if (!init_mkldnn_) {
-      LayerSetUp(data, out);
-      init_mkldnn_ = true;
-    }
-    void* bottom_data = NULL;
-#if MKL_EXPERIMENTAL == 1
-    bottom_data =
-          reinterpret_cast<void *>(mkl_prv_data<DType>(in_data[activation::kData]));
-#endif
-#if MKL_EXPERIMENTAL == 1
-    if (bottom_data != NULL) {
-      if (reluFwd_ == NULL) {
-      std::shared_ptr<MKLData<DType> > mem_descr =
-        mkl_get_mem_desc<DType>(in_data[activation::kData].Mkl_mem_);
-      DType negative_slope = 0;
-      dnnError_t e;
-      e = dnnReLUCreateForward<DType>(&reluFwd_, NULL, mem_descr->layout_int,
-                                      negative_slope);
-      CHECK_EQ(e, E_SUCCESS);
-      e = dnnReLUCreateBackward<DType>(&reluBwd_, NULL, mem_descr->layout_int,
-                                       mem_descr->layout_int, negative_slope);
-      CHECK_EQ(e, E_SUCCESS);
-
-      fwd_bottom_data_ = mem_descr;
-      fwd_top_data_->create_internal_layout(reluFwd_, dnnResourceDst);
-      bwd_top_diff_->create_internal_layout(reluFwd_, dnnResourceDst);
-      bwd_bottom_diff_->create_internal_layout(reluFwd_, dnnResourceSrc);
-      }
-    }
-#endif
-    if (bottom_data  == NULL) {
-      bottom_data = data.dptr_;
-      if (reluFwd_ == NULL) {
-        dnnError_t e;
-        DType negative_slope = 0;
-        e = dnnReLUCreateForward<DType>(&reluFwd_, NULL,
-                                        fwd_bottom_data_->layout_usr, negative_slope);
-        CHECK_EQ(e, E_SUCCESS);
-        e = dnnReLUCreateBackward<DType>(&reluBwd_, NULL,
-                                         fwd_bottom_data_->layout_usr, fwd_bottom_data_->layout_usr,
-                                         negative_slope);
-        CHECK_EQ(e, E_SUCCESS);
-      }
-    }
-    dnnError_t e;
-    void* relu_res[dnnResourceNumber];
-    relu_res[dnnResourceSrc] = bottom_data;
-
-    relu_res[dnnResourceDst] = fwd_top_data_->get_output_ptr(
-      out.dptr_, fwd_top_data_, out_data[activation::kOut], (data.dptr_ == out.dptr_));
-    e = dnnExecute<DType>(reluFwd_, relu_res);
-    CHECK_EQ(e, E_SUCCESS);
-#if MKL_EXPERIMENTAL == 0
-    if (fwd_top_data_->conversion_needed()) {
-      fwd_top_data_->convert_from_prv(out.dptr_);
-    }
-#endif
-  }
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    if (!req[0]) {
-      return;
-    }
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1);
-    CHECK(in_data.size() == 1 && in_grad.size() == 1);
-    CHECK_EQ(req.size(), 1);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> m_out_grad;
-    Tensor<xpu, 4, DType> m_out_data;
-    Tensor<xpu, 4, DType> m_in_grad;
-
-    if (out_grad[activation::kOut].ndim() == 1) {
-      Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0], 1, 1, 1);
-      m_out_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_grad[activation::kOut], dshape, s);
-      m_out_data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[activation::kOut], dshape, s);
-      m_in_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_grad[activation::kData], dshape, s);
-    } else if (out_grad[activation::kOut].ndim() == 2) {
-      Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0],
-                               out_grad[activation::kOut].shape_[1], 1, 1);
-      m_out_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_grad[activation::kOut], dshape, s);
-      m_out_data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[activation::kOut], dshape, s);
-      m_in_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_grad[activation::kData], dshape, s);
-    } else if (out_grad[activation::kOut].ndim() == 3) {
-      Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0],
-        out_grad[activation::kOut].shape_[1],
-        out_grad[activation::kOut].shape_[2], 1);
-      m_out_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_grad[activation::kOut], dshape, s);
-      m_out_data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[activation::kOut], dshape, s);
-      m_in_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_grad[activation::kData], dshape, s);
-    } else {
-      m_out_grad = mkl_experimental_direct_get<xpu, 4, DType>(out_grad[activation::kOut], s);
-      m_out_data = mkl_experimental_direct_get<xpu, 4, DType>(out_data[activation::kOut], s);
-      m_in_grad = mkl_experimental_direct_get<xpu, 4, DType>(in_grad[activation::kData], s);
-    }
-    dnnError_t e;
-    void* relu_res[dnnResourceNumber];
-
-    void* bottom_data = NULL;
-#if MKL_EXPERIMENTAL == 1
-    bottom_data = reinterpret_cast<void *>(mkl_prv_data<DType>(out_data[activation::kOut]));
-#endif
-    if (NULL == bottom_data) {
-      bottom_data = reinterpret_cast<void *>(const_cast<DType*>(m_out_data.dptr_));
-    }
-    relu_res[dnnResourceSrc] = bottom_data;
-    relu_res[dnnResourceDiffDst] = bwd_top_diff_->get_converted_prv(m_out_grad.dptr_,
-                true, out_grad[activation::kOut]);
-    relu_res[dnnResourceDiffSrc] = bwd_bottom_diff_->get_output_ptr(
-      m_in_grad.dptr_, bwd_bottom_diff_, in_grad[activation::kData]);
-    e = dnnExecute<DType>(reluBwd_, relu_res);
-    CHECK_EQ(e, E_SUCCESS);
-#if MKL_EXPERIMENTAL == 0
-    if (bwd_bottom_diff_->conversion_needed()) {
-      bwd_bottom_diff_->convert_from_prv(m_in_grad.dptr_);
-    }
-#endif
-  }
-
- private:
-  bool init_mkldnn_;
-  std::shared_ptr<MKLData<DType> > fwd_top_data_;
-  std::shared_ptr<MKLData<DType> > fwd_bottom_data_;
-  std::shared_ptr<MKLData<DType> > bwd_top_diff_;
-  std::shared_ptr<MKLData<DType> > bwd_bottom_diff_;
-  dnnPrimitive_t reluFwd_, reluBwd_;
-};  // class MKLReluOp
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_RELU_INL_H_
diff --git a/src/operator/mkl/mkl_util-inl.h b/src/operator/mkl/mkl_util-inl.h
deleted file mode 100644
index 4ad786a2ce93..000000000000
--- a/src/operator/mkl/mkl_util-inl.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_util-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_
-#include <vector>
-#define MKLDNN_CALL(func)                                                               \
-  {                                                                                     \
-    dnnError_t status = (func);                                                                \
-    CHECK_EQ(status, E_SUCCESS) << "MKL DNN call failed (status: " << status << ").";           \
-  }
-
-
-namespace mxnet {
-namespace op {
-
-#if MKL_EXPERIMENTAL == 1
-  template<typename DType>
-  inline DType * mkl_prv_data(const TBlob &b) {
-    std::shared_ptr<MKLMemHolder> bottom_data_mem = b.Mkl_mem_;
-    bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv();
-    if (mem_valid) {
-      return reinterpret_cast<DType*>(bottom_data_mem->prv_data());
-    }
-    return NULL;
-  }
-
-  template<typename DType>
-  inline int mkl_prv_count(const TBlob &b) {
-    std::shared_ptr<MKLMemHolder> bottom_data_mem = b.Mkl_mem_;
-    bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv();
-    if (mem_valid) {
-      return bottom_data_mem->prv_count();
-    }
-    return 0;
-  }
-#endif
-  inline void mkl_set_priv_flag(const TBlob &b) {
-#if MKL_EXPERIMENTAL == 1
-    std::shared_ptr<MKLMemHolder> bottom_data_mem = b.Mkl_mem_;
-    bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv();
-    if (mem_valid) {
-      bottom_data_mem->disable_prv_2_cpu(true);
-    }
-#endif
-  }
-#if MKL_EXPERIMENTAL == 1
-  template<typename DType>
-  inline std::shared_ptr<MKLData<DType> > mkl_get_mem_desc(
-    const std::shared_ptr<MKLMemHolder> data_mem) {
-    std::shared_ptr<PrvMemDescr> prv_descriptor =
-      data_mem->get_prv_descriptor();
-    CHECK_EQ(prv_descriptor->get_descr_type(),
-      PrvMemDescr::PRV_DESCR_MKL2017);
-    std::shared_ptr<MKLData<DType> > mem_descr
-      = std::static_pointer_cast<MKLData<DType>>
-      (prv_descriptor);
-    CHECK(mem_descr != NULL);
-    return mem_descr;
-  }
-#endif
-  template<typename xpu, int dim, typename DType>
-  inline  mshadow::Tensor<xpu, dim, DType> mkl_experimental_direct_get(
-    const TBlob &b, mshadow::Stream<xpu> *s) {
-    mkl_set_priv_flag(b);
-    return b.get<xpu, dim, DType>(s);
-  }
-  template<typename xpu, int dim, typename DType>
-  inline  mshadow::Tensor<xpu, dim, DType> mkl_experimental_direct_get_with_shape(
-    const TBlob &b, const mshadow::Shape<dim> &shape, mshadow::Stream<xpu> *s) {
-    mkl_set_priv_flag(b);
-    return b.get_with_shape<xpu, dim, DType>(shape, s);
-  }
-}  // namespace op
-#if MKL_EXPERIMENTAL == 1
-inline void mkl_tblobs_prv_to_cpu(const std::vector<TBlob> &data) {
-  for (size_t i = 0; i < data.size(); i++) {
-    std::shared_ptr<MKLMemHolder> mem_holder = data[i].Mkl_mem_;
-    if (mem_holder != nullptr && mem_holder->b_eager_mode) {
-      mem_holder->check_and_prv_to_cpu(data[i].dptr_);
-    }
-  }
-}
-inline void mkl_set_tblob_eager_mode(const TBlob &data) {
-  std::shared_ptr<MKLMemHolder> mem_holder = data.Mkl_mem_;
-  if (mem_holder != nullptr) {
-    mem_holder->set_eager_mode(true);
-  }
-}
-#endif
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_
diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h
index ac8b747f0f39..a440f97e1382 100644
--- a/src/operator/nn/activation-inl.h
+++ b/src/operator/nn/activation-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file activation-inl.h
  * \brief Activation operator
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 
 #ifndef MXNET_OPERATOR_NN_ACTIVATION_INL_H_
@@ -37,6 +37,7 @@
 #include <utility>
 #include "../operator_common.h"
 #include "../mxnet_op.h"
+#include "../mshadow_op.h"
 
 namespace mxnet {
 namespace op {
@@ -45,6 +46,7 @@ namespace op {
 namespace activation {
 enum ActivationOpInputs {kData};
 enum ActivationOpOutputs {kOut};
+enum ActivationOpResource {kTempSpace};
 enum ActivationOpType {kReLU, kSigmoid, kTanh, kSoftReLU};
 }  // activation
 
@@ -59,160 +61,148 @@ struct ActivationParam : public dmlc::Parameter<ActivationParam> {
     .add_enum("softrelu", activation::kSoftReLU)
     .describe("Activation function to be applied.");
   }
-};
 
-/**
- * \brief This is the implementation of activation operator.
- * \tparam xpu The device that the op will be executed on.
- */
-template<typename xpu, typename ForwardOp, typename BackwardOp, typename DType>
-class ActivationOp : public Operator {
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    const TBlob& input = in_data[activation::kData];
-    const size_t sz = input.shape_.Size();
-    if (sz) {
-      MXNET_ASSIGN_REQ_SWITCH(req[activation::kOut], Req, {
-        mxnet_op::Kernel<mxnet_op::op_with_req<ForwardOp, Req>, xpu>::Launch(
-          s, sz,
-          out_data[activation::kOut].dptr<DType>(),
-          input.dptr<DType>());
-      });
-    }
+  bool operator==(const ActivationParam& other) const {
+    return this->act_type == other.act_type;
   }
+};
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK(in_data.size() == 1 && in_grad.size() == 1);
-    CHECK_EQ(req.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    const TBlob& m_out_grad = out_grad[activation::kOut];
-    const TBlob& m_out_data = out_data[activation::kOut];
-    const TBlob&  m_in_grad = in_grad[activation::kData];
-    const size_t sz = m_out_data.shape_.Size();
-    if (sz) {
-      MXNET_ASSIGN_REQ_SWITCH(req[activation::kData], Req, {
-        mxnet_op::Kernel<mxnet_op::op_with_req<
-          mxnet::op::mxnet_op::backward_grad_tuned<BackwardOp>, Req>, xpu>::Launch(
-          s, sz,
-          m_in_grad.dptr<DType>(),
-          m_out_grad.dptr<DType>(),
-          m_out_data.dptr<DType>());
-      });
-    }
-  }
-};  // class ActivationOp
-
-// Declare Factory function, used for dispatch specialization
-template<typename xpu>
-Operator* CreateOp(ActivationParam type, int dtype, const TShape& dshape);
+}  // namespace op
+}  // namespace mxnet
 
-#if DMLC_USE_CXX11
-class ActivationProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
+namespace std {
+template<>
+struct hash<mxnet::op::ActivationParam> {
+  size_t operator()(const mxnet::op::ActivationParam& val) {
+    return val.act_type;
   }
+};
+}  // namespace std
+
+namespace mxnet {
+namespace op {
 
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
+template<typename xpu, typename ForwardOp, typename BackwardOp, typename DType>
+void ActivationForward(const OpContext &ctx, const TBlob &in_data,
+                       const OpReqType &req, const TBlob &out_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const size_t sz = in_data.shape_.Size();
+  if (sz) {
+    MXNET_ASSIGN_REQ_SWITCH(req, Req, {
+      mxnet_op::Kernel<mxnet_op::op_with_req<ForwardOp, Req>, xpu>::Launch(
+        s, sz,
+        out_data.dptr<DType>(),
+        in_data.dptr<DType>());
+    });
   }
+}
 
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 1U) << "Input:[data]";
-    const TShape &dshape = in_shape->at(activation::kData);
-    if (dshape.ndim() == 0) return false;
-    out_shape->clear();
-    out_shape->push_back(dshape);
-    return true;
+template<typename xpu, typename ForwardOp, typename BackwardOp, typename DType>
+void ActivationBackward(const OpContext &ctx, const TBlob &out_grad,
+                        const TBlob &out_data, const OpReqType &req,
+                        const TBlob &in_grad) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const size_t sz = out_data.shape_.Size();
+  if (sz) {
+    MXNET_ASSIGN_REQ_SWITCH(req, Req, {
+      mxnet_op::Kernel<mxnet_op::op_with_req<
+        mxnet::op::mxnet_op::backward_grad_tuned<BackwardOp>, Req>, xpu>::Launch(
+        s, sz,
+        in_grad.dptr<DType>(),
+        out_grad.dptr<DType>(),
+        out_data.dptr<DType>());
+    });
   }
+}
 
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_GE(in_type->size(), 1U);
-    int dtype = (*in_type)[0];
-    CHECK_NE(dtype, -1) << "First input must have specified type";
-    for (index_t i = 0; i < in_type->size(); ++i) {
-      if ((*in_type)[i] == -1) {
-          (*in_type)[i] = dtype;
-      } else {
-        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
-      }
+template<typename xpu>
+void ActivationComputeImpl(const ActivationParam &param, const OpContext &ctx,
+                           const TBlob &input, OpReqType req, const TBlob &output) {
+  MSHADOW_REAL_TYPE_SWITCH(input.type_flag_, DType, {
+    switch (param.act_type) {
+      case activation::kReLU:
+        ActivationForward<xpu, mshadow_op::relu, mshadow_op::relu_grad, DType>(
+            ctx, input, req, output);
+        break;
+      case activation::kSigmoid:
+        ActivationForward<xpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>(
+            ctx, input, req, output);
+        break;
+      case activation::kTanh:
+        ActivationForward<xpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>(
+            ctx, input, req, output);
+        break;
+      case activation::kSoftReLU:
+        ActivationForward<xpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(
+            ctx, input, req, output);
+        break;
+      default:
+        LOG(FATAL) << "unknown activation type";
     }
-    out_type->clear();
-    out_type->push_back(dtype);
-    return true;
-  }
+  });
+}
 
-  OperatorProperty* Copy() const override {
-    auto ptr = new ActivationProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
+template<typename xpu>
+void ActivationGradComputeImpl(const ActivationParam &param, const OpContext &ctx,
+                               const TBlob &out_grad, const TBlob &out_data,
+                               OpReqType req, const TBlob &output) {
+  MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, {
+    switch (param.act_type) {
+      case activation::kReLU:
+        ActivationBackward<xpu, mshadow_op::relu, mshadow_op::relu_grad, DType>(
+            ctx, out_grad, out_data, req, output);
+        break;
+      case activation::kSigmoid:
+        ActivationBackward<xpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>(
+            ctx, out_grad, out_data, req, output);
+        break;
+      case activation::kTanh:
+        ActivationBackward<xpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>(
+            ctx, out_grad, out_data, req, output);
+        break;
+      case activation::kSoftReLU:
+        ActivationBackward<xpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(
+            ctx, out_grad, out_data, req, output);
+        break;
+      default:
+        LOG(FATAL) << "unknown activation type";
+    }
+  });
+}
 
-  std::string TypeString() const override {
-    return "Activation";
-  }
+template<typename xpu>
+void ActivationCompute(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx,
+    const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  ActivationComputeImpl<xpu>(param, ctx, inputs[0], req[0], outputs[0]);
+}
 
-  // decalre dependency and inplace optimization options
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
+template<typename xpu>
+void ActivationGradCompute(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx,
+    const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
 #if MXNET_USE_CUDNN == 1
-    return {out_grad[activation::kOut], out_data[activation::kOut], in_data[activation::kData]};
+  CHECK_EQ(inputs.size(), 3U);
 #else
-    return {out_grad[activation::kOut], out_data[activation::kOut]};
-#endif  // MXNET_USE_CUDNN
-  }
+  CHECK_EQ(inputs.size(), 2U);
+#endif
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  ActivationGradComputeImpl<xpu>(param, ctx, inputs[0], inputs[1], req[0], outputs[0]);
+}
 
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-    return {{out_grad[activation::kOut], in_grad[activation::kData]}};
-  }
-
-  std::vector<std::pair<int, void*> > ForwardInplaceOption(
-    const std::vector<int> &in_data,
-    const std::vector<void*> &out_data) const override {
-    return {{in_data[activation::kData], out_data[activation::kOut]}};
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return NULL;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
-
- private:
-  ActivationParam param_;
-};
-#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_ACTIVATION_INL_H_
diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc
index 401a9e3eaa56..0da644cb1f70 100644
--- a/src/operator/nn/activation.cc
+++ b/src/operator/nn/activation.cc
@@ -17,69 +17,130 @@
  * under the License.
  */
 
+
 /*!
  * Copyright (c) 2015 by Contributors
  * \file activation.cc
  * \brief activation op
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 #include "./activation-inl.h"
 #include "../mshadow_op.h"
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "../mkl/mkl_memory-inl.h"
-#include "../mkl/mkl_relu-inl.h"
-#endif  // MXNET_USE_MKL2017
+#include "../tensor/elemwise_unary_op.h"
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn/mkldnn_base-inl.h"
+#include "./mkldnn/mkldnn_ops-inl.h"
+#endif  // MXNET_USE_MKLDNN
 
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<cpu>(ActivationParam param, int dtype, const TShape& dshape) {
-  Operator *op = NULL;
-#if MXNET_USE_MKL2017 == 1
-  if (param.act_type == activation::kReLU && dshape.ndim() <= 4) {
-      switch (dtype) {
-      case mshadow::kFloat32:
-          return new MKLReluOp<cpu, float>();
-      case mshadow::kFloat64:
-          return new MKLReluOp<cpu, double>();
-      default:
-          break;
-      }
+
+DMLC_REGISTER_PARAMETER(ActivationParam);
+
+// This will determine the order of the inputs for backward computation.
+struct ActivationGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
+    heads.emplace_back(nnvm::NodeEntry{n, activation::kOut, 0});
+#if MXNET_USE_CUDNN == 1
+    heads.push_back(n->inputs[activation::kData]);
+#endif
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
+
+#if MXNET_USE_MKLDNN == 1
+static void ActivationComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                   const OpContext& ctx,
+                                   const std::vector<NDArray>& inputs,
+                                   const std::vector<OpReqType>& req,
+                                   const std::vector<NDArray>& outputs) {
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  if (SupportMKLDNN(inputs[0])) {
+    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    MKLDNNActivationForward(attrs, ctx, inputs[0], req[0], outputs[0]);
+    MKLDNN_OPCHECK_RUN(ActivationCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    return;
   }
-  if (enableMKLWarnGenerated())
-    LOG(INFO) << MKLReluOp<cpu, float>::getName() << " Skip MKL optimization";
+  ActivationComputeImpl<cpu>(param, ctx, inputs[0].data(), req[0], outputs[0].data());
+}
+
+void ActivationGradComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                const OpContext& ctx,
+                                const std::vector<NDArray>& inputs,
+                                const std::vector<OpReqType>& req,
+                                const std::vector<NDArray>& outputs) {
+#if MXNET_USE_CUDNN == 1
+  CHECK_EQ(inputs.size(), 3U);
+#else
+  CHECK_EQ(inputs.size(), 2U);
 #endif
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    switch (param.act_type) {
-      case activation::kReLU:
-        op = new ActivationOp<cpu, mshadow_op::relu, mshadow_op::relu_grad, DType>();
-        break;
-      case activation::kSigmoid:
-        op = new ActivationOp<cpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>();
-        break;
-      case activation::kTanh:
-        op = new ActivationOp<cpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>();
-        break;
-      case activation::kSoftReLU:
-        op = new ActivationOp<cpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>();
-        break;
-      default:
-        LOG(FATAL) << "unknown activation type";
-    }
-  })
-  return op;
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  if (SupportMKLDNN(inputs[0])) {
+    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+    MKLDNNActivationBackward(attrs, ctx, inputs[0], inputs[1], req[0],
+                             outputs[0]);
+      MKLDNN_OPCHECK_RUN(ActivationGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    return;
+  }
+  ActivationGradComputeImpl<cpu>(param, ctx, inputs[0].data(), inputs[1].data(),
+                                 req[0], outputs[0].data());
 }
+#endif
 
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *ActivationProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                           std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_shape)[0]);
+inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs,
+                                         const int dev_mask,
+                                         DispatchMode* dispatch_mode,
+                                         std::vector<int> *in_attrs,
+                                         std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1);
+  CHECK_EQ(out_attrs->size(), 1);
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  bool ret = ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask,
+                                                            dispatch_mode,
+                                                            in_attrs, out_attrs);
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  return ret;
 }
 
-DMLC_REGISTER_PARAMETER(ActivationParam);
+inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs,
+                                          const int dev_mask,
+                                          DispatchMode* dispatch_mode,
+                                          std::vector<int> *in_attrs,
+                                          std::vector<int> *out_attrs) {
+#if MXNET_USE_CUDNN == 1
+  CHECK_EQ(in_attrs->size(), 3U);
+#else
+  CHECK_EQ(in_attrs->size(), 2U);
+#endif
+  CHECK_EQ(out_attrs->size(), 1U);
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+#if MXNET_USE_CUDNN == 1
+  bool ret = ElemwiseStorageType<3, 1, false, false, false>(attrs, dev_mask,
+                                                            dispatch_mode,
+                                                            in_attrs, out_attrs);
+#else
+  bool ret = ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask,
+                                                            dispatch_mode,
+                                                            in_attrs, out_attrs);
+#endif
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  return ret;
+}
 
-MXNET_REGISTER_OP_PROPERTY(Activation, ActivationProp)
+MXNET_OPERATOR_REGISTER_UNARY(Activation)
 .describe(R"code(Applies an activation function element-wise to the input.
 
 The following activation functions are supported:
@@ -90,8 +151,35 @@ The following activation functions are supported:
 - `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))`
 
 )code" ADD_FILELINE)
-.add_argument("data", "NDArray-or-Symbol", "Input array to activation function.")
+.set_attr_parser(ParamParser<ActivationParam>)
+.set_attr<FInferStorageType>("FInferStorageType", ActivationStorageType)
+.set_attr<FCompute>("FCompute<cpu>", ActivationCompute<cpu>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", ActivationComputeExCPU)
+#endif
+.set_attr<nnvm::FGradient>("FGradient", ActivationGrad{"_backward_Activation"})
 .add_arguments(ActivationParam::__FIELDS__());
 
+NNVM_REGISTER_OP(_backward_Activation)
+.set_num_inputs(3)
+.set_num_outputs(1)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FInferStorageType>("FInferStorageType", BackwardActStorageType)
+.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<3, 1>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 1>)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
+  return std::vector<std::pair<int, int> >{{0, 0}};
+})
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
+.set_attr_parser(ParamParser<ActivationParam>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", ActivationGradComputeExCPU)
+#endif
+.set_attr<FCompute>("FCompute<cpu>", ActivationGradCompute<cpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/activation.cu b/src/operator/nn/activation.cu
index c2f6be9f37c8..dc435b2acc17 100644
--- a/src/operator/nn/activation.cu
+++ b/src/operator/nn/activation.cu
@@ -31,39 +31,73 @@
 
 namespace mxnet {
 namespace op {
+
+#if MXNET_USE_CUDNN == 1
+
+template<typename DType>
+static CuDNNActivationOp<DType> &get_cudnn_op(const ActivationParam& param) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local CuDNNActivationOp<DType> cudnn_op;
+#else
+  static MX_THREAD_LOCAL CuDNNActivationOp<DType> cudnn_op;
+#endif
+  cudnn_op.Init(param);
+  return cudnn_op;
+}
+
 template<>
-Operator *CreateOp<gpu>(ActivationParam param, int dtype, const TShape& dshape) {
-  Operator *op = NULL;
+void ActivationCompute<gpu>(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx,
+    const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+
   // SoftReLU not supported by CUDNN yet
   if (param.act_type == activation::kSoftReLU) {
-    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      op = new ActivationOp<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>();
-    })
-    return op;
+    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+      ActivationForward<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(ctx,
+          inputs[0], req[0], outputs[0]);
+    });
+  } else {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+      get_cudnn_op<DType>(param).Forward(ctx, inputs[0], req[0], outputs[0]);
+    });
   }
+}
 
-#if MXNET_USE_CUDNN == 1
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new CuDNNActivationOp<DType>(param);
-  })
-#else
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    switch (param.act_type) {
-      case activation::kReLU:
-        op = new ActivationOp<gpu, mshadow_op::relu, mshadow_op::relu_grad, DType>();
-        break;
-      case activation::kSigmoid:
-        op = new ActivationOp<gpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>();
-        break;
-      case activation::kTanh:
-        op = new ActivationOp<gpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>();
-        break;
-      default:
-        LOG(FATAL) << "unknown activation";
-    }
-  })
-#endif  // MXNET_USE_CUDNN
-  return op;
+template<>
+void ActivationGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx,
+    const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 3U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+
+  // SoftReLU not supported by CUDNN yet
+  if (param.act_type == activation::kSoftReLU) {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+      ActivationBackward<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(
+          ctx, inputs[0], inputs[1], req[0], outputs[0]);
+    });
+  } else {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+      get_cudnn_op<DType>(param).Backward(ctx, inputs[0], inputs[2], inputs[1], req[0], outputs[0]);
+    });
+  }
 }
+#endif
+
+NNVM_REGISTER_OP(Activation)
+.set_attr<FCompute>("FCompute<gpu>", ActivationCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_Activation)
+.set_attr<FCompute>("FCompute<gpu>", ActivationGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h
index 2a9dee2cf845..27e0a8434d77 100644
--- a/src/operator/nn/batch_norm-inl.h
+++ b/src/operator/nn/batch_norm-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2017 by Contributors
  * \file batch_norm-inl.h
  * \brief
- * \author Bing Xu, Chris Olivier
+ * \author Bing Xu, Chris Olivier, Da Zheng
  */
 #ifndef MXNET_OPERATOR_NN_BATCH_NORM_INL_H_
 #define MXNET_OPERATOR_NN_BATCH_NORM_INL_H_
@@ -47,8 +47,10 @@ namespace mxnet {
 namespace op {
 
 namespace batchnorm {
-enum BatchNormOpInputs {kData, kGamma, kBeta};  // kGamma: weights, kBeta: biases
+enum BatchNormOpInputs {kData, kGamma, kBeta, kInMovingMean,
+  kInMovingVar};  // kGamma: weights, kBeta: biases
 enum BatchNormOpOutputs {kOut, kMean, kVar};  // req, out_data
+enum BatchNormOpResource {kTempSpace};
 enum BatchNormOpAuxiliary {kMovingMean, kMovingVar};  // aux_states
 
 /*! \brief Default channel axis if none specified int he params */
@@ -83,280 +85,203 @@ struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
     DMLC_DECLARE_FIELD(cudnn_off).set_default(false)
       .describe("Do not select CUDNN operator, if available");
   }
-};
-
-/*! \brief Batch normalization operator */
-template <typename xpu, typename DType, typename AccReal>
-class BatchNormOp : public Operator {
- public:
-  explicit BatchNormOp(BatchNormParam param) {
-    this->param_ = param;
-  }
-
-  static inline bool IsWriting(const OpReqType ort) {
-    return ort == kWriteTo || ort == kWriteInplace;
-  }
-
-  /*!
-   * \brief perform a forward operation of Operator, save the output to TBlob.
-   * \param ctx runtime context available to this call
-   * \param in_data array of input data, it is const
-   * \param req the request types of saving operation, can only be kWriteTo or kWriteInplace.
-   * \param out_data array of output data, pointer is used to indicate that this is holder
-   *        the space of TBlob in out_data must be pre-allocated with InferShape
-   * \param aux_states Auxiliary states of operator. Normally operator doesn't
-   *        need, epecial case like Batch Norm requires.
-   * \sa OpReqType, OpContext
-   */
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-
-    CHECK_EQ(in_data.size(), 3U);
-    CHECK_EQ(aux_states.size(), 2U);
-    if (ctx.is_train) {
-      CHECK_EQ(out_data.size(), 3U);
-      CHECK_EQ(req.size(), 3U);
-    } else {
-      CHECK_GE(out_data.size(), 1U);
-      CHECK_GE(req.size(), 1U);
-      CHECK_EQ(req[batchnorm::kOut], kWriteTo);
-    }
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    DoForward(s, ctx, in_data, req, out_data, aux_states);
-  }
-
-  /*!
-   * \brief Perform a Backward Operation, write gradient to the in_grad.
-   *
-   * \note
-   * Convention:
-   *   out_grad.size() == OperatorProperty.NumVisibleOutputs()
-   *   out_data.size() == OperatorProperty.NumOutputs()
-   * out_data can contain additional invisible returns that remembers the
-   * state carried from the Forward pass. For example mask in the dropout.
-   * The gradients are passed from visible returns in this function.
-   *
-   * \par
-   * Not all the TBlobs in the arguments will be available
-   * if you override the DeclareBackwardDependency of corresponding OperatorProperty class.
-   * Only the dependencies you declared will be available at corresponding position,
-   * the rest of the parameters are simply dummy where you will get a nullptr.
-   * You will be safe if you use the default DeclareBackwardDependency.
-   * But only declare what you need will give engine more chance for optimization.
-   *
-   * \param ctx runtime context available to this call
-   * \param out_grad the gradient value we get from of the Operator.
-   * \param in_data the array of input data.
-   * \param out_data the array of output data.
-   * \param req request types of the saving operation, can be all types.
-   * \param in_grad the array of gradient we need to write to.
-   * \param aux_states Auxiliary states of operator. Normally operator doesn't need
-   * \sa OperatorProperty, OpReqType, OpContext
-   */
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_states) {
-    CHECK_EQ(out_grad.size(), param_.output_mean_var ? 3U : 1U);
-    CHECK_EQ(in_data.size(), 3U);
-    CHECK_EQ(out_data.size(), 3U);
-    CHECK_EQ(in_grad.size(), 3U);
-    mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-    DoBackward(s, ctx, out_grad, in_data,
-               out_data, req, in_grad, aux_states);
-  }
-
- private:
-  void DoForward(mshadow::Stream<cpu> *stream,
-                 const OpContext &ctx,
-                 const std::vector<TBlob> &in_data,
-                 const std::vector<OpReqType> &req,
-                 const std::vector<TBlob> &out_data,
-                 const std::vector<TBlob> &aux_states);
-
-  void DoBackward(mshadow::Stream<cpu> *stream,
-                  const OpContext &ctx,
-                  const std::vector<TBlob> &out_grad,
-                  const std::vector<TBlob> &in_data,
-                  const std::vector<TBlob> &out_data,
-                  const std::vector<OpReqType> &req,
-                  const std::vector<TBlob> &in_grad,
-                  const std::vector<TBlob> &aux_states);
-
-#if MXNET_USE_CUDA
-  void DoForward(mshadow::Stream<gpu> *stream,
-                 const OpContext &ctx,
-                 const std::vector<TBlob> &in_data,
-                 const std::vector<OpReqType> &req,
-                 const std::vector<TBlob> &out_data,
-                 const std::vector<TBlob> &aux_states);
-  void DoBackward(mshadow::Stream<gpu> *stream,
-                  const OpContext &ctx,
-                  const std::vector<TBlob> &out_grad,
-                  const std::vector<TBlob> &in_data,
-                  const std::vector<TBlob> &out_data,
-                  const std::vector<OpReqType> &req,
-                  const std::vector<TBlob> &in_grad,
-                  const std::vector<TBlob> &aux_states);
-#endif  // MXNET_USE_CUDA
-
-  /*! \brief Batch normalization operator parameters */
-  BatchNormParam param_;
-};  // class BatchNormOp
 
-template<typename xpu>
-Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape);
-
-#if DMLC_USE_CXX11
-class BatchNormProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]";
-    const TShape &dshape = in_shape->at(0);
-
-    const size_t channelAxis = static_cast<size_t>(param_.axis < 0
-                            ? static_cast<int>(dshape.ndim()) + param_.axis
-                            : param_.axis);
-    CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param_.axis;
-
-    const int channelCount = dshape[channelAxis];
-
-    if (dshape.ndim() == 0) {
-      return false;
-    }
-
-    in_shape->at(1) = TShape(Shape1(channelCount));
-    in_shape->at(2) = TShape(Shape1(channelCount));
-
-    out_shape->clear();
-    out_shape->push_back(dshape);                // kOut
-    out_shape->push_back(Shape1(channelCount));  // kMean
-    out_shape->push_back(Shape1(channelCount));  // kVar
-
-    aux_shape->clear();
-    aux_shape->push_back(Shape1(channelCount));  // kMovingMean
-    aux_shape->push_back(Shape1(channelCount));  // kMovingVar
-    return true;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    using namespace mshadow;
-    CHECK_GE(in_type->size(), 1U);
-    const int dtype = (*in_type)[0];
-    CHECK_NE(dtype, -1) << "First input must have specified type";
-    // For float16 input type beta, gamma, mean, and average are stored in float32.
-    // For other input types, these parameters have the same type as input
-    // NOTE: This requirement is from cuDNN (v. 4 and 5)
-    int dtype_param;
-    MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, {
-         dtype_param = mshadow::DataType<AccRealX>::kFlag; });
-    for (index_t i = 1; i < in_type->size(); ++i) {
-      if ((*in_type)[i] == -1) {
-        (*in_type)[i] = dtype_param;
-      } else {
-        UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, ListArguments()[i]);
-      }
-    }
-    for (index_t i = 0; i < aux_type->size(); ++i) {
-      if ((*aux_type)[i] != -1) {
-        UNIFORM_TYPE_CHECK((*aux_type)[i], dtype_param, ListArguments()[i]);
-      }
-    }
-    const size_t n_aux = this->ListAuxiliaryStates().size();
-    aux_type->clear();
-    for (size_t i = 0; i < n_aux; ++i) {
-      aux_type->push_back(dtype_param);
-    }
-    const size_t n_out = this->ListOutputs().size();
-    out_type->clear();
-    out_type->push_back(dtype);
-    for (size_t i = 1; i < n_out; ++i) {
-      out_type->push_back(dtype_param);
-    }
-    return true;
+  bool operator==(const BatchNormParam& other) const {
+    return this->eps == other.eps &&
+           this->momentum == other.momentum &&
+           this->fix_gamma == other.fix_gamma &&
+           this->use_global_stats == other.use_global_stats &&
+           this->output_mean_var == other.output_mean_var &&
+           this->axis == other.axis &&
+           this->cudnn_off == other.cudnn_off;
   }
+};
 
-  OperatorProperty* Copy() const override {
-    auto ptr = new BatchNormProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
+}  // namespace op
+}  // namespace mxnet
 
-  std::string TypeString() const override {
-    return "BatchNorm";
+namespace std {
+template<>
+struct hash<mxnet::op::BatchNormParam> {
+  size_t operator()(const mxnet::op::BatchNormParam& val) {
+    size_t ret = 0;
+    ret = dmlc::HashCombine(ret, val.momentum);
+    ret = dmlc::HashCombine(ret, val.fix_gamma);
+    ret = dmlc::HashCombine(ret, val.use_global_stats);
+    ret = dmlc::HashCombine(ret, val.output_mean_var);
+    ret = dmlc::HashCombine(ret, val.axis);
+    return ret;
   }
+};
+}  // namespace std
 
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[batchnorm::kOut],
-            out_data[batchnorm::kMean],
-            out_data[batchnorm::kVar],
-            in_data[batchnorm::kData],
-            in_data[batchnorm::kGamma]
-           };
-  }
+namespace mxnet {
+namespace op {
 
-  int NumVisibleOutputs() const override {
-    if (param_.output_mean_var) {
-      return 3;
-    }
-    return 1;
-  }
+static inline bool IsBNWriting(const OpReqType ort) {
+  return ort == kWriteTo || ort == kWriteInplace;
+}
 
-  int NumOutputs() const override {
-    return 3;
-  }
+template <typename xpu, typename DType, typename AccReal>
+void BatchNormForwardImpl(mshadow::Stream<cpu> *stream,
+                          const OpContext &ctx, const BatchNormParam& param,
+                          const std::vector<TBlob> &in_data,
+                          const std::vector<OpReqType> &req,
+                          const std::vector<TBlob> &out_data,
+                          const std::vector<TBlob> &aux_states);
 
-  std::vector<std::string> ListArguments() const override {
-    return {"data", "gamma", "beta"};
-  }
+template <typename xpu, typename DType, typename AccReal>
+void BatchNormBackwardImpl(mshadow::Stream<cpu> *stream,
+                           const OpContext &ctx, const BatchNormParam& param,
+                           const std::vector<TBlob> &out_grad,
+                           const std::vector<TBlob> &in_data,
+                           const std::vector<TBlob> &out_data,
+                           const std::vector<OpReqType> &req,
+                           const std::vector<TBlob> &in_grad,
+                           const std::vector<TBlob> &aux_states);
 
-  std::vector<std::string> ListOutputs() const override {
-    return {"output", "mean", "var"};
-  }
+#if MXNET_USE_CUDA
+template <typename xpu, typename DType, typename AccReal>
+void BatchNormForwardImpl(mshadow::Stream<gpu> *stream,
+                          const OpContext &ctx, const BatchNormParam& param,
+                          const std::vector<TBlob> &in_data,
+                          const std::vector<OpReqType> &req,
+                          const std::vector<TBlob> &out_data,
+                          const std::vector<TBlob> &aux_states);
+template <typename xpu, typename DType, typename AccReal>
+void BatchNormBackwardImpl(mshadow::Stream<gpu> *stream,
+                           const OpContext &ctx, const BatchNormParam& param,
+                           const std::vector<TBlob> &out_grad,
+                           const std::vector<TBlob> &in_data,
+                           const std::vector<TBlob> &out_data,
+                           const std::vector<OpReqType> &req,
+                           const std::vector<TBlob> &in_grad,
+                           const std::vector<TBlob> &aux_states);
+#endif  // MXNET_USE_CUDA
 
-  std::vector<std::string> ListAuxiliaryStates() const override {
-    return {"moving_mean", "moving_var"};
-  }
+/*!
+ * \brief perform a forward operation of Operator, save the output to TBlob.
+ * \param ctx runtime context available to this call
+ * \param in_data array of input data, it is const
+ * \param req the request types of saving operation, can only be kWriteTo or kWriteInplace.
+ * \param out_data array of output data, pointer is used to indicate that this is holder
+ *        the space of TBlob in out_data must be pre-allocated with InferShape
+ * \param aux_states Auxiliary states of operator. Normally operator doesn't
+ *        need, epecial case like Batch Norm requires.
+ * \sa OpReqType, OpContext
+ */
+template <typename xpu, typename DType, typename AccReal>
+void BatchNormForward(const OpContext &ctx, const BatchNormParam& param,
+                      const std::vector<TBlob> &in_data,
+                      const std::vector<OpReqType> &req,
+                      const std::vector<TBlob> &out_data,
+                      const std::vector<TBlob> &aux_states) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+
+  CHECK_EQ(in_data.size(), 3U);
+  CHECK_EQ(aux_states.size(), 2U);
+  if (ctx.is_train) {
+    CHECK_EQ(out_data.size(), 3U);
+    CHECK_EQ(req.size(), 3U);
+  } else {
+    CHECK_GE(out_data.size(), 1U);
+    CHECK_GE(req.size(), 1U);
+    CHECK_EQ(req[batchnorm::kOut], kWriteTo);
+  }
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  BatchNormForwardImpl<xpu, DType, AccReal>(s, ctx, param, in_data, req,
+                                            out_data, aux_states);
+}
 
-  Operator* CreateOperator(Context ctx) const override {
-      LOG(FATAL) << "Not Implemented.";
-      return NULL;
-  }
+/*!
+ * \brief Perform a Backward Operation, write gradient to the in_grad.
+ *
+ * \note
+ * Convention:
+ *   out_grad.size() == OperatorProperty.NumVisibleOutputs()
+ *   out_data.size() == OperatorProperty.NumOutputs()
+ * out_data can contain additional invisible returns that remembers the
+ * state carried from the Forward pass. For example mask in the dropout.
+ * The gradients are passed from visible returns in this function.
+ *
+ * \par
+ * Not all the TBlobs in the arguments will be available
+ * if you override the DeclareBackwardDependency of corresponding OperatorProperty class.
+ * Only the dependencies you declared will be available at corresponding position,
+ * the rest of the parameters are simply dummy where you will get a nullptr.
+ * You will be safe if you use the default DeclareBackwardDependency.
+ * But only declare what you need will give engine more chance for optimization.
+ *
+ * \param ctx runtime context available to this call
+ * \param out_grad the gradient value we get from of the Operator.
+ * \param in_data the array of input data.
+ * \param out_data the array of output data.
+ * \param req request types of the saving operation, can be all types.
+ * \param in_grad the array of gradient we need to write to.
+ * \param aux_states Auxiliary states of operator. Normally operator doesn't need
+ * \sa OperatorProperty, OpReqType, OpContext
+ */
+template <typename xpu, typename DType, typename AccReal>
+void BatchNormBackward(const OpContext &ctx, const BatchNormParam& param,
+                       const std::vector<TBlob> &out_grad,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &in_grad,
+                       const std::vector<TBlob> &aux_states) {
+  CHECK_EQ(out_grad.size(), param.output_mean_var ? 3U : 1U);
+  CHECK_EQ(in_data.size(), 3U);
+  CHECK_EQ(out_data.size(), 3U);
+  CHECK_EQ(in_grad.size(), 3U);
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  BatchNormBackwardImpl<xpu, DType, AccReal>(s, ctx, param, out_grad, in_data,
+                                             out_data, req, in_grad, aux_states);
+}
 
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-      std::vector<int> *in_type) const override;
+template<typename xpu>
+void BatchNormCompute(const nnvm::NodeAttrs& attrs,
+                      const OpContext& ctx, const std::vector<TBlob>& inputs,
+                      const std::vector<OpReqType>& req,
+                      const std::vector<TBlob>& outputs) {
+  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 5U);
+  std::vector<TBlob> in_data(inputs.begin(),
+                             inputs.begin() + batchnorm::kInMovingMean);
+  std::vector<TBlob> aux_states(inputs.begin() + batchnorm::kInMovingMean,
+                                inputs.end());
+  MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, {
+    BatchNormForward<xpu, DType, AccReal>(ctx, param, in_data, req, outputs,
+                                          aux_states);
+  });
+}
 
-  inline const BatchNormParam& getParam() const {
-    return param_;
-  }
+template<typename xpu>
+void BatchNormGradCompute(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx, const std::vector<TBlob>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 11U);
+  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+  int num_out_grads = param.output_mean_var ? 3U : 1U;
+  int in_data_start = 3;
+  int aux_states_start = in_data_start + batchnorm::kInMovingMean;
+  int out_data_start = in_data_start + batchnorm::kInMovingVar + 1;
+  std::vector<TBlob> out_grad(inputs.begin(), inputs.begin() + num_out_grads);
+  std::vector<TBlob> in_data(inputs.begin() + in_data_start,
+                             inputs.begin() + aux_states_start);
+  std::vector<TBlob> aux_states(inputs.begin() + aux_states_start,
+                                inputs.begin() + out_data_start);
+  std::vector<TBlob> out_data(inputs.begin() + out_data_start, inputs.end());
+  std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);
+
+  MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, {
+    BatchNormBackward<xpu, DType, AccReal>(ctx, param, out_grad, in_data, out_data, req,
+                                           in_grad, aux_states);
+  });
+}
 
- private:
-  BatchNormParam param_;
-};  // class BatchNormProp
+#if DMLC_USE_CXX11
 
 namespace batchnorm {
 
diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc
index ca2883239488..ba6c413819e4 100644
--- a/src/operator/nn/batch_norm.cc
+++ b/src/operator/nn/batch_norm.cc
@@ -21,16 +21,15 @@
  * Copyright (c) 2015 by Contributors
  * \file batch_norm.cc
  * \brief
- * \author Bing Xu, Chris Olivier
+ * \author Bing Xu, Chris Olivier, Da Zheng
 */
 
 #include "batch_norm-inl.h"
 #include <nnvm/op_attr_types.h>
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "../mkl/mkl_memory-inl.h"
-#include "../mkl/mkl_batch_norm-inl.h"
-#endif  // MXNET_USE_MKL2017
+#include "../elemwise_op_common.h"
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn/mkldnn_batch_norm-inl.h"
+#endif
 
 /*! \brief inverse standard deviation <-> variance */
 #define VARIANCE_TO_INVSTD(__var$,    __eps$)   (1.0/sqrt((__var$) + DType(__eps$)))
@@ -89,12 +88,12 @@ static inline void ForEachFast(const BNTensor3<DType1> &in_data,
 
 /*! \brief Forward CPU */
 template <typename xpu, typename DType, typename AccReal>
-void BatchNormOp<xpu, DType, AccReal>::DoForward(mshadow::Stream<cpu> *,
-                                                 const OpContext &ctx,
-                                                 const std::vector<TBlob> &in_data,
-                                                 const std::vector<OpReqType> &req,
-                                                 const std::vector<TBlob> &out_data,
-                                                 const std::vector<TBlob> &aux_states) {
+void BatchNormForwardImpl(mshadow::Stream<cpu> *,
+                          const OpContext &ctx, const BatchNormParam& param_,
+                          const std::vector<TBlob> &in_data,
+                          const std::vector<OpReqType> &req,
+                          const std::vector<TBlob> &out_data,
+                          const std::vector<TBlob> &aux_states) {
   // Input
   batchnorm::BNTensor3<DType> inputData(in_data[batchnorm::kData], param_.axis);
   const TBlob &weights         = in_data[batchnorm::kGamma];
@@ -164,7 +163,7 @@ void BatchNormOp<xpu, DType, AccReal>::DoForward(mshadow::Stream<cpu> *,
 
     // note that var is still invstd
     if (!param_.fix_gamma) {
-      if (IsWriting(req[batchnorm::kData])) {
+      if (IsBNWriting(req[batchnorm::kData])) {
         ForEachFast(inputData, outputData, channel,
                     [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data,
                                                                  DType *out_data) {
@@ -173,10 +172,10 @@ void BatchNormOp<xpu, DType, AccReal>::DoForward(mshadow::Stream<cpu> *,
                     });
       }
     } else {
-      if (IsWriting(req[batchnorm::kGamma])) {
+      if (IsBNWriting(req[batchnorm::kGamma])) {
         w[channel] = AccReal(1);
       }
-      if (IsWriting(req[batchnorm::kData])) {
+      if (IsBNWriting(req[batchnorm::kData])) {
         ForEachFast(inputData, outputData, channel,
                     [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data,
                                                                  DType *out_data) {
@@ -189,14 +188,14 @@ void BatchNormOp<xpu, DType, AccReal>::DoForward(mshadow::Stream<cpu> *,
 }
 
 template <typename xpu, typename DType, typename AccReal>
-void BatchNormOp<xpu, DType, AccReal>::DoBackward(mshadow::Stream<cpu> *,
-                                                  const OpContext &ctx,
-                                                  const std::vector<TBlob> &out_grad,
-                                                  const std::vector<TBlob> &in_data,
-                                                  const std::vector<TBlob> &out_data,
-                                                  const std::vector<OpReqType> &req,
-                                                  const std::vector<TBlob> &in_grad,
-                                                  const std::vector<TBlob> &aux_states) {
+void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
+                           const OpContext &ctx, const BatchNormParam& param_,
+                           const std::vector<TBlob> &out_grad,
+                           const std::vector<TBlob> &in_data,
+                           const std::vector<TBlob> &out_data,
+                           const std::vector<OpReqType> &req,
+                           const std::vector<TBlob> &in_grad,
+                           const std::vector<TBlob> &aux_states) {
   // Input Data
   batchnorm::BNTensor3<DType> inputData(in_data[batchnorm::kData], param_.axis);
   const TBlob &weights   = in_data[batchnorm::kGamma];
@@ -264,7 +263,7 @@ void BatchNormOp<xpu, DType, AccReal>::DoBackward(mshadow::Stream<cpu> *,
                   dotp += (*thisInputData - mean) * (*gradOut_data);
                 });
 
-    if (!gradIn.IsEmpty() && IsWriting(req[batchnorm::kData])) {  // if there's a grad input
+    if (!gradIn.IsEmpty() && IsBNWriting(req[batchnorm::kData])) {  // if there's a grad input
       if (is_train_and_not_global_stats) {
         // when in training mode
         // Q(X) = X - E[x] ; i.e. input centered to zero mean
@@ -300,7 +299,7 @@ void BatchNormOp<xpu, DType, AccReal>::DoBackward(mshadow::Stream<cpu> *,
     // May want to make this a param eventually
     const AccReal scale = 1.0f;
 
-    if (IsWriting(req[batchnorm::kGamma])) {
+    if (IsBNWriting(req[batchnorm::kGamma])) {
       if (!param_.fix_gamma) {
         gradWeightData[channel] = scale * dotp * invstd;
       } else {
@@ -308,51 +307,185 @@ void BatchNormOp<xpu, DType, AccReal>::DoBackward(mshadow::Stream<cpu> *,
       }
     }
 
-    if (IsWriting(req[batchnorm::kBeta])) {
+    if (IsBNWriting(req[batchnorm::kBeta])) {
       gradBiasData[channel] = scale * sumGradOut;
     }
   }
 }
 
-template<>
-Operator *CreateOp<cpu>(BatchNormParam param, const int dtype, const TShape& shape) {
-  param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis);
-  Operator *op = nullptr;
-#if MXNET_USE_MKL2017 == 1
-  if (shape.ndim() == 4
+DMLC_REGISTER_PARAMETER(BatchNormParam);
+
+static bool BatchNormShape(const nnvm::NodeAttrs& attrs,
+                           std::vector<TShape> *in_shape,
+                           std::vector<TShape> *out_shape) {
+  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+  using namespace mshadow;
+  CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, MovingMean, MovingVar]";
+  const TShape &dshape = in_shape->at(batchnorm::kData);
+
+  const size_t channelAxis = static_cast<size_t>(param.axis < 0
+      ? static_cast<int>(dshape.ndim()) + param.axis
+      : param.axis);
+  CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param.axis;
+
+  const int channelCount = dshape[channelAxis];
+
+  if (dshape.ndim() == 0) {
+    return false;
+  }
+
+  in_shape->at(batchnorm::kGamma) = TShape(Shape1(channelCount));
+  in_shape->at(batchnorm::kBeta) = TShape(Shape1(channelCount));
+  in_shape->at(batchnorm::kInMovingMean) = TShape(Shape1(channelCount));  // kMovingMean
+  in_shape->at(batchnorm::kInMovingVar) = TShape(Shape1(channelCount));  // kMovingVar
+
+  out_shape->clear();
+  out_shape->push_back(dshape);                // kOut
+  out_shape->push_back(Shape1(channelCount));  // kMean
+  out_shape->push_back(Shape1(channelCount));  // kVar
+
+  return true;
+}
+
+static bool BatchNormType(const nnvm::NodeAttrs& attrs,
+                          std::vector<int> *in_type, std::vector<int> *out_type) {
+  using namespace mshadow;
+  CHECK_GE(in_type->size(), 1U);
+  const int dtype = (*in_type)[0];
+  CHECK_NE(dtype, -1) << "First input must have specified type";
+  // For float16 input type beta, gamma, mean, and average are stored in float32.
+  // For other input types, these parameters have the same type as input
+  // NOTE: This requirement is from cuDNN (v. 4 and 5)
+  int dtype_param;
+  MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, {
+      dtype_param = mshadow::DataType<AccRealX>::kFlag; });
+  std::vector<std::string> args{"data", "gamma", "beta", "mean", "var"};
+  CHECK_LE(in_type->size(), args.size());
+  for (index_t i = 1; i < in_type->size(); ++i) {
+    if ((*in_type)[i] == -1) {
+      (*in_type)[i] = dtype_param;
+    } else {
+      UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, args[i]);
+    }
+  }
+  const size_t n_out = 3;
+  out_type->clear();
+  out_type->push_back(dtype);
+  for (size_t i = 1; i < n_out; ++i) {
+    out_type->push_back(dtype_param);
+  }
+  return true;
+}
+
+#if MXNET_USE_MKLDNN == 1
+static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam &param) {
+  TShape shape = input.shape();
+  return SupportMKLDNN(input) && shape.ndim() == 4
       && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS
-      && !mxnet::op::batchnorm::disable_mkl) {
-    switch (dtype) {
-      case mshadow::kFloat32:
-        op = new MKLBatchNormOp<cpu, float>(param);
-        break;
-      case mshadow::kFloat64:
-        op = new MKLBatchNormOp<cpu, double>(param);
-        break;
-      default:
-        // MKL operator doesn't support half_t, so fall through
-        break;
+      && shape[param.axis] % 8 == 0;
+}
+
+void BatchNormComputeExCPU(const nnvm::NodeAttrs &attrs,
+                           const OpContext &ctx,
+                           const std::vector<NDArray> &inputs,
+                           const std::vector<OpReqType> &req,
+                           const std::vector<NDArray> &outputs) {
+  CHECK_EQ(inputs.size(), 5U);
+  const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
+  // MKLDNN batchnorm only works well on the special MKLDNN layout.
+  if (SupportMKLDNNBN(inputs[0], param) && inputs[0].IsMKLDNNData()) {
+    std::vector<NDArray> in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean);
+    std::vector<NDArray> aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end());
+
+    if (inputs[0].dtype() == mshadow::kFloat32) {
+      MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+      MKLDNNBatchNormForward<float>(ctx, param, in_data, req, outputs, aux_states);
+      MKLDNN_OPCHECK_RUN(BatchNormCompute<cpu>, attrs, ctx, inputs, req, outputs);
+      return;
     }
   }
-#endif
-  if (!op) {
-    MSHADOW_REAL_TYPE_SWITCH_EX(dtype,
-                                DType,
-                                AccReal, {
-                                  op = new BatchNormOp<cpu, DType, AccReal>(param); });
+  FallBackCompute(BatchNormCompute<cpu>, attrs, ctx, inputs, req, outputs);
+}
+
+void BatchNormGradComputeExCPU(const nnvm::NodeAttrs &attrs,
+                               const OpContext &ctx,
+                               const std::vector<NDArray> &inputs,
+                               const std::vector<OpReqType> &req,
+                               const std::vector<NDArray> &outputs) {
+  CHECK_EQ(inputs.size(), 11U);
+  const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
+  int num_out_grads = param.output_mean_var ? 3U : 1U;
+  int in_data_start = 3;
+  int aux_states_start = in_data_start + batchnorm::kInMovingMean;
+  int out_data_start = in_data_start + batchnorm::kInMovingVar + 1;
+
+  TShape shape = inputs[0].shape();
+  // MKLDNN batchnorm only works well on the special MKLDNN layout.
+  if (SupportMKLDNNBN(inputs[0], param)
+      && (inputs[in_data_start].IsMKLDNNData() || inputs[0].IsMKLDNNData())) {
+    std::vector<NDArray> out_grad(inputs.begin(), inputs.begin() + num_out_grads);
+    std::vector<NDArray> in_data(inputs.begin() + in_data_start,
+                                 inputs.begin() + aux_states_start);
+    std::vector<NDArray> aux_states(inputs.begin() + aux_states_start,
+                                    inputs.begin() + out_data_start);
+    std::vector<NDArray> out_data(inputs.begin() + out_data_start, inputs.end());
+    std::vector<NDArray> in_grad(outputs.begin(), outputs.begin() + 3);
+
+    if (inputs[0].dtype() == mshadow::kFloat32) {
+      MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+      MKLDNNBatchNormBackward<float>(ctx, param, out_grad, in_data,
+                                     out_data, req, in_grad, aux_states);
+      MKLDNN_OPCHECK_RUN(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+      return;
+    }
   }
-  return op;
+  FallBackCompute(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
 }
+#endif
 
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *BatchNormProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                          std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_shape)[0]);
+static inline bool BatchNormStorageType(const nnvm::NodeAttrs &attrs,
+                                        const int dev_mask,
+                                        DispatchMode *dispatch_mode,
+                                        std::vector<int> *in_attrs,
+                                        std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 5);
+  CHECK_EQ(out_attrs->size(), 3);
+  DispatchMode wanted_mode;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask)
+    wanted_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    wanted_mode = DispatchMode::kFCompute;
+  for (int& v : *in_attrs) {
+    if (v == - 1) v = kDefaultStorage;
+  }
+  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                             dispatch_mode, wanted_mode);
 }
 
-DMLC_REGISTER_PARAMETER(BatchNormParam);
+static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs,
+                                                 const int dev_mask,
+                                                 DispatchMode *dispatch_mode,
+                                                 std::vector<int> *in_attrs,
+                                                 std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 11);
+  CHECK_EQ(out_attrs->size(), 5);
+  DispatchMode wanted_mode;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask)
+    wanted_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    wanted_mode = DispatchMode::kFCompute;
+  for (int& v : *in_attrs) {
+    if (v == - 1) v = kDefaultStorage;
+  }
+  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                             dispatch_mode, wanted_mode);
+}
 
-MXNET_REGISTER_OP_PROPERTY(BatchNorm, BatchNormProp)
+NNVM_REGISTER_OP(BatchNorm)
 .describe(R"code(Batch normalization.
 
 Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as
@@ -398,14 +531,44 @@ Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is tr
 then set ``gamma`` to 1 and its gradient to 0.
 
 )code" ADD_FILELINE)
+.set_num_inputs(5)
+.set_num_outputs(3)
+.set_attr_parser(ParamParser<BatchNormParam>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"data", "gamma", "beta", "moving_mean", "moving_var"};
+})
+.set_attr<nnvm::FListOutputNames>("FListOutputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"output", "mean", "var"};
+})
+.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
+    [](const NodeAttrs& attrs) {
+  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+  return param.output_mean_var ? 3 : 1;
+})
+.set_attr<nnvm::FMutateInputs>("FMutateInputs", [](const nnvm::NodeAttrs& attrs) {
+  return std::vector<uint32_t>{3, 4};
+})
+.set_attr<nnvm::FInferShape>("FInferShape", BatchNormShape)
+.set_attr<nnvm::FInferType>("FInferType", BatchNormType)
+.set_attr<FInferStorageType>("FInferStorageType", BatchNormStorageType)
+.set_attr<FCompute>("FCompute<cpu>", BatchNormCompute<cpu>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormComputeExCPU)
+#endif
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseInOut{"_backward_BatchNorm"})
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
 .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization")
 .add_argument("gamma", "NDArray-or-Symbol", "gamma array")
 .add_argument("beta", "NDArray-or-Symbol", "beta array")
 .add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input")
 .add_argument("moving_var", "NDArray-or-Symbol", "running variance of input")
-.add_arguments(BatchNormParam::__FIELDS__());
-
-NNVM_REGISTER_OP(BatchNorm)
+.add_arguments(BatchNormParam::__FIELDS__())
 .set_attr<nnvm::FSetInputVarAttrOnCompose>(
   "FSetInputVarAttrOnCompose",
   [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) {
@@ -417,5 +580,20 @@ NNVM_REGISTER_OP(BatchNorm)
     }
   });
 
+NNVM_REGISTER_OP(_backward_BatchNorm)
+.set_num_outputs(5)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FInferStorageType>("FInferStorageType", backward_BatchNormStorageType)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
+.set_attr_parser(ParamParser<BatchNormParam>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormGradComputeExCPU)
+#endif
+.set_attr<FCompute>("FCompute<cpu>", BatchNormGradCompute<cpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/batch_norm.cu b/src/operator/nn/batch_norm.cu
index 59317b7fa837..80c15976b65f 100644
--- a/src/operator/nn/batch_norm.cu
+++ b/src/operator/nn/batch_norm.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2017 by Contributors
  * \file batch_norm.cu
  * \brief CUDA Batch Normalization code
- * \author Chris Olivier, Bing Xu
+ * \author Chris Olivier, Bing Xu, Da Zheng
  * Adapted from Torch
 */
 #include <cuda_runtime_api.h>
@@ -579,13 +579,13 @@ static inline uint32_t SetupFlags(const OpContext &ctx,
   flags |= ctx.is_train ? IS_TRAINING_FLAG : 0;
   flags |= params.fix_gamma ? FIX_GAMMA_FLAG : 0;
   flags |= params.use_global_stats ? USE_GLOBAL_STATS_FLAG : 0;
-  if (BatchNormOp<xpu, DType, AccReal>::IsWriting(req[batchnorm::kData])) {
+  if (IsBNWriting(req[batchnorm::kData])) {
     flags |= WRITE_DATA_FLAG;
   }
-  if (BatchNormOp<xpu, DType, AccReal>::IsWriting(req[batchnorm::kGamma])) {
+  if (IsBNWriting(req[batchnorm::kGamma])) {
     flags |= WRITE_GAMMA_FLAG;
   }
-  if (BatchNormOp<xpu, DType, AccReal>::IsWriting(req[batchnorm::kBeta])) {
+  if (IsBNWriting(req[batchnorm::kBeta])) {
     flags |= WRITE_BETA_FLAG;
   }
   return flags;
@@ -593,12 +593,12 @@ static inline uint32_t SetupFlags(const OpContext &ctx,
 
 /*! \brief Forward batch-norm pass on GPU */
 template<typename xpu, typename DType, typename AccReal>
-void BatchNormOp<xpu, DType, AccReal>::DoForward(mshadow::Stream<gpu> *stream,
-                                                 const OpContext &ctx,
-                                                 const std::vector<TBlob> &in_data,
-                                                 const std::vector<OpReqType> &req,
-                                                 const std::vector<TBlob> &out_data,
-                                                 const std::vector<TBlob> &aux_states) {
+void BatchNormForwardImpl(mshadow::Stream<gpu> *stream,
+                          const OpContext &ctx, const BatchNormParam& param_,
+                          const std::vector<TBlob> &in_data,
+                          const std::vector<OpReqType> &req,
+                          const std::vector<TBlob> &out_data,
+                          const std::vector<TBlob> &aux_states) {
   batchnorm::cuda::BatchNormalizationUpdateOutput<DType, AccReal>(
     stream,
     ctx,
@@ -614,14 +614,14 @@ void BatchNormOp<xpu, DType, AccReal>::DoForward(mshadow::Stream<gpu> *stream,
 
 /*! \brief Backward batch-norm pass on GPU */
 template<typename xpu, typename DType, typename AccReal>
-void BatchNormOp<xpu, DType, AccReal>::DoBackward(mshadow::Stream<gpu> *stream,
-                                                  const OpContext &ctx,
-                                                  const std::vector<TBlob> &out_grad,
-                                                  const std::vector<TBlob> &in_data,
-                                                  const std::vector<TBlob> &out_data,
-                                                  const std::vector<OpReqType> &req,
-                                                  const std::vector<TBlob> &in_grad,
-                                                  const std::vector<TBlob> &aux_states) {
+void BatchNormBackwardImpl(mshadow::Stream<gpu> *stream,
+                           const OpContext &ctx, const BatchNormParam& param_,
+                           const std::vector<TBlob> &out_grad,
+                           const std::vector<TBlob> &in_data,
+                           const std::vector<TBlob> &out_data,
+                           const std::vector<OpReqType> &req,
+                           const std::vector<TBlob> &in_grad,
+                           const std::vector<TBlob> &aux_states) {
   batchnorm::cuda::BatchNormalizationBackward<DType, AccReal>(
     stream,
     ctx,
@@ -637,30 +637,92 @@ void BatchNormOp<xpu, DType, AccReal>::DoBackward(mshadow::Stream<gpu> *stream,
   MSHADOW_CUDA_POST_KERNEL_CHECK(BatchNormOp_DoBackward_gpu);
 }
 
-/*! \brief Create GPU operator for batch normalization */
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4
+template<typename DType>
+static CuDNNBatchNormOp<DType> &GetCuDNNOp(const BatchNormParam& param) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local CuDNNBatchNormOp<DType> op;
+#else
+  static MX_THREAD_LOCAL CuDNNBatchNormOp<DType> op;
+#endif
+  op.Init(param);
+  return op;
+}
+#endif
+
 template<>
-Operator *CreateOp<gpu>(BatchNormParam param, const int dtype, const TShape& shape) {
+void BatchNormCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx, const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+  BatchNormParam param = nnvm::get<BatchNormParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 5U);
+  std::vector<TBlob> in_data(inputs.begin(), inputs.begin() + 3);
+  std::vector<TBlob> aux_states(inputs.begin() + 3, inputs.end());
+  int dtype = inputs[0].type_flag_;
+  TShape shape = inputs[0].shape_;
+
   param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis);
-  Operator *op = NULL;
 #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5
   if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4
       && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) {
     MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      op = new CuDNNBatchNormOp<DType>(param);
+      GetCuDNNOp<DType>(param).Forward(ctx, in_data, req, outputs, aux_states);
     })
   } else {
     MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, {
-      op = new BatchNormOp<gpu, DType, AccReal>(param);
+      BatchNormForward<gpu, DType, AccReal>(ctx, param, in_data, req, outputs, aux_states);
     })
   }
 #else
-  MSHADOW_REAL_TYPE_SWITCH_EX(dtype,
-                              DType,
-                              AccReal,
-                              { op = new BatchNormOp<gpu, DType, AccReal>(param); });
+  MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, {
+    BatchNormForward<gpu, DType, AccReal>(ctx, param, in_data, req, outputs, aux_states);
+  });
+#endif
+}
+
+template<>
+void BatchNormGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                               const OpContext& ctx, const std::vector<TBlob>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 11U);
+  BatchNormParam param = nnvm::get<BatchNormParam>(attrs.parsed);
+  std::vector<TBlob> out_grad(1, inputs[0]);
+  std::vector<TBlob> in_data(inputs.begin() + 3, inputs.begin() + 6);
+  std::vector<TBlob> aux_states(inputs.begin() + 6, inputs.begin() + 8);
+  std::vector<TBlob> out_data(inputs.begin() + 8, inputs.end());
+  std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);
+  int dtype = inputs[0].type_flag_;
+  TShape shape = inputs[0].shape_;
+
+  param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis);
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5
+  if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4
+      && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) {
+    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+      GetCuDNNOp<DType>(param).Backward(ctx, out_grad, in_data, out_data,
+        req, in_grad, aux_states);
+    })
+  } else {
+    MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, {
+      BatchNormBackward<gpu, DType, AccReal>(ctx, param, out_grad,
+          in_data, out_data, req, in_grad, aux_states);
+    })
+  }
+#else
+  MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, {
+    BatchNormBackward<gpu, DType, AccReal>(ctx, param, out_grad,
+        in_data, out_data, req, in_grad, aux_states);
+  });
 #endif
-  return op;
 }
 
+NNVM_REGISTER_OP(BatchNorm)
+.set_attr<FCompute>("FCompute<gpu>", BatchNormCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_BatchNorm)
+.set_attr<FCompute>("FCompute<gpu>", BatchNormGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/concat-inl.h b/src/operator/nn/concat-inl.h
new file mode 100644
index 000000000000..a7f1fa85f612
--- /dev/null
+++ b/src/operator/nn/concat-inl.h
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file concat-inl.h
+ * \brief
+ * \author Bing Xu
+*/
+#ifndef MXNET_OPERATOR_NN_CONCAT_INL_H_
+#define MXNET_OPERATOR_NN_CONCAT_INL_H_
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "../operator_common.h"
+#include "../channel_op_common.h"
+#include "../tensor/broadcast_reduce_op.h"
+
+namespace mxnet {
+namespace op {
+
+namespace concat_enum {
+enum ConcatOpInputs {kData0, kData1, kData2, kData3, kData4};
+enum ConcatOpResource {kTempSpace};
+enum ConcatOpOutputs {kOut};
+}  // namespace concat_enum
+
+struct ConcatParam : public dmlc::Parameter<ConcatParam> {
+  int num_args;
+  int dim;
+  DMLC_DECLARE_PARAMETER(ConcatParam) {
+    DMLC_DECLARE_FIELD(num_args).set_lower_bound(1)
+    .describe("Number of inputs to be concated.");
+    DMLC_DECLARE_FIELD(dim).set_default(1)
+    .describe("the dimension to be concated.");
+  }
+};  // struct ConcatParam
+
+template<typename xpu, typename DType>
+class ConcatOp {
+ public:
+  void Init(const ConcatParam &param) {
+    this->size_ = param.num_args;
+    this->dimension_ = param.dim;
+  }
+
+  void Forward(const OpContext &ctx,
+               const std::vector<TBlob> &in_data,
+               const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &out_data) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(static_cast<int>(in_data.size()), size_);
+    CHECK_EQ(out_data.size(), 1U);
+    int axis = CheckAxis(dimension_, in_data[concat_enum::kData0].ndim());
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    std::vector<Tensor<xpu, 3, DType> > data(size_);
+    Tensor<xpu, 3, DType> out;
+    size_t leading = 1, trailing = 1;
+    for (int i = 0; i < axis; ++i) {
+      leading *= out_data[concat_enum::kOut].shape_[i];
+    }
+    for (int i = axis + 1; i < out_data[concat_enum::kOut].ndim(); ++i) {
+      trailing *= out_data[concat_enum::kOut].shape_[i];
+    }
+    size_t mid = out_data[concat_enum::kOut].shape_[axis];
+    Shape<3> oshape = Shape3(leading, mid, trailing);
+    out = out_data[concat_enum::kOut].get_with_shape<xpu, 3, DType>(oshape, s);
+
+    for (int i = 0; i < size_; ++i) {
+      Shape<3> dshape = Shape3(leading, in_data[i].shape_[axis], trailing);
+      data[i] = in_data[i].get_with_shape<xpu, 3, DType>(dshape, s);
+    }
+    Concatenate(data, &out, 1, req[concat_enum::kOut]);
+  }
+
+  void Backward(const OpContext &ctx, const TBlob &out_grad,
+                const std::vector<OpReqType> &req,
+                const std::vector<TBlob> &in_grad) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_grad.size(), static_cast<size_t>(size_));
+    int axis = CheckAxis(dimension_, out_grad.ndim());
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    std::vector<Tensor<xpu, 3, DType> > grad_in(size_);
+    Tensor<xpu, 3, DType> grad;
+    size_t leading = 1, trailing = 1;
+    for (int i = 0; i < axis; ++i) {
+      leading *= out_grad.shape_[i];
+    }
+    for (int i = axis + 1; i < out_grad.ndim(); ++i) {
+      trailing *= out_grad.shape_[i];
+    }
+    size_t mid = out_grad.shape_[axis];
+    Shape<3> oshape = Shape3(leading, mid, trailing);
+    grad = out_grad.get_with_shape<xpu, 3, DType>(oshape, s);
+
+    for (int i = 0; i < size_; ++i) {
+      Shape<3> dshape = Shape3(leading, in_grad[i].shape_[axis], trailing);
+      grad_in[i] = in_grad[i].get_with_shape<xpu, 3, DType>(dshape, s);
+    }
+    Split(grad, &grad_in, 1, req);
+  }
+
+ private:
+  int size_;
+  int dimension_;
+};  // class ConcatOp
+
+template<typename xpu>
+void ConcatCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
+                   const std::vector<TBlob>& inputs,
+                   const std::vector<OpReqType>& req,
+                   const std::vector<TBlob>& outputs) {
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  MSHADOW_TYPE_SWITCH(inputs[concat_enum::kData0].type_flag_, DType, {
+    ConcatOp<xpu, DType> op;
+    op.Init(param);
+    op.Forward(ctx, inputs, req, outputs);
+  });
+}
+
+template<typename xpu>
+void ConcatGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
+                       const std::vector<TBlob>& inputs,
+                       const std::vector<OpReqType>& req,
+                       const std::vector<TBlob>& outputs) {
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  MSHADOW_TYPE_SWITCH(inputs[concat_enum::kOut].type_flag_, DType, {
+    ConcatOp<xpu, DType> op;
+    op.Init(param);
+    op.Backward(ctx, inputs[concat_enum::kOut], req, outputs);
+  });
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_NN_CONCAT_INL_H_
diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc
new file mode 100644
index 000000000000..81dc95f1a5a5
--- /dev/null
+++ b/src/operator/nn/concat.cc
@@ -0,0 +1,289 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file concat.cc
+ * \brief
+ * \author Bing Xu
+*/
+
+#include "./concat-inl.h"
+#include "./mkldnn/mkldnn_ops-inl.h"
+#include "./mkldnn/mkldnn_base-inl.h"
+#include "../../common/utils.h"
+
+namespace mxnet {
+namespace op {
+
+static bool ConcatShape(const nnvm::NodeAttrs& attrs,
+                        std::vector<TShape> *in_shape,
+                        std::vector<TShape> *out_shape) {
+  using namespace mshadow;
+  const ConcatParam& param_ = nnvm::get<ConcatParam>(attrs.parsed);
+  CHECK_EQ(in_shape->size(), static_cast<size_t>(param_.num_args));
+  TShape dshape;
+  index_t size = 0;
+  bool has_zero = false;
+  int axis = -1;
+  for (int i = 0; i < param_.num_args; ++i) {
+    TShape tmp = (*in_shape)[i];
+    if (tmp.ndim()) {
+      axis = CheckAxis(param_.dim, tmp.ndim());
+      has_zero = tmp[axis] == 0 || has_zero;
+      size += tmp[axis];
+      tmp[axis] = 0;
+      shape_assign(&dshape, tmp);
+    }
+  }
+
+  TShape tmp = (*out_shape)[0];
+  if (tmp.ndim()) {
+    axis = CheckAxis(param_.dim, tmp.ndim());
+    tmp[axis] = 0;
+    shape_assign(&dshape, tmp);
+  }
+
+  if (dshape.ndim() == 0) return false;
+
+  for (int i = 0; i < param_.num_args; ++i) {
+    CHECK(shape_assign(&(*in_shape)[i], dshape))
+        << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i];
+  }
+
+  if (!has_zero) dshape[axis] = size;
+  CHECK(shape_assign(&(*out_shape)[0], dshape))
+      << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0];
+
+  return dshape.Size() != 0;
+}
+
+static bool ConcatType(const nnvm::NodeAttrs& attrs,
+                       std::vector<int> *in_type,
+                       std::vector<int> *out_type) {
+  const ConcatParam& param_ = nnvm::get<ConcatParam>(attrs.parsed);
+  int dtype = -1;
+
+  for (size_t i = 0; i < in_type->size(); ++i) {
+    if (dtype == -1) {
+      dtype = in_type->at(i);
+    } else {
+      CHECK(in_type->at(i) == dtype ||
+            in_type->at(i) == -1) <<
+          "Non-uniform data type in Concat";
+    }
+  }
+
+  if (dtype == -1) {
+    LOG(FATAL) << "Not enough information to infer type in Concat.";
+    return false;
+  }
+
+  size_t nin = param_.num_args;
+  in_type->clear();
+  for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype);
+
+  out_type->clear();
+  out_type->push_back(dtype);
+
+  return true;
+}
+
+inline static bool ConcatForwardInferStorageType(const nnvm::NodeAttrs& attrs,
+                                                 const int dev_mask,
+                                                 DispatchMode* dispatch_mode,
+                                                 std::vector<int> *in_attrs,
+                                                 std::vector<int> *out_attrs) {
+  CHECK(!in_attrs->empty());
+  CHECK_EQ(out_attrs->size(), 1U);
+  DispatchMode wanted_mode;
+#if MXNET_USE_MKLDNN == 1
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  if (dev_mask == mshadow::cpu::kDevMask
+      && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)
+      && param.dim > 0)
+    wanted_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    wanted_mode = DispatchMode::kFCompute;
+  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                             dispatch_mode, wanted_mode);
+}
+
+inline static bool BackwardConcatStorageType(const nnvm::NodeAttrs& attrs,
+                                             const int dev_mask,
+                                             DispatchMode* dispatch_mode,
+                                             std::vector<int> *in_attrs,
+                                             std::vector<int> *out_attrs) {
+  DispatchMode wanted_mode;
+#if MXNET_USE_MKLDNN == 1
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  CHECK_EQ(out_attrs->size(), in_attrs->size() - 1);
+  if (dev_mask == mshadow::cpu::kDevMask
+      && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)
+      && param.dim > 0)
+    wanted_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    wanted_mode = DispatchMode::kFCompute;
+  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                             dispatch_mode, wanted_mode);
+}
+
+#if MXNET_USE_MKLDNN == 1
+static void ConcatComputeExCPU(const nnvm::NodeAttrs& attrs,
+                               const OpContext& op_ctx,
+                               const std::vector<NDArray>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<NDArray>& outputs) {
+  CHECK(!inputs.empty());
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  if (req[0] == kNullOp) return;
+  // MKLDNN support 2D and 4D concat
+  if ((inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4)
+      && inputs[0].dtype() == mshadow::kFloat32) {
+    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    MKLDNNConcatForward(attrs, op_ctx, inputs, req, outputs);
+    MKLDNN_OPCHECK_RUN(ConcatCompute<cpu>, attrs, op_ctx, inputs, req, outputs);
+    return;
+  }
+  FallBackCompute(ConcatCompute<cpu>, attrs, op_ctx, inputs, req, outputs);
+}
+
+static void ConcatGradComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                   const OpContext& ctx,
+                                   const std::vector<NDArray>& inputs,
+                                   const std::vector<OpReqType>& req,
+                                   const std::vector<NDArray>& outputs) {
+  if ((inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4)
+      && inputs[0].dtype() == mshadow::kFloat32) {
+    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+    MKLDNNConcatBackward(attrs, ctx, inputs, req, outputs);
+    MKLDNN_OPCHECK_RUN(ConcatGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    return;
+  }
+  FallBackCompute(ConcatGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+}
+#endif
+
+struct ConcatGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    CHECK_EQ(ograds.size(), 1);
+    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
+#if MXNET_USE_MKLDNN == 1
+    for (size_t i = 0; i < n->inputs.size(); i++) {
+      heads.push_back(n->inputs[i]);
+    }
+#endif
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
+
+DMLC_REGISTER_PARAMETER(ConcatParam);
+
+NNVM_REGISTER_OP(Concat)
+.describe(R"code(Joins input arrays along a given axis.
+
+.. note:: `Concat` is deprecated. Use `concat` instead.
+
+The dimensions of the input arrays should be the same except the axis along
+which they will be concatenated.
+The dimension of the output array along the concatenated axis will be equal
+to the sum of the corresponding dimensions of the input arrays.
+
+Example::
+
+   x = [[1,1],[2,2]]
+   y = [[3,3],[4,4],[5,5]]
+   z = [[6,6], [7,7],[8,8]]
+
+   concat(x,y,z,dim=0) = [[ 1.,  1.],
+                          [ 2.,  2.],
+                          [ 3.,  3.],
+                          [ 4.,  4.],
+                          [ 5.,  5.],
+                          [ 6.,  6.],
+                          [ 7.,  7.],
+                          [ 8.,  8.]]
+
+   Note that you cannot concat x,y,z along dimension 1 since dimension
+   0 is not the same for all the input arrays.
+
+   concat(y,z,dim=1) = [[ 3.,  3.,  6.,  6.],
+                         [ 4.,  4.,  7.,  7.],
+                         [ 5.,  5.,  8.,  8.]]
+
+)code" ADD_FILELINE)
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const ConcatParam& params = nnvm::get<ConcatParam>(attrs.parsed);
+  return params.num_args;
+})
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<ConcatParam>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  const ConcatParam& params = nnvm::get<ConcatParam>(attrs.parsed);
+  std::vector<std::string> ret;
+  for (int i = 0; i < params.num_args; ++i) {
+    ret.push_back(std::string("arg") + std::to_string(i));
+  }
+  return ret;
+})
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
+.set_attr<nnvm::FInferShape>("FInferShape", ConcatShape)
+.set_attr<nnvm::FInferType>("FInferType", ConcatType)
+.set_attr<FInferStorageType>("FInferStorageType", ConcatForwardInferStorageType)
+.set_attr<FCompute>("FCompute<cpu>", ConcatCompute<cpu>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", ConcatComputeExCPU)
+#endif
+.set_attr<nnvm::FGradient>("FGradient", ConcatGrad{"_backward_Concat"})
+.set_attr<std::string>("key_var_num_args", "num_args")
+.add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate")
+.add_arguments(ConcatParam::__FIELDS__());
+
+NNVM_REGISTER_OP(Concat).add_alias("concat");
+
+NNVM_REGISTER_OP(_backward_Concat)
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const ConcatParam& params = nnvm::get<ConcatParam>(attrs.parsed);
+  return params.num_args;
+})
+.set_attr_parser(ParamParser<ConcatParam>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FInferStorageType>("FInferStorageType", BackwardConcatStorageType)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", ConcatGradComputeExCPU)
+#endif
+.set_attr<FCompute>("FCompute<cpu>", ConcatGradCompute<cpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/concat.cu b/src/operator/nn/concat.cu
similarity index 81%
rename from src/operator/concat.cu
rename to src/operator/nn/concat.cu
index 394fa736ee84..f6bf5ece5c78 100644
--- a/src/operator/concat.cu
+++ b/src/operator/nn/concat.cu
@@ -28,14 +28,12 @@
 
 namespace mxnet {
 namespace op {
-template<>
-Operator* CreateOp<gpu>(ConcatParam param, int dtype, std::vector<TShape> *in_shape) {
-  Operator *op = NULL;
-  MSHADOW_TYPE_SWITCH(dtype, DType, {
-    op = new ConcatOp<gpu, DType>(param);
-  });
-  return op;
-}
+
+NNVM_REGISTER_OP(Concat)
+.set_attr<FCompute>("FCompute<gpu>", ConcatCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_Concat)
+.set_attr<FCompute>("FCompute<gpu>", ConcatGradCompute<gpu>);
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h
index 1613da6c85d1..6204f75c4697 100644
--- a/src/operator/nn/convolution-inl.h
+++ b/src/operator/nn/convolution-inl.h
@@ -22,7 +22,7 @@
  * \file convolution-inl.h
  * \brief
  * \ref: https://github.com/Yangqing/caffe/wiki/Convolution-in-Caffe:-a-memo
- * \author Bing Xu, Jun Wu
+ * \author Bing Xu, Jun Wu, Da Zheng
 */
 #ifndef MXNET_OPERATOR_NN_CONVOLUTION_INL_H_
 #define MXNET_OPERATOR_NN_CONVOLUTION_INL_H_
@@ -148,9 +148,9 @@ namespace mxnet {
 namespace op {
 
 template<typename xpu, typename DType>
-class ConvolutionOp : public Operator {
+class ConvolutionOp {
  public:
-  explicit ConvolutionOp(ConvolutionParam p) {
+  void Init(ConvolutionParam p) {
     this->param_ = p;
     // convert MBytes first to Bytes and then to elements.
     param_.workspace = (param_.workspace << 20) / sizeof(DType);
@@ -160,11 +160,10 @@ class ConvolutionOp : public Operator {
       << "Only support NCW, NCHW and NCDHW layout";
   }
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
+  void Forward(const OpContext &ctx,
+               const std::vector<TBlob> &in_data,
+               const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &out_data) {
     using namespace mshadow;
     using namespace mshadow::expr;
     CHECK_EQ(req[conv::kOut], kWriteTo);
@@ -233,18 +232,19 @@ class ConvolutionOp : public Operator {
     }
   }
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob>& out_grad,
-                        const std::vector<TBlob>& in_data,
-                        const std::vector<TBlob>& out_data,
-                        const std::vector<OpReqType>& req,
-                        const std::vector<TBlob>& in_grad,
-                        const std::vector<TBlob>& aux_args) {
+  void Backward(const OpContext &ctx,
+                const std::vector<TBlob>& out_grad,
+                const std::vector<TBlob>& in_data,
+                const std::vector<OpReqType>& req,
+                const std::vector<TBlob>& in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
     CHECK_EQ(out_grad.size(), 1U);
+    // We expect 2 inputs: in data and weight. We don't need bias for
+    // computing gradient.
     size_t expected = param_.no_bias == 0 ? 3 : 2;
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(in_data.size(), expected);
+    CHECK_EQ(in_grad.size(), expected);
     CHECK_EQ(req.size(), expected);
     CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true);
     LayerSetUp(in_grad[conv::kData].shape_, out_grad[conv::kOut].shape_);
@@ -386,299 +386,35 @@ class ConvolutionOp : public Operator {
 };  // class ConvolutionOp
 
 template<typename xpu>
-Operator* CreateOp(ConvolutionParam param, int dtype,
-                   std::vector<TShape> *in_shape,
-                   std::vector<TShape> *out_shape,
-                   Context ctx);
-
-#if DMLC_USE_CXX11
-class ConvolutionProp : public OperatorProperty {
- public:
-  std::vector<std::string> ListArguments() const override {
-    if (!param_.no_bias) {
-      return {"data", "weight", "bias"};
-    } else {
-      return {"data", "weight"};
-    }
-  }
-
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    using namespace mshadow;
-    param_.Init(kwargs);
-    if (param_.kernel.ndim() == 1) {
-      param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW;
-      if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
-      if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
-    } else if (param_.kernel.ndim() == 2) {
-      param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW;
-      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
-      if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
-    } else {
-      CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D convolution not supported";
-      param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW;
-      if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
-      if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
-    }
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    if (!param_.no_bias) {
-      CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
-    } else {
-      CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
-    }
-    // CHECK_EQ(out_shape->size(), 1) << "Output: [output]";
-    out_shape->resize(1, TShape());
-    const TShape &dshp = (*in_shape)[conv::kData];
-    if (dshp.ndim() ==  0) return false;
-
-    if (param_.kernel.ndim() == 1) {
-      // 1d conv
-      CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x";
-      Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW);
-      Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group,
-                               param_.kernel[0]);
-      wshape = ConvertLayout(wshape, kNCW, param_.layout.value());
-      wshape[0] *= param_.num_group;
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
-      if (!param_.no_bias) {
-        SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
-      }
-
-      const index_t dilated_ksize_x = param_.DilatedKernelSize(0);
-      CHECK_EQ(dshape[1] % param_.num_group, 0U) \
-          << "input num_filter must divide group size";
-      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
-          << "output num_filter must divide group size";
-      CHECK_GT(param_.kernel.Size(), 0U) \
-          << "incorrect kernel size: " << param_.kernel;
-      CHECK_GT(param_.stride.Size(), 0U) \
-          << "incorrect stride size: " << param_.stride;
-      CHECK_GT(param_.dilate.Size(), 0U) \
-          << "incorrect dilate size: " << param_.dilate;
-      Shape<3> oshape;
-      oshape[0] = dshape[0];
-      oshape[1] = param_.num_filter;
-      oshape[2] = dshape[2] ?
-          (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0;
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value()));
-      // Perform incomplete shape inference. Fill in the missing values in data shape.
-      // 1) We can always fill in the batch_size.
-      // 2) We can back-calculate the input height/width if the corresponding stride is 1.
-      oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW);
-      dshape[0] = oshape[0];
-      if (oshape[2] && param_.stride[0] == 1) {
-        dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0];
-      }
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
-                          ConvertLayout(dshape, kNCW, param_.layout.value()));
-      // Check whether the kernel sizes are valid
-      if (dshape[2] != 0) {
-        CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
-      }
-      return true;
-    } else if (param_.kernel.ndim() == 2) {
-      // 2d conv
-      CHECK_EQ(dshp.ndim(), 4U) \
-          << "Input data should be 4D in batch-num_filter-y-x";
-      Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW);
-      Shape<4> wshape = Shape4(param_.num_filter / param_.num_group,
-                               dshape[1] / param_.num_group,
-                               param_.kernel[0], param_.kernel[1]);
-      wshape = ConvertLayout(wshape, kNCHW, param_.layout.value());
-      wshape[0] *= param_.num_group;
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
-      if (!param_.no_bias) {
-        SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
-      }
-
-      const index_t dilated_ksize_y = param_.DilatedKernelSize(0);
-      const index_t dilated_ksize_x = param_.DilatedKernelSize(1);
-      CHECK_EQ(dshape[1] % param_.num_group, 0U) \
-          << "input num_filter must divide group size";
-      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
-          << "output num_filter must divide group size";
-      CHECK_GT(param_.kernel.Size(), 0U) \
-          << "incorrect kernel size: " << param_.kernel;
-      CHECK_GT(param_.stride.Size(), 0U) \
-          << "incorrect stride size: " << param_.stride;
-      CHECK_GT(param_.dilate.Size(), 0U) \
-          << "incorrect dilate size: " << param_.dilate;
-      Shape<4> oshape;
-      oshape[0] = dshape[0];
-      oshape[1] = param_.num_filter;
-      oshape[2] = dshape[2] ?
-        (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0;
-      oshape[3] = dshape[3] ?
-        (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0;
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value()));
-      // Perform incomplete shape inference. Fill in the missing values in data shape.
-      // 1) We can always fill in the batch_size.
-      // 2) We can back-calculate the input height/width if the corresponding stride is 1.
-      oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW);
-      dshape[0] = oshape[0];
-      if (oshape[2] && param_.stride[0] == 1) {
-        dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0];
-      }
-      if (oshape[3] && param_.stride[1] == 1) {
-        dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1];
-      }
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
-                          ConvertLayout(dshape, kNCHW, param_.layout.value()));
-      // Check whether the kernel sizes are valid
-      if (dshape[2] != 0) {
-        CHECK_LE(dilated_ksize_y, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
-      }
-      if (dshape[3] != 0) {
-        CHECK_LE(dilated_ksize_x, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input";
-      }
-      return true;
-    } else if (param_.kernel.ndim() == 3) {
-      // 3d conv
-      CHECK_EQ(dshp.ndim(), 5U) \
-        << "Input data should be 5D in batch-num_filter-depth-y-x";
-      Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW);
-      Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group,
-                               param_.kernel[0], param_.kernel[1], param_.kernel[2]);
-      wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value());
-      wshape[0] *= param_.num_group;
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
-      if (!param_.no_bias) {
-        SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
-      }
-
-      // Note: 3D dilation currently not supported.
-      // Calculations below done to preserve symmetry with 1D/2D code.
-      const index_t dilated_ksize_d = param_.DilatedKernelSize(0);
-      const index_t dilated_ksize_y = param_.DilatedKernelSize(1);
-      const index_t dilated_ksize_x = param_.DilatedKernelSize(2);
-      CHECK_EQ(dshape[1] % param_.num_group, 0U)
-        << "input num_filter must divide group size";
-      CHECK_EQ(param_.num_filter % param_.num_group, 0U)
-        << "output num_filter must divide group size";
-      CHECK_GT(param_.kernel.Size(), 0U) \
-        << "incorrect kernel size: " << param_.kernel;
-      CHECK_GT(param_.stride.Size(), 0U) \
-        << "incorrect stride size: " << param_.stride;
-      CHECK_GT(param_.dilate.Size(), 0U) \
-        << "incorrect dilate size: " << param_.dilate;
-      CHECK_EQ(param_.dilate.Size(), 1U)
-        << "Dilate is not supported in 3d convolution";
-      Shape<5> oshape;
-      oshape[0] = dshape[0];
-      oshape[1] = param_.num_filter;
-      oshape[2] = dshape[2] ?
-        (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0;
-      oshape[3] = dshape[3] ?
-        (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0;
-      oshape[4] = dshape[4] ?
-        (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0;
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value()));
-      // Perform incomplete shape inference. Fill in the missing values in data shape.
-      // 1) We can always fill in the batch_size.
-      // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1.
-      oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW);
-      dshape[0] = oshape[0];
-      if (oshape[2] && param_.stride[0] == 1) {
-        dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0];
-      }
-      if (oshape[3] && param_.stride[1] == 1) {
-        dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1];
-      }
-      if (oshape[4] && param_.stride[2] == 1) {
-        dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2];
-      }
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
-                          ConvertLayout(dshape, kNCDHW, param_.layout.value()));
-      // Check whether the kernel sizes are valid
-      if (dshape[2] != 0) {
-        CHECK_LE(dilated_ksize_d, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
-      }
-      if (dshape[3] != 0) {
-        CHECK_LE(dilated_ksize_y, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input";
-      }
-      if (dshape[4] != 0) {
-        CHECK_LE(dilated_ksize_x, AddPad(dshape[4], param_.pad[2])) << "kernel size exceed input";
-      }
-      return true;
-    } else {
-      LOG(FATAL) << "Unknown convolution type";
-      return false;
-    }
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_GE(in_type->size(), 1U);
-    int dtype = (*in_type)[0];
-    CHECK_NE(dtype, -1) << "First input must have specified type";
-    for (index_t i = 0; i < in_type->size(); ++i) {
-      if ((*in_type)[i] == -1) {
-        (*in_type)[i] = dtype;
-      } else {
-        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
-      }
-    }
-    out_type->clear();
-    out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new ConvolutionProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "Convolution";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[conv::kOut], in_data[conv::kData], in_data[conv::kWeight]};
-  }
-
-  std::vector<ResourceRequest> ForwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    return {ResourceRequest::kTempSpace};
-  }
-
-  std::vector<ResourceRequest> BackwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    return {ResourceRequest::kTempSpace};
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return NULL;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
+void ConvolutionCompute(const nnvm::NodeAttrs& attrs,
+                        const OpContext& ctx, const std::vector<TBlob>& inputs,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<TBlob>& outputs) {
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  MSHADOW_REAL_TYPE_SWITCH(inputs[conv::kData].type_flag_, DType, {
+    ConvolutionOp<xpu, DType> op;
+    op.Init(param);
+    op.Forward(ctx, inputs, req, outputs);
+  });
+}
 
- private:
-  // Adds symmetric padding to a data input (in one dimension)
-  index_t AddPad(index_t dsize, index_t pad) const {
-    return dsize + 2 * pad;
-  }
+template<typename xpu>
+void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx, const std::vector<TBlob>& inputs,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<TBlob>& outputs) {
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
+  const TBlob &out_grad = inputs[0];
+  const std::vector<TBlob> &in_grad = outputs;
+
+  MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, {
+    ConvolutionOp<xpu, DType> op;
+    op.Init(param);
+    op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+  });
+}
 
-  ConvolutionParam param_;
-};  // class ConvolutionProp
-#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_CONVOLUTION_INL_H_
diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc
index ef8ec9034db2..951063fb4b2f 100644
--- a/src/operator/nn/convolution.cc
+++ b/src/operator/nn/convolution.cc
@@ -21,15 +21,13 @@
  * Copyright (c) 2017 by Contributors
  * \file convolution.cc
  * \brief
- * \author Bing Xu, Jun Wu
+ * \author Bing Xu, Jun Wu, Da Zheng
 */
 
 #include "./convolution-inl.h"
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "../mkl/mkl_memory-inl.h"
-#include "../mkl/mkl_convolution-inl.h"
-#endif  // MXNET_USE_MKL2017
+#include "../elemwise_op_common.h"
+#include "./mkldnn/mkldnn_ops-inl.h"
+#include "./mkldnn/mkldnn_base-inl.h"
 #if MXNET_USE_NNPACK == 1
 #include "./nnpack/nnpack_convolution-inl.h"
 #endif  // MXNET_USE_NNPACK
@@ -38,63 +36,351 @@ namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(ConvolutionParam);
 
-template<>
-Operator* CreateOp<cpu>(ConvolutionParam param, int dtype,
-                        std::vector<TShape> *in_shape,
-                        std::vector<TShape> *out_shape,
-                        Context ctx) {
-  Operator *op = NULL;
-  // If 1D convolution, use MXNet implementation
-  if (param.kernel.ndim() == 1) {
-    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      op = new ConvolutionOp<cpu, DType>(param);
-    })
-    return op;
+static inline index_t AddPad(index_t dsize, index_t pad) {
+  return dsize + 2 * pad;
+}
+
+static inline std::vector<std::string> ListArguments(const ConvolutionParam& param_) {
+  if (!param_.no_bias) {
+    return {"data", "weight", "bias"};
+  } else {
+    return {"data", "weight"};
   }
-#if MXNET_USE_MKL2017 == 1
-  if ((param.dilate[0] == 1 && param.dilate[1] == 1)
-      && param.kernel.ndim() == 2) {
-    switch (dtype) {
-    case mshadow::kFloat32:
-      return new MKLConvolutionOp<cpu, float>(param);
-    case mshadow::kFloat64:
-      return new MKLConvolutionOp<cpu, double>(param);
-    default:
-      break;
-    }
+}
+
+#if MXNET_USE_MKLDNN == 1
+static void ConvolutionComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                    const OpContext& ctx,
+                                    const std::vector<NDArray>& inputs,
+                                    const std::vector<OpReqType>& req,
+                                    const std::vector<NDArray>& outputs) {
+  if (SupportMKLDNNConv(inputs[0])) {
+    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    MKLDNNConvolutionForward(attrs, ctx, inputs, req, outputs);
+    MKLDNN_OPCHECK_RUN(ConvolutionCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    return;
   }
+  FallBackCompute(ConvolutionCompute<cpu>, attrs, ctx, inputs, req, outputs);
+}
+
+static void ConvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                        const OpContext& ctx,
+                                        const std::vector<NDArray>& inputs,
+                                        const std::vector<OpReqType>& req,
+                                        const std::vector<NDArray>& outputs) {
+  if (SupportMKLDNNConv(inputs[0])) {
+    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+    MKLDNNConvolutionBackward(attrs, ctx, inputs, req, outputs);
+    MKLDNN_OPCHECK_RUN(ConvolutionGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    return;
+  }
+  FallBackCompute(ConvolutionGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+}
 #endif
-#if MXNET_USE_NNPACK == 1
-  const size_t batch_size = (*in_shape)[0][0];
-  if ((param.dilate[0] == 1 && param.dilate[1] == 1)
-      && param.kernel.ndim() == 2 && (!param.no_bias)
-      && param.num_group == 1 && (batch_size == 1 ||
-      ((batch_size > 1) && (param.stride[0] == 1) &&
-      (param.stride[1] == 1)))) {
-    switch (dtype) {
-    case mshadow::kFloat32:
-      return new NNPACKConvolutionOp<cpu, float>(param);
-    default:
-      break;
+
+static bool ConvolutionShape(const nnvm::NodeAttrs& attrs,
+                             std::vector<TShape> *in_shape,
+                             std::vector<TShape> *out_shape) {
+  using namespace mshadow;
+  const ConvolutionParam& param_ = nnvm::get<ConvolutionParam>(attrs.parsed);
+  if (!param_.no_bias) {
+    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
+  } else {
+    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
+  }
+  // CHECK_EQ(out_shape->size(), 1) << "Output: [output]";
+  out_shape->resize(1, TShape());
+  const TShape &dshp = (*in_shape)[conv::kData];
+  if (dshp.ndim() ==  0) return false;
+
+  if (param_.kernel.ndim() == 1) {
+    // 1d conv
+    CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x";
+    Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW);
+    Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group,
+        param_.kernel[0]);
+    wshape = ConvertLayout(wshape, kNCW, param_.layout.value());
+    wshape[0] *= param_.num_group;
+    SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
+    }
+
+    const index_t dilated_ksize_x = param_.DilatedKernelSize(0);
+    CHECK_EQ(dshape[1] % param_.num_group, 0U) \
+      << "input num_filter must divide group size";
+    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+      << "output num_filter must divide group size";
+    CHECK_GT(param_.kernel.Size(), 0U) \
+      << "incorrect kernel size: " << param_.kernel;
+    CHECK_GT(param_.stride.Size(), 0U) \
+      << "incorrect stride size: " << param_.stride;
+    CHECK_GT(param_.dilate.Size(), 0U) \
+      << "incorrect dilate size: " << param_.dilate;
+    Shape<3> oshape;
+    oshape[0] = dshape[0];
+    oshape[1] = param_.num_filter;
+    oshape[2] = dshape[2] ?
+      (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0;
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value()));
+    // Perform incomplete shape inference. Fill in the missing values in data shape.
+    // 1) We can always fill in the batch_size.
+    // 2) We can back-calculate the input height/width if the corresponding stride is 1.
+    oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW);
+    dshape[0] = oshape[0];
+    if (oshape[2] && param_.stride[0] == 1) {
+      dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0];
+    }
+    SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
+        ConvertLayout(dshape, kNCW, param_.layout.value()));
+    // Check whether the kernel sizes are valid
+    if (dshape[2] != 0) {
+      CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
+    }
+    return true;
+  } else if (param_.kernel.ndim() == 2) {
+    // 2d conv
+    CHECK_EQ(dshp.ndim(), 4U) \
+      << "Input data should be 4D in batch-num_filter-y-x";
+    Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW);
+    Shape<4> wshape = Shape4(param_.num_filter / param_.num_group,
+        dshape[1] / param_.num_group,
+        param_.kernel[0], param_.kernel[1]);
+    wshape = ConvertLayout(wshape, kNCHW, param_.layout.value());
+    wshape[0] *= param_.num_group;
+    SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
+    }
+
+    const index_t dilated_ksize_y = param_.DilatedKernelSize(0);
+    const index_t dilated_ksize_x = param_.DilatedKernelSize(1);
+    CHECK_EQ(dshape[1] % param_.num_group, 0U) \
+      << "input num_filter must divide group size";
+    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+      << "output num_filter must divide group size";
+    CHECK_GT(param_.kernel.Size(), 0U) \
+      << "incorrect kernel size: " << param_.kernel;
+    CHECK_GT(param_.stride.Size(), 0U) \
+      << "incorrect stride size: " << param_.stride;
+    CHECK_GT(param_.dilate.Size(), 0U) \
+      << "incorrect dilate size: " << param_.dilate;
+    Shape<4> oshape;
+    oshape[0] = dshape[0];
+    oshape[1] = param_.num_filter;
+    oshape[2] = dshape[2] ?
+      (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0;
+    oshape[3] = dshape[3] ?
+      (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0;
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value()));
+    // Perform incomplete shape inference. Fill in the missing values in data shape.
+    // 1) We can always fill in the batch_size.
+    // 2) We can back-calculate the input height/width if the corresponding stride is 1.
+    oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW);
+    dshape[0] = oshape[0];
+    if (oshape[2] && param_.stride[0] == 1) {
+      dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0];
+    }
+    if (oshape[3] && param_.stride[1] == 1) {
+      dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1];
+    }
+    SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
+        ConvertLayout(dshape, kNCHW, param_.layout.value()));
+    // Check whether the kernel sizes are valid
+    if (dshape[2] != 0) {
+      CHECK_LE(dilated_ksize_y, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
+    }
+    if (dshape[3] != 0) {
+      CHECK_LE(dilated_ksize_x, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input";
+    }
+    return true;
+  } else if (param_.kernel.ndim() == 3) {
+    // 3d conv
+    CHECK_EQ(dshp.ndim(), 5U) \
+      << "Input data should be 5D in batch-num_filter-depth-y-x";
+    Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW);
+    Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group,
+        param_.kernel[0], param_.kernel[1], param_.kernel[2]);
+    wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value());
+    wshape[0] *= param_.num_group;
+    SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
+    }
+
+    // Note: 3D dilation currently not supported.
+    // Calculations below done to preserve symmetry with 1D/2D code.
+    const index_t dilated_ksize_d = param_.DilatedKernelSize(0);
+    const index_t dilated_ksize_y = param_.DilatedKernelSize(1);
+    const index_t dilated_ksize_x = param_.DilatedKernelSize(2);
+    CHECK_EQ(dshape[1] % param_.num_group, 0U)
+      << "input num_filter must divide group size";
+    CHECK_EQ(param_.num_filter % param_.num_group, 0U)
+      << "output num_filter must divide group size";
+    CHECK_GT(param_.kernel.Size(), 0U) \
+      << "incorrect kernel size: " << param_.kernel;
+    CHECK_GT(param_.stride.Size(), 0U) \
+      << "incorrect stride size: " << param_.stride;
+    CHECK_GT(param_.dilate.Size(), 0U) \
+      << "incorrect dilate size: " << param_.dilate;
+    CHECK_EQ(param_.dilate.Size(), 1U)
+      << "Dilate is not supported in 3d convolution";
+    Shape<5> oshape;
+    oshape[0] = dshape[0];
+    oshape[1] = param_.num_filter;
+    oshape[2] = dshape[2] ?
+      (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0;
+    oshape[3] = dshape[3] ?
+      (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0;
+    oshape[4] = dshape[4] ?
+      (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0;
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value()));
+    // Perform incomplete shape inference. Fill in the missing values in data shape.
+    // 1) We can always fill in the batch_size.
+    // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1.
+    oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW);
+    dshape[0] = oshape[0];
+    if (oshape[2] && param_.stride[0] == 1) {
+      dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0];
     }
+    if (oshape[3] && param_.stride[1] == 1) {
+      dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1];
+    }
+    if (oshape[4] && param_.stride[2] == 1) {
+      dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2];
+    }
+    SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
+        ConvertLayout(dshape, kNCDHW, param_.layout.value()));
+    // Check whether the kernel sizes are valid
+    if (dshape[2] != 0) {
+      CHECK_LE(dilated_ksize_d, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
+    }
+    if (dshape[3] != 0) {
+      CHECK_LE(dilated_ksize_y, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input";
+    }
+    if (dshape[4] != 0) {
+      CHECK_LE(dilated_ksize_x, AddPad(dshape[4], param_.pad[2])) << "kernel size exceed input";
+    }
+    return true;
+  } else {
+    LOG(FATAL) << "Unknown convolution type";
+    return false;
   }
+}
+
+static bool ConvolutionType(const nnvm::NodeAttrs& attrs,
+                            std::vector<int> *in_type, std::vector<int> *out_type) {
+  const ConvolutionParam& param_ = nnvm::get<ConvolutionParam>(attrs.parsed);
+  CHECK_GE(in_type->size(), 1U);
+  int dtype = (*in_type)[0];
+  CHECK_NE(dtype, -1) << "First input must have specified type";
+  for (index_t i = 0; i < in_type->size(); ++i) {
+    if ((*in_type)[i] == -1) {
+      (*in_type)[i] = dtype;
+    } else {
+      UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]);
+    }
+  }
+  out_type->clear();
+  out_type->push_back(dtype);
+  return true;
+}
+
+inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs,
+                                   const int dev_mask,
+                                   DispatchMode* dispatch_mode,
+                                   std::vector<int> *in_attrs,
+                                   std::vector<int> *out_attrs) {
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  uint32_t in_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(in_attrs->size(), in_expected);
+  CHECK_EQ(out_attrs->size(), 1);
+
+  DispatchMode wanted_mode;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask)
+    wanted_mode = DispatchMode::kFComputeEx;
+  else
 #endif
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new ConvolutionOp<cpu, DType>(param);
-  })
-  return op;
+    wanted_mode = DispatchMode::kFCompute;
+  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                             dispatch_mode, wanted_mode);
 }
 
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *ConvolutionProp::CreateOperatorEx(Context ctx,
-                                            std::vector<TShape> *in_shape,
-                                            std::vector<int> *in_type) const {
-  std::vector<TShape> out_shape, aux_shape;
-  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx);
+inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs,
+                                           const int dev_mask,
+                                           DispatchMode* dispatch_mode,
+                                           std::vector<int> *in_attrs,
+                                           std::vector<int> *out_attrs) {
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  uint32_t in_expected = param.no_bias ? 3 : 4;
+  uint32_t out_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(in_attrs->size(), in_expected);
+  CHECK_EQ(out_attrs->size(), out_expected);
+
+  DispatchMode wanted_mode;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask)
+    wanted_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    wanted_mode = DispatchMode::kFCompute;
+  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                             dispatch_mode, wanted_mode);
+}
+
+static void ConvolutionParamParser(nnvm::NodeAttrs* attrs) {
+  using namespace mshadow;
+  ConvolutionParam param_;
+  try {
+    param_.Init(attrs->dict);
+  } catch (const dmlc::ParamError& e) {
+    std::ostringstream os;
+    os << e.what();
+    os << ", in operator " << attrs->op->name << "("
+       << "name=\"" << attrs->name << "\"";
+    for (const auto& k : attrs->dict) {
+      os << ", " << k.first << "=\"" << k.second << "\"";
+    }
+    os << ")";
+    throw dmlc::ParamError(os.str());
+  }
+
+  if (param_.kernel.ndim() == 1) {
+    param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW;
+    if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
+    if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1);
+    if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
+  } else if (param_.kernel.ndim() == 2) {
+    param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW;
+    if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
+    if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1);
+    if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
+  } else {
+    CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D convolution not supported";
+    param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW;
+    if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
+    if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1);
+    if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
+  }
+  attrs->parsed = std::move(param_);
 }
 
-MXNET_REGISTER_OP_PROPERTY(Convolution, ConvolutionProp)
+struct ConvolutionGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    const ConvolutionParam& param = nnvm::get<ConvolutionParam>(n->attrs.parsed);
+    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
+    heads.push_back(n->inputs[conv::kData]);
+    heads.push_back(n->inputs[conv::kWeight]);
+    if (!param.no_bias)
+      heads.push_back(n->inputs[conv::kBias]);
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
+
+NNVM_REGISTER_OP(Convolution)
 .describe(R"code(Compute *N*-D convolution on *(N+2)*-D input.
 
 In the 2-D convolution, given input data with shape *(batch_size,
@@ -168,10 +454,51 @@ There are other options to tune the performance.
   the performance.
 
 )code" ADD_FILELINE)
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
+  return params.no_bias ? 2 : 3;
+})
+.set_num_outputs(1)
+.set_attr_parser(ConvolutionParamParser)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
+  if (params.no_bias)
+    return std::vector<std::string>{"data", "weight"};
+  else
+    return std::vector<std::string>{"data", "weight", "bias"};
+})
+.set_attr<nnvm::FInferShape>("FInferShape", ConvolutionShape)
+.set_attr<nnvm::FInferType>("FInferType", ConvolutionType)
+.set_attr<FInferStorageType>("FInferStorageType", ConvStorageType)
+.set_attr<FCompute>("FCompute<cpu>", ConvolutionCompute<cpu>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionComputeExCPU)
+#endif
+.set_attr<nnvm::FGradient>("FGradient", ConvolutionGrad{"_backward_Convolution"})
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
 .add_argument("data", "NDArray-or-Symbol", "Input data to the ConvolutionOp.")
 .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.")
 .add_argument("bias", "NDArray-or-Symbol", "Bias parameter.")
 .add_arguments(ConvolutionParam::__FIELDS__());
 
+NNVM_REGISTER_OP(_backward_Convolution)
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
+  return params.no_bias ? 2 : 3;
+})
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FInferStorageType>("FInferStorageType", BackwardConvStorageType)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+.set_attr_parser(ConvolutionParamParser)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionGradComputeExCPU)
+#endif
+.set_attr<FCompute>("FCompute<cpu>", ConvolutionGradCompute<cpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu
index 7234daf0d614..d7f9e564a603 100644
--- a/src/operator/nn/convolution.cu
+++ b/src/operator/nn/convolution.cu
@@ -21,36 +21,66 @@
  * Copyright (c) 2017 by Contributors
  * \file convolution.cu
  * \brief
- * \author Bing Xu, Jun Wu
+ * \author Bing Xu, Jun Wu, Da Zheng
 */
 
 #include "./convolution-inl.h"
 #include <vector>
+#include "./depthwise_convolution-inl.h"
 #if MXNET_USE_CUDNN == 1
 #include "./cudnn/cudnn_convolution-inl.h"
 #endif  // MXNET_USE_CUDNN
 
-#include "./depthwise_convolution-inl.h"
-
 namespace mxnet {
 namespace op {
 
+#if MXNET_USE_CUDNN == 1
+template<typename DType>
+static CuDNNConvolutionOp<DType> &GetCuDNNConvOp(const ConvolutionParam& param,
+    int forward_compute_type, int backward_compute_type,
+    const std::vector<TShape>& in_shape, const std::vector<TShape>& out_shape,
+    const Context& ctx) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local CuDNNConvolutionOp<DType> op;
+#else
+  static MX_THREAD_LOCAL CuDNNConvolutionOp<DType> op;
+#endif
+  op.Init(param, forward_compute_type, backward_compute_type,
+      in_shape, out_shape, ctx);
+  return op;
+}
+#endif
+
 template<>
-Operator* CreateOp<gpu>(ConvolutionParam param, int dtype,
-                        std::vector<TShape> *in_shape,
-                        std::vector<TShape> *out_shape,
-                        Context ctx) {
-  Operator *op = NULL;
-
-  // depth wise conv
-  if (param.num_filter == param.num_group &&
+void ConvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  int dtype = inputs[conv::kData].type_flag_;
+
+  // If 1D convolution, use MXNet implementation
+  if (param.kernel.ndim() == 1) {
+    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+      ConvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Forward(ctx, inputs, req, outputs);
+    })
+    return;
+  } else if (param.num_filter == param.num_group &&
       param.layout.value() == mshadow::kNCHW &&
-      param.num_filter == (*in_shape)[conv::kData][1] &&
+      param.num_filter == inputs[conv::kData].shape_[1] &&
       param.kernel.ndim() == 2 &&
       param.dilate == mshadow::Shape2(1, 1) &&
       dtype == mshadow::kFloat32) {
-    op = new DepthwiseConvolutionOp<float>(param, *in_shape, *out_shape);
-    return op;
+    std::vector<TShape> in_shape(inputs.size());
+    std::vector<TShape> out_shape(1, outputs[0].shape_);
+    for (size_t i = 0; i < in_shape.size(); i++)
+      in_shape[i] = inputs[i].shape_;
+    DepthwiseConvolutionOp<float> op;
+    op.Init(param, in_shape, out_shape);
+    op.Forward(ctx, inputs, req, outputs);
+    return;
   }
 
 #if MXNET_USE_CUDNN == 1
@@ -59,23 +89,111 @@ Operator* CreateOp<gpu>(ConvolutionParam param, int dtype,
 
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     if (param.cudnn_off) {
-      op = new ConvolutionOp<gpu, DType>(param);
-    } else if (!CuDNNConvolutionOp<DType>::Supports(param, compute_type, compute_type, ctx)) {
+      ConvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Forward(ctx, inputs, req, outputs);
+    } else if (!CuDNNConvolutionOp<DType>::Supports(param,
+          compute_type, compute_type, ctx.run_ctx.ctx)) {
       LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied.";
-      op = new ConvolutionOp<gpu, DType>(param);
+      ConvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Forward(ctx, inputs, req, outputs);
     } else {
-      op = new CuDNNConvolutionOp<DType>(param, compute_type, compute_type,
-                                         *in_shape, *out_shape, ctx);
+      std::vector<TShape> in_shape(inputs.size());
+      std::vector<TShape> out_shape(1, outputs[0].shape_);
+      for (size_t i = 0; i < in_shape.size(); i++)
+        in_shape[i] = inputs[i].shape_;
+      CuDNNConvolutionOp<DType> &op = GetCuDNNConvOp<DType>(param,
+          compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx);
+      op.Forward(ctx, inputs, req, outputs);
     }
   })
 #else
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new ConvolutionOp<gpu, DType>(param);
+    ConvolutionOp<gpu, DType> op;
+    op.Init(param);
+    op.Forward(ctx, inputs, req, outputs);
+  })
+#endif  // MXNET_USE_CUDNN
+}
+
+template<>
+void ConvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
+  const TBlob &out_grad = inputs[0];
+  const std::vector<TBlob> &in_grad = outputs;
+  int dtype = out_grad.type_flag_;
+
+  // If 1D convolution, use MXNet implementation
+  if (param.kernel.ndim() == 1) {
+    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+      ConvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    })
+    return;
+  } else if (param.num_filter == param.num_group &&
+      param.layout.value() == mshadow::kNCHW &&
+      param.num_filter == in_data[conv::kData].shape_[1] &&
+      param.kernel.ndim() == 2 &&
+      param.dilate == mshadow::Shape2(1, 1) &&
+      dtype == mshadow::kFloat32) {
+    // The first element stores out grad.
+    std::vector<TShape> in_shape(in_data.size());
+    std::vector<TShape> out_shape(1, out_grad.shape_);
+    for (size_t i = 0; i < in_shape.size(); i++)
+      in_shape[i] = in_data[i].shape_;
+    DepthwiseConvolutionOp<float> op;
+    op.Init(param, in_shape, out_shape);
+    op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    return;
+  }
+
+#if MXNET_USE_CUDNN == 1
+  // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16).
+  int compute_type = (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype;
+
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    if (param.cudnn_off) {
+      ConvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    } else if (!CuDNNConvolutionOp<DType>::Supports(param,
+          compute_type, compute_type, ctx.run_ctx.ctx)) {
+      LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied.";
+      ConvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    } else {
+      // The first element stores out grad.
+      std::vector<TShape> in_shape(in_data.size());
+      std::vector<TShape> out_shape(1, out_grad.shape_);
+      for (size_t i = 0; i < in_shape.size(); i++)
+        in_shape[i] = in_data[i].shape_;
+      CuDNNConvolutionOp<DType> &op = GetCuDNNConvOp<DType>(param,
+          compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx);
+      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    }
+  })
+#else
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    ConvolutionOp<gpu, DType> op;
+    op.Init(param);
+    op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
   })
 #endif  // MXNET_USE_CUDNN
-  return op;
 }
 
+NNVM_REGISTER_OP(Convolution)
+.set_attr<FCompute>("FCompute<gpu>", ConvolutionCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_Convolution)
+.set_attr<FCompute>("FCompute<gpu>", ConvolutionGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
 
diff --git a/src/operator/nn/cudnn/cudnn_activation-inl.h b/src/operator/nn/cudnn/cudnn_activation-inl.h
index 888528309cdf..a89e7bfaf080 100644
--- a/src/operator/nn/cudnn/cudnn_activation-inl.h
+++ b/src/operator/nn/cudnn/cudnn_activation-inl.h
@@ -33,12 +33,19 @@
 namespace mxnet {
 namespace op {
 template<typename DType>
-class CuDNNActivationOp : public Operator {
+class CuDNNActivationOp {
  public:
-  explicit CuDNNActivationOp(ActivationParam param) {
-    param_ = param;
-    init_cudnn_ = false;
+  CuDNNActivationOp() {
     dtype_ = mshadow::DataType<DType>::kCudnnFlag;
+    #if CUDNN_MAJOR >= 5
+    nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
+    CUDNN_CALL(cudnnCreateActivationDescriptor(&desc_));
+    #endif
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_));
+  }
+
+  void Init(const ActivationParam &param) {
+    param_ = param;
     switch (param_.act_type) {
       case activation::kReLU:
         mode_ = CUDNN_ACTIVATION_RELU;
@@ -54,67 +61,54 @@ class CuDNNActivationOp : public Operator {
         break;
     }
     #if CUDNN_MAJOR >= 5
-    nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
-    CUDNN_CALL(cudnnCreateActivationDescriptor(&desc_));
     CUDNN_CALL(cudnnSetActivationDescriptor(desc_, mode_, nan_prop_, relu_ceil_));
     #endif
   }
 
   ~CuDNNActivationOp() {
-    if (init_cudnn_) {
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_));
-      #if CUDNN_MAJOR >= 5
-      CUDNN_CALL(cudnnDestroyActivationDescriptor(desc_));
-      #endif
-    }
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_));
+    #if CUDNN_MAJOR >= 5
+    CUDNN_CALL(cudnnDestroyActivationDescriptor(desc_));
+    #endif
   }
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
+  void Forward(const OpContext &ctx, const TBlob &in_data,
+      const OpReqType &req, const TBlob &out_data) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     Tensor<gpu, 4, DType> data;
     Tensor<gpu, 4, DType> out;
-    if (in_data[activation::kData].ndim() == 2) {
-      Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0],
-                               in_data[activation::kData].shape_[1], 1, 1);
-      data = in_data[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
-      out = out_data[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
+    if (in_data.ndim() == 2) {
+      Shape<4> dshape = Shape4(in_data.shape_[0],
+                               in_data.shape_[1], 1, 1);
+      data = in_data.get_with_shape<gpu, 4, DType>(dshape, s);
+      out = out_data.get_with_shape<gpu, 4, DType>(dshape, s);
     } else {
       Shape<4> dshape;
-      index_t size_left = in_data[activation::kData].Size();
+      index_t size_left = in_data.Size();
       for (int i = 0; i < 3; ++i) {
-        if (i < in_data[activation::kData].ndim()) {
-          dshape[i] = in_data[activation::kData].shape_[i];
+        if (i < in_data.ndim()) {
+          dshape[i] = in_data.shape_[i];
         } else {
           dshape[i] = 1;
         }
         size_left /= dshape[i];
       }
       dshape[3] = size_left;
-      data = in_data[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
-      out = out_data[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
+      data = in_data.get_with_shape<gpu, 4, DType>(dshape, s);
+      out = out_data.get_with_shape<gpu, 4, DType>(dshape, s);
     }
     typename DataType<DType>::ScaleType alpha = 1.0f;
     typename DataType<DType>::ScaleType beta = 0.0f;
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
-    if (!init_cudnn_) {
-      init_cudnn_ = true;
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_));
-      CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
-                                            CUDNN_TENSOR_NCHW,
-                                            dtype_,
-                                            data.shape_[0],
-                                            data.shape_[1],
-                                            data.shape_[2],
-                                            data.shape_[3]));
-    }
+    CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
+                                          CUDNN_TENSOR_NCHW,
+                                          dtype_,
+                                          data.shape_[0],
+                                          data.shape_[1],
+                                          data.shape_[2],
+                                          data.shape_[3]));
     #if CUDNN_MAJOR <= 4
     CUDNN_CALL(cudnnActivationForward(s->dnn_handle_,
                                       mode_,
@@ -136,20 +130,11 @@ class CuDNNActivationOp : public Operator {
     #endif
   }
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
+  void Backward(const OpContext &ctx, const TBlob &out_grad,
+      const TBlob &in_data, const TBlob &out_data,
+      const OpReqType &req, const TBlob &in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    CHECK_EQ(req.size(), 1U);
-    CHECK_EQ(in_grad.size(), 1U);
     typename DataType<DType>::ScaleType alpha = 1.0f;
     typename DataType<DType>::ScaleType beta = 0.0f;
     Stream<gpu> *s = ctx.get_stream<gpu>();
@@ -157,31 +142,38 @@ class CuDNNActivationOp : public Operator {
     Tensor<gpu, 4, DType> data;
     Tensor<gpu, 4, DType> output_data;
     Tensor<gpu, 4, DType> input_grad;
-    if (in_grad[activation::kData].ndim() == 2) {
-      Shape<4> dshape = Shape4(in_grad[activation::kData].shape_[0],
-                               in_grad[activation::kData].shape_[1], 1, 1);
-      data = in_data[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
-      grad = out_grad[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
-      output_data = out_data[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
-      input_grad = in_grad[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
+    if (in_grad.ndim() == 2) {
+      Shape<4> dshape = Shape4(in_grad.shape_[0],
+                               in_grad.shape_[1], 1, 1);
+      data = in_data.get_with_shape<gpu, 4, DType>(dshape, s);
+      grad = out_grad.get_with_shape<gpu, 4, DType>(dshape, s);
+      output_data = out_data.get_with_shape<gpu, 4, DType>(dshape, s);
+      input_grad = in_grad.get_with_shape<gpu, 4, DType>(dshape, s);
     } else {
       Shape<4> dshape;
-      index_t size_left = in_grad[activation::kData].Size();
+      index_t size_left = in_grad.Size();
       for (int i = 0; i < 3; ++i) {
-        if (i < in_grad[activation::kData].ndim()) {
-          dshape[i] = in_grad[activation::kData].shape_[i];
+        if (i < in_grad.ndim()) {
+          dshape[i] = in_grad.shape_[i];
         } else {
           dshape[i] = 1;
         }
         size_left /= dshape[i];
       }
       dshape[3] = size_left;
-      data = in_data[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
-      output_data = out_data[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
-      grad = out_grad[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
-      input_grad = in_grad[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
+      data = in_data.get_with_shape<gpu, 4, DType>(dshape, s);
+      output_data = out_data.get_with_shape<gpu, 4, DType>(dshape, s);
+      grad = out_grad.get_with_shape<gpu, 4, DType>(dshape, s);
+      input_grad = in_grad.get_with_shape<gpu, 4, DType>(dshape, s);
     }
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
+    CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
+                                          CUDNN_TENSOR_NCHW,
+                                          dtype_,
+                                          data.shape_[0],
+                                          data.shape_[1],
+                                          data.shape_[2],
+                                          data.shape_[3]));
     #if CUDNN_MAJOR <= 4
     CUDNN_CALL(cudnnActivationBackward(s->dnn_handle_,
                                        mode_,
@@ -212,7 +204,6 @@ class CuDNNActivationOp : public Operator {
   }
 
  private:
-  bool init_cudnn_;
   cudnnDataType_t dtype_;
   cudnnActivationMode_t mode_;
   cudnnTensorDescriptor_t shape_desc_;
diff --git a/src/operator/nn/cudnn/cudnn_batch_norm-inl.h b/src/operator/nn/cudnn/cudnn_batch_norm-inl.h
index 3dc9c8353a35..e2337049060e 100644
--- a/src/operator/nn/cudnn/cudnn_batch_norm-inl.h
+++ b/src/operator/nn/cudnn/cudnn_batch_norm-inl.h
@@ -43,28 +43,30 @@ enum CuDNNBatchNormOpAuxiliary {kMovingMean, kMovingInvVar};
 
 #if defined(__CUDACC__)
 template<typename DType>
-class CuDNNBatchNormOp : public Operator {
+class CuDNNBatchNormOp {
  public:
-  explicit CuDNNBatchNormOp(BatchNormParam param) {
+  CuDNNBatchNormOp() {
     using namespace mshadow;
-    CHECK_GE(param.eps, CUDNN_BN_MIN_EPSILON)
-     << "CuDNN requires eps to be no less than " << CUDNN_BN_MIN_EPSILON;
-    this->param_ = param;
-    init_cudnn_ = false;
     dtype_ = DataType<DType>::kCudnnFlag;
     // For float16 input type beta, gamma, mean, and average are stored in float32.
     // For other input types, these parameters have the same type as input
     dtype_param_ = (dtype_ == CUDNN_DATA_HALF) ? kFloat32 : DataType<DType>::kFlag;
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&io_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&mean_desc_));
+  }
+
+  void Init(const BatchNormParam &param) {
+    CHECK_GE(param.eps, CUDNN_BN_MIN_EPSILON)
+     << "CuDNN requires eps to be no less than " << CUDNN_BN_MIN_EPSILON;
+    this->param_ = param;
   }
 
   ~CuDNNBatchNormOp() {
-    if (init_cudnn_) {
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(io_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(mean_desc_));
-    }
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(io_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(mean_desc_));
   }
 
-  virtual void Forward(const OpContext &ctx,
+  void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &out_data,
@@ -84,29 +86,7 @@ class CuDNNBatchNormOp : public Operator {
     CHECK_GE(in_data[cudnnbatchnorm::kData].ndim(), 2);
     CHECK_LE(in_data[cudnnbatchnorm::kData].ndim(), 4);
 
-    if (!init_cudnn_) {
-      for (int i = 0; i < 4; ++i) {
-        if (i < in_data[cudnnbatchnorm::kData].ndim()) {
-          shape_[i] = in_data[cudnnbatchnorm::kData].shape_[i];
-        } else {
-          shape_[i] = 1;
-        }
-      }
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&io_desc_));
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&mean_desc_));
-      CUDNN_CALL(cudnnSetTensor4dDescriptor(io_desc_,
-                                            CUDNN_TENSOR_NCHW,
-                                            dtype_,
-                                            shape_[0],
-                                            shape_[1],
-                                            shape_[2],
-                                            shape_[3]));
-      CUDNN_CALL(cudnnDeriveBNTensorDescriptor(mean_desc_,
-                                               io_desc_,
-                                               CUDNN_BATCHNORM_SPATIAL));
-      init_cudnn_  = true;
-    }
-
+    Init(in_data[cudnnbatchnorm::kData]);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     Tensor<gpu, 4, DType> x =
       in_data[cudnnbatchnorm::kData].get_with_shape<gpu, 4, DType>(shape_, s);
@@ -177,7 +157,7 @@ class CuDNNBatchNormOp : public Operator {
     })
   }
 
-  virtual void Backward(const OpContext &ctx,
+  void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
                         const std::vector<TBlob> &out_data,
@@ -193,6 +173,7 @@ class CuDNNBatchNormOp : public Operator {
     CHECK(ctx.is_train && !param_.use_global_stats)
         << "use global statistics is not yet supported in CuDNNBatchNorm";
 
+    Init(in_data[cudnnbatchnorm::kData]);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     Tensor<gpu, 4, DType> x =
       in_data[cudnnbatchnorm::kData].get_with_shape<gpu, 4, DType>(shape_, s);
@@ -290,7 +271,27 @@ class CuDNNBatchNormOp : public Operator {
   }
 
  private:
-  bool init_cudnn_;
+  void Init(const TBlob &in_data) {
+    for (int i = 0; i < 4; ++i) {
+      if (i < in_data.ndim()) {
+        shape_[i] = in_data.shape_[i];
+      } else {
+        shape_[i] = 1;
+      }
+    }
+
+    CUDNN_CALL(cudnnSetTensor4dDescriptor(io_desc_,
+                                          CUDNN_TENSOR_NCHW,
+                                          dtype_,
+                                          shape_[0],
+                                          shape_[1],
+                                          shape_[2],
+                                          shape_[3]));
+    CUDNN_CALL(cudnnDeriveBNTensorDescriptor(mean_desc_,
+                                             io_desc_,
+                                             CUDNN_BATCHNORM_SPATIAL));
+  }
+
   cudnnDataType_t dtype_;
   int dtype_param_;
   cudnnTensorDescriptor_t io_desc_, mean_desc_;
@@ -299,91 +300,6 @@ class CuDNNBatchNormOp : public Operator {
 };
 #endif  // defined(__CUDACC__)
 
-template<typename xpu>
-Operator *CreateOp_CuDNNv4(BatchNormParam param);
-
-
-#if DMLC_USE_CXX11
-class CuDNNBatchNormProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]";
-    const TShape &dshape = in_shape->at(0);
-    if (dshape.ndim() == 0) return false;
-    in_shape->at(1) = TShape(Shape1(dshape[1]));
-    in_shape->at(2) = TShape(Shape1(dshape[1]));
-
-    out_shape->clear();
-    out_shape->push_back(dshape);
-    out_shape->push_back(Shape1(dshape[1]));
-    out_shape->push_back(Shape1(dshape[1]));
-
-    aux_shape->clear();
-    aux_shape->push_back(Shape1(dshape[1]));
-    aux_shape->push_back(Shape1(dshape[1]));
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new CuDNNBatchNormProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "CuDNNBatchNorm";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[cudnnbatchnorm::kOut],
-            out_data[cudnnbatchnorm::kMean],
-            out_data[cudnnbatchnorm::kInvVar],
-            in_data[cudnnbatchnorm::kData],
-            in_data[cudnnbatchnorm::kGamma]
-           };
-  }
-
-  int NumVisibleOutputs() const override {
-    return 1;
-  }
-
-  int NumOutputs() const override {
-    return 3;
-  }
-
-  std::vector<std::string> ListArguments() const override {
-    return {"data", "gamma", "beta"};
-  }
-
-  std::vector<std::string> ListOutputs() const override {
-    return {"output", "mean", "inv_var"};
-  }
-
-  std::vector<std::string> ListAuxiliaryStates() const override {
-    return {"moving_mean", "moving_inv_var"};
-  }
-
-  Operator* CreateOperator(Context ctx) const override;
-
- private:
-  BatchNormParam param_;
-};  // class CuDNNBatchNormProp
-
-#endif  // DMLC_USE_CXX11
 #endif  // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cc b/src/operator/nn/cudnn/cudnn_batch_norm.cc
index e1e0c999b1fb..f1d229dd5421 100644
--- a/src/operator/nn/cudnn/cudnn_batch_norm.cc
+++ b/src/operator/nn/cudnn/cudnn_batch_norm.cc
@@ -21,46 +21,100 @@
  * Copyright (c) 2015 by Contributors
  * \file cudnn_batch_norm.cc
  * \brief
- * \author Junyuan Xie
+ * \author Junyuan Xie, Da Zheng
 */
 
 #include "./cudnn_batch_norm-inl.h"
 #include <nnvm/op_attr_types.h>
+#include "../../elemwise_op_common.h"
 
 namespace mxnet {
 namespace op {
-#if CUDNN_MAJOR >= 4
-template<>
-Operator *CreateOp_CuDNNv4<cpu>(BatchNormParam param) {
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4
+
+static bool BatchNormShape(const nnvm::NodeAttrs& attrs, std::vector<TShape> *in_shape,
+    std::vector<TShape> *out_shape) {
+  using namespace mshadow;
+  CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, moving_mean, moving_var]";
+  const TShape &dshape = in_shape->at(0);
+  if (dshape.ndim() == 0) return false;
+  in_shape->at(1) = TShape(Shape1(dshape[1]));
+  in_shape->at(2) = TShape(Shape1(dshape[1]));
+  in_shape->at(3) = TShape(Shape1(dshape[1]));
+  in_shape->at(4) = TShape(Shape1(dshape[1]));
+
+  out_shape->clear();
+  out_shape->push_back(dshape);
+  out_shape->push_back(Shape1(dshape[1]));
+  out_shape->push_back(Shape1(dshape[1]));
+
+  return true;
+}
+
+static void BatchNormCompute_CPU(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
   LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu.";
-  return NULL;
 }
 
-Operator *CuDNNBatchNormProp::CreateOperator(Context ctx) const {
-#if CUDNN_MAJOR >= 5
-  LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5."
-                "Use the later instead.";
-  return nullptr;
-#else
-  DO_BIND_DISPATCH(CreateOp_CuDNNv4, param_);
-#endif
+static void BatchNormGradCompute_CPU(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+  LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu.";
 }
 
-MXNET_REGISTER_OP_PROPERTY(CuDNNBatchNorm, CuDNNBatchNormProp)
+NNVM_REGISTER_OP(CuDNNBatchNorm)
 .describe("Apply batch normalization to input.")
+.set_num_inputs(5)
+.set_num_outputs(3)
+.set_attr_parser(ParamParser<BatchNormParam>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"data", "gamma", "beta", "moving_mean", "moving_var"};
+})
+.set_attr<nnvm::FListOutputNames>("FListOutputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"output", "mean", "var"};
+})
+.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
+    [](const NodeAttrs& attrs) {
+  return 1;
+})
+.set_attr<nnvm::FMutateInputs>("FMutateInputs", [](const nnvm::NodeAttrs& attrs) {
+  return std::vector<uint32_t>{3, 4};
+})
+.set_attr<nnvm::FInferShape>("FInferShape", BatchNormShape)
+.set_attr<FCompute>("FCompute<cpu>", BatchNormCompute_CPU)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseInOut{"_backward_CuDNNBatchNorm"})
 .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization")
-.add_arguments(BatchNormParam::__FIELDS__());
+.add_argument("gamma", "NDArray-or-Symbol", "gamma array")
+.add_argument("beta", "NDArray-or-Symbol", "beta array")
+.add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input")
+.add_argument("moving_var", "NDArray-or-Symbol", "running variance of input")
+.add_arguments(BatchNormParam::__FIELDS__())
+.set_attr<nnvm::FSetInputVarAttrOnCompose>(
+  "FSetInputVarAttrOnCompose",
+  [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) {
+    if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return;
+    if (index == 3) {
+      var->attrs.dict["__init__"] = "[\"zero\", {}]";
+    } else if (index == 4) {
+      var->attrs.dict["__init__"] = "[\"one\", {}]";
+    }
+  });
+
+NNVM_REGISTER_OP(_backward_CuDNNBatchNorm)
+.set_num_outputs(5)
+.set_attr<nnvm::FMutateInputs>("FMutateInputs", [](const nnvm::NodeAttrs& attrs) {
+  return std::vector<uint32_t>{6, 7};
+})
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr_parser(ParamParser<BatchNormParam>)
+.set_attr<FCompute>("FCompute<cpu>", BatchNormGradCompute_CPU);
 
-NNVM_REGISTER_OP(CuDNNBatchNorm)
-.set_attr<nnvm::FSetInputVarAttrOnCompose>("FSetInputVarAttrOnCompose",
-    [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) {
-      if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return;
-      if (index == 3) {
-        var->attrs.dict["__init__"] = "[\"zero\", {}]";
-      } else if (index == 4) {
-        var->attrs.dict["__init__"] = "[\"zero\", {}]";
-      }
-    });
 #endif  // CUDNN_MAJOR >= 4
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cu b/src/operator/nn/cudnn/cudnn_batch_norm.cu
index e96db2e5e73f..e07cd1e6c8f6 100644
--- a/src/operator/nn/cudnn/cudnn_batch_norm.cu
+++ b/src/operator/nn/cudnn/cudnn_batch_norm.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file cudnn_batch_norm.cu
  * \brief
- * \author Junyuan Xie
+ * \author Junyuan Xie, Da Zheng
 */
 
 #include "./cudnn_batch_norm-inl.h"
@@ -30,10 +30,60 @@
 namespace mxnet {
 namespace op {
 #if CUDNN_MAJOR == 4
-template<>
-Operator *CreateOp_CuDNNv4<gpu>(BatchNormParam param) {
-  return new CuDNNBatchNormOp<float>(param);
+
+template<typename DType>
+static CuDNNBatchNormOp<DType> &GetCuDNNOp(const BatchNormParam& param) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local CuDNNBatchNormOp<DType> op;
+#else
+  static MX_THREAD_LOCAL CuDNNBatchNormOp<DType> op;
+#endif
+  op.Init(param);
+  return op;
+}
+
+static void BatchNormCompute_CuDNNv4(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+#if CUDNN_MAJOR >= 5
+  LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5."
+    "Use the later instead.";
+#else
+  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 5U);
+  std::vector<TBlob> in_data(inputs.begin(), inputs.begin() + 3);
+  std::vector<TBlob> aux_states(inputs.begin() + 3, inputs.end());
+  GetCuDNNOp<float>(param).Forward(ctx, in_data, req, outputs, aux_states);
+#endif
+}
+
+static void BatchNormGradCompute_CuDNNv4(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+#if CUDNN_MAJOR >= 5
+  LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5."
+    "Use the later instead.";
+#else
+  CHECK_EQ(inputs.size(), 11U);
+  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+  std::vector<TBlob> out_grad(1, inputs[0]);
+  std::vector<TBlob> in_data(inputs.begin() + 3, inputs.begin() + 6);
+  std::vector<TBlob> aux_states(inputs.begin() + 6, inputs.begin() + 8);
+  std::vector<TBlob> out_data(inputs.begin() + 8, inputs.end());
+  std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);
+  GetCuDNNOp<float>(param).Backward(ctx, out_grad, in_data, out_data,
+      req, in_grad, aux_states);
+#endif
 }
+
+NNVM_REGISTER_OP(CuDNNBatchNorm)
+.set_attr<FCompute>("FCompute<gpu>", BatchNormCompute_CuDNNv4);
+
+NNVM_REGISTER_OP(_backward_CuDNNBatchNorm)
+.set_attr<FCompute>("FCompute<gpu>", BatchNormGradCompute_CuDNNv4);
+
 #endif  // CUDNN_MAJOR == 4
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h
index 8ffe97d94310..229ba3cb1a8e 100644
--- a/src/operator/nn/cudnn/cudnn_convolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h
@@ -42,9 +42,19 @@ namespace op {
  * \brief The Operator used to perform convolution using cuDNN kernels.
  */
 template<typename DType>
-class CuDNNConvolutionOp : public Operator {
+class CuDNNConvolutionOp {
  public:
-  explicit CuDNNConvolutionOp(const ConvolutionParam& param,
+  CuDNNConvolutionOp() {
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_));
+    CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_));
+  }
+
+  void Init(const ConvolutionParam& param,
                               int forward_compute_type,
                               int backward_compute_type,
                               const std::vector<TShape>& in_shape,
@@ -57,8 +67,6 @@ class CuDNNConvolutionOp : public Operator {
     auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type);
     // convert MB to words
     param_.workspace = (param_.workspace << 20) / sizeof(DType);
-    init_cudnn_ = false;
-    init_temp_size_ = false;
     dtype_ = DataType<DType>::kCudnnFlag;
     // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy.
     cudnn_tensor_core_ = DataType<DType>::kFlag == kFloat16 && GetEnvAllowTensorCore();
@@ -102,22 +110,19 @@ class CuDNNConvolutionOp : public Operator {
   }
 
   ~CuDNNConvolutionOp() {
-    if (init_cudnn_) {
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_));
-      CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_));
-      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_));
-      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_));
-      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_));
-    }
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_));
+    CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_));
+    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_));
+    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_));
+    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_));
   }
 
-  virtual void Forward(const OpContext &ctx,
+  void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
+                       const std::vector<TBlob> &out_data) {
     using namespace mshadow;
     size_t expected = param_.no_bias ? 2 : 3;
     CHECK_EQ(in_data.size(), expected);
@@ -174,18 +179,17 @@ class CuDNNConvolutionOp : public Operator {
     }
   }
 
-  virtual void Backward(const OpContext &ctx,
+  void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
                         const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
+                        const std::vector<TBlob> &in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
     size_t expected = param_.no_bias == 0 ? 3 : 2;
     CHECK_EQ(out_grad.size(), 1U);
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(in_data.size(), expected);
+    CHECK_EQ(in_grad.size(), expected);
     Stream<gpu> *s = ctx.get_stream<gpu>();
 
     // I/O's should have 2 more dims than the kernel dim
@@ -195,6 +199,7 @@ class CuDNNConvolutionOp : public Operator {
     DType *data_ptr = GetNdPtr(in_data[conv::kData], param_.kernel.ndim() + 2, s);
     DType *gdata_ptr = GetNdPtr(in_grad[conv::kData], param_.kernel.ndim() + 2, s);
 
+    GetTempSize(ctx);
     Tensor<gpu, 1, DType> workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_);
     size_t workspace_size = TensorSizeBytes(workspace);
     for (uint32_t g = 0; g < param_.num_group; ++g) {
@@ -333,13 +338,6 @@ class CuDNNConvolutionOp : public Operator {
     size_t expected = param_.no_bias ? 2 : 3;
     CHECK_EQ(in_shape.size(), expected);
     CHECK_EQ(out_shape.size(), 1U);
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_));
-    CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_));
 
     TShape dshape = in_shape[conv::kData];
     TShape wshape = in_shape[conv::kWeight];
@@ -512,7 +510,6 @@ class CuDNNConvolutionOp : public Operator {
                                           &bias_shape[0],
                                           &bias_stride[0]));
     }
-    init_cudnn_ = true;
   }
 
   void SelectAlgo(const Context& ctx,
@@ -756,7 +753,6 @@ class CuDNNConvolutionOp : public Operator {
   }
 
   void GetTempSize(const OpContext& ctx) {
-    if (init_temp_size_) return;
     mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
     size_t back_size = 0, back_size_w = 0;
     CUDNN_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_,
@@ -781,8 +777,6 @@ class CuDNNConvolutionOp : public Operator {
                out_desc_,
                forward_algo_.AlgoNumber(),
                &forward_workspace_byte_));
-
-    init_temp_size_ = true;
   }
 
   int *CastTShapeToIntPtr(const TShape& s, std::vector<int> *buffer) {
@@ -847,8 +841,6 @@ class CuDNNConvolutionOp : public Operator {
   std::vector<int> param_dilate_;
   std::vector<int> param_pad_;
 
-  bool init_cudnn_;
-  bool init_temp_size_;
   // Temp workspace size in bytes needed for Forward() operation.
   size_t forward_workspace_byte_;
   // Temp workspace size in bytes needed for Backward() operation.
diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
index bc02d1b73f45..3c80cdcba4c2 100644
--- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
@@ -39,9 +39,19 @@ namespace op {
 #if MXNET_USE_CUDNN == 1
 
 template<typename DType>
-class CuDNNDeconvolutionOp : public Operator {
+class CuDNNDeconvolutionOp {
  public:
-  explicit CuDNNDeconvolutionOp(DeconvolutionParam param,
+  CuDNNDeconvolutionOp() {
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_));
+    CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_));
+  }
+
+  void Init(DeconvolutionParam param,
                                 int forward_compute_type,
                                 int backward_compute_type,
                                 const std::vector<TShape>& in_shape,
@@ -54,8 +64,6 @@ class CuDNNDeconvolutionOp : public Operator {
     auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type);
     // convert MB to words
     param_.workspace = (param_.workspace << 20) / sizeof(DType);
-    init_cudnn_ = false;
-    init_temp_size_ = false;
     dtype_ = mshadow::DataType<DType>::kCudnnFlag;
     // TensorCore algos only allowed on fp16-I/O deconvolutions if permitted by the global policy.
     cudnn_tensor_core_ = DataType<DType>::kFlag == kFloat16 && GetEnvAllowTensorCore();
@@ -99,22 +107,19 @@ class CuDNNDeconvolutionOp : public Operator {
   }
 
   ~CuDNNDeconvolutionOp() {
-    if (init_cudnn_) {
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_));
-      CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_));
-      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_));
-      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_));
-      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_));
-    }
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_));
+    CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_));
+    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_));
+    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_));
+    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_));
   }
 
-  virtual void Forward(const OpContext &ctx,
+  void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
+                       const std::vector<TBlob> &out_data) {
     using namespace mshadow;
     size_t expected = param_.no_bias ? 2 : 3;
     CHECK_EQ(in_data.size(), expected);
@@ -187,18 +192,17 @@ class CuDNNDeconvolutionOp : public Operator {
     }
   }
 
-  virtual void Backward(const OpContext &ctx,
+  void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
                         const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
+                        const std::vector<TBlob> &in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
     size_t expected = param_.no_bias == 0 ? 3 : 2;
     CHECK_EQ(out_grad.size(), 1U);
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(in_data.size(), param_.no_bias ? 2U : 3U);
+    CHECK_EQ(in_grad.size(), expected);
     Stream<gpu> *s = ctx.get_stream<gpu>();
 
     // I/O's should have 2 more dims than the kernel dim
@@ -213,6 +217,7 @@ class CuDNNDeconvolutionOp : public Operator {
       CHECK_NE(req[deconv::kBias], kWriteInplace);
     }
     CHECK_NE(req[deconv::kData], kWriteInplace);
+    GetTempSize(ctx);
     Tensor<gpu, 1, DType> workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_);
     size_t workspace_size = TensorSizeBytes(workspace);
     for (uint32_t g = 0; g < param_.num_group; ++g) {
@@ -348,13 +353,6 @@ class CuDNNDeconvolutionOp : public Operator {
     size_t expected = param_.no_bias ? 2 : 3;
     CHECK_EQ(in_shape.size(), expected);
     CHECK_EQ(out_shape.size(), 1U);
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_));
-    CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_));
 
     TShape dshape = in_shape[deconv::kData];
     TShape wshape = in_shape[deconv::kWeight];
@@ -536,7 +534,6 @@ class CuDNNDeconvolutionOp : public Operator {
                                             &bias_shape[0],
                                             &bias_stride[0]));
     }
-    init_cudnn_ = true;
   }
 
   void SelectAlgo(const Context& ctx,
@@ -789,7 +786,6 @@ class CuDNNDeconvolutionOp : public Operator {
   }
 
   void GetTempSize(const OpContext& ctx) {
-    if (init_temp_size_) return;
     mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
     size_t back_data_algo_workspace_size = 0;
     size_t back_filter_algo_workspace_size = 0;
@@ -819,7 +815,6 @@ class CuDNNDeconvolutionOp : public Operator {
     forward_workspace_byte_ = back_data_algo_workspace_size;
     backward_workspace_byte_ = std::max(forward_algo_workspace_size,
                                         back_filter_algo_workspace_size);
-    init_temp_size_ = true;
   }
 
   int *CastTShapeToIntPtr(const TShape& s, std::vector<int> *buffer) {
@@ -882,8 +877,11 @@ class CuDNNDeconvolutionOp : public Operator {
   std::vector<int> param_stride_;
   std::vector<int> param_dilate_;
 
-  bool init_cudnn_;
-  bool init_temp_size_;
+  int forward_compute_type_;
+  int backward_compute_type_;
+  const std::vector<TShape> in_shapes_;
+  const std::vector<TShape> out_shapes_;
+
   // Temp workspace size in bytes needed for Forward() operation.  Note that
   // in deconvolution, this is handled by the cuDNN backprop-to-data kernel.
   size_t forward_workspace_byte_;
diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h
index 104ed8546dca..8442b37058d4 100644
--- a/src/operator/nn/cudnn/cudnn_pooling-inl.h
+++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h
@@ -34,13 +34,18 @@ namespace mxnet {
 namespace op {
 
 template<typename DType>
-class CuDNNPoolingOp : public Operator {
+class CuDNNPoolingOp {
  public:
-  explicit CuDNNPoolingOp(PoolingParam p) {
-    param_ = p;
-    init_cudnn_ = false;
+  CuDNNPoolingOp() {
     // TODO(xxx): fp16
     dtype_ = mshadow::DataType<DType>::kCudnnFlag;
+    CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
+  }
+
+  void Init(const PoolingParam &p) {
+    param_ = p;
     switch (param_.pool_type) {
       case pool_enum::kMaxPooling:
         mode_ = CUDNN_POOLING_MAX;
@@ -54,33 +59,24 @@ class CuDNNPoolingOp : public Operator {
   }
 
   ~CuDNNPoolingOp() {
-    if (init_cudnn_) {
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
-      CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_));
-    }
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
+    CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_));
   }
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
+  void Forward(const OpContext &ctx, const TBlob &in_data,
+      const OpReqType &req, const TBlob &out_data) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
     typename DataType<DType>::ScaleType alpha = 1.0f;
     typename DataType<DType>::ScaleType beta = 0.0f;
+    this->Init(s, in_data, out_data);
     if (param_.kernel.ndim() == 2) {
       // 2d pool
-      Tensor<gpu, 4, DType> data = in_data[pool_enum::kData].get<gpu, 4, DType>(s);
-      Tensor<gpu, 4, DType> out = out_data[pool_enum::kOut].get<gpu, 4, DType>(s);
-      if (!init_cudnn_) {
-        this->Init(s, in_data, out_data);
-      }
+      Tensor<gpu, 4, DType> data = in_data.get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> out = out_data.get<gpu, 4, DType>(s);
       CHECK_EQ(data.CheckContiguous(), true);
       CHECK_EQ(out.CheckContiguous(), true);
       CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_,
@@ -93,11 +89,8 @@ class CuDNNPoolingOp : public Operator {
                                      out.dptr_));
     } else if (param_.kernel.ndim() == 3) {
       // 3d pool
-      Tensor<gpu, 5, DType> data = in_data[pool_enum::kData].get<gpu, 5, DType>(s);
-      Tensor<gpu, 5, DType> out = out_data[pool_enum::kOut].get<gpu, 5, DType>(s);
-      if (!init_cudnn_) {
-        this->Init(s, in_data, out_data);
-      }
+      Tensor<gpu, 5, DType> data = in_data.get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> out = out_data.get<gpu, 5, DType>(s);
       CHECK_EQ(data.CheckContiguous(), true);
       CHECK_EQ(out.CheckContiguous(), true);
       CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_,
@@ -113,31 +106,23 @@ class CuDNNPoolingOp : public Operator {
     }
   }
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
+  void Backward(const OpContext &ctx, const TBlob &out_grad,
+      const TBlob &in_data, const TBlob &out_data,
+      const OpReqType &req, const TBlob &in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    CHECK_EQ(req.size(), 1U);
-    CHECK_EQ(in_grad.size(), 1U);
 
     Stream<gpu> *s = ctx.get_stream<gpu>();
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
     typename DataType<DType>::ScaleType alpha = 1.0f;
     typename DataType<DType>::ScaleType beta = 0.0f;
+    this->Init(s, in_data, out_data);
     if (param_.kernel.ndim() == 2) {
       // 2d pool
-      Tensor<gpu, 4, DType> m_out_grad = out_grad[pool_enum::kOut].get<gpu, 4, DType>(s);
-      Tensor<gpu, 4, DType> m_in_data = in_data[pool_enum::kData].get<gpu, 4, DType>(s);
-      Tensor<gpu, 4, DType> m_out_data = out_data[pool_enum::kOut].get<gpu, 4, DType>(s);
-      Tensor<gpu, 4, DType> m_in_grad = in_grad[pool_enum::kData].get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> m_out_grad = out_grad.get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> m_in_data = in_data.get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> m_out_data = out_data.get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> m_in_grad = in_grad.get<gpu, 4, DType>(s);
       CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_,
                                       pooling_desc_,
                                       &alpha,
@@ -152,10 +137,10 @@ class CuDNNPoolingOp : public Operator {
                                       m_in_grad.dptr_));
     } else if (param_.kernel.ndim() == 3) {
       // 3d pool
-      Tensor<gpu, 5, DType> m_out_grad = out_grad[pool_enum::kOut].get<gpu, 5, DType>(s);
-      Tensor<gpu, 5, DType> m_in_data = in_data[pool_enum::kData].get<gpu, 5, DType>(s);
-      Tensor<gpu, 5, DType> m_out_data = out_data[pool_enum::kOut].get<gpu, 5, DType>(s);
-      Tensor<gpu, 5, DType> m_in_grad = in_grad[pool_enum::kData].get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> m_out_grad = out_grad.get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> m_in_data = in_data.get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> m_out_data = out_data.get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> m_in_grad = in_grad.get<gpu, 5, DType>(s);
       CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_,
                                       pooling_desc_,
                                       &alpha,
@@ -174,129 +159,115 @@ class CuDNNPoolingOp : public Operator {
   }
 
  private:
-  inline void Init(mshadow::Stream<gpu> *s,
-                   const std::vector<TBlob> &in_data,
-                   const std::vector<TBlob> &out_data) {
+  inline void Init(mshadow::Stream<gpu> *s, const TBlob &in_data,
+      const TBlob &out_data) {
     using namespace mshadow;
     #if CUDNN_MAJOR >= 5
     nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
     #endif
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    if (!init_cudnn_) {
-      init_cudnn_ = true;
-      if (param_.kernel.ndim() == 2) {
-        // 2d conv
-        Tensor<gpu, 4, DType> data = in_data[pool_enum::kData].get<gpu, 4, DType>(s);
-        Tensor<gpu, 4, DType> out = out_data[pool_enum::kOut].get<gpu, 4, DType>(s);
-        mshadow::Shape<4> dshape = data.shape_;
-        CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_));
-        CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
-        CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
-        CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_,
-                                              CUDNN_TENSOR_NCHW,
-                                              dtype_,
-                                              data.shape_[0],
-                                              data.shape_[1],
-                                              data.shape_[2],
-                                              data.shape_[3]));
-        CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_,
-                                              CUDNN_TENSOR_NCHW,
-                                              dtype_,
-                                              out.shape_[0],
-                                              out.shape_[1],
-                                              out.shape_[2],
-                                              out.shape_[3]));
-        #if CUDNN_MAJOR >= 5
-        CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_,
-                                               mode_,
-                                               nan_prop_,
-                                               param_.global_pool ? dshape[2] : param_.kernel[0],
-                                               param_.global_pool ? dshape[3] : param_.kernel[1],
-                                               param_.pad[0],
-                                               param_.pad[1],
-                                               param_.global_pool ? 1 : param_.stride[0],
-                                               param_.global_pool ? 1 :param_.stride[1]));
-        #else
-        CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_,
-                                               mode_,
-                                               param_.global_pool ? dshape[2] : param_.kernel[0],
-                                               param_.global_pool ? dshape[3] : param_.kernel[1],
-                                               param_.pad[0],
-                                               param_.pad[1],
-                                               param_.global_pool ? 1 : param_.stride[0],
-                                               param_.global_pool ? 1 : param_.stride[1]));
-        #endif
-      } else {
-        Tensor<gpu, 5, DType> data = in_data[pool_enum::kData].get<gpu, 5, DType>(s);
-        Tensor<gpu, 5, DType> out = out_data[pool_enum::kOut].get<gpu, 5, DType>(s);
-        CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_));
-        CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
-        CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
-        std::vector<int> ishape = {static_cast<int>(data.shape_[0]),
-                                   static_cast<int>(data.shape_[1]),
-                                   static_cast<int>(data.shape_[2]),
-                                   static_cast<int>(data.shape_[3]),
-                                   static_cast<int>(data.shape_[4])};
+    if (param_.kernel.ndim() == 2) {
+      // 2d conv
+      Tensor<gpu, 4, DType> data = in_data.get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> out = out_data.get<gpu, 4, DType>(s);
+      mshadow::Shape<4> dshape = data.shape_;
+      CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_,
+                                            CUDNN_TENSOR_NCHW,
+                                            dtype_,
+                                            data.shape_[0],
+                                            data.shape_[1],
+                                            data.shape_[2],
+                                            data.shape_[3]));
+      CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_,
+                                            CUDNN_TENSOR_NCHW,
+                                            dtype_,
+                                            out.shape_[0],
+                                            out.shape_[1],
+                                            out.shape_[2],
+                                            out.shape_[3]));
+      #if CUDNN_MAJOR >= 5
+      CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_,
+                                             mode_,
+                                             nan_prop_,
+                                             param_.global_pool ? dshape[2] : param_.kernel[0],
+                                             param_.global_pool ? dshape[3] : param_.kernel[1],
+                                             param_.pad[0],
+                                             param_.pad[1],
+                                             param_.global_pool ? 1 : param_.stride[0],
+                                             param_.global_pool ? 1 :param_.stride[1]));
+      #else
+      CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_,
+                                             mode_,
+                                             param_.global_pool ? dshape[2] : param_.kernel[0],
+                                             param_.global_pool ? dshape[3] : param_.kernel[1],
+                                             param_.pad[0],
+                                             param_.pad[1],
+                                             param_.global_pool ? 1 : param_.stride[0],
+                                             param_.global_pool ? 1 : param_.stride[1]));
+      #endif
+    } else {
+      Tensor<gpu, 5, DType> data = in_data.get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> out = out_data.get<gpu, 5, DType>(s);
+      std::vector<int> ishape = {static_cast<int>(data.shape_[0]),
+                                 static_cast<int>(data.shape_[1]),
+                                 static_cast<int>(data.shape_[2]),
+                                 static_cast<int>(data.shape_[3]),
+                                 static_cast<int>(data.shape_[4])};
 
-        std::vector<int> istride = {static_cast<int>(ishape[1] * ishape[2] * ishape[3] * ishape[4]),
-                                    static_cast<int>(ishape[2] * ishape[3] * ishape[4]),
-                                    static_cast<int>(ishape[3] * ishape[4]),
-                                    static_cast<int>(ishape[4]),
-                                    1};
+      std::vector<int> istride = {static_cast<int>(ishape[1] * ishape[2] * ishape[3] * ishape[4]),
+                                  static_cast<int>(ishape[2] * ishape[3] * ishape[4]),
+                                  static_cast<int>(ishape[3] * ishape[4]),
+                                  static_cast<int>(ishape[4]), 1};
 
-        std::vector<int> oshape = {static_cast<int>(out.shape_[0]),
-                                   static_cast<int>(out.shape_[1]),
-                                   static_cast<int>(out.shape_[2]),
-                                   static_cast<int>(out.shape_[3]),
-                                   static_cast<int>(out.shape_[4])};
+      std::vector<int> oshape = {static_cast<int>(out.shape_[0]),
+                                 static_cast<int>(out.shape_[1]),
+                                 static_cast<int>(out.shape_[2]),
+                                 static_cast<int>(out.shape_[3]),
+                                 static_cast<int>(out.shape_[4])};
 
-        std::vector<int> ostride = {static_cast<int>(oshape[1] * oshape[2] * oshape[3] * oshape[4]),
-                                    static_cast<int>(oshape[2] * oshape[3] * oshape[4]),
-                                    static_cast<int>(oshape[3] * oshape[4]),
-                                    static_cast<int>(oshape[4]),
-                                    1};
+      std::vector<int> ostride = {static_cast<int>(oshape[1] * oshape[2] * oshape[3] * oshape[4]),
+                                  static_cast<int>(oshape[2] * oshape[3] * oshape[4]),
+                                  static_cast<int>(oshape[3] * oshape[4]),
+                                  static_cast<int>(oshape[4]), 1};
 
-        std::vector<int> kernel_vec = {param_.global_pool ? ishape[2] :
-                                                            static_cast<int>(param_.kernel[0]),
-                                       param_.global_pool ? ishape[3] :
-                                                            static_cast<int>(param_.kernel[1]),
-                                       param_.global_pool ? ishape[4] :
-                                                            static_cast<int>(param_.kernel[2])};
+      std::vector<int> kernel_vec = {param_.global_pool ? ishape[2] :
+                                                          static_cast<int>(param_.kernel[0]),
+                                     param_.global_pool ? ishape[3] :
+                                                          static_cast<int>(param_.kernel[1]),
+                                     param_.global_pool ? ishape[4] :
+                                                          static_cast<int>(param_.kernel[2])};
 
-        std::vector<int> pad_vec = {param_.global_pool ? 0 : static_cast<int>(param_.pad[0]),
-                                    param_.global_pool ? 0 : static_cast<int>(param_.pad[1]),
-                                    param_.global_pool ? 0 : static_cast<int>(param_.pad[2])};
+      std::vector<int> pad_vec = {param_.global_pool ? 0 : static_cast<int>(param_.pad[0]),
+                                  param_.global_pool ? 0 : static_cast<int>(param_.pad[1]),
+                                  param_.global_pool ? 0 : static_cast<int>(param_.pad[2])};
 
-        std::vector<int> stride_vec = {param_.global_pool ? 1 : static_cast<int>(param_.stride[0]),
-                                       param_.global_pool ? 1 : static_cast<int>(param_.stride[1]),
-                                       param_.global_pool ? 1 : static_cast<int>(param_.stride[2])};
+      std::vector<int> stride_vec = {param_.global_pool ? 1 : static_cast<int>(param_.stride[0]),
+                                     param_.global_pool ? 1 : static_cast<int>(param_.stride[1]),
+                                     param_.global_pool ? 1 : static_cast<int>(param_.stride[2])};
 
-        CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_,
-                                              dtype_,
-                                              static_cast<int>(ishape.size()),
-                                              &ishape[0],
-                                              &istride[0]));
-        CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_,
-                                              dtype_,
-                                              static_cast<int>(oshape.size()),
-                                              &oshape[0],
-                                              &ostride[0]));
-        #if CUDNN_MAJOR >= 5
-        CUDNN_CALL(cudnnSetPoolingNdDescriptor(pooling_desc_,
-                                               mode_,
-                                               nan_prop_,
-                                               static_cast<int>(kernel_vec.size()),
-                                               &(kernel_vec[0]),
-                                               &(pad_vec[0]),
-                                               &(stride_vec[0])));
-        #else
-        LOG(FATAL) << "3D pooling only support CUDNN v5 and abouve";
-        #endif
-      }
+      CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_,
+                                            dtype_,
+                                            static_cast<int>(ishape.size()),
+                                            &ishape[0],
+                                            &istride[0]));
+      CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_,
+                                            dtype_,
+                                            static_cast<int>(oshape.size()),
+                                            &oshape[0],
+                                            &ostride[0]));
+      #if CUDNN_MAJOR >= 5
+      CUDNN_CALL(cudnnSetPoolingNdDescriptor(pooling_desc_,
+                                             mode_,
+                                             nan_prop_,
+                                             static_cast<int>(kernel_vec.size()),
+                                             &(kernel_vec[0]),
+                                             &(pad_vec[0]),
+                                             &(stride_vec[0])));
+      #else
+      LOG(FATAL) << "3D pooling only support CUDNN v5 and abouve";
+      #endif
     }
   }
-  bool init_cudnn_;
+
   cudnnDataType_t dtype_;
   cudnnHandle_t handle_;
   cudnnPoolingMode_t mode_;
diff --git a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h
index 5afdb4844364..239da023668d 100644
--- a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h
+++ b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h
@@ -32,73 +32,64 @@
 
 namespace mxnet {
 namespace op {
-class CuDNNSoftmaxActivationOp : public Operator {
+class CuDNNSoftmaxActivationOp {
  public:
-  explicit CuDNNSoftmaxActivationOp(SoftmaxActivationParam param) {
-    this->param_ = param;
-    init_cudnn_ = false;
+  CuDNNSoftmaxActivationOp() {
     dtype_ = CUDNN_DATA_FLOAT;
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_));
+  }
+
+  void Init(SoftmaxActivationParam param) {
+    this->param_ = param;
   }
 
   ~CuDNNSoftmaxActivationOp() {
-    if (init_cudnn_) {
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_));
-    }
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_));
   }
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
+  void Forward(const OpContext &ctx, const TBlob &in_data,
+      const OpReqType &req, const TBlob &out_data) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     Tensor<gpu, 4> data;
     Tensor<gpu, 4> out;
     cudnnSoftmaxMode_t softmax_mode;
     if (param_.mode == softmax_activation::kInstance) {
-      CHECK_EQ(in_data[softmax_activation::kData].ndim(), 2)
+      CHECK_EQ(in_data.ndim(), 2)
         << "Input need to have 2 dimensions when mode=instance.";
-      Shape<4> dshape = Shape4(in_data[softmax_activation::kData].shape_[0],
-                               in_data[softmax_activation::kData].shape_[1], 1, 1);
-      data = in_data[softmax_activation::kData].get_with_shape<gpu, 4, real_t>(dshape, s);
-      out = out_data[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
+      Shape<4> dshape = Shape4(in_data.shape_[0], in_data.shape_[1], 1, 1);
+      data = in_data.get_with_shape<gpu, 4, real_t>(dshape, s);
+      out = out_data.get_with_shape<gpu, 4, real_t>(dshape, s);
       softmax_mode = CUDNN_SOFTMAX_MODE_INSTANCE;
     } else {
-      CHECK_GE(in_data[softmax_activation::kData].ndim(), 3)
+      CHECK_GE(in_data.ndim(), 3)
         << "Input need to have a least 3 dimensions when mode=channel";
       Shape<4> dshape;
-      index_t size_left = in_data[softmax_activation::kData].Size();
+      index_t size_left = in_data.Size();
       for (int i = 0; i < 3; ++i) {
-        if (i < in_data[softmax_activation::kData].ndim()) {
-          dshape[i] = in_data[softmax_activation::kData].shape_[i];
+        if (i < in_data.ndim()) {
+          dshape[i] = in_data.shape_[i];
         } else {
           dshape[i] = 1;
         }
         size_left /= dshape[i];
       }
       dshape[3] = size_left;
-      data = in_data[softmax_activation::kData].get_with_shape<gpu, 4, real_t>(dshape, s);
-      out = out_data[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
+      data = in_data.get_with_shape<gpu, 4, real_t>(dshape, s);
+      out = out_data.get_with_shape<gpu, 4, real_t>(dshape, s);
       softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL;
     }
     float alpha = 1.0f;
     float beta = 0.0f;
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
-    if (!init_cudnn_) {
-      init_cudnn_ = true;
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_));
-      CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
-                                            CUDNN_TENSOR_NCHW,
-                                            dtype_,
-                                            data.shape_[0],
-                                            data.shape_[1],
-                                            data.shape_[2],
-                                            data.shape_[3]));
-    }
+    CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
+                                          CUDNN_TENSOR_NCHW,
+                                          dtype_,
+                                          data.shape_[0],
+                                          data.shape_[1],
+                                          data.shape_[2],
+                                          data.shape_[3]));
     CUDNN_CALL(cudnnSoftmaxForward(s->dnn_handle_,
                                    CUDNN_SOFTMAX_ACCURATE,
                                    softmax_mode,
@@ -110,19 +101,10 @@ class CuDNNSoftmaxActivationOp : public Operator {
                                    out.dptr_));
   }
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
+  void Backward(const OpContext &ctx, const TBlob &out_grad,
+      const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    CHECK_EQ(req.size(), 1U);
-    CHECK_EQ(in_grad.size(), 1U);
     float alpha = 1.0f;
     float beta = 0.0f;
     Stream<gpu> *s = ctx.get_stream<gpu>();
@@ -132,31 +114,30 @@ class CuDNNSoftmaxActivationOp : public Operator {
     Tensor<gpu, 4> input_grad;
     cudnnSoftmaxMode_t softmax_mode;
     if (param_.mode == softmax_activation::kInstance) {
-      CHECK_EQ(in_grad[softmax_activation::kData].ndim(), 2)
+      CHECK_EQ(in_grad.ndim(), 2)
         << "Input need to have 2 dimensions when mode=instance.";
-      Shape<4> dshape = Shape4(in_grad[softmax_activation::kData].shape_[0],
-                               in_grad[softmax_activation::kData].shape_[1], 1, 1);
-      grad = out_grad[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
-      output_data = out_data[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
-      input_grad = in_grad[softmax_activation::kData].get_with_shape<gpu, 4, real_t>(dshape, s);
+      Shape<4> dshape = Shape4(in_grad.shape_[0], in_grad.shape_[1], 1, 1);
+      grad = out_grad.get_with_shape<gpu, 4, real_t>(dshape, s);
+      output_data = out_data.get_with_shape<gpu, 4, real_t>(dshape, s);
+      input_grad = in_grad.get_with_shape<gpu, 4, real_t>(dshape, s);
       softmax_mode = CUDNN_SOFTMAX_MODE_INSTANCE;
     } else {
-      CHECK_GE(in_grad[softmax_activation::kData].ndim(), 3)
+      CHECK_GE(in_grad.ndim(), 3)
         << "Input need to have a least 3 dimensions when mode=channel";
       Shape<4> dshape;
-      index_t size_left = in_grad[softmax_activation::kData].Size();
+      index_t size_left = in_grad.Size();
       for (int i = 0; i < 3; ++i) {
-        if (i < in_grad[softmax_activation::kData].ndim()) {
-          dshape[i] = in_grad[softmax_activation::kData].shape_[i];
+        if (i < in_grad.ndim()) {
+          dshape[i] = in_grad.shape_[i];
         } else {
           dshape[i] = 1;
         }
         size_left /= dshape[i];
       }
       dshape[3] = size_left;
-      output_data = out_data[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
-      grad = out_grad[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
-      input_grad = in_grad[softmax_activation::kData].get_with_shape<gpu, 4, real_t>(dshape, s);
+      output_data = out_data.get_with_shape<gpu, 4, real_t>(dshape, s);
+      grad = out_grad.get_with_shape<gpu, 4, real_t>(dshape, s);
+      input_grad = in_grad.get_with_shape<gpu, 4, real_t>(dshape, s);
       softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL;
     }
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
@@ -174,7 +155,6 @@ class CuDNNSoftmaxActivationOp : public Operator {
   }
 
  private:
-  bool init_cudnn_;
   cudnnDataType_t dtype_;
   cudnnTensorDescriptor_t shape_desc_;
   SoftmaxActivationParam param_;
diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h
index fbdfaa84faab..b6d522b9e6f9 100644
--- a/src/operator/nn/deconvolution-inl.h
+++ b/src/operator/nn/deconvolution-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file deconvolution-inl.h
  * \brief
- * \author Wei Wu
+ * \author Wei Wu, Da Zheng
 */
 #ifndef MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_
 #define MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_
@@ -195,19 +195,18 @@ namespace mxnet {
 namespace op {
 
 template<typename xpu, typename DType>
-class DeconvolutionOp : public Operator {
+class DeconvolutionOp {
  public:
-  explicit DeconvolutionOp(DeconvolutionParam p) {
+  void Init(DeconvolutionParam p) {
     this->param_ = p;
     // convert MBytes first to Bytes and then to elements.
     param_.workspace = (param_.workspace << 20) / sizeof(real_t);
   }
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
+  void Forward(const OpContext &ctx,
+               const std::vector<TBlob> &in_data,
+               const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &out_data) {
     using namespace mshadow;
     using namespace mshadow::expr;
 
@@ -322,19 +321,18 @@ class DeconvolutionOp : public Operator {
     }
   }
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
+  void Backward(const OpContext &ctx,
+                const std::vector<TBlob> &out_grad,
+                const std::vector<TBlob> &in_data,
+                const std::vector<OpReqType> &req,
+                const std::vector<TBlob> &in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
     // TODO(bing): check the BLAS Handle, be careful
     CHECK_EQ(out_grad.size(), 1U);
     size_t expected = param_.no_bias == 0 ? 3 : 2;
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(in_data.size(), expected);
+    CHECK_EQ(in_grad.size(), expected);
     CHECK_EQ(req.size(), expected);
     CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true);
     // get data
@@ -489,300 +487,52 @@ class DeconvolutionOp : public Operator {
 };  // class DeconvolutionOp
 
 template<typename xpu>
-Operator* CreateOp(DeconvolutionParam param, int dtype,
-                   std::vector<TShape> *in_shape,
-                   std::vector<TShape> *out_shape,
-                   Context ctx);
-
-#if DMLC_USE_CXX11
-class DeconvolutionProp : public OperatorProperty {
- public:
-  std::vector<std::string> ListArguments() const override {
-    if (!param_.no_bias) {
-      return {"data", "weight", "bias"};
-    } else {
-      return {"data", "weight"};
-    }
-  }
-
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    using namespace mshadow;
-    param_.Init(kwargs);
-    if (param_.kernel.ndim() == 1) {
-      param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW;
-      if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
-      if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
-      if (param_.adj.ndim() == 0) param_.adj = Shape1(0);
-    } else if (param_.kernel.ndim() == 2) {
-      param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW;
-      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
-      if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
-      if (param_.adj.ndim() == 0) param_.adj = Shape2(0, 0);
-    } else {
-      CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D deconvolution not supported";
-      param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW;
-      if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
-      if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
-      if (param_.adj.ndim() == 0) param_.adj = Shape3(0, 0, 0);
-    }
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-#if MXNET_USE_CUDNN == 0
-    if (param_.kernel.ndim() > 2) {
-      LOG(FATAL) << "If not using CUDNN, only 1D or 2D Deconvolution is supported";
-      return false;
-    }
-#endif  // CUDNN
-
-    using namespace mshadow;
-    if (!param_.no_bias) {
-      CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
-    } else {
-      CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
-    }
-    out_shape->resize(1, TShape());
-    const TShape &dshape = (*in_shape)[deconv::kData];
-    if (dshape.ndim() ==  0) return false;
-
-    if (param_.kernel.ndim() == 1) {
-      // 1d conv
-      CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x";
-      Shape<3> dshape_ncw = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW);
-      Shape<3> wshape = Shape3(dshape_ncw[1], param_.num_filter / param_.num_group,
-                               param_.kernel[0]);
-      wshape = ConvertLayout(wshape, kNCW, param_.layout.value());
-      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
-      if (!param_.no_bias) {
-        SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
-      }
-
-      const index_t dilated_ksize_x = param_.DilatedKernelSize(0);
-
-      index_t o_pad[1];
-      index_t o_adj[1];
-      param_.InferPad(dshape_ncw, o_pad, o_adj);
-
-      CHECK_EQ(dshape_ncw[1] % param_.num_group, 0U) \
-        << "input num_filter must divide group size";
-      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
-        << "output num_filter must divide group size";
-      CHECK_GT(param_.kernel.Size(), 0U) \
-        << "incorrect kernel size: " << param_.kernel;
-      CHECK_GT(param_.stride.Size(), 0U) \
-        << "incorrect stride size: " << param_.stride;
-      CHECK_GT(param_.dilate.Size(), 0U) \
-        << "incorrect dilate size: " << param_.dilate;
-
-      CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(x) must be samller than stride[0]";
-
-      Shape<3> oshape;
-      oshape[0] = dshape_ncw[0];
-      oshape[1] = param_.num_filter;
-      oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) +
-        dilated_ksize_x - 2 * o_pad[0] + o_adj[0];
-
-      if (param_.target_shape[0] > 0) {
-        CHECK_EQ(param_.target_shape[0], oshape[2]) \
-          << "param_.target_shape[0] was not reasonable, please set it carefully";
-      }
-
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value()));
-
-      return true;
-    } else if (param_.kernel.ndim() == 2) {
-      // 2d conv
-      CHECK_EQ(dshape.ndim(), 4U) \
-        << "Input data should be 4D in batch-num_filter-y-x";
-      Shape<4> dshape_nchw = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW);
-      Shape<4> wshape = Shape4(dshape_nchw[1],
-                               param_.num_filter / param_.num_group,
-                               param_.kernel[0], param_.kernel[1]);
-      wshape = ConvertLayout(wshape, kNCHW, param_.layout.value());
-      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
-      if (!param_.no_bias) {
-        SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
-      }
-
-      const index_t dilated_ksize_y = param_.DilatedKernelSize(0);
-      const index_t dilated_ksize_x = param_.DilatedKernelSize(1);
-
-      index_t o_pad[2];
-      index_t o_adj[2];
-      param_.InferPad(dshape_nchw, o_pad, o_adj);
-
-      CHECK_EQ(dshape_nchw[1] % param_.num_group, 0U) \
-        << "input num_filter must divide group size";
-      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
-        << "output num_filter must divide group size";
-      CHECK_GT(param_.kernel.Size(), 0U) \
-        << "incorrect kernel size: " << param_.kernel;
-      CHECK_GT(param_.stride.Size(), 0U) \
-        << "incorrect stride size: " << param_.stride;
-      CHECK_GT(param_.dilate.Size(), 0U) \
-          << "incorrect dilate size: " << param_.dilate;
-
-      CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(y) must be samller than stride[0]";
-      CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(x) must be samller than stride[1]";
-
-      Shape<4> oshape;
-      oshape[0] = dshape_nchw[0];
-      oshape[1] = param_.num_filter;
-      oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) +
-        dilated_ksize_y - 2 * o_pad[0] + o_adj[0];
-      oshape[3] = param_.stride[1] * (dshape_nchw[3] - 1) +
-        dilated_ksize_x - 2 * o_pad[1] + o_adj[1];
-
-      if (param_.target_shape[0] > 0) {
-        CHECK_EQ(param_.target_shape[0], oshape[2]) \
-          << "param_.target_shape[0] was not reasonable, please set it carefully";
-      }
-      if (param_.target_shape[1] > 0) {
-        CHECK_EQ(param_.target_shape[1], oshape[3]) \
-          << "param_.target_shape[1] was not reasonable, please set it carefully";
-      }
-
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value()));
-
-      return true;
-    } else if (param_.kernel.ndim() == 3) {
-      // 3d conv
-      CHECK_EQ(dshape.ndim(), 5U) \
-        << "Input data should be 5D in batch-num_filter-depth-y-x";
-      Shape<5> dshape_ncdhw = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW);
-      Shape<5> wshape = Shape5(dshape_ncdhw[1], param_.num_filter / param_.num_group,
-                               param_.kernel[0], param_.kernel[1], param_.kernel[2]);
-      wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value());
-      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
-      if (!param_.no_bias) {
-        SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
-      }
-
-      // Note: 3D dilation currently not supported.
-      // Calculations below done to preserve symmetry with 1D/2D code.
-      const index_t dilated_ksize_d = param_.DilatedKernelSize(0);
-      const index_t dilated_ksize_y = param_.DilatedKernelSize(1);
-      const index_t dilated_ksize_x = param_.DilatedKernelSize(2);
-
-      index_t o_pad[3];
-      index_t o_adj[3];
-      param_.InferPad(dshape_ncdhw, o_pad, o_adj);
-
-      CHECK_EQ(dshape_ncdhw[1] % param_.num_group, 0U) \
-        << "input num_filter must divide group size";
-      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
-        << "output num_filter must divide group size";
-      CHECK_GT(param_.kernel.Size(), 0U) \
-        << "incorrect kernel size: " << param_.kernel;
-      CHECK_GT(param_.stride.Size(), 0U) \
-        << "incorrect stride size: " << param_.stride;
-      CHECK_GT(param_.dilate.Size(), 0U) \
-        << "incorrect dilate size: " << param_.dilate;
-      CHECK_EQ(param_.dilate.Size(), 1U)
-        << "Dilate is not supported in 3d deconvolution";
-
-      CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(d) must be samller than stride[0]";
-      CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(y) must be samller than stride[1]";
-      CHECK_GE(param_.stride[2]-1, o_adj[2]) << "adj(x) must be samller than stride[2]";
-
-      Shape<5> oshape;
-      oshape[0] = dshape_ncdhw[0];
-      oshape[1] = param_.num_filter;
-      oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) +
-        dilated_ksize_d - 2 * o_pad[0] + o_adj[0];
-      oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) +
-        dilated_ksize_y - 2 * o_pad[1] + o_adj[1];
-      oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) +
-        dilated_ksize_x - 2 * o_pad[2] + o_adj[2];
-
-      if (param_.target_shape[0] > 0) {
-        CHECK_EQ(param_.target_shape[0], oshape[2]) \
-          << "param_.target_shape[0] was not reasonable, please it carefully";
-      }
-      if (param_.target_shape[1] > 0) {
-        CHECK_EQ(param_.target_shape[1], oshape[3]) \
-          << "param_.target_shape[1] was not reasonable, please set it carefully";
-      }
-      if (param_.target_shape[2] > 0) {
-        CHECK_EQ(param_.target_shape[2], oshape[4]) \
-          << "param_.target_shape[2] was not reasonable, please set it carefully";
-      }
-
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value()));
-
-      return true;
-    } else {
-      LOG(FATAL) << "Unknown convolution type";
-      return false;
-    }
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_GE(in_type->size(), 1U);
-    int dtype = (*in_type)[0];
-    CHECK_NE(dtype, -1) << "First input must have specified type";
-    for (index_t i = 0; i < in_type->size(); ++i) {
-      if ((*in_type)[i] == -1) {
-        (*in_type)[i] = dtype;
-      } else {
-        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
-      }
-    }
-    out_type->clear();
-    out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new DeconvolutionProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "Deconvolution";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[deconv::kOut], in_data[deconv::kData], in_data[deconv::kWeight]};
-  }
+void _DeconvolutionCompute(const DeconvolutionParam& param,
+                           const OpContext& ctx, const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+  MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, {
+    DeconvolutionOp<xpu, DType> op;
+    op.Init(param);
+    op.Forward(ctx, inputs, req, outputs);
+  });
+}
 
-  std::vector<ResourceRequest> ForwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    return {ResourceRequest::kTempSpace};
-  }
+template<typename xpu>
+void DeconvolutionCompute(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx, const std::vector<TBlob>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<TBlob>& outputs) {
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  _DeconvolutionCompute<xpu>(param, ctx, inputs, req, outputs);
+}
 
-  std::vector<ResourceRequest> BackwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    return {ResourceRequest::kTempSpace};
-  }
+template<typename xpu>
+void _DeconvolutionGradCompute(const DeconvolutionParam& param,
+                               const OpContext& ctx, const std::vector<TBlob>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<TBlob>& outputs) {
+  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
+  const TBlob &out_grad = inputs[0];
+  const std::vector<TBlob> &in_grad = outputs;
+
+  MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, {
+    DeconvolutionOp<xpu, DType> op;
+    op.Init(param);
+    op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+  });
+}
 
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented";
-    return NULL;
-  }
 
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
+template<typename xpu>
+void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs,
+                              const OpContext& ctx, const std::vector<TBlob>& inputs,
+                              const std::vector<OpReqType>& req,
+                              const std::vector<TBlob>& outputs) {
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  _DeconvolutionGradCompute<xpu>(param, ctx, inputs, req, outputs);
+}
 
- private:
-  DeconvolutionParam param_;
-};  // class DeconvolutionProp
-#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_
diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc
index 9d3c040c1d63..a3fc915eb0fe 100644
--- a/src/operator/nn/deconvolution.cc
+++ b/src/operator/nn/deconvolution.cc
@@ -21,45 +21,408 @@
  * Copyright (c) 2015 by Contributors
  * \file deconvolution.cc
  * \brief
- * \author Wei Wu
+ * \author Wei Wu, Da Zheng
 */
 
 #include "./deconvolution-inl.h"
+#include "./mkldnn/mkldnn_ops-inl.h"
+#include "./mkldnn/mkldnn_base-inl.h"
 
 namespace mxnet {
 namespace op {
-template<>
-Operator* CreateOp<cpu>(DeconvolutionParam param, int dtype,
-                        std::vector<TShape> *in_shape,
-                        std::vector<TShape> *out_shape,
-                        Context ctx) {
-  Operator *op = NULL;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new DeconvolutionOp<cpu, DType>(param);
-  });
-  return op;
+
+static bool DeconvolutionShape(const nnvm::NodeAttrs& attrs,
+                               std::vector<TShape> *in_shape,
+                               std::vector<TShape> *out_shape) {
+  const DeconvolutionParam& param_ = nnvm::get<DeconvolutionParam>(attrs.parsed);
+#if MXNET_USE_CUDNN == 0
+  if (param_.kernel.ndim() > 2) {
+    LOG(FATAL) << "If not using CUDNN, only 1D or 2D Deconvolution is supported";
+    return false;
+  }
+#endif  // CUDNN
+
+  using namespace mshadow;
+  if (!param_.no_bias) {
+    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
+  } else {
+    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
+  }
+  out_shape->resize(1, TShape());
+  const TShape &dshape = (*in_shape)[deconv::kData];
+  if (dshape.ndim() ==  0) return false;
+
+  if (param_.kernel.ndim() == 1) {
+    // 1d conv
+    CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x";
+    Shape<3> dshape_ncw = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW);
+    Shape<3> wshape = Shape3(dshape_ncw[1], param_.num_filter / param_.num_group,
+        param_.kernel[0]);
+    wshape = ConvertLayout(wshape, kNCW, param_.layout.value());
+    SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
+    }
+
+    const index_t dilated_ksize_x = param_.DilatedKernelSize(0);
+
+    index_t o_pad[1];
+    index_t o_adj[1];
+    param_.InferPad(dshape_ncw, o_pad, o_adj);
+
+    CHECK_EQ(dshape_ncw[1] % param_.num_group, 0U) \
+      << "input num_filter must divide group size";
+    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+      << "output num_filter must divide group size";
+    CHECK_GT(param_.kernel.Size(), 0U) \
+      << "incorrect kernel size: " << param_.kernel;
+    CHECK_GT(param_.stride.Size(), 0U) \
+      << "incorrect stride size: " << param_.stride;
+    CHECK_GT(param_.dilate.Size(), 0U) \
+      << "incorrect dilate size: " << param_.dilate;
+
+    CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(x) must be samller than stride[0]";
+
+    Shape<3> oshape;
+    oshape[0] = dshape_ncw[0];
+    oshape[1] = param_.num_filter;
+    oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) +
+      dilated_ksize_x - 2 * o_pad[0] + o_adj[0];
+
+    if (param_.target_shape.ndim() > 0) {
+      if (param_.target_shape[0] > 0) {
+        CHECK_EQ(param_.target_shape[0], oshape[2]) \
+          << "param_.target_shape[0] was not reasonable, please set it carefully";
+      }
+    }
+
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value()));
+
+    return true;
+  } else if (param_.kernel.ndim() == 2) {
+    // 2d conv
+    CHECK_EQ(dshape.ndim(), 4U) \
+      << "Input data should be 4D in batch-num_filter-y-x";
+    Shape<4> dshape_nchw = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW);
+    Shape<4> wshape = Shape4(dshape_nchw[1],
+        param_.num_filter / param_.num_group,
+        param_.kernel[0], param_.kernel[1]);
+    wshape = ConvertLayout(wshape, kNCHW, param_.layout.value());
+    SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
+    }
+
+    const index_t dilated_ksize_y = param_.DilatedKernelSize(0);
+    const index_t dilated_ksize_x = param_.DilatedKernelSize(1);
+
+    index_t o_pad[2];
+    index_t o_adj[2];
+    param_.InferPad(dshape_nchw, o_pad, o_adj);
+
+    CHECK_EQ(dshape_nchw[1] % param_.num_group, 0U) \
+      << "input num_filter must divide group size";
+    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+      << "output num_filter must divide group size";
+    CHECK_GT(param_.kernel.Size(), 0U) \
+      << "incorrect kernel size: " << param_.kernel;
+    CHECK_GT(param_.stride.Size(), 0U) \
+      << "incorrect stride size: " << param_.stride;
+    CHECK_GT(param_.dilate.Size(), 0U) \
+      << "incorrect dilate size: " << param_.dilate;
+
+    CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(y) must be samller than stride[0]";
+    CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(x) must be samller than stride[1]";
+
+    Shape<4> oshape;
+    oshape[0] = dshape_nchw[0];
+    oshape[1] = param_.num_filter;
+    oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) +
+      dilated_ksize_y - 2 * o_pad[0] + o_adj[0];
+    oshape[3] = param_.stride[1] * (dshape_nchw[3] - 1) +
+      dilated_ksize_x - 2 * o_pad[1] + o_adj[1];
+
+    if (param_.target_shape.ndim() > 1) {
+      if (param_.target_shape[0] > 0) {
+        CHECK_EQ(param_.target_shape[0], oshape[2]) \
+          << "param_.target_shape[0] was not reasonable, please set it carefully";
+      }
+      if (param_.target_shape[1] > 0) {
+        CHECK_EQ(param_.target_shape[1], oshape[3]) \
+          << "param_.target_shape[1] was not reasonable, please set it carefully";
+      }
+    }
+
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value()));
+
+    return true;
+  } else if (param_.kernel.ndim() == 3) {
+    // 3d conv
+    CHECK_EQ(dshape.ndim(), 5U) \
+      << "Input data should be 5D in batch-num_filter-depth-y-x";
+    Shape<5> dshape_ncdhw = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW);
+    Shape<5> wshape = Shape5(dshape_ncdhw[1], param_.num_filter / param_.num_group,
+        param_.kernel[0], param_.kernel[1], param_.kernel[2]);
+    wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value());
+    SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
+    }
+
+    // Note: 3D dilation currently not supported.
+    // Calculations below done to preserve symmetry with 1D/2D code.
+    const index_t dilated_ksize_d = param_.DilatedKernelSize(0);
+    const index_t dilated_ksize_y = param_.DilatedKernelSize(1);
+    const index_t dilated_ksize_x = param_.DilatedKernelSize(2);
+
+    index_t o_pad[3];
+    index_t o_adj[3];
+    param_.InferPad(dshape_ncdhw, o_pad, o_adj);
+
+    CHECK_EQ(dshape_ncdhw[1] % param_.num_group, 0U) \
+      << "input num_filter must divide group size";
+    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+      << "output num_filter must divide group size";
+    CHECK_GT(param_.kernel.Size(), 0U) \
+      << "incorrect kernel size: " << param_.kernel;
+    CHECK_GT(param_.stride.Size(), 0U) \
+      << "incorrect stride size: " << param_.stride;
+    CHECK_GT(param_.dilate.Size(), 0U) \
+      << "incorrect dilate size: " << param_.dilate;
+    CHECK_EQ(param_.dilate.Size(), 1U)
+      << "Dilate is not supported in 3d deconvolution";
+
+    CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(d) must be samller than stride[0]";
+    CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(y) must be samller than stride[1]";
+    CHECK_GE(param_.stride[2]-1, o_adj[2]) << "adj(x) must be samller than stride[2]";
+
+    Shape<5> oshape;
+    oshape[0] = dshape_ncdhw[0];
+    oshape[1] = param_.num_filter;
+    oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) +
+      dilated_ksize_d - 2 * o_pad[0] + o_adj[0];
+    oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) +
+      dilated_ksize_y - 2 * o_pad[1] + o_adj[1];
+    oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) +
+      dilated_ksize_x - 2 * o_pad[2] + o_adj[2];
+
+    if (param_.target_shape.ndim() > 2) {
+      if (param_.target_shape[0] > 0) {
+        CHECK_EQ(param_.target_shape[0], oshape[2]) \
+          << "param_.target_shape[0] was not reasonable, please it carefully";
+      }
+      if (param_.target_shape[1] > 0) {
+        CHECK_EQ(param_.target_shape[1], oshape[3]) \
+          << "param_.target_shape[1] was not reasonable, please set it carefully";
+      }
+      if (param_.target_shape[2] > 0) {
+        CHECK_EQ(param_.target_shape[2], oshape[4]) \
+          << "param_.target_shape[2] was not reasonable, please set it carefully";
+      }
+    }
+
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value()));
+
+    return true;
+  } else {
+    LOG(FATAL) << "Unknown convolution type";
+    return false;
+  }
+}
+
+static inline std::vector<std::string> ListArguments(const DeconvolutionParam& param_) {
+  if (!param_.no_bias) {
+    return {"data", "weight", "bias"};
+  } else {
+    return {"data", "weight"};
+  }
+}
+
+static bool DeconvolutionType(const nnvm::NodeAttrs& attrs,
+                              std::vector<int> *in_type, std::vector<int> *out_type) {
+  const DeconvolutionParam& param_ = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  CHECK_GE(in_type->size(), 1U);
+  int dtype = (*in_type)[0];
+  CHECK_NE(dtype, -1) << "First input must have specified type";
+  for (index_t i = 0; i < in_type->size(); ++i) {
+    if ((*in_type)[i] == -1) {
+      (*in_type)[i] = dtype;
+    } else {
+      UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]);
+    }
+  }
+  out_type->clear();
+  out_type->push_back(dtype);
+  return true;
+}
+
+inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs,
+                                     const int dev_mask,
+                                     DispatchMode* dispatch_mode,
+                                     std::vector<int> *in_attrs,
+                                     std::vector<int> *out_attrs) {
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  uint32_t in_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(in_attrs->size(), in_expected);
+  CHECK_EQ(out_attrs->size(), 1);
+
+  DispatchMode wanted_mode;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask)
+    wanted_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    wanted_mode = DispatchMode::kFCompute;
+  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                             dispatch_mode, wanted_mode);
+}
+
+inline static bool BackwardDeconvStorageType(const nnvm::NodeAttrs& attrs,
+                                             const int dev_mask,
+                                             DispatchMode* dispatch_mode,
+                                             std::vector<int> *in_attrs,
+                                             std::vector<int> *out_attrs) {
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  uint32_t out_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(in_attrs->size(), param.no_bias ? 3U : 4U);
+  CHECK_EQ(out_attrs->size(), out_expected);
+
+  DispatchMode wanted_mode;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask)
+    wanted_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    wanted_mode = DispatchMode::kFCompute;
+  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                             dispatch_mode, wanted_mode);
+}
+
+#if MXNET_USE_MKLDNN == 1
+static void DeconvolutionComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                      const OpContext& ctx,
+                                      const std::vector<NDArray>& inputs,
+                                      const std::vector<OpReqType>& req,
+                                      const std::vector<NDArray>& outputs) {
+  if (SupportMKLDNNConv(inputs[0])) {
+    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    MKLDNNDeconvolutionForward(attrs, ctx, inputs, req, outputs);
+    MKLDNN_OPCHECK_RUN(DeconvolutionCompute<cpu>, attrs, ctx, inputs, req,
+                       outputs);
+    return;
+  }
+  FallBackCompute(DeconvolutionCompute<cpu>, attrs, ctx, inputs, req,
+                  outputs);
 }
 
-Operator* DeconvolutionProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                              std::vector<int> *in_type) const {
-  std::vector<TShape> out_shape, aux_shape;
-  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
-  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0), in_shape, &out_shape, ctx);
+static void DeconvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                          const OpContext& ctx,
+                                          const std::vector<NDArray>& inputs,
+                                          const std::vector<OpReqType>& req,
+                                          const std::vector<NDArray>& outputs) {
+  if (SupportMKLDNNConv(inputs[0])) {
+    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+    MKLDNNDeconvolutionBackward(attrs, ctx, inputs, req, outputs);
+    MKLDNN_OPCHECK_RUN(DeconvolutionGradCompute<cpu>, attrs, ctx, inputs, req,
+                       outputs);
+    return;
+  }
+  FallBackCompute(DeconvolutionGradCompute<cpu>, attrs, ctx, inputs, req,
+                  outputs);
 }
+#endif
+
+static void DeconvolutionParamParser(nnvm::NodeAttrs* attrs) {
+  using namespace mshadow;
+  DeconvolutionParam param_;
+  param_.Init(attrs->dict);
+  if (param_.kernel.ndim() == 1) {
+    param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW;
+    if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
+    if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1);
+    if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
+    if (param_.adj.ndim() == 0) param_.adj = Shape1(0);
+  } else if (param_.kernel.ndim() == 2) {
+    param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW;
+    if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
+    if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1);
+    if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
+    if (param_.adj.ndim() == 0) param_.adj = Shape2(0, 0);
+  } else {
+    CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D deconvolution not supported";
+    param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW;
+    if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
+    if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1);
+    if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
+    if (param_.adj.ndim() == 0) param_.adj = Shape3(0, 0, 0);
+  }
+  attrs->parsed = std::move(param_);
+}
+
+struct DeconvolutionGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
+    heads.push_back(n->inputs[deconv::kData]);
+    heads.push_back(n->inputs[deconv::kWeight]);
+    const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(n->attrs.parsed);
+    if (!param.no_bias)
+      heads.push_back(n->inputs[deconv::kBias]);
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
 
 DMLC_REGISTER_PARAMETER(DeconvolutionParam);
 
-MXNET_REGISTER_OP_PROPERTY(Deconvolution, DeconvolutionProp)
-.add_argument("data", "NDArray-or-Symbol", "Input tensor to the deconvolution operation.")
-.add_argument("weight", "NDArray-or-Symbol", "Weights representing the kernel.")
-.add_argument("bias", "NDArray-or-Symbol", "Bias added to the result after the deconvolution "
-    "operation.")
-.add_arguments(DeconvolutionParam::__FIELDS__())
+NNVM_REGISTER_OP(Deconvolution)
 .describe("Computes 1D or 2D transposed convolution (aka fractionally strided convolution) of the "
     "input tensor. This operation can be seen as the gradient of Convolution operation with "
     "respect to its input. Convolution usually reduces the size of the input. Transposed "
     "convolution works the other way, going from a smaller input to a larger output while "
-    "preserving the connectivity pattern.");
+    "preserving the connectivity pattern.")
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const DeconvolutionParam& params = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  return params.no_bias ? 2 : 3;
+})
+.set_num_outputs(1)
+.set_attr_parser(DeconvolutionParamParser)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  return ListArguments(nnvm::get<DeconvolutionParam>(attrs.parsed));
+})
+.set_attr<nnvm::FInferShape>("FInferShape", DeconvolutionShape)
+.set_attr<nnvm::FInferType>("FInferType", DeconvolutionType)
+.set_attr<FInferStorageType>("FInferStorageType", DeconvStorageType)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+.set_attr<FCompute>("FCompute<cpu>", DeconvolutionCompute<cpu>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", DeconvolutionComputeExCPU)
+#endif
+.set_attr<nnvm::FGradient>("FGradient", DeconvolutionGrad{"_backward_Deconvolution"})
+.add_argument("data", "NDArray-or-Symbol", "Input tensor to the deconvolution operation.")
+.add_argument("weight", "NDArray-or-Symbol", "Weights representing the kernel.")
+.add_argument("bias", "NDArray-or-Symbol", "Bias added to the result after the deconvolution "
+    "operation.")
+.add_arguments(DeconvolutionParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_backward_Deconvolution)
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const DeconvolutionParam& params = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  return params.no_bias ? 2 : 3;
+})
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FInferStorageType>("FInferStorageType", BackwardDeconvStorageType)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+.set_attr_parser(DeconvolutionParamParser)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", DeconvolutionGradComputeExCPU)
+#endif
+.set_attr<FCompute>("FCompute<cpu>", DeconvolutionGradCompute<cpu>);
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu
index 623770170d50..c7395428c2a0 100644
--- a/src/operator/nn/deconvolution.cu
+++ b/src/operator/nn/deconvolution.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file deconvolution.cu
  * \brief
- * \author Wei Wu
+ * \author Wei Wu, Da Zheng
 */
 
 #include "./deconvolution-inl.h"
@@ -31,13 +31,29 @@
 
 namespace mxnet {
 namespace op {
+
+#if MXNET_USE_CUDNN == 1
+template<typename DType>
+static CuDNNDeconvolutionOp<DType> &GetCuDNNDeconvOp(const DeconvolutionParam& param,
+                                                     int forward_compute_type,
+                                                     int backward_compute_type,
+                                                     const std::vector<TShape>& in_shape,
+                                                     const std::vector<TShape>& out_shape,
+                                                     const Context& ctx) {
+  static thread_local CuDNNDeconvolutionOp<DType> op;
+  op.Init(param, forward_compute_type, backward_compute_type, in_shape, out_shape, ctx);
+  return op;
+}
+#endif
+
 template<>
-Operator* CreateOp<gpu>(DeconvolutionParam param, int dtype,
-                        std::vector<TShape> *in_shape,
-                        std::vector<TShape> *out_shape,
-                        Context ctx) {
-  // Logic here parallels that in Convolution.cu
-  Operator *op = NULL;
+void DeconvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                               const OpContext& ctx,
+                               const std::vector<TBlob>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<TBlob>& outputs) {
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  int dtype = inputs[0].type_flag_;
 
 #if MXNET_USE_CUDNN == 1
   // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16).
@@ -45,23 +61,88 @@ Operator* CreateOp<gpu>(DeconvolutionParam param, int dtype,
 
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     if (param.cudnn_off) {
-      op = new DeconvolutionOp<gpu, DType>(param);
-    } else if (!CuDNNDeconvolutionOp<DType>::Supports(param, compute_type, compute_type, ctx)) {
+      DeconvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Forward(ctx, inputs, req, outputs);
+    } else if (!CuDNNDeconvolutionOp<DType>::Supports(param,
+          compute_type, compute_type, ctx.run_ctx.ctx)) {
       LOG(WARNING) <<
         "This deconvolution is not supported by cudnn, MXNET deconvolution is applied.";
-      op = new DeconvolutionOp<gpu, DType>(param);
+      DeconvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Forward(ctx, inputs, req, outputs);
     } else {
-      op = new CuDNNDeconvolutionOp<DType>(param, compute_type, compute_type,
-                                           *in_shape, *out_shape, ctx);
+      std::vector<TShape> in_shape(inputs.size());
+      std::vector<TShape> out_shape(1, outputs[0].shape_);
+      for (size_t i = 0; i < in_shape.size(); i++) {
+        in_shape[i] = inputs[i].shape_;
+      }
+      GetCuDNNDeconvOp<DType>(param, compute_type, compute_type,
+          in_shape, out_shape, ctx.run_ctx.ctx).Forward(ctx, inputs, req, outputs);
     }
   })
 #else
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new DeconvolutionOp<gpu, DType>(param);
+    DeconvolutionOp<gpu, DType> op;
+    op.Init(param);
+    op.Forward(ctx, inputs, req, outputs);
   })
 #endif  // MXNET_USE_CUDNN
-  return op;
 }
 
+template<>
+void DeconvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                                   const OpContext& ctx,
+                                   const std::vector<TBlob>& inputs,
+                                   const std::vector<OpReqType>& req,
+                                   const std::vector<TBlob>& outputs) {
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
+  const TBlob &out_grad = inputs[0];
+  const std::vector<TBlob> &in_grad = outputs;
+  int dtype = out_grad.type_flag_;
+
+#if MXNET_USE_CUDNN == 1
+  // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16).
+  int compute_type = (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype;
+
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    if (param.cudnn_off) {
+      DeconvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    } else if (!CuDNNDeconvolutionOp<DType>::Supports(param,
+          compute_type, compute_type, ctx.run_ctx.ctx)) {
+      LOG(WARNING) <<
+        "This deconvolution is not supported by cudnn, MXNET deconvolution is applied.";
+      DeconvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    } else {
+      std::vector<TShape> in_shape(in_data.size());
+      std::vector<TShape> out_shape(1, out_grad.shape_);
+      for (size_t i = 0; i < in_shape.size(); i++) {
+        in_shape[i] = in_data[i].shape_;
+      }
+      GetCuDNNDeconvOp<DType>(param, compute_type, compute_type,
+          in_shape, out_shape, ctx.run_ctx.ctx).Backward(ctx,
+            std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    }
+  })
+#else
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    DeconvolutionOp<gpu, DType> op;
+    op.Init(param);
+    op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+  })
+#endif  // MXNET_USE_CUDNN
+}
+
+NNVM_REGISTER_OP(Deconvolution)
+.set_attr<FCompute>("FCompute<gpu>", DeconvolutionCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_Deconvolution)
+.set_attr<FCompute>("FCompute<gpu>", DeconvolutionGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/depthwise_convolution-inl.h b/src/operator/nn/depthwise_convolution-inl.h
index c4b7a4787554..0af8cae51c84 100644
--- a/src/operator/nn/depthwise_convolution-inl.h
+++ b/src/operator/nn/depthwise_convolution-inl.h
@@ -39,11 +39,11 @@ namespace mxnet {
 namespace op {
 using namespace tf::depthwise_conv;
 template<typename DType>
-class DepthwiseConvolutionOp : public Operator {
+class DepthwiseConvolutionOp {
  public:
-  explicit DepthwiseConvolutionOp(const ConvolutionParam& param,
-                                  const std::vector<TShape>& in_shape,
-                                  const std::vector<TShape>& out_shape) {
+  void Init(const ConvolutionParam& param,
+            const std::vector<TShape>& in_shape,
+            const std::vector<TShape>& out_shape) {
     args_.batch = in_shape[conv::kData][0];
     args_.in_channel = in_shape[conv::kData][1];
     args_.in_height = in_shape[conv::kData][2];
@@ -62,19 +62,16 @@ class DepthwiseConvolutionOp : public Operator {
 
   ~DepthwiseConvolutionOp() {}
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args);
+  void Forward(const OpContext &ctx,
+               const std::vector<TBlob> &in_data,
+               const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &out_data);
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args);
+  void Backward(const OpContext &ctx,
+                const std::vector<TBlob> &out_grad,
+                const std::vector<TBlob> &in_data,
+                const std::vector<OpReqType> &req,
+                const std::vector<TBlob> &in_grad);
 
  private:
   DepthwiseArgs args_;
@@ -282,8 +279,7 @@ template<typename DType>
 void DepthwiseConvolutionOp<DType>::Forward(const OpContext &ctx,
                                             const std::vector<TBlob> &in_data,
                                             const std::vector<OpReqType> &req,
-                                            const std::vector<TBlob> &out_data,
-                                            const std::vector<TBlob> &aux_states) {
+                                            const std::vector<TBlob> &out_data) {
   using namespace mshadow;
   using namespace mshadow::expr;
   auto stream = ctx.get_stream<gpu>();
@@ -305,10 +301,8 @@ template<typename DType>
 void DepthwiseConvolutionOp<DType>::Backward(const OpContext &ctx,
                                              const std::vector<TBlob> &out_grad,
                                              const std::vector<TBlob> &in_data,
-                                             const std::vector<TBlob> &out_data,
                                              const std::vector<OpReqType> &req,
-                                             const std::vector<TBlob> &in_grad,
-                                             const std::vector<TBlob> &aux_states) {
+                                             const std::vector<TBlob> &in_grad) {
   using namespace mshadow;
   using namespace mshadow::expr;
   auto stream = ctx.get_stream<gpu>();
diff --git a/src/operator/nn/depthwise_convolution_tf.cuh b/src/operator/nn/depthwise_convolution_tf.cuh
index c7f48e686136..e4dfd8292d2d 100644
--- a/src/operator/nn/depthwise_convolution_tf.cuh
+++ b/src/operator/nn/depthwise_convolution_tf.cuh
@@ -24,8 +24,8 @@
  *        are different with origin version.
  * \author shuqian.qu@hobot.cc
 */
-#ifndef MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_
-#define MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_
+#ifndef MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_
+#define MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_
 #include "../../common/cuda_utils.h"
 #include "../mxnet_op.h"
 
@@ -730,4 +730,4 @@ bool TryLaunchDepthwiseConv2dBackwardFilterGPUSmall(mshadow::Stream<mxnet::gpu>
 }  // namespace depthwise_conv
 }  // namespace tf
 
-#endif  // MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_
+#endif  // MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_
diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h
index 715a6f4ee219..cff35a3cef7f 100644
--- a/src/operator/nn/dropout-inl.h
+++ b/src/operator/nn/dropout-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file dropout-inl.h
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 
 #ifndef MXNET_OPERATOR_NN_DROPOUT_INL_H_
@@ -71,7 +71,7 @@ struct DropoutParam : public dmlc::Parameter<DropoutParam> {
 };  // struct DropoutParam
 
 template<typename xpu, typename DType>
-class DropoutOp : public Operator {
+class DropoutOp {
 #if defined(USE_MKL) && defined(_OPENMP)
   static void BernoulliGenerate(common::random::RandGenerator<cpu, DType> gen,
                                 int n, double p, int* r) {
@@ -206,16 +206,15 @@ class DropoutOp : public Operator {
     }
   };
 
-  explicit DropoutOp(DropoutParam param) {
+  void Init(const DropoutParam &param) {
     this->pkeep_ = 1.0f - param.p;
     this->mode_ = static_cast<dropout::DropoutOpMode>(param.mode);
   }
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_states) {
+  void Forward(const OpContext &ctx,
+               const std::vector<TBlob> &in_data,
+               const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &out_data) {
     if (req[dropout::kOut] != kNullOp) {
       CHECK_EQ(in_data.size(), 1U);
       if (ctx.is_train) {
@@ -249,17 +248,13 @@ class DropoutOp : public Operator {
     }
   }
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_states) {
+  void Backward(const OpContext &ctx,
+                const std::vector<TBlob> &out_grad,
+                const std::vector<TBlob> &out_data,
+                const std::vector<OpReqType> &req,
+                const std::vector<TBlob> &in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_grad.size(), 1U);
     Stream<xpu> *s = ctx.get_stream<xpu>();
     if (ctx.is_train || mode_ == dropout::kAlways) {
       if (!MKLBackward(s, this->pkeep_, in_grad, out_data, out_grad)) {
@@ -293,110 +288,42 @@ class DropoutOp : public Operator {
   dropout::DropoutOpMode mode_;
 };  // class DropoutOp
 
-
 template<typename xpu>
-Operator *CreateOp(DropoutParam param, int dtype);
-
-#if DMLC_USE_CXX11
-class DropoutProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 1U);
-    const TShape &dshape = in_shape->at(0);
-    if (dshape.ndim() == 0) return false;
-    out_shape->clear();
-    out_shape->push_back(dshape);
-    out_shape->push_back(dshape);
-    return true;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_EQ(in_type->size(), 1U);
-    int dtype = in_type->at(0);
-
-    if (dtype == -1) {
-      LOG(FATAL) << "input type to dropout is not specified.";
-      return false;
-    }
-
-    size_t nout = this->ListOutputs().size();
-    out_type->clear();
-    for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new DropoutProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "Dropout";
-  }
+void DropoutCompute(const nnvm::NodeAttrs& attrs,
+                    const OpContext& ctx,
+                    const std::vector<TBlob>& inputs,
+                    const std::vector<OpReqType>& req,
+                    const std::vector<TBlob>& outputs) {
+  const DropoutParam& param = nnvm::get<DropoutParam>(attrs.parsed);
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+    static thread_local DropoutOp<xpu, DType> op;
+    op.Init(param);
+    op.Forward(ctx, inputs, req, outputs);
+  });
+}
 
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[dropout::kOut], out_data[dropout::kMask]};
-  }
-
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-    return {{out_grad[dropout::kOut], in_grad[dropout::kData]}};
-  }
-
-  std::vector<std::pair<int, void*> > ForwardInplaceOption(
-    const std::vector<int> &in_data,
-    const std::vector<void*> &out_data) const override {
-    return {{in_data[dropout::kData], out_data[dropout::kOut]}};
-  }
-
-  std::vector<ResourceRequest> ForwardResource(const std::vector<TShape> &in_shape) const override {
-    return { ResourceRequest::kParallelRandom };
-  }
-
-  int NumVisibleOutputs() const override {
-    return 1;
-  }
-
-  int NumOutputs() const override {
-    return 2;
-  }
-
-  std::vector<std::string> ListOutputs() const override {
-    return {"output", "mask"};
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented";
-    return NULL;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
+template<typename xpu>
+void DropoutGradCompute(const nnvm::NodeAttrs& attrs,
+                        const OpContext& ctx,
+                        const std::vector<TBlob>& inputs,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<TBlob>& outputs) {
+  const DropoutParam& param = nnvm::get<DropoutParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1);
+  CHECK_EQ(req.size(), 1);
+  std::vector<TBlob> out_grads(2);
+  std::vector<TBlob> out_data(2);
+  out_grads[dropout::kOut] = inputs[0];
+  out_data[dropout::kMask] = inputs[1];
+
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+    static thread_local DropoutOp<xpu, DType> op;
+    op.Init(param);
+    op.Backward(ctx, out_grads, out_data, req, outputs);
+  });
+}
 
- private:
-  DropoutParam param_;
-};  // class DropoutProp
-#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_DROPOUT_INL_H_
diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc
index 3aa832a71356..dd5f1e58fbe5 100644
--- a/src/operator/nn/dropout.cc
+++ b/src/operator/nn/dropout.cc
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file dropout.cc
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 
 #include "./dropout-inl.h"
@@ -29,24 +29,21 @@
 
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<cpu>(DropoutParam param, int dtype) {
-  Operator *op = NULL;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new DropoutOp<cpu, DType>(param);
-  });
-  return op;
-}
-
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *DropoutProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                              std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
-}
+
+struct DropoutGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    std::vector<nnvm::NodeEntry> heads;
+    heads.push_back(ograds[0]);
+    heads.emplace_back(nnvm::NodeEntry{n, dropout::kMask, 0});
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
 
 DMLC_REGISTER_PARAMETER(DropoutParam);
 
-MXNET_REGISTER_OP_PROPERTY(Dropout, DropoutProp)
+NNVM_REGISTER_OP(Dropout)
 .describe(R"(Applies dropout operation to input array.
 
 - During training, each element of the input is set to zero with probability p.
@@ -77,8 +74,66 @@ Example::
   [[ 3.     0.5   -0.5    2.     7.   ]
    [ 2.    -0.4    7.     3.     0.2  ]]
 )" ADD_FILELINE)
+.set_num_inputs(1)
+.set_num_outputs(2)
+.set_attr_parser(ParamParser<DropoutParam>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"data"};
+})
+.set_attr<nnvm::FListOutputNames>("FListOutputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"output", "mask"};
+})
+.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
+    [](const NodeAttrs& attrs) {
+  return 1;
+})
+.set_attr<nnvm::FInferShape>("FInferShape", [](const nnvm::NodeAttrs& attrs,
+      std::vector<TShape> *in_shape, std::vector<TShape> *out_shape){
+  using namespace mshadow;
+  CHECK_EQ(in_shape->size(), 1U);
+  const TShape &dshape = in_shape->at(0);
+  if (dshape.ndim() == 0) return false;
+  out_shape->clear();
+  out_shape->push_back(dshape);
+  out_shape->push_back(dshape);
+  return true;
+})
+.set_attr<nnvm::FInferType>("FInferType", [](const nnvm::NodeAttrs& attrs,
+      std::vector<int> *in_type, std::vector<int> *out_type) {
+  CHECK_EQ(in_type->size(), 1U);
+  int dtype = in_type->at(0);
+
+  if (dtype == -1) {
+    LOG(FATAL) << "input type to dropout is not specified.";
+    return false;
+  }
+
+  size_t nout = 2;
+  out_type->clear();
+  for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype);
+  return true;
+})
+.set_attr<FCompute>("FCompute<cpu>", DropoutCompute<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", DropoutGrad{"_backward_Dropout"})
+.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
+  return std::vector<std::pair<int, int> >{{0, 0}};
+})
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ ResourceRequest::kParallelRandom };
+})
 .add_argument("data", "NDArray-or-Symbol", "Input array to which dropout will be applied.")
 .add_arguments(DropoutParam::__FIELDS__());
 
+NNVM_REGISTER_OP(_backward_Dropout)
+.set_num_outputs(1)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr_parser(ParamParser<DropoutParam>)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
+  return std::vector<std::pair<int, int> >{{0, 0}};
+})
+.set_attr<FCompute>("FCompute<cpu>", DropoutGradCompute<cpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu
index f416c5883203..e655278822a4 100644
--- a/src/operator/nn/dropout.cu
+++ b/src/operator/nn/dropout.cu
@@ -21,21 +21,20 @@
  * Copyright (c) 2015 by Contributors
  * \file dropout.cc
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 
 #include "./dropout-inl.h"
 
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<gpu>(DropoutParam param, int dtype) {
-  Operator *op = NULL;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new DropoutOp<gpu, DType>(param);
-  });
-  return op;
-}
+
+NNVM_REGISTER_OP(Dropout)
+.set_attr<FCompute>("FCompute<gpu>", DropoutCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_Dropout)
+.set_attr<FCompute>("FCompute<gpu>", DropoutGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
 
diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h
index 9f3deec2449f..e8e95643e647 100644
--- a/src/operator/nn/fully_connected-inl.h
+++ b/src/operator/nn/fully_connected-inl.h
@@ -43,6 +43,7 @@ namespace op {
 // These enums are only visible within this header
 namespace fullc {
 enum FullyConnectedOpInputs {kData, kWeight, kBias};
+enum FullyConnectedOpResource {kTempSpace};
 enum FullyConnectedOpOutputs {kOut};
 }  // fullc
 
@@ -61,240 +62,160 @@ struct FullyConnectedParam : public dmlc::Parameter<FullyConnectedParam> {
   }
 };
 
-/**
- * \brief This is the implementation of fully connected operator.
- * \tparam xpu The device that the op will be executed on.
- */
 template<typename xpu, typename DType>
-class FullyConnectedOp : public Operator {
- public:
-  explicit FullyConnectedOp(FullyConnectedParam p) {
-    this->param_ = p;
-  }
-
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    if (req[fullc::kOut] == kNullOp) return;
-    CHECK_EQ(req[fullc::kOut], kWriteTo);
-    size_t expected = param_.no_bias ? 2 : 3;
-    CHECK_EQ(in_data.size(), expected);
-    CHECK_EQ(out_data.size(), 1U);
-    // TODO(bing): check the BLAS Handle, be careful
-    // maybe need blas handle from context
-    // TODO(bing): judge shape to remove flatten op
-    Stream<xpu> *s = ctx.get_stream<xpu>();
+void FCForward(const OpContext &ctx, const FullyConnectedParam &param,
+               const std::vector<TBlob> &in_data, const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &out_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  if (req[fullc::kOut] == kNullOp) return;
+  CHECK_EQ(req[fullc::kOut], kWriteTo);
+  // TODO(bing): check the BLAS Handle, be careful
+  // maybe need blas handle from context
+  // TODO(bing): judge shape to remove flatten op
+  Stream<xpu> *s = ctx.get_stream<xpu>();
 #if defined(__CUDACC__)
-    CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
-        << "Must init CuBLAS handle in stream";
+  CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
+      << "Must init CuBLAS handle in stream";
 #endif  // __CUDACC__
-    const TShape& ishape = in_data[fullc::kData].shape_;
-    const TShape& oshape = out_data[fullc::kOut].shape_;
-
-    Tensor<xpu, 2, DType> wmat = in_data[fullc::kWeight].get<xpu, 2, DType>(s);
-    Tensor<xpu, 2, DType> data, out;
-    if (!param_.flatten) {
-      data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
-          Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
-      out = out_data[fullc::kOut].get_with_shape<xpu, 2, DType>(
-          Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s);
-    } else {
-      data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
-          Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
-      out = out_data[fullc::kOut].get_with_shape<xpu, 2, DType>(
-          Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s);
-    }
-
-    // Legacy approach shown here for comparison:
-    //   out = dot(data, wmat.T());
-    linalg_gemm(data, wmat, out, false, true, s);
-    if (!param_.no_bias) {
-      Tensor<xpu, 1, DType> bias = in_data[fullc::kBias].get<xpu, 1, DType>(s);
-      out += repmat(bias, data.size(0));
-    }
+  const TShape& ishape = in_data[fullc::kData].shape_;
+  const TShape& oshape = out_data[fullc::kOut].shape_;
+
+  Tensor<xpu, 2, DType> wmat = in_data[fullc::kWeight].get<xpu, 2, DType>(s);
+  Tensor<xpu, 2, DType> data, out;
+  if (!param.flatten) {
+    data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
+        Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
+    out = out_data[fullc::kOut].get_with_shape<xpu, 2, DType>(
+        Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s);
+  } else {
+    data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
+        Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
+    out = out_data[fullc::kOut].get_with_shape<xpu, 2, DType>(
+        Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s);
   }
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    size_t expected = param_.no_bias ? 2 : 3;
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
-    CHECK_EQ(req.size(), expected);
-    // TODO(bing): check the BLAS Handle, be careful
-    //  maybe need blas handle from context
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    const TShape& ishape = in_data[fullc::kData].shape_;
-    const TShape& oshape = out_grad[fullc::kOut].shape_;
+  // Legacy approach shown here for comparison:
+  //   out = dot(data, wmat.T());
+  linalg_gemm(data, wmat, out, false, true, s);
+  if (!param.no_bias) {
+    Tensor<xpu, 1, DType> bias = in_data[fullc::kBias].get<xpu, 1, DType>(s);
+    out += repmat(bias, data.size(0));
+  }
+}
 
-    Tensor<xpu, 2, DType> wmat = in_data[fullc::kWeight].get<xpu, 2, DType>(s);
-    Tensor<xpu, 2, DType> data, grad, gdata;
-    if (!param_.flatten) {
-      data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
-          Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
-      grad = out_grad[fullc::kOut].get_with_shape<xpu, 2, DType>(
-          Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s);
-      gdata = in_grad[fullc::kData].get_with_shape<xpu, 2, DType>(
-          Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
-    } else {
-      data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
-          Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
-      grad = out_grad[fullc::kOut].get_with_shape<xpu, 2, DType>(
-          Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s);
-      gdata = in_grad[fullc::kData].get_with_shape<xpu, 2, DType>(
-          Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
-    }
+template<typename xpu, typename DType>
+void FCBackward(const OpContext &ctx, const FullyConnectedParam &param,
+                const std::vector<TBlob> &out_grad, const std::vector<TBlob> &in_data,
+                const std::vector<OpReqType> &req, const std::vector<TBlob> &in_grad) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  // TODO(bing): check the BLAS Handle, be careful
+  //  maybe need blas handle from context
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const TShape& ishape = in_data[fullc::kData].shape_;
+  const TShape& oshape = out_grad[fullc::kOut].shape_;
+
+  Tensor<xpu, 2, DType> wmat = in_data[fullc::kWeight].get<xpu, 2, DType>(s);
+  Tensor<xpu, 2, DType> data, grad, gdata;
+  if (!param.flatten) {
+    data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
+        Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
+    grad = out_grad[fullc::kOut].get_with_shape<xpu, 2, DType>(
+        Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s);
+    gdata = in_grad[fullc::kData].get_with_shape<xpu, 2, DType>(
+        Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
+  } else {
+    data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
+        Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
+    grad = out_grad[fullc::kOut].get_with_shape<xpu, 2, DType>(
+        Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s);
+    gdata = in_grad[fullc::kData].get_with_shape<xpu, 2, DType>(
+        Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
+  }
 
 #if defined(__CUDACC__)
-    CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
-        << "Must init CuBLAS handle in stream";
+  CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
+      << "Must init CuBLAS handle in stream";
 #endif
-    //  backprop
-    CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace";
-    // gradient of weight
-    Tensor<xpu, 2, DType> gwmat = in_grad[fullc::kWeight].get<xpu, 2, DType>(s);
-    // Legacy approach shown here for comparison:
-    //   out = Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data));
-    linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]);
-    // gradient of bias
-    if (!param_.no_bias) {
-      Tensor<xpu, 1, DType> gbias = in_grad[fullc::kBias].get<xpu, 1, DType>(s);
-      Assign(gbias, req[fullc::kBias], sum_rows(grad));
-    }
-    // gradient of data
-    // Legacy approach shown here for comparison:
-    //   Assign(gdata, req[fullc::kData], dot(grad, wmat));
-    linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]);
+  //  backprop
+  CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace";
+  // gradient of weight
+  Tensor<xpu, 2, DType> gwmat = in_grad[fullc::kWeight].get<xpu, 2, DType>(s);
+  // Legacy approach shown here for comparison:
+  //   out = Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data));
+  linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]);
+  // gradient of bias
+  if (!param.no_bias) {
+    Tensor<xpu, 1, DType> gbias = in_grad[fullc::kBias].get<xpu, 1, DType>(s);
+    Assign(gbias, req[fullc::kBias], sum_rows(grad));
   }
+  // gradient of data
+  // Legacy approach shown here for comparison:
+  //   Assign(gdata, req[fullc::kData], dot(grad, wmat));
+  linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]);
+}
 
- private:
-  FullyConnectedParam param_;
-};  // class FullyConnectedOp
-
-// Decalre Factory function, used for dispatch specialization
 template<typename xpu>
-Operator* CreateOp(FullyConnectedParam param, int dtype,
-                   std::vector<TShape> *in_shape,
-                   std::vector<TShape> *out_shape,
-                   Context ctx);
-
-#if DMLC_USE_CXX11
-class FullyConnectedProp : public OperatorProperty {
- public:
-  std::vector<std::string> ListArguments() const override {
-    if (!param_.no_bias) {
-      return {"data", "weight", "bias"};
-    } else {
-      return {"data", "weight"};
-    }
-  }
-
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
+void FullyConnectedCompute(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  uint32_t in_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(inputs.size(), in_expected);
+  CHECK_EQ(outputs.size(), 1U);
+  int dtype = inputs[0].type_flag_;
+
+  switch (dtype) {
+  case mshadow::kFloat32:
+    FCForward<xpu, float>(ctx, param, inputs, req, outputs);
+    break;
+  case mshadow::kFloat64:
+    FCForward<xpu, double>(ctx, param, inputs, req, outputs);
+    break;
+  case mshadow::kFloat16:
+    LOG(FATAL) << "float16 fully connected layer is currently"
+                  "only supported by CuDNN version.";
+    break;
+  default:
+    LOG(FATAL) << "Unsupported type " << dtype;
   }
+}
 
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    if (!param_.no_bias) {
-      CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
-    } else {
-      CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
-    }
-    CHECK_EQ(out_shape->size(), 1U);
-    TShape dshape = (*in_shape)[fullc::kData];
-    TShape oshape = (*out_shape)[0];
-    // require data to be known
-    if (dshape.ndim() ==  0) return false;
-
-    index_t num_input;
-    if (!param_.flatten) {
-      num_input = dshape[dshape.ndim()-1];
-    } else {
-      num_input = dshape.ProdShape(1, dshape.ndim());
-    }
-    SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param_.num_hidden, num_input));
-    if (!param_.no_bias) {
-      SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param_.num_hidden));
-    }
-
-    if (!param_.flatten) {
-      TShape result_shape(dshape);
-      result_shape[dshape.ndim()-1] = param_.num_hidden;
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape);
-    } else {
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param_.num_hidden));
-    }
-    if (oshape.ndim() != 0) {
-      dshape[0] = oshape[0];
-      SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape);
-    }
-    return true;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_GE(in_type->size(), 1U);
-    nnvm::NodeAttrs attrs;
-    attrs.name = "FullyConnected";
-    return ElemwiseAttr<int, type_is_none, type_assign, true, type_string>(
-      attrs, in_type, out_type, -1);
-  }
-
-  OperatorProperty* Copy() const override {
-    FullyConnectedProp* fc_sym = new FullyConnectedProp();
-    fc_sym->param_ = this->param_;
-    return fc_sym;
-  }
-
-  std::string TypeString() const override {
-    return "FullyConnected";
-  }
-
-  // decalre dependency and inplace optimization options
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[fullc::kOut], in_data[fullc::kData], in_data[fullc::kWeight]};
-  }
-
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-    return {{in_data[fullc::kData], in_grad[fullc::kData]}};
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return NULL;
+template<typename xpu>
+void FullyConnectedGradCompute(const nnvm::NodeAttrs& attrs,
+                               const OpContext& ctx,
+                               const std::vector<TBlob>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<TBlob>& outputs) {
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  uint32_t out_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(inputs.size(), 3U);
+  CHECK_EQ(outputs.size(), out_expected);
+  CHECK_EQ(req.size(), out_expected);
+
+  std::vector<TBlob> out_grad{inputs[0]};
+  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
+  int dtype = inputs[0].type_flag_;
+
+  switch (dtype) {
+  case mshadow::kFloat32:
+    FCBackward<xpu, float>(ctx, param, out_grad, in_data, req, outputs);
+    break;
+  case mshadow::kFloat64:
+    FCBackward<xpu, double>(ctx, param, out_grad, in_data, req, outputs);
+    break;
+  case mshadow::kFloat16:
+    LOG(FATAL) << "float16 fully connected layer is currently"
+                  "only supported by CuDNN version.";
+    break;
+  default:
+    LOG(FATAL) << "Unsupported type " << dtype;
   }
+}
 
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
-
- private:
-  FullyConnectedParam param_;
-};  // class FullyConnectedSymbol
-#endif
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_FULLY_CONNECTED_INL_H_
diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc
index 9a978160297d..4362408a23a1 100644
--- a/src/operator/nn/fully_connected.cc
+++ b/src/operator/nn/fully_connected.cc
@@ -23,58 +23,153 @@
  * \brief fully connect operator
 */
 #include "./fully_connected-inl.h"
+#include "./mkldnn/mkldnn_ops-inl.h"
+#include "./mkldnn/mkldnn_base-inl.h"
 #if MXNET_USE_NNPACK == 1
 #include "./nnpack/nnpack_fully_connected-inl.h"
 #endif  // MXNET_USE_NNPACK
 
 namespace mxnet {
 namespace op {
-template<>
-Operator* CreateOp<cpu>(FullyConnectedParam param, int dtype,
-                        std::vector<TShape> *in_shape,
-                        std::vector<TShape> *out_shape,
-                        Context ctx) {
-  Operator *op = NULL;
-#if MXNET_USE_NNPACK == 1
-  const size_t batch_size = (*in_shape)[0][0];
-  // nnp_fully_connected_inference will do optimization for batch-size = 1
-  // nnp_fully_connected_output will do optimization for batch-size > 1
-  switch (dtype) {
-  case mshadow::kFloat32:
-    return new NNPACKFullyConnectedOp<cpu, float>(param);
-  default:
-    break;
+
+static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs,
+                                std::vector<TShape> *in_shape,
+                                std::vector<TShape> *out_shape) {
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  using namespace mshadow;
+  if (!param.no_bias) {
+    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
+  } else {
+    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
+  }
+  CHECK_EQ(out_shape->size(), 1U);
+  TShape dshape = (*in_shape)[fullc::kData];
+  TShape oshape = (*out_shape)[0];
+  // require data to be known
+  if (dshape.ndim() ==  0) return false;
+
+  index_t num_input;
+  if (!param.flatten) {
+    num_input = dshape[dshape.ndim()-1];
+  } else {
+    num_input = dshape.ProdShape(1, dshape.ndim());
+  }
+  SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param.num_hidden, num_input));
+  if (!param.no_bias) {
+    SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param.num_hidden));
+  }
+
+  if (!param.flatten) {
+    TShape result_shape(dshape);
+    result_shape[dshape.ndim()-1] = param.num_hidden;
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape);
+  } else {
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param.num_hidden));
+  }
+  if (oshape.ndim() != 0) {
+    dshape[0] = oshape[0];
+    SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape);
   }
+  return true;
+}
+
+#if MXNET_USE_MKLDNN == 1
+void FullyConnectedComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                const OpContext &ctx,
+                                const std::vector<NDArray> &inputs,
+                                const std::vector<OpReqType> &req,
+                                const std::vector<NDArray> &outputs) {
+  if (SupportMKLDNN(inputs[0])) {
+    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    MKLDNNFCForward(attrs, ctx, inputs, req, outputs);
+    MKLDNN_OPCHECK_RUN(FullyConnectedCompute<cpu>, attrs, ctx, inputs, req,
+                       outputs);
+    return;
+  }
+  FallBackCompute(FullyConnectedCompute<cpu>, attrs, ctx, inputs, req, outputs);
+}
+
+void FullyConnectedGradComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                    const OpContext &ctx,
+                                    const std::vector<NDArray> &inputs,
+                                    const std::vector<OpReqType> &req,
+                                    const std::vector<NDArray> &outputs) {
+  if (SupportMKLDNN(inputs[0])) {
+    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+    MKLDNNFCBackward(attrs, ctx, inputs, req, outputs);
+    MKLDNN_OPCHECK_RUN(FullyConnectedGradCompute<cpu>, attrs, ctx, inputs, req,
+                       outputs);
+    return;
+  }
+  FallBackCompute(FullyConnectedGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+}
 #endif
-  switch (dtype) {
-  case mshadow::kFloat32:
-    op = new FullyConnectedOp<cpu, float>(param);
-    break;
-  case mshadow::kFloat64:
-    op = new FullyConnectedOp<cpu, double>(param);
-    break;
-  case mshadow::kFloat16:
-    LOG(FATAL) << "float16 fully connected layer is currently"
-                  "only supported by CuDNN version.";
-    break;
-  default:
-    LOG(FATAL) << "Unsupported type " << dtype;
+
+static bool FullyConnectedType(const nnvm::NodeAttrs& attrs,
+                               std::vector<int> *in_type, std::vector<int> *out_type) {
+  CHECK_GE(in_type->size(), 1U);
+  return ElemwiseAttr<int, type_is_none, type_assign, true, type_string>(
+      attrs, in_type, out_type, -1);
+}
+
+struct FullyConnectedGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
+    heads.push_back(n->inputs[fullc::kData]);
+    heads.push_back(n->inputs[fullc::kWeight]);
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
   }
+};
+
+inline static bool FCStorageType(const nnvm::NodeAttrs& attrs,
+                                 const int dev_mask,
+                                 DispatchMode* dispatch_mode,
+                                 std::vector<int> *in_attrs,
+                                 std::vector<int> *out_attrs) {
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  uint32_t in_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(in_attrs->size(), in_expected);
+  CHECK_EQ(out_attrs->size(), 1);
 
-  return op;
+  DispatchMode wanted_mode;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask)
+    wanted_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    wanted_mode = DispatchMode::kFCompute;
+  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                             dispatch_mode, wanted_mode);
 }
 
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *FullyConnectedProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                     std::vector<int> *in_type) const {
-  std::vector<TShape> out_shape(1, TShape()), aux_shape;
-  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx);
+inline static bool BackwardFCStorageType(const nnvm::NodeAttrs& attrs,
+                                         const int dev_mask,
+                                         DispatchMode* dispatch_mode,
+                                         std::vector<int> *in_attrs,
+                                         std::vector<int> *out_attrs) {
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  uint32_t out_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(in_attrs->size(), 3U);
+  CHECK_EQ(out_attrs->size(), out_expected);
+
+  DispatchMode wanted_mode;
+#if 0
+  // TODO(zhengda) let's disable MKLDNN for FullyConnected for now.
+  // It seems there is a bug.
+  if (dev_mask == mshadow::cpu::kDevMask)
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    wanted_mode = DispatchMode::kFCompute;
+  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                             dispatch_mode, wanted_mode);
 }
 
 DMLC_REGISTER_PARAMETER(FullyConnectedParam);
 
-MXNET_REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedProp)
+NNVM_REGISTER_OP(FullyConnected)
 .describe(R"code(Applies a linear transformation: :math:`Y = XW^T + b`.
 
 If ``flatten`` is set to be true, then the shapes are:
@@ -96,9 +191,59 @@ The learnable parameters include both ``weight`` and ``bias``.
 If ``no_bias`` is set to be true, then the ``bias`` term is ignored.
 
 )code" ADD_FILELINE)
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const FullyConnectedParam& params = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  return params.no_bias ? 2 : 3;
+})
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<FullyConnectedParam>)
+.set_attr<FInferStorageType>("FInferStorageType", FCStorageType)
+.set_attr<nnvm::FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) {
+  const FullyConnectedParam& params = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  if (!params.no_bias) {
+    return std::vector<std::string>{"data", "weight", "bias"};
+  } else {
+    return std::vector<std::string>{"data", "weight"};
+  }
+})
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
+.set_attr<nnvm::FInferShape>("FInferShape", FullyConnectedShape)
+.set_attr<nnvm::FInferType>("FInferType", FullyConnectedType)
+.set_attr<FCompute>("FCompute<cpu>", FullyConnectedCompute<cpu>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", FullyConnectedComputeExCPU)
+#endif
+.set_attr<nnvm::FGradient>("FGradient", FullyConnectedGrad{"_backward_FullyConnected"})
 .add_argument("data", "NDArray-or-Symbol", "Input data.")
 .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.")
 .add_argument("bias", "NDArray-or-Symbol", "Bias parameter.")
 .add_arguments(FullyConnectedParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_backward_FullyConnected)
+.set_num_inputs(3)
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const FullyConnectedParam& params = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  return params.no_bias ? 2 : 3;
+})
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
+  return std::vector<std::pair<int, int> >{{1, 0}};
+})
+.set_attr<FInferStorageType>("FInferStorageType", BackwardFCStorageType)
+.set_attr_parser(ParamParser<FullyConnectedParam>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", FullyConnectedGradComputeExCPU)
+#endif
+.set_attr<FCompute>("FCompute<cpu>", FullyConnectedGradCompute<cpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/fully_connected.cu b/src/operator/nn/fully_connected.cu
index 279a378e2ad4..c89d37767c4a 100644
--- a/src/operator/nn/fully_connected.cu
+++ b/src/operator/nn/fully_connected.cu
@@ -25,16 +25,50 @@
 #include "./fully_connected-inl.h"
 namespace mxnet {
 namespace op {
+
 template<>
-Operator* CreateOp<gpu>(FullyConnectedParam param, int dtype,
-                        std::vector<TShape> *in_shape,
-                        std::vector<TShape> *out_shape,
-                        Context ctx) {
-  Operator *op = NULL;
+void FullyConnectedCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                                const OpContext& ctx,
+                                const std::vector<TBlob>& inputs,
+                                const std::vector<OpReqType>& req,
+                                const std::vector<TBlob>& outputs) {
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  uint32_t in_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(inputs.size(), in_expected);
+  CHECK_EQ(outputs.size(), 1U);
+  int dtype = inputs[0].type_flag_;
+
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new FullyConnectedOp<gpu, DType>(param);
-  })
-  return op;
+    FCForward<gpu, DType>(ctx, param, inputs, req, outputs);
+  });
 }
+
+template<>
+void FullyConnectedGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                                    const OpContext& ctx,
+                                    const std::vector<TBlob>& inputs,
+                                    const std::vector<OpReqType>& req,
+                                    const std::vector<TBlob>& outputs) {
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  uint32_t out_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(inputs.size(), 3U);
+  CHECK_EQ(outputs.size(), out_expected);
+  CHECK_EQ(req.size(), out_expected);
+
+  std::vector<TBlob> out_grad{inputs[0]};
+  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
+  int dtype = inputs[0].type_flag_;
+
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    FCBackward<gpu, DType>(ctx, param, out_grad, in_data, req, outputs);
+  });
+}
+
+NNVM_REGISTER_OP(FullyConnected)
+.set_attr<FCompute>("FCompute<gpu>", FullyConnectedCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_FullyConnected)
+.set_attr<FCompute>("FCompute<gpu>", FullyConnectedGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/lrn-inl.h b/src/operator/nn/lrn-inl.h
new file mode 100644
index 000000000000..fdae1eca0aef
--- /dev/null
+++ b/src/operator/nn/lrn-inl.h
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file lrn-inl.h
+ * \brief
+ * \author Bing Xu
+*/
+#ifndef MXNET_OPERATOR_NN_LRN_INL_H_
+#define MXNET_OPERATOR_NN_LRN_INL_H_
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../operator_common.h"
+#include "../mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+
+namespace lrn_enum {
+enum LRNInputs {kData};
+enum LRNOutputs {kOut, kTmpNorm};
+}  // namespace lrn_enum
+
+struct LRNParam : public dmlc::Parameter<LRNParam> {
+  float alpha;
+  float beta;
+  float knorm;
+  uint32_t nsize;
+  DMLC_DECLARE_PARAMETER(LRNParam) {
+    DMLC_DECLARE_FIELD(alpha).set_default(1e-4f)
+    .describe("The variance scaling parameter :math:`\alpha` in the LRN expression.");
+    DMLC_DECLARE_FIELD(beta).set_default(0.75f)
+    .describe("The power parameter :math:`\beta` in the LRN expression.");
+    DMLC_DECLARE_FIELD(knorm).set_default(2.0f)
+    .describe("The parameter :math:`k` in the LRN expression.");
+    DMLC_DECLARE_FIELD(nsize)
+    .describe("normalization window width in elements.");
+  }
+};  // struct LRNParam
+
+template<typename xpu>
+void LRNForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                const std::vector<TBlob> &in_data,
+                const std::vector<OpReqType> &req,
+                const std::vector<TBlob> &out_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  const LRNParam& param_ = nnvm::get<LRNParam>(attrs.parsed);
+  // TODO(xxx): Test with gradient chceker
+  CHECK_EQ(in_data.size(), 1U);
+  CHECK_EQ(out_data.size(), 2U);
+  // CHECK_EQ(req.size(), 2);
+  CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size";
+  const real_t salpha = param_.alpha / param_.nsize;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  Tensor<xpu, 4> data = in_data[lrn_enum::kData].get<xpu, 4, real_t>(s);
+  Tensor<xpu, 4> out = out_data[lrn_enum::kOut].get<xpu, 4, real_t>(s);
+  Tensor<xpu, 4> tmp_norm = out_data[lrn_enum::kTmpNorm].get<xpu, 4, real_t>(s);
+  tmp_norm = chpool<red::sum>(F<mshadow_op::square>(data) , param_.nsize) * salpha + param_.knorm;
+  Assign(out, req[lrn_enum::kOut], data *  F<mshadow_op::power>(tmp_norm, -param_.beta));
+}
+
+template<typename xpu>
+void LRNBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                 const TBlob &out_grad, const TBlob &in_data,
+                 const TBlob &out_norm, const OpReqType &req,
+                 const TBlob &in_grad) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  const LRNParam& param_ = nnvm::get<LRNParam>(attrs.parsed);
+  const real_t salpha = param_.alpha / param_.nsize;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  Tensor<xpu, 4> grad = out_grad.get<xpu, 4, real_t>(s);
+  Tensor<xpu, 4> tmp_norm = out_norm.get<xpu, 4, real_t>(s);
+  Tensor<xpu, 4> data = in_data.get<xpu, 4, real_t>(s);
+  Tensor<xpu, 4> grad_in = in_grad.get<xpu, 4, real_t>(s);
+  grad_in = grad * F<mshadow_op::power>(tmp_norm, -param_.beta);
+  grad_in += (- 2.0f * param_.beta * salpha) *
+      chpool<red::sum>(grad * data *
+                       F<mshadow_op::power>(tmp_norm, -param_.beta - 1.0f),
+                       param_.nsize)  * data;
+}
+
+template<typename xpu>
+void LRNCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
+                const std::vector<TBlob>& inputs,
+                const std::vector<OpReqType>& req,
+                const std::vector<TBlob>& outputs) {
+  LRNForward<xpu>(attrs, ctx, inputs, req, outputs);
+}
+
+template<typename xpu>
+void LRNGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
+                    const std::vector<TBlob>& inputs,
+                    const std::vector<OpReqType>& req,
+                    const std::vector<TBlob>& outputs) {
+  LRNBackward<xpu>(attrs, ctx, inputs[0],  // out_grad
+                   inputs[1],              // in_data
+                   inputs[2],              // out_norm
+                   req[lrn_enum::kData], outputs[lrn_enum::kData]);
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_NN_LRN_INL_H_
diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc
new file mode 100644
index 000000000000..2359b49abab6
--- /dev/null
+++ b/src/operator/nn/lrn.cc
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file lrn.cc
+ * \brief
+ * \author Bing Xu, Patric Zhao (patric.zhao@intel.com)
+*/
+
+#include "./lrn-inl.h"
+#include "../operator_common.h"
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn/mkldnn_lrn-inl.h"
+#endif
+
+namespace mxnet {
+namespace op {
+
+bool LRNShape(const nnvm::NodeAttrs& attrs,
+              std::vector<TShape> *in_shape,
+              std::vector<TShape> *out_shape) {
+  using namespace mshadow;
+  CHECK_EQ(in_shape->size(), 1U) << "Input:[data]";
+  const TShape &dshape = in_shape->at(0);
+  if (dshape.ndim() == 0) return false;
+  out_shape->clear();
+  out_shape->push_back(dshape);
+  out_shape->push_back(dshape);
+  return true;
+}
+
+inline std::vector<std::string> ListArguments() {
+  return {"data"};
+}
+
+bool LRNType(const nnvm::NodeAttrs& attrs,
+             std::vector<int> *in_type,
+             std::vector<int> *out_type) {
+  CHECK_GE(in_type->size(), 1U);
+  int dtype = (*in_type)[0];
+  CHECK_NE(dtype, -1) << "First input must have specified type";
+  for (index_t i = 0; i < in_type->size(); ++i) {
+    if ((*in_type)[i] == -1) {
+      (*in_type)[i] = dtype;
+    } else {
+      UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
+    }
+  }
+  int n_out = 2;
+  out_type->clear();
+  for (int i = 0; i < n_out; ++i ) out_type->push_back(dtype);
+  return true;
+}
+
+struct LRNGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                const std::vector<nnvm::NodeEntry>& ograds) const {
+    std::vector<nnvm::NodeEntry> heads;
+    heads.push_back(ograds[0]);  // out_grad
+    heads.push_back(n->inputs[lrn_enum::kData]);
+    heads.emplace_back(nnvm::NodeEntry{n, lrn_enum::kTmpNorm, 0});
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
+
+bool LRNForwardInferStorageType(const nnvm::NodeAttrs& attrs,
+                                const int dev_mask,
+                                DispatchMode* dispatch_mode,
+                                std::vector<int> *in_attrs,
+                                std::vector<int> *out_attrs) {
+  CHECK(!in_attrs->empty());
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask) {
+    storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                        dispatch_mode, DispatchMode::kFComputeEx);
+    return true;
+  }
+#endif
+  storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                      dispatch_mode, DispatchMode::kFCompute);
+  return true;
+}
+
+bool LRNBackwardInferStorageType(const nnvm::NodeAttrs& attrs,
+                                 const int dev_mask,
+                                 DispatchMode* dispatch_mode,
+                                 std::vector<int> *in_attrs,
+                                 std::vector<int> *out_attrs) {
+  CHECK(!in_attrs->empty());
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask) {
+    storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                        dispatch_mode, DispatchMode::kFComputeEx);
+    return true;
+  }
+#endif
+  storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                      dispatch_mode, DispatchMode::kFCompute);
+  return true;
+}
+
+#if MXNET_USE_MKLDNN == 1
+void LRNComputeExCPU(const nnvm::NodeAttrs &attrs,
+                     const OpContext &ctx,
+                     const std::vector<NDArray> &inputs,
+                     const std::vector<OpReqType> &req,
+                     const std::vector<NDArray> &outputs) {
+  const LRNParam &param = nnvm::get<LRNParam>(attrs.parsed);
+  if (SupportMKLDNN(inputs[0])) {
+    // We only need to test one output array.
+    MKLDNN_OPCHECK_INIT(false, 1, inputs, outputs);
+    MKLDNNLRNForward(ctx, param, inputs[0], req[0], outputs[0]);
+    MKLDNN_OPCHECK_RUN(LRNCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    return;
+  }
+  FallBackCompute(LRNCompute<cpu>, attrs, ctx, inputs, req, outputs);
+}
+
+void LRNGradComputeExCPU(const nnvm::NodeAttrs &attrs,
+                         const OpContext &ctx,
+                         const std::vector<NDArray> &inputs,
+                         const std::vector<OpReqType> &req,
+                         const std::vector<NDArray> &outputs) {
+  const LRNParam &param = nnvm::get<LRNParam>(attrs.parsed);
+  const NDArray &out_grad = inputs[0];
+  const NDArray &in_data = inputs[1];
+  const NDArray &in_grad = outputs[0];
+
+  if (SupportMKLDNN(inputs[0])) {
+    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+    MKLDNNLRNBackward(ctx, param, out_grad, in_data, req[0], in_grad);
+    MKLDNN_OPCHECK_RUN(LRNGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    return;
+  }
+  FallBackCompute(LRNGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+}
+#endif
+
+DMLC_REGISTER_PARAMETER(LRNParam);
+
+NNVM_REGISTER_OP(LRN)
+.describe(R"code(Applies local response normalization to the input.
+
+The local response normalization layer performs "lateral inhibition" by normalizing
+over local input regions.
+
+If :math:`a_{x,y}^{i}` is the activity of a neuron computed by applying kernel :math:`i` at position
+:math:`(x, y)` and then applying the ReLU nonlinearity, the response-normalized
+activity :math:`b_{x,y}^{i}` is given by the expression:
+
+.. math::
+   b_{x,y}^{i} = \frac{a_{x,y}^{i}}{\Bigg({k + \alpha \sum_{j=max(0, i-\frac{n}{2})}^{min(N-1, i+\frac{n}{2})} (a_{x,y}^{j})^{2}}\Bigg)^{\beta}}
+
+where the sum runs over :math:`n` "adjacent" kernel maps at the same spatial position, and :math:`N` is the total
+number of kernels in the layer.
+
+)code" ADD_FILELINE)
+.set_num_inputs(1)
+.set_num_outputs(2)
+.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
+                                    [](const NodeAttrs& attrs) { return 1; })
+.set_attr_parser(ParamParser<LRNParam>)
+.set_attr<nnvm::FInferShape>("FInferShape", LRNShape)
+.set_attr<nnvm::FInferType>("FInferType", LRNType)
+.set_attr<FInferStorageType>("FInferStorageType", LRNForwardInferStorageType)
+.set_attr<FCompute>("FCompute<cpu>", LRNCompute<cpu>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", LRNComputeExCPU)
+#endif
+.set_attr<nnvm::FGradient>("FGradient", LRNGrad{"_backward_LRN"})
+.add_argument("data", "NDArray-or-Symbol", "Input data to LRN")
+.add_arguments(LRNParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_backward_LRN)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<LRNParam>)
+.set_attr<FInferStorageType>("FInferStorageType", LRNBackwardInferStorageType)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", LRNGradComputeExCPU)
+#endif
+.set_attr<FCompute>("FCompute<cpu>", LRNGradCompute<cpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/lrn.cu b/src/operator/nn/lrn.cu
similarity index 64%
rename from src/operator/lrn.cu
rename to src/operator/nn/lrn.cu
index ba872f1d26d0..4c31ca96025c 100644
--- a/src/operator/lrn.cu
+++ b/src/operator/nn/lrn.cu
@@ -25,29 +25,15 @@
 */
 
 #include "./lrn-inl.h"
-#if MXNET_USE_CUDNN == 1
-#include "./cudnn_lrn-inl.h"
-#endif
 
 namespace mxnet {
 namespace op {
-template<>
-Operator* CreateOp<gpu>(LRNParam param, int dtype) {
-  Operator *op = NULL;
-#if MXNET_USE_CUDNN == 1
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new CuDNNLocalResponseNormOp<DType>(param);
-  })
-#else
-#if CUDA_VERSION == 7000
-  LOG(FATAL) << "Due to old CUDA compiler bug, LRN is disabled."
-             << "Please upgrade CUDA to 7.5+ or use CUDNN";
-#else
-  op = new LocalResponseNormOp<gpu>(param);
-#endif  // CUDA_VERSION
-#endif  // MXNET_USE_CUDNN
-  return op;
-}
+
+NNVM_REGISTER_OP(LRN)
+.set_attr<FCompute>("FCompute<gpu>", LRNCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_LRN)
+.set_attr<FCompute>("FCompute<gpu>", LRNGradCompute<gpu>);
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc
new file mode 100644
index 000000000000..71fdf4ca585b
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_act.cc
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_act.cc
+ * \brief
+ * \author Da Zheng
+*/
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../../operator_common.h"
+#include "../activation-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+
+#include <mkldnn.hpp>
+
+namespace mxnet {
+namespace op {
+
+bool SupportMKLDNNAct(const ActivationParam& param) {
+  // We only enable ReLU for now. It seems other activations have some precision
+  // problems.
+  return param.act_type == activation::kReLU;
+#if 0
+      || param.act_type == activation::kSigmoid
+      || param.act_type == activation::kSoftReLU;
+#endif
+}
+
+static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) {
+  switch (param.act_type) {
+    case activation::kReLU:
+      return mkldnn::algorithm::eltwise_relu;
+    case activation::kSigmoid:
+      return mkldnn::algorithm::eltwise_logistic;
+    case activation::kTanh:
+      return mkldnn::algorithm::eltwise_tanh;
+    case activation::kSoftReLU:
+      return mkldnn::algorithm::eltwise_soft_relu;
+    default:
+      LOG(FATAL) << "unknown activation type";
+      return mkldnn::algorithm::eltwise_relu;
+  }
+}
+
+typedef std::shared_ptr<mkldnn::eltwise_forward::primitive_desc> mkldnn_act_pdesc_ptr;
+
+static mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(
+    const ActivationParam& param, bool is_train,
+    const mkldnn::memory &input_mem, int dtype) {
+  mkldnn::memory::primitive_desc data_mpd = input_mem.get_primitive_desc();
+  mkldnn::memory::desc data_md = data_mpd.desc();
+  auto cpu_engine = data_mpd.get_engine();
+
+  auto alg = GetMKLDNNActAlgo(param);
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    DType alpha = 0;
+    mkldnn::eltwise_forward::desc desc = is_train
+        ? mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_training,
+                                        alg, data_md, alpha)
+        : mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_scoring,
+                                        alg, data_md, alpha);
+    return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine);
+  });
+  LOG(INFO) << "Unsupported data type for MKLDNN activation";
+  mkldnn::eltwise_forward::desc desc = mkldnn::eltwise_forward::desc(
+      mkldnn::prop_kind::forward_training, alg, data_md, 0.0);
+  return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine);
+}
+
+typedef MKLDNNParamOpSign<ActivationParam> MKLDNNActSignature;
+
+class MKLDNNActForward {
+  std::shared_ptr<mkldnn::eltwise_forward> fwd;
+  std::shared_ptr<mkldnn::memory> data;
+  std::shared_ptr<mkldnn::memory> out;
+
+ public:
+  const mkldnn::eltwise_forward::primitive_desc fwd_pd;
+
+  MKLDNNActForward(const ActivationParam& param, bool is_train,
+                   const NDArray &data, const mkldnn::memory &mem): fwd_pd(
+                       GetActFwdDescImpl(param, is_train, mem, data.dtype())) {
+  }
+
+  void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &output) {
+    if (this->data == nullptr)
+      this->data = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+              data.get_primitive_desc(), data.get_data_handle()));
+    else
+      this->data->set_data_handle(data.get_data_handle());
+
+    CHECK(fwd_pd.dst_primitive_desc() == output.get_primitive_desc());
+    if (this->out == nullptr)
+      this->out = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+              fwd_pd.dst_primitive_desc(), output.get_data_handle()));
+    else
+      this->out->set_data_handle(output.get_data_handle());
+
+    if (this->fwd == nullptr) {
+      this->fwd = std::shared_ptr<mkldnn::eltwise_forward>(
+          new mkldnn::eltwise_forward(fwd_pd, mkldnn::primitive::at(*this->data),
+                                      *this->out));
+    }
+  }
+
+  const mkldnn::eltwise_forward &GetFwd() const {
+    return *fwd;
+  }
+};
+
+static MKLDNNActForward &GetActForward(const ActivationParam& param,
+                                       const OpContext &ctx, const NDArray &in_data,
+                                       const mkldnn::memory &in_mem) {
+  static thread_local std::unordered_map<MKLDNNActSignature, MKLDNNActForward, MKLDNNOpHash> fwds;
+  MKLDNNActSignature key(param);
+  key.AddSign(ctx.is_train);
+  key.AddSign(param.act_type);
+  key.AddSign(in_data);
+
+  auto it = fwds.find(key);
+  if (it == fwds.end()) {
+    MKLDNNActForward fwd(param, ctx.is_train, in_data, in_mem);
+    auto ins_ret = fwds.insert(std::pair<MKLDNNActSignature, MKLDNNActForward>(
+            key, fwd));
+    CHECK(ins_ret.second);
+    it = ins_ret.first;
+  }
+  return it->second;
+}
+
+void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                             const NDArray &in_data, const OpReqType &req,
+                             const NDArray &out_data) {
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  auto input_mem = in_data.GetMKLDNNData();
+  MKLDNNActForward &fwd = GetActForward(param, ctx, in_data, *input_mem);
+  auto out_mem = const_cast<NDArray &>(out_data).CreateMKLDNNData(
+      fwd.fwd_pd.dst_primitive_desc());
+  fwd.SetNewMem(*input_mem, *out_mem);
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  stream->RegisterPrim(fwd.GetFwd());
+  stream->Submit();
+}
+
+void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                              const NDArray &out_grad, const NDArray &in_data,
+                              const OpReqType &req, const NDArray &in_grad) {
+  if (req == kNullOp) {
+    return;
+  }
+
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]);
+  auto diff_dst_memory = out_grad.GetMKLDNNData();
+  auto input_mem = in_data.GetMKLDNNData();
+  // We need to make sure the two inputs to eltwise_backward has the same memory
+  // descriptor. Otherwise, the perf will suffer.
+  if (input_mem->get_primitive_desc() != diff_dst_memory->get_primitive_desc())
+    input_mem = in_data.GetMKLDNNDataReorder(diff_dst_memory->get_primitive_desc());
+  mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc();
+  mkldnn::memory::desc data_md = data_mpd.desc();
+  mkldnn::memory::desc diff_md = diff_dst_memory->get_primitive_desc().desc();
+  auto cpu_engine = data_mpd.get_engine();
+
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  auto alg = GetMKLDNNActAlgo(param);
+  mkldnn_output_t diff_src_memory;
+
+  MSHADOW_REAL_TYPE_SWITCH(in_data.dtype(), DType, {
+    DType alpha = 0;
+    mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training,
+                                          alg, data_md, alpha);
+    mkldnn::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine);
+    mkldnn::eltwise_backward::desc bw_desc(alg, diff_md, data_md, alpha);
+    mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine,
+                                                      fw_pdesc);
+
+    diff_src_memory = CreateMKLDNNMem(in_grad,
+                                      bw_pdesc.diff_src_primitive_desc(), req);
+    stream->RegisterPrim(mkldnn::eltwise_backward(bw_pdesc, *input_mem,
+                                                  *diff_dst_memory,
+                                                  *diff_src_memory.second));
+  });
+  CommitOutput(in_grad, diff_src_memory);
+  stream->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
new file mode 100644
index 000000000000..1c583e1f671e
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -0,0 +1,488 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_base-inl.h
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*         zhengda1936@gmail.com
+*
+*******************************************************************************/
+
+#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
+#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+#include <iterator>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include <utility>
+#include <algorithm>
+#include <memory>
+#include "mkldnn.hpp"
+#include "mxnet/ndarray.h"
+#include "mxnet/resource.h"
+#include "mxnet/op_attr_types.h"
+using namespace mkldnn;
+namespace mxnet {
+extern bool EnableMkldnnWarnGenerated();
+// =====  CpuEngine =======================================
+// cpu_engine singleton
+class CpuEngine {
+ public:
+  static CpuEngine *Get() {
+    // I's thread-safe in C++11.
+    static thread_local CpuEngine myInstance;
+    return &myInstance;
+  }
+  CpuEngine(CpuEngine const &) = delete;             // Copy construct
+  CpuEngine(CpuEngine &&) = delete;                  // Move construct
+  CpuEngine &operator=(CpuEngine const &) = delete;  // Copy assign
+  CpuEngine &operator=(CpuEngine &&) = delete;       // Move assign
+
+  mkldnn::engine &get_engine() { return _cpu_engine; }
+
+ protected:
+  CpuEngine() : _cpu_engine(mkldnn::engine::cpu, 0) {}
+  ~CpuEngine() {}
+
+ private:
+  mkldnn::engine _cpu_engine;
+};
+
+// type enumerator
+template <typename T>
+struct data_type_enum {};
+
+template <>
+struct data_type_enum<float> {
+  enum { type = mkldnn::memory::data_type::f32 };
+};
+
+template <>
+struct data_type_enum<int32_t> {
+  enum { type = mkldnn::memory::data_type::s32 };
+};
+
+template <>
+struct data_type_enum<int16_t> {
+  enum { type = mkldnn::memory::data_type::s16 };
+};
+
+template <>
+struct data_type_enum<int8_t> {
+  enum { type = mkldnn::memory::data_type::s8 };
+};
+
+template <>
+struct data_type_enum<uint8_t> {
+  enum { type = mkldnn::memory::data_type::u8 };
+};
+
+static inline bool SupportMKLDNNArray(int dtype, const TShape &shape) {
+  int ndim = shape.ndim();
+  bool support = ndim == 1 || ndim == 2 || ndim == 4;
+  support = support && (dtype == mshadow::kFloat32 || dtype == mshadow::kInt32
+                        || dtype == mshadow::kInt8 || dtype == mshadow::kUint8);
+  return support;
+}
+
+static inline bool SupportStorageMKLDNN(int stype) {
+  return stype == kDefaultStorage;
+}
+
+static inline bool SupportMKLDNN(int dtype, const TShape &shape) {
+  int ndim = shape.ndim();
+  return dtype == mshadow::kFloat32 && (ndim == 1 || ndim == 2 || ndim == 4);
+}
+
+static inline bool SupportMKLDNN(const NDArray &input) {
+  return SupportMKLDNN(input.dtype(), input.shape())
+      && SupportStorageMKLDNN(input.storage_type());
+}
+
+static inline bool SupportMKLDNNConv(const NDArray &input) {
+  return input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 4;
+}
+
+/*
+ * This is to align address to a certain alignment.
+ */
+void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space);
+
+namespace op {
+struct ActivationParam;
+bool SupportMKLDNNAct(const op::ActivationParam& param);
+}
+
+static int GetTypeSize(int dtype) {
+  int size = -1;
+  MSHADOW_TYPE_SWITCH(dtype, DType, {
+    size = sizeof(DType);
+  });
+  return size;
+}
+
+static inline size_t GetArraySize(const NDArray &arr) {
+  return arr.shape().Size() * GetTypeSize(arr.dtype());
+}
+
+static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) {
+  switch (dtype) {
+    case mshadow::kFloat32:
+      return mkldnn::memory::data_type::f32;
+    case mshadow::kInt32:
+      return mkldnn::memory::data_type::s32;
+    case mshadow::kInt8:
+      return mkldnn::memory::data_type::s8;
+    case mshadow::kUint8:
+      return mkldnn::memory::data_type::u8;
+    default:
+      LOG(FATAL) << "unknown type for MKLDNN";
+      return mkldnn::memory::data_type::data_undef;
+  }
+}
+
+inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int ndim) {
+  mkldnn::memory::dims dims(ndim);
+  for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i];
+  return mkldnn::memory::desc{dims, get_mkldnn_type(arr.dtype()),
+                              mkldnn::memory::format::any};
+}
+
+inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) {
+  return GetMemDesc(arr, arr.shape().ndim());
+}
+
+inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
+                                                 int num_groups) {
+  if (num_groups == 1) {
+    return GetMemDesc(arr);
+  } else {
+    CHECK_EQ(arr.shape().ndim(), 4U);
+    mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups,
+      static_cast<int>(arr.shape()[0] / num_groups),
+      static_cast<int>(arr.shape()[1]),
+      static_cast<int>(arr.shape()[2]),
+      static_cast<int>(arr.shape()[3])};
+    return mkldnn::memory::desc{tz, get_mkldnn_type(arr.dtype()),
+                                mkldnn::memory::format::any};
+  }
+}
+
+typedef std::shared_ptr<mkldnn::memory> mkldnn_mem_ptr;
+typedef std::shared_ptr<const mkldnn::memory> mkldnn_mem_const_ptr;
+
+/*
+ * This is to manage the temporary memory provided by MXNet for operators.
+ * The temp memory is mainly used to keep the reordered data. In an operator, we
+ * may need multiple pieces of memory for them. But MXNet can only provide
+ * a single piece of memory. This class is to help break the temporary memory
+ * from MXNet to store the reordered data.
+ * The amount of temporary memory used in an operator depends on the layout of
+ * input arrays and the operator. It's difficult to calculate it manually, so
+ * the class also estimate the amount of memory automatically.
+ */
+class TmpMemMgr {
+  // This points to the memory buffer where we can allocate temp memory.
+  char *curr_mem;
+  // The total size of the temp memory.
+  size_t mem_size;
+  // This contains the current available memory size.
+  size_t curr_size;
+  // This estimate the required temp memory size in an operator.
+  size_t est_size;
+  const size_t alignment = 4096;
+
+ public:
+  static TmpMemMgr *Get() {
+    static thread_local TmpMemMgr mgr;
+    return &mgr;
+  }
+
+  TmpMemMgr() {
+    Reset();
+    est_size = 0;
+    mem_size = 0;
+  }
+
+  void Reset() {
+    curr_mem = nullptr;
+    curr_size = 0;
+    // We don't reset est_size and mem_size because est_size contains the
+    // estimated temp memory size from the last run and mem_size contains the
+    // memroy size allocated in the last run.
+  }
+
+  void Init(const Resource &r) {
+    // If the last time, if we estimate that we need more memory, we should the
+    // larger memory size.
+    mem_size = std::max(mem_size, est_size);
+    if (mem_size > 0) {
+      // Let's allocate some extra memory. If we don't use some of them all the time,
+      // the OS won't physically allocate pages for them any way.
+      this->curr_size = mem_size * 2;
+      this->curr_mem = static_cast<char *>(r.get_host_space_internal(this->curr_size));
+    }
+    // reset est_size, so we can start to estimate the temp memory size.
+    this->est_size = 0;
+  }
+
+  mkldnn::memory *Alloc(const mkldnn::memory::primitive_desc &pd);
+};
+
+class MKLDNNStream {
+  std::vector<mkldnn::primitive> net;
+  // Here we hold all memory related to the operators in the stream.
+  std::vector<std::shared_ptr<const mkldnn::memory> > mem_holder;
+
+ public:
+  static MKLDNNStream *Get() {
+    static thread_local MKLDNNStream stream;
+    return &stream;
+  }
+
+  void RegisterPrim(const mkldnn::primitive &prim) { net.push_back(prim); }
+
+  void RegisterMem(std::shared_ptr<const mkldnn::memory> mem) {
+    mem_holder.push_back(mem);
+  }
+
+  bool HasOps() const {
+    return !net.empty();
+  }
+
+  void Submit() {
+    if (!net.empty())
+      mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+    net.clear();
+    mem_holder.clear();
+    TmpMemMgr::Get()->Reset();
+  }
+};
+
+class MKLDNNOpSignature {
+  std::vector<int> eles;
+  uint64_t hash;
+
+ public:
+  MKLDNNOpSignature() {
+    hash = 0;
+  }
+
+  explicit MKLDNNOpSignature(uint64_t hash) {
+    this->hash = hash;
+  }
+
+  /*
+   * We provide different methods to add signature to an op.
+   * For operations, such as convolutin and fully connected, which determines
+   * the optimal data layout for the op, we only need to use the shape and data
+   * type to sign the op. For other operations, such as activation, which uses
+   * whatever layout in the input array, we have to use the shape, the data type
+   * and the layout to sign the op.
+   */
+
+  void AddSign(const mkldnn::memory &mem) {
+    auto desc = mem.get_primitive_desc().desc();
+    hash = hash * 2 + desc.data.format;
+    eles.push_back(desc.data.format);
+    hash = hash * 2 + desc.data.data_type;
+    eles.push_back(desc.data.data_type);
+    for (int i = 0; i < desc.data.ndims; i++) {
+      hash = hash * 2 + desc.data.dims[i];
+      eles.push_back(desc.data.dims[i]);
+    }
+  }
+
+  void AddSign(const std::vector<NDArray> &arrs) {
+    for (auto &arr : arrs) {
+      AddSign(arr);
+    }
+  }
+
+  void AddSign(const NDArray &arr) {
+    if (arr.IsMKLDNNData()) {
+      AddSign(*(arr.GetMKLDNNData()));
+    } else {
+      hash = hash * 2 + arr.dtype();
+      eles.push_back(arr.dtype());
+      AddSign(arr.shape());
+    }
+  }
+
+  void AddSign(const TShape &shape) {
+    for (size_t i = 0; i < shape.ndim(); i++) {
+      hash = hash * 2 + shape[i];
+      eles.push_back(shape[i]);
+    }
+  }
+
+  void AddSign(int val) {
+    hash = hash * 2 + val;
+    eles.push_back(val);
+  }
+
+  bool operator==(const MKLDNNOpSignature &sign) const {
+    if (hash != sign.hash)
+      return false;
+    if (eles.size() != sign.eles.size())
+      return false;
+    for (size_t i = 0; i < eles.size(); i++)
+      if (eles[i] != sign.eles[i])
+        return false;
+    return true;
+  }
+
+  uint64_t GetHash() const {
+    return hash;
+  }
+};
+
+struct MKLDNNOpHash {
+  size_t operator()(const MKLDNNOpSignature &sign) const {
+    return sign.GetHash();
+  }
+};
+
+template<typename ParamType>
+class MKLDNNParamOpSign: public MKLDNNOpSignature {
+  const ParamType param;
+
+  static size_t hash(const ParamType &param) {
+    std::hash<ParamType> fn;
+    return fn(param);
+  }
+
+ public:
+  explicit MKLDNNParamOpSign(const ParamType &_param): MKLDNNOpSignature(
+      hash(_param)), param(_param) {
+  }
+
+  bool operator==(const MKLDNNParamOpSign<ParamType> &sign) const {
+    const MKLDNNOpSignature &this_upper = *this;
+    const MKLDNNOpSignature &other_upper = sign;
+    return this_upper == other_upper && param == sign.param;
+  }
+};
+
+enum OutDataOp {
+  Noop,
+  CopyBack,
+  AddBack,
+};
+
+typedef std::pair<OutDataOp, mkldnn::memory *> mkldnn_output_t;
+
+/*
+ * These two functions try to create MKLDNN memory in an NDArray based on `req'.
+ * The difference is that the first function can create MKLDNN memory with
+ * special layouts in an NDArray, while the second one can only create MKLDNN
+ * memory with default layouts.
+ * If these two functions are used, we have to call CommitOutput to write
+ * the output back to the output NDArray.
+ */
+mkldnn_output_t CreateMKLDNNMem(const NDArray &arr,
+                                const mkldnn::memory::primitive_desc &desc,
+                                OpReqType req);
+mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr,
+                                       const mkldnn::memory::primitive_desc &desc,
+                                       OpReqType req);
+/* This function has to be used with one of the functions above. */
+void CommitOutput(const NDArray &arr, const mkldnn_output_t &res);
+
+static inline void InvalidateOutputs(const std::vector<NDArray> &arrs,
+                                     const std::vector<OpReqType> &reqs) {
+  for (size_t i = 0; i < arrs.size(); i++) {
+    if (reqs[i] == kWriteTo || reqs[i] == kNullOp) {
+      const_cast<NDArray &>(arrs[i]).InvalidateMKLDNNData();
+    }
+  }
+}
+
+const mkldnn::memory *GetWeights(const NDArray &arr,
+                                 const mkldnn::memory::primitive_desc &target_pd,
+                                 int num_groups);
+
+mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc);
+mkldnn_memory_format_t GetDefaultFormat(int num_dims);
+mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd,
+                                                mkldnn_memory_format_t format);
+
+void FallBackCompute(FCompute fn, const nnvm::NodeAttrs &attrs,
+                     const OpContext &ctx,
+                     const std::vector<NDArray> &inputs,
+                     const std::vector<OpReqType> &req,
+                     const std::vector<NDArray> &outputs);
+
+/*
+ * This class is used to check the correctness of MKLDNN operators.
+ */
+class OpCheck {
+  std::vector<mxnet::NDArray> inputs;
+  std::vector<mxnet::NDArray> outputs;
+  bool backward;
+  size_t num_checks;
+
+ public:
+  OpCheck(bool backward, size_t num_checks) {
+    this->backward = backward;
+    this->num_checks = num_checks;
+  }
+
+  void Init(const std::vector<mxnet::NDArray> &inputs_,
+          const std::vector<mxnet::NDArray> &outputs_);
+
+  void Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
+           const mxnet::OpContext &ctx,
+           const std::vector<mxnet::NDArray> &inputs_,
+           const std::vector<mxnet::OpReqType> &req,
+           const std::vector<mxnet::NDArray> &outputs_);
+};
+
+#define MKLDNN_OPCHECK_INIT(backward, num_checks, inputs, outputs)  \
+    static bool debug = dmlc::GetEnv("MXNET_MKLDNN_DEBUG", false);  \
+    OpCheck check(backward, num_checks);                            \
+    if (debug) check.Init(inputs, outputs);
+
+#define MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs)    \
+    if (debug) check.Run(fn, attrs, ctx, inputs, req, outputs);
+
+}  // namespace mxnet
+#endif
+#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
new file mode 100644
index 000000000000..c34ca03a2809
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -0,0 +1,385 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#if MXNET_USE_MKLDNN == 1
+
+#include <atomic>
+#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
+
+namespace mxnet {
+
+void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space) {
+  if (size > *space)
+    return nullptr;
+  intptr_t addr = reinterpret_cast<intptr_t>(mem);
+  // If the address has been aligned, don't do anything.
+  intptr_t last_chunk = addr % alignment;
+  if (last_chunk == 0)
+    return mem;
+  intptr_t padding = alignment - last_chunk;
+  // If the buffer doesn't have enough space, we should return null here.
+  if (padding + size > *space)
+    return nullptr;
+  addr += padding;
+  *space -= padding;
+  CHECK_EQ(addr % alignment, 0);
+  return reinterpret_cast<void *>(addr);
+}
+
+mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) {
+  // We need to include the size of the memory used for alignment.
+  this->est_size += pd.get_size() + alignment;
+  void *mem = AlignMem(this->curr_mem, pd.get_size(), alignment, &this->curr_size);
+  if (mem) {
+    // The memory is allocated from the temporary memory space in the
+    // operator. It'll only become invalid after we exit from the operator.
+    mkldnn_mem_ptr ret(new mkldnn::memory(pd, mem));
+    MKLDNNStream::Get()->RegisterMem(ret);
+    CHECK_EQ(mem, mem);
+    this->curr_size -= pd.get_size();
+    this->curr_mem = static_cast<char *>(mem) + pd.get_size();
+    return ret.get();
+  } else {
+    LOG(WARNING) << "Allocate " << pd.get_size()
+        << " bytes with malloc directly";
+    mkldnn_mem_ptr ret(new mkldnn::memory(pd));
+    MKLDNNStream::Get()->RegisterMem(ret);
+    return ret.get();
+  }
+}
+
+mkldnn_output_t CreateMKLDNNMem(const NDArray &arr,
+                                const mkldnn::memory::primitive_desc &desc,
+                                OpReqType req) {
+  if (kAddTo == req) {
+    auto tmp = TmpMemMgr::Get()->Alloc(desc);
+    return mkldnn_output_t(OutDataOp::AddBack, tmp);
+  } else if (kWriteInplace == req) {
+    // MKLDNN ops may not support the case that the input and the output uses
+    // the same memory. Let's use an extra copy to make sure it always works.
+    auto tmp = TmpMemMgr::Get()->Alloc(desc);
+    return mkldnn_output_t(OutDataOp::CopyBack, tmp);
+  } else {
+    mkldnn::memory *mem = const_cast<NDArray &>(arr).CreateMKLDNNData(desc);
+    if (mem == nullptr) {
+      auto tmp = TmpMemMgr::Get()->Alloc(desc);
+      return mkldnn_output_t(OutDataOp::CopyBack, tmp);
+    } else {
+      return mkldnn_output_t(OutDataOp::Noop, mem);
+    }
+  }
+}
+
+mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr,
+                                       const mkldnn::memory::primitive_desc &desc,
+                                       OpReqType req) {
+  if (kAddTo == req) {
+    auto tmp = TmpMemMgr::Get()->Alloc(desc);
+    return mkldnn_output_t(OutDataOp::AddBack, tmp);
+  } else if (kWriteInplace == req) {
+    auto tmp = TmpMemMgr::Get()->Alloc(desc);
+    return mkldnn_output_t(OutDataOp::CopyBack, tmp);
+  } else {
+    auto _desc = desc;
+    auto def_format = GetDefaultFormat(_desc.desc());
+    mkldnn::memory *mem = nullptr;
+    if (def_format == _desc.desc().data.format) {
+      mem = const_cast<NDArray &>(arr).CreateMKLDNNData(desc);
+    }
+    if (mem == nullptr) {
+      auto tmp = TmpMemMgr::Get()->Alloc(desc);
+      return mkldnn_output_t(OutDataOp::CopyBack, tmp);
+    } else {
+      return mkldnn_output_t(OutDataOp::Noop, mem);
+    }
+  }
+}
+
+void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) {
+  if (res.first == CopyBack) {
+    const_cast<NDArray &>(arr).CopyFrom(*res.second);
+  } else if (res.first == AddBack) {
+    auto mem = arr.GetMKLDNNData(res.second->get_primitive_desc());
+    CHECK(mem != nullptr);
+    // We have to allocate new memory for the sum result.
+    auto sum_res = TmpMemMgr::Get()->Alloc(
+        res.second->get_primitive_desc());
+    op::Sum(*res.second, *mem, *sum_res);
+    const_cast<NDArray &>(arr).CopyFrom(*sum_res);
+  }
+}
+
+const mkldnn::memory *GetWeights(const NDArray &arr,
+                                 const mkldnn::memory::primitive_desc &target_pd,
+                                 int num_groups) {
+  const mkldnn::memory *mem = arr.GetMKLDNNData(target_pd);
+  // If the weight array already uses the target layout, simply return it
+  // directly.
+  if (mem)
+    return mem;
+
+  mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype());
+  auto engine = CpuEngine::Get()->get_engine();
+  if (arr.shape().ndim() == 2) {
+    mkldnn::memory::dims tz = mkldnn::memory::dims{
+      static_cast<int>(arr.shape()[0]), static_cast<int>(arr.shape()[1])};
+    mkldnn::memory::desc md =
+        mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi};
+    mkldnn::memory::primitive_desc pd =
+        mkldnn::memory::primitive_desc{md, engine};
+    mem = arr.GetMKLDNNData(pd);
+  } else if (arr.shape().ndim() == 4 && num_groups == 1) {
+    mkldnn::memory::dims tz = mkldnn::memory::dims{
+      static_cast<int>(arr.shape()[0]), static_cast<int>(arr.shape()[1]),
+          static_cast<int>(arr.shape()[2]), static_cast<int>(arr.shape()[3])};
+    mkldnn::memory::desc md =
+        mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw};
+    mkldnn::memory::primitive_desc pd =
+        mkldnn::memory::primitive_desc{md, engine};
+    mem = arr.GetMKLDNNData(pd);
+  } else if (arr.shape().ndim() == 4) {
+    mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups,
+      static_cast<int>(arr.shape()[0] / num_groups),
+      static_cast<int>(arr.shape()[1]),
+      static_cast<int>(arr.shape()[2]),
+      static_cast<int>(arr.shape()[3])};
+    mkldnn::memory::desc md =
+        mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw};
+    mkldnn::memory::primitive_desc pd =
+        mkldnn::memory::primitive_desc{md, engine};
+    mem = arr.GetMKLDNNData(pd);
+  } else {
+    LOG(FATAL) << "The weight array has an unsupported number of dimensions";
+    return nullptr;
+  }
+  if (mem == nullptr)
+    mem = arr.GetMKLDNNDataReorder(target_pd);
+  if (mem->get_primitive_desc() == target_pd) return mem;
+
+  auto ret = TmpMemMgr::Get()->Alloc(target_pd);
+  MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*mem, *ret));
+  return ret;
+}
+
+mkldnn_memory_format_t GetDefaultFormat(int num_dims) {
+  switch (num_dims) {
+    case 1: return mkldnn_x;
+    case 2: return mkldnn_nc;
+    case 4: return mkldnn_nchw;
+    case 5: return mkldnn_goihw;
+    default:
+      LOG(FATAL) << "Unsupported MKLDNN dimensions: " << num_dims;
+      return mkldnn_format_undef;
+  }
+}
+
+mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) {
+  if (desc.data.ndims == 1) {
+    return desc.data.format;
+  } else if (desc.data.ndims == 2) {
+    if (desc.data.format == mkldnn_io)
+      return mkldnn_oi;
+    else
+      return desc.data.format;
+  } else if (desc.data.ndims == 4) {
+    switch (desc.data.format) {
+      case mkldnn_nchw:
+      case mkldnn_nhwc:
+      case mkldnn_chwn:
+      case mkldnn_nChw8c:
+      case mkldnn_nChw16c:
+        return mkldnn_nchw;
+      case mkldnn_oihw:
+      case mkldnn_ihwo:
+      case mkldnn_hwio:
+      case mkldnn_OIhw8i8o:
+      case mkldnn_OIhw16i16o:
+      case mkldnn_OIhw8i16o2i:
+      case mkldnn_OIhw8o16i2o:
+      case mkldnn_OIhw8o8i:
+      case mkldnn_OIhw16o16i:
+      case mkldnn_IOhw16o16i:
+      case mkldnn_Oihw8o:
+      case mkldnn_Oihw16o:
+      case mkldnn_Ohwi8o:
+      case mkldnn_Ohwi16o:
+      case mkldnn_OhIw16o4i:
+        return mkldnn_oihw;
+      default:
+        LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format;
+        return mkldnn_format_undef;
+    }
+  } else if (desc.data.ndims == 5) {
+    switch (desc.data.format) {
+      case mkldnn_goihw:
+      case mkldnn_gOIhw8i8o:
+      case mkldnn_gOIhw16i16o:
+      case mkldnn_gOIhw8i16o2i:
+      case mkldnn_gOIhw8o16i2o:
+      case mkldnn_gOIhw8o8i:
+      case mkldnn_gOIhw16o16i:
+      case mkldnn_gIOhw16o16i:
+      case mkldnn_gOihw8o:
+      case mkldnn_gOihw16o:
+      case mkldnn_gOhwi8o:
+      case mkldnn_gOhwi16o:
+      case mkldnn_gOhIw16o4i:
+        return mkldnn_goihw;
+      default:
+        LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format;
+        return mkldnn_format_undef;
+    }
+  } else {
+    LOG(FATAL) << "Unsupported dimensions: " << desc.data.ndims;
+    return mkldnn_format_undef;
+  }
+}
+
+mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd,
+                                                mkldnn_memory_format_t format) {
+  mkldnn::memory::dims dims(pd.desc().data.ndims);
+  for (size_t i = 0; i < dims.size(); i++)
+    dims[i] = pd.desc().data.dims[i];
+  mkldnn::memory::format cpp_format = static_cast<mkldnn::memory::format>(format);
+  mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(
+      pd.desc().data.data_type);
+  mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
+  return mkldnn::memory::primitive_desc(data_md, pd.get_engine());
+}
+
+void FallBackCompute(FCompute fn, const nnvm::NodeAttrs &attrs,
+                     const OpContext &ctx,
+                     const std::vector<NDArray> &inputs,
+                     const std::vector<OpReqType> &req,
+                     const std::vector<NDArray> &outputs) {
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++) {
+      in_blobs[i] = inputs[i].data();
+  }
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++) {
+    if (req[i] == kWriteTo)
+      const_cast<NDArray &>(outputs[i]).InvalidateMKLDNNData();
+    CHECK(outputs[i].IsDefaultData());
+    out_blobs[i] = outputs[i].data();
+  }
+  fn(attrs, ctx, in_blobs, req, out_blobs);
+}
+
+template<typename DType>
+void print_diff(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2) {
+  DType *data1 = reinterpret_cast<DType *>(arr1.data().dptr_);
+  DType *data2 = reinterpret_cast<DType *>(arr2.data().dptr_);
+  for (size_t i = 0; i < arr1.shape().Size(); i++)
+    std::cout << data1[i] - data2[i] << ", ";
+  std::cout << std::endl;
+}
+
+template<typename DType>
+static bool SimilarArray(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2,
+                         DType rtol, DType atol) {
+  if (arr1.shape().Size() != arr2.shape().Size())
+    return false;
+
+  // This function should be used outside an MKLDNN operator.
+  // There shouldn't be any operators in the stream.
+  CHECK(!MKLDNNStream::Get()->HasOps());
+  // We need to reorder data in the arrays to the default layout.
+  // But we shouldn't reorder data in the original array.
+  NDArray buf1, buf2;
+  if (arr1.IsMKLDNNData()) {
+    buf1 = NDArray(arr1.shape(), arr1.ctx(), false, arr1.dtype());
+    auto mem = arr1.GetMKLDNNData();
+    buf1.CopyFrom(*mem);
+  }
+  if (arr2.IsMKLDNNData()) {
+    buf2 = NDArray(arr2.shape(), arr2.ctx(), false, arr2.dtype());
+    auto mem = arr2.GetMKLDNNData();
+    buf2.CopyFrom(*mem);
+  }
+  MKLDNNStream::Get()->Submit();
+
+  DType *data1 = reinterpret_cast<DType *>(
+      arr1.IsMKLDNNData() ? buf1.data().dptr_: arr1.data().dptr_);
+  DType *data2 = reinterpret_cast<DType *>(
+      arr2.IsMKLDNNData() ? buf2.data().dptr_: arr2.data().dptr_);
+  std::atomic<bool> success(true);
+#pragma omp parallel for
+  for (size_t i = 0; i < arr1.shape().Size(); i++) {
+    if (std::abs(data1[i] - data2[i]) > atol + rtol * std::abs(data2[i]))
+      success.store(false);
+  }
+  return success.load();
+}
+
+void OpCheck::Init(const std::vector<mxnet::NDArray> &inputs_,
+                   const std::vector<mxnet::NDArray> &outputs_) {
+  auto ctx = inputs_[0].ctx();
+  CHECK(!MKLDNNStream::Get()->HasOps());
+  for (size_t i = 0; i < inputs_.size(); i++) {
+    inputs.emplace_back(inputs_[i].shape(), ctx,
+                        false, inputs_[i].dtype());
+    auto mem = inputs_[i].GetMKLDNNData();
+    inputs[i].CopyFrom(*mem);
+  }
+  for (size_t i = 0; i < outputs_.size(); i++) {
+    outputs.emplace_back(outputs_[i].shape(), ctx,
+                         false, outputs_[i].dtype());
+    if (backward) {
+      auto mem = outputs_[i].GetMKLDNNData();
+      outputs[i].CopyFrom(*mem);
+    }
+  }
+  MKLDNNStream::Get()->Submit();
+}
+
+void OpCheck::Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
+                  const mxnet::OpContext &ctx,
+                  const std::vector<mxnet::NDArray> &inputs_,
+                  const std::vector<mxnet::OpReqType> &req,
+                  const std::vector<mxnet::NDArray> &outputs_) {
+  std::vector<mxnet::TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data();
+  std::vector<mxnet::TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++)
+    out_blobs[i] = outputs[i].data();
+  fn(attrs, ctx, in_blobs, req, out_blobs);
+
+  LOG(INFO) << "test " << attrs.op->name;
+  size_t num = std::min(outputs.size(), outputs_.size());
+  num = std::min(num_checks, num);
+  for (size_t i = 0; i < num; i++) {
+    // We don't need to compare if it doesn't need to output data.
+    if (req[i] == kNullOp)
+      continue;
+    MSHADOW_TYPE_SWITCH(outputs[i].dtype(), DType, {
+      bool similar = SimilarArray<DType>(outputs[i], outputs_[i], 1e-3, 1e-4);
+      if (!similar) {
+        LOG(ERROR) << attrs.op->name << " fails";
+        print_diff<DType>(outputs[i], outputs_[i]);
+      }
+      CHECK(similar);
+    });
+  }
+}
+
+}  // namespace mxnet
+
+#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
new file mode 100644
index 000000000000..19a98da6af83
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
@@ -0,0 +1,431 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_batch_norm.cc
+ * \brief
+ * \author Tao Lv
+*/
+
+#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
+#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+#include <vector>
+#include <utility>
+#include <mkldnn.hpp>
+#include "../batch_norm-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#define VARIANCE_TO_INVSTD(__var$,    __eps$)   (1.0/sqrt((__var$) + DType(__eps$)))
+#define INVSTD_TO_VARIANCE(__invstd$, __eps$)   ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$))
+namespace mxnet {
+namespace op {
+
+typedef mkldnn::batch_normalization_forward::primitive_desc     t_bn_f_pdesc;
+typedef mkldnn::batch_normalization_forward::desc               t_bn_f_desc;
+typedef mkldnn::batch_normalization_backward::primitive_desc    t_bn_b_pdesc;
+typedef mkldnn::batch_normalization_backward::desc              t_bn_b_desc;
+
+using mkldnn::use_global_stats;
+using mkldnn::use_scale_shift;
+using mkldnn::forward_training;
+using mkldnn::forward_inference;
+
+inline static unsigned _GetFlags(const std::vector<NDArray> &in_data,
+                                 const std::vector<NDArray> &aux_states,
+                                 const BatchNormParam &param, bool is_train) {
+  unsigned flags = 0U;
+  if (in_data.size() == 3U) {
+    flags |= use_scale_shift;
+  }
+
+  // aux_states[0]: inMean
+  // aux_states[1]: inVariance
+  if (aux_states.size() == 2U && !is_train) {
+    flags |= use_global_stats;
+  }
+  return flags;
+}
+
+template <typename DType>
+inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory &data_mem,
+                                   bool is_train,
+                                   DType eps,
+                                   unsigned flags) {
+  auto data_mpd   = data_mem.get_primitive_desc();
+  auto data_md    = data_mpd.desc();
+  auto engine     = CpuEngine::Get()->get_engine();
+
+  if (is_train) {
+    t_bn_f_desc bnFwd_desc(forward_training, data_md, eps, flags);
+    return t_bn_f_pdesc(bnFwd_desc, engine);
+  } else {
+    t_bn_f_desc bnFwd_desc(forward_inference, data_md, eps, flags);
+    return t_bn_f_pdesc(bnFwd_desc, engine);
+  }
+}
+
+template <typename DType>
+inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory &data_mem,
+                                   const mkldnn::memory &diff_mem,
+                                   DType eps,
+                                   unsigned flags) {
+  auto data_mpd   = data_mem.get_primitive_desc();
+  auto data_md    = data_mpd.desc();
+  auto diff_mpd   = diff_mem.get_primitive_desc();
+  auto diff_md    = diff_mpd.desc();
+  auto engine     = CpuEngine::Get()->get_engine();
+
+  t_bn_b_desc  bnBwd_desc(mkldnn::prop_kind::backward, diff_md, data_md, eps, flags);
+  return t_bn_b_pdesc(bnBwd_desc, engine, _GetFwd(data_mem, true, eps, flags));
+}
+
+typedef MKLDNNParamOpSign<BatchNormParam> MKLDNNBNSignature;
+
+class MKLDNNBNForward {
+  std::shared_ptr<const mkldnn::memory> data_m;
+  std::shared_ptr<const mkldnn::memory> weight_m;
+  std::shared_ptr<const mkldnn::memory> out_m;
+  std::shared_ptr<const mkldnn::memory> mean_m;
+  std::shared_ptr<const mkldnn::memory> var_m;
+  std::shared_ptr<mkldnn::batch_normalization_forward> fwd;
+  bool is_train;
+  t_bn_f_pdesc pd;
+
+ public:
+  MKLDNNBNForward(const t_bn_f_pdesc &_pd, bool is_train): pd(_pd) {
+    weight_m.reset(new mkldnn::memory(pd.weights_primitive_desc()));
+    this->is_train = is_train;
+  }
+
+  const mkldnn::memory &GetWeight() const {
+    return *weight_m;
+  }
+
+  const t_bn_f_pdesc &GetPd() const {
+    return pd;
+  }
+
+  const mkldnn::memory &GetMean() const {
+    return *mean_m;
+  }
+
+  const mkldnn::memory &GetVar() const {
+    return *var_m;
+  }
+
+  void SetDataHandle(const NDArray &data, const NDArray &mean,
+                     const NDArray &var, const mkldnn::memory &out) {
+    auto _data = data.GetMKLDNNData();
+    if (data_m) {
+      data_m->set_data_handle(_data->get_data_handle());
+    } else {
+      data_m.reset(new mkldnn::memory(_data->get_primitive_desc(),
+                                      _data->get_data_handle()));
+    }
+    if (out_m) {
+      out_m->set_data_handle(out.get_data_handle());
+    } else {
+      out_m.reset(new mkldnn::memory(out.get_primitive_desc(),
+                                     out.get_data_handle()));
+    }
+    auto mean_ptr = mean.data().dptr_;
+    if (mean_m) {
+      mean_m->set_data_handle(mean_ptr);
+    } else {
+      mean_m.reset(new mkldnn::memory(pd.mean_primitive_desc(),
+                                      mean_ptr));
+    }
+    auto var_ptr = var.data().dptr_;
+    if (var_m) {
+      var_m->set_data_handle(var_ptr);
+    } else {
+      var_m.reset(new mkldnn::memory(pd.variance_primitive_desc(),
+                                     var_ptr));
+    }
+
+    if (fwd == nullptr) {
+      if (!is_train)
+        fwd.reset(new mkldnn::batch_normalization_forward(
+                pd, *data_m, mkldnn::primitive::at(*mean_m),
+                mkldnn::primitive::at(*var_m), *weight_m, *out_m));
+      else
+        fwd.reset(new mkldnn::batch_normalization_forward(
+                pd, mkldnn::primitive::at(*data_m),
+                mkldnn::primitive::at(*weight_m), *out_m,
+                *mean_m, *var_m));
+    }
+  }
+
+  const mkldnn::batch_normalization_forward &GetFwd() const {
+    return *fwd;
+  }
+};
+
+template<typename DType>
+static MKLDNNBNForward &GetBNForward(const BatchNormParam& param,
+                                     const OpContext &ctx, const NDArray &in_data,
+                                     unsigned flags) {
+  static thread_local std::unordered_map<MKLDNNBNSignature, MKLDNNBNForward, MKLDNNOpHash> fwds;
+  MKLDNNBNSignature key(param);
+  key.AddSign(ctx.is_train);
+  key.AddSign(in_data);
+
+  auto it = fwds.find(key);
+  if (it == fwds.end()) {
+    auto fwd_pd = _GetFwd(*in_data.GetMKLDNNData(), ctx.is_train,
+                          (DType) param.eps, flags);
+    MKLDNNBNForward fwd(fwd_pd, ctx.is_train);
+    auto ins_ret = fwds.insert(std::pair<MKLDNNBNSignature, MKLDNNBNForward>(
+            key, fwd));
+    CHECK(ins_ret.second);
+    it = ins_ret.first;
+  }
+  return it->second;
+}
+
+template <typename DType>
+void MKLDNNBatchNormForward(const OpContext &ctx, const BatchNormParam &param,
+                            const std::vector<NDArray>   &in_data,
+                            const std::vector<OpReqType> &req,
+                            const std::vector<NDArray>   &out_data,
+                            const std::vector<NDArray>   &aux_states) {
+  TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
+  unsigned flags      = _GetFlags(in_data, aux_states, param, ctx.is_train);
+  const NDArray &data = in_data[batchnorm::kData];
+
+  auto &fwd = GetBNForward<DType>(param, ctx, data, flags);
+  const NDArray &out  = out_data[batchnorm::kOut];
+
+  // for output memory
+  auto out_mem = const_cast<NDArray &>(out).CreateMKLDNNData(fwd.GetPd().dst_primitive_desc());
+
+  // mxnet will always use scale shift.
+  // But if fix_gamma is true, then all scale elements will be set to 1.0f
+  if (flags & use_scale_shift) {
+    const NDArray &gamma    = in_data[batchnorm::kGamma];
+    const NDArray &beta     = in_data[batchnorm::kBeta];
+    CHECK_EQ(gamma.storage_type(), mxnet::kDefaultStorage);
+    CHECK_EQ(beta.storage_type(), mxnet::kDefaultStorage);
+
+    const mkldnn::memory &weight_mem = fwd.GetWeight();
+    DType* weight_buf = reinterpret_cast<DType *>(weight_mem.get_data_handle());
+
+    nnvm::dim_t channels_ = data.shape()[1];
+    CHECK(weight_mem.get_primitive_desc().get_size() == channels_ * sizeof(DType) * 2);
+    DType* weight_ptr = gamma.data().dptr<DType>();
+    DType* bias_ptr = beta.data().dptr<DType>();
+    if (!param.fix_gamma) {
+#pragma omp parallel for simd
+      for (int i = 0; i < channels_; i++) {
+        weight_buf[i] = weight_ptr[i];
+        weight_buf[channels_ + i] = bias_ptr[i];  // bias
+      }
+    } else if (IsBNWriting(req[batchnorm::kGamma])) {
+#pragma omp parallel for simd
+      for (int i = 0; i < channels_; i++) {
+        weight_buf[i] = (DType)1.0f;
+        weight_ptr[i] = (DType)1.0f;
+        weight_buf[channels_ + i] = bias_ptr[i];  // bias
+      }
+    } else {
+#pragma omp parallel for simd
+      for (int i = 0; i < channels_; i++) {
+        weight_buf[i] = (DType)1.0f;
+        weight_buf[channels_ + i] = bias_ptr[i];  // bias
+      }
+    }
+
+    if (!ctx.is_train) {
+      DType* omean    = out_data[batchnorm::kMean].data().dptr<DType>();
+      DType* ovar     = out_data[batchnorm::kVar].data().dptr<DType>();
+      DType* inmean   = aux_states[batchnorm::kMovingMean].data().dptr<DType>();
+      DType* invar    = aux_states[batchnorm::kMovingVar].data().dptr<DType>();
+      // to align with origin implmentation: batch_norm.cc: L164
+#pragma omp parallel for simd
+      for (int i = 0; i < channels_; i++) {
+        omean[i] = inmean[i];
+        ovar[i] = VARIANCE_TO_INVSTD(invar[i], param.eps);
+      }
+
+      fwd.SetDataHandle(data, aux_states[batchnorm::kMovingMean],
+                        aux_states[batchnorm::kMovingVar],
+                        *out_mem);
+      MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd());
+      MKLDNNStream::Get()->Submit();
+    } else {  // training
+      const NDArray &outMean  = out_data[batchnorm::kMean];
+      const NDArray &outVar   = out_data[batchnorm::kVar];
+      DType* omean    = outMean.data().dptr<DType>();
+      DType* ovar     = outVar.data().dptr<DType>();
+
+      fwd.SetDataHandle(data, outMean, outVar, *out_mem);
+      MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd());
+      MKLDNNStream::Get()->Submit();
+      DType* mean_mem_ptr = reinterpret_cast<DType*>(fwd.GetMean().get_data_handle());
+      DType* var_mem_ptr  = reinterpret_cast<DType*>(fwd.GetVar().get_data_handle());
+#pragma omp parallel for simd
+      for (int i = 0; i < channels_; i++) {
+        omean[i] = mean_mem_ptr[i];
+        ovar[i]  = VARIANCE_TO_INVSTD(var_mem_ptr[i], param.eps);
+      }
+    }
+  } else {  // no input gamma and beta
+      LOG(FATAL) << "MKLDNN batch normalization: should not reach here ...";
+  }
+}
+
+template <typename DType>
+void MKLDNNBatchNormBackward(const OpContext &ctx, const BatchNormParam &param,
+                             const std::vector<NDArray>    &out_grad,
+                             const std::vector<NDArray>    &in_data,
+                             const std::vector<NDArray>    &out_data,
+                             const std::vector<OpReqType>  &req,
+                             const std::vector<NDArray>    &in_grad,
+                             const std::vector<NDArray>    &aux_states) {
+  TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
+  CHECK_EQ(out_grad.size(), param.output_mean_var ? 3U : 1U);
+  CHECK_EQ(in_data.size(), 3U);
+  CHECK_EQ(out_data.size(), 3U);
+  CHECK_EQ(in_grad.size(), 3U);
+  unsigned flags = _GetFlags(in_data, aux_states, param, ctx.is_train);
+
+  const NDArray &data         = in_data[batchnorm::kData];
+  const NDArray &diff         = out_grad[batchnorm::kOut];
+  const NDArray &gradIn       = in_grad[batchnorm::kData];
+  const NDArray &moving_mean  = aux_states[batchnorm::kMovingMean];
+  const NDArray &moving_var   = aux_states[batchnorm::kMovingVar];
+  const NDArray &out_mean     = out_data[batchnorm::kMean];
+  const NDArray &out_var      = out_data[batchnorm::kVar];
+
+  CHECK(out_mean.IsDefaultData());
+  CHECK(out_var.IsDefaultData());
+  CHECK(moving_mean.IsDefaultData());
+  CHECK(moving_var.IsDefaultData());
+
+  auto data_mem  = data.GetMKLDNNData();
+  auto diff_mem  = diff.GetMKLDNNData();
+  // MKLDNN batchnorm should run on special layouts. If one of them isn't, we
+  // should reorder them.
+  if (data.IsDefaultData())
+    data_mem = data.GetMKLDNNDataReorder(diff_mem->get_primitive_desc());
+  else if (diff.IsDefaultData())
+    diff_mem = diff.GetMKLDNNDataReorder(data_mem->get_primitive_desc());
+  auto bwd_pd = _GetBwd(*data_mem, *diff_mem, param.eps, flags);
+  auto gradi_mem = const_cast<NDArray &>(gradIn).CreateMKLDNNData(data_mem->get_primitive_desc());
+
+  if (flags & use_scale_shift) {
+    const NDArray &gamma    = in_data[batchnorm::kGamma];
+    const NDArray &beta     = in_data[batchnorm::kBeta];
+    // TODO(tao): how to reuse this memory?
+    std::shared_ptr<const mkldnn::memory> weight_mem(
+                    new mkldnn::memory(bwd_pd.weights_primitive_desc()));
+
+    DType* weight_buf = reinterpret_cast<DType *>(weight_mem->get_data_handle());
+    nnvm::dim_t channels_ = data.shape()[1];
+    for (int i = 0; i < channels_; i++) {
+      if (!param.fix_gamma)
+        weight_buf[i] = (gamma.data().dptr<DType>())[i];   // weight
+      else
+        weight_buf[i] = (DType)1.0f;
+    }
+
+    for (int i = 0; i < channels_; i++) {
+      weight_buf[channels_ + i] = (beta.data().dptr<DType>())[i];  // bias
+    }
+
+    std::shared_ptr<const mkldnn::memory> gradw_mem(
+                    new mkldnn::memory(bwd_pd.diff_weights_primitive_desc()));
+    // training but no input mean and variance
+    if (ctx.is_train && !param.use_global_stats) {
+      DType* moving_mean_ptr  = reinterpret_cast<DType *>(moving_mean.data().dptr<DType>());
+      DType* moving_var_ptr   = reinterpret_cast<DType *>(moving_var.data().dptr<DType>());
+      DType* out_mean_ptr     = reinterpret_cast<DType *>(out_mean.data().dptr<DType>());
+      DType* out_var_ptr      = reinterpret_cast<DType *>(out_var.data().dptr<DType>());
+      mkldnn::memory var_mem(bwd_pd.variance_primitive_desc());
+      DType *tmp_var_ptr = reinterpret_cast<DType *>(var_mem.get_data_handle());
+
+      DType minus_mom = (1.0f - param.momentum);
+      for (int i = 0; i < channels_; i++) {
+        moving_mean_ptr[i] = moving_mean_ptr[i] * param.momentum +
+                             out_mean_ptr[i] * minus_mom;
+        float variance = INVSTD_TO_VARIANCE(out_var_ptr[i], param.eps);
+        tmp_var_ptr[i] = variance;
+        moving_var_ptr[i] = moving_var_ptr[i] * param.momentum +
+                            variance * minus_mom;
+      }
+
+      std::shared_ptr<const mkldnn::memory> out_mean_mem(
+                      new mkldnn::memory(bwd_pd.mean_primitive_desc(), out_mean_ptr));
+      std::shared_ptr<const mkldnn::memory> out_var_mem(
+                      new mkldnn::memory(bwd_pd.variance_primitive_desc(), out_var_ptr));
+
+      auto bn_bwd = mkldnn::batch_normalization_backward(bwd_pd,
+                                                         *data_mem,
+                                                         mkldnn::primitive::at(*out_mean_mem),
+                                                         mkldnn::primitive::at(var_mem),
+                                                         *diff_mem,
+                                                         *weight_mem,
+                                                         *gradi_mem,
+                                                         *gradw_mem);
+
+      MKLDNNStream::Get()->RegisterPrim(bn_bwd);
+      MKLDNNStream::Get()->Submit();
+    } else {
+      std::shared_ptr<const mkldnn::memory> imean_mem(
+                      new mkldnn::memory(bwd_pd.mean_primitive_desc(),
+                      moving_mean.data().dptr<DType>()));
+      std::shared_ptr<const mkldnn::memory> ivar_mem(
+                      new mkldnn::memory(bwd_pd.variance_primitive_desc(),
+                      moving_var.data().dptr<DType>()));
+      auto bn_bwd = mkldnn::batch_normalization_backward(bwd_pd,
+                                                         *data_mem,
+                                                         mkldnn::primitive::at(*imean_mem),
+                                                         mkldnn::primitive::at(*ivar_mem),
+                                                         *diff_mem,
+                                                         *weight_mem,
+                                                         *gradi_mem,
+                                                         *gradw_mem);
+
+      MKLDNNStream::Get()->RegisterPrim(bn_bwd);
+      MKLDNNStream::Get()->Submit();
+    }
+
+    // copy data from gradw_mem to in_grad[1] and in_grad[2]
+    DType* gw_buf = reinterpret_cast<DType *>(gradw_mem->get_data_handle());
+    for (int i = 0; i < channels_; i++) {
+      if (!param.fix_gamma)
+        (in_grad[1].data().dptr<DType>())[i] = gw_buf[i];
+      else
+        (in_grad[1].data().dptr<DType>())[i] = 0.0f;
+    }
+
+    for (int i = 0; i < channels_; i++) {
+      (in_grad[2].data().dptr<DType>())[i] = gw_buf[i + channels_];
+    }
+  } else {
+    LOG(FATAL) << "MKLDNN batch normalization backward: should not reach here ...";
+  }
+}
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN
+#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc
new file mode 100644
index 000000000000..d3e6e775020d
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_concat.cc
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_concat.cc
+ * \brief
+ * \author Wenting Jiang
+*/
+#include "../concat-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                         const std::vector<NDArray> &in_data,
+                         const std::vector<OpReqType> &req,
+                         const std::vector<NDArray> &out_data) {
+  TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]);
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  int num_in_data = param.num_args;
+  int concat_dim = param.dim;
+  std::vector<mkldnn::memory::primitive_desc> data_md;
+  std::vector<mkldnn::primitive::at> data_mem;
+  for (int i =0; i < num_in_data; i++) {
+      auto tmp_mem = in_data[i].GetMKLDNNData();
+      auto tmp_pd = tmp_mem->get_primitive_desc();
+      data_md.push_back(tmp_pd);
+      data_mem.push_back(*tmp_mem);
+  }
+  mkldnn::concat::primitive_desc fwd_pd(concat_dim, data_md);
+  auto engine = CpuEngine::Get()->get_engine();
+  auto out_mem = CreateMKLDNNMem(out_data[concat_enum::kOut],
+      fwd_pd.dst_primitive_desc(), req[concat_enum::kOut]);
+  MKLDNNStream::Get()->RegisterPrim(mkldnn::concat(fwd_pd, data_mem, *out_mem.second));
+  CommitOutput(out_data[concat_enum::kOut], out_mem);
+  MKLDNNStream::Get()->Submit();
+}
+
+void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                          const std::vector<NDArray>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<NDArray>& outputs) {
+  TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]);
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  int num_in_data = param.num_args;
+  int axis_ = param.dim;
+  auto engine = CpuEngine::Get()->get_engine();
+  auto gz_mem = inputs[0].GetMKLDNNData();
+  mkldnn::memory::primitive_desc gz_pd = gz_mem->get_primitive_desc();
+  /* init the offset */
+  mkldnn::memory::dims offsets = {0, 0, 0, 0};
+  for (int i = 0; i < num_in_data; i++) {
+    mkldnn::memory::dims diff_src_tz
+        = {static_cast<int>(inputs[i+1].shape()[0]),
+          static_cast<int>(inputs[i+1].shape()[1]),
+          static_cast<int>(inputs[i+1].shape()[2]),
+          static_cast<int>(inputs[i+1].shape()[3])};
+    auto diff_src_mpd = inputs[i+1].GetMKLDNNData()->get_primitive_desc();
+    auto gradi_mem_ = CreateMKLDNNMem(outputs[i], diff_src_mpd, req[i]);
+    // create view from gy to gxs[i]
+    std::shared_ptr<mkldnn::view::primitive_desc> view_pd;
+    view_pd.reset(new mkldnn::view::primitive_desc(gz_pd, diff_src_tz, offsets));
+    // create reorder primitive from gy to gxs[i]
+    mkldnn::reorder::primitive_desc reorder_pd(
+        view_pd.get()->dst_primitive_desc(), diff_src_mpd);
+    offsets[axis_] += diff_src_tz[axis_];
+    MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(
+            reorder_pd, *gz_mem, *gradi_mem_.second));
+    CommitOutput(outputs[i], gradi_mem_);
+  }
+  MKLDNNStream::Get()->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
new file mode 100644
index 000000000000..b94850aa620b
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -0,0 +1,357 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_convolution.cc
+ * \brief
+ * \author Da Zheng
+*/
+
+#include "../convolution-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+static mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
+    const ConvolutionParam& param, bool is_train, const NDArray &data,
+    const NDArray &weights, const NDArray *bias, const NDArray &output) {
+  auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetWeightDesc(weights, param.num_group);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  mkldnn::memory::dims strides{0, 0};
+  if (param.stride.ndim() == 2) {
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+  }
+  mkldnn::memory::dims padding{0, 0};
+  if (param.pad.ndim() == 2) {
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  }
+  if (param.dilate.ndim() == 0 && bias == nullptr) {
+    mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
+        data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero);
+    return mkldnn::convolution_forward::primitive_desc(desc, engine);
+  } else if (param.dilate.ndim() == 0) {
+    auto bias_md = GetMemDesc(*bias);
+    mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
+        data_md, weight_md, bias_md, out_md, strides, padding, padding,
+        mkldnn::padding_kind::zero);
+    return mkldnn::convolution_forward::primitive_desc(desc, engine);
+  } else {
+    mkldnn::memory::dims dilates{0, 0};
+    if (param.dilate.ndim() == 2) {
+      dilates[0] = param.dilate[0] - 1;
+      dilates[1] = param.dilate[1] - 1;
+    }
+    if (bias == nullptr) {
+      mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
+          data_md, weight_md, out_md, strides, dilates, padding, padding,
+          mkldnn::padding_kind::zero);
+      return mkldnn::convolution_forward::primitive_desc(desc, engine);
+    } else {
+      auto bias_md = GetMemDesc(*bias);
+      mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
+                                             data_md, weight_md, bias_md, out_md, strides,
+                                             dilates, padding, padding,
+                                             mkldnn::padding_kind::zero);
+      return mkldnn::convolution_forward::primitive_desc(desc, engine);
+    }
+  }
+}
+
+static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData(
+    const ConvolutionParam& param, const NDArray &data, const NDArray &weights,
+    const NDArray &output, const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetWeightDesc(weights, param.num_group);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  mkldnn::memory::dims strides{0, 0};
+  if (param.stride.ndim() == 2) {
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+  }
+  mkldnn::memory::dims padding{0, 0};
+  if (param.pad.ndim() == 2) {
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  }
+  if (param.dilate.ndim() == 0) {
+    mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
+        data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero);
+    return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd);
+  } else {
+    mkldnn::memory::dims dilates{0, 0};
+    if (param.dilate.ndim() == 2) {
+      dilates[0] = param.dilate[0] - 1;
+      dilates[1] = param.dilate[1] - 1;
+    }
+    mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
+        data_md, weight_md, out_md, strides, dilates, padding, padding,
+        mkldnn::padding_kind::zero);
+    return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd);
+  }
+}
+
+static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights(
+    const ConvolutionParam& param, const NDArray &data,
+    const NDArray &weights, const NDArray *bias, const NDArray &output,
+    const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetWeightDesc(weights, param.num_group);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  mkldnn::memory::dims strides{0, 0};
+  if (param.stride.ndim() == 2) {
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+  }
+  mkldnn::memory::dims padding{0, 0};
+  if (param.pad.ndim() == 2) {
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  }
+  if (param.dilate.ndim() == 0 && bias == nullptr) {
+    mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+        data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero);
+    return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
+  } else if (param.dilate.ndim() == 0) {
+    auto bias_md = GetMemDesc(*bias);
+    mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+        data_md, weight_md, bias_md, out_md, strides, padding, padding,
+        mkldnn::padding_kind::zero);
+    return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
+  } else {
+    mkldnn::memory::dims dilates{0, 0};
+    if (param.dilate.ndim() == 2) {
+      dilates[0] = param.dilate[0] - 1;
+      dilates[1] = param.dilate[1] - 1;
+    }
+    if (bias == nullptr) {
+      mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+          data_md, weight_md, out_md, strides, dilates, padding, padding,
+          mkldnn::padding_kind::zero);
+      return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
+    } else {
+      auto bias_md = GetMemDesc(*bias);
+      mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+                                                      data_md, weight_md, bias_md, out_md,
+                                                      strides, dilates, padding, padding,
+                                                      mkldnn::padding_kind::zero);
+      return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
+    }
+  }
+}
+
+class MKLDNNConvForward {
+  std::shared_ptr<mkldnn::convolution_forward> fwd;
+  std::shared_ptr<mkldnn::memory> data;
+  std::shared_ptr<mkldnn::memory> weight;
+  std::shared_ptr<mkldnn::memory> bias;
+  std::shared_ptr<mkldnn::memory> out;
+
+ public:
+  mkldnn::convolution_forward::primitive_desc fwd_pd;
+
+  MKLDNNConvForward(const ConvolutionParam& param, bool is_train,
+                    const NDArray &data, const NDArray &weights,
+                    const NDArray *bias, const NDArray &output): fwd_pd(
+                        GetConvFwdImpl(param, is_train, data, weights, bias, output)) {
+  }
+
+  void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &weight,
+                 const mkldnn::memory *bias, const mkldnn::memory &output) {
+    if (this->data == nullptr)
+      this->data = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+              fwd_pd.src_primitive_desc(), data.get_data_handle()));
+    else
+      this->data->set_data_handle(data.get_data_handle());
+
+    if (this->weight == nullptr)
+      this->weight = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+              fwd_pd.weights_primitive_desc(), weight.get_data_handle()));
+    else
+      this->weight->set_data_handle(weight.get_data_handle());
+
+    if (this->out == nullptr)
+      this->out = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+              fwd_pd.dst_primitive_desc(), output.get_data_handle()));
+    else
+      this->out->set_data_handle(output.get_data_handle());
+
+    if (bias != nullptr) {
+      if (this->bias == nullptr)
+        this->bias = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+                fwd_pd.bias_primitive_desc(), bias->get_data_handle()));
+      else
+        this->bias->set_data_handle(bias->get_data_handle());
+      if (this->fwd == nullptr)
+        this->fwd = std::shared_ptr<mkldnn::convolution_forward>(
+            new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data),
+                                            mkldnn::primitive::at(*this->weight),
+                                            mkldnn::primitive::at(*this->bias),
+                                            *this->out));
+    } else if (this->fwd == nullptr) {
+      this->fwd = std::shared_ptr<mkldnn::convolution_forward>(
+          new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data),
+                                          mkldnn::primitive::at(*this->weight),
+                                          *this->out));
+    }
+  }
+
+  const mkldnn::convolution_forward &GetFwd() const {
+    return *fwd;
+  }
+};
+
+typedef MKLDNNParamOpSign<ConvolutionParam> MKLDNNConvSignature;
+
+static inline MKLDNNConvForward &GetConvFwd(
+    const nnvm::NodeAttrs& attrs, bool is_train,
+    const NDArray &data, const NDArray &weights,
+    const NDArray *bias, const NDArray &output) {
+  static thread_local std::unordered_map<MKLDNNConvSignature, MKLDNNConvForward, MKLDNNOpHash> fwds;
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  MKLDNNConvSignature key(param);
+  key.AddSign(is_train);
+  // Here we can sign the conv op with NDArray because conv primitive will
+  // decide the right layout for the, so we only need to get the shape and the
+  // data type of the arrays.
+  key.AddSign(data);
+  key.AddSign(weights);
+  key.AddSign(output);
+  if (bias)
+    key.AddSign(*bias);
+
+  auto it = fwds.find(key);
+  if (it == fwds.end()) {
+    MKLDNNConvForward fwd(param, is_train, data, weights, bias, output);
+    auto ins_ret = fwds.insert(
+        std::pair<MKLDNNConvSignature, MKLDNNConvForward>(key, fwd));
+    CHECK(ins_ret.second);
+    it = ins_ret.first;
+  }
+  return it->second;
+}
+
+void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                               const std::vector<NDArray> &in_data,
+                               const std::vector<OpReqType> &req,
+                               const std::vector<NDArray> &out_data) {
+  TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  MKLDNNConvForward &fwd = GetConvFwd(attrs,
+      ctx.is_train, in_data[conv::kData], in_data[conv::kWeight],
+      param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]);
+
+  auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd.fwd_pd.src_primitive_desc());
+  const mkldnn::memory *weight_mem;
+  if (ctx.is_train) {
+    // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it
+    // to the default format for now.
+    if (in_data[conv::kWeight].IsMKLDNNData())
+      const_cast<NDArray &>(in_data[conv::kWeight]).Reorder2Default();
+    weight_mem = GetWeights(in_data[conv::kWeight], fwd.fwd_pd.weights_primitive_desc(),
+                            param.num_group);
+  } else {
+    // For inference, we want to reorder the weight array so we don't need to
+    // reorder data every time.
+    const_cast<NDArray &>(in_data[conv::kWeight]).MKLDNNDataReorder(
+        fwd.fwd_pd.weights_primitive_desc());
+    weight_mem = in_data[conv::kWeight].GetMKLDNNData();
+  }
+  auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd.fwd_pd.dst_primitive_desc(),
+                                 req[conv::kOut]);
+  const mkldnn::memory *bias_mem = nullptr;
+  if (!param.no_bias)
+    bias_mem = in_data[conv::kBias].GetMKLDNNDataReorder(fwd.fwd_pd.bias_primitive_desc());
+  fwd.SetNewMem(*data_mem, *weight_mem, bias_mem, *out_mem.second);
+  MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd());
+
+  CommitOutput(out_data[conv::kOut], out_mem);
+  MKLDNNStream::Get()->Submit();
+}
+
+void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                               const std::vector<NDArray>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<NDArray>& outputs) {
+  TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
+  const std::vector<NDArray> &in_grad = outputs;
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwdImpl(param, ctx.is_train,
+      inputs[conv::kData + 1], inputs[conv::kWeight + 1],
+      param.no_bias ? nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut]);
+
+  CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace";
+  mkldnn::convolution_backward_data::primitive_desc bwdData_pd
+    = GetConvBwdData(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1],
+        inputs[conv::kOut], fwd_pd);
+  auto out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder(
+      bwdData_pd.diff_dst_primitive_desc());
+  if (req[conv::kData]) {
+    auto weight_mem = GetWeights(inputs[conv::kWeight + 1],
+        bwdData_pd.weights_primitive_desc(), param.num_group);
+    auto in_grad_mem = CreateMKLDNNMem(in_grad[conv::kData],
+        bwdData_pd.diff_src_primitive_desc(), req[conv::kData]);
+    MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_data(bwdData_pd,
+          *out_grad_mem, *weight_mem, *in_grad_mem.second));
+    CommitOutput(in_grad[conv::kData], in_grad_mem);
+  }
+  if (req[conv::kWeight]) {
+    mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd
+        = GetConvBwdWeights(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1],
+                            param.no_bias ? nullptr : &inputs[conv::kBias + 1],
+                            inputs[conv::kOut], fwd_pd);
+    if (bwdData_pd.diff_dst_primitive_desc() != bwdWeights_pd.diff_dst_primitive_desc())
+      out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder(
+          bwdWeights_pd.diff_dst_primitive_desc());
+    auto data_mem = inputs[conv::kData + 1].GetMKLDNNDataReorder(
+        bwdWeights_pd.src_primitive_desc());
+    auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[conv::kWeight],
+                                                 bwdWeights_pd.diff_weights_primitive_desc(),
+                                                 req[conv::kWeight]);
+    mkldnn_output_t in_grad_bias;
+    if (param.no_bias) {
+      MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights(
+              bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second));
+    } else {
+      in_grad_bias = CreateMKLDNNMem(in_grad[conv::kBias],
+                                     bwdWeights_pd.diff_bias_primitive_desc(),
+                                     req[conv::kBias]);
+      MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights(
+              bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second,
+              *in_grad_bias.second));
+      CommitOutput(in_grad[conv::kBias], in_grad_bias);
+    }
+    CommitOutput(in_grad[conv::kWeight], in_grad_weight);
+  }
+  MKLDNNStream::Get()->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc
new file mode 100644
index 000000000000..71d540c969cd
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_copy.cc
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_softmax.cc
+ * \brief
+ * \author Da Zheng
+*/
+
+#include "../softmax-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                const NDArray &in_data, const OpReqType &req,
+                const NDArray &out_data) {
+  TmpMemMgr::Get()->Init(ctx.requested[0]);
+  auto in_mem = in_data.GetMKLDNNData();
+  if (req == kAddTo) {
+    TmpMemMgr::Get()->Init(ctx.requested[0]);
+    // We should try and force the output memory has the same format
+    // as the input memory. If not, we'll have to reorder memory.
+    auto out_mem = out_data.GetMKLDNNData(in_mem->get_primitive_desc());
+    if (out_mem == nullptr)
+      out_mem = out_data.GetMKLDNNData();
+    auto sum_res = TmpMemMgr::Get()->Alloc(out_mem->get_primitive_desc());
+    Sum(*in_mem, *out_mem, *sum_res);
+    const_cast<NDArray &>(out_data).CopyFrom(*sum_res);
+  } else {
+    const_cast<NDArray &>(out_data).CopyFrom(*in_mem);
+  }
+  MKLDNNStream::Get()->Submit();
+}
+
+}   // namespace op
+}   // namespace mxnet
+#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
new file mode 100644
index 000000000000..d336d6dedbea
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
@@ -0,0 +1,390 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_deconvolution.cc
+ * \brief
+ * \author Da Zheng, Rong Zhang (rong.a.zhang@intel.com)
+*/
+
+#if MXNET_USE_MKLDNN == 1
+
+#include "../deconvolution-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+static inline mkldnn::memory::desc GetBiasDesc(mkldnn::memory::desc md) {
+  mkldnn::memory::dims dims(1);
+  // This is convolution on 4D data. The second dimension is the channel.
+  dims[0] = md.data.dims[1];
+  return mkldnn::memory::desc(dims,
+      static_cast<mkldnn::memory::data_type>(md.data.data_type),
+      mkldnn::memory::format::any);
+}
+
+static mkldnn::convolution_forward::primitive_desc GetDeconvBwd_(
+    const mkldnn::memory::desc &data_md, const mkldnn::memory::desc &weights_md,
+    bool has_bias, const mkldnn::memory::desc &out_md,
+    const mkldnn::engine &engine, const mkldnn::memory::dims &strides,
+    const mkldnn::memory::dims &padding, const mkldnn::memory::dims &dilates) {
+  if (!has_bias) {
+    mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training,
+        mkldnn::algorithm::convolution_direct, out_md, weights_md, data_md, strides,
+        dilates, padding, padding, mkldnn::padding_kind::zero);
+    return mkldnn::convolution_forward::primitive_desc(desc, engine);
+  } else {
+    auto bias_md = GetBiasDesc(data_md);
+    mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training,
+        mkldnn::algorithm::convolution_direct, out_md, weights_md, bias_md,
+        data_md, strides, dilates, padding, padding, mkldnn::padding_kind::zero);
+    return mkldnn::convolution_forward::primitive_desc(desc, engine);
+  }
+}
+
+static mkldnn::convolution_backward_data::primitive_desc GetDeconvFwdImpl(
+    const DeconvolutionParam& param, const NDArray &data, const NDArray &weights,
+    bool has_bias, const NDArray &output) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetWeightDesc(weights, param.num_group);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  mkldnn::memory::dims strides{0, 0};
+  if (param.stride.ndim() == 2) {
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+  } else if (param.stride.ndim() == 1) {
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[0];
+  } else {
+    LOG(FATAL) << "Unsupported stride dim";
+  }
+  mkldnn::memory::dims padding{0, 0};
+  if (param.pad.ndim() == 2) {
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  } else if (param.pad.ndim() == 1) {
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[0];
+  } else {
+    LOG(FATAL) << "Unsupported pad dim";
+  }
+  mkldnn::memory::dims dilate{0, 0};
+  if (param.dilate.ndim() == 2) {
+    dilate[0] = param.dilate[0] - 1;
+    dilate[1] = param.dilate[1] - 1;
+  }
+  auto bwd_pd = GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine,
+      strides, padding, dilate);
+  mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
+      out_md, weight_md, data_md, strides, dilate, padding, padding,
+      mkldnn::padding_kind::zero);
+  return mkldnn::convolution_backward_data::primitive_desc(desc, engine, bwd_pd);
+}
+
+static mkldnn::convolution_forward::primitive_desc GetDeconvBwdData(
+    const DeconvolutionParam &param, const NDArray &data, const NDArray &weights,
+    bool has_bias, const NDArray &output) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetWeightDesc(weights, param.num_group);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  mkldnn::memory::dims strides{0, 0};
+  if (param.stride.ndim() == 2) {
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+  } else if (param.stride.ndim() == 1) {
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[0];
+  } else {
+    LOG(FATAL) << "Unsupported stride dim";
+  }
+  mkldnn::memory::dims padding{0, 0};
+  if (param.pad.ndim() == 2) {
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  } else if (param.pad.ndim() == 1) {
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[0];
+  } else {
+    LOG(FATAL) << "Unsupported pad dim";
+  }
+  mkldnn::memory::dims dilate{0, 0};
+  if (param.dilate.ndim() == 2) {
+    dilate[0] = param.dilate[0] - 1;
+    dilate[1] = param.dilate[1] - 1;
+  }
+  return GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine,
+      strides, padding, dilate);
+}
+
+static mkldnn::convolution_backward_weights::primitive_desc GetDeconvBwdWeights(
+    const DeconvolutionParam& param, const NDArray &data, const NDArray &weights,
+    bool has_bias, const NDArray &output,
+    const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetWeightDesc(weights, param.num_group);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  mkldnn::memory::dims strides{0, 0};
+  if (param.stride.ndim() == 2) {
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+  } else if (param.stride.ndim() == 1) {
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[0];
+  } else {
+    LOG(FATAL) << "Unsupported stride dim";
+  }
+  mkldnn::memory::dims padding{0, 0};
+  if (param.pad.ndim() == 2) {
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  } else if (param.pad.ndim() == 1) {
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[0];
+  } else {
+    LOG(FATAL) << "Unsupported pad dim";
+  }
+  mkldnn::memory::dims dilate{0, 0};
+  if (param.dilate.ndim() == 2) {
+    dilate[0] = param.dilate[0] - 1;
+    dilate[1] = param.dilate[1] - 1;
+  }
+  if (!has_bias) {
+    mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+        out_md, weight_md, data_md, strides, dilate, padding, padding, mkldnn::padding_kind::zero);
+    return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
+  } else {
+    auto bias_md = GetBiasDesc(data_md);
+    mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+        out_md, weight_md, bias_md, data_md, strides, dilate, padding, padding,
+        mkldnn::padding_kind::zero);
+    return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
+  }
+}
+
+class MKLDNNDeconvForward {
+  std::shared_ptr<mkldnn::convolution_backward_data> fwd;
+  std::shared_ptr<mkldnn::memory> data;
+  std::shared_ptr<mkldnn::memory> weight;
+  std::shared_ptr<mkldnn::memory> bias;
+  std::shared_ptr<mkldnn::memory> out;
+  OutDataOp data_op;
+
+ public:
+  MKLDNNDeconvForward(const DeconvolutionParam& param,
+                      const NDArray &data,
+                      const NDArray &weights,
+                      bool has_bias,
+                      const NDArray &output);
+  void SetDataHandle(const DeconvolutionParam& param,
+                     const OpContext &ctx,
+                     const std::vector<NDArray> &in_data,
+                     const std::vector<OpReqType> &req,
+                     const std::vector<NDArray> &out_data);
+
+  void Execute(const std::vector<NDArray> &out_data);
+
+ private:
+  mkldnn::convolution_backward_data::primitive_desc fwd_pd;
+};  // class MKLDNNDeconvForward
+
+MKLDNNDeconvForward::MKLDNNDeconvForward(const DeconvolutionParam& param,
+                                const NDArray &data,
+                                const NDArray &weights,
+                                bool has_bias,
+                                const NDArray &output)
+                                :fwd_pd(GetDeconvFwdImpl(param, data, weights, has_bias, output)) {
+  this->data = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+          fwd_pd.diff_dst_primitive_desc()));
+  this->weight = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+          fwd_pd.weights_primitive_desc()));
+  this->out = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+          fwd_pd.diff_src_primitive_desc()));
+  this->fwd = std::shared_ptr<mkldnn::convolution_backward_data>(
+    new mkldnn::convolution_backward_data(fwd_pd,
+                                          mkldnn::primitive::at(*this->data),
+                                          mkldnn::primitive::at(*this->weight),
+                                          *this->out));
+}
+
+void MKLDNNDeconvForward::SetDataHandle(const DeconvolutionParam& param,
+                                        const OpContext &ctx,
+                                        const std::vector<NDArray> &in_data,
+                                        const std::vector<OpReqType> &req,
+                                        const std::vector<NDArray> &out_data) {
+  auto data_mem = in_data[deconv::kData].GetMKLDNNDataReorder(
+      fwd_pd.diff_dst_primitive_desc());
+  const mkldnn::memory *weight_mem;
+  if (ctx.is_train) {
+    // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it
+    // to the default format for now.
+    if (in_data[deconv::kWeight].IsMKLDNNData())
+      const_cast<NDArray &>(in_data[deconv::kWeight]).Reorder2Default();
+    weight_mem = GetWeights(in_data[deconv::kWeight],
+                            fwd_pd.weights_primitive_desc(),
+                            param.num_group);
+  } else {
+    // For inference, we want to reorder the weight array so we don't need to
+    // reorder data every time.
+    const_cast<NDArray &>(in_data[deconv::kWeight]).MKLDNNDataReorder(
+        fwd_pd.weights_primitive_desc());
+    weight_mem = in_data[deconv::kWeight].GetMKLDNNData();
+  }
+  auto out_mem = CreateMKLDNNMem(out_data[deconv::kOut],
+      fwd_pd.diff_src_primitive_desc(), req[deconv::kOut]);
+  auto output = out_mem.second;
+  this->data->set_data_handle(data_mem->get_data_handle());
+  this->weight->set_data_handle(weight_mem->get_data_handle());
+  this->out->set_data_handle(output->get_data_handle());
+  this->data_op = out_mem.first;
+}
+
+void MKLDNNDeconvForward::Execute(const std::vector<NDArray> &out_data) {
+  MKLDNNStream::Get()->RegisterPrim(*fwd);
+  CommitOutput(out_data[deconv::kOut], mkldnn_output_t(this->data_op, this->out.get()));
+  MKLDNNStream::Get()->Submit();
+}
+
+static void MKLDNNDeconvFwdBiasPostProcess(const DeconvolutionParam& param,
+                                           const OpContext &ctx,
+                                           const std::vector<NDArray> &in_data,
+                                           const std::vector<NDArray> &out_data) {
+  // add bias, broadcast bias to dim 1: channel
+  if (!param.no_bias) {
+    // MKLDNN only supports float right now.
+    typedef float DType;
+    Stream<cpu> *s = ctx.get_stream<cpu>();
+    Tensor<cpu, 1, DType> bias = in_data[deconv::kBias].data().get<cpu, 1, DType>(s);
+    // If the output data is stored in a special MKLDNN format, data()
+    // automatically converts its format to the default format.
+    // Unfortunately, MKLDNN doesn't support broadcast.
+    Tensor<cpu, 4, DType> out_cpu = out_data[deconv::kOut].data().get<cpu, 4, DType>(s);
+    out_cpu += mshadow::expr::broadcast<1>(bias, out_cpu.shape_);
+  }
+}
+
+typedef MKLDNNParamOpSign<DeconvolutionParam> MKLDNNDeconvSignature;
+
+static inline MKLDNNDeconvForward &GetDeconvFwd(
+    const nnvm::NodeAttrs& attrs, const NDArray &data,
+    const NDArray &weights, const NDArray *bias,
+    const NDArray &output) {
+  static thread_local
+        std::unordered_map<MKLDNNDeconvSignature, MKLDNNDeconvForward, MKLDNNOpHash> fwds;
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  MKLDNNDeconvSignature key(param);
+  // Here we can sign the conv op with NDArray because conv primitive will
+  // decide the right layout for the, so we only need to get the shape and the
+  // data type of the arrays.
+  key.AddSign(data);
+  key.AddSign(weights);
+  key.AddSign(output);
+  if (bias)
+    key.AddSign(*bias);
+
+  auto it = fwds.find(key);
+  if (it == fwds.end()) {
+    bool has_bias = (bias != nullptr);
+    MKLDNNDeconvForward fwd(param, data, weights, has_bias, output);
+    auto ins_ret = fwds.insert(
+        std::pair<MKLDNNDeconvSignature, MKLDNNDeconvForward>(key, fwd));
+    CHECK(ins_ret.second);
+    it = ins_ret.first;
+  }
+  return it->second;
+}
+
+void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                                const std::vector<NDArray> &in_data,
+                                const std::vector<OpReqType> &req,
+                                const std::vector<NDArray> &out_data) {
+  TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]);
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+
+  MKLDNNDeconvForward &deconvFwd = GetDeconvFwd(
+      attrs, in_data[deconv::kData], in_data[deconv::kWeight],
+      param.no_bias ? nullptr : &in_data[deconv::kBias], out_data[deconv::kOut]);
+
+  deconvFwd.SetDataHandle(param, ctx, in_data, req, out_data);
+
+  deconvFwd.Execute(out_data);
+
+  MKLDNNDeconvFwdBiasPostProcess(param, ctx, in_data, out_data);
+}
+
+void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                                 const std::vector<NDArray>& inputs,
+                                 const std::vector<OpReqType>& req,
+                                 const std::vector<NDArray>& outputs) {
+  TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]);
+  const std::vector<NDArray> &in_grad = outputs;
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace";
+  mkldnn::convolution_forward::primitive_desc bwdData_pd = GetDeconvBwdData(
+      param, inputs[deconv::kData + 1], inputs[deconv::kWeight + 1], false,
+      inputs[deconv::kOut]);
+  auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder(
+      bwdData_pd.src_primitive_desc());
+  if (req[deconv::kData]) {
+    auto weight_mem = GetWeights(inputs[deconv::kWeight + 1],
+                                 bwdData_pd.weights_primitive_desc(),
+                                 param.num_group);
+    auto in_grad_mem = CreateMKLDNNMem(in_grad[deconv::kData],
+                                       bwdData_pd.dst_primitive_desc(),
+                                       req[deconv::kData]);
+    MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_forward(bwdData_pd,
+          *out_grad_mem, *weight_mem, *in_grad_mem.second));
+    CommitOutput(in_grad[deconv::kData], in_grad_mem);
+  }
+  if (req[deconv::kWeight]) {
+    mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd
+      = GetDeconvBwdWeights(param, inputs[deconv::kData + 1],
+          inputs[deconv::kWeight + 1], false, inputs[deconv::kOut], bwdData_pd);
+    if (bwdData_pd.src_primitive_desc() != bwdWeights_pd.src_primitive_desc())
+      out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder(
+          bwdWeights_pd.src_primitive_desc());
+    auto data_mem = inputs[deconv::kData + 1].GetMKLDNNDataReorder(
+        bwdWeights_pd.diff_dst_primitive_desc());
+    auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[deconv::kWeight],
+                                                 bwdWeights_pd.diff_weights_primitive_desc(),
+                                                 req[deconv::kWeight]);
+    MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights(
+          bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight.second));
+    CommitOutput(in_grad[deconv::kWeight], in_grad_weight);
+  }
+  MKLDNNStream::Get()->Submit();
+  if (!param.no_bias) {
+    typedef float DType;
+    Stream<cpu> *s = ctx.get_stream<cpu>();
+    Tensor<cpu, 1, DType> gbias = in_grad[deconv::kBias].data().get<cpu, 1, DType>(s);
+    // If there is bias, the out grad has already been converted to the default
+    // format, so this shouldn't cause any performance issues.
+    Tensor<cpu, 4, DType> grad = inputs[deconv::kOut].data().get<cpu, 4, DType>(s);
+    Assign(gbias, req[deconv::kBias], mshadow::expr::sumall_except_dim<1>(grad));
+  }
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
new file mode 100644
index 000000000000..a8b85bbeb151
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_fully_connected.cc
+ * \brief
+ * \author Da Zheng
+*/
+
+#include "../fully_connected-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+inline static mkldnn::inner_product_forward::primitive_desc GetIPFwd(
+    const NDArray &data, const NDArray &weight, const NDArray *bias,
+    const mkldnn::memory::desc &out_md) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetMemDesc(weight);
+  auto engine = CpuEngine::Get()->get_engine();
+  if (bias) {
+    auto bias_md = GetMemDesc(*bias);
+    mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training,
+        data_md, weight_md, bias_md, out_md);
+    return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine);
+  } else {
+    mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training,
+        data_md, weight_md, out_md);
+    return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine);
+  }
+}
+
+inline static mkldnn::inner_product_backward_data::primitive_desc GetIpBwdData(
+    const NDArray &data, const NDArray &weight, const NDArray &output,
+    mkldnn::inner_product_forward::primitive_desc ipFwd_pd) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetMemDesc(weight);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  mkldnn::inner_product_backward_data::desc desc(data_md, weight_md, out_md);
+  return mkldnn::inner_product_backward_data::primitive_desc(desc, engine, ipFwd_pd);
+}
+
+inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwdWeights(
+    const NDArray &data, const NDArray &weight, const NDArray *bias,
+    const NDArray &output, mkldnn::inner_product_forward::primitive_desc ipFwd_pd) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetMemDesc(weight);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  if (bias) {
+    auto bias_md = GetMemDesc(*bias);
+    mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md,
+        weight_md, bias_md, out_md);
+    return mkldnn::inner_product_backward_weights::primitive_desc(
+        ipBwdWeights_desc, engine, ipFwd_pd);
+  } else {
+    mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md,
+        weight_md, out_md);
+    return mkldnn::inner_product_backward_weights::primitive_desc(
+        ipBwdWeights_desc, engine, ipFwd_pd);
+  }
+}
+
+void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                     const std::vector<NDArray> &in_data,
+                     const std::vector<OpReqType> &req,
+                     const std::vector<NDArray> &out_data) {
+  TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]);
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  const TShape& ishape = in_data[fullc::kData].shape();
+  const TShape& oshape = out_data[fullc::kOut].shape();
+  NDArray weight = in_data[fullc::kWeight];
+  NDArray data = in_data[fullc::kData];
+  auto out_md = GetMemDesc(out_data[fullc::kOut]);
+  if (data.shape().ndim() != 2 && !param.flatten) {
+    data = data.MKLDNNDataReshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1),
+                                     ishape[ishape.ndim()-1]));
+    mkldnn::memory::dims out_dims{static_cast<int>(oshape.ProdShape(0, oshape.ndim()-1)),
+      static_cast<int>(oshape[ishape.ndim()-1])};
+    out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()),
+      mkldnn::memory::format::any);
+  } else if (data.shape().ndim() != 2) {
+    data = data.MKLDNNDataReshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())));
+    mkldnn::memory::dims out_dims{static_cast<int>(oshape[0]),
+      static_cast<int>(oshape.ProdShape(1, oshape.ndim()))};
+    out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()),
+      mkldnn::memory::format::any);
+  }
+
+  mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight,
+      param.no_bias ? nullptr : &in_data[fullc::kBias], out_md);
+  auto data_mem = data.GetMKLDNNDataReorder(ipFwd_pd.src_primitive_desc());
+  auto weight_mem = weight.GetMKLDNNDataReorder(ipFwd_pd.weights_primitive_desc());
+  auto out_mem = CreateMKLDNNMem(out_data[fullc::kOut],
+      ipFwd_pd.dst_primitive_desc(), req[fullc::kOut]);
+  if (param.no_bias) {
+    MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_forward(
+          ipFwd_pd, *data_mem, *weight_mem, *out_mem.second));
+  } else {
+    auto bias_mem = in_data[fullc::kBias].GetMKLDNNDataReorder(ipFwd_pd.bias_primitive_desc());
+    MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_forward(ipFwd_pd,
+          *data_mem, *weight_mem, *bias_mem, *out_mem.second));
+  }
+  CommitOutput(out_data[fullc::kOut], out_mem);
+  MKLDNNStream::Get()->Submit();
+}
+
+void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                      const std::vector<NDArray> &inputs,
+                      const std::vector<OpReqType> &req,
+                      const std::vector<NDArray> &outputs) {
+  TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]);
+  const std::vector<NDArray> &in_grad = outputs;
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  const TShape& ishape = inputs[fullc::kData + 1].shape();
+  const TShape& oshape = inputs[fullc::kOut].shape();
+
+  NDArray weight = inputs[fullc::kWeight + 1];
+  NDArray data = inputs[fullc::kData + 1];
+  if (data.shape().ndim() != 2 && !param.flatten)
+    data = data.MKLDNNDataReshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1),
+                                     ishape[ishape.ndim()-1]));
+  else if (data.shape().ndim() != 2)
+    data = data.MKLDNNDataReshape(Shape2(ishape[0],
+                                     ishape.ProdShape(1, ishape.ndim())));
+  NDArray out_grad = inputs[fullc::kOut];
+  if (out_grad.shape().ndim() != 2 && !param.flatten)
+    out_grad = out_grad.MKLDNNDataReshape(Shape2(oshape.ProdShape(0, oshape.ndim()-1),
+                                             oshape[oshape.ndim()-1]));
+  else if (out_grad.shape().ndim() != 2)
+    out_grad = out_grad.MKLDNNDataReshape(Shape2(oshape[0],
+                                             oshape.ProdShape(1, oshape.ndim())));
+
+  mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight,
+      param.no_bias ? nullptr : &in_grad[fullc::kBias], GetMemDesc(out_grad));
+
+  CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace";
+  if (req[fullc::kData]) {
+    mkldnn::inner_product_backward_data::primitive_desc ipBwdData_pd = GetIpBwdData(
+        data, weight, out_grad, ipFwd_pd);
+    auto out_grad_mem = out_grad.GetMKLDNNDataReorder(
+        ipBwdData_pd.diff_dst_primitive_desc());
+    auto weight_mem = weight.GetMKLDNNDataReorder(ipBwdData_pd.weights_primitive_desc());
+    auto in_grad_mem = CreateMKLDNNMem(in_grad[fullc::kData],
+                                       ipBwdData_pd.diff_src_primitive_desc(),
+                                       req[fullc::kData]);
+    MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_data(
+          ipBwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem.second));
+    CommitOutput(in_grad[fullc::kData], in_grad_mem);
+  }
+  if (req[fullc::kWeight]) {
+    mkldnn::inner_product_backward_weights::primitive_desc ipBwdWeights_pd
+      = GetIPBwdWeights(data, weight, param.no_bias ? nullptr : &in_grad[fullc::kBias],
+          out_grad, ipFwd_pd);
+    auto out_grad_mem = out_grad.GetMKLDNNDataReorder(
+        ipBwdWeights_pd.diff_dst_primitive_desc());
+    auto data_mem = data.GetMKLDNNDataReorder(ipBwdWeights_pd.src_primitive_desc());
+    auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[fullc::kWeight],
+                                                 ipBwdWeights_pd.diff_weights_primitive_desc(),
+                                                 req[fullc::kWeight]);
+    mkldnn_output_t in_grad_bias;
+    if (param.no_bias) {
+      MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_weights(
+            ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second));
+    } else {
+      in_grad_bias = CreateMKLDNNMem(in_grad[fullc::kBias],
+                                     ipBwdWeights_pd.diff_bias_primitive_desc(),
+                                     req[fullc::kBias]);
+      MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_weights(
+            ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second,
+            *in_grad_bias.second));
+    }
+    CommitOutput(in_grad[fullc::kWeight], in_grad_weight);
+    CommitOutput(in_grad[fullc::kBias], in_grad_bias);
+  }
+  MKLDNNStream::Get()->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/nn/mkldnn/mkldnn_lrn-inl.h b/src/operator/nn/mkldnn/mkldnn_lrn-inl.h
new file mode 100644
index 000000000000..9a9bf62b67d0
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_lrn-inl.h
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_lrn-inl.h 
+ * \brief
+ * \Author: Patric Zhao, patric.zhao@intel.com
+*/
+#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_
+#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+#include <mkldnn.hpp>
+#include "../lrn-inl.h"
+#include "./mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+inline algorithm GetMKLDNNLRNAlgo(const LRNParam &param) {
+  // TODO(Patric): lrn_within_channel will cause core dump in MKLDNN backward
+  //               Need to confirm with MKLDNN team and fix later
+  return algorithm::lrn_across_channels;
+}
+
+inline lrn_forward::primitive_desc GetLRNFwd(const LRNParam &param,
+                                             const bool is_train,
+                                             const memory::desc &src_md) {
+  const auto  engine = CpuEngine::Get()->get_engine();
+  const auto  alg = GetMKLDNNLRNAlgo(param);
+  const float alpha = param.alpha;
+  const float beta = param.beta;
+  const int   nsize = param.nsize;
+  const float k = param.knorm;
+  auto kind = prop_kind::forward_training;
+  if (is_train) {
+    kind = prop_kind::forward_training;
+  } else {
+    kind = prop_kind::forward_scoring;
+  }
+  lrn_forward::desc fwd_desc(kind, alg, src_md, nsize, alpha, beta, k);
+  return mkldnn::lrn_forward::primitive_desc(fwd_desc, engine);
+}
+
+inline mkldnn::lrn_backward::primitive_desc
+GetLRNBwd(const LRNParam &param,
+          const mkldnn::memory::desc &diff_in_md,
+          const mkldnn::memory::desc &diff_md,
+          const lrn_forward::primitive_desc &lrnFwd_desc) {
+  const auto engine = CpuEngine::Get()->get_engine();
+  const auto alg = GetMKLDNNLRNAlgo(param);
+  const float alpha = param.alpha;
+  const float beta = param.beta;
+  const int nsize = param.nsize;
+  const float k = param.knorm;
+
+  lrn_backward::desc lrnBwd_desc(alg, diff_in_md,
+                diff_md, nsize, alpha, beta, k);
+  return mkldnn::lrn_backward::primitive_desc(lrnBwd_desc,
+                               engine, lrnFwd_desc);
+}
+
+void MKLDNNLRNForward(const OpContext &ctx,
+                      const LRNParam &param,
+                      const NDArray &in_data,
+                      const OpReqType req,
+                      const NDArray &out_data) {
+  auto src_mem = in_data.GetMKLDNNData();
+  const auto src_md = src_mem->get_primitive_desc().desc();
+  const auto pdesc = GetLRNFwd(param, ctx.is_train, src_md);
+  auto dst_mem = const_cast<NDArray &>(out_data).CreateMKLDNNData(
+          pdesc.dst_primitive_desc());
+  if (ctx.is_train) {
+    std::shared_ptr<const mkldnn::memory> ws_mem(
+            new mkldnn::memory(pdesc.workspace_primitive_desc()));
+    MKLDNNStream::Get()->RegisterPrim(
+        lrn_forward(pdesc, mkldnn::primitive::at(*src_mem),
+        *ws_mem, *dst_mem));
+    MKLDNNStream::Get()->Submit();
+  } else {
+    MKLDNNStream::Get()->RegisterPrim(
+        lrn_forward(pdesc, mkldnn::primitive::at(*src_mem), *dst_mem));
+    MKLDNNStream::Get()->Submit();
+  }
+}
+
+void MKLDNNLRNBackward(const OpContext &ctx, const LRNParam &param,
+                       const NDArray &out_grad,
+                       const NDArray &in_data,
+                       const OpReqType req,
+                       const NDArray &in_grad) {
+  if (req == kNullOp) {
+    return;
+  }
+  // Repeat FW for getting workspace
+  auto data_mem = in_data.GetMKLDNNData();
+  const auto data_md = data_mem->get_primitive_desc().desc();
+  const auto pdesc_fwd = GetLRNFwd(param, ctx.is_train, data_md);
+
+  // TODO(Patric): To keep the function stateless, we can't pass workspace
+  //               from LRN forward to backward. We have to re-compute
+  //               LRN forward to get the workspace.
+  //               Will refine this code later.
+  std::shared_ptr<const mkldnn::memory> ws_mem(
+          new mkldnn::memory(pdesc_fwd.workspace_primitive_desc()));
+  std::shared_ptr<const mkldnn::memory> dst_temp(
+          new mkldnn::memory(pdesc_fwd.dst_primitive_desc()));
+  MKLDNNStream::Get()->RegisterPrim(
+          lrn_forward(pdesc_fwd, mkldnn::primitive::at(*data_mem),
+          *ws_mem, *dst_temp));
+
+  const auto data_in_md = pdesc_fwd.src_primitive_desc().desc();
+  auto diff_mem = out_grad.GetMKLDNNData();
+  const auto diff_md = diff_mem->get_primitive_desc().desc();
+  const auto pdesc_bwd = GetLRNBwd(param, data_in_md, diff_md, pdesc_fwd);
+  auto diff_src_mem = CreateMKLDNNMem(in_grad,
+          pdesc_bwd.diff_src_primitive_desc(), req);
+
+  MKLDNNStream::Get()->RegisterPrim(
+        lrn_backward(pdesc_bwd, mkldnn::primitive::at(*data_mem),
+        mkldnn::primitive::at(*diff_mem), *ws_mem, *diff_src_mem.second));
+  MKLDNNStream::Get()->Submit();
+}
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H__
diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
new file mode 100644
index 000000000000..9149cb0c6a94
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_ops-inl.h
+ * \brief
+ * \author Da Zheng
+*/
+
+#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_
+#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+
+#include <mxnet/io.h>
+#include <mxnet/base.h>
+#include <mxnet/ndarray.h>
+#include <mxnet/operator.h>
+#include <mxnet/operator_util.h>
+#include <dmlc/logging.h>
+#include <dmlc/optional.h>
+#include <vector>
+#include <mkldnn.hpp>
+
+namespace mxnet {
+namespace op {
+
+/* For fully connected. */
+void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                     const std::vector<NDArray> &in_data,
+                     const std::vector<OpReqType> &req,
+                     const std::vector<NDArray> &out_data);
+void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                      const std::vector<NDArray> &inputs,
+                      const std::vector<OpReqType> &req,
+                      const std::vector<NDArray> &outputs);
+
+/* For convolution. */
+void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                              const std::vector<NDArray> &in_data,
+                              const std::vector<OpReqType> &req,
+                              const std::vector<NDArray> &out_data);
+void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                               const std::vector<NDArray>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<NDArray>& outputs);
+
+/* For deconvolution */
+void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                                const std::vector<NDArray> &in_data,
+                                const std::vector<OpReqType> &req,
+                                const std::vector<NDArray> &out_data);
+void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                                 const std::vector<NDArray>& inputs,
+                                 const std::vector<OpReqType>& req,
+                                 const std::vector<NDArray>& outputs);
+
+/* For softmax */
+void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                          const NDArray &in_data, const OpReqType &req,
+                          const NDArray &out_data);
+
+/* For sum */
+void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                      const std::vector<NDArray> &inputs, const OpReqType &req,
+                      const NDArray &out_data);
+
+/* For copy */
+void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const NDArray &in_data, const OpReqType &req,
+    const NDArray &out_data);
+
+/* For concat */
+void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                         const std::vector<NDArray> &in_data,
+                         const std::vector<OpReqType> &req,
+                         const std::vector<NDArray> &out_data);
+void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                          const std::vector<NDArray>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<NDArray>& outputs);
+
+/* For activation */
+void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                             const NDArray &in_data, const OpReqType &req,
+                             const NDArray &out_data);
+void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                              const NDArray &out_grad, const NDArray &in_data,
+                              const OpReqType &req, const NDArray &in_grad);
+
+void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2,
+         const mkldnn::memory &out);
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN == 1
+
+#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
new file mode 100644
index 000000000000..4f2f71866e14
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_pooling-inl.h
+ * \brief
+*/
+#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_
+#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+
+#include <utility>
+#include <mkldnn.hpp>
+#include "../pooling-inl.h"
+#include "./mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+class MKLDNNPoolingFwd {
+ public:
+  MKLDNNPoolingFwd(const mxnet::NDArray &input,
+                   const mxnet::NDArray &output,
+                   const int kernel_h, const int kernel_w,
+                   const int stride_h, const int stride_w,
+                   const int padding_t, const int padding_b,
+                   const int padding_l, const int padding_r,
+                   const mkldnn::algorithm alg_kind,
+                   const bool with_workspace, const bool is_train) :
+                   is_train_(is_train),
+                   with_workspace_(with_workspace),
+                   alg_kind_(alg_kind),
+                   fwd_(nullptr), data_(nullptr), out_(nullptr), workspace_(nullptr) {
+    Init(input, output,
+         kernel_h, kernel_w, stride_h, stride_w,
+         padding_t, padding_b, padding_l, padding_r);
+  }
+
+  ~MKLDNNPoolingFwd() {}
+  void SetDataHandle(const mxnet::NDArray &data,
+                     const mxnet::NDArray &output,
+                     const mxnet::NDArray *workspace = nullptr);
+  void Execute();
+
+ private:
+  bool is_train_;
+  bool with_workspace_;
+  mkldnn::algorithm alg_kind_;
+  std::shared_ptr<mkldnn::pooling_forward::primitive_desc> fwd_pd_;
+  std::shared_ptr<mkldnn::pooling_forward> fwd_;
+  std::shared_ptr<mkldnn::memory> data_;
+  std::shared_ptr<mkldnn::memory> out_;
+  std::shared_ptr<mkldnn::memory> workspace_;
+
+ private:
+  void Init(const mxnet::NDArray &input,
+            const mxnet::NDArray &output,
+            const int kernel_h, const int kernel_w,
+            const int stride_h, const int stride_w,
+            const int padding_t, const int padding_b,
+            const int padding_l, const int padding_r);
+};
+
+inline bool SupportMKLDNNPooling(const PoolingParam &param) {
+  return param.kernel.ndim() == 2 &&
+         (param.pool_type == pool_enum::kMaxPooling ||
+          param.pool_type == pool_enum::kAvgPooling)
+         // This is a temporary fix. There is a bug in global pooling of MKLDNN.
+         && !param.global_pool;
+}
+
+inline bool SupportMKLDNNPooling(const PoolingParam &param,
+                                 const TShape &dshape) {
+  bool ret = SupportMKLDNNPooling(param);
+  if (!ret)
+    return false;
+
+  if (param.pooling_convention == pool_enum::kValid)
+    return true;
+
+  if (((dshape[2] + 2 * param.pad[0] - param.kernel[0]) % param.stride[0] == 0) &&
+      ((dshape[3] + 2 * param.pad[1] - param.kernel[1]) % param.stride[1] == 0))
+    return true;
+  else
+    return false;
+}
+
+inline bool MKLDNNRequireWorkspace(const PoolingParam &param) {
+  return param.pool_type != pool_enum::kAvgPooling;
+}
+
+typedef MKLDNNParamOpSign<PoolingParam> MKLDNNPoolingSignature;
+void MKLDNNPoolingCompute(const OpContext &ctx, const PoolingParam &param,
+                          const NDArray &in_data, const OpReqType req,
+                          const NDArray &out_data, const NDArray *workspace);
+
+void MKLDNNPoolingGradCompute(const OpContext &ctx, const PoolingParam &param,
+                              const NDArray &out_grad, const NDArray &in_data,
+                              const NDArray *workspace, const OpReqType req,
+                              const NDArray &in_grad);
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_pooling.cc b/src/operator/nn/mkldnn/mkldnn_pooling.cc
new file mode 100644
index 000000000000..6eeecaf07271
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_pooling.cc
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_pooling.cc
+ * \brief
+ * \author Tao Lv
+*/
+
+#if MXNET_USE_MKLDNN == 1
+
+#include "./mkldnn_pooling-inl.h"
+
+namespace mxnet {
+namespace op {
+
+void MKLDNNPoolingFwd::Init(const mxnet::NDArray &input, const mxnet::NDArray &output,
+                            const int kernel_h,  const int kernel_w,
+                            const int stride_h,  const int stride_w,
+                            const int padding_t, const int padding_b,
+                            const int padding_l, const int padding_r) {
+  // mkldnn::memory::desc
+  auto src_md = input.GetMKLDNNData()->get_primitive_desc().desc();
+  mkldnn::memory::dims dims = {src_md.data.dims[0],
+                               src_md.data.dims[1],
+                               static_cast<int>(output.shape()[2]),
+                               static_cast<int>(output.shape()[3])};
+  auto dst_md = mkldnn::memory::desc({dims},
+                                     static_cast<mkldnn::memory::data_type>(src_md.data.data_type),
+                                     static_cast<mkldnn::memory::format>(src_md.data.format));
+  const mkldnn::engine engine = CpuEngine::Get()->get_engine();
+  const mkldnn::algorithm alg_kind = this->alg_kind_;
+  if (alg_kind != mkldnn::algorithm::pooling_max &&
+      alg_kind != mkldnn::algorithm::pooling_avg &&
+      alg_kind != mkldnn::algorithm::pooling_avg_include_padding &&
+      alg_kind != mkldnn::algorithm::pooling_avg_exclude_padding) {
+    LOG(FATAL) << "MKLDNN Pooling: algorithm is not supported";
+  }
+
+  mkldnn::prop_kind prop = mkldnn::prop_kind::forward_scoring;
+  if (this->is_train_ && alg_kind != mkldnn::algorithm::pooling_avg) {
+    prop = mkldnn::prop_kind::forward_training;
+  }
+  if (this->is_train_ && prop == mkldnn::prop_kind::forward_scoring) {
+    LOG(INFO) << "MKLDNN Pooling: training with prop_kind is forward_scoring";
+  }
+
+  const mkldnn::memory::dims strides = {stride_h,  stride_w  };
+  const mkldnn::memory::dims pad_l   = {padding_t, padding_l };
+  const mkldnn::memory::dims pad_r   = {padding_b, padding_r };
+  const mkldnn::memory::dims kernel  = {kernel_h,  kernel_w  };
+  // mkldnn::pooling_forward::desc
+  const auto fwd_desc = mkldnn::pooling_forward::desc(prop, alg_kind, src_md, dst_md,
+                                                      strides, kernel, pad_l, pad_r,
+                                                      mkldnn::padding_kind::zero);
+  this->fwd_pd_.reset(new mkldnn::pooling_forward::primitive_desc(fwd_desc, engine));
+  this->data_.reset(new mkldnn::memory(input.GetMKLDNNData()->get_primitive_desc()));
+  this->out_.reset(new mkldnn::memory(this->fwd_pd_->dst_primitive_desc()));
+  if (this->with_workspace_) {
+    this->workspace_.reset(new mkldnn::memory(this->fwd_pd_->workspace_primitive_desc()));
+    this->fwd_.reset(new mkldnn::pooling_forward(*(this->fwd_pd_),
+                                                 mkldnn::primitive::at(*(this->data_)),
+                                                 *(this->out_),
+                                                 *(this->workspace_)));
+  } else {
+    this->fwd_.reset(new mkldnn::pooling_forward(*(this->fwd_pd_),
+                                                 mkldnn::primitive::at(*(this->data_)),
+                                                 *(this->out_)));
+  }
+  return;
+}
+
+void MKLDNNPoolingFwd::SetDataHandle(const mxnet::NDArray &data,
+                                     const mxnet::NDArray &output,
+                                     const mxnet::NDArray *workspace) {
+  // mkldnn::memory
+  auto data_mem = data.GetMKLDNNData();
+  auto out_mem = const_cast<NDArray&>(output).CreateMKLDNNData(
+                                                  this->fwd_pd_->dst_primitive_desc());
+  this->data_->set_data_handle(data_mem->get_data_handle());
+  this->out_->set_data_handle(out_mem->get_data_handle());
+  if (this->with_workspace_ && workspace == nullptr) {
+    LOG(FATAL) << "MKLDNN Pooling: incorrect workspace input";
+  }
+
+  if (this->with_workspace_) {
+    // mkldnn::memory
+    auto ws_mem = workspace->GetMKLDNNData();
+    this->workspace_->set_data_handle(ws_mem->get_data_handle());
+  }
+}
+
+void MKLDNNPoolingFwd::Execute() {
+  if (this->fwd_) {
+    MKLDNNStream::Get()->RegisterPrim(*(this->fwd_));
+    MKLDNNStream::Get()->Submit();
+  } else {
+    LOG(FATAL) << "MKLDNN Pooling: forward primitive is nullptr";
+  }
+}
+
+mkldnn::algorithm GetMKLDNNPoolAlgo(const PoolingParam &param) {
+  switch (param.pool_type) {
+    case pool_enum::kMaxPooling:
+      return mkldnn::algorithm::pooling_max;
+      break;
+    case pool_enum::kAvgPooling:
+      return mkldnn::algorithm::pooling_avg_include_padding;
+      break;
+    default:
+      LOG(FATAL) << "MKLDNN Pooling: Unknown pooling method.";
+      return mkldnn::algorithm::pooling_max;
+  }
+}
+
+mkldnn::pooling_forward::primitive_desc GetPoolingFwd(const PoolingParam &param,
+                                                      const bool is_train,
+                                                      const memory::desc &data_md,
+                                                      const memory::desc &out_md) {
+  CHECK_EQ(param.kernel.ndim(), 2) << "Not Implemented";
+  int kernel_h_, kernel_w_;
+  if (param.global_pool) {
+    kernel_h_ = data_md.data.dims[2];
+    kernel_w_ = data_md.data.dims[3];
+  } else {
+    kernel_h_ = param.kernel[0];
+    kernel_w_ = param.kernel[1];
+  }
+
+  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+
+  const int pad_t_ = param.pad[0], pad_b_ = param.pad[0];
+  const int pad_l_ = param.pad[1], pad_r_ = param.pad[1];
+  const int stride_h_ = param.stride[0], stride_w_ = param.stride[1];
+
+  const mkldnn::engine engine = CpuEngine::Get()->get_engine();
+  if (param.global_pool) {
+    CHECK(pad_t_ == 0 && pad_l_ == 0 && stride_h_ == 1 && stride_w_ == 1)
+        << "With Global_pooling: true; only pad = 0 and stride = 1";
+  }
+  if (pad_t_ != 0 || pad_l_ != 0) {
+    CHECK(param.pool_type == pool_enum::kAvgPooling ||
+          param.pool_type == pool_enum::kMaxPooling)
+        << "Padding implemented only for average and max pooling.";
+    CHECK_LT(pad_l_, kernel_w_);
+    CHECK_LT(pad_t_, kernel_h_);
+  }
+
+
+  const mkldnn::algorithm alg = GetMKLDNNPoolAlgo(param);
+  mkldnn::prop_kind kind = mkldnn::prop_kind::forward_scoring;
+  if (is_train && alg != algorithm::pooling_avg) {
+    kind = mkldnn::prop_kind::forward_training;
+  }
+
+  const pooling_forward::desc poolingFwd_desc(kind, alg, data_md, out_md,
+                                              {static_cast<int>(stride_h_),
+                                               static_cast<int>(stride_w_)},
+                                              {kernel_h_, kernel_w_},
+                                              {static_cast<int>(pad_t_),
+                                               static_cast<int>(pad_l_)},
+                                              {static_cast<int>(pad_b_),
+                                               static_cast<int>(pad_r_)},
+                                              padding_kind::zero);
+  return mkldnn::pooling_forward::primitive_desc(poolingFwd_desc, engine);
+}
+
+MKLDNNPoolingFwd &GetPoolingFwd(const PoolingParam &param,
+                                const bool is_train,
+                                const NDArray &data,
+                                const NDArray &output) {
+  static thread_local std::unordered_map<MKLDNNPoolingSignature,
+                                         MKLDNNPoolingFwd,
+                                         MKLDNNOpHash> pooling_fwds;
+
+  bool with_workspace = is_train && MKLDNNRequireWorkspace(param);
+  MKLDNNPoolingSignature key(param);
+  key.AddSign(is_train);
+  key.AddSign(with_workspace);
+  key.AddSign(data);
+  key.AddSign(output);
+
+  auto it = pooling_fwds.find(key);
+  if (it == pooling_fwds.end()) {
+    CHECK_EQ(param.kernel.ndim(), 2) << "Not Implemented";
+    auto data_md = data.GetMKLDNNData()->get_primitive_desc().desc();
+    int kernel_h_, kernel_w_;
+    if (param.global_pool) {
+      kernel_h_ = data_md.data.dims[2];
+      kernel_w_ = data_md.data.dims[3];
+    } else {
+      kernel_h_ = param.kernel[0];
+      kernel_w_ = param.kernel[1];
+    }
+
+    CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+    CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+
+    const int pad_t_ = param.pad[0], pad_b_ = param.pad[0];
+    const int pad_l_ = param.pad[1], pad_r_ = param.pad[1];
+    const int stride_h_ = param.stride[0], stride_w_ = param.stride[1];
+
+    if (param.global_pool) {
+        CHECK(pad_t_ == 0 && pad_l_ == 0 && stride_h_ == 1 && stride_w_ == 1)
+            << "With Global_pooling: true; only pad = 0 and stride = 1";
+    }
+
+    if (pad_t_ != 0 || pad_l_ != 0) {
+        CHECK(param.pool_type == pool_enum::kAvgPooling ||
+              param.pool_type == pool_enum::kMaxPooling)
+              << "Padding implemented only for average and max pooling.";
+        CHECK_LT(pad_l_, kernel_w_);
+        CHECK_LT(pad_t_, kernel_h_);
+    }
+
+    const mkldnn::algorithm alg = GetMKLDNNPoolAlgo(param);
+    MKLDNNPoolingFwd fwd(data, output, kernel_h_, kernel_w_, stride_h_, stride_w_,
+                         pad_t_, pad_b_, pad_l_, pad_r_, alg, with_workspace, is_train);
+    auto ins_ret = pooling_fwds.insert(
+        std::pair<MKLDNNPoolingSignature, MKLDNNPoolingFwd>(key, fwd));
+    CHECK(ins_ret.second);
+    it = ins_ret.first;
+  }
+  return it->second;
+}
+
+void MKLDNNPoolingCompute(const OpContext &ctx, const PoolingParam &param,
+                          const NDArray &in_data, const OpReqType req,
+                          const NDArray &out_data, const NDArray *workspace) {
+  auto fwd = GetPoolingFwd(param, ctx.is_train, in_data, out_data);
+  fwd.SetDataHandle(in_data, out_data, workspace);
+  fwd.Execute();
+}
+
+void MKLDNNPoolingGradCompute(const OpContext &ctx, const PoolingParam &param,
+                              const NDArray &out_grad, const NDArray &in_data,
+                              const NDArray *workspace, const OpReqType req,
+                              const NDArray &in_grad) {
+  if (req == kNullOp) {
+    return;
+  }
+
+  TmpMemMgr::Get()->Init(ctx.requested[0]);
+  // mkldnn::memory
+  auto diff_dst_mem = out_grad.GetMKLDNNData();
+  auto input_mem = in_data.GetMKLDNNData();
+  mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc();
+  const mkldnn::memory::desc data_md = data_mpd.desc();
+  const memory::dims dims = {data_md.data.dims[0], data_md.data.dims[1],
+                             static_cast<int>(out_grad.shape()[2]),
+                             static_cast<int>(out_grad.shape()[3])};
+  const memory::desc out_md({dims},
+                            static_cast<memory::data_type>(data_md.data.data_type),
+                            static_cast<memory::format>(data_md.data.format));
+  auto pdesc_fwd = GetPoolingFwd(param, ctx.is_train, data_md, out_md);
+
+  const mkldnn::memory::desc diff_md = diff_dst_mem->get_primitive_desc().desc();
+  const memory::dims dims1 = {diff_md.data.dims[0], diff_md.data.dims[1],
+                              static_cast<int>(in_grad.shape()[2]),
+                              static_cast<int>(in_grad.shape()[3])};
+  const memory::desc diff_in_md(
+      {dims1}, static_cast<memory::data_type>(diff_md.data.data_type),
+      static_cast<memory::format>(diff_md.data.format));
+  const mkldnn::engine  cpu_engine = data_mpd.get_engine();
+  const mkldnn::algorithm alg = GetMKLDNNPoolAlgo(param);
+
+  int kernel_h_, kernel_w_;
+  if (param.global_pool) {
+    kernel_h_ = data_md.data.dims[2];
+    kernel_w_ = data_md.data.dims[3];
+  } else {
+    kernel_h_ = param.kernel[0];
+    kernel_w_ = param.kernel[1];
+  }
+  const pooling_backward::desc desc(alg, diff_in_md, diff_md,
+                                    {static_cast<int>(param.stride[0]),
+                                     static_cast<int>(param.stride[1])},
+                                    {kernel_h_, kernel_w_},
+                                    {static_cast<int>(param.pad[0]),
+                                     static_cast<int>(param.pad[1])},
+                                    {static_cast<int>(param.pad[0]),
+                                     static_cast<int>(param.pad[1])},
+                                    mkldnn::padding_kind::zero);
+  const pooling_backward::primitive_desc pdesc(desc, cpu_engine, pdesc_fwd);
+
+  auto diff_src_mem =
+      CreateMKLDNNMem(in_grad, pdesc.diff_src_primitive_desc(), req);
+
+  if (MKLDNNRequireWorkspace(param)) {
+    CHECK(workspace != nullptr);
+    auto workspace_mem = workspace->GetMKLDNNData();
+    MKLDNNStream::Get()->RegisterPrim(
+        pooling_backward(pdesc, *diff_dst_mem, primitive::at(*workspace_mem),
+                         *diff_src_mem.second));
+  } else {
+    MKLDNNStream::Get()->RegisterPrim(
+        pooling_backward(pdesc, *diff_dst_mem, *diff_src_mem.second));
+  }
+  CommitOutput(in_grad, diff_src_mem);
+  MKLDNNStream::Get()->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/nn/mkldnn/mkldnn_softmax.cc b/src/operator/nn/mkldnn/mkldnn_softmax.cc
new file mode 100644
index 000000000000..aa59f13d06da
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_softmax.cc
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_softmax.cc
+ * \brief
+ * \author Da Zheng
+*/
+
+#include "../softmax-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                          const NDArray &in_data, const OpReqType &req,
+                          const NDArray &out_data) {
+  const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
+  auto input_mem = in_data.GetMKLDNNData();
+  mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc();
+  mkldnn::memory::desc data_md = data_mpd.desc();
+  auto cpu_engine = data_mpd.get_engine();
+  auto prop = ctx.is_train
+    ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
+  mkldnn::softmax_forward::desc desc = mkldnn::softmax_forward::desc(prop,
+      data_md, param.axis);
+  mkldnn::softmax_forward::primitive_desc pdesc(desc, cpu_engine);
+
+  auto output_memory = out_data.GetMKLDNNData();
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  stream->RegisterPrim(mkldnn::softmax_forward(pdesc, *input_mem, *output_memory));
+  stream->Submit();
+}
+
+}   // namespace op
+}   // namespace mxnet
+#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc
new file mode 100644
index 000000000000..f3aeacf17dd1
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_sum.cc
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_sum.cc
+ * \brief
+ * \author Da Zheng
+*/
+#include <iostream>
+
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2,
+         const mkldnn::memory &out) {
+  std::vector<mkldnn::memory::primitive_desc> input_pds(2);
+  std::vector<float> scales(2, 1);
+  std::vector<mkldnn::primitive::at> inputs;
+  input_pds[0] = arr1.get_primitive_desc();
+  input_pds[1] = arr2.get_primitive_desc();
+  CHECK(input_pds[0] == input_pds[1]);
+  inputs.push_back(arr1);
+  inputs.push_back(arr2);
+  // TODO(zhengda) I need to reorder memory here.
+  mkldnn::sum::primitive_desc sum_pd(scales, input_pds);
+  MKLDNNStream::Get()->RegisterPrim(mkldnn::sum(sum_pd, inputs, out));
+}
+
+void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                      const std::vector<NDArray> &inputs, const OpReqType &req,
+                      const NDArray &out_data) {
+  TmpMemMgr::Get()->Init(ctx.requested[0]);
+  std::vector<mkldnn::primitive::at> in_prims;
+  std::vector<mkldnn::memory::primitive_desc> in_pds(inputs.size());
+  std::vector<float> scales(inputs.size(), 1);
+  in_prims.reserve(inputs.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    auto in_mem = inputs[i].GetMKLDNNData();
+    in_prims.push_back(*in_mem);
+    in_pds[i] = in_mem->get_primitive_desc();
+  }
+  mkldnn::sum::primitive_desc pdesc(scales, in_pds);
+
+  auto out_mem = CreateMKLDNNMem(out_data, pdesc.dst_primitive_desc(), req);
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *out_mem.second));
+  CommitOutput(out_data, out_mem);
+  stream->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif
diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h
index a32aaa2152e9..7a20f026f7b9 100644
--- a/src/operator/nn/pooling-inl.h
+++ b/src/operator/nn/pooling-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2017 by Contributors
  * \file pooling-inl.h
  * \brief
- * \author Bing Xu, Jun Wu
+ * \author Bing Xu, Jun Wu, Da Zheng
 */
 
 #ifndef MXNET_OPERATOR_NN_POOLING_INL_H_
@@ -78,257 +78,138 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
     DMLC_DECLARE_FIELD(pad).set_default(TShape())
     .describe("Pad for pooling: (y, x) or (d, y, x). Defaults to no padding.");
   }
-};
 
-template<typename xpu, typename DType>
-class PoolingOp : public Operator {
- public:
-  explicit PoolingOp(PoolingParam p) {
-    this->param_ = p;
+  bool operator==(const PoolingParam& other) const {
+    return this->kernel             == other.kernel &&
+           this->stride             == other.stride &&
+           this->pad                == other.pad &&
+           this->pool_type          == other.pool_type &&
+           this->pooling_convention == other.pooling_convention &&
+           this->global_pool        == other.global_pool &&
+           this->cudnn_off          == other.cudnn_off;
   }
+};
 
-  virtual void Forward(const OpContext& ctx,
-                       const std::vector<TBlob>& in_data,
-                       const std::vector<OpReqType>& req,
-                       const std::vector<TBlob>& out_data,
-                       const std::vector<TBlob>& aux_args) {
-    using namespace mshadow;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    const TShape& ishape = in_data[pool_enum::kData].shape_;
+}  // namespace op
+}  // namespace mxnet
 
-    pool(s, in_data[pool_enum::kData].dptr<DType>(),
-         in_data[pool_enum::kData].shape_,
-         out_data[pool_enum::kOut].shape_,
-         param_.global_pool?
-           TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim())
-           : param_.kernel,
-         param_.pad,
-         param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride,
-         param_.pool_type,
-         req[pool_enum::kOut],
-         out_data[pool_enum::kOut].dptr<DType>());
+namespace std {
+template<>
+struct hash<mxnet::op::PoolingParam> {
+  size_t operator()(const mxnet::op::PoolingParam& val) {
+    size_t ret = 0;
+    ret = dmlc::HashCombine(ret, val.kernel);
+    ret = dmlc::HashCombine(ret, val.stride);
+    ret = dmlc::HashCombine(ret, val.pad);
+    ret = dmlc::HashCombine(ret, val.pool_type);
+    ret = dmlc::HashCombine(ret, val.pooling_convention);
+    ret = dmlc::HashCombine(ret, val.global_pool);
+    ret = dmlc::HashCombine(ret, val.cudnn_off);
+    return ret;
   }
+};
+}  // namespace std
 
-  virtual void Backward(const OpContext& ctx,
-                        const std::vector<TBlob>& out_grad,
-                        const std::vector<TBlob>& in_data,
-                        const std::vector<TBlob>& out_data,
-                        const std::vector<OpReqType>& req,
-                        const std::vector<TBlob>& in_grad,
-                        const std::vector<TBlob>& aux_args) {
-    using namespace mshadow;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    CHECK_EQ(req.size(), 1U);
-    CHECK_EQ(in_grad.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    const TShape& ishape = in_data[pool_enum::kData].shape_;
-
-    unpool(s, out_grad[pool_enum::kOut].dptr<DType>(),
-           in_data[pool_enum::kData].dptr<DType>(),
-           out_data[pool_enum::kOut].dptr<DType>(),
-           in_grad[pool_enum::kData].shape_,
-           out_grad[pool_enum::kOut].shape_,
-           param_.global_pool?
-             TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim())
-             : param_.kernel,
-           param_.pad,
-           param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride,
-           param_.pool_type,
-           req[pool_enum::kData],
-           in_grad[pool_enum::kData].dptr<DType>());
-  }
+namespace mxnet {
+namespace op {
 
- private:
-  PoolingParam param_;
-};  // class PoolingOp
+/*
+ * When MKLDNN is enabled, we might want 2 outputs instead of one inputs, which
+ * also changes the number of inputs for backward.
+ */
+int GetNumOutputs(const PoolingParam &param);
+int GetNumBackInputs(const PoolingParam &param);
 
-template<typename xpu>
-Operator* CreateOp(PoolingParam param, int dtype);
+template<typename xpu, typename DType>
+void PoolingForward(const OpContext& ctx, const PoolingParam &param,
+                    const TBlob& in_data, const OpReqType& req,
+                    const TBlob& out_data) {
+  using namespace mshadow;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const TShape& ishape = in_data.shape_;
+
+  pool(s, in_data.dptr<DType>(), in_data.shape_, out_data.shape_,
+       param.global_pool?
+       TShape(ishape.data()+ishape.ndim()-param.kernel.ndim(), ishape.data()+ishape.ndim())
+       : param.kernel,
+       param.pad,
+       param.global_pool? TShape(param.kernel.ndim()) : param.stride,
+       param.pool_type, req, out_data.dptr<DType>());
+}
 
+template<typename xpu, typename DType>
+void PoolingBackward(const OpContext& ctx, const PoolingParam &param,
+                     const TBlob& out_grad, const TBlob& in_data,
+                     const TBlob& out_data, const OpReqType& req,
+                     const TBlob& in_grad) {
+  using namespace mshadow;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const TShape& ishape = in_data.shape_;
+
+  unpool(s, out_grad.dptr<DType>(), in_data.dptr<DType>(), out_data.dptr<DType>(),
+         in_grad.shape_, out_grad.shape_,
+         param.global_pool?
+         TShape(ishape.data()+ishape.ndim()-param.kernel.ndim(), ishape.data()+ishape.ndim())
+         : param.kernel,
+         param.pad,
+         param.global_pool? TShape(param.kernel.ndim()) : param.stride,
+         param.pool_type, req, in_grad.dptr<DType>());
+}
 
-#if DMLC_USE_CXX11
-class PoolingProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    using namespace mshadow;
-    param_.Init(kwargs);
-    if (param_.kernel.ndim() == 1) {
-      if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
-    } else if (param_.kernel.ndim() == 2) {
-      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
+template<typename xpu>
+void PoolingCompute(const nnvm::NodeAttrs& attrs,
+                    const OpContext& ctx,
+                    const std::vector<TBlob>& inputs,
+                    const std::vector<OpReqType>& req,
+                    const std::vector<TBlob>& outputs) {
+  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), GetNumOutputs(param));
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+    if (pool_enum::kMaxPooling == param.pool_type
+        || pool_enum::kAvgPooling == param.pool_type
+        || pool_enum::kSumPooling == param.pool_type) {
+      PoolingForward<xpu, DType>(ctx, param, inputs[0], req[0], outputs[0]);
     } else {
-      CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D pooling not supported";
-      if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
+      LOG(FATAL) << "unknown pooling type";
     }
-    CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim())
-      << "stride and kernel should have the same length";
-    CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim())
-      << "pad and kernel should have the same length";
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
+  });
+}
 
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    CHECK_EQ(in_shape->size(), 1U);
-    const TShape &dshape = (*in_shape)[0];
-    CHECK_GE(dshape.ndim(), 3U) << "Pooling: Input data should be  3D in (batch, channel, x)"
-                                << " Or 4D in (batch, channel, y, x) "
-                                << " Or 5D in (batch, channel, d, y, x)";
-    TShape oshape = dshape;
-    if (dshape.ndim() ==  0) return false;
-    if (param_.kernel.ndim() == 1) {
-      CHECK_EQ(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)";
-      if (param_.global_pool) {
-        oshape[2] = 1;
-      } else {
-        CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0])
-            << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2]
-            << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")";
-        if (param_.pooling_convention == pool_enum::kValid) {
-          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
-                              param_.stride[0];
-        } else {
-          oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
-                              dshape[2] + 2 * param_.pad[0] -
-                              param_.kernel[0]) / param_.stride[0]));
-        }
-      }
-      out_shape->clear();
-      out_shape->push_back(oshape);  // save output shape
-    } else if (param_.kernel.ndim() == 2) {
-      CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)";
-      if (param_.global_pool) {
-        oshape[2] = 1;
-        oshape[3] = 1;
-      } else {
-        CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0])
-            << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2]
-            << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")";
-        CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1])
-            << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3]
-            << " padded to " << (dshape[3] + 2*param_.pad[1]) << ")";
-        if (param_.pooling_convention == pool_enum::kValid) {
-          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
-                              param_.stride[0];
-          oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) /
-                              param_.stride[1];
-        } else {
-          oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
-                              dshape[2] + 2 * param_.pad[0] -
-                              param_.kernel[0]) / param_.stride[0]));
-          oshape[3] = 1 + static_cast<int>(ceil(static_cast<float>(
-                              dshape[3] + 2 * param_.pad[1] -
-                              param_.kernel[1]) / param_.stride[1]));
-        }
-      }
-      out_shape->clear();
-      out_shape->push_back(oshape);  // save output shape
-    } else if (param_.kernel.ndim() == 3) {
-      CHECK_EQ(dshape.ndim(), 5U)
-        << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
-      CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input";
-      CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input";
-      CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input";
-      if (param_.global_pool) {
-        oshape[2] = 1;
-        oshape[3] = 1;
-        oshape[4] = 1;
-      } else {
-        if (param_.pooling_convention == pool_enum::kValid) {
-          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
-                              param_.stride[0];
-          oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) /
-                              param_.stride[1];
-          oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) /
-                              param_.stride[2];
-        } else {
-          oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
-                              dshape[2] + 2 * param_.pad[0] -
-                              param_.kernel[0]) / param_.stride[0]));
-          oshape[3] = 1 + static_cast<int>(ceil(static_cast<float>(
-                              dshape[3] + 2 * param_.pad[1] -
-                              param_.kernel[1]) / param_.stride[1]));
-          oshape[4] = 1 + static_cast<int>(ceil(static_cast<float>(
-                              dshape[4] + 2 * param_.pad[2] -
-                              param_.kernel[2]) / param_.stride[2]));
-        }
-      }
-
-      out_shape->clear();
-      out_shape->push_back(oshape);  // save output shape
-    }
-    return true;
+template<typename xpu>
+void PoolingGradCompute(const nnvm::NodeAttrs& attrs,
+                        const OpContext& ctx,
+                        const std::vector<TBlob>& inputs,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<TBlob>& outputs) {
+  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), GetNumBackInputs(param));
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  off_t ograd_idx, in_data_idx, out_data_idx;
+  // When MKLDNN is enabled, the input data may contains arrays for workspace.
+  if (GetNumBackInputs(param) == 5) {
+    ograd_idx = 0;
+    in_data_idx = 2;
+    out_data_idx = 3;
+  } else {
+    ograd_idx = 0;
+    in_data_idx = 1;
+    out_data_idx = 2;
   }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_EQ(in_type->size(), 1U);
-    int dtype = (*in_type)[0];
-
-    if (dtype == -1) {
-      LOG(FATAL) << "Input type to pooling is not specified.";
-      return false;
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+    if (pool_enum::kMaxPooling == param.pool_type
+        || pool_enum::kAvgPooling == param.pool_type
+        || pool_enum::kSumPooling == param.pool_type) {
+      PoolingBackward<xpu, DType>(ctx, param, inputs[ograd_idx],
+                                  inputs[in_data_idx], inputs[out_data_idx],
+                                  req[0], outputs[0]);
+    } else {
+      LOG(FATAL) << "unknown pooling type";
     }
+  });
+}
 
-    out_type->clear();
-    out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    PoolingProp *prop_sym = new PoolingProp();
-    prop_sym->param_ = this->param_;
-    return prop_sym;
-  }
-
-  std::string TypeString() const override {
-    return "Pooling";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[pool_enum::kOut], in_data[pool_enum::kData],
-            out_data[pool_enum::kOut]};
-  }
-
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-#if MXNET_USE_CUDNN == 1
-    return {};
-#else
-    return {{in_data[pool_enum::kData], in_grad[pool_enum::kData]}};
-#endif
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return NULL;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
-
- private:
-  PoolingParam param_;
-};  // class PoolingProp
-#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 
diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc
index 8345ea3886d4..f719e0753e08 100644
--- a/src/operator/nn/pooling.cc
+++ b/src/operator/nn/pooling.cc
@@ -21,78 +21,300 @@
  * Copyright (c) 2017 by Contributors
  * \file pooling.cc
  * \brief
- * \author Bing Xu, Jun Wu
+ * \author Bing Xu, Jun Wu, Da Zheng
 */
+#include "../elemwise_op_common.h"
 #include "./pooling-inl.h"
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "../mkl/mkl_memory-inl.h"
-#include "../mkl/mkl_pooling-inl.h"
-#endif  // MXNET_USE_MKL2017
 #if MXNET_USE_NNPACK == 1
 #include "./nnpack/nnpack_pooling-inl.h"
 #endif  // MXNET_USE_NNPACK
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn/mkldnn_pooling-inl.h"
+#endif  // MXNET_USE_MKLDNN
 
 namespace mxnet {
 namespace op {
 
-template<>
-Operator *CreateOp<cpu>(PoolingParam param, int dtype) {
-  Operator *op = NULL;
-#if MXNET_USE_MKL2017 == 1
-    if (param.kernel.ndim() == 2
-      && ((param.pool_type == pool_enum::kMaxPooling)
-      || (param.pool_type == pool_enum::kAvgPooling))) {
-      switch (dtype) {
-      case mshadow::kFloat32:
-        return new MKLPoolingOp<cpu, float>(param);
-      case mshadow::kFloat64:
-        return new MKLPoolingOp<cpu, double>(param);
-      default:
-        break;
+static void PoolingParamParser(nnvm::NodeAttrs *attrs) {
+  using namespace mshadow;
+  PoolingParam param;
+  param.Init(attrs->dict);
+  if (param.kernel.ndim() == 1) {
+    if (param.stride.ndim() == 0) param.stride = Shape1(1);
+    if (param.pad.ndim() == 0) param.pad = Shape1(0);
+  } else if (param.kernel.ndim() == 2) {
+    if (param.stride.ndim() == 0) param.stride = Shape2(1, 1);
+    if (param.pad.ndim() == 0) param.pad = Shape2(0, 0);
+  } else {
+    CHECK_EQ(param.kernel.ndim(), 3U) << param.kernel.ndim()
+                                       << "D pooling not supported";
+    if (param.stride.ndim() == 0) param.stride = Shape3(1, 1, 1);
+    if (param.pad.ndim() == 0) param.pad = Shape3(0, 0, 0);
+  }
+  CHECK_EQ(param.stride.ndim(), param.kernel.ndim())
+      << "stride and kernel should have the same length";
+  CHECK_EQ(param.pad.ndim(), param.kernel.ndim())
+      << "pad and kernel should have the same length";
+  attrs->parsed = std::move(param);
+}
+
+int GetNumOutputs(const PoolingParam &param) {
+#if MXNET_USE_MKLDNN == 1
+  return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2 : 1;
+#else
+  return 1;
+#endif
+}
+
+int GetNumBackInputs(const PoolingParam &param) {
+#if MXNET_USE_MKLDNN == 1
+  return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 5 : 3;
+#else
+  return 3;
+#endif
+}
+
+static bool PoolingType(const nnvm::NodeAttrs& attrs,
+                        std::vector<int> *in_attrs,
+                        std::vector<int> *out_attrs) {
+  out_attrs->at(0) = in_attrs->at(0);
+#if MXNET_USE_MKLDNN == 1
+  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
+  if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) {
+    CHECK_GT(out_attrs->size(), 1U);
+    out_attrs->at(1) = mshadow::kInt32;
+  }
+#endif
+  return true;
+}
+
+static bool PoolingShape(const nnvm::NodeAttrs &attrs,
+                         std::vector<TShape> *in_shape,
+                         std::vector<TShape> *out_shape) {
+  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
+  CHECK_EQ(in_shape->size(), 1U);
+  const TShape &dshape = (*in_shape)[0];
+  CHECK_GE(dshape.ndim(), 3U)
+      << "Pooling: Input data should be  3D in (batch, channel, x)"
+      << " Or 4D in (batch, channel, y, x) "
+      << " Or 5D in (batch, channel, d, y, x)";
+  TShape oshape = dshape;
+  if (dshape.ndim() == 0) return false;
+  if (param.kernel.ndim() == 1) {
+    CHECK_EQ(dshape.ndim(), 3U)
+        << "Pooling: Input data should be 3D in (batch, channel, x)";
+    if (param.global_pool) {
+      oshape[2] = 1;
+    } else {
+      CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0])
+          << "kernel size (" << param.kernel[0] << ") exceeds input ("
+          << dshape[2] << " padded to " << (dshape[2] + 2 * param.pad[0])
+          << ")";
+      if (param.pooling_convention == pool_enum::kValid) {
+        oshape[2] = 1 +
+                    (dshape[2] + 2 * param.pad[0] - param.kernel[0]) /
+                        param.stride[0];
+      } else {
+        oshape[2] = 1 + static_cast<int>(ceil(
+                            static_cast<float>(dshape[2] + 2 * param.pad[0] -
+                                               param.kernel[0]) /
+                            param.stride[0]));
       }
     }
+    out_shape->clear();
+    out_shape->push_back(oshape);  // save output shape
+#if MXNET_USE_MKLDNN == 1
+    if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param))
+      out_shape->push_back(oshape);   // for workspace
 #endif
-#if MXNET_USE_NNPACK == 1
-  // NNPACK only support max-pooling with kernel = 2, stride = 2, pooling_convention
-  // = kFull(note that the default value is kValid in MXNet)
-  if ((param.pool_type == pool_enum::kMaxPooling)
-    && (param.pooling_convention == pool_enum::kFull)
-    && (param.kernel.ndim() == 2) && (param.stride.ndim() == 2)
-    && (param.kernel[0] == 2) && (param.kernel[1] == 2)
-    && (param.stride[0] == 2) && (param.stride[1] == 2)) {
-    switch (dtype) {
-    case mshadow::kFloat32:
-      return new NNPACKPoolingOp<cpu, float>(param);
-    default:
-      break;
+  } else if (param.kernel.ndim() == 2) {
+    CHECK_EQ(dshape.ndim(), 4U)
+        << "Pooling: Input data should be 4D in (batch, channel, y, x)";
+    if (param.global_pool) {
+      oshape[2] = 1;
+      oshape[3] = 1;
+    } else {
+      CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0])
+          << "kernel size (" << param.kernel[0] << ") exceeds input ("
+          << dshape[2] << " padded to " << (dshape[2] + 2 * param.pad[0])
+          << ")";
+      CHECK(param.kernel[1] <= dshape[3] + 2 * param.pad[1])
+          << "kernel size (" << param.kernel[1] << ") exceeds input ("
+          << dshape[3] << " padded to " << (dshape[3] + 2 * param.pad[1])
+          << ")";
+      if (param.pooling_convention == pool_enum::kValid) {
+        oshape[2] = 1 +
+                    (dshape[2] + 2 * param.pad[0] - param.kernel[0]) /
+                        param.stride[0];
+        oshape[3] = 1 +
+                    (dshape[3] + 2 * param.pad[1] - param.kernel[1]) /
+                        param.stride[1];
+      } else {
+        oshape[2] = 1 + static_cast<int>(ceil(
+                            static_cast<float>(dshape[2] + 2 * param.pad[0] -
+                                               param.kernel[0]) /
+                            param.stride[0]));
+        oshape[3] = 1 + static_cast<int>(ceil(
+                            static_cast<float>(dshape[3] + 2 * param.pad[1] -
+                                               param.kernel[1]) /
+                            param.stride[1]));
+      }
     }
-  }
+    out_shape->clear();
+    out_shape->push_back(oshape);  // save output shape
+#if MXNET_USE_MKLDNN == 1
+    if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param))
+      out_shape->push_back(oshape);   // for workspace
 #endif
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    if (pool_enum::kMaxPooling == param.pool_type
-        || pool_enum::kAvgPooling == param.pool_type
-        || pool_enum::kSumPooling == param.pool_type) {
-      op = new PoolingOp<cpu, DType>(param);
+  } else if (param.kernel.ndim() == 3) {
+    CHECK_EQ(dshape.ndim(), 5U)
+        << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
+    CHECK_LE(param.kernel[0], dshape[2] + 2 * param.pad[0])
+        << "kernel size exceeds input";
+    CHECK_LE(param.kernel[1], dshape[3] + 2 * param.pad[1])
+        << "kernel size exceeds input";
+    CHECK_LE(param.kernel[2], dshape[4] + 2 * param.pad[2])
+        << "kernel size exceeds input";
+    if (param.global_pool) {
+      oshape[2] = 1;
+      oshape[3] = 1;
+      oshape[4] = 1;
     } else {
-      LOG(FATAL) << "unknown pooling type";
-      return NULL;
+      if (param.pooling_convention == pool_enum::kValid) {
+        oshape[2] = 1 +
+                    (dshape[2] + 2 * param.pad[0] - param.kernel[0]) /
+                        param.stride[0];
+        oshape[3] = 1 +
+                    (dshape[3] + 2 * param.pad[1] - param.kernel[1]) /
+                        param.stride[1];
+        oshape[4] = 1 +
+                    (dshape[4] + 2 * param.pad[2] - param.kernel[2]) /
+                        param.stride[2];
+      } else {
+        oshape[2] = 1 + static_cast<int>(ceil(
+                            static_cast<float>(dshape[2] + 2 * param.pad[0] -
+                                               param.kernel[0]) /
+                            param.stride[0]));
+        oshape[3] = 1 + static_cast<int>(ceil(
+                            static_cast<float>(dshape[3] + 2 * param.pad[1] -
+                                               param.kernel[1]) /
+                            param.stride[1]));
+        oshape[4] = 1 + static_cast<int>(ceil(
+                            static_cast<float>(dshape[4] + 2 * param.pad[2] -
+                                               param.kernel[2]) /
+                            param.stride[2]));
+      }
     }
-  });
 
-  return op;
+    out_shape->clear();
+    out_shape->push_back(oshape);  // save output shape
+#if MXNET_USE_MKLDNN == 1
+    if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param))
+      out_shape->push_back(oshape);   // for workspace
+#endif
+  }
+  return true;
+}
+
+#if MXNET_USE_MKLDNN == 1
+void PoolingComputeExCPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
+                         const std::vector<NDArray> &inputs,
+                         const std::vector<OpReqType> &req,
+                         const std::vector<NDArray> &outputs) {
+  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
+  const NDArray *workspace = nullptr;
+  if (MKLDNNRequireWorkspace(param)) {
+    CHECK_GT(outputs.size(), 1U);
+    workspace = &outputs[1];
+  }
+  if (SupportMKLDNN(inputs[0])
+      && SupportMKLDNNPooling(param, inputs[0].shape())) {
+    MKLDNN_OPCHECK_INIT(false, 1, inputs, outputs);
+    MKLDNNPoolingCompute(ctx, param, inputs[0], req[0], outputs[0], workspace);
+    MKLDNN_OPCHECK_RUN(PoolingCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    return;
+  }
+  FallBackCompute(PoolingCompute<cpu>, attrs, ctx, inputs, req, outputs);
+}
+
+void PoolingGradComputeExCPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
+                             const std::vector<NDArray> &inputs,
+                             const std::vector<OpReqType> &req,
+                             const std::vector<NDArray> &outputs) {
+  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
+  const NDArray &out_grad = inputs[0];
+  const NDArray *workspace = nullptr;
+  const NDArray *in_data = nullptr;
+  if (MKLDNNRequireWorkspace(param)) {
+    // The first two elements are the gradient of the outputs in forward.
+    // The third is the input of forward.
+    // The fourth and the fifth are the outputs of forward.
+    CHECK_EQ(inputs.size(), 5U);
+    in_data = &inputs[2];
+    workspace = &inputs[4];
+  } else {
+    CHECK_EQ(inputs.size(), 3U);
+    in_data = &inputs[1];
+  }
+  const NDArray &in_grad = outputs[0];
+  if (SupportMKLDNN(inputs[0])
+      && SupportMKLDNNPooling(param, inputs[0].shape())) {
+    MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+    MKLDNNPoolingGradCompute(ctx, param, out_grad, *in_data, workspace,
+                             req[0], in_grad);
+    MKLDNN_OPCHECK_RUN(PoolingGradCompute<cpu>, attrs, ctx, inputs, req,
+                       outputs);
+    return;
+  }
+  FallBackCompute(PoolingGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+}
+#endif
+
+inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs,
+                                      const int dev_mask,
+                                      DispatchMode *dispatch_mode,
+                                      std::vector<int> *in_attrs,
+                                      std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1);
+
+#if MXNET_USE_MKLDNN == 1
+  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
+  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) {
+    return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                               dispatch_mode, DispatchMode::kFComputeEx);
+  }
+#else
+  CHECK_EQ(out_attrs->size(), 1);
+#endif
+  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                             dispatch_mode, DispatchMode::kFCompute);
 }
 
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator* PoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                     std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs &attrs,
+                                              const int dev_mask,
+                                              DispatchMode *dispatch_mode,
+                                              std::vector<int> *in_attrs,
+                                              std::vector<int> *out_attrs) {
+  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
+  CHECK_EQ(in_attrs->size(), GetNumBackInputs(param));
+  CHECK_EQ(out_attrs->size(), 1);
+
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) {
+    return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                               dispatch_mode, DispatchMode::kFComputeEx);
+  }
+#else
+  CHECK_EQ(in_attrs->size(), 3);
+#endif
+  return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                             dispatch_mode, DispatchMode::kFCompute);
 }
 
 DMLC_REGISTER_PARAMETER(PoolingParam);
 
-MXNET_REGISTER_OP_PROPERTY(Pooling, PoolingProp)
-.describe(R"code(Performs pooling on the input.
+NNVM_REGISTER_OP(Pooling)
+    .describe(R"code(Performs pooling on the input.
 
 The shapes for 1-D pooling are
 
@@ -131,8 +353,61 @@ For 3-D pooling, an additional *depth* dimension is added before
 height, width)*.
 
 )code" ADD_FILELINE)
-.add_argument("data", "NDArray-or-Symbol", "Input data to the pooling operator.")
+.set_num_inputs(1)
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
+  return GetNumOutputs(param);
+})
+#if MXNET_USE_MKLDNN == 1
+.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
+                                    [](const NodeAttrs& attrs) { return 1; })
+#endif
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"data"};
+})
+.set_attr<nnvm::FListOutputNames>("FListOutputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"output"};
+})
+.set_attr_parser(PoolingParamParser)
+.set_attr<FInferStorageType>("FInferStorageType", PoolingStorageType)
+.set_attr<nnvm::FInferType>("FInferType", PoolingType)
+.set_attr<nnvm::FInferShape>("FInferShape", PoolingShape)
+.set_attr<FCompute>("FCompute<cpu>", PoolingCompute<cpu>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", PoolingComputeExCPU)
+#endif
+.set_attr<nnvm::FGradient>("FGradient",
+                           ElemwiseGradUseInOut{"_backward_Pooling"})
+.add_argument("data", "NDArray-or-Symbol",
+              "Input data to the pooling operator.")
 .add_arguments(PoolingParam::__FIELDS__());
 
+NNVM_REGISTER_OP(_backward_Pooling)
+.set_num_outputs(1)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<nnvm::FInplaceOption>(
+    "FInplaceOption",
+    [](const NodeAttrs &attrs) {
+#if MXNET_USE_CUDNN == 1
+  return std::vector<std::pair<int, int> >();
+#else
+  return std::vector<std::pair<int, int> >{{1, 0}};
+#endif
+})
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
+.set_attr<FInferStorageType>("FInferStorageType",
+                             BackwardPoolingStorageType)
+.set_attr_parser(PoolingParamParser)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", PoolingGradComputeExCPU)
+#endif
+.set_attr<FCompute>("FCompute<cpu>", PoolingGradCompute<cpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/pooling.cu b/src/operator/nn/pooling.cu
index dcebe6798263..c3bcecfc77b7 100644
--- a/src/operator/nn/pooling.cu
+++ b/src/operator/nn/pooling.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2017 by Contributors
  * \file pooling.cu
  * \brief
- * \author Bing Xu, Jun Wu
+ * \author Bing Xu, Jun Wu, Da Zheng
 */
 #include <vector>
 #include "./pooling-inl.h"
@@ -32,38 +32,112 @@
 namespace mxnet {
 namespace op {
 
+#if MXNET_USE_CUDNN == 1
+template<typename DType>
+static CuDNNPoolingOp<DType> &GetCuDNNPoolingOp(const PoolingParam &param) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local CuDNNPoolingOp<DType> op;
+#else
+  static MX_THREAD_LOCAL CuDNNPoolingOp<DType> op;
+#endif
+  op.Init(param);
+  return op;
+}
+#endif
+
 template<>
-Operator *CreateOp<gpu>(PoolingParam param, int dtype) {
-  Operator *op = NULL;
+void PoolingCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                         const OpContext& ctx,
+                         const std::vector<TBlob>& inputs,
+                         const std::vector<OpReqType>& req,
+                         const std::vector<TBlob>& outputs) {
+  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), GetNumOutputs(param));
+
 #if MXNET_USE_CUDNN == 1
   if (!param.cudnn_off && param.kernel.ndim() > 1) {
-    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
       switch (param.pool_type) {
         case pool_enum::kMaxPooling:
-          op = new CuDNNPoolingOp<DType>(param);
-          break;
         case pool_enum::kAvgPooling:
-          op = new CuDNNPoolingOp<DType>(param);
+          GetCuDNNPoolingOp<DType>(param).Forward(ctx, inputs[0], req[0], outputs[0]);
+          return;
+        case pool_enum::kSumPooling:
+          LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied.";
           break;
+      }
+    });
+  }
+#endif  // MXNET_USE_CUDNN
+
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+    if (pool_enum::kMaxPooling == param.pool_type
+        || pool_enum::kAvgPooling == param.pool_type
+        || pool_enum::kSumPooling == param.pool_type) {
+      PoolingForward<gpu, DType>(ctx, param, inputs[0], req[0], outputs[0]);
+    } else {
+      LOG(FATAL) << "unknown pooling type";
+    }
+  });
+}
+
+template<>
+void PoolingGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                             const OpContext& ctx,
+                             const std::vector<TBlob>& inputs,
+                             const std::vector<OpReqType>& req,
+                             const std::vector<TBlob>& outputs) {
+  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), GetNumBackInputs(param));
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  off_t ograd_idx, in_data_idx, out_data_idx;
+  // When MKLDNN is enabled, the input data may contains arrays for workspace.
+  if (GetNumBackInputs(param) == 5) {
+    ograd_idx = 0;
+    in_data_idx = 2;
+    out_data_idx = 3;
+  } else {
+    ograd_idx = 0;
+    in_data_idx = 1;
+    out_data_idx = 2;
+  }
+
+#if MXNET_USE_CUDNN == 1
+  if (!param.cudnn_off && param.kernel.ndim() > 1) {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+      switch (param.pool_type) {
+        case pool_enum::kMaxPooling:
+        case pool_enum::kAvgPooling:
+          GetCuDNNPoolingOp<DType>(param).Backward(ctx, inputs[ograd_idx],
+              inputs[in_data_idx], inputs[out_data_idx], req[0], outputs[0]);
+          return;
         case pool_enum::kSumPooling:
           LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied.";
           break;
       }
     });
   }
-  if (op) return op;
 #endif  // MXNET_USE_CUDNN
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
     if (pool_enum::kMaxPooling == param.pool_type
         || pool_enum::kAvgPooling == param.pool_type
         || pool_enum::kSumPooling == param.pool_type) {
-      op = new PoolingOp<gpu, DType>(param);
+      PoolingBackward<gpu, DType>(ctx, param, inputs[ograd_idx],
+          inputs[in_data_idx], inputs[out_data_idx], req[0], outputs[0]);
     } else {
       LOG(FATAL) << "unknown pooling type";
     }
   });
-  return op;
 }
 
+NNVM_REGISTER_OP(Pooling)
+.set_attr<FCompute>("FCompute<gpu>", PoolingCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_Pooling)
+.set_attr<FCompute>("FCompute<gpu>", PoolingGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc
index 4686fb8c0dc1..0f559475d1c2 100644
--- a/src/operator/nn/softmax.cc
+++ b/src/operator/nn/softmax.cc
@@ -25,11 +25,54 @@
 #include "./softmax-inl.h"
 #include "../tensor/elemwise_unary_op.h"
 #include "../tensor/elemwise_binary_op.h"
+#include "mkldnn/mkldnn_base-inl.h"
+#include "mkldnn/mkldnn_ops-inl.h"
 
 namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(SoftmaxParam);
 
+#if MXNET_USE_MKLDNN == 1
+static void SoftmaxComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                const OpContext& ctx,
+                                const std::vector<NDArray>& inputs,
+                                const std::vector<OpReqType>& req,
+                                const std::vector<NDArray>& outputs) {
+  const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
+  // It seems MKLDNN softmax doesn't support training.
+  // and it only supports non-negative axis.
+  if (SupportMKLDNN(inputs[0]) && !ctx.is_train && param.axis >= 0) {
+    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    MKLDNNSoftmaxForward(attrs, ctx, inputs[0], req[0], outputs[0]);
+    auto fn = SoftmaxCompute<cpu, mxnet_op::softmax_fwd>;
+    MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs);
+    return;
+  }
+  FallBackCompute(SoftmaxCompute<cpu, mxnet_op::softmax_fwd>, attrs, ctx,
+                  inputs, req, outputs);
+}
+#endif
+
+inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs,
+                                      const int dev_mask,
+                                      DispatchMode* dispatch_mode,
+                                      std::vector<int> *in_attrs,
+                                      std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1);
+  CHECK_EQ(out_attrs->size(), 1);
+
+  DispatchMode wanted_mode;
+#if MXNET_USE_MKLDNN == 1
+  // We only run MKLDNN op if it runs on CPU.
+  if (dev_mask == mshadow::cpu::kDevMask)
+    wanted_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    wanted_mode = DispatchMode::kFCompute;
+  return storage_type_assign(out_attrs, static_cast<NDArrayStorageType>((*in_attrs)[0]),
+                             dispatch_mode, wanted_mode);
+}
+
 MXNET_OPERATOR_REGISTER_UNARY(softmax)
 .describe(R"code(Applies the softmax function.
 
@@ -54,6 +97,10 @@ Example::
 )code" ADD_FILELINE)
 .set_attr_parser(ParamParser<SoftmaxParam>)
 .set_attr<FCompute>("FCompute<cpu>", SoftmaxCompute<cpu, mxnet_op::softmax_fwd>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", SoftmaxComputeExCPU)
+#endif
+.set_attr<FInferStorageType>("FInferStorageType", SoftmaxStorageType)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_softmax"})
 .add_arguments(SoftmaxParam::__FIELDS__());
 
diff --git a/src/operator/nn/softmax_activation-inl.h b/src/operator/nn/softmax_activation-inl.h
index 500bf51ccd1f..b1d542e4068c 100644
--- a/src/operator/nn/softmax_activation-inl.h
+++ b/src/operator/nn/softmax_activation-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file softmax_activation-inl.h
  * \brief SoftmaxActivation operator
- * \author Junyuan Xie
+ * \author Junyuan Xie, Da Zheng
 */
 #ifndef MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_
 #define MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_
@@ -61,153 +61,74 @@ struct SoftmaxActivationParam : public dmlc::Parameter<SoftmaxActivationParam> {
   }
 };
 
-/**
- * \brief This is the implementation of softmax_activation operator.
- * \tparam xpu The device that the op will be executed on.
- */
 template<typename xpu>
-class SoftmaxActivationOp : public Operator {
- public:
-  explicit SoftmaxActivationOp(SoftmaxActivationParam p) {
-    this->param_ = p;
-  }
-
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    if (param_.mode == softmax_activation::kInstance) {
-      Tensor<xpu, 2> data = in_data[softmax_activation::kData].FlatTo2D<xpu, real_t>(s);
-      Tensor<xpu, 2> out = out_data[softmax_activation::kOut].FlatTo2D<xpu, real_t>(s);
-      Softmax(out, data);
-    } else {
-      CHECK_GE(in_data[softmax_activation::kData].ndim(), 3)
+void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs,
+                              const OpContext& ctx,
+                              const std::vector<TBlob>& inputs,
+                              const std::vector<OpReqType>& reqs,
+                              const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  const SoftmaxActivationParam& param = nnvm::get<SoftmaxActivationParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  const TBlob &in_data = inputs[softmax_activation::kData];
+  const OpReqType &req = reqs[softmax_activation::kOut];
+  const TBlob &out_data = outputs[softmax_activation::kOut];
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  if (param.mode == softmax_activation::kInstance) {
+    Tensor<xpu, 2> data = in_data.FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2> out = out_data.FlatTo2D<xpu, real_t>(s);
+    Softmax(out, data);
+  } else {
+    CHECK_GE(in_data.ndim(), 3)
         << "Input need to have a least 3 dimensions when mode=channel";
-      int n = in_data[softmax_activation::kData].size(0);
-      int k = in_data[softmax_activation::kData].size(1);
-      Shape<3> s3 = Shape3(n, k, static_cast<int>(in_data[softmax_activation::kData].Size()/n/k));
-      Tensor<xpu, 3, real_t> data =
-        in_data[softmax_activation::kData].get_with_shape<xpu, 3, real_t>(s3, s);
-      Tensor<xpu, 3, real_t> out =
-        out_data[softmax_activation::kOut].get_with_shape<xpu, 3, real_t>(s3, s);
-      Softmax(out, data);
-    }
+    int n = in_data.size(0);
+    int k = in_data.size(1);
+    Shape<3> s3 = Shape3(n, k, static_cast<int>(in_data.Size()/n/k));
+    Tensor<xpu, 3, real_t> data = in_data.get_with_shape<xpu, 3, real_t>(s3, s);
+    Tensor<xpu, 3, real_t> out = out_data.get_with_shape<xpu, 3, real_t>(s3, s);
+    Softmax(out, data);
   }
+}
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK(in_data.size() == 1 && in_grad.size() == 1);
-    CHECK_EQ(req.size(), 1U);
-    // Use 3d tensor for both mode -> {instance, channel}. Get shapes
-    int total_size = in_grad[softmax_activation::kData].Size();
-    int batch_size = in_grad[softmax_activation::kData].shape_[0];
-    int channel_num = in_grad[softmax_activation::kData].shape_[1];
-    int rest_size = total_size / (batch_size * channel_num);
-    const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size);
-    // Get tensors
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 3> m_out_grad =
-      out_grad[softmax_activation::kOut].get_with_shape<xpu, 3, real_t>(data_shape, s);
-    Tensor<xpu, 3> m_out_data =
-      out_data[softmax_activation::kOut].get_with_shape<xpu, 3, real_t>(data_shape, s);
-    Tensor<xpu, 3> m_in_grad =
-      in_grad[softmax_activation::kData].get_with_shape<xpu, 3, real_t>(data_shape, s);
-    // get requested temp space
-    Tensor<xpu, 2> workspace = ctx.requested[softmax_activation::kTempSpace].get_space<xpu>(
-        Shape2(batch_size, rest_size), s);
-    workspace = reduce_with_axis<red::sum, false>(m_out_grad * m_out_data, 1);
-    Assign(m_in_grad, req[softmax_activation::kData],
-        m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num)));
-  }
-
- private:
-  SoftmaxActivationParam param_;
-};  // class SoftmaxActivationOp
-
-// Decalre Factory function, used for dispatch specialization
 template<typename xpu>
-Operator* CreateOp(SoftmaxActivationParam type);
-
-#if DMLC_USE_CXX11
-class SoftmaxActivationProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 1U) << "Input:[data]";
-    const TShape &dshape = in_shape->at(softmax_activation::kData);
-    if (dshape.ndim() == 0) return false;
-    out_shape->clear();
-    out_shape->push_back(dshape);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new SoftmaxActivationProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "SoftmaxActivation";
-  }
-
-  // decalre dependency and inplace optimization options
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[softmax_activation::kOut], out_data[softmax_activation::kOut]};
-  }
-
-  std::vector<ResourceRequest> BackwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    return {ResourceRequest::kTempSpace};
-  }
+void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs,
+                                  const OpContext& ctx,
+                                  const std::vector<TBlob>& inputs,
+                                  const std::vector<OpReqType>& reqs,
+                                  const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1);
+  CHECK_EQ(reqs.size(), 1);
+  const TBlob &out_grad = inputs[0];
+  const TBlob &out_data = inputs[1];
+  const OpReqType &req = reqs[0];
+  const TBlob &in_grad = outputs[0];
+  // Use 3d tensor for both mode -> {instance, channel}. Get shapes
+  int total_size = in_grad.Size();
+  int batch_size = in_grad.shape_[0];
+  int channel_num = in_grad.shape_[1];
+  int rest_size = total_size / (batch_size * channel_num);
+  const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size);
+  // Get tensors
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  Tensor<xpu, 3> m_out_grad =
+      out_grad.get_with_shape<xpu, 3, real_t>(data_shape, s);
+  Tensor<xpu, 3> m_out_data =
+      out_data.get_with_shape<xpu, 3, real_t>(data_shape, s);
+  Tensor<xpu, 3> m_in_grad =
+      in_grad.get_with_shape<xpu, 3, real_t>(data_shape, s);
+  // get requested temp space
+  Tensor<xpu, 2> workspace = ctx.requested[softmax_activation::kTempSpace].get_space<xpu>(
+      Shape2(batch_size, rest_size), s);
+  workspace = reduce_with_axis<red::sum, false>(m_out_grad * m_out_data, 1);
+  Assign(m_in_grad, req,
+         m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num)));
+}
 
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-    return {{out_grad[softmax_activation::kOut], in_grad[softmax_activation::kData]}};
-  }
-
-  std::vector<std::pair<int, void*> > ForwardInplaceOption(
-    const std::vector<int> &in_data,
-    const std::vector<void*> &out_data) const override {
-    return {{in_data[softmax_activation::kData], out_data[softmax_activation::kOut]}};
-  }
-
-  Operator* CreateOperator(Context ctx) const override;
-
- private:
-  SoftmaxActivationParam param_;
-};
-#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_
diff --git a/src/operator/nn/softmax_activation.cc b/src/operator/nn/softmax_activation.cc
index 657b382c6e03..bdfd8b065de1 100644
--- a/src/operator/nn/softmax_activation.cc
+++ b/src/operator/nn/softmax_activation.cc
@@ -21,26 +21,18 @@
  * Copyright (c) 2015 by Contributors
  * \file activation.cc
  * \brief softmax_activation op
- * \author Junyuan Xie
+ * \author Junyuan Xie, Da Zheng
 */
 #include "./softmax_activation-inl.h"
+#include "../tensor/elemwise_unary_op.h"
 #include "../mshadow_op.h"
 
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<cpu>(SoftmaxActivationParam param) {
-  return new SoftmaxActivationOp<cpu>(param);
-}
-
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *SoftmaxActivationProp::CreateOperator(Context ctx) const {
-  DO_BIND_DISPATCH(CreateOp, param_);
-}
 
 DMLC_REGISTER_PARAMETER(SoftmaxActivationParam);
 
-MXNET_REGISTER_OP_PROPERTY(SoftmaxActivation, SoftmaxActivationProp)
+MXNET_OPERATOR_REGISTER_UNARY(SoftmaxActivation)
 .describe(R"code(Applies softmax activation to input. This is intended for internal layers.
 
 .. note::
@@ -65,8 +57,22 @@ Example::
    [  6.56221947e-03   5.95310994e-04   9.73919690e-01   1.78379621e-02   1.08472735e-03]]
 
 )code" ADD_FILELINE)
-.add_argument("data", "NDArray-or-Symbol", "Input array to activation function.")
+.set_attr_parser(ParamParser<SoftmaxActivationParam>)
+.set_attr<FCompute>("FCompute<cpu>", SoftmaxActivationCompute<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_SoftmaxActivation"})
 .add_arguments(SoftmaxActivationParam::__FIELDS__());
 
+NNVM_REGISTER_OP(_backward_SoftmaxActivation)
+.set_num_outputs(1)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
+  return std::vector<std::pair<int, int> >{{0, 0}};
+})
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+.set_attr_parser(ParamParser<SoftmaxActivationParam>)
+.set_attr<FCompute>("FCompute<cpu>", SoftmaxActivationGradCompute<cpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/softmax_activation.cu b/src/operator/nn/softmax_activation.cu
index 0810483e1262..f3997e00052e 100644
--- a/src/operator/nn/softmax_activation.cu
+++ b/src/operator/nn/softmax_activation.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file softmax_activation.cu
  * \brief
- * \author Junyuan Xie
+ * \author Junyuan Xie, Da Zheng
 */
 #include "./softmax_activation-inl.h"
 #include "../mshadow_op.h"
@@ -31,14 +31,51 @@
 
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<gpu>(SoftmaxActivationParam param) {
+
 #if MXNET_USE_CUDNN == 1
-  return new CuDNNSoftmaxActivationOp(param);
+
+static inline CuDNNSoftmaxActivationOp &GetCuDNNSoftmaxActOp(const SoftmaxActivationParam& param) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local CuDNNSoftmaxActivationOp op;
 #else
-  return new SoftmaxActivationOp<gpu>(param);
-#endif  // MXNET_USE_CUDNN
+  static MX_THREAD_LOCAL CuDNNSoftmaxActivationOp op;
+#endif
+  op.Init(param);
+  return op;
+}
+
+template<>
+void SoftmaxActivationCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                                   const OpContext& ctx,
+                                   const std::vector<TBlob>& inputs,
+                                   const std::vector<OpReqType>& req,
+                                   const std::vector<TBlob>& outputs) {
+  const SoftmaxActivationParam& param = nnvm::get<SoftmaxActivationParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  GetCuDNNSoftmaxActOp(param).Forward(ctx, inputs[0], req[0], outputs[0]);
 }
+
+template<>
+void SoftmaxActivationGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                                       const OpContext& ctx,
+                                       const std::vector<TBlob>& inputs,
+                                       const std::vector<OpReqType>& req,
+                                       const std::vector<TBlob>& outputs) {
+  const SoftmaxActivationParam& param = nnvm::get<SoftmaxActivationParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1);
+  CHECK_EQ(req.size(), 1);
+  GetCuDNNSoftmaxActOp(param).Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]);
+}
+#endif
+
+NNVM_REGISTER_OP(SoftmaxActivation)
+.set_attr<FCompute>("FCompute<gpu>", SoftmaxActivationCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_SoftmaxActivation)
+.set_attr<FCompute>("FCompute<gpu>", SoftmaxActivationGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
 
diff --git a/src/operator/nn/upsampling-inl.h b/src/operator/nn/upsampling-inl.h
index f660609ace28..4b9159edd174 100644
--- a/src/operator/nn/upsampling-inl.h
+++ b/src/operator/nn/upsampling-inl.h
@@ -35,6 +35,7 @@
 #include <string>
 #include <utility>
 #include "../operator_common.h"
+#include "./deconvolution-inl.h"
 
 namespace mxnet {
 namespace op {
@@ -82,253 +83,147 @@ struct UpSamplingParam : public dmlc::Parameter<UpSamplingParam> {
 };  // struct UpSamplingParam
 
 template<typename xpu, typename DType>
-class UpSamplingNearestOp : public Operator {
- public:
-  explicit UpSamplingNearestOp(UpSamplingParam p) {
-    this->param_ = p;
-  }
-
-  virtual void Forward(const OpContext &ctx,
+void UpSamplingForward(const OpContext &ctx, const UpSamplingParam &param,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), static_cast<size_t>(param_.num_args));
-    CHECK_EQ(out_data.size(), 1U);
-    if (req[up_enum::kOut] == kNullOp) {
-      return;
-    }
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> out = out_data[up_enum::kOut].get<xpu, 4, DType>(s);
-    if (param_.num_args > 1) {
-      int begin = 0;
-      for (int i = 0; i < param_.num_args; ++i) {
-        Tensor<xpu, 4, DType> data = in_data[i].get<xpu, 4, DType>(s);
-        int end = begin + data.size(1);
-        int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2);
-        if (param_.multi_input_mode == up_enum::kSum) {
-          if (i == 0) {
-            Assign(out, req[up_enum::kOut], upsampling_nearest(data, scale));
-          } else {
-            out += upsampling_nearest(data, scale);
-          }
+                       const std::vector<TBlob> &out_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  CHECK_EQ(in_data.size(), static_cast<size_t>(param.num_args));
+  CHECK_EQ(out_data.size(), 1U);
+  if (req[up_enum::kOut] == kNullOp) {
+    return;
+  }
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  Tensor<xpu, 4, DType> out = out_data[up_enum::kOut].get<xpu, 4, DType>(s);
+  if (param.num_args > 1) {
+    int begin = 0;
+    for (int i = 0; i < param.num_args; ++i) {
+      Tensor<xpu, 4, DType> data = in_data[i].get<xpu, 4, DType>(s);
+      int end = begin + data.size(1);
+      int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2);
+      if (param.multi_input_mode == up_enum::kSum) {
+        if (i == 0) {
+          Assign(out, req[up_enum::kOut], upsampling_nearest(data, scale));
         } else {
-          Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale));
+          out += upsampling_nearest(data, scale);
         }
-        begin = end;
+      } else {
+        Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale));
       }
-    } else {
-      Tensor<xpu, 4, DType> data = in_data[up_enum::kData].get<xpu, 4, DType>(s);
-      Assign(out, req[up_enum::kOut], upsampling_nearest(data, param_.scale));
+      begin = end;
     }
+  } else {
+    Tensor<xpu, 4, DType> data = in_data[up_enum::kData].get<xpu, 4, DType>(s);
+    Assign(out, req[up_enum::kOut], upsampling_nearest(data, param.scale));
   }
+}
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_grad.size(), static_cast<size_t>(param_.num_args));
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> grad = out_grad[up_enum::kOut].get<xpu, 4, DType>(s);
-    if (param_.num_args > 1) {
-      int begin = 0;
-      for (int i = 0; i < param_.num_args; ++i) {
-        Tensor<xpu, 4, DType> input_grad = in_grad[i].get<xpu, 4, DType>(s);
-        mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]);
-        int end = begin + input_grad.size(1);
-        int scale = grad.size(2)/in_shape[0];
-        if (param_.multi_input_mode == up_enum::kSum) {
-          Assign(input_grad, req[i],
-                 pool<mshadow::red::sum>(grad,
-                                         in_shape,
-                                         scale,
-                                         scale,
-                                         scale,
-                                         scale));
-        } else {
-          Assign(input_grad, req[i],
-                 pool<mshadow::red::sum>(slice<1>(grad, begin, end),
-                                         in_shape,
-                                         scale,
-                                         scale,
-                                         scale,
-                                         scale));
-        }
-        begin = end;
-      }
-    } else {
-      Tensor<xpu, 4, DType> input_grad = in_grad[up_enum::kData].get<xpu, 4, DType>(s);
+template<typename xpu, typename DType>
+void UpSamplingBackward(const OpContext &ctx, const UpSamplingParam &param,
+                        const TBlob &out_grad, const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  CHECK_EQ(in_grad.size(), static_cast<size_t>(param.num_args));
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  Tensor<xpu, 4, DType> grad = out_grad.get<xpu, 4, DType>(s);
+  if (param.num_args > 1) {
+    int begin = 0;
+    for (int i = 0; i < param.num_args; ++i) {
+      Tensor<xpu, 4, DType> input_grad = in_grad[i].get<xpu, 4, DType>(s);
       mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]);
-      Assign(input_grad, req[up_enum::kData],
-             pool<mshadow::red::sum>(grad,
-                                     in_shape,
-                                     param_.scale,
-                                     param_.scale,
-                                     param_.scale,
-                                     param_.scale));
-    }
-  }
-
- private:
-  UpSamplingParam param_;
-};  // class UpSamplingNearestOp
-
-template<typename xpu>
-Operator *CreateOp(UpSamplingParam param, int dtype);
-
-
-#if DMLC_USE_CXX11
-class UpSamplingProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  std::vector<std::string> ListArguments() const override {
-    if (param_.sample_type == up_enum::kNearest) {
-      std::vector<std::string> ret;
-      for (int i = 0; i < param_.num_args; ++i) {
-        ret.push_back(std::string("arg") + std::to_string(i));
-      }
-      return ret;
-    } else {
-      return {"data", "weight"};
-    }
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    CHECK_GE(in_shape->size(), 1U);
-    const TShape &dshape = (*in_shape)[0];
-    TShape oshape = dshape;
-    if (param_.sample_type == up_enum::kNearest) {
-      CHECK_EQ(in_shape->size(), static_cast<size_t>(param_.num_args));
-      oshape[1] = 0;
-      for (auto& shape : *in_shape) {
-        CHECK_EQ(shape.ndim(), 4U) << \
-          "UpSamplingNearest: Input data should be 4D in (batch, channel, y, x)";
-        int oh = dshape[2]*param_.scale, ow = dshape[3]*param_.scale;
-        CHECK_EQ(oh%shape[2], 0U) << "UpSamplingNearest: input height of " << shape[2] << \
-          "does not divide output height of " << oh;
-        CHECK_EQ(ow%shape[3], 0U) << "UpSamplingNearest: input width of " << shape[3] << \
-          "does not divide output width of " << ow;
-        if (param_.multi_input_mode == up_enum::kSum) {
-          CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \
-            "Number of channels must be the same when multi_input_mode==sum";
-          oshape[1] = shape[1];
-        } else {
-          oshape[1] += shape[1];
-        }
-      }
-    } else {
-      CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
-      CHECK_EQ(dshape.ndim(), 4U) << \
-        "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)";
-      if (dshape.ndim() ==  0) return false;
-      int kernel = 2 * param_.scale - param_.scale % 2;
-      SHAPE_ASSIGN_CHECK(*in_shape,
-                         up_enum::kWeight,
-                         mshadow::Shape4(dshape[1], 1, kernel, kernel));
-      oshape = dshape;
-    }
-    oshape[2] = dshape[2] * param_.scale;
-    oshape[3] = dshape[3] * param_.scale;
-    out_shape->clear();
-    out_shape->push_back(oshape);
-    return true;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_GE(in_type->size(), 1U);
-    int dtype = (*in_type)[0];
-    CHECK_NE(dtype, -1) << "First input must have specified type";
-    for (index_t i = 0; i < in_type->size(); ++i) {
-      if ((*in_type)[i] == -1) {
-        (*in_type)[i] = dtype;
+      int end = begin + input_grad.size(1);
+      int scale = grad.size(2)/in_shape[0];
+      if (param.multi_input_mode == up_enum::kSum) {
+        Assign(input_grad, req[i],
+               pool<mshadow::red::sum>(grad,
+                                       in_shape,
+                                       scale,
+                                       scale,
+                                       scale,
+                                       scale));
       } else {
-        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
+        Assign(input_grad, req[i],
+               pool<mshadow::red::sum>(slice<1>(grad, begin, end),
+                                       in_shape,
+                                       scale,
+                                       scale,
+                                       scale,
+                                       scale));
       }
+      begin = end;
     }
-    out_type->clear();
-    out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new UpSamplingProp();
-    ptr->param_ = this->param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "UpSampling";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    if (param_.sample_type == up_enum::kNearest) {
-      return {out_grad[up_enum::kOut]};
-    } else {
-      return {out_grad[up_enum::kOut], in_data[up_enum::kData], in_data[up_enum::kWeight]};
-    }
-  }
-
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-    return {};
-  }
-
-  std::vector<ResourceRequest> ForwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    if (param_.sample_type == up_enum::kNearest) {
-      return {};
-    } else {
-      return {ResourceRequest::kTempSpace};
-    }
-  }
-
-  std::vector<ResourceRequest> BackwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    if (param_.sample_type == up_enum::kNearest) {
-      return {};
-    } else {
-      return {ResourceRequest::kTempSpace};
-    }
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented";
-    return NULL;
-  }
+  } else {
+    Tensor<xpu, 4, DType> input_grad = in_grad[up_enum::kData].get<xpu, 4, DType>(s);
+    mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]);
+    Assign(input_grad, req[up_enum::kData],
+           pool<mshadow::red::sum>(grad,
+                                   in_shape,
+                                   param.scale,
+                                   param.scale,
+                                   param.scale,
+                                   param.scale));
+  }
+}
+
+static inline DeconvolutionParam GetDeconvolutionParam(const UpSamplingParam& param) {
+  DeconvolutionParam p = DeconvolutionParam();
+  int kernel = 2 * param.scale - param.scale % 2;
+  int stride = param.scale;
+  int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
+  p.workspace = param.workspace;
+  p.num_group = param.num_filter;
+  p.num_filter = param.num_filter;
+  p.no_bias =  true;
+  int shape[] = {1, 1};
+  p.dilate = TShape(shape, shape + 2);
+  shape[0] = shape[1] = kernel;
+  p.kernel = TShape(shape, shape + 2);
+  shape[0] = shape[1] = stride;
+  p.stride = TShape(shape, shape + 2);
+  shape[0] = shape[1] = pad;
+  p.pad = TShape(shape, shape + 2);
+  return p;
+}
 
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
+template<typename xpu>
+void UpSamplingCompute(const nnvm::NodeAttrs& attrs,
+                       const OpContext& ctx, const std::vector<TBlob>& inputs,
+                       const std::vector<OpReqType>& req,
+                       const std::vector<TBlob>& outputs) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(attrs.parsed);
+  if (param.sample_type == up_enum::kNearest) {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, {
+      UpSamplingForward<xpu, DType>(ctx, param, inputs, req, outputs);
+    });
+  } else if (param.sample_type == up_enum::kBilinear) {
+    DeconvolutionParam p = GetDeconvolutionParam(param);
+    _DeconvolutionCompute<xpu>(p, ctx, inputs, req, outputs);
+  } else {
+    LOG(FATAL) << "Unknown sample type";
+  }
+}
 
+template<typename xpu>
+void UpSamplingGradCompute(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx, const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(attrs.parsed);
+  if (param.sample_type == up_enum::kNearest) {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, {
+      CHECK_EQ(inputs.size(), 1U);
+      UpSamplingBackward<xpu, DType>(ctx, param, inputs[0], req, outputs);
+    });
+  } else if (param.sample_type == up_enum::kBilinear) {
+    DeconvolutionParam p = GetDeconvolutionParam(param);
+    _DeconvolutionGradCompute<xpu>(p, ctx, inputs, req, outputs);
+  } else {
+    LOG(FATAL) << "Unknown sample type";
+  }
+}
 
- private:
-  UpSamplingParam param_;
-};  // class UpSamplingProp
-#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 
diff --git a/src/operator/nn/upsampling.cc b/src/operator/nn/upsampling.cc
index 8942e35ab325..44b619ac9516 100644
--- a/src/operator/nn/upsampling.cc
+++ b/src/operator/nn/upsampling.cc
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file upsampling_nearest.cc
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 
 #include "./upsampling-inl.h"
@@ -30,51 +30,123 @@
 
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<cpu>(UpSamplingParam param, int dtype) {
-  Operator *op = NULL;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    if (param.sample_type == up_enum::kNearest) {
-      op = new UpSamplingNearestOp<cpu, DType>(param);
-    } else if (param.sample_type == up_enum::kBilinear) {
-      DeconvolutionParam p = DeconvolutionParam();
-      int kernel = 2 * param.scale - param.scale % 2;
-      int stride = param.scale;
-      int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
-      p.workspace = param.workspace;
-      p.num_group = param.num_filter;
-      p.num_filter = param.num_filter;
-      p.no_bias =  true;
-      int shape[] = {1, 1};
-      p.dilate = TShape(shape, shape + 2);
-      shape[0] = shape[1] = kernel;
-      p.kernel = TShape(shape, shape + 2);
-      shape[0] = shape[1] = stride;
-      p.stride = TShape(shape, shape + 2);
-      shape[0] = shape[1] = pad;
-      p.pad = TShape(shape, shape + 2);
-      op = new DeconvolutionOp<cpu, DType>(p);
-    } else {
-      LOG(FATAL) << "Unknown sample type";
+
+static bool UpSamplingShape(const nnvm::NodeAttrs& attrs,
+                            std::vector<TShape> *in_shape, std::vector<TShape> *out_shape) {
+  const UpSamplingParam& param_ = nnvm::get<UpSamplingParam>(attrs.parsed);
+  CHECK_GE(in_shape->size(), 1U);
+  const TShape &dshape = (*in_shape)[0];
+  TShape oshape = dshape;
+  if (param_.sample_type == up_enum::kNearest) {
+    CHECK_EQ(in_shape->size(), static_cast<size_t>(param_.num_args));
+    oshape[1] = 0;
+    for (auto& shape : *in_shape) {
+      CHECK_EQ(shape.ndim(), 4U) << \
+        "UpSamplingNearest: Input data should be 4D in (batch, channel, y, x)";
+      int oh = dshape[2]*param_.scale, ow = dshape[3]*param_.scale;
+      CHECK_EQ(oh%shape[2], 0U) << "UpSamplingNearest: input height of " << shape[2] << \
+        "does not divide output height of " << oh;
+      CHECK_EQ(ow%shape[3], 0U) << "UpSamplingNearest: input width of " << shape[3] << \
+        "does not divide output width of " << ow;
+      if (param_.multi_input_mode == up_enum::kSum) {
+        CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \
+                         "Number of channels must be the same when multi_input_mode==sum";
+        oshape[1] = shape[1];
+      } else {
+        oshape[1] += shape[1];
+      }
+    }
+  } else {
+    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
+    CHECK_EQ(dshape.ndim(), 4U) << \
+      "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)";
+    if (dshape.ndim() ==  0) return false;
+    int kernel = 2 * param_.scale - param_.scale % 2;
+    SHAPE_ASSIGN_CHECK(*in_shape,
+        up_enum::kWeight,
+        mshadow::Shape4(dshape[1], 1, kernel, kernel));
+    oshape = dshape;
+  }
+  oshape[2] = dshape[2] * param_.scale;
+  oshape[3] = dshape[3] * param_.scale;
+  out_shape->clear();
+  out_shape->push_back(oshape);
+  return true;
+}
+
+static inline std::vector<std::string> ListArguments(const UpSamplingParam& param) {
+  if (param.sample_type == up_enum::kNearest) {
+    std::vector<std::string> ret;
+    for (int i = 0; i < param.num_args; ++i) {
+      ret.push_back(std::string("arg") + std::to_string(i));
     }
-  });
-  return op;
+    return ret;
+  } else {
+    return {"data", "weight"};
+  }
 }
 
-Operator* UpSamplingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                           std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
+static bool UpSamplingType(const nnvm::NodeAttrs& attrs,
+                           std::vector<int> *in_type, std::vector<int> *out_type) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(attrs.parsed);
+  CHECK_GE(in_type->size(), 1U);
+  int dtype = (*in_type)[0];
+  CHECK_NE(dtype, -1) << "First input must have specified type";
+  for (index_t i = 0; i < in_type->size(); ++i) {
+    if ((*in_type)[i] == -1) {
+      (*in_type)[i] = dtype;
+    } else {
+      UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param)[i]);
+    }
+  }
+  out_type->clear();
+  out_type->push_back(dtype);
+  return true;
 }
 
+struct UpSamplingGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    const UpSamplingParam& param_ = nnvm::get<UpSamplingParam>(n->attrs.parsed);
+    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
+    if (param_.sample_type != up_enum::kNearest) {
+      heads.push_back(n->inputs[up_enum::kData]);
+      heads.push_back(n->inputs[up_enum::kWeight]);
+    }
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
+
 DMLC_REGISTER_PARAMETER(UpSamplingParam);
 
-MXNET_REGISTER_OP_PROPERTY(UpSampling, UpSamplingProp)
+NNVM_REGISTER_OP(UpSampling)
 .describe("Performs nearest neighbor/bilinear up sampling to inputs.")
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const UpSamplingParam& params = nnvm::get<UpSamplingParam>(attrs.parsed);
+  return params.sample_type == up_enum::kNearest ? params.num_args : 2;
+})
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<UpSamplingParam>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  return ListArguments(nnvm::get<UpSamplingParam>(attrs.parsed));
+})
+.set_attr<nnvm::FInferShape>("FInferShape", UpSamplingShape)
+.set_attr<nnvm::FInferType>("FInferType", UpSamplingType)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(n.parsed);
+  if (param.sample_type == up_enum::kNearest) {
+    return std::vector<ResourceRequest>();
+  } else {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  }
+})
+.set_attr<FCompute>("FCompute<cpu>", UpSamplingCompute<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", UpSamplingGrad{"_backward_UpSampling"})
+.set_attr<std::string>("key_var_num_args", "num_args")
 .add_argument("data", "NDArray-or-Symbol[]", "Array of tensors to upsample")
 .add_arguments(UpSamplingParam::__FIELDS__())
-.set_key_var_num_args("num_args");
-
-NNVM_REGISTER_OP(UpSampling)
 .set_attr<nnvm::FSetInputVarAttrOnCompose>("FSetInputVarAttrOnCompose",
     [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) {
       if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return;
@@ -82,5 +154,23 @@ NNVM_REGISTER_OP(UpSampling)
         var->attrs.dict["__init__"] = "[\"bilinear\", {}]";
       }
     });
+
+NNVM_REGISTER_OP(_backward_UpSampling)
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const UpSamplingParam& params = nnvm::get<UpSamplingParam>(attrs.parsed);
+  return params.sample_type == up_enum::kNearest ? params.num_args : 2;
+})
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(n.parsed);
+  if (param.sample_type == up_enum::kNearest) {
+    return std::vector<ResourceRequest>();
+  } else {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  }
+})
+.set_attr_parser(ParamParser<UpSamplingParam>)
+.set_attr<FCompute>("FCompute<cpu>", UpSamplingGradCompute<cpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/upsampling.cu b/src/operator/nn/upsampling.cu
index f83535a2b2e6..c5ff2fafd64a 100644
--- a/src/operator/nn/upsampling.cu
+++ b/src/operator/nn/upsampling.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file upsampling_nearest.cc
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 
 #include "./deconvolution-inl.h"
@@ -29,36 +29,12 @@
 
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<gpu>(UpSamplingParam param, int dtype) {
-  Operator *op = NULL;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    if (param.sample_type == up_enum::kNearest) {
-      op = new UpSamplingNearestOp<gpu, DType>(param);
-    } else if (param.sample_type == up_enum::kBilinear) {
-      DeconvolutionParam p = DeconvolutionParam();
-      int kernel = 2 * param.scale - param.scale % 2;
-      int stride = param.scale;
-      int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
-      p.workspace = param.workspace;
-      p.num_group = param.num_filter;
-      p.num_filter = param.num_filter;
-      p.no_bias =  true;
-      int shape[] = {1, 1};
-      p.dilate = TShape(shape, shape + 2);
-      shape[0] = shape[1] = kernel;
-      p.kernel = TShape(shape, shape + 2);
-      shape[0] = shape[1] = stride;
-      p.stride = TShape(shape, shape + 2);
-      shape[0] = shape[1] = pad;
-      p.pad = TShape(shape, shape + 2);
-      op = new DeconvolutionOp<gpu, DType>(p);
-    } else {
-      LOG(FATAL) << "Unknown sample type";
-    }
-  });
-  return op;
-}
+
+NNVM_REGISTER_OP(UpSampling)
+.set_attr<FCompute>("FCompute<gpu>", UpSamplingCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_UpSampling)
+.set_attr<FCompute>("FCompute<gpu>", UpSamplingGradCompute<gpu>);
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h
index ed200273854d..e345bb2193f4 100644
--- a/src/operator/tensor/cast_storage-inl.h
+++ b/src/operator/tensor/cast_storage-inl.h
@@ -27,11 +27,15 @@
 #include <dmlc/timer.h>
 #include <mxnet/ndarray.h>
 #include <vector>
+#include <algorithm>
 #include "../mxnet_op.h"
 #include "../operator_common.h"
 #ifdef __CUDACC__
 #include "./cast_storage-inl.cuh"
 #endif  // __CUDACC__
+#if MXNET_USE_MKLDNN == 1
+#include "../nn/mkldnn/mkldnn_base-inl.h"
+#endif
 
 
 namespace mxnet {
@@ -342,8 +346,20 @@ void CastStorageComputeImpl(const OpContext& ctx,
   } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) {
     TBlob ret = output.data();
     CastStorageCsrDnsImpl<xpu>(ctx, input, &ret);
+#if MXNET_USE_MKLDNN == 1
+  } else if (src_stype == kDefaultStorage && dst_stype == kDefaultStorage) {
+    CHECK_EQ(output.ctx().dev_type, input.ctx().dev_type);
+    // If one of them uses the MKLDNN layout.
+    if (input.IsMKLDNNData() || output.IsMKLDNNData()) {
+      auto in_mem = input.GetMKLDNNData();
+      const_cast<NDArray &>(output).CopyFrom(*in_mem);
+      MKLDNNStream::Get()->Submit();
+    } else {
+      mxnet_op::copy(ctx.get_stream<xpu>(), output.data(), input.data());
+    }
+#endif
   } else {
-    LOG(FATAL) << "Not implemented";
+    LOG(FATAL) << "Not implemented from " << src_stype << " to " << dst_stype;
   }
 }
 
@@ -376,8 +392,14 @@ inline bool CastStorageInferStorageType(const nnvm::NodeAttrs& attrs,
   // dns -> dns, dns -> rsp, dns -> csr
   if (!dispatched && in_stype == kDefaultStorage && param_stype == kDefaultStorage) {
     // dns -> dns
-    dispatched = storage_type_assign(out_attrs, kDefaultStorage,
-                                     dispatch_mode, DispatchMode::kFCompute);
+    DispatchMode mode = DispatchMode::kFCompute;
+#if MXNET_USE_MKLDNN == 1
+    // If we use MKLDNN and the arrays are in CPU memory, the array may store
+    // MKLDNN layout, we should convert its layout explicitly.
+    if (dev_mask == kCPU)
+      mode = DispatchMode::kFComputeEx;
+#endif
+    dispatched = storage_type_assign(out_attrs, kDefaultStorage, dispatch_mode, mode);
   }
   if (!dispatched && in_stype == kDefaultStorage &&
     (param_stype == kRowSparseStorage || param_stype == kCSRStorage)) {
diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc
index d7e5e04ce87a..d73edc723520 100644
--- a/src/operator/tensor/elemwise_binary_op_basic.cc
+++ b/src/operator/tensor/elemwise_binary_op_basic.cc
@@ -24,11 +24,68 @@
  */
 #include "./elemwise_unary_op.h"
 #include "./elemwise_binary_op-inl.h"
+#include "../nn/mkldnn/mkldnn_ops-inl.h"
+#include "../nn/mkldnn/mkldnn_base-inl.h"
 
 namespace mxnet {
 namespace op {
 
-MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(elemwise_add, op::mshadow_op::plus)
+static void ElemwiseAddEx(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<NDArray>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1U);
+#if MXNET_USE_MKLDNN == 1
+  if (SupportMKLDNN(inputs[0]) && SupportMKLDNN(inputs[1])) {
+    MKLDNNSumForward(attrs, ctx, inputs, req[0], outputs[0]);
+    return;
+  } else if (inputs[0].storage_type() == kDefaultStorage
+             && inputs[1].storage_type() == kDefaultStorage) {
+    // This happens if inputs are supposed to be in MKLDNN format
+    // but MKLDNN doesn't support the data type or the shape. We're
+    // forced to convert it to the default format.
+    std::vector<TBlob> in_blobs(2);
+    std::vector<TBlob> out_blobs(1);
+    in_blobs[0] = inputs[0].data();
+    in_blobs[1] = inputs[1].data();
+    out_blobs[0] = outputs[0].data();
+    ElemwiseBinaryOp::Compute<cpu, op::mshadow_op::plus>(attrs, ctx, in_blobs,
+                                                         req, out_blobs);
+    return;
+  }
+#endif
+  ElemwiseBinaryOp::ComputeEx<cpu, op::mshadow_op::plus>(attrs, ctx, inputs,
+                                                         req, outputs);
+}
+
+static inline bool ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs,
+                                          const int dev_mask,
+                                          DispatchMode* dispatch_mode,
+                                          std::vector<int> *in_attrs,
+                                          std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2);
+  CHECK_EQ(out_attrs->size(), 1);
+  bool ret = ElemwiseStorageType<2, 1, true, true, true>(attrs, dev_mask, dispatch_mode,
+                                                         in_attrs, out_attrs);
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask
+      && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)
+      && out_attrs->at(0) == kDefaultStorage) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  return ret;
+}
+
+MXNET_OPERATOR_REGISTER_BINARY(elemwise_add)
+.set_attr<FInferStorageType>("FInferStorageType", ElemwiseAddStorageType)
+.set_attr<FCompute>("FCompute<cpu>", ElemwiseBinaryOp::Compute<cpu, op::mshadow_op::plus>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", ElemwiseAddEx)
+.set_attr<FResourceRequest>("FResourceRequest",  /* For Sparse CSR */
+                            [](const NodeAttrs& attrs) {
+                            return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};})
 MXNET_ADD_SPARSE_OP_ALIAS(elemwise_add)
 .add_alias("_add").add_alias("_plus").add_alias("_Plus")
 .describe(R"code(Adds arguments element-wise.
@@ -46,6 +103,41 @@ The storage type of ``elemwise_add`` output depends on storage types of inputs
 // this must differ from elemwise_add to prevent add to optimization in forward pass.
 MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_grad_add, op::mshadow_op::plus);
 
+static void _backward_ElemwiseAddEx(const nnvm::NodeAttrs& attrs,
+                                    const OpContext& ctx,
+                                    const std::vector<NDArray>& inputs,
+                                    const std::vector<OpReqType>& req,
+                                    const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 2U);
+#if MXNET_USE_MKLDNN == 1
+  if (inputs[0].IsMKLDNNData()) {
+    MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]);
+    MKLDNNCopy(attrs, ctx, inputs[0], req[1], outputs[1]);
+    return;
+  }
+#endif
+  ElemwiseBinaryOp::BackwardUseNoneEx<cpu, mshadow_op::identity, mshadow_op::identity>(
+      attrs, ctx, inputs, req, outputs);
+}
+
+static inline bool ElemwiseAddBackwardStorageType(const nnvm::NodeAttrs& attrs,
+                                                  const int dev_mask,
+                                                  DispatchMode* dispatch_mode,
+                                                  std::vector<int> *in_attrs,
+                                                  std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1);
+  CHECK_EQ(out_attrs->size(), 2);
+  bool ret = ElemwiseStorageType<1, 2, true, true, true>(attrs, dev_mask, dispatch_mode,
+                                                         in_attrs, out_attrs);
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  return ret;
+}
+
 NNVM_REGISTER_OP(_backward_add)
 .set_num_inputs(1)
 .set_num_outputs(2)
@@ -55,13 +147,15 @@ NNVM_REGISTER_OP(_backward_add)
                                   return std::vector<std::pair<int, int> >{{0, 0},
                                                                            {0, 1}};
                                 })
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
 .set_attr<FCompute>("FCompute<cpu>", ElemwiseBinaryOp::BackwardUseNone<
   cpu, mshadow_op::identity, mshadow_op::identity>)
-.set_attr<FComputeEx>("FComputeEx<cpu>",
-                      ElemwiseBinaryOp::BackwardUseNoneEx<cpu, mshadow_op::identity,
-                      mshadow_op::identity>)
-.set_attr<FInferStorageType>("FInferStorageType",
-                             ElemwiseStorageType<1, 2, true, true, true>);
+.set_attr<FComputeEx>("FComputeEx<cpu>", _backward_ElemwiseAddEx)
+.set_attr<FInferStorageType>("FInferStorageType", ElemwiseAddBackwardStorageType);
 
 MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(elemwise_sub, op::mshadow_op::minus)
 MXNET_ADD_SPARSE_OP_ALIAS(elemwise_sub)
diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc
index 8c12218be062..6118ddf19c30 100644
--- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc
+++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc
@@ -65,7 +65,7 @@ static bool BinaryScalarStorageTypeWithDenseResultStorageType(const NodeAttrs& a
   const auto dispatch_ex = invalid_ctx ? DispatchMode::kFComputeFallback
                                        : DispatchMode::kFComputeEx;
   const double alpha = nnvm::get<double>(attrs.parsed);
-  if (instype == kDefaultStorage) {
+  if (common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
     dispatched = storage_type_assign(&out_attrs[0],
       kDefaultStorage, dispatch_mode, DispatchMode::kFCompute);
   }
@@ -89,7 +89,7 @@ static bool BinaryScalarStorageType(const nnvm::NodeAttrs& attrs,
   const auto in_stype = in_attrs->at(0);
   auto &out_stype = out_attrs->at(0);
   bool dispatched = false;
-  if (!dispatched && in_stype == kDefaultStorage) {
+  if (!dispatched && (in_stype == kDefaultStorage)) {
     // dns -> dns
     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
                                      dispatch_mode, DispatchMode::kFCompute);
diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc
index b31dbb2598f0..10154bc9646d 100644
--- a/src/operator/tensor/elemwise_sum.cc
+++ b/src/operator/tensor/elemwise_sum.cc
@@ -24,6 +24,8 @@
 */
 #include "./elemwise_sum.h"
 #include "../../ndarray/ndarray_function.h"
+#include "../nn/mkldnn/mkldnn_ops-inl.h"
+#include "../../common/utils.h"
 
 namespace mxnet {
 namespace op {
@@ -79,9 +81,28 @@ bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs,
                                            std::vector<int> *out_attrs) {
   CHECK(!in_attrs->empty());
   CHECK_EQ(out_attrs->size(), 1U);
-  return ElemwiseStorageAttr<false, true, false>(attrs, dev_mask, dispatch_mode,
-                                                 in_attrs, out_attrs);
+  bool ret = ElemwiseStorageAttr<false, true, false>(attrs, dev_mask, dispatch_mode,
+                                                     in_attrs, out_attrs);
+#if MXNET_USE_MKLDNN == 1
+  // We should always use FComputeEx.
+  if (dev_mask == mshadow::cpu::kDevMask
+      && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)
+      && out_attrs->at(0) == kDefaultStorage) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  return ret;
+}
+
+#if MXNET_USE_MKLDNN == 1
+static inline bool IsMKLDNNData(const std::vector<NDArray> &arrs) {
+  for (auto &arr : arrs) {
+    if (!arr.IsMKLDNNData())
+      return false;
+  }
+  return true;
 }
+#endif
 
 void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs,
                                 const OpContext& ctx,
@@ -92,13 +113,28 @@ void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
   if (req[0] == kNullOp) return;
-  CHECK_EQ(req[0], kWriteTo) << "ElementWiseSumComputeExCPU only supports req = kWriteTo";
   if (inputs[0].storage_type() == kRowSparseStorage) {
     mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
     Resource rsc = ResourceManager::Get()->Request(ctx.run_ctx.get_ctx(),
         ResourceRequest(ResourceRequest::kTempSpace));
     NDArray out_nd = outputs[0];
     mxnet::ndarray::ElementwiseSum<cpu>(s, rsc, inputs, &out_nd);
+#if MXNET_USE_MKLDNN == 1
+  } else if (IsMKLDNNData(inputs)) {
+    MKLDNNSumForward(attrs, ctx, inputs, req[0], outputs[0]);
+#endif
+  } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) {
+    // This case happens when we want to create an MKLDNN NDArray but the type
+    // or the shape isn't supported by MKLDNN. In this case, NDArray falls back
+    // to the default storage type and, thus, we have to handle the default
+    // storage in FComputeEx.
+    std::vector<TBlob> in_blobs(inputs.size());
+    std::vector<TBlob> out_blobs(outputs.size());
+    for (size_t i = 0; i < in_blobs.size(); i++)
+      in_blobs[i] = inputs[i].data();
+    for (size_t i = 0; i < out_blobs.size(); i++)
+      out_blobs[i] = outputs[i].data();
+    ElementWiseSumCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
   } else {
     LogUnimplementedOp(attrs, ctx, inputs, req, outputs);
   }
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc
index 13a58d0165a8..cca3b2c9ff90 100644
--- a/src/operator/tensor/elemwise_unary_op_basic.cc
+++ b/src/operator/tensor/elemwise_unary_op_basic.cc
@@ -24,6 +24,7 @@
 #include <mxnet/base.h>
 #include "elemwise_unary_op.h"
 #include "./elemwise_binary_op-inl.h"
+#include "../nn/mkldnn/mkldnn_ops-inl.h"
 
 namespace mxnet {
 namespace op {
@@ -107,12 +108,64 @@ MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_backward_sigmoid,
                                                unary_bwd<mshadow_op::sigmoid_grad>);
 
 // copy
+static void CopyEx(const nnvm::NodeAttrs& attrs,
+                   const OpContext& ctx,
+                   const std::vector<NDArray>& inputs,
+                   const std::vector<OpReqType>& req,
+                   const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  const auto in_stype = inputs[0].storage_type();
+  const auto out_stype = outputs[0].storage_type();
+#if MXNET_USE_MKLDNN == 1
+  if (inputs[0].IsMKLDNNData()) {
+    MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]);
+    return;
+  } else if (in_stype == kDefaultStorage && out_stype == kDefaultStorage) {
+    // This happens if inputs are supposed to be in MKLDNN format
+    // but MKLDNN doesn't support the data type or the shape. We're
+    // forced to convert it to the default format.
+    std::vector<TBlob> in_blobs {inputs[0].data()};
+    std::vector<TBlob> out_blobs {outputs[0].data()};
+    UnaryOp::IdentityCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
+    return;
+  }
+#endif
+  UnaryOp::IdentityComputeEx<cpu>(attrs, ctx, inputs, req, outputs);
+}
+
+static inline bool CopyStorageType(const nnvm::NodeAttrs& attrs,
+                                   const int dev_mask,
+                                   DispatchMode* dispatch_mode,
+                                   std::vector<int> *in_attrs,
+                                   std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1);
+  CHECK_EQ(out_attrs->size(), 1);
+  bool ret = ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode,
+                                                          in_attrs, out_attrs);
+#if MXNET_USE_MKLDNN == 1
+  // We have to make sure all inputs are default layouts. Otherwise, we might
+  // want to fallback.
+  if (dev_mask == mshadow::cpu::kDevMask
+      && in_attrs->at(0) == kDefaultStorage
+      && out_attrs->at(0) == kDefaultStorage) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  return ret;
+}
+
 MXNET_OPERATOR_REGISTER_UNARY(_copy)
 .MXNET_DESCRIBE("Returns a copy of the input.")
 .add_alias("identity")
-.set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>)
+.set_attr<FInferStorageType>("FInferStorageType", CopyStorageType)
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", UnaryOp::IdentityComputeEx<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", CopyEx)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
 .set_attr<nnvm::FInplaceIdentity>("FInplaceIdentity",
   [](const NodeAttrs& attrs){
     return std::vector<bool>{true};
@@ -127,9 +180,14 @@ NNVM_REGISTER_OP(_backward_copy)
   [](const NodeAttrs& attrs){
     return std::vector<std::pair<int, int> >{{0, 0}};
   })
-.set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>)
+.set_attr<FInferStorageType>("FInferStorageType", CopyStorageType)
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", UnaryOp::IdentityComputeEx<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", CopyEx)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
 .set_attr<nnvm::FInplaceIdentity>("FInplaceIdentity",
   [](const NodeAttrs& attrs){
     return std::vector<bool>{true};
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index 9167fcfe7e34..25c233318f01 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -25,6 +25,8 @@
 // this will be invoked by gcc and compile CPU version
 #include "./matrix_op-inl.h"
 #include "./elemwise_unary_op.h"
+#include "../nn/mkldnn/mkldnn_ops-inl.h"
+#include "../nn/mkldnn/mkldnn_base-inl.h"
 
 namespace mxnet {
 namespace op {
@@ -180,6 +182,51 @@ If the argument `reverse` is set to 1, then the special values are inferred from
 .add_argument("data", "NDArray-or-Symbol", "Input data to reshape.")
 .add_arguments(ReshapeParam::__FIELDS__());
 
+static void FlattenEx(const nnvm::NodeAttrs& attrs,
+                      const OpContext& ctx,
+                      const std::vector<NDArray>& inputs,
+                      const std::vector<OpReqType>& req,
+                      const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+#if MXNET_USE_MKLDNN == 1
+  const auto in_stype = inputs[0].storage_type();
+  const auto out_stype = outputs[0].storage_type();
+  if (inputs[0].IsMKLDNNData()) {
+    MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]);
+    // If the output is a special MKLDNN layout and the number of dimensions
+    // is larger than 2, we should use the default layout.
+    if (outputs[0].IsMKLDNNData() && inputs[0].shape().ndim() > 2)
+      const_cast<NDArray &>(outputs[0]).Reorder2Default();
+    return;
+  } else {
+    // This happens if inputs are supposed to be in MKLDNN format
+    // but MKLDNN doesn't support the data type or the shape. We're
+    // forced to convert it to the default format.
+    FallBackCompute(UnaryOp::IdentityCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    return;
+  }
+#endif
+}
+
+static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs,
+                                   const int dev_mask,
+                                   DispatchMode* dispatch_mode,
+                                   std::vector<int> *in_attrs,
+                                   std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1);
+  CHECK_EQ(out_attrs->size(), 1);
+  bool ret = ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask, dispatch_mode,
+                                                            in_attrs, out_attrs);
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask
+      && in_attrs->at(0) == kDefaultStorage
+      && out_attrs->at(0) == kDefaultStorage) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  return ret;
+}
 
 NNVM_REGISTER_OP(Flatten)
 .add_alias("flatten")
@@ -210,8 +257,15 @@ Example::
 .set_num_outputs(1)
 .set_attr<nnvm::FInferShape>("FInferShape", FlattenShape)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<FInferStorageType>("FInferStorageType", FlattenStorageType)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{ "_backward_copy" })
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", FlattenEx)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
 .set_attr<nnvm::FInplaceOption>("FInplaceOption",
   [](const NodeAttrs& attrs) {
     return std::vector<std::pair<int, int> >{{0, 0}};
diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h
index f0dd61f01ac0..52df4dd2bbc8 100644
--- a/src/storage/cpu_device_storage.h
+++ b/src/storage/cpu_device_storage.h
@@ -54,7 +54,13 @@ class CPUDeviceStorage {
   /*!
    * \brief Alignment of allocation.
    */
+#if MXNET_USE_MKLDNN == 1
+  // MKLDNN requires special alignment. 4096 is used by the MKLDNN library in
+  // memory allocation.
+  static constexpr size_t alignment_ = 4096;
+#else
   static constexpr size_t alignment_ = 16;
+#endif
 };  // class CPUDeviceStorage
 
 inline void* CPUDeviceStorage::Alloc(size_t size) {
diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh
index 794a4c55ee45..1d6d64be3862 100755
--- a/tests/ci_build/ci_build.sh
+++ b/tests/ci_build/ci_build.sh
@@ -178,6 +178,7 @@ ${DOCKER_BINARY} run --rm --pid=host \
     -e "CI_BUILD_GID=$(id -g)" \
     -e "CUDA_ARCH=-gencode arch=compute_52,code=[sm_52,compute_52] --fatbin-options -compress-all" \
     -e "MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0" \
+    -e "ARCH_OPT=-mavx2" \
     ${CI_DOCKER_EXTRA_PARAMS[@]} \
     ${DOCKER_IMG_NAME} \
     ${PRE_COMMAND} \
diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h
index 6a220bdad6d7..570911c23568 100644
--- a/tests/cpp/include/test_core_op.h
+++ b/tests/cpp/include/test_core_op.h
@@ -209,6 +209,13 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer<DType>
           requested.emplace_back(r);
         } else if (req.type == ResourceRequest::kRandom) {
           requested.emplace_back(ResourceManager::Get()->Request(ctx->run_ctx.ctx, req));
+        } else if (req.type == ResourceRequest::kParallelRandom) {
+          Resource rm = ResourceManager::Get()->Request(ctx->run_ctx.ctx, req);
+          if (ctx->run_ctx.ctx.dev_mask() == Context::kCPU) {
+            common::random::RandGenerator<cpu, DType>::AllocState(
+                rm.get_parallel_random<cpu, DType>());
+          }
+          requested.emplace_back(rm);
         } else {
           LOG(FATAL) << "resource type not yet supported";
         }
@@ -314,7 +321,9 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer<DType>
       // Set up forward
       attrs_ = ParseAttrs(op_, args);
 
-      const int num_inputs = op_->num_inputs;
+      int num_inputs = op_->num_inputs;
+      if (op_->get_num_inputs)
+        num_inputs = op_->get_num_inputs(attrs_);
 
       if (!inputs.empty()) {
         CHECK_EQ(inputs.size(), static_cast<size_t>(num_inputs));
@@ -340,8 +349,8 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer<DType>
 
       inputs_.reserve(num_inputs);
       inputs_p.reserve(num_inputs);
-      outputs_.reserve(num_visible_outputs);
-      outputs_p.reserve(num_visible_outputs);
+      outputs_.reserve(inferred_num_outputs);
+      outputs_p.reserve(inferred_num_outputs);
 
       for (size_t i = 0; i < static_cast<size_t>(num_inputs); ++i) {
         CHECK_LT(i, static_cast<int>(shapes.size()));
@@ -350,7 +359,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer<DType>
         inputs_p.emplace_back(&*inputs_.rbegin());
       }
 
-      for (size_t i = 0; i < static_cast<size_t>(num_visible_outputs); ++i) {
+      for (size_t i = 0; i < static_cast<size_t>(inferred_num_outputs); ++i) {
         // If supplied and valid, pass from the supplied outputs vector
         // Otherwise use empty for forward pass, or zero-filled for backward pass
         outputs_.emplace_back(i < outputs.size()
diff --git a/tests/cpp/include/test_op_runner.h b/tests/cpp/include/test_op_runner.h
index 0992c41f760e..1e00e30a1b34 100644
--- a/tests/cpp/include/test_op_runner.h
+++ b/tests/cpp/include/test_op_runner.h
@@ -137,7 +137,8 @@ class OperatorRunner {
              const test::op::kwargs_t& kwargs,
              int dim = 0,
              size_t count = 1,
-             const std::vector<TShape>& timing_shapes = {}) {
+             const std::vector<TShape>& timing_shapes = {},
+             bool backward = true) {
     if (mxnet::test::quick_test) {
       total_iterations_ = 2;
       count = 1;
@@ -225,7 +226,7 @@ class OperatorRunner {
           CHECK(false) << "Unsupported dimension count: " << (D + 1);
       }
       if (info.executor_) {
-        if (info.executor_->HasBackward()) {
+        if (info.executor_->HasBackward() && backward) {
           RunGenericOperatorBackward(&info, count);
         }
         timing += info.executor_->GetTiming();
diff --git a/tests/cpp/operator/activation_perf.cc b/tests/cpp/operator/activation_perf.cc
index e482848705ad..1bd8ca89c9f5 100644
--- a/tests/cpp/operator/activation_perf.cc
+++ b/tests/cpp/operator/activation_perf.cc
@@ -26,7 +26,7 @@
 #include <gtest/gtest.h>
 #include <mxnet/tensor_blob.h>
 #include "../include/test_op_runner.h"
-#include "../include/test_legacy_op.h"
+#include "../include/test_core_op.h"
 #include "../../src/operator/nn/activation-inl.h"
 
 using namespace mxnet;
@@ -41,8 +41,10 @@ TEST(ACTIVATION_PERF, ExecuteBidirectional) {
   TShape shape({5, 5});
   kwargs_t kwargs = basic_activation_args;
   kwargs.push_back({"act_type", "tanh"});
-  test::op::LegacyOpRunner<mxnet::op::ActivationProp, float, float> runner;
-  runner.RunBidirectional(false, { shape }, kwargs, 1);
+
+  test::op::CoreOperatorRunner<float> runner;
+  runner.RunBidirectional(false, { shape }, test::op::CoreOpExecutor<float>::ArgsWithOpName(
+          kwargs, "Activation", "_backward_Activation"), 1);
 }
 
 /*!
@@ -52,10 +54,12 @@ TEST(ACTIVATION_PERF, TimingCPU) {
   kwargs_t kwargs = basic_activation_args;
   // Which math function is arbitrary since it will have roughly constant timing among approaches
   kwargs.push_back({"act_type", "tanh"});
-  test::op::LegacyOpRunner<mxnet::op::ActivationProp, float, float> runner;
-  runner.RunBidirectional(false,
-                          { TShape({10, 10, 10, 10}) },
-                          kwargs, 1);  // prime code and cache
+  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Activation",
+                                                           "_backward_Activation");
+  TShape shape({10, 10, 10, 10});
+  test::op::CoreOperatorRunner<float> runner;
+  runner.RunBidirectional(false, { shape }, kwargs, 1);
+
   std::vector <TShape> shapes;
   if (test::performance_run) {
     shapes = {
@@ -84,11 +88,11 @@ TEST(ACTIVATION_PERF, TimingGPU) {
   kwargs_t kwargs = basic_activation_args;
   // Which math function is arbitrary since it will have roughly constant timing among approaches
   kwargs.push_back({"act_type", "tanh"});
-  test::OperatorRunner<mxnet::op::ActivationProp,
-    test::op::LegacyOperatorExecutor<float, float>> runner;
-  runner.RunBidirectional(true,
-                          { TShape({10, 10, 10, 10}) },
-                          kwargs, 1);  // prime code and cache
+  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Activation",
+                                                           "_backward_Activation");
+  TShape shape({10, 10, 10, 10});
+  test::op::CoreOperatorRunner<float> runner;
+  runner.RunBidirectional(true, { shape }, kwargs, 1);
   std::vector <TShape> shapes = {
       {1,  1, 28,  28},
       {1,  3, 28,  28},
diff --git a/tests/cpp/operator/batchnorm_test.cc b/tests/cpp/operator/batchnorm_test.cc
index 179e42a3830f..607b9804684a 100644
--- a/tests/cpp/operator/batchnorm_test.cc
+++ b/tests/cpp/operator/batchnorm_test.cc
@@ -24,11 +24,14 @@
  * \author Chris Olivier
 */
 
+#if 0
+
 #include <dmlc/logging.h>
 #include <mxnet/tensor_blob.h>
 #include "../../src/operator/nn/batch_norm-inl.h"
 #include "../../src/operator/batch_norm_v1-inl.h"
 #include "./test_legacy_op.h"
+#include "./test_core_op.h"
 #include "executor/exec_pass.h"
 
 using namespace mxnet;
@@ -1827,3 +1830,5 @@ TEST(BATCH_NORM, Test2DBackwardMixedComplex_gpu_cpu_ugs) {
 }
 
 #endif  // MXNET_USE_CUDA
+
+#endif
diff --git a/tests/cpp/operator/dropout_perf.cc b/tests/cpp/operator/dropout_perf.cc
index 90bf6ebb0dfd..c28b9bd48097 100644
--- a/tests/cpp/operator/dropout_perf.cc
+++ b/tests/cpp/operator/dropout_perf.cc
@@ -26,7 +26,7 @@
 #include <gtest/gtest.h>
 #include <mxnet/tensor_blob.h>
 #include "../include/test_op_runner.h"
-#include "../include/test_legacy_op.h"
+#include "../include/test_core_op.h"
 #include "../../src/operator/nn/dropout-inl.h"
 
 using namespace mxnet;
@@ -41,8 +41,10 @@ TEST(DROPOUT_PERF, ExecuteBidirectional) {
   TShape shape({5, 5});
   kwargs_t kwargs = basic_dropout_args;
   kwargs.push_back({"mode", "always"});
-  test::op::LegacyOpRunner<mxnet::op::DropoutProp, float, float> runner;
-  runner.RunBidirectional(false, { shape }, kwargs, 1);
+  test::op::CoreOperatorRunner<float> runner;
+  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
+                                                           "_backward_Dropout");
+  runner.RunGenericOperatorForward(false, { shape }, kwargs, 1);
 }
 
 /*!
@@ -52,10 +54,11 @@ TEST(DROPOUT_PERF, TimingCPU) {
   kwargs_t kwargs = basic_dropout_args;
 // Which math function is arbitrary since it will have roughly constant timing among approaches
   kwargs.push_back({"mode", "always"});
-  test::op::LegacyOpRunner<mxnet::op::DropoutProp, float, float> runner;
-  runner.RunBidirectional(false,
-                          { TShape({10, 10, 10, 10}) },
-                          kwargs, 1);  // prime code and cache
+  TShape shape({10, 10, 10, 10});
+  test::op::CoreOperatorRunner<float> runner;
+  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
+                                                           "_backward_Dropout");
+  runner.RunGenericOperatorForward(false, { shape }, kwargs, 1);
   std::vector <TShape> shapes;
   if (test::performance_run) {
     shapes = {
@@ -72,7 +75,9 @@ TEST(DROPOUT_PERF, TimingCPU) {
     };
   }
   for (const TShape &shape : shapes) {
-    runner.TimingTest("Dropout Operator CPU", false, false, kwargs, 2, 10, { shape });
+    kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
+                                                             "_backward_Dropout");
+    runner.TimingTest("Dropout Operator CPU", false, false, kwargs, 2, 10, { shape }, false);
   }
 }
 
@@ -84,11 +89,11 @@ TEST(DROPOUT_PERF, TimingGPU) {
   kwargs_t kwargs = basic_dropout_args;
   // Which math function is arbitrary since it will have roughly constant timing among approaches
   kwargs.push_back({"mode", "always"});
-  test::OperatorRunner<mxnet::op::DropoutProp,
-    test::op::LegacyOperatorExecutor<float, float>> runner;
-  runner.RunBidirectional(true,
-                          { TShape({10, 10, 10, 10}) },
-                          kwargs, 1);  // prime code and cache
+  TShape shape({10, 10, 10, 10});
+  test::op::CoreOperatorRunner<float> runner;
+  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
+                                                           "_backward_Dropout");
+  runner.RunGenericOperatorForward(true, { shape }, kwargs, 1);
   std::vector <TShape> shapes = {
     {1,  1, 28,  28},
     {1,  3, 28,  28},
@@ -97,8 +102,9 @@ TEST(DROPOUT_PERF, TimingGPU) {
     {20, 3, 128, 128}
   };
   for (const TShape &shape : shapes) {
-    runner.TimingTest("Dropout Operator GPU", true, false, kwargs, 2, 10, { shape });
+    kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "Dropout",
+                                                             "_backward_Dropout");
+    runner.TimingTest("Dropout Operator GPU", true, false, kwargs, 2, 10, { shape }, false);
   }
 }
 #endif  // MXNET_USE_CUDA == 1
-
diff --git a/tests/cpp/operator/fully_conn_perf.cc b/tests/cpp/operator/fully_conn_perf.cc
index c8d8021f6f6e..829c20385ab5 100644
--- a/tests/cpp/operator/fully_conn_perf.cc
+++ b/tests/cpp/operator/fully_conn_perf.cc
@@ -28,21 +28,25 @@
 #include <mxnet/tensor_blob.h>
 #include "../../src/operator/nn/fully_connected-inl.h"
 #include "../include/test_op_runner.h"
-#include "../include/test_legacy_op.h"
+#include "../include/test_core_op.h"
 
 using namespace mxnet;
 
 typedef std::vector<std::pair<std::string, std::string> > kwargs_t;
 
-const kwargs_t basic_fullyconn_args = { {"num_hidden", "250"} };
+const kwargs_t basic_fullyconn_args = { {"num_hidden", "250"}, {"no_bias", "true"} };
 /*!
  * \brief Generic bidirectional sanity test
  */
 TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) {
-  TShape shape({5, 5});
+  TShape shape1({5, 5});
+  TShape shape2({250, 5});
   kwargs_t kwargs = basic_fullyconn_args;
-  test::op::LegacyOpRunner<mxnet::op::FullyConnectedProp, float, float> runner;
-  runner.RunBidirectional(false, { shape }, kwargs, 1);
+  test::op::CoreOperatorRunner<float> runner;
+  runner.set_verbose(true);
+  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected",
+                                                           "_backward_FullyConnected");
+  runner.RunGenericOperatorForward(false, { shape1, shape2 }, kwargs, 1);
 }
 
 /*!
@@ -50,10 +54,12 @@ TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) {
  */
 TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) {
   kwargs_t kwargs = basic_fullyconn_args;
-  test::op::LegacyOpRunner<mxnet::op::FullyConnectedProp, float, float> runner;
-  runner.RunBidirectional(false,
-                          { TShape({10, 10, 10, 10}) },
-                          kwargs, 1);  // prime code and cache
+  TShape shape1({10, 10, 10, 10});
+  TShape shape2({250, 1000});
+  test::op::CoreOperatorRunner<float> runner;
+  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected",
+                                                           "_backward_FullyConnected");
+  runner.RunGenericOperatorForward(false, { shape1, shape2 }, kwargs, 1);
   std::vector <TShape> shapes;
   if (test::performance_run) {
     shapes = {
@@ -70,7 +76,11 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) {
     };
   }
   for (const TShape& shape : shapes) {
-    runner.TimingTest("Fully connected CPU", false, false, kwargs, 2, 10, { shape });
+    TShape shape2({250, shape.ProdShape(1, shape.ndim())});
+    kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected",
+                                                             "_backward_FullyConnected");
+    runner.TimingTest("Fully connected CPU", false, false, kwargs, 2, 10,
+                      { shape, shape2 }, false);
   }
 }
 
@@ -80,12 +90,12 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) {
  */
 TEST(FULLY_CONNECTED, FullyConnectedTimingGPU) {
   kwargs_t kwargs = basic_fullyconn_args;
-  test::OperatorRunner<mxnet::op::FullyConnectedProp,
-    test::op::LegacyOperatorExecutor<float, float>>
-    runner;
-  runner.RunBidirectional(true,
-                          { TShape({10, 10, 10, 10}) },
-                          kwargs, 1);  // prime code and cache
+  TShape shape1({10, 10, 10, 10});
+  TShape shape2({250, 1000});
+  test::op::CoreOperatorRunner<float> runner;
+  kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected",
+                                                           "_backward_FullyConnected");
+  runner.RunGenericOperatorForward(true, { shape1, shape2 }, kwargs, 1);
   std::vector <TShape> shapes;
   if (test::performance_run) {
     shapes = {
@@ -102,7 +112,11 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingGPU) {
     };
   }
   for (const TShape& shape : shapes) {
-    runner.TimingTest("Fully connected GPU", true, false, kwargs, 2, 10, { shape });
+    TShape shape2({250, shape.ProdShape(1, shape.ndim())});
+    kwargs = test::op::CoreOpExecutor<float>::ArgsWithOpName(kwargs, "FullyConnected",
+                                                             "_backward_FullyConnected");
+    runner.TimingTest("Fully connected GPU", true, false, kwargs, 2, 10,
+                      { shape, shape2 }, false);
   }
 }
 #endif  // MXNET_USE_CUDA == 1
diff --git a/tests/cpp/operator/mkldnn.cc b/tests/cpp/operator/mkldnn.cc
new file mode 100644
index 000000000000..a8a3d26fac3d
--- /dev/null
+++ b/tests/cpp/operator/mkldnn.cc
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  \file mkldnn.cc
+ *  \brief test functions in mkldnn.
+ *  \author Da Zheng
+ */
+
+#if MXNET_USE_MKLDNN == 1
+
+#include "gtest/gtest.h"
+#include "../../src/operator/nn/mkldnn/mkldnn_base-inl.h"
+
+bool test_mem_align(void *mem, size_t size, size_t alignment, size_t space) {
+  void *ret1, *ret2;
+  size_t space1, space2;
+  space1 = space;
+  space2 = space;
+  ret1 = mxnet::AlignMem(mem, size, alignment, &space1);
+  ret2 = std::align(alignment, size, mem, space2);
+  EXPECT_EQ(ret1, ret2);
+  EXPECT_EQ(space1, space2);
+  return ret1 == ret2;
+}
+
+TEST(MKLDNN_UTIL_FUNC, AlignMem) {
+  size_t alignment = 4096;
+  void *mem;
+  size_t size, space;
+
+  // When mem has been aligned.
+  mem = reinterpret_cast<void *>(0x10000);
+  size = 1000;
+  space = 10000;
+  test_mem_align(mem, size, alignment, space);
+
+  // When mem isn't aligned and we have enough space for alignment.
+  mem = reinterpret_cast<void *>(0x10010);
+  size = 1000;
+  space = 10000;
+  test_mem_align(mem, size, alignment, space);
+
+  // When mem isn't aligned and we don't have enough memory for alignment
+  mem = reinterpret_cast<void *>(0x10010);
+  size = 1000;
+  space = 1001;
+  test_mem_align(mem, size, alignment, space);
+
+  for (size_t i = 0; i < 10000; i++) {
+    mem = reinterpret_cast<void *>(random());
+    size = random() % 2000;
+    space = random() % 2000;
+    test_mem_align(mem, size, alignment, space);
+  }
+}
+#endif
diff --git a/tests/python/gpu/test_gluon_model_zoo_gpu.py b/tests/python/gpu/test_gluon_model_zoo_gpu.py
new file mode 100644
index 000000000000..bc35b0b32327
--- /dev/null
+++ b/tests/python/gpu/test_gluon_model_zoo_gpu.py
@@ -0,0 +1,163 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import print_function
+import mxnet as mx
+import numpy as np
+import copy
+from mxnet import autograd
+from mxnet.gluon.model_zoo.vision import get_model
+from mxnet.test_utils import assert_almost_equal
+import sys
+
+def eprint(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
+
+VAL_DATA='data/val-5k-256.rec'
+def download_data():
+    return mx.test_utils.download(
+        'http://data.mxnet.io/data/val-5k-256.rec', VAL_DATA)
+
+def test_inference():
+    all_models = ['resnet50_v1', 'vgg19_bn', 'alexnet', #'inceptionv3',
+                  'densenet201', 'squeezenet1.0', 'mobilenet0.25']
+
+    batch_size = 10
+    download_data()
+    for model_name in all_models:
+        eprint('testing inference on %s'%model_name)
+
+        data_shape = (3, 224, 224) if 'inception' not in model_name else (3, 299, 299)
+        dataIter = mx.io.ImageRecordIter(
+            path_imgrec        = VAL_DATA,
+            label_width        = 1,
+            preprocess_threads = 1,
+            batch_size         = batch_size,
+            data_shape         = data_shape,
+            label_name         = 'softmax_label',
+            rand_crop          = False,
+            rand_mirror        = False)
+        data_batch = dataIter.next()
+        data = data_batch.data[0]
+        label = data_batch.label[0]
+        gpu_data = data.as_in_context(mx.gpu())
+        gpu_label = label.as_in_context(mx.gpu())
+
+        # This is to create a model and run the model once to initialize
+        # all parameters.
+        cpu_model = get_model(model_name)
+        cpu_model.collect_params().initialize(ctx=mx.cpu())
+        cpu_model(mx.nd.array(data, ctx=mx.cpu()))
+        gpu_model = get_model(model_name)
+        gpu_model.collect_params().initialize(ctx=mx.gpu())
+        gpu_model(mx.nd.array(data, ctx=mx.gpu()))
+
+        # Force the two models have the same parameters.
+        cpu_params = cpu_model.collect_params()
+        gpu_params = gpu_model.collect_params()
+        for k in cpu_params.keys():
+            k = k.replace(cpu_params.prefix, '')
+            cpu_param = cpu_params.get(k)
+            gpu_param = gpu_params.get(k)
+            gpu_param.set_data(cpu_param.data().as_in_context(mx.gpu()))
+
+        # Run inference.
+        with autograd.record(train_mode=False):
+            cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu()))
+            gpu_out = gpu_model(gpu_data)
+        out = cpu_out.asnumpy()
+        max_val = np.max(out)
+        assert_almost_equal(out / max_val, gpu_out.asnumpy() / max_val, rtol=1e-2, atol=1e-2)
+
+def get_nn_model(name):
+    if "densenet" in name:
+        return get_model(name, dropout=0)
+    else:
+        return get_model(name)
+
+def test_training():
+    # We use network models without dropout for testing.
+    # TODO(zhengda) mobilenet can't pass this test even without MKLDNN.
+    all_models = ['resnet18_v1', 'densenet121']
+
+    batch_size = 10
+    label = mx.nd.random.uniform(low=0, high=10, shape=(batch_size)).astype('int32')
+
+    download_data()
+    dataIter = mx.io.ImageRecordIter(
+        path_imgrec        = VAL_DATA,
+        label_width        = 1,
+        preprocess_threads = 1,
+        batch_size         = batch_size,
+        data_shape         = (3, 224, 224),
+        label_name         = 'softmax_label',
+        rand_crop          = False,
+        rand_mirror        = False)
+    data_batch = dataIter.next()
+    data = data_batch.data[0]
+    label = data_batch.label[0]
+    gpu_data = data.as_in_context(mx.gpu())
+    gpu_label = label.as_in_context(mx.gpu())
+    softmax_cross_entropy = mx.gluon.loss.SoftmaxCrossEntropyLoss()
+
+    for model_name in all_models:
+        eprint('testing %s'%model_name)
+        #data = mx.nd.random.uniform(shape=(100, 3, 224, 224))
+
+        # This is to create a model and run the model once to initialize
+        # all parameters.
+        cpu_model = get_nn_model(model_name)
+        cpu_model.collect_params().initialize(ctx=mx.cpu())
+        cpu_model(mx.nd.array(data, ctx=mx.cpu()))
+        gpu_model = get_nn_model(model_name)
+        gpu_model.collect_params().initialize(ctx=mx.gpu())
+        gpu_model(mx.nd.array(data, ctx=mx.gpu()))
+
+        # Force the two models have the same parameters.
+        cpu_params = cpu_model.collect_params()
+        gpu_params = gpu_model.collect_params()
+        for k in cpu_params.keys():
+            k = k.replace(cpu_params.prefix, '')
+            cpu_param = cpu_params.get(k)
+            gpu_param = gpu_params.get(k)
+            gpu_param.set_data(cpu_param.data().as_in_context(mx.gpu()))
+
+        cpu_trainer = mx.gluon.Trainer(cpu_params, 'sgd', {'learning_rate': 0.1})
+        gpu_trainer = mx.gluon.Trainer(gpu_params, 'sgd', {'learning_rate': 0.1})
+
+        # Run forward and backward once.
+        with autograd.record():
+            cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu()))
+            gpu_out = gpu_model(gpu_data)
+            cpu_loss = softmax_cross_entropy(cpu_out, label)
+            gpu_loss = softmax_cross_entropy(gpu_out, gpu_label)
+        assert_almost_equal(cpu_out.asnumpy(), gpu_out.asnumpy(), rtol=1e-2, atol=1e-2)
+        cpu_loss.backward()
+        gpu_loss.backward()
+        cpu_trainer.step(batch_size)
+        gpu_trainer.step(batch_size)
+
+        # Compare the parameters of the two models.
+        for k in cpu_params.keys():
+            k = k.replace(cpu_params.prefix, '')
+            cpu_param = cpu_params.get(k)
+            gpu_param = gpu_params.get(k)
+            assert_almost_equal(cpu_param.data().asnumpy(), gpu_param.data().asnumpy(), rtol=1e-2, atol=1e-2)
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 55bb30cc7d6a..5ae489529c33 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -987,6 +987,13 @@ def test_activation_with_type():
     check_consistency(sym, ctx_list)
 
 
+def test_lrn():
+    sym = mx.sym.LRN(alpha=0.0001, beta=0.75, knorm=2, nsize=5, name='lrn')
+    ctx_list = [{'ctx': mx.gpu(0), 'lrn_data': (2, 6, 10, 10), 'type_dict': {'lrn_data': np.float32}},
+                {'ctx': mx.cpu(0), 'lrn_data': (2, 6, 10, 10), 'type_dict': {'lrn_data': np.float32}}]
+    check_consistency(sym, ctx_list)
+
+
 def test_embedding_with_type():
     def test_embedding_helper(data_types, weight_types, low_pad, high_pad):
         NVD = [[20, 10, 20], [200, 10, 300]]