diff --git a/.gitmodules b/.gitmodules
index 170c105a6f48..eec87413d859 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -22,3 +22,7 @@
 [submodule "3rdparty/googletest"]
 	path = 3rdparty/googletest
 	url = https://github.com/google/googletest.git
+[submodule "3rdparty/mkldnn"]
+	path = 3rdparty/mkldnn
+	url = https://github.com/01org/mkl-dnn.git
+	branch = master
diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn
new file mode 160000
index 000000000000..82cf37f626a0
--- /dev/null
+++ b/3rdparty/mkldnn
@@ -0,0 +1 @@
+Subproject commit 82cf37f626a0998d38f99c30e5a08a0dd5e49bc0
diff --git a/Jenkinsfile b/Jenkinsfile
index aee091f0d82b..ea697402667e 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -4,6 +4,7 @@
 
 // mxnet libraries
 mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a'
+mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_gnu.so, lib/libmkldnn.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a'
 // command to start a docker container
 docker_run = 'tests/ci_build/ci_build.sh'
 // timeout in minutes
@@ -120,7 +121,7 @@ def python3_gpu_ut(docker_type) {
 }
 
 // Python 2
-def python2_mklml_ut(docker_type) {
+def python2_mkldnn_ut(docker_type) {
   timeout(time: max_time, unit: 'MINUTES') {
     sh "${docker_run} ${docker_type} find . -name '*.pyc' -type f -delete"
     sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/cpu"
@@ -128,7 +129,7 @@ def python2_mklml_ut(docker_type) {
 }
 
 // Python 3
-def python3_mklml_ut(docker_type) {
+def python3_mkldnn_ut(docker_type) {
   timeout(time: max_time, unit: 'MINUTES') {
     sh "${docker_run} ${docker_type} find . -name '*.pyc' -type f -delete"
     sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/cpu"
@@ -166,42 +167,40 @@ try {
         }
       }
     },
-    'CPU: MKLML': {
+    'CPU: MKLDNN': {
       node('mxnetlinux-cpu') {
-        ws('workspace/build-mklml-cpu') {
+        ws('workspace/build-mkldnn-cpu') {
           init_git()
           def flag = """ \
             DEV=1                         \
             USE_PROFILER=1                \
             USE_CPP_PACKAGE=1             \
             USE_BLAS=openblas             \
-            USE_MKL2017=1                 \
-            USE_MKL2017_EXPERIMENTAL=1    \
+            USE_MKLDNN=1                  \
             -j\$(nproc)
             """
-          make("cpu_mklml", flag)
-          pack_lib('mklml_cpu')
+          make("cpu_mkldnn", flag)
+          pack_lib('mkldnn_cpu', mx_mkldnn_lib)
         }
       }
     },
-    'GPU: MKLML': {
+    'GPU: MKLDNN': {
       node('mxnetlinux-cpu') {
-        ws('workspace/build-mklml-gpu') {
+        ws('workspace/build-mkldnn-gpu') {
           init_git()
           def flag = """ \
             DEV=1                         \
             USE_PROFILER=1                \
             USE_CPP_PACKAGE=1             \
             USE_BLAS=openblas             \
-            USE_MKL2017=1                 \
-            USE_MKL2017_EXPERIMENTAL=1    \
+            USE_MKLDNN=1                  \
             USE_CUDA=1                    \
             USE_CUDA_PATH=/usr/local/cuda \
             USE_CUDNN=1                   \
             -j\$(nproc)
             """
           make("build_cuda", flag)
-          pack_lib('mklml_gpu')
+          pack_lib('mkldnn_gpu', mx_mkldnn_lib)
         }
       }
     },
@@ -344,43 +343,43 @@ try {
         }
       }
     },
-    'Python2: MKLML-CPU': {
+    'Python2: MKLDNN-CPU': {
       node('mxnetlinux-cpu') {
-        ws('workspace/ut-python2-mklml-cpu') {
+        ws('workspace/ut-python2-mkldnn-cpu') {
           init_git()
-          unpack_lib('mklml_cpu')
-          python2_ut('cpu_mklml')
-          python2_mklml_ut('cpu_mklml')
+          unpack_lib('mkldnn_cpu', mx_mkldnn_lib)
+          python2_ut('cpu_mkldnn')
+          python2_mkldnn_ut('cpu_mkldnn')
         }
       }
     },
-    'Python2: MKLML-GPU': {
+    'Python2: MKLDNN-GPU': {
       node('mxnetlinux-gpu') {
-        ws('workspace/ut-python2-mklml-gpu') {
+        ws('workspace/ut-python2-mkldnn-gpu') {
           init_git()
-          unpack_lib('mklml_gpu')
-          python2_gpu_ut('gpu_mklml')
-          python2_mklml_ut('gpu_mklml')
+          unpack_lib('mkldnn_gpu', mx_mkldnn_lib)
+          python2_gpu_ut('gpu_mkldnn')
+          python2_mkldnn_ut('gpu_mkldnn')
         }
       }
     },
-    'Python3: MKLML-CPU': {
+    'Python3: MKLDNN-CPU': {
       node('mxnetlinux-cpu') {
-        ws('workspace/ut-python3-mklml-cpu') {
+        ws('workspace/ut-python3-mkldnn-cpu') {
           init_git()
-          unpack_lib('mklml_cpu')
-          python3_ut('cpu_mklml')
-          python3_mklml_ut('cpu_mklml')
+          unpack_lib('mkldnn_cpu', mx_mkldnn_lib)
+          python3_ut('cpu_mkldnn')
+          python3_mkldnn_ut('cpu_mkldnn')
         }
       }
     },
-    'Python3: MKLML-GPU': {
+    'Python3: MKLDNN-GPU': {
       node('mxnetlinux-gpu') {
-        ws('workspace/ut-python3-mklml-gpu') {
+        ws('workspace/ut-python3-mkldnn-gpu') {
           init_git()
-          unpack_lib('mklml_gpu')
-          python3_gpu_ut('gpu_mklml')
-          python3_mklml_ut('gpu_mklml')
+          unpack_lib('mkldnn_gpu', mx_mkldnn_lib)
+          python3_gpu_ut('gpu_mkldnn')
+          python3_mkldnn_ut('gpu_mkldnn')
         }
       }
     },
diff --git a/Makefile b/Makefile
index 8584ab658e51..499dd905baa9 100644
--- a/Makefile
+++ b/Makefile
@@ -40,11 +40,11 @@ endif
 # use customized config file
 include $(config)
 
-ifeq ($(USE_MKL2017), 1)
-# must run ./prepare_mkl before including mshadow.mk
-	RETURN_STRING := $(shell ./prepare_mkl.sh $(MKLML_ROOT))
-	MKLROOT := $(firstword $(RETURN_STRING))
-	export USE_MKLML = $(lastword $(RETURN_STRING))
+ifeq ($(USE_MKLDNN), 1)
+	RETURN_STRING := $(shell ./prepare_mkldnn.sh $(MKLDNN_ROOT))
+	MKLDNNROOT := $(firstword $(RETURN_STRING))
+	MKLROOT := $(lastword $(RETURN_STRING))
+	export USE_MKLML = 1
 endif
 
 include mshadow/make/mshadow.mk
@@ -112,23 +112,20 @@ ifeq ($(USE_NNPACK), 1)
 	LDFLAGS += -lnnpack
 endif
 
-ifeq ($(USE_MKL2017), 1)
-	CFLAGS += -DMXNET_USE_MKL2017=1
+ifeq ($(USE_MKLDNN), 1)
+	CFLAGS += -DMXNET_USE_MKLDNN=1
 	CFLAGS += -DUSE_MKL=1
-	CFLAGS += -I$(ROOTDIR)/src/operator/mkl/
-	CFLAGS += -I$(MKLML_ROOT)/include
-	LDFLAGS += -L$(MKLML_ROOT)/lib
-	ifeq ($(USE_MKL2017_EXPERIMENTAL), 1)
-		CFLAGS += -DMKL_EXPERIMENTAL=1
-	else
-		CFLAGS += -DMKL_EXPERIMENTAL=0
-	endif
-	ifeq ($(UNAME_S), Darwin)
-		LDFLAGS += -lmklml
-	else
-		LDFLAGS += -Wl,--as-needed -lmklml_intel -lmklml_gnu
+	CFLAGS += -I$(ROOTDIR)/src/operator/nn/mkldnn/
+	ifneq ($(MKLDNNROOT), $(MKLROOT))
+		CFLAGS += -I$(MKLROOT)/include
+		LDFLAGS += -L$(MKLROOT)/lib
 	endif
-	LDFLAGS +=  -liomp5
+	CFLAGS += -I$(MKLDNNROOT)/include
+	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
+endif
+
+ifeq ($(BN_DEBUG), 1)
+	CFLAGS += -DMXNET_BN_DEBUG=1
 endif
 
 ifeq ($(USE_OPERATOR_TUNING), 1)
@@ -142,7 +139,7 @@ endif
 #   -  for Ubuntu, installing atlas will not automatically install the atlas provided lapack library
 # silently switching lapack off instead of letting the build fail because of backward compatibility
 ifeq ($(USE_LAPACK), 1)
-ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas))
+ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl))
 ifeq (,$(wildcard /lib/liblapack.a))
 ifeq (,$(wildcard /usr/lib/liblapack.a))
 ifeq (,$(wildcard /usr/lib64/liblapack.a))
@@ -160,7 +157,7 @@ ifeq ($(USE_LAPACK), 1)
 	ifneq ($(USE_LAPACK_PATH), )
 		LDFLAGS += -L$(USE_LAPACK_PATH)
 	endif
-	ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas))
+	ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl))
 		LDFLAGS += -llapack
 	endif
 	CFLAGS += -DMXNET_USE_LAPACK
@@ -546,7 +543,8 @@ clean: cyclean $(EXTRA_PACKAGES_CLEAN)
 else
 clean: cyclean testclean $(EXTRA_PACKAGES_CLEAN)
 	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \
-		R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz
+		R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz \
+		external/mkldnn/install/*
 	cd $(DMLC_CORE); $(MAKE) clean; cd -
 	cd $(PS_PATH); $(MAKE) clean; cd -
 	cd $(NNVM_PATH); $(MAKE) clean; cd -
diff --git a/amalgamation/mxnet_predict0.cc b/amalgamation/mxnet_predict0.cc
index f35591d82b22..cfee60559501 100644
--- a/amalgamation/mxnet_predict0.cc
+++ b/amalgamation/mxnet_predict0.cc
@@ -66,7 +66,7 @@
 #include "src/operator/operator_util.cc"
 #include "src/operator/nn/activation.cc"
 #include "src/operator/nn/batch_norm.cc"
-#include "src/operator/concat.cc"
+#include "src/operator/nn/concat.cc"
 #include "src/operator/nn/convolution.cc"
 #include "src/operator/nn/deconvolution.cc"
 #include "src/operator/nn/dropout.cc"
diff --git a/example/image-classification/common/data.py b/example/image-classification/common/data.py
index dc8915cda4c8..05f5ddc4506e 100755
--- a/example/image-classification/common/data.py
+++ b/example/image-classification/common/data.py
@@ -112,7 +112,8 @@ def get_rec_iter(args, kv=None):
     image_shape = tuple([int(l) for l in args.image_shape.split(',')])
     if 'benchmark' in args and args.benchmark:
         data_shape = (args.batch_size,) + image_shape
-        train = SyntheticDataIter(args.num_classes, data_shape, 500, np.float32)
+        train = SyntheticDataIter(args.num_classes, data_shape,
+                args.num_examples / args.batch_size, np.float32)
         return (train, None)
     if kv:
         (rank, nworker) = (kv.rank, kv.num_workers)
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index 8398b7bf7291..4f6d295230e4 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -35,12 +35,13 @@
 #include <map>
 #include <string>
 #include <memory>
+#include <algorithm>
+#if MXNET_USE_MKLDNN == 1
+#include <mkldnn.hpp>
+#endif
 #include "./base.h"
 #include "./storage.h"
 #include "./engine.h"
-#if MKL_EXPERIMENTAL == 1
-#include <mkl_memory.h>
-#endif
 // check c++11
 #if DMLC_USE_CXX11 == 0
 #error "cxx11 was required for ndarray module"
@@ -61,6 +62,9 @@ enum NDArrayStorageType {
   kDefaultStorage,         // dense
   kRowSparseStorage,       // row sparse
   kCSRStorage,             // csr
+#if MXNET_USE_MKLDNN == 1
+  kMKLDNNStorage,          // MKLDNN
+#endif
 };
 
 enum NDArrayFormatErr {
@@ -72,6 +76,7 @@ enum NDArrayFormatErr {
   kRSPIdxErr,     // indices error for row sparse
 };
 
+class MKLDNNMemory;
 
 /*!
  * \brief ndarray interface
@@ -80,9 +85,6 @@ class NDArray {
  public:
   /*! \brief default constructor */
   NDArray() {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = MKLMemHolder::create();
-#endif
   }
   /*!
    * \brief constructs a new dynamic NDArray
@@ -96,56 +98,14 @@ class NDArray {
       : ptr_(std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype)),
         shape_(shape), dtype_(dtype), storage_type_(kDefaultStorage),
         entry_({nullptr, 0, 0}) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = std::make_shared<MKLMemHolder>();
-#endif
   }
   /*! \brief constructor for NDArray with storage type
    */
   NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx,
           bool delay_alloc = true, int dtype = mshadow::default_type_flag,
           std::vector<int> aux_types = {}, std::vector<TShape> aux_shapes = {},
-          TShape storage_shape = TShape(mshadow::Shape1(0)))
-      : shape_(shape), dtype_(dtype), storage_type_(stype),
-        entry_({nullptr, 0, 0}) {
-      // Assign default aux types if not given
-      if (aux_types.size() == 0) {
-        if (stype == kRowSparseStorage) {
-          aux_types = {mshadow::kInt64};
-        } else if (stype == kCSRStorage) {
-          aux_types = {mshadow::kInt64, mshadow::kInt64};
-        } else {
-          LOG(FATAL) << "Unknown storage type " << stype;
-        }
-      }
-      // Assign default shapes if not given
-      // unknown shapes are intialized as {0} such that Size() would return 0
-      if (aux_shapes.size() == 0) {
-        if (stype == kRowSparseStorage) {
-          aux_shapes = {TShape(mshadow::Shape1(0))};
-        } else if (stype == kCSRStorage) {
-          // aux shapes for indptr and indices
-          aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))};
-        } else {
-          LOG(FATAL) << "Unknown storage type " << stype;
-        }
-      }
-      if (storage_shape.Size() == 0) {
-        if (stype == kRowSparseStorage) {
-          storage_shape = shape;
-          storage_shape[0] = aux_shapes[rowsparse::kIdx][0];
-        } else if (stype == kCSRStorage) {
-          storage_shape = aux_shapes[csr::kIdx];
-        } else {
-          LOG(FATAL) << "Unknown storage type " << stype;
-        }
-      }
-      ptr_ = std::make_shared<Chunk>(stype, storage_shape, ctx, delay_alloc,
-                                     dtype, aux_types, aux_shapes);
-#if MKL_EXPERIMENTAL == 1
-      Mkl_mem_ = std::make_shared<MKLMemHolder>();
-#endif
-  }
+          TShape storage_shape = TShape(mshadow::Shape1(0)));
+
   /*!
    * \brief constructing a static NDArray that shares data with TBlob
    *  Use with caution: allocate ONLY ONE NDArray for each TBlob,
@@ -157,17 +117,11 @@ class NDArray {
       : ptr_(std::make_shared<Chunk>(data, dev_id)), shape_(data.shape_),
         dtype_(data.type_flag_), storage_type_(kDefaultStorage),
         entry_({nullptr, 0, 0}) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = std::make_shared<MKLMemHolder>();
-#endif
   }
   /*! \brief create ndarray from shared memory */
   NDArray(int shared_pid, int shared_id, const TShape& shape, int dtype)
       : ptr_(std::make_shared<Chunk>(shared_pid, shared_id, shape, dtype)), shape_(shape),
         dtype_(dtype), storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = std::make_shared<MKLMemHolder>();
-#endif
   }
 
   /*!
@@ -184,11 +138,17 @@ class NDArray {
           const TBlob &data, const std::vector<TBlob> &aux_data, int dev_id)
       : ptr_(std::make_shared<Chunk>(stype, data, aux_data, dev_id)), shape_(shape),
         dtype_(data.type_flag_), storage_type_(stype), entry_({nullptr, 0, 0}) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = std::make_shared<MKLMemHolder>();
-#endif
   }
 
+  inline bool IsView() const {
+    // Sparse arrays don't have a view.
+    if (storage_type() == kRowSparseStorage || storage_type() == kCSRStorage)
+      return false;
+    // If the array reuses memory, it's not a view.
+    if (reuse_)
+      return false;
+    return byte_offset_ > 0 || shape() != ptr_->storage_shape;
+  }
 
   /*!
    * \return the shape of current NDArray.
@@ -271,9 +231,6 @@ class NDArray {
             << "Unexpected storage type: " << stype;
       res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type);
     });
-#if MKL_EXPERIMENTAL == 1
-    res.Mkl_mem_ = Mkl_mem_;
-#endif
     return res;
   }
   /*!
@@ -531,15 +488,12 @@ class NDArray {
     CHECK_GE(ptr_->shandle.size,
              shape.Size() * mshadow::mshadow_sizeof(dtype))
         << "NDArray.AsArray: target memory size is bigger";
-#if MKL_EXPERIMENTAL == 1
-    if (Mkl_mem_ != nullptr) {
-      // convert prv to cpu
-      Mkl_mem_->check_and_prv_to_cpu(ptr_->shandle.dptr);
-    }
-#endif
+    // We can't reuse memory in a view.
+    CHECK(!IsView());
     NDArray ret = *this;
     ret.shape_ = shape;
     ret.dtype_ = dtype;
+    ret.reuse_ = true;
     return ret;
   }
   /*!
@@ -608,6 +562,65 @@ class NDArray {
              << "CheckAndAllocAuxData is not intended for kDefaultStorage";
     ptr_->CheckAndAllocAuxData(i, aux_shape);
   }
+
+#if MXNET_USE_MKLDNN == 1
+  bool IsMKLDNN() const {
+    return ptr_->IsMKLDNN();
+  }
+  bool IsDefault() const {
+    return ptr_->IsDefault();
+  }
+  /*
+   * All functions below return a raw pointer to mkldnn memory. Actually there
+   * is a shared pointer that hold the memory either in NDArray or in MKLDNN
+   * stream. As long as we call these functions inside an operator, the return
+   * memory is always valid.
+   */
+
+  /*
+   * This function returns mkldnn::memory with the default primitive_desc.
+   */
+  const mkldnn::memory *GetMKLDNNData() const;
+  /*
+   * This function returns mkldnn::memory with the given primitive_desc
+   * as long as the array size meets the required size in the given primitive_desc.
+   */
+  const mkldnn::memory *GetMKLDNNData(
+      const mkldnn::memory::primitive_desc &desc) const;
+  /*
+   * This function returns mkldnn::memory with the given primitive_desc.
+   * The returned mkldnn::memory will have the same physical layout as
+   * the given primitive_desc.
+   */
+  const mkldnn::memory *GetMKLDNNDataReorder(
+      const mkldnn::memory::primitive_desc &desc) const;
+
+  void CopyFrom(const mkldnn::memory &mem);
+  mkldnn::memory *CreateMKLDNNData(
+      const mkldnn::memory::primitive_desc &desc);
+
+  /*
+   * Reorder the memory to the specified layout.
+   */
+  void Reorder(const mkldnn::memory::primitive_desc &desc);
+  void Reorder2Default() {
+    CHECK_EQ(storage_type(), kDefaultStorage);
+    ptr_->Reorder2Default();
+  }
+
+  void InvalidateData() {
+    CHECK_EQ(storage_type(), kDefaultStorage);
+    // When we invalidate data, we don't need to care about the MKLDNN format.
+    ptr_->Mkl_mem_ = nullptr;
+  }
+
+  /*
+   * This function is used inside operators to reshape an array.
+   * It's used by FullyConnected right now.
+   */
+  NDArray ReshapeMKLDNN(const TShape &shape) const;
+#endif
+
   /*!
    * \brief Save list of ndarray into the Stream.x
    * \param fo The stream of output.
@@ -642,6 +655,12 @@ class NDArray {
                for csr, aux_handles[0] = indptr, aux_handles[1] = indices
     */
     std::vector<Storage::Handle> aux_handles;
+
+#if MXNET_USE_MKLDNN == 1
+    /*! This is created when data is stored in MKLDNN format.
+     */
+    std::shared_ptr<mkldnn::memory> Mkl_mem_;
+#endif
     /*! \brief variable from engine */
     Engine::VarHandle var;
     /*!
@@ -703,7 +722,7 @@ class NDArray {
         : static_data(false), delay_alloc(false) {
       var = Engine::Get()->NewVariable();
       ctx = Context::CPUShared(0);
-      shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);;
+      shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
       shandle.ctx = ctx;
       shandle.shared_pid = shared_pid;
       shandle.shared_id = shared_id;
@@ -778,6 +797,9 @@ class NDArray {
     inline void CheckAndAlloc(void) {
       if (delay_alloc) {
         shandle = Storage::Get()->Alloc(shandle.size, shandle.ctx);
+#if MXNET_USE_MKLDNN == 1
+        Mkl_mem_ = nullptr;
+#endif
         delay_alloc = false;
       }
     }
@@ -786,15 +808,22 @@ class NDArray {
     // size is the number of bytes
     void CheckAndAlloc(uint64_t dbytes) {
       CHECK_EQ(kDefaultStorage, storage_type)
-              << "CheckAndAlloc(dbytes) is not intended for kDefaultStorage";
+          << "CheckAndAlloc(dbytes) is not intended for kDefaultStorage";
+      dbytes = std::max(dbytes, shandle.size);
       if (delay_alloc) {
         shandle = Storage::Get()->Alloc(dbytes, shandle.ctx);
+#if MXNET_USE_MKLDNN == 1
+        Mkl_mem_ = nullptr;
+#endif
         delay_alloc = false;
       } else if (shandle.size < dbytes) {
         // free storage if necessary and alloc again
         if (shandle.size > 0) Storage::Get()->Free(shandle);
         // init storage
         shandle = Storage::Get()->Alloc(dbytes, shandle.ctx);
+#if MXNET_USE_MKLDNN == 1
+        Mkl_mem_ = nullptr;
+#endif
       }
     }
 
@@ -820,20 +849,24 @@ class NDArray {
     // storage shape is also updated
     // if data is already allocated, try reuse the storage. Otherwise, free the current one
     // and allocate new storage
-    inline void CheckAndAllocData(const TShape &shape, int dtype) {
-      CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data";
-      auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype);
-      if (shandle.size < dbytes) {
-        // free storage if necessary and alloc again
-        if (shandle.size > 0) Storage::Get()->Free(shandle);
-        // init storage
-        shandle = Storage::Get()->Alloc(dbytes, ctx);
-      }
-      // init shape
-      storage_shape = shape;
-      // delay_alloc is only set when data storage handle is present
-      delay_alloc = false;
+    void CheckAndAllocData(const TShape &shape, int dtype);
+
+#if MXNET_USE_MKLDNN == 1
+    // Have MKL memory reference to the data in the default storage
+    // or create memory for MKLDNN.
+    void SetMKLMem(const TShape &shape, int dtype);
+    void ResetMKLMem() {
+      // If Mkl_mem_ isn't referencing to shandle, we need to reset Mkl_mem_.
+      if (Mkl_mem_ && Mkl_mem_->get_data_handle() != shandle.dptr)
+        Mkl_mem_ = nullptr;
     }
+    // In the data is stored in MKLDNN layout, we reorder data in Mkl_mem_ and
+    // save the result in shandle.
+    void Reorder2Default();
+    bool IsMKLDNN() const;
+    bool IsDefault() const;
+#endif
+
     // create storage handle for aux data based on shape
     // this function assumes ctx, aux shapes and aux types are set
     // aux shape is also updated
@@ -859,45 +892,11 @@ class NDArray {
       set_aux_shape(i, shape);
     }
     /*! \brief destructor */
-    ~Chunk() {
-      bool skip_free = static_data || delay_alloc;
-      Storage::Handle h = this->shandle;
-      std::vector<Storage::Handle> aux_h = this->aux_handles;
-      Engine::Get()->DeleteVariable([h, aux_h, skip_free](RunContext s) {
-        if (skip_free == false) {
-          Storage::Get()->Free(h);
-          for (size_t i = 0; i < aux_h.size(); i++) {
-            if (aux_h[i].size > 0) Storage::Get()->Free(aux_h[i]);
-          }
-        }
-      }, shandle.ctx, var);
-    }
+    ~Chunk();
   };  // struct Chunk
 
-  void SetTBlob() const {
-    CHECK(ptr_ != nullptr);
-    TShape shape = shape_;
-    char *dptr = static_cast<char*>(ptr_->shandle.dptr);
-    auto stype = storage_type();
-    if (stype == kDefaultStorage) {
-      dptr += byte_offset_;
-    } else if (stype == kCSRStorage || stype == kRowSparseStorage) {
-      shape = storage_shape();
-    } else {
-      LOG(FATAL) << "unknown storage type " << stype;
-    }
-    tblob_.dptr_ = dptr;
-    tblob_.shape_ = shape;
-    tblob_.type_flag_ = dtype_;
-    tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id);
-#if MKL_EXPERIMENTAL == 1
-    tblob_.Mkl_mem_ = Mkl_mem_;
-#endif
-  }
+  void SetTBlob() const;
 
-#if MKL_EXPERIMENTAL == 1
-  std::shared_ptr<MKLMemHolder> Mkl_mem_;
-#endif
   /*! \brief internal data of NDArray */
   std::shared_ptr<Chunk> ptr_{nullptr};
   /*! \brief shape of current NDArray */
@@ -906,6 +905,8 @@ class NDArray {
   size_t byte_offset_ = 0;
   /*! \brief type of data */
   int dtype_ = -1;
+  /*! \brief whether the NDArray uses memory of another NDArray. */
+  bool reuse_ = false;
   /*! \brief storage type of data */
   NDArrayStorageType storage_type_ = kUndefinedStorage;
   /*! \brief node entry for autograd */
diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h
index b65cd2b434e4..168ddcca24b7 100755
--- a/include/mxnet/tensor_blob.h
+++ b/include/mxnet/tensor_blob.h
@@ -36,9 +36,6 @@
 #include <utility>
 #include <algorithm>
 #include "./base.h"
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#endif
 namespace mxnet {
 
 /* Forward declaration for friend declaration in TBlob */
@@ -66,17 +63,10 @@ class TBlob {
   /*! \brief type flag of the tensor blob */
   int type_flag_;
 
-  /*! \brief storing mkl chunk buffer blob, use for experimental only */
-#if MKL_EXPERIMENTAL == 1
-  std::shared_ptr<MKLMemHolder> Mkl_mem_;
-#endif
   /*! \brief default constructor, default copy assign will work */
   TBlob(void)
       : dptr_(NULL),
         type_flag_(mshadow::DataType<real_t>::kFlag) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = NULL;
-#endif
     SetDLTensor(cpu::kDevMask, 0);
   }
   /*!
@@ -90,9 +80,6 @@ class TBlob {
   TBlob(DType *dptr, const TShape &shape, int dev_mask, int dev_id = -1)
       : dptr_(dptr), shape_(shape),
         type_flag_(mshadow::DataType<DType>::kFlag) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = NULL;
-#endif
     SetDLTensor(dev_mask, dev_id);
   }
   /*!
@@ -105,9 +92,6 @@ class TBlob {
    */
   TBlob(void *dptr, const TShape &shape, int dev_mask, int type_flag, int dev_id = -1)
       : dptr_(dptr), shape_(shape), type_flag_(type_flag) {
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = NULL;
-#endif
     SetDLTensor(dev_mask, dev_id);
   }
   /*!
@@ -135,9 +119,6 @@ class TBlob {
     shape_ = src.shape_;
     type_flag_ = mshadow::DataType<DType>::kFlag;
     SetDLTensor(Device::kDevMask, -1);
-#if MKL_EXPERIMENTAL == 1
-    Mkl_mem_ = NULL;
-#endif
     return *this;
   }
   /*!
@@ -172,11 +153,6 @@ class TBlob {
     CHECK(mshadow::DataType<DType>::kFlag == type_flag_)
       << "TBlob.get_with_shape: data type do not match specified type."
       << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType<DType>::kFlag;
-#if MKL_EXPERIMENTAL == 1
-    if (Mkl_mem_ != nullptr) {
-      Mkl_mem_->check_and_prv_to_cpu(dptr_);
-    }
-#endif
     return mshadow::Tensor<Device, 2, DType>(static_cast<DType*>(dptr_),
                                              shape_.FlatTo2D(),
                                              shape_[shape_.ndim() - 1],
@@ -217,11 +193,6 @@ class TBlob {
     CHECK(mshadow::DataType<DType>::kFlag == type_flag_)
       << "TBlob.get_with_shape: data type do not match specified type."
       << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType<DType>::kFlag;
-#if MKL_EXPERIMENTAL == 1
-    if (Mkl_mem_ != nullptr) {
-      Mkl_mem_->check_and_prv_to_cpu(dptr_);
-    }
-#endif
     return static_cast<DType*>(dptr_);
   }
   /*! \brief device mask of the corresponding device */
diff --git a/prepare_mkldnn.sh b/prepare_mkldnn.sh
new file mode 100755
index 000000000000..525ee14775cf
--- /dev/null
+++ b/prepare_mkldnn.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# set -ex
+#
+# All modification made by Intel Corporation: © 2016 Intel Corporation
+#
+# All contributions by the University of California:
+# Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+# All rights reserved.
+#
+# All other contributions:
+# Copyright (c) 2014, 2015, the respective contributors
+# All rights reserved.
+# For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
+#
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     * Redistributions of source code must retain the above copyright notice,
+#       this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of Intel Corporation nor the names of its contributors
+#       may be used to endorse or promote products derived from this software
+#       without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+MXNET_ROOTDIR="$(pwd)"
+MKLDNN_ROOTDIR="$MXNET_ROOTDIR/3rdparty/mkldnn/"
+MKLDNN_SRCDIR="$MKLDNN_ROOTDIR/src"
+MKLDNN_BUILDDIR="$MKLDNN_ROOTDIR/build"
+MKLDNN_INSTALLDIR="$MKLDNN_ROOTDIR/install"
+MKLDNN_LIBDIR="$MXNET_ROOTDIR/lib"
+
+# MKLDNN install destination
+HOME_MKLDNN=$1
+if [ ! -z "$HOME_MKLDNN" ]; then
+  mkdir -p $HOME_MKLDNN
+  if [ ! -w $HOME_MKLDNN ]; then
+    echo "MKLDNN install to $HOME_MKLDNN failed, please try with sudo" >&2
+    exit 1
+  fi
+fi
+
+if [ -z $MKLDNNROOT ]; then
+if [ ! -f "$MKLDNN_INSTALLDIR/lib/libmkldnn.so" ]; then
+    mkdir -p $MKLDNN_INSTALLDIR
+	cd $MKLDNN_ROOTDIR
+    if [ -z $MKLROOT ] && [ ! -f $MKLDNN_INSTALLDIR/include/mkl_cblas.h ]; then
+        rm -rf external && cd scripts && ./prepare_mkl.sh && cd ..
+        cp -a external/*/* $MKLDNN_INSTALLDIR/.
+    fi 
+    echo "Building MKLDNN ..." >&2
+    cd $MXNET_ROOTDIR
+	g++ --version >&2
+    cmake $MKLDNN_ROOTDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR
+    make -C $MKLDNN_BUILDDIR -j$(cat /proc/cpuinfo | grep processor | wc -l) VERBOSE=1 >&2
+    make -C $MKLDNN_BUILDDIR install
+    rm -rf $MKLDNN_BUILDDIR
+    mkdir -p $MKLDNN_LIBDIR
+    cp $MKLDNN_INSTALLDIR/lib/* $MKLDNN_LIBDIR
+fi
+MKLDNNROOT=$MKLDNN_INSTALLDIR
+fi
+
+if [ -z $MKLROOT ] && [ -f $MKLDNNROOT/include/mkl_cblas.h ]; then 
+  MKLROOT=$MKLDNNROOT;
+fi
+
+# user specified MKLDNN install folder
+if [ -d "$HOME_MKLDNN" ]; then
+  # skip if user specificed MKLDNNROOT
+  [ "$MKLDNNROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLDNNROOT/include $MKLDNNROOT/lib $HOME_MKLDNN/.
+  [ "$MKLROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLROOT/include $MKLROOT/lib $HOME_MKLDNN/.
+  # update ldconfig if possible
+  if [ -w /etc/ld.so.conf.d ]; then
+    echo "$HOME_MKLDNN/lib" > /etc/ld.so.conf.d/mxnmkldnn.conf && ldconfig
+  fi
+# return value to calling script (Makefile,cmake)
+  echo $HOME_MKLDNN $HOME_MKLDNN
+else
+  echo $MKLDNNROOT $MKLROOT
+fi
+
diff --git a/python/mxnet/ndarray/mkldnn.py b/python/mxnet/ndarray/mkldnn.py
new file mode 100644
index 000000000000..85544266015f
--- /dev/null
+++ b/python/mxnet/ndarray/mkldnn.py
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=wildcard-import, unused-wildcard-import, too-many-lines
+
+"""MKLDNN NDArray API of MXNet."""
+
+from __future__ import absolute_import
+from __future__ import division
+
+import warnings
+
+__all__ = ["MKLNDArray"]
+
+from ..context import Context
+from . import _internal
+from . import op
+from .ndarray import NDArray
+from .sparse import _ndarray_cls
+
+class MKLNDArray(NDArray):
+    """The base class of an NDArray stored in a MKLDNN storage format.
+    """
+
+    def _at(self, idx):
+        raise NotSupportedForMKLNDArray(self._at, '[idx]', idx)
+
+    def _slice(self, start, stop):
+        return op.slice(self, begin=start, end=stop)
+
+    def astype(self, dtype):
+        """Returns a copy of the array after casting to a specified type.
+        Parameters
+        ----------
+        dtype : numpy.dtype or str
+            The type of the returned array.
+        Examples
+        --------
+        >>> x = mx.nd.sparse.zeros('row_sparse', (2,3), dtype='float32')
+        >>> y = x.astype('int32')
+        >>> y.dtype
+        <type 'numpy.int32'>
+        """
+        res = zeros(shape=self.shape, ctx=self.context,
+                    dtype=dtype, stype=self.stype)
+        self.copyto(res)
+        return res
+
+    def copyto(self, other):
+        """Copies the value of this array to another array.
+
+        Parameters
+        ----------
+        other : NDArray or CSRNDArray or RowSparseNDArray or Context
+            The destination array or context.
+
+        Returns
+        -------
+        NDArray or CSRNDArray or RowSparseNDArray
+            The copied array.
+        """
+        if isinstance(other, NDArray):
+            if other.handle is self.handle:
+                warnings.warn('You are attempting to copy an array to itself', RuntimeWarning)
+                return
+            return _internal._copyto(self, out=other)
+        elif isinstance(other, Context):
+            hret = _ndarray_cls(_new_alloc_handle(self.stype, self.shape, other,
+                                                  True, self.dtype, self._aux_types))
+            return _internal._copyto(self, out=hret)
+        else:
+            raise TypeError('copyto does not support type ' + str(type(other)))
diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py
index a45a6a82471e..885048e3ae91 100644
--- a/python/mxnet/ndarray/ndarray.py
+++ b/python/mxnet/ndarray/ndarray.py
@@ -52,6 +52,7 @@
 _STORAGE_TYPE_DEFAULT = 0
 _STORAGE_TYPE_ROW_SPARSE = 1
 _STORAGE_TYPE_CSR = 2
+_STORAGE_TYPE_MKLDNN = 3
 
 # pylint: disable= no-member
 _DTYPE_NP_TO_MX = {
diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py
index 700dee0b07fa..e29d3da9b476 100644
--- a/python/mxnet/ndarray/sparse.py
+++ b/python/mxnet/ndarray/sparse.py
@@ -50,6 +50,7 @@
 from ._internal import _set_ndarray_class
 from .ndarray import NDArray, _storage_type, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP
 from .ndarray import _STORAGE_TYPE_STR_TO_ID, _STORAGE_TYPE_ROW_SPARSE, _STORAGE_TYPE_CSR
+from .ndarray import _STORAGE_TYPE_MKLDNN
 from .ndarray import _STORAGE_TYPE_UNDEFINED, _STORAGE_TYPE_DEFAULT
 from .ndarray import zeros as _zeros_ndarray
 from .ndarray import array as _array
@@ -1138,6 +1139,8 @@ def _ndarray_cls(handle, writable=True, stype=_STORAGE_TYPE_UNDEFINED):
         stype = _storage_type(handle)
     if stype == _STORAGE_TYPE_DEFAULT:
         return NDArray(handle, writable=writable)
+    elif stype == _STORAGE_TYPE_MKLDNN:
+        return NDArray(handle, writable=writable)
     elif stype == _STORAGE_TYPE_CSR:
         return CSRNDArray(handle, writable=writable)
     elif stype == _STORAGE_TYPE_ROW_SPARSE:
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index 58bc8d38f685..e48c61e8de10 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -1287,6 +1287,10 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
             arr[:] = arg_params[name]
         for name, arr in exe.aux_dict.items():
             arr[:] = aux_params[name]
+        # We need to initialize the gradient arrays if it's add.
+        if (grad_req == "add"):
+            for arr in exe.grad_arrays:
+                arr[:] = np.zeros(arr.shape, dtype=arr.dtype)
 
     dtypes = [np.dtype(exe.outputs[0].dtype) for exe in exe_list]
     max_idx = np.argmax(dtypes)
diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h
index dcd1504fb88e..2867a7ab47d8 100644
--- a/src/common/exec_utils.h
+++ b/src/common/exec_utils.h
@@ -43,19 +43,59 @@ namespace common {
           indices are not recorded
  * \return true if any source NDArray need to cast storage
  */
-inline bool SetupDefaultBlobs(const std::vector<NDArray>& src,
-                              std::vector<TBlob> *blobs,
-                              std::vector<NDArray> *temp_src,
-                              std::vector<NDArray> *temp_dst,
-                              std::unordered_map<uint32_t, uint32_t> *idx_map = nullptr) {
+inline bool SetupDefaultBlobsIn(const std::vector<NDArray>& src,
+                                const std::vector<NDArray> *bufs,
+                                std::vector<TBlob> *blobs,
+                                std::vector<NDArray> *temp_src,
+                                std::vector<NDArray> *temp_dst,
+                                std::unordered_map<uint32_t, uint32_t> *idx_map) {
   bool require_cast = false;
   for (size_t i = 0; i < src.size(); i++) {
     auto& nd = src[i];
-    if (nd.storage_type() != kDefaultStorage) {
-      if (idx_map != nullptr) {
-        (*idx_map)[i] = temp_dst->size();
-      }
-      NDArray temp(nd.shape(), nd.ctx(), false, nd.dtype());
+    bool is_default = nd.storage_type() == kDefaultStorage;
+#if MXNET_USE_MKLDNN == 1
+    // We have to make sure it's default storage and default layout.
+    is_default = nd.IsDefault();
+#endif
+    if (!is_default) {
+      (*idx_map)[i] = temp_dst->size();
+      NDArray temp = bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(), true, nd.dtype());
+#if MXNET_USE_MKLDNN == 1
+      CHECK(temp.IsDefault());
+#endif
+      temp_src->emplace_back(nd);
+      temp_dst->emplace_back(temp);
+      blobs->emplace_back(temp.data());
+      require_cast = true;
+    } else {
+      blobs->push_back(nd.data());
+    }
+  }
+  return require_cast;
+}
+
+inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
+                                 const std::vector<OpReqType> &req,
+                                 const std::vector<NDArray> *bufs,
+                                 std::vector<TBlob> *blobs,
+                                 std::vector<NDArray> *temp_src,
+                                 std::vector<NDArray> *temp_dst) {
+  bool require_cast = false;
+  for (size_t i = 0; i < src.size(); i++) {
+    auto& nd = src[i];
+    bool is_default = nd.storage_type() == kDefaultStorage;
+#if MXNET_USE_MKLDNN == 1
+    // If it's writeTo, we don't need to worry whether it contains valid data.
+    if (req[i] == kWriteTo)
+      const_cast<NDArray &>(nd).InvalidateData();
+    // We have to make sure it's default storage and default layout.
+    is_default = nd.IsDefault();
+#endif
+    if (!is_default) {
+      NDArray temp = bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(), true, nd.dtype());
+#if MXNET_USE_MKLDNN == 1
+      CHECK(temp.IsDefault());
+#endif
       temp_src->emplace_back(nd);
       temp_dst->emplace_back(temp);
       blobs->emplace_back(temp.data());
@@ -76,6 +116,9 @@ inline bool SetupDefaultBlobs(const std::vector<NDArray>& src,
  */
 inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
                                    const std::vector<NDArray> &ndoutputs,
+                                   const std::vector<OpReqType> &req,
+                                   const std::vector<NDArray> *in_bufs,
+                                   const std::vector<NDArray> *out_bufs,
                                    std::vector<TBlob> *input_blobs,
                                    std::vector<TBlob> *output_blobs,
                                    std::vector<NDArray> *pre_temp_src,
@@ -85,9 +128,11 @@ inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
                                    std::unordered_map<uint32_t, uint32_t> *in_temp_idx_map,
                                    const std::vector<uint32_t> &mutate_idx) {
   // populate input blobs
-  SetupDefaultBlobs(ndinputs, input_blobs, pre_temp_src, pre_temp_dst, in_temp_idx_map);
+  SetupDefaultBlobsIn(ndinputs, in_bufs, input_blobs, pre_temp_src, pre_temp_dst,
+                      in_temp_idx_map);
   // populate output blobs
-  SetupDefaultBlobs(ndoutputs, output_blobs, post_temp_dst, post_temp_src);
+  SetupDefaultBlobsOut(ndoutputs, req, out_bufs, output_blobs, post_temp_dst,
+                       post_temp_src);
   // add mutable inputs to post temp list
   for (const auto idx : mutate_idx) {
     auto map_iter = in_temp_idx_map->find(idx);
diff --git a/src/common/utils.cc b/src/common/utils.cc
index 784fcf8651ae..939b3e8d0a1b 100644
--- a/src/common/utils.cc
+++ b/src/common/utils.cc
@@ -41,5 +41,21 @@ void CastStorageDispatch<cpu>(const OpContext& ctx,
   mxnet::op::CastStorageComputeImpl<cpu>(ctx, input, output);
 }
 
+std::string stype_string(const int x) {
+  switch (x) {
+    case kDefaultStorage:
+      return "default";
+    case kCSRStorage:
+      return "csr";
+    case kRowSparseStorage:
+      return "row_sparse";
+#if MXNET_USE_MKLDNN == 1
+    case kMKLDNNStorage:
+      return "mkldnn";
+#endif
+  }
+  return "unknown";
+}
+
 }  // namespace common
 }  // namespace mxnet
diff --git a/src/common/utils.h b/src/common/utils.h
index ede218bbdc01..27c2e1b30101 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -217,6 +217,28 @@ void CheckFormatImpl(const RunContext &rctx, const NDArray &input,
 template<typename xpu>
 void CastStorageDispatch(const OpContext& ctx, const NDArray& input, const NDArray& output);
 
+/*! \brief returns true if one of storage types in `inputs` is the same as target `stype`.
+ */
+inline bool ContainsStorage(const std::vector<NDArray>& inputs,
+                            NDArrayStorageType type) {
+  for (const auto &i : inputs) {
+    if (i.storage_type() == type)
+      return true;
+  }
+  return false;
+}
+
+/*! \brief returns true if one of storage types in `vstorage` is the same as target `stype`.
+ */
+inline bool ContainsStorage(const std::vector<int> &vstorages,
+                            NDArrayStorageType type) {
+  for (const auto& i : vstorages) {
+    if (i == type)
+      return true;
+  }
+  return false;
+}
+
 /*! \brief returns true if all storage types in `vstorage` are the same as target `stype`.
  *         false is returned for empty inputs.
  */
@@ -326,17 +348,7 @@ inline std::string dispatch_mode_string(const DispatchMode x) {
 
 
 /*! \brief get string representation of storage_type */
-inline std::string stype_string(const int x) {
-  switch (x) {
-    case kDefaultStorage:
-      return "default";
-    case kCSRStorage:
-      return "csr";
-    case kRowSparseStorage:
-      return "row_sparse";
-  }
-  return "unknown";
-}
+std::string stype_string(const int x);
 
 // heuristic to dermine number of threads per GPU
 inline int GetNumThreadPerGPU() {
diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc
index 1bcc40a894dd..f6534fc82398 100644
--- a/src/executor/attach_op_execs_pass.cc
+++ b/src/executor/attach_op_execs_pass.cc
@@ -30,11 +30,7 @@
 #include "../common/utils.h"
 #include "../common/exec_utils.h"
 #include "./exec_pass.h"
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "../operator/mkl/mkl_memory-inl.h"
-#include "../operator/mkl/mkl_util-inl.h"
-#endif
+
 namespace mxnet {
 
 namespace op {
@@ -58,23 +54,34 @@ class StorageFallbackOpExecutor : public OpExecutor {
  protected:
   // initialize the data blobs
   void InitBlobs() {
-    using namespace common;
     if (!init_) {
-      in_data_.clear(); out_data_.clear();
-      pre_temp_src_.clear(); pre_temp_dst_.clear();
-      post_temp_src_.clear(); post_temp_dst_.clear();
-      in_temp_idx_map_.clear();
-      SetupDefaultBlobsInOut(in_array, out_array, &in_data_, &out_data_,
-                             &pre_temp_src_, &pre_temp_dst_,
-                             &post_temp_src_, &post_temp_dst_,
-                             &in_temp_idx_map_, mutate_idx_);
+      pre_temp_buf_.clear();
+      post_temp_buf_.clear();
+      for (size_t i = 0; i < in_array.size(); i++) {
+        auto &nd = in_array[i];
+        pre_temp_buf_.emplace_back(nd.shape(), nd.ctx(), true, nd.dtype());
+      }
+      for (size_t i = 0; i < out_array.size(); i++) {
+        auto &nd = out_array[i];
+        post_temp_buf_.emplace_back(nd.shape(), nd.ctx(), true, nd.dtype());
+      }
       init_ = true;
     }
   }
 
   // storage fallback before fcompute is launched
   void PreFCompute(bool is_gpu) {
+    using namespace common;
     InitBlobs();
+    in_data_.clear(); out_data_.clear();
+    pre_temp_src_.clear(); pre_temp_dst_.clear();
+    post_temp_src_.clear(); post_temp_dst_.clear();
+    in_temp_idx_map_.clear();
+    SetupDefaultBlobsInOut(in_array, out_array, req, &pre_temp_buf_, &post_temp_buf_,
+                           &in_data_, &out_data_,
+                           &pre_temp_src_, &pre_temp_dst_,
+                           &post_temp_src_, &post_temp_dst_,
+                           &in_temp_idx_map_, mutate_idx_);
     common::CastNonDefaultStorage(pre_temp_src_, pre_temp_dst_, op_ctx, is_gpu);
   }
 
@@ -85,6 +92,8 @@ class StorageFallbackOpExecutor : public OpExecutor {
 
   // default storage tensor blobs for fcompute
   std::vector<TBlob> in_data_, out_data_;
+  // These are NDArray buffers for cast storage.
+  std::vector<NDArray> pre_temp_buf_, post_temp_buf_;
   // source NDArray for cast storage
   std::vector<NDArray> pre_temp_src_, post_temp_src_;
   // destination NDArray for cast storage
@@ -106,10 +115,6 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor {
     PreFCompute(is_gpu);
     fcompute_(state_, op_ctx, in_data_, req, out_data_);
     PostFCompute(is_gpu);
-#if MKL_EXPERIMENTAL == 1
-    mkl_tblobs_prv_to_cpu(in_data_);
-    mkl_tblobs_prv_to_cpu(out_data_);
-#endif
   }
 
   ExecType exec_type() const override {
@@ -175,10 +180,6 @@ class FComputeExecutor : public StorageFallbackOpExecutor {
     PreFCompute(is_gpu);
     fcompute_(attrs_, op_ctx, in_data_, req, out_data_);
     PostFCompute(is_gpu);
-#if MKL_EXPERIMENTAL == 1
-    mkl_tblobs_prv_to_cpu(in_data_);
-    mkl_tblobs_prv_to_cpu(out_data_);
-#endif
   }
 
   ExecType exec_type() const override {
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index 77853a61ca04..896ea77cd6f0 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -54,6 +54,14 @@ GraphExecutor::~GraphExecutor() {
   }
 }
 
+inline bool SharableStorage(NDArrayStorageType stype) {
+  bool ret = stype == kDefaultStorage;
+#if MXNET_USE_MKLDNN == 1
+  ret = ret || stype == kMKLDNNStorage;
+#endif
+  return ret;
+}
+
 inline NDArray InitZeros(const NDArrayStorageType stype, const TShape &shape,
                                 const Context &ctx, const int dtype) {
   // NDArray with default storage
@@ -693,7 +701,7 @@ NDArray ReshapeOrCreate(const std::string& name,
                         const Context& ctx,
                         std::unordered_map<std::string, NDArray>* shared_buffer,
                         bool enable_row_sparse_sharing) {
-  bool stype_shareable = dest_arg_stype == kDefaultStorage;
+  bool stype_shareable = SharableStorage(dest_arg_stype);
   if (enable_row_sparse_sharing) {
     stype_shareable = stype_shareable || dest_arg_stype == kRowSparseStorage;
   }
@@ -793,7 +801,7 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx,
           const NDArray& in_arg_nd = shared_exec->in_arg_map().at(arg_name);
           auto arg_nd_stype = in_arg_nd.storage_type();
           // for model parameter, both default storage and row_sparse storage can be shared
-          bool shareable_arg_stype = inferred_stype == kDefaultStorage ||
+          bool shareable_arg_stype = SharableStorage(inferred_stype) ||
                                      inferred_stype == kRowSparseStorage;
           // try to reuse memory from shared_exec
           CHECK(shareable_arg_stype) << "Inferred storage type "
@@ -827,8 +835,8 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx,
           auto grad_oid = grad_store_.size() + num_forward_outputs_;
           auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]);
           auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid];
-          if (nullptr != shared_exec && grad_stype == kDefaultStorage &&
-              shared_exec->arg_grad_map().at(arg_name).storage_type() == kDefaultStorage) {
+          if (nullptr != shared_exec && SharableStorage(grad_stype) &&
+              shared_exec->arg_grad_map().at(arg_name).storage_type() == grad_stype) {
             // try to reuse memory from shared_exec
             arg_grad_vec->emplace_back(shared_exec->arg_grad_map().at(arg_name));
           } else {
@@ -1209,7 +1217,8 @@ void GraphExecutor::InitDataEntryMemory(std::vector<NDArray>* shared_pool) {
       const NDArray& src = data_pool_.at(storage_id);
       data_entry_[i] = src.AsArray(vshape[i], vdtype[i]);
     } else {
-      data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i]);
+      data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i],
+                               true, vdtype[i]);
     }
     if (log_verbose_) {
       LOG(INFO) << "\tinit data entry\t" << i << "\tas " << common::stype_string(storage_type);
diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc
index 67e61aa357c2..3bf3c9b8545a 100644
--- a/src/executor/infer_graph_attr_pass.cc
+++ b/src/executor/infer_graph_attr_pass.cc
@@ -416,11 +416,6 @@ nnvm::Graph InferStorageType(nnvm::Graph&& graph,
     DispatchModeVector dispatch_modes(graph.indexed_graph().num_nodes(), DispatchMode::kUndefined);
     graph.attrs["dispatch_mode"] = std::make_shared<any>(std::move(dispatch_modes));
   }
-  // initialize unknown values for dispatch modes
-  if (graph.attrs.count("dispatch_mode") == 0) {
-    DispatchModeVector dispatch_modes(graph.indexed_graph().num_nodes(), DispatchMode::kUndefined);
-    graph.attrs["dispatch_mode"] = std::make_shared<any>(std::move(dispatch_modes));
-  }
   // initialize the dev_mask vector from the context vector
   if (graph.attrs.count("dev_mask") == 0) {
     CHECK_GT(graph.attrs.count("context"), 0);
diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc
index eaa95a5f2418..93a8bc6c54b2 100644
--- a/src/imperative/cached_op.cc
+++ b/src/imperative/cached_op.cc
@@ -214,6 +214,12 @@ nnvm::Graph Imperative::CachedOp::GetForwardGraph(
 
   StorageVector storage(idx.num_node_entries(), exec::kBadStorageID);
   for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID;
+  const auto& stypes = g.GetAttr<StorageTypeVector>("storage_type");
+  CHECK_EQ(stypes.size(), storage.size());
+  for (size_t i = 0; i < stypes.size(); i++) {
+    if (stypes[i] != kDefaultStorage)
+      storage[i] = exec::kDynamicStorageID;
+  }
 
   auto mem_plan = PlanMemory(
       &g, std::move(storage), g.GetAttr<std::vector<uint32_t> >(
@@ -320,6 +326,10 @@ nnvm::Graph Imperative::CachedOp::GetBackwardGraph(
   for (size_t i = 0; i < num_forward_entries; ++i) storage[i] = exec::kExternalStorageID;
   for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID;
   for (const auto i : idx.outputs()) storage[idx.entry_id(i)] = exec::kExternalStorageID;
+  for (size_t i = 0; i < stypes.size(); i++) {
+    if (stypes[i] != kDefaultStorage)
+      storage[i] = exec::kDynamicStorageID;
+  }
 
   auto mem_plan = PlanMemory(
       &g, std::move(storage), g.GetAttr<std::vector<uint32_t> >("backward_ref_count"),
diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
index 8be1eb4e3d75..4e3624b81010 100644
--- a/src/imperative/imperative_utils.h
+++ b/src/imperative/imperative_utils.h
@@ -353,9 +353,9 @@ inline void PushFCompute(const FCompute& fn,
       // mapping from index in input_blobs to index in pre_temp_dst
       std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
       // setup blobs
-      SetupDefaultBlobsInOut(inputs, outputs, &input_blobs, &output_blobs,
-                             &pre_temp_src, &pre_temp_dst, &post_temp_src, &post_temp_dst,
-                             &in_temp_idx_map, mutate_idx);
+      SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
+                             &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
+                             &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
       // setup context
       OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested};
       bool is_gpu = ctx.dev_mask() == gpu::kDevMask;
@@ -467,9 +467,9 @@ inline void PushOperator(const OpStatePtr& state,
         // mapping from index in input_blobs to index in pre_temp_dst
         std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
         // populate input blobs and output blobs
-        SetupDefaultBlobsInOut(inputs, outputs, &input_blobs, &output_blobs,
-                               &pre_temp_src, &pre_temp_dst, &post_temp_src, &post_temp_dst,
-                               &in_temp_idx_map, mutate_idx);
+        SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
+                               &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
+                               &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
         // setup contexts
         bool is_gpu = rctx.get_ctx().dev_mask() == gpu::kDevMask;
         // pre-fcompute fallback
@@ -604,6 +604,7 @@ inline bool CheckAndInferStorageType(nnvm::Graph* p_g, exec::DevMaskVector&& dev
     }
     if (match) return true;
   }
+  g.attrs.erase("dispatch_mode");
   g.attrs.erase("storage_type");
   g.attrs.erase("storage_type_inputs");
   if (node_range.second > node_range.first) {
diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h
index b00d0de935f7..d0a968154afb 100644
--- a/src/kvstore/kvstore_dist.h
+++ b/src/kvstore/kvstore_dist.h
@@ -32,11 +32,6 @@
 #include "mxnet/engine.h"
 #include "ps/ps.h"
 #include "./kvstore_dist_server.h"
-#if MKL_EXPERIMENTAL == 1
-#include <mkl_memory.h>
-#include "../operator/mkl/mkl_memory-inl.h"
-#include "../operator/mkl/mkl_util-inl.h"
-#endif
 namespace mxnet {
 namespace kvstore {
 
@@ -228,9 +223,6 @@ class KVStoreDist : public KVStoreLocal {
         PSKV& pskv = (gradient_compression_->get_type() == CompressionType::kNone) ?
                       EncodeDefaultKey(key, size, false) :
                       EncodeCompressedKey(key, size, false);
-#if MKL_EXPERIMENTAL == 1
-        mkl_set_tblob_eager_mode(recv_buf.data());
-#endif
         real_t* data = recv_buf.data().dptr<real_t>();
         // false means not to delete data when SArray is deleted
         auto vals = new ps::SArray<real_t>(data, size, false);
@@ -380,9 +372,6 @@ class KVStoreDist : public KVStoreLocal {
       [this, key, pskv, small_buf](RunContext rctx, Engine::CallbackOnComplete cb) {
         size_t size = small_buf.shape().Size();
         real_t* data = small_buf.data().dptr<real_t>();
-#if MKL_EXPERIMENTAL == 1
-        mkl_set_tblob_eager_mode(small_buf.data());
-#endif
         // do push. false means no delete
         ps::SArray<real_t> vals(data, size, false);
         CHECK_NOTNULL(ps_worker_)->ZPush(
@@ -407,9 +396,6 @@ class KVStoreDist : public KVStoreLocal {
           // convert to ps keys
           size_t size = send_buf.shape().Size();
           real_t* data = send_buf.data().dptr<real_t>();
-#if MKL_EXPERIMENTAL == 1
-          mkl_set_tblob_eager_mode(send_buf.data());
-#endif
           // do push. false means no delete
           ps::SArray<real_t> vals(data, size, false);
           CHECK_NOTNULL(ps_worker_)->ZPush(
@@ -431,9 +417,6 @@ class KVStoreDist : public KVStoreLocal {
     using namespace rowsparse;
     auto push_to_servers = [this, key, send_buf]
                            (RunContext rctx, Engine::CallbackOnComplete cb) {
-#if MKL_EXPERIMENTAL == 1
-      mkl_set_tblob_eager_mode(send_buf.data());
-#endif
       real_t* data = send_buf.data().dptr<real_t>();
       const int64_t num_rows = send_buf.aux_shape(kIdx)[0];
       const auto offsets = send_buf.aux_data(kIdx).dptr<int64_t>();
@@ -472,9 +455,6 @@ class KVStoreDist : public KVStoreLocal {
       // allocate memory for the buffer
       size_t num_rows = indices.shape().Size();
       recv_buf.CheckAndAlloc({mshadow::Shape1(num_rows)});
-#if MKL_EXPERIMENTAL == 1
-      mkl_set_tblob_eager_mode(recv_buf.data());
-#endif
       real_t* data = recv_buf.data().dptr<real_t>();
       const auto offsets = indices.data().dptr<int64_t>();
       const auto unit_len = recv_buf.shape().ProdShape(1, recv_buf.shape().ndim());
diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h
index 1bb84fdc1114..5646d9eef866 100644
--- a/src/kvstore/kvstore_local.h
+++ b/src/kvstore/kvstore_local.h
@@ -256,7 +256,13 @@ class KVStoreLocal : public KVStore {
     auto validator = [this](const int key, const NDArray& nd) -> bool {
       auto stype = nd.storage_type();
       // valid NDArray
-      if (stype == kDefaultStorage || stype == kRowSparseStorage) return true;
+      auto valid_stype = stype == kDefaultStorage || stype == kRowSparseStorage;
+#if MXNET_USE_MKLDNN == 1
+      // When it's kMKLDNNStorage, it'll be converted to a data layout
+      // compatible to the default storage.
+      valid_stype = valid_stype || stype == kMKLDNNStorage;
+#endif
+      if (valid_stype) return true;
       // invalid NDArray, abort
       LOG(FATAL) << "Unexpected storage type detected during kvstore push: " << stype;
       return false;
@@ -272,8 +278,15 @@ class KVStoreLocal : public KVStore {
                                 std::vector<std::vector<NDArray*>> *grouped_vals) {
     // check if the storage type of a value is valid
     auto validator = [this](const int key, const NDArray* nd) -> bool {
+      auto stype = nd->storage_type();
       // valid
-      if (nd->storage_type() == kDefaultStorage) return true;
+      auto valid_stype = stype == kDefaultStorage;
+#if MXNET_USE_MKLDNN == 1
+      // When it's kMKLDNNStorage, it'll be converted to a data layout
+      // compatible to the default storage.
+      valid_stype = valid_stype || stype == kMKLDNNStorage;
+#endif
+      if (valid_stype) return true;
       // invalid, print warning messages once
       if (this->warnings_printed_.find(key) == this->warnings_printed_.end()) {
         LOG(INFO) << "Warning: non-default weights detected during kvstore pull. "
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index f09f168977ab..b28998fb9662 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -31,10 +31,14 @@
 #include <mxnet/resource.h>
 #include <mxnet/imperative.h>
 #include <mshadow/tensor.h>
+#if MXNET_USE_MKLDNN == 1
+#include <mkldnn.hpp>
+#endif
 #include "./ndarray_function.h"
 #include "../common/utils.h"
 #include "../operator/tensor/matrix_op-inl.h"
 #include "../operator/tensor/init_op.h"
+#include "../operator/nn/mkldnn/mkldnn_base-inl.h"
 
 #if MXNET_USE_OPENCV
 #include <opencv2/opencv.hpp>
@@ -46,6 +50,104 @@ DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg);
 
 namespace mxnet {
 
+NDArray::NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx,
+    bool delay_alloc, int dtype, std::vector<int> aux_types,
+    std::vector<TShape> aux_shapes, TShape storage_shape) : shape_(shape),
+  dtype_(dtype), storage_type_(stype), entry_({nullptr, 0, 0}) {
+  // Assign default aux types if not given
+  if (aux_types.size() == 0
+      && stype != kDefaultStorage) {
+    if (stype == kRowSparseStorage) {
+      aux_types = {mshadow::kInt64};
+    } else if (stype == kCSRStorage) {
+      aux_types = {mshadow::kInt64, mshadow::kInt64};
+    } else {
+      LOG(FATAL) << "Unknown storage type " << stype;
+    }
+  }
+  // Assign default shapes if not given
+  // unknown shapes are intialized as {0} such that Size() would return 0
+  if (aux_shapes.size() == 0
+      && stype != kDefaultStorage) {
+    if (stype == kRowSparseStorage) {
+      aux_shapes = {TShape(mshadow::Shape1(0))};
+    } else if (stype == kCSRStorage) {
+      // aux shapes for indptr and indices
+      aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))};
+    } else {
+      LOG(FATAL) << "Unknown storage type " << stype;
+    }
+  }
+  if (storage_shape.Size() == 0
+      && stype != kDefaultStorage) {
+    if (stype == kRowSparseStorage) {
+      storage_shape = shape;
+      storage_shape[0] = aux_shapes[rowsparse::kIdx][0];
+    } else if (stype == kCSRStorage) {
+      storage_shape = aux_shapes[csr::kIdx];
+    } else {
+      LOG(FATAL) << "Unknown storage type " << stype;
+    }
+  }
+  if (stype == kDefaultStorage)
+    ptr_ = std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype);
+  else
+    ptr_ = std::make_shared<Chunk>(stype, storage_shape, ctx, delay_alloc,
+        dtype, aux_types, aux_shapes);
+}
+
+struct ChunkMem {
+  Storage::Handle h;
+  std::vector<Storage::Handle> aux_h;
+#if MXNET_USE_MKLDNN == 1
+  std::shared_ptr<mkldnn::memory> mem;
+#endif
+};
+
+NDArray::Chunk::~Chunk() {
+  bool skip_free = static_data || delay_alloc;
+  ChunkMem mem;
+  mem.h = this->shandle;
+  mem.aux_h = this->aux_handles;
+#if MXNET_USE_MKLDNN == 1
+  // We want to delete mkldnn memory after deleting the variable.
+  mem.mem = this->Mkl_mem_;
+#endif
+  Engine::Get()->DeleteVariable([mem, skip_free](RunContext s) {
+    if (skip_free == false) {
+#if MXNET_USE_MKLDNN == 1
+      if (mem.mem) {
+        CHECK_LE(mem.mem->get_primitive_desc().get_size(), mem.h.size);
+        CHECK_EQ(mem.mem->get_data_handle(), mem.h.dptr);
+      }
+#endif
+      if (mem.h.size > 0) Storage::Get()->Free(mem.h);
+      for (size_t i = 0; i < mem.aux_h.size(); i++) {
+        if (mem.aux_h[i].size > 0) Storage::Get()->Free(mem.aux_h[i]);
+      }
+    }
+  }, shandle.ctx, var);
+}
+
+void NDArray::Chunk::CheckAndAllocData(const TShape &shape, int dtype) {
+  CHECK_NE(aux_shapes.size(), 0)
+      << "data is expected to be allocated after aux_data";
+  auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype);
+  if (shandle.size < dbytes) {
+    // free storage if necessary and alloc again
+    if (shandle.size > 0) Storage::Get()->Free(shandle);
+    // init storage
+    shandle = Storage::Get()->Alloc(dbytes, ctx);
+#if MXNET_USE_MKLDNN == 1
+    Mkl_mem_ = nullptr;
+#endif
+  }
+  // init shape
+  storage_shape = shape;
+  // delay_alloc is only set when data storage handle is present
+  delay_alloc = false;
+}
+
 NDArray NDArray::grad() const {
   if (Imperative::AGInfo::IsNone(*this)) return NDArray();
   Imperative::AGInfo& info = Imperative::AGInfo::Get(entry_.node);
@@ -64,14 +166,55 @@ nnvm::Symbol NDArray::get_autograd_symbol() const {
   return ret;
 }
 
+#if MXNET_USE_MKLDNN == 1
+
+struct EmptyMKLDNNDeleter {
+  void operator()(mkldnn::memory *mem) {
+  }
+};
+
+NDArray NDArray::ReshapeMKLDNN(const TShape &shape) const {
+  CHECK(!is_none()) << "NDArray is not initialized";
+  CHECK_GE(shape_.Size(), shape.Size())
+    << "NDArray.Reshape: target shape size is larger current shape";
+  CHECK_EQ(storage_type(), kDefaultStorage);
+  if (!IsMKLDNN()) {
+    NDArray ret = this->Detach();
+    ret.shape_ = shape;
+    return ret;
+  } else {
+    NDArray ret(shape, ctx(), true, dtype());
+    // We shouldn't submit the reorder primitive here because submit will
+    // be called in operators.
+    auto format = GetDefaultFormat(ptr_->Mkl_mem_->get_primitive_desc().desc());
+    CHECK_NE(format, ptr_->Mkl_mem_->get_primitive_desc().desc().data.format);
+    auto def_pd = GetPrimitiveDesc(ptr_->Mkl_mem_->get_primitive_desc(), format);
+    auto def_mem = TmpMemMgr::Get()->Alloc(def_pd);
+    MKLDNNStream *stream = MKLDNNStream::Get();
+    stream->RegisterMem(ptr_->Mkl_mem_);
+    stream->RegisterPrim(mkldnn::reorder(*ptr_->Mkl_mem_, *def_mem));
+    // def_mem points to a memory region in the temp space. It's only valid
+    // inside an operator. As such, the returned NDArray can only be valid
+    // inside an operator and the shared point doesn't need to do anything
+    // when it's destroyed.
+    ret.ptr_->Mkl_mem_ = std::shared_ptr<mkldnn::memory>(def_mem,
+                                                         EmptyMKLDNNDeleter());
+    ret.ptr_->shandle.dptr = def_mem->get_data_handle();
+    ret.ptr_->shandle.size = def_mem->get_primitive_desc().get_size();
+    ret.ptr_->delay_alloc = false;
+    ret.ptr_->static_data = true;
+    ret.byte_offset_ = byte_offset_;
+    return ret;
+  }
+}
+
+#endif
+
 NDArray NDArray::Reshape(const TShape &shape) const {
   CHECK(!is_none()) << "NDArray is not initialized";
-  auto stype = storage_type();
-  // reshape is not supported for non-default ndarray with dismatching shapes
-  CHECK((shape_ == shape) || stype == kDefaultStorage)
-    << "Reshape for storage type " << stype << " is not implemented yet";
   CHECK_GE(shape_.Size(), shape.Size())
     << "NDArray.Reshape: target shape size is larger current shape";
+  CHECK_EQ(storage_type(), kDefaultStorage);
   NDArray ret = this->Detach();
   ret.shape_ = shape;
   return ret;
@@ -95,7 +238,6 @@ NDArray NDArray::ReshapeWithRecord(const TShape &shape) {
   return ret;
 }
 
-
 NDArray NDArray::Slice(index_t begin, index_t end) const {
   CHECK(!is_none()) << "NDArray is empty";
   CHECK_LE(begin, end)
@@ -127,8 +269,8 @@ NDArray NDArray::SliceWithRecord(index_t begin, index_t end) {
 }
 
 NDArray NDArray::At(index_t idx) const {
-  CHECK(storage_type() == kDefaultStorage) << "Storage type "
-                                           << storage_type() << " doesn't support At()";
+  CHECK(storage_type() == kDefaultStorage)
+      << "Storage type " << storage_type() << " doesn't support At()";
   NDArray ret = this->Slice(idx, idx+1);
   if (shape_.ndim() > 1) {
     return ret.Reshape(TShape(shape_.data()+1, shape_.data()+shape_.ndim()));
@@ -181,6 +323,409 @@ void NDArray::set_fresh_out_grad(bool state) const {
   info.fresh_out_grad = state;
 }
 
+#if MXNET_USE_MKLDNN == 1
+static inline bool same_shape(const TShape &shape, mkldnn_dims_t dims, int ndims) {
+  if (shape.ndim() != (size_t)ndims)
+    return false;
+  for (int i = 0; i < ndims; i++)
+    if (shape[i] != dims[i])
+      return false;
+  return true;
+}
+
+static inline bool same_shape(const TShape &shape, int dtype, mkldnn::memory::desc desc) {
+  return same_shape(shape, desc.data.dims, desc.data.ndims)
+      && get_mkldnn_type(dtype) == desc.data.data_type;
+}
+
+bool NDArray::Chunk::IsMKLDNN() const {
+  if (storage_type != kDefaultStorage)
+    return false;
+  if (Mkl_mem_ == nullptr)
+    return false;
+  auto desc = Mkl_mem_->get_primitive_desc().desc();
+  return desc.data.format != GetDefaultFormat(desc);
+}
+
+bool NDArray::Chunk::IsDefault() const {
+  if (storage_type != kDefaultStorage)
+    return false;
+  // If we don't have mkldnn memory yet, we just assume it's not the default
+  // format.
+  if (Mkl_mem_ == nullptr)
+    return true;
+  auto desc = Mkl_mem_->get_primitive_desc().desc();
+  return desc.data.format == GetDefaultFormat(desc);
+}
+
+void NDArray::Chunk::Reorder2Default() {
+  if (Mkl_mem_ == nullptr)
+    return;
+
+  auto format = GetDefaultFormat(Mkl_mem_->get_primitive_desc().desc());
+  CHECK(format != Mkl_mem_->get_primitive_desc().desc().data.format);
+
+  auto def_pd = GetPrimitiveDesc(Mkl_mem_->get_primitive_desc(), format);
+  mkldnn_mem_ptr def_mem(new mkldnn::memory(def_pd));
+  // This may be called in MKLDNN operators. We can't use MKLDNNStream here.
+  std::vector<mkldnn::primitive> net;
+  net.push_back(mkldnn::reorder(*Mkl_mem_, *def_mem));
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+
+  CHECK(shandle.size >= def_pd.get_size());
+  CheckAndAlloc(def_pd.get_size());
+  // TODO(zhengda) We need to avoid memory copy here.
+  memcpy(shandle.dptr, def_mem->get_data_handle(), def_pd.get_size());
+  Mkl_mem_.reset(new mkldnn::memory(def_pd, shandle.dptr));
+}
+
+void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) {
+  // The shape of the array and the one of the MKL memory may mismatch.
+  // For example, if the array stores parameters, the MKL memory may store data
+  // in 5 dimensions while the NDArray stores data in 4 dimensions.
+  if (Mkl_mem_ && Mkl_mem_->get_data_handle() == shandle.dptr
+      && same_shape(shape, dtype, Mkl_mem_->get_primitive_desc().desc())) {
+    return;
+  }
+
+  mkldnn::memory::dims dims;
+  // These are shapes supprted by MKLDNN.
+  if (shape.ndim() == 1 || shape.ndim() == 2 || shape.ndim() == 4
+      || shape.ndim() == 5) {
+    dims.resize(shape.ndim());
+    for (size_t i = 0; i < dims.size(); i++)
+      dims[i] = shape[i];
+  } else if (shape.ndim() == 3) {
+    // If there are 3 dimensions, we'll force it to 4 dimensions.
+    dims.resize(shape.ndim() + 1);
+    dims[0] = 1;
+    for (size_t i = 0; i < shape.ndim(); i++)
+      dims[i + 1] = shape[i];
+  } else {
+    LOG(FATAL) << "MKLDNN doesn't support " << shape.ndim() << " dimensions";
+  }
+  mkldnn::memory::format layout = mkldnn::memory::format::format_undef;
+  switch (dims.size()) {
+    case 1: layout = mkldnn::memory::format::x; break;
+    case 2: layout = mkldnn::memory::format::nc; break;
+    case 4: layout = mkldnn::memory::format::nchw; break;
+    // This isn't the right layout when the data has 5 dimensions in MXNet.
+    // MXNet interprets 5 dimensions as ncdhw, but MKLDNN doesn't have
+    // a corresponding format.
+    case 5: layout = mkldnn::memory::format::goihw; break;
+  }
+  mkldnn::memory::desc data_md{dims, get_mkldnn_type(dtype), layout};
+  auto cpu_engine = CpuEngine::Get()->get_engine();
+  if (shandle.dptr == nullptr) {
+    CHECK(delay_alloc);
+    CheckAndAlloc();
+  }
+  mkldnn::memory::primitive_desc pd(data_md, cpu_engine);
+  CHECK(shandle.size >= pd.get_size());
+  Mkl_mem_.reset(new mkldnn::memory(pd, shandle.dptr));
+}
+
+/*
+ * Here we want to get MKLDNN memory whose primitive desc is exactly the same as
+ * the given one. operator== can't guarantee that. == can return true even if
+ * the formats are different. I need to double check its format.
+ */
+static inline mkldnn::memory *GetMKLDNNExact(
+    const mkldnn::memory *mem, mkldnn::memory::primitive_desc desc) {
+  auto src_desc = mem->get_primitive_desc();
+  if (desc == src_desc && desc.desc().data.format == src_desc.desc().data.format) {
+    return const_cast<mkldnn::memory *>(mem);
+  } else {
+    std::shared_ptr<mkldnn::memory> ret(new mkldnn::memory(
+            desc, mem->get_data_handle()));
+    MKLDNNStream::Get()->RegisterMem(ret);
+    return ret.get();
+  }
+}
+
+const mkldnn::memory *NDArray::GetMKLDNNData(
+    const mkldnn::memory::primitive_desc &desc) const {
+  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
+    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
+    return nullptr;
+  }
+  auto mem = GetMKLDNNData();
+  mkldnn::memory::primitive_desc _desc = desc;
+  auto desc1 = mem->get_primitive_desc().desc();
+  auto desc2 = _desc.desc();
+  // The MKL memory has the same format and shape as required,
+  // or both use the default format, we can return the MKL memory.
+  if (mem->get_primitive_desc() == desc
+      || (desc1.data.format == GetDefaultFormat(desc1)
+        && desc2.data.format == GetDefaultFormat(desc2))) {
+    return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc);
+  } else {
+    return nullptr;
+  }
+}
+
+const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
+    const mkldnn::memory::primitive_desc &desc) const {
+  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
+    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
+    return nullptr;
+  }
+  CHECK(storage_type() == kDefaultStorage);
+
+  auto mem = GetMKLDNNData();
+  // If the memory descriptor matches, it's easy.
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  if (mem->get_primitive_desc() == desc) {
+    return GetMKLDNNExact(mem, desc);
+  }
+
+  mkldnn::memory::primitive_desc _desc = desc;
+  // Now we need to determine if we should reorder the memory.
+  // If both use the default formats, we think we don't need to reorder.
+  auto desc1 = mem->get_primitive_desc().desc();
+  auto desc2 = _desc.desc();
+  if (desc1.data.format == GetDefaultFormat(desc1) &&
+      desc2.data.format == GetDefaultFormat(desc2)) {
+    mkldnn_mem_ptr ret(new mkldnn::memory(desc, mem->get_data_handle()));
+    stream->RegisterMem(ret);
+    return ret.get();
+  } else {
+    auto ret = TmpMemMgr::Get()->Alloc(desc);
+    stream->RegisterPrim(mkldnn::reorder(*mem, *ret));
+    return ret;
+  }
+}
+
+const mkldnn::memory *NDArray::GetMKLDNNData() const {
+  CHECK(storage_type() == kDefaultStorage);
+  // If this array uses MKLDNN layout and it's a view, we have to change its
+  // layout to the default layout.
+  if (IsMKLDNN() && IsView())
+    ptr_->Reorder2Default();
+  ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, dtype_);
+  // If shandle has data, the data in shandle and Mkl_mem_ should match.
+  if (ptr_->shandle.dptr)
+    CHECK(ptr_->shandle.dptr == ptr_->Mkl_mem_->get_data_handle());
+  MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_);
+  auto pd = ptr_->Mkl_mem_->get_primitive_desc();
+  if (IsView()) {
+    // Sliced array must use the default layout.
+    CHECK_EQ(GetDefaultFormat(pd.desc()), pd.desc().data.format);
+  }
+  if (IsView()) {
+    void *off_addr = static_cast<char *>(ptr_->Mkl_mem_->get_data_handle())
+        + byte_offset_;
+
+    // Create the primitive desc for the new mkldnn memory.
+    mkldnn::memory::dims dims(pd.desc().data.ndims);
+    // The first dimension has been sliced.
+    dims[0] = shape()[0];
+    for (size_t i = 1; i < dims.size(); i++)
+      dims[i] = pd.desc().data.dims[i];
+    mkldnn::memory::format cpp_format = static_cast<mkldnn::memory::format>(
+        pd.desc().data.format);
+    mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(
+        pd.desc().data.data_type);
+    mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
+    mkldnn::memory::primitive_desc new_pd(data_md, pd.get_engine());
+
+    std::shared_ptr<mkldnn::memory> ret(new mkldnn::memory(new_pd, off_addr));
+    MKLDNNStream::Get()->RegisterMem(ret);
+    return ret.get();
+  } else {
+    return ptr_->Mkl_mem_.get();
+  }
+}
+
+void NDArray::Reorder(const mkldnn::memory::primitive_desc &pd) {
+  CHECK_EQ(storage_type(), kDefaultStorage);
+  // If the memory already uses the specified layout, don't do anything.
+  if (ptr_->Mkl_mem_ != nullptr && ptr_->Mkl_mem_->get_primitive_desc() == pd)
+    return;
+  auto _pd = pd;
+  auto _desc = _pd.desc();
+  auto def_format = GetDefaultFormat(_desc);
+  // If the memory is default, don't do anything.
+  if (def_format == _desc.data.format && ptr_->IsDefault())
+    return;
+  // If the specified layout is default, we should use Reorder2Default.
+  if (def_format == _desc.data.format) {
+    ptr_->Reorder2Default();
+    return;
+  }
+
+  std::shared_ptr<mkldnn::memory> new_mem(new mkldnn::memory(pd));
+  ptr_->SetMKLMem(shape_, dtype_);
+  auto old_mem = ptr_->Mkl_mem_;
+  // It's possible that the specified layout has a different number of dimensions.
+  if (old_mem->get_primitive_desc().desc().data.ndims != _desc.data.ndims) {
+    // For now, we only support reorder from the default layout.
+    CHECK(ptr_->IsDefault());
+    auto def_pd = GetPrimitiveDesc(pd, def_format);
+    old_mem.reset(new mkldnn::memory(def_pd, old_mem->get_data_handle()));
+  }
+  // This may be called in MKLDNN operators. We can't use MKLDNNStream here.
+  std::vector<mkldnn::primitive> net;
+  net.push_back(mkldnn::reorder(*old_mem, *new_mem));
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+
+  CHECK(ptr_->shandle.size >= pd.get_size());
+  ptr_->CheckAndAlloc(pd.get_size());
+  // TODO(zhengda) We need to avoid memory copy here.
+  memcpy(ptr_->shandle.dptr, new_mem->get_data_handle(), pd.get_size());
+  ptr_->Mkl_mem_.reset(new mkldnn::memory(pd, ptr_->shandle.dptr));
+}
+
+void NDArray::CopyFrom(const mkldnn::memory &mem) {
+  if (ptr_ == nullptr) {
+    LOG(FATAL) << "The NDArray hasn't been initialized";
+    return;
+  }
+  if (ptr_->Mkl_mem_.get() == &mem)
+    return;
+
+  if (mem.get_primitive_desc().get_size() != shape().Size() * GetTypeSize(dtype_)) {
+    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
+    return;
+  }
+
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  // If this array uses MKLDNN layout and it's a view, we have to change its
+  // layout to the default layout.
+  if (IsMKLDNN() && IsView())
+    ptr_->Reorder2Default();
+  ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_,
+                  dtype_);
+  stream->RegisterMem(ptr_->Mkl_mem_);
+  auto from_desc = mem.get_primitive_desc().desc();
+  auto this_desc = ptr_->Mkl_mem_->get_primitive_desc().desc();
+  auto from_def_format = GetDefaultFormat(from_desc);
+  if (IsView()) {
+    // Sliced array must use the default layout.
+    CHECK_EQ(GetDefaultFormat(this_desc), this_desc.data.format);
+  }
+  // It's possible that the memory and the NDArray don't have the same shape.
+  if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)
+      // If the source memory uses the default layout, we can reshape directly.
+      && from_def_format == from_desc.data.format) {
+    // In this case, we can simply create a new MKLDNN memory for the required
+    // shape.
+    mkldnn::memory::dims dims(this_desc.data.dims,
+                              this_desc.data.dims + this_desc.data.ndims);
+    auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
+    auto this_format = static_cast<mkldnn::memory::format>(GetDefaultFormat(this_desc));
+    mkldnn::memory::desc data_md(dims, this_dtype, this_format);
+    mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine());
+    mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle()));
+    stream->RegisterMem(tmp_mem);
+    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->Mkl_mem_));
+  } else if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)) {
+    // In this case, the source memory stores data in a customized layout. We
+    // need to reorganize the data in memory before we can reshape.
+    auto def_pd = GetPrimitiveDesc(mem.get_primitive_desc(), from_def_format);
+    auto def_mem = TmpMemMgr::Get()->Alloc(def_pd);
+    stream->RegisterPrim(mkldnn::reorder(mem, *def_mem));
+    // Now we can reshape it
+    mkldnn::memory::dims dims(this_desc.data.dims,
+                              this_desc.data.dims + this_desc.data.ndims);
+    auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
+    auto this_format = static_cast<mkldnn::memory::format>(GetDefaultFormat(this_desc));
+    mkldnn::memory::desc data_md(dims, this_dtype, this_format);
+    mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine());
+    mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, def_mem->get_data_handle()));
+    stream->RegisterMem(tmp_mem);
+    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->Mkl_mem_));
+  } else if (mem.get_primitive_desc() == ptr_->Mkl_mem_->get_primitive_desc()) {
+    // If the layout is the same, we can just copy data.
+    stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->Mkl_mem_));
+  } else {
+    auto src_def = GetDefaultFormat(mem.get_primitive_desc().desc());
+    auto dst_def = GetDefaultFormat(ptr_->Mkl_mem_->get_primitive_desc().desc());
+    // If both are not using the default layouts. There isn't much we can do,
+    // other than reorder data layout directly.
+    if (dst_def != ptr_->Mkl_mem_->get_primitive_desc().desc().data.format
+        && src_def != mem.get_primitive_desc().desc().data.format) {
+      stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->Mkl_mem_));
+    } else if (dst_def == ptr_->Mkl_mem_->get_primitive_desc().desc().data.format) {
+      // If the dest mem uses the default memory layout, we can simply use
+      // the default format of the source memory to improve perf of reorder.
+      auto pd = GetPrimitiveDesc(ptr_->Mkl_mem_->get_primitive_desc(), src_def);
+      mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, ptr_->Mkl_mem_->get_data_handle()));
+      stream->RegisterMem(tmp_mem);
+      stream->RegisterPrim(mkldnn::reorder(mem, *tmp_mem));
+    } else {
+      // If the src mem uses the default memory layout, we can use
+      // the default format of the source memory to improve perf.
+      auto pd = GetPrimitiveDesc(mem.get_primitive_desc(), dst_def);
+      mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle()));
+      stream->RegisterMem(tmp_mem);
+      stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->Mkl_mem_));
+    }
+  }
+}
+mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd,
+                                                mkldnn_memory_format_t format);
+
+mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc &desc) {
+  // This array shouldn't be a view.
+  CHECK(!IsView());
+
+  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
+    LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
+    return nullptr;
+  }
+
+  mkldnn::memory::primitive_desc _desc = desc;
+  auto required_format = _desc.desc().data.format;
+  auto def_format = GetDefaultFormat(_desc.desc());
+  // If the required format is a default format, we don't need to worry about the shape.
+  // If the shape isn't the same, it actually implicitly reshapes data.
+  if (required_format == def_format) {
+    ptr_->SetMKLMem(shape_, dtype_);
+    MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_);
+    return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc);
+  }
+
+  if (ptr_->Mkl_mem_)
+    CHECK(ptr_->Mkl_mem_->get_data_handle() == ptr_->shandle.dptr);
+  ptr_->ResetMKLMem();
+  if (ptr_->Mkl_mem_ && ptr_->Mkl_mem_->get_primitive_desc() == desc) {
+    MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_);
+    return GetMKLDNNExact(ptr_->Mkl_mem_.get(), desc);
+  }
+
+  CHECK(ptr_->shandle.size >= desc.get_size());
+  ptr_->CheckAndAlloc(desc.get_size());
+  ptr_->Mkl_mem_.reset(new mkldnn::memory(desc, ptr_->shandle.dptr));
+  MKLDNNStream::Get()->RegisterMem(ptr_->Mkl_mem_);
+  return ptr_->Mkl_mem_.get();
+}
+#endif
+
+void NDArray::SetTBlob() const {
+  CHECK(ptr_ != nullptr);
+  TShape shape = shape_;
+  char *dptr = static_cast<char*>(ptr_->shandle.dptr);
+  auto stype = storage_type();
+  if (stype == kDefaultStorage) {
+#if MXNET_USE_MKLDNN == 1
+    if (IsMKLDNN()) {
+      ptr_->Reorder2Default();
+      dptr = static_cast<char*>(ptr_->shandle.dptr);
+    }
+#endif
+    dptr += byte_offset_;
+  } else if (stype == kCSRStorage || stype == kRowSparseStorage) {
+    CHECK_EQ(byte_offset_, 0);
+    shape = storage_shape();
+  } else {
+    LOG(FATAL) << "unknown storage type " << stype;
+  }
+  tblob_.dptr_ = dptr;
+  tblob_.shape_ = shape;
+  tblob_.type_flag_ = dtype_;
+  tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id);
+}
 
 /*!
 * \brief run a ternary operation
@@ -449,11 +994,30 @@ inline void CopyFromToRspImpl(const NDArray& from, const NDArray& to, RunContext
 // Make a copy of a dense NDArray
 template<typename from_xpu, typename to_xpu>
 inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext ctx) {
-  using namespace mshadow;
-  CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
-  TBlob tmp = to.data();
-  ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp,
-                                  from.ctx(), to.ctx(), ctx);
+#if MXNET_USE_MKLDNN == 1
+  // If neither is MKLDNN, we can copy data normally.
+  if (!from.IsMKLDNN() && !to.IsMKLDNN()) {
+#endif
+    using namespace mshadow;
+    CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
+    TBlob tmp = to.data();
+    ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp,
+                                    from.ctx(), to.ctx(), ctx);
+#if MXNET_USE_MKLDNN == 1
+  } else {
+    auto from_mem = from.GetMKLDNNData();
+    auto to_mem = to.GetMKLDNNData();
+    if (from_mem->get_primitive_desc() == to_mem->get_primitive_desc()) {
+      size_t size = std::min(from_mem->get_primitive_desc().get_size(),
+                             to_mem->get_primitive_desc().get_size());
+      memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size);
+    } else {
+      std::vector<mkldnn::primitive> net;
+      net.push_back(mkldnn::reorder(*from_mem, *to_mem));
+      mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+    }
+  }
+#endif
 }
 
 // Make a copy of an NDArray based on storage type
diff --git a/src/operator/concat-inl.h b/src/operator/concat-inl.h
deleted file mode 100644
index 4225ddf4eac0..000000000000
--- a/src/operator/concat-inl.h
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file concat-inl.h
- * \brief
- * \author Bing Xu
-*/
-#ifndef MXNET_OPERATOR_CONCAT_INL_H_
-#define MXNET_OPERATOR_CONCAT_INL_H_
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <cstring>
-#include <map>
-#include <string>
-#include <vector>
-#include <utility>
-#include "./operator_common.h"
-#include "./channel_op_common.h"
-#include "./tensor/broadcast_reduce_op.h"
-
-namespace mxnet {
-namespace op {
-
-namespace concat_enum {
-enum ConcatOpInputs {kData0, kData1, kData2, kData3, kData4};
-enum ConcatOpOutputs {kOut};
-}  // namespace concat_enum
-
-struct ConcatParam : public dmlc::Parameter<ConcatParam> {
-  int num_args;
-  int dim;
-  DMLC_DECLARE_PARAMETER(ConcatParam) {
-    DMLC_DECLARE_FIELD(num_args).set_lower_bound(1)
-    .describe("Number of inputs to be concated.");
-    DMLC_DECLARE_FIELD(dim).set_default(1)
-    .describe("the dimension to be concated.");
-  }
-};  // struct ConcatParam
-
-template<typename xpu, typename DType>
-class ConcatOp : public Operator {
- public:
-  explicit ConcatOp(ConcatParam param)
-    : size_(param.num_args), dimension_(param.dim) {}
-
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(static_cast<int>(in_data.size()), size_);
-    CHECK_EQ(out_data.size(), 1U);
-    int axis = CheckAxis(dimension_, in_data[concat_enum::kData0].ndim());
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    std::vector<Tensor<xpu, 3, DType> > data(size_);
-    Tensor<xpu, 3, DType> out;
-    size_t leading = 1, trailing = 1;
-    for (int i = 0; i < axis; ++i) {
-      leading *= out_data[concat_enum::kOut].shape_[i];
-    }
-    for (int i = axis + 1; i < out_data[concat_enum::kOut].ndim(); ++i) {
-      trailing *= out_data[concat_enum::kOut].shape_[i];
-    }
-    size_t mid = out_data[concat_enum::kOut].shape_[axis];
-    Shape<3> oshape = Shape3(leading, mid, trailing);
-    out = out_data[concat_enum::kOut].get_with_shape<xpu, 3, DType>(oshape, s);
-
-    for (int i = 0; i < size_; ++i) {
-      Shape<3> dshape = Shape3(leading, in_data[i].shape_[axis], trailing);
-      data[i] = in_data[i].get_with_shape<xpu, 3, DType>(dshape, s);
-    }
-    Concatenate(data, &out, 1, req[concat_enum::kOut]);
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_grad.size(), static_cast<size_t>(size_));
-    int axis = CheckAxis(dimension_, out_grad[concat_enum::kData0].ndim());
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    std::vector<Tensor<xpu, 3, DType> > grad_in(size_);
-    Tensor<xpu, 3, DType> grad;
-    size_t leading = 1, trailing = 1;
-    for (int i = 0; i < axis; ++i) {
-      leading *= out_grad[concat_enum::kOut].shape_[i];
-    }
-    for (int i = axis + 1; i < out_grad[concat_enum::kOut].ndim(); ++i) {
-      trailing *= out_grad[concat_enum::kOut].shape_[i];
-    }
-    size_t mid = out_grad[concat_enum::kOut].shape_[axis];
-    Shape<3> oshape = Shape3(leading, mid, trailing);
-    grad = out_grad[concat_enum::kOut].get_with_shape<xpu, 3, DType>(oshape, s);
-
-    for (int i = 0; i < size_; ++i) {
-      Shape<3> dshape = Shape3(leading, in_grad[i].shape_[axis], trailing);
-      grad_in[i] = in_grad[i].get_with_shape<xpu, 3, DType>(dshape, s);
-    }
-    Split(grad, &grad_in, 1, req);
-  }
-
- private:
-  int size_;
-  int dimension_;
-};  // class ConcatOp
-
-template<typename xpu>
-Operator *CreateOp(ConcatParam param, int dtype, std::vector<TShape> *in_shape);
-
-#if DMLC_USE_CXX11
-class ConcatProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  std::vector<std::string> ListArguments() const override {
-    std::vector<std::string> ret;
-    for (int i = 0; i < param_.num_args; ++i) {
-      ret.push_back(std::string("arg") + std::to_string(i));
-    }
-    return ret;
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), static_cast<size_t>(param_.num_args));
-    TShape dshape;
-    index_t size = 0;
-    bool has_zero = false;
-    int axis = -1;
-    for (int i = 0; i < param_.num_args; ++i) {
-      TShape tmp = (*in_shape)[i];
-      if (tmp.ndim()) {
-        axis = CheckAxis(param_.dim, tmp.ndim());
-        has_zero = tmp[axis] == 0 || has_zero;
-        size += tmp[axis];
-        tmp[axis] = 0;
-        shape_assign(&dshape, tmp);
-      }
-    }
-
-    TShape tmp = (*out_shape)[0];
-    if (tmp.ndim()) {
-      axis = CheckAxis(param_.dim, tmp.ndim());
-      tmp[axis] = 0;
-      shape_assign(&dshape, tmp);
-    }
-
-    if (dshape.ndim() == 0) return false;
-
-    for (int i = 0; i < param_.num_args; ++i) {
-      CHECK(shape_assign(&(*in_shape)[i], dshape))
-        << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i];
-    }
-
-    if (!has_zero) dshape[axis] = size;
-    CHECK(shape_assign(&(*out_shape)[0], dshape))
-      << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0];
-
-    return dshape.Size() != 0;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    int dtype = -1;
-
-    for (size_t i = 0; i < in_type->size(); ++i) {
-      if (dtype == -1) {
-        dtype = in_type->at(i);
-      } else {
-        CHECK(in_type->at(i) == dtype ||
-              in_type->at(i) == -1) <<
-              "Non-uniform data type in Concat";
-      }
-    }
-
-    if (dtype == -1) {
-      LOG(FATAL) << "Not enough information to infer type in Concat.";
-      return false;
-    }
-
-    size_t nin = this->ListArguments().size();
-    in_type->clear();
-    for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype);
-
-    size_t naux = this->ListAuxiliaryStates().size();
-    aux_type->clear();
-    for (size_t i = 0; i < naux; ++i) aux_type->push_back(dtype);
-
-    size_t nout = this->ListOutputs().size();
-    out_type->clear();
-    for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype);
-
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new ConcatProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "Concat";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return out_grad;
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not implemented";
-    return NULL;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
-
- private:
-  ConcatParam param_;
-};  // class ConcatProp
-#endif  // DMLC_USE_CXX11
-}  // namespace op
-}  // namespace mxnet
-
-#endif  // MXNET_OPERATOR_CONCAT_INL_H_
diff --git a/src/operator/concat.cc b/src/operator/concat.cc
deleted file mode 100644
index 4d3c2fa1661f..000000000000
--- a/src/operator/concat.cc
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file concat.cc
- * \brief
- * \author Bing Xu
-*/
-
-#include "./concat-inl.h"
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "./mkl/mkl_memory-inl.h"
-#include "./mkl/mkl_concat-inl.h"
-#endif  // MXNET_USE_MKL2017
-
-namespace mxnet {
-namespace op {
-template<>
-Operator* CreateOp<cpu>(ConcatParam param, int dtype, std::vector<TShape> *in_shape) {
-  Operator *op = NULL;
-#if MXNET_USE_MKL2017 == 1
-  // MKL supports 4D input tensors only for concat operation
-  // 2D/3D input tensors are reshaped to 4D in mkl_concat-inl.h
-  // hence MKL supports 2D/3D/4D input tensors for concat operation
-  size_t dims = (*in_shape)[0].ndim();
-  bool supportedDim = (dims >= 2 && dims <= 4);
-  if ((1 == param.dim) && supportedDim &&
-    (param.num_args < (dnnResourceMultipleDst - dnnResourceMultipleSrc))) {
-    switch (dtype) {
-      case mshadow::kFloat32:
-      return new MKLConcatOp<cpu, float>(param);
-    case mshadow::kFloat64:
-      return new MKLConcatOp<cpu, double>(param);
-    default:
-      break;
-    }
-  }
-  if (enableMKLWarnGenerated())
-    LOG(INFO) << MKLConcatOp<cpu, float>::getName() << " Skip MKL optimization";
-#endif
-  MSHADOW_TYPE_SWITCH(dtype, DType, {
-    op = new ConcatOp<cpu, DType>(param);
-  });
-  return op;
-}
-
-Operator* ConcatProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                       std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0), in_shape);
-}
-
-DMLC_REGISTER_PARAMETER(ConcatParam);
-
-MXNET_REGISTER_OP_PROPERTY(Concat, ConcatProp)
-.describe(R"code(Joins input arrays along a given axis.
-
-.. note:: `Concat` is deprecated. Use `concat` instead.
-
-The dimensions of the input arrays should be the same except the axis along
-which they will be concatenated.
-The dimension of the output array along the concatenated axis will be equal
-to the sum of the corresponding dimensions of the input arrays.
-
-Example::
-
-   x = [[1,1],[2,2]]
-   y = [[3,3],[4,4],[5,5]]
-   z = [[6,6], [7,7],[8,8]]
-
-   concat(x,y,z,dim=0) = [[ 1.,  1.],
-                          [ 2.,  2.],
-                          [ 3.,  3.],
-                          [ 4.,  4.],
-                          [ 5.,  5.],
-                          [ 6.,  6.],
-                          [ 7.,  7.],
-                          [ 8.,  8.]]
-
-   Note that you cannot concat x,y,z along dimension 1 since dimension
-   0 is not the same for all the input arrays.
-
-   concat(y,z,dim=1) = [[ 3.,  3.,  6.,  6.],
-                         [ 4.,  4.,  7.,  7.],
-                         [ 5.,  5.,  8.,  8.]]
-
-)code" ADD_FILELINE)
-.add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate")
-.add_arguments(ConcatParam::__FIELDS__())
-.set_key_var_num_args("num_args");
-
-NNVM_REGISTER_OP(Concat).add_alias("concat");
-
-}  // namespace op
-}  // namespace mxnet
diff --git a/src/operator/lrn-inl.h b/src/operator/lrn-inl.h
deleted file mode 100644
index adfe4676702d..000000000000
--- a/src/operator/lrn-inl.h
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file lrn-inl.h
- * \brief
- * \author Bing Xu
-*/
-#ifndef MXNET_OPERATOR_LRN_INL_H_
-#define MXNET_OPERATOR_LRN_INL_H_
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "./operator_common.h"
-#include "./mshadow_op.h"
-
-namespace mxnet {
-namespace op {
-
-namespace lrn_enum {
-enum LRNInputs {kData};
-enum LRNOutputs {kOut, kTmpNorm};
-}  // namespace lrn_enum
-
-struct LRNParam : public dmlc::Parameter<LRNParam> {
-  float alpha;
-  float beta;
-  float knorm;
-  uint32_t nsize;
-  DMLC_DECLARE_PARAMETER(LRNParam) {
-    DMLC_DECLARE_FIELD(alpha).set_default(1e-4f)
-    .describe("The variance scaling parameter :math:`\alpha` in the LRN expression.");
-    DMLC_DECLARE_FIELD(beta).set_default(0.75f)
-    .describe("The power parameter :math:`\beta` in the LRN expression.");
-    DMLC_DECLARE_FIELD(knorm).set_default(2.0f)
-    .describe("The parameter :math:`k` in the LRN expression.");
-    DMLC_DECLARE_FIELD(nsize)
-    .describe("normalization window width in elements.");
-  }
-};  // struct LRNParam
-
-template<typename xpu>
-class LocalResponseNormOp : public Operator {
- public:
-  explicit LocalResponseNormOp(LRNParam param) {
-    param_ = param;
-  }
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    // TODO(xxx): Test with gradient chceker
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 2U);
-    // CHECK_EQ(req.size(), 2);
-    CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size";
-    const real_t salpha = param_.alpha / param_.nsize;
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4> data = in_data[lrn_enum::kData].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> out = out_data[lrn_enum::kOut].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> tmp_norm = out_data[lrn_enum::kTmpNorm].get<xpu, 4, real_t>(s);
-    tmp_norm = chpool<red::sum>(F<mshadow_op::square>(data) , param_.nsize) * salpha + param_.knorm;
-    Assign(out, req[lrn_enum::kOut], data *  F<mshadow_op::power>(tmp_norm, -param_.beta));
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 2U);
-    const real_t salpha = param_.alpha / param_.nsize;
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4> grad = out_grad[lrn_enum::kOut].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> tmp_norm = out_data[lrn_enum::kTmpNorm].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> data = in_data[lrn_enum::kData].get<xpu, 4, real_t>(s);
-    Tensor<xpu, 4> grad_in = in_grad[lrn_enum::kData].get<xpu, 4, real_t>(s);
-    grad_in = grad * F<mshadow_op::power>(tmp_norm, -param_.beta);
-    grad_in += (- 2.0f * param_.beta * salpha) *
-               chpool<red::sum>(grad * data *
-                                F<mshadow_op::power>(tmp_norm, -param_.beta - 1.0f),
-                                param_.nsize)  * data;
-  }
-
- private:
-  LRNParam param_;
-};  // class LocalResponseNormOp
-
-template<typename xpu>
-Operator *CreateOp(LRNParam param, int dtype);
-
-#if DMLC_USE_CXX11
-class LocalResponseNormProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 1U) << "Input:[data]";
-    const TShape &dshape = in_shape->at(0);
-    if (dshape.ndim() == 0) return false;
-    out_shape->clear();
-    out_shape->push_back(dshape);
-    out_shape->push_back(dshape);
-    return true;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_GE(in_type->size(), 1U);
-    int dtype = (*in_type)[0];
-    CHECK_NE(dtype, -1) << "First input must have specified type";
-    for (index_t i = 0; i < in_type->size(); ++i) {
-      if ((*in_type)[i] == -1) {
-        (*in_type)[i] = dtype;
-      } else {
-        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
-      }
-    }
-    int n_out = this->ListOutputs().size();
-    out_type->clear();
-    for (int i = 0; i < n_out; ++i ) out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new LocalResponseNormProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "LRN";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {
-      out_grad[lrn_enum::kOut], in_data[lrn_enum::kData],
-      out_data[lrn_enum::kTmpNorm], out_data[lrn_enum::kOut]
-    };
-  }
-
-  int NumVisibleOutputs() const override {
-    return 1;
-  }
-
-  int NumOutputs() const override {
-    return 2;
-  }
-
-  std::vector<std::string> ListArguments() const override {
-    return {"data"};
-  }
-
-  std::vector<std::string> ListOutputs() const override {
-    return {"output", "tmp_norm"};
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return NULL;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
-
- private:
-  LRNParam param_;
-};  // LocalResponseNormProp
-#endif  // DMLC_USE_CXX11
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_LRN_INL_H_
diff --git a/src/operator/lrn.cc b/src/operator/lrn.cc
deleted file mode 100644
index 9b3afd80cd18..000000000000
--- a/src/operator/lrn.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file lrn.cc
- * \brief
- * \author Bing Xu
-*/
-
-#include "./lrn-inl.h"
-#if MXNET_USE_CUDNN == 1
-#include "./cudnn_lrn-inl.h"
-#endif
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "./mkl/mkl_memory-inl.h"
-#include "./mkl/mkl_lrn-inl.h"
-#endif
-
-namespace mxnet {
-namespace op {
-template<>
-Operator* CreateOp<cpu>(LRNParam param, int dtype) {
-#if MXNET_USE_MKL2017 == 1
-  return new MKLLRNOp<cpu, float>(param);
-#endif
-  return new LocalResponseNormOp<cpu>(param);
-}
-
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator* LocalResponseNormProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-    std::vector<int> *in_type) const {
-    DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
-}
-
-DMLC_REGISTER_PARAMETER(LRNParam);
-
-MXNET_REGISTER_OP_PROPERTY(LRN, LocalResponseNormProp)
-.add_argument("data", "NDArray-or-Symbol", "Input data.")
-.add_arguments(LRNParam::__FIELDS__())
-.describe(R"code(Applies local response normalization to the input.
-
-The local response normalization layer performs "lateral inhibition" by normalizing
-over local input regions.
-
-If :math:`a_{x,y}^{i}` is the activity of a neuron computed by applying kernel :math:`i` at position
-:math:`(x, y)` and then applying the ReLU nonlinearity, the response-normalized
-activity :math:`b_{x,y}^{i}` is given by the expression:
-
-.. math::
-   b_{x,y}^{i} = \frac{a_{x,y}^{i}}{\Bigg({k + \alpha \sum_{j=max(0, i-\frac{n}{2})}^{min(N-1, i+\frac{n}{2})} (a_{x,y}^{j})^{2}}\Bigg)^{\beta}}
-
-where the sum runs over :math:`n` "adjacent" kernel maps at the same spatial position, and :math:`N` is the total
-number of kernels in the layer.
-
-)code" ADD_FILELINE);
-
-}  // namespace op
-}  // namespace mxnet
diff --git a/src/operator/mkl/mkl_batch_norm-inl.h b/src/operator/mkl/mkl_batch_norm-inl.h
deleted file mode 100644
index b5967f4de294..000000000000
--- a/src/operator/mkl/mkl_batch_norm-inl.h
+++ /dev/null
@@ -1,391 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_batch_norm-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_
-#include <mxnet/storage.h>
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "../operator_common.h"
-#include "../mshadow_op.h"
-#include "./mkl_util-inl.h"
-
-namespace mxnet {
-namespace op {
-
-template<typename xpu, typename DType>
-class MKLBatchNormOp : public Operator {
- public:
-  explicit MKLBatchNormOp(BatchNormParam param) {
-    this->param_ = param;
-    fwd_top_data = MKLData<DType>::create();
-    fwd_bottom_data = MKLData<DType>::create();
-    bwd_top_diff = MKLData<DType>::create();
-    bwd_bottom_diff = MKLData<DType>::create();
-    scaleShift_space.dptr = NULL;
-    scaleShiftDiff_space.dptr = NULL;
-  }
-  virtual ~MKLBatchNormOp() {
-    if (batchNormFwdInference != NULL) dnnDelete<DType>(batchNormFwdInference);
-    if (batchNormFwdTraining != NULL) dnnDelete<DType>(batchNormFwdTraining);
-    if (batchNormBwdScaleShift != NULL) dnnDelete<DType>(batchNormBwdScaleShift);
-    dnnLayoutDelete<DType>(layout_usr_);
-    if (scaleShift_space.dptr)
-      Storage::Get()->Free(scaleShift_space);
-    if (scaleShiftDiff_space.dptr)
-      Storage::Get()->Free(scaleShiftDiff_space);
-  }
-  static std::string getName() {
-    return "MKLBatchNormOp";
-  }
-
- private:
-  void LayerSetUp(const mshadow::Tensor<xpu, 4, DType> &data,
-                  const mshadow::Tensor<xpu, 4, DType> &out) {
-    eps_ = param_.eps;
-    size_t dim = 4, sizes[4], strides[4];
-    channels_ = data.shape_[1];
-    height_ = data.shape_[2];
-    width_ = data.shape_[3];
-    num_ = data.shape_[0];
-
-    sizes[0] = width_;
-    sizes[1] = height_;
-    sizes[2] = channels_;
-    sizes[3] = num_;
-
-    strides[0] = 1;
-    strides[1] = sizes[0];
-    strides[2] = sizes[0] * sizes[1];
-    strides[3] = sizes[0] * sizes[1] * sizes[2];
-
-    // Names are for debugging only
-    fwd_bottom_data->name = "fwd_bottom_data   @ " + getName();
-    fwd_top_data->name = "fwd_top_data      @ " + getName();
-    bwd_bottom_diff->name = "bwd_bottom_diff   @ " + getName();
-    bwd_top_diff->name = "bwd_top_diff      @ " + getName();
-
-    dnnError_t e;
-    e = dnnLayoutCreate<DType>(&layout_usr_, dim, sizes, strides);
-    CHECK_EQ(e, E_SUCCESS);
-
-    fwd_bottom_data->create_user_layout(dim, sizes, strides);
-    fwd_top_data->create_user_layout(dim, sizes, strides);
-    bwd_bottom_diff->create_user_layout(dim, sizes, strides);
-    bwd_top_diff->create_user_layout(dim, sizes, strides);
-
-    // Primitives will be allocated during the first fwd pass
-    batchNormFwdInference = NULL;
-    batchNormFwdTraining = NULL;
-    batchNormBwdScaleShift = NULL;
-    int scaleShift_size = channels_*2*sizeof(DType);
-    scaleShift_space = Storage::Get()->Alloc(scaleShift_size, Context::CPU());
-    scaleShiftDiff_space = Storage::Get()->Alloc(scaleShift_size, Context::CPU());
-    DType * scaleShift_buf = reinterpret_cast<DType*>(scaleShift_space.dptr);
-    /*!use_weight_bias_*/
-    for (int i = 0; i < channels_; i++) {
-        scaleShift_buf[i] = 1.0;
-        scaleShift_buf[channels_ + i] = 0;
-    }
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 3);
-    CHECK_EQ(aux_states.size(), 2);
-    if (ctx.is_train) {
-      CHECK_EQ(out_data.size(), 3);
-      CHECK_EQ(req.size(), 3);
-    } else {
-      CHECK_GE(out_data.size(), 1);
-      CHECK_GE(req.size(), 1);
-      CHECK_EQ(req[batchnorm::kOut], kWriteTo);
-    }
-
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType>  data;
-    Tensor<xpu, 4, DType>  out;
-    if (in_data[batchnorm::kData].ndim() == 2) {
-      Shape<4> dshape = Shape4(in_data[batchnorm::kData].shape_[0],
-                               in_data[batchnorm::kData].shape_[1], 1, 1);
-      data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_data[batchnorm::kData], dshape, s);
-      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[batchnorm::kOut], dshape, s);
-    } else {
-      data = mkl_experimental_direct_get<xpu, 4, DType>(in_data[batchnorm::kData], s);
-      out = mkl_experimental_direct_get<xpu, 4, DType>(out_data[batchnorm::kOut], s);
-    }
-
-    // const real_t scale = static_cast<real_t>(in_data[batchnorm::kData].shape_[1]) /
-    //   static_cast<real_t>(in_data[batchnorm::kData].shape_.Size());
-
-    Tensor<xpu, 1, DType> slope = in_data[batchnorm::kGamma].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> bias = in_data[batchnorm::kBeta].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> moving_mean = aux_states[batchnorm::kMovingMean].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> moving_var = aux_states[batchnorm::kMovingVar].get<xpu, 1, DType>(s);
-
-    if (param_.fix_gamma)
-      slope = 1.f;
-
-    dnnError_t e;
-    if (!init_mkldnn_) {
-      LayerSetUp(data, out);
-      init_mkldnn_ = true;
-    }
-    void* bottom_data = NULL;
-#if MKL_EXPERIMENTAL == 1
-    bottom_data =
-          reinterpret_cast<void *>(mkl_prv_data<DType>(in_data[batchnorm::kData]));
-#endif
-    int bwd_flags = dnnUseScaleShift;
-    if (param_.use_global_stats)
-      bwd_flags = dnnUseScaleShift | dnnUseInputMeanVariance;
-#if MKL_EXPERIMENTAL == 1
-    if (NULL != bottom_data) {
-      // Is it the first pass? Create a primitive.
-      if (batchNormFwdInference == NULL) {
-        std::shared_ptr<MKLMemHolder> bottom_data_mem = in_data[batchnorm::kData].Mkl_mem_;
-        std::shared_ptr<PrvMemDescr> bottom_prv_desc = bottom_data_mem->get_prv_descriptor();
-        CHECK(bottom_prv_desc->get_descr_type() == PrvMemDescr::PRV_DESCR_MKL2017);
-        std::shared_ptr<MKLData<DType> > mem_descr
-          = std::static_pointer_cast<MKLData<DType>>(bottom_prv_desc);
-        CHECK(mem_descr != NULL);
-        fwd_bottom_data = mem_descr;
-
-        e = dnnBatchNormalizationCreateForward_v2<DType>(
-             &batchNormFwdInference, NULL, mem_descr->layout_int, eps_,
-             dnnUseInputMeanVariance | dnnUseScaleShift);
-        CHECK_EQ(e, E_SUCCESS);
-
-        e = dnnBatchNormalizationCreateForward_v2<DType>(
-              &batchNormFwdTraining, NULL, mem_descr->layout_int, eps_,
-              dnnUseScaleShift);
-        CHECK_EQ(e, E_SUCCESS);
-
-        fwd_top_data->create_internal_layout(batchNormFwdInference, dnnResourceDst);
-        bwd_top_diff->create_internal_layout(batchNormFwdInference, dnnResourceDst);
-        bwd_bottom_diff->create_internal_layout(batchNormFwdInference, dnnResourceSrc);
-
-        e = dnnBatchNormalizationCreateBackward_v2<DType>(
-                &batchNormBwdScaleShift, NULL, mem_descr->layout_int, eps_, bwd_flags);
-        CHECK_EQ(e, E_SUCCESS);
-      }
-    }
-#endif
-    if (NULL == bottom_data) {
-      if (batchNormFwdInference == NULL) {
-        e = dnnBatchNormalizationCreateForward_v2<DType>(
-          &batchNormFwdInference, NULL, layout_usr_, eps_,
-          dnnUseInputMeanVariance | dnnUseScaleShift);
-        CHECK_EQ(e, E_SUCCESS);
-
-        e = dnnBatchNormalizationCreateForward_v2<DType>(
-              &batchNormFwdTraining, NULL, layout_usr_, eps_, dnnUseScaleShift);
-        CHECK_EQ(e, E_SUCCESS);
-
-        e = dnnBatchNormalizationCreateBackward_v2<DType>(
-              &batchNormBwdScaleShift, NULL, layout_usr_, eps_, bwd_flags);
-        CHECK_EQ(e, E_SUCCESS);
-      }
-      bottom_data = reinterpret_cast<void *>(data.dptr_);
-    }
-
-    DType * scaleShift_buf = reinterpret_cast<DType*>(scaleShift_space.dptr);
-     // use_weight_bias_
-    for (int i = 0; i < channels_; i++) {
-        scaleShift_buf[i] = (slope.dptr_)[i];
-    }
-    for (int i = 0; i < channels_; i++) {
-      scaleShift_buf[channels_ + i] = (bias.dptr_)[i];
-    }
-
-    void* BatchNorm_res[dnnResourceNumber];
-    BatchNorm_res[dnnResourceSrc] = bottom_data;
-    BatchNorm_res[dnnResourceScaleShift] = scaleShift_space.dptr;
-
-    BatchNorm_res[dnnResourceDst] = fwd_top_data->get_output_ptr(out.dptr_,
-      fwd_top_data, out_data[batchnorm::kOut]);
-    if (ctx.is_train && !param_.use_global_stats) {
-      Tensor<xpu, 1, DType> mean = out_data[batchnorm::kMean].get<xpu, 1, DType>(s);
-      Tensor<xpu, 1, DType> var = out_data[batchnorm::kVar].get<xpu, 1, DType>(s);
-      CHECK(req[batchnorm::kMean] == kNullOp || req[batchnorm::kMean] == kWriteTo);
-      CHECK(req[batchnorm::kVar] == kNullOp || req[batchnorm::kVar] == kWriteTo);
-      BatchNorm_res[dnnResourceMean] = mean.dptr_;
-      BatchNorm_res[dnnResourceVariance] = var.dptr_;
-      e = dnnExecute<DType>(batchNormFwdTraining, BatchNorm_res);
-      CHECK_EQ(e, E_SUCCESS);
-    } else {
-      BatchNorm_res[dnnResourceMean] = moving_mean.dptr_;
-      BatchNorm_res[dnnResourceVariance] = moving_var.dptr_;
-      e = dnnExecute<DType>(batchNormFwdInference, BatchNorm_res);
-      CHECK_EQ(e, E_SUCCESS);
-    }
-
-#if MKL_EXPERIMENTAL == 0
-    if (fwd_top_data->conversion_needed()) {
-      fwd_top_data->convert_from_prv(out.dptr_);
-    }
-#endif
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1);
-    CHECK_EQ(in_data.size(), 3);
-    CHECK_EQ(out_data.size(), 3);
-    CHECK_EQ(in_grad.size(), 3);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> data, grad, grad_in;
-
-    if (in_data[batchnorm::kData].ndim() == 2) {
-      Shape<4> dshape = Shape4(out_grad[batchnorm::kOut].shape_[0],
-                               out_grad[batchnorm::kOut].shape_[1], 1, 1);
-      data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_data[batchnorm::kData], dshape, s);
-      grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_grad[batchnorm::kOut], dshape, s);
-      grad_in = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_grad[batchnorm::kData], dshape, s);
-    } else {
-      data = mkl_experimental_direct_get<xpu, 4, DType>(in_data[batchnorm::kData], s);
-      grad = mkl_experimental_direct_get<xpu, 4, DType>(out_grad[batchnorm::kOut], s);
-      grad_in = mkl_experimental_direct_get<xpu, 4, DType>(in_grad[batchnorm::kData], s);
-    }
-
-    Tensor<xpu, 1, DType> slope = in_data[batchnorm::kGamma].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> gslope = in_grad[batchnorm::kGamma].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> gbias = in_grad[batchnorm::kBeta].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> mean = out_data[batchnorm::kMean].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> var = out_data[batchnorm::kVar].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> moving_mean = aux_states[batchnorm::kMovingMean].get<xpu, 1, DType>(s);
-    Tensor<xpu, 1, DType> moving_var = aux_states[batchnorm::kMovingVar].get<xpu, 1, DType>(s);
-
-    if (param_.fix_gamma)  slope = 1.f;
-
-    void* bottom_data = NULL;
-#if MKL_EXPERIMENTAL == 1
-    bottom_data = reinterpret_cast<void *>(mkl_prv_data<DType>(in_data[batchnorm::kData]));
-#endif
-    if (NULL == bottom_data)
-      bottom_data = reinterpret_cast<void *>(data.dptr_);
-
-    dnnError_t e;
-    void* BatchNorm_res[dnnResourceNumber];
-    BatchNorm_res[dnnResourceSrc] = bottom_data;
-    BatchNorm_res[dnnResourceScaleShift] = scaleShift_space.dptr;
-    if (ctx.is_train && !param_.use_global_stats) {
-      int size = mean.size(0);  // Tensor<xpu, 1, DType>
-      float * moving_mean_ptr = reinterpret_cast<float*>(moving_mean.dptr_);
-      float * mean_ptr = reinterpret_cast<float*>(mean.dptr_);
-      float * moving_var_ptr = reinterpret_cast<float*>(moving_var.dptr_);
-      float * var_ptr = reinterpret_cast<float*>(var.dptr_);
-      float minus_mom = (1 - param_.momentum);
-      for (int i = 0; i < size; i++) {
-        moving_mean_ptr[i] = moving_mean_ptr[i] * param_.momentum
-          + mean_ptr[i] * minus_mom;
-      }
-      for (int i = 0; i < size; i++) {
-        moving_var_ptr[i] = moving_var_ptr[i] * param_.momentum
-          + var_ptr[i] * minus_mom;
-      }
-      BatchNorm_res[dnnResourceMean] = mean.dptr_;
-      BatchNorm_res[dnnResourceVariance] = var.dptr_;
-    } else {
-      BatchNorm_res[dnnResourceMean] = moving_mean.dptr_;
-      BatchNorm_res[dnnResourceVariance] = moving_var.dptr_;
-    }
-
-
-    BatchNorm_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr(grad_in.dptr_,
-      bwd_bottom_diff, in_grad[batchnorm::kData]);
-    BatchNorm_res[dnnResourceDiffDst] = bwd_top_diff->get_converted_prv(grad.dptr_,
-             true, out_grad[batchnorm::kOut]);
-    BatchNorm_res[dnnResourceDiffScaleShift] = scaleShiftDiff_space.dptr;
-    e = dnnExecute<DType>(batchNormBwdScaleShift, BatchNorm_res);
-    CHECK_EQ(e, E_SUCCESS);
-#if MKL_EXPERIMENTAL == 0
-    if (bwd_bottom_diff->conversion_needed()) {
-      bwd_bottom_diff->convert_from_prv(grad_in.dptr_);
-    }
-#endif
-    DType * scaleShiftDiff_buf = reinterpret_cast<DType*>(scaleShiftDiff_space.dptr);
-    if (!param_.fix_gamma) {
-      // Store ScaleShift blobs
-      DType* diff_scale = gslope.dptr_;
-      for (int i = 0; i < channels_; i++) {
-        diff_scale[i] = scaleShiftDiff_buf[i];
-      }
-    } else {
-      int gslope_size = gslope.size(0);
-      float * gslope_ptr = reinterpret_cast<float*>(gslope.dptr_);
-      for (int i = 0; i < gslope_size; i++) {
-        *gslope_ptr++ = 0.0f;
-      }
-    }
-    DType* diff_shift = gbias.dptr_;
-    for (int i = 0; i < channels_; i++) {
-      diff_shift[i] = scaleShiftDiff_buf[channels_ + i];
-    }
-  }
-
- private:
-  BatchNormParam param_;
-  DType eps_;
-  bool use_weight_bias_;
-
-  int num_;
-  int channels_;
-  int height_;
-  int width_;
-  bool init_mkldnn_ = false;
-  std::shared_ptr<MKLData<DType> > fwd_top_data;
-  std::shared_ptr<MKLData<DType> > fwd_bottom_data;
-  std::shared_ptr<MKLData<DType> > bwd_top_diff;
-  std::shared_ptr<MKLData<DType> > bwd_bottom_diff;
-  dnnPrimitive_t batchNormFwdInference = NULL;
-  dnnPrimitive_t batchNormFwdTraining = NULL;
-  dnnPrimitive_t batchNormBwdScaleShift = NULL;
-  Storage::Handle scaleShift_space;
-  Storage::Handle scaleShiftDiff_space;
-  dnnLayout_t layout_usr_ = NULL;
-};  // class BatchNormOp
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_
diff --git a/src/operator/mkl/mkl_concat-inl.h b/src/operator/mkl/mkl_concat-inl.h
deleted file mode 100644
index 1ed1e81d1303..000000000000
--- a/src/operator/mkl/mkl_concat-inl.h
+++ /dev/null
@@ -1,314 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_concat-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <cstring>
-#include <map>
-#include <string>
-#include <vector>
-#include <utility>
-#include "../operator_common.h"
-#include "../channel_op_common.h"
-#include "./mkl_util-inl.h"
-namespace mxnet {
-namespace op {
-
-
-template<typename xpu, typename DType>
-class MKLConcatOp : public Operator {
- public:
-  static std::string getName() {
-    return "MKLConcatOp";
-  }
-  explicit MKLConcatOp(ConcatParam param)
-    : size_(param.num_args), dimension_(param.dim), init_mkldnn_(false) {
-    concatFwd_ = static_cast<dnnPrimitive_t>(NULL);
-    concatBwd_ = static_cast<dnnPrimitive_t>(NULL);
-    fwd_top_data_ = MKLData<DType>::create();
-    bwd_top_diff_ = MKLData<DType>::create();
-
-    num_concats_ = param.num_args;
-  }
-  virtual ~MKLConcatOp() {
-    dnnDelete<DType>(concatFwd_);
-    dnnDelete<DType>(concatBwd_);
-  }
-
- private:
-  void LayerSetUp(const std::vector<mshadow::Tensor<xpu, 4, DType> > &data,
-                  const mshadow::Tensor<xpu, 4, DType> &out,
-                  size_t data_shape_size, size_t *split_channels_) {
-    size_t dim_src = data_shape_size;
-    size_t dim_dst = dim_src;
-    num_concats_ = size_;
-    channels_ = 0;
-
-    for (size_t i = 1; i < num_concats_; ++i) {
-      for (size_t j = 1; j < data_shape_size; ++j) {
-        if (j == dimension_) continue;
-        CHECK_EQ(data[0].shape_[j], data[i].shape_[j]);
-      }
-    }
-
-    for (size_t i = 0; i < num_concats_; ++i) {
-      CHECK_EQ((int)dim_src, data[i].shape_.kDimension);
-
-      fwd_bottom_data_.push_back(MKLData<DType>::create());
-      bwd_bottom_diff_.push_back(MKLData<DType>::create());
-      fwd_bottom_data_[i]->name = "fwd_bottom_data_[i]";
-      bwd_bottom_diff_[i]->name = "bwd_bottom_data[i]";
-
-      size_t *sizes_src = new size_t[dim_src];
-      size_t *strides_src = new size_t[dim_src];
-      for (size_t d = 0; d < dim_src; ++d) {
-        sizes_src[d] = data[i].shape_[dim_src - d - 1];
-        strides_src[d] = (d == 0) ? 1 : strides_src[d - 1] * sizes_src[d - 1];
-      }
-
-      split_channels_[i] = data[i].shape_[1];
-      channels_ += split_channels_[i];
-      fwd_bottom_data_[i]->create_user_layout(dim_src, sizes_src, strides_src);
-      bwd_bottom_diff_[i]->create_user_layout(dim_src, sizes_src, strides_src);
-      delete[] sizes_src;
-      delete[] strides_src;
-    }
-    size_t *sizes_dst = new size_t[dim_dst];
-    size_t *strides_dst = new size_t[dim_dst];
-    for (size_t d = 0; d < dim_dst; ++d) {
-      if (d == 2)
-        sizes_dst[d] = channels_;
-      else
-        sizes_dst[d] = data[0].shape_[dim_dst - 1 - d];
-      strides_dst[d] = (d == 0) ? 1 : strides_dst[d - 1] * sizes_dst[d - 1];
-    }
-    bwd_top_diff_->create_user_layout(dim_dst, sizes_dst, strides_dst);
-    fwd_top_data_->create_user_layout(dim_dst, sizes_dst, strides_dst);
-    delete[] sizes_dst;
-    delete[] strides_dst;
-    concatFwd_ = NULL;
-    concatBwd_ = NULL;
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(static_cast<int>(in_data.size()), size_);
-    CHECK_EQ(out_data.size(), 1);
-    CHECK_LT(dimension_, (size_t)in_data[concat_enum::kData0].ndim());
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    std::vector<Tensor<xpu, 4, DType> > data(size_);
-    Tensor<xpu, 4, DType> out;
-    if (in_data[0].ndim() == 2) {
-      for (int i = 0; i < size_; ++i) {
-        Shape<4> dshape = Shape4(in_data[i].shape_[0],
-                                 in_data[i].shape_[1], 1, 1);
-        data[i] = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-          in_data[i], dshape, s);
-      }
-      Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0],
-                               out_data[concat_enum::kOut].shape_[1], 1, 1);
-      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[concat_enum::kOut], dshape, s);
-    } else if (in_data[0].ndim() == 3) {
-      for (int i = 0; i < size_; ++i) {
-        Shape<4> dshape = Shape4(in_data[i].shape_[0],
-          in_data[i].shape_[1], in_data[i].shape_[2], 1);
-        data[i] = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-          in_data[i], dshape, s);
-      }
-      Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0],
-        out_data[concat_enum::kOut].shape_[1],
-        out_data[concat_enum::kOut].shape_[2], 1);
-      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[concat_enum::kOut], dshape, s);
-    } else {
-      for (int i = 0; i < size_; ++i) {
-        data[i] = mkl_experimental_direct_get<xpu, 4, DType>(in_data[i], s);
-      }
-      out = mkl_experimental_direct_get<xpu, 4, DType>(out_data[concat_enum::kOut], s);
-    }
-    size_t *split_channels_ = new size_t[num_concats_];
-    if (!init_mkldnn_) {
-      init_mkldnn_ = true;
-      LayerSetUp(data, out, 4, split_channels_);
-    }
-
-    dnnError_t e;
-    std::vector<void*> bottom_data;
-    bool isFirstPass = (concatFwd_ == NULL);
-    dnnLayout_t *layouts = NULL;
-    if (isFirstPass) {
-      layouts = new dnnLayout_t[num_concats_];
-    }
-
-    for (size_t i = 0; i < num_concats_; i++) {
-      void * bottom_i = NULL;
-#if MKL_EXPERIMENTAL == 1
-      bottom_i = mkl_prv_data<DType>(in_data[i]);
-      if (bottom_i != NULL) {
-        if (isFirstPass) {
-          std::shared_ptr<MKLData<DType> > mem_descr =
-            mkl_get_mem_desc<DType>(in_data[i].Mkl_mem_);
-          fwd_bottom_data_[i] = mem_descr;
-          layouts[i] = mem_descr->layout_int;
-        }
-      }
-#endif
-      if (bottom_i == NULL) {
-        bottom_i = data[i].dptr_;
-        if (isFirstPass) {
-          layouts[i] = fwd_bottom_data_[i]->layout_usr;
-        }
-      }
-
-      bottom_data.push_back(reinterpret_cast<void *>(bottom_i));
-    }
-
-    if (isFirstPass) {
-      e = dnnConcatCreate<DType>(&concatFwd_, NULL, num_concats_, layouts);
-      CHECK_EQ(e, E_SUCCESS);
-
-      fwd_top_data_->create_internal_layout(concatFwd_, dnnResourceDst);
-      bwd_top_diff_->create_internal_layout(concatFwd_, dnnResourceDst);
-
-      e = dnnSplitCreate<DType>(&concatBwd_, NULL, num_concats_,
-            bwd_top_diff_->layout_int, split_channels_);
-      CHECK_EQ(e, E_SUCCESS);
-
-      for (size_t n = 0; n < num_concats_; ++n) {
-        fwd_bottom_data_[n]->create_internal_layout(concatFwd_,
-          (dnnResourceType_t)(dnnResourceMultipleSrc + n));
-        bwd_bottom_diff_[n]->create_internal_layout(concatBwd_,
-          (dnnResourceType_t)(dnnResourceMultipleDst + n));
-      }
-    }
-    delete[] layouts;
-
-    void *concat_res[dnnResourceNumber];
-    for (size_t i = 0; i < num_concats_; ++i) {
-      concat_res[dnnResourceMultipleSrc + i]
-        = reinterpret_cast<void*>(bottom_data[i]);
-    }
-
-    concat_res[dnnResourceDst] = fwd_top_data_->get_output_ptr(out.dptr_,
-      fwd_top_data_, out_data[concat_enum::kOut]);
-    e = dnnExecute<DType>(concatFwd_, concat_res);
-    CHECK_EQ(e, E_SUCCESS);
-    delete[] split_channels_;
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1);
-    CHECK_EQ(in_grad.size(), static_cast<size_t>(size_));
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    std::vector<Tensor<xpu, 4, DType> > grad_in(size_);
-    Tensor<xpu, 4, DType> grad;
-    if (in_grad[0].ndim() == 2) {
-      Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0],
-        out_grad[concat_enum::kOut].shape_[1], 1, 1);
-      grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_grad[concat_enum::kOut], dshape, s);
-      for (int i = 0; i < size_; ++i) {
-        dshape = Shape4(in_grad[i].shape_[0],
-          in_grad[i].shape_[1], 1, 1);
-        grad_in[i] = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-          in_grad[i], dshape, s);
-      }
-    } else if (in_grad[0].ndim() == 3) {
-      Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0],
-        out_grad[concat_enum::kOut].shape_[1],
-        out_grad[concat_enum::kOut].shape_[2], 1);
-      grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_grad[concat_enum::kOut], dshape, s);
-      for (int i = 0; i < size_; ++i) {
-        dshape = Shape4(in_grad[i].shape_[0],
-          in_grad[i].shape_[1], in_grad[i].shape_[2], 1);
-        grad_in[i] = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-          in_grad[i], dshape, s);
-      }
-    } else {
-      grad = mkl_experimental_direct_get<xpu, 4, DType>(out_grad[concat_enum::kOut], s);
-      for (int i = 0; i < size_; ++i) {
-        grad_in[i] = mkl_experimental_direct_get<xpu, 4, DType>(in_grad[i], s);
-      }
-    }
-
-    int need_bwd = 0;
-    for (size_t n = 0; n < num_concats_; n++) {
-      need_bwd += req[n];
-    }
-    if (!need_bwd) {
-      return;
-    }
-
-    dnnError_t e;
-    void *concat_res[dnnResourceNumber];
-    concat_res[dnnResourceSrc] = bwd_top_diff_->get_converted_prv(grad.dptr_, true,
-      out_grad[concat_enum::kOut]);
-    for (size_t i = 0; i < num_concats_; ++i) {
-      concat_res[dnnResourceMultipleDst + i] = bwd_bottom_diff_[i]->get_output_ptr(
-        grad_in[i].dptr_, bwd_bottom_diff_[i], in_grad[i]);
-    }
-    e = dnnExecute<DType>(concatBwd_, concat_res);
-    CHECK_EQ(e, E_SUCCESS);
-  }
-
- private:
-  int size_;
-  size_t dimension_;
-
-  bool init_mkldnn_;
-
-  dnnPrimitive_t concatFwd_;
-  dnnPrimitive_t concatBwd_;
-  std::shared_ptr<MKLData<DType> > fwd_top_data_;
-  std::vector< std::shared_ptr<MKLData<DType> > > fwd_bottom_data_;
-  std::shared_ptr<MKLData<DType> > bwd_top_diff_;
-  std::vector< std::shared_ptr<MKLData<DType> > > bwd_bottom_diff_;
-
-
-  size_t width_;
-  size_t height_;
-  size_t channels_;
-  size_t num_;
-  size_t num_concats_;
-};  // class MKLConcatOp
-}  // namespace op
-}  // namespace mxnet
-
-#endif  // MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_
diff --git a/src/operator/mkl/mkl_convolution-inl.h b/src/operator/mkl/mkl_convolution-inl.h
deleted file mode 100644
index 813d061f172b..000000000000
--- a/src/operator/mkl/mkl_convolution-inl.h
+++ /dev/null
@@ -1,490 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_convolution-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_
-#include <mxnet/storage.h>
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "../operator_common.h"
-#include "../nn/convolution-inl.h"
-#include "./mkl_util-inl.h"
-
-namespace mxnet {
-namespace op {
-
-template<typename xpu, typename DType>
-class MKLConvolutionOp : public Operator {
- public:
-  static std::string getName() {
-    return "MKLConvolutionOp";
-  }
-  void SetupBuffer() {
-    convolutionBwdBias = static_cast<dnnPrimitive_t>(NULL);
-    convolutionBwdFilter = static_cast<dnnPrimitive_t>(NULL);
-    convolutionBwdData = static_cast<dnnPrimitive_t>(NULL);
-    convolutionFwd = static_cast<dnnPrimitive_t>(NULL);
-    fwd_bottom_data = MKLData<DType>::create();
-    fwd_top_data = MKLData<DType>::create();
-    fwd_filter_data = MKLData<DType>::create();
-    fwd_bias_data = MKLData<DType>::create();
-    bwdd_top_diff = MKLData<DType>::create();
-    bwdd_bottom_diff = MKLData<DType>::create();
-    bwdd_filter_data = MKLData<DType>::create();
-    bwdf_top_diff = MKLData<DType>::create();
-    bwdf_filter_diff = MKLData<DType>::create();
-    bwdf_bottom_data = MKLData<DType>::create();
-    bwdb_top_diff = MKLData<DType>::create();
-    bwdb_bias_diff = MKLData<DType>::create();
-    // Names are for debugging purposes only.
-    fwd_bottom_data->name = "fwd_bottom_data   @ " + this->getName();
-    fwd_top_data->name = "fwd_top_data      @ " + this->getName();
-    fwd_filter_data->name = "fwd_filter_data   @ " + this->getName();
-    fwd_bias_data->name = "fwd_bias_data     @ " + this->getName();
-    bwdd_top_diff->name = "bwdd_top_diff     @ " + this->getName();
-    bwdd_bottom_diff->name = "bwdd_bottom_diff  @ " + this->getName();
-    bwdd_filter_data->name = "bwdd_filter_data  @ " + this->getName();
-    bwdf_top_diff->name = "bwdf_top_diff     @ " + this->getName();
-    bwdf_bottom_data->name = "bwdf_bottom_data  @ " + this->getName();
-    bwdf_filter_diff->name = "bwdf_filter_diff  @ " + this->getName();
-    bwdb_top_diff->name = "bwdb_top_diff     @ " + this->getName();
-    bwdb_bias_diff->name = "bwdb_bias_diff    @ " + this->getName();
-  }
-
-  explicit MKLConvolutionOp(ConvolutionParam p):
-                            convolutionFwd(NULL),
-                            convolutionBwdData(static_cast<dnnPrimitive_t>(NULL)),
-                            convolutionBwdFilter(static_cast<dnnPrimitive_t>(NULL)),
-                            convolutionBwdBias(static_cast<dnnPrimitive_t>(NULL)) {
-    this->param_ = p;
-    init_mkldnn_ = false;
-    // convert MBytes first to Bytes and then to elements.
-    param_.workspace = (param_.workspace << 20) / sizeof(DType);
-    SetupBuffer();
-  }
-  void ReleaseBuffer() {
-    if (convolutionFwd != NULL) {
-     dnnDelete<DType>(convolutionFwd);
-     convolutionFwd = NULL;
-    }
-    if (convolutionBwdData != NULL) {
-     dnnDelete<DType>(convolutionBwdData);
-     convolutionBwdData = NULL;
-    }
-    if (convolutionBwdFilter != NULL) {
-     dnnDelete<DType>(convolutionBwdFilter);
-     convolutionBwdFilter = NULL;
-    }
-    if (!param_.no_bias && convolutionBwdBias != NULL) {
-     dnnDelete<DType>(convolutionBwdBias);
-     convolutionBwdBias = NULL;
-    }
-  }
-  virtual ~MKLConvolutionOp() {
-    ReleaseBuffer();
-  }
-
- private:
-  void LayerSetUp(const mshadow::Tensor<xpu, 4, DType> &data,
-                  const mshadow::Tensor<xpu, 4, DType> &out) {
-    this->width_ = data.shape_[3];
-    this->height_ = data.shape_[2];
-    this->channels_ = data.shape_[1];
-    this->num_ = data.shape_[0];
-    this->group_ = param_.num_group;
-    this->width_out_ = out.shape_[3];
-    this->height_out_ = out.shape_[2];
-    int channel_out_ = out.shape_[1];
-    this->num_output_ = channel_out_;
-    kernel_w_ = param_.kernel[1];
-    kernel_h_ = param_.kernel[0];
-    stride_w_ = param_.stride[1];
-    stride_h_ = param_.stride[0];
-    pad_w_ = param_.pad[1];
-    pad_h_ = param_.pad[0];
-    int status;
-    size_t n, g;
-    size_t iw, ih, ic;
-    size_t ow, oh, oc;
-    size_t kw, kh;
-    size_t dimension = 4;
-    g = std::max(this->group_, 1);
-    n = this->num_;
-    iw = this->width_;
-    ih = this->height_;
-    ic = this->channels_;
-    ow = this->width_out_;
-    oh = this->height_out_;
-    oc = this->num_output_;
-    kw = this->kernel_w_;
-    kh = this->kernel_h_;
-    oc = this->num_output_;
-    size_t bdata_sizes[4] = { iw, ih, ic, n };
-    size_t bdata_strides[4] = { 1, iw, iw*ih, iw*ih*ic };
-    /* starting with MKL 2017 Gold in case of groups filter layout
-    * becomes 5D, i.e. groups become a separate dimension */
-    size_t g_mkl2017 = g;
-    size_t f_dimension = dimension + (g != 1);
-    if (getMKLBuildDate() < 20160701) {
-     g_mkl2017 = 1;
-     f_dimension = dimension;
-    }
-    size_t fdata_sizes[5] = { kw, kh, ic / g, oc / g_mkl2017, g_mkl2017 };
-    size_t fdata_strides[5] = { 1, kw, kw*kh, kw*kh*ic / g, kw*kh*ic / g*oc / g };
-    size_t bias_sizes[1] = { oc };
-    size_t bias_strides[1] = { 1 };
-    size_t tdata_sizes[4] = { ow, oh, oc, n };
-    size_t tdata_strides[4] = { 1, ow, ow*oh, ow*oh*oc };
-    size_t convolutionStrides[2] = { this->stride_w_, this->stride_h_ };
-    int    inputOffset[2] = { -this->pad_w_, -this->pad_h_ };
-    // Names are for debugging purposes only.
-    /*** convolution section ***/
-    if (!param_.no_bias) {
-      status = dnnGroupsConvolutionCreateForwardBias<DType>(&convolutionFwd,
-                                                            NULL,
-                                                            dnnAlgorithmConvolutionDirect,
-                                                            g,
-                                                            dimension,
-                                                            bdata_sizes,
-                                                            tdata_sizes,
-                                                            fdata_sizes,
-                                                            convolutionStrides,
-                                                            inputOffset,
-                                                            dnnBorderZeros);
-    } else {
-      status = dnnGroupsConvolutionCreateForward<DType>(&convolutionFwd,
-                                                        NULL,
-                                                        dnnAlgorithmConvolutionDirect,
-                                                        g,
-                                                        dimension,
-                                                        bdata_sizes,
-                                                        tdata_sizes,
-                                                        fdata_sizes,
-                                                        convolutionStrides,
-                                                        inputOffset,
-                                                        dnnBorderZeros);
-    }
-    CHECK_EQ(status, 0)
-     << "Failed dnnCreateConvolution<DType>(dnnForward) with status "
-     << status << "\n";
-    fwd_bottom_data->create_layouts(convolutionFwd, dnnResourceSrc, dimension,
-                                    bdata_sizes, bdata_strides);
-    fwd_top_data->create_layouts(convolutionFwd, dnnResourceDst, dimension,
-                                 tdata_sizes, tdata_strides);
-    fwd_filter_data->create_layouts(convolutionFwd, dnnResourceFilter,
-                                    f_dimension, fdata_sizes, fdata_strides);
-    if (!param_.no_bias)
-      fwd_bias_data->create_layouts(convolutionFwd, dnnResourceBias, 1,
-                                    bias_sizes, bias_strides);
-    /*
-    * Backward by data layer setup
-    */
-    status = dnnGroupsConvolutionCreateBackwardData<DType>(&convolutionBwdData,
-                                                           NULL,
-                                                           dnnAlgorithmConvolutionDirect,
-                                                           g,
-                                                           dimension,
-                                                           bdata_sizes,
-                                                           tdata_sizes,
-                                                           fdata_sizes,
-                                                           convolutionStrides,
-                                                           inputOffset,
-                                                           dnnBorderZeros);
-    CHECK_EQ(status, 0)
-     << "Failed dnnConvolutionCreateBackwardData with status "
-     << status << "\n";
-    bwdd_bottom_diff->create_layouts(convolutionBwdData, dnnResourceDiffSrc,
-                                     dimension, bdata_sizes, bdata_strides);
-    bwdd_top_diff->create_layouts(convolutionBwdData, dnnResourceDiffDst,
-                                  dimension, tdata_sizes, tdata_strides);
-    bwdd_filter_data->create_layouts(convolutionBwdData, dnnResourceFilter,
-                                     f_dimension, fdata_sizes, fdata_strides);
-    /*
-    * Backward by filter layer setup
-    */
-    status = dnnGroupsConvolutionCreateBackwardFilter<DType>(&convolutionBwdFilter,
-                                                             NULL,
-                                                             dnnAlgorithmConvolutionDirect,
-                                                             g,
-                                                             dimension,
-                                                             bdata_sizes,
-                                                             tdata_sizes,
-                                                             fdata_sizes,
-                                                             convolutionStrides,
-                                                             inputOffset,
-                                                             dnnBorderZeros);
-    CHECK_EQ(status, 0)
-     << "Failed dnnConvolutionCreateBackwardFilter with status "
-     << status << "\n";
-    bwdf_bottom_data->create_layouts(convolutionBwdFilter, dnnResourceSrc,
-                                     dimension, bdata_sizes, bdata_strides);
-    bwdf_top_diff->create_layouts(convolutionBwdFilter, dnnResourceDiffDst,
-                                  dimension, tdata_sizes, tdata_strides);
-    bwdf_filter_diff->create_layouts(convolutionBwdFilter, dnnResourceDiffFilter,
-                                     f_dimension, fdata_sizes, fdata_strides);
-    /*
-    * Backward by bias layer setup
-    */
-    if (!param_.no_bias) {
-      status = dnnGroupsConvolutionCreateBackwardBias<DType>(&convolutionBwdBias,
-                                                             NULL,
-                                                             dnnAlgorithmConvolutionDirect,
-                                                             g,
-                                                             dimension,
-                                                             tdata_sizes);
-     CHECK_EQ(status, 0)
-      << "Failed dnnConvolutionCreateBackwardBias with status "
-      << status << "\n";
-     bwdb_top_diff->create_layouts(convolutionBwdBias, dnnResourceDiffDst,
-                                   dimension, tdata_sizes, tdata_strides);
-     bwdb_bias_diff->create_layouts(convolutionBwdBias, dnnResourceDiffBias, 1,
-                                    bias_sizes, bias_strides);
-    }
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    DType *data_ptr = NULL;
-    DType *wmat_ptr = NULL;
-    DType *out_ptr = NULL;
-    Tensor<xpu, 4, DType> data =
-      mkl_experimental_direct_get<xpu, 4, DType>(in_data[conv::kData], s);
-    Tensor<xpu, 4, DType> out =
-      mkl_experimental_direct_get<xpu, 4, DType>(out_data[conv::kOut], s);
-    Tensor<xpu, 4, DType> wmat =
-      mkl_experimental_direct_get<xpu, 4, DType>(in_data[conv::kWeight], s);
-    if (!init_mkldnn_) {
-      LayerSetUp(data, out);
-      init_mkldnn_ = true;
-    }
-    CHECK_EQ(data.CheckContiguous(), true);
-    CHECK_EQ(wmat.CheckContiguous(), true);
-    CHECK_EQ(out.CheckContiguous(), true);
-    data_ptr = data.dptr_;
-    wmat_ptr = wmat.dptr_;
-    out_ptr = out.dptr_;
-    int status;
-    void *res_convolutionFwd[dnnResourceNumber];
-    res_convolutionFwd[dnnResourceSrc] =
-      fwd_bottom_data->get_converted_prv(data_ptr, false, in_data[conv::kData]);
-    res_convolutionFwd[dnnResourceFilter] =
-      fwd_filter_data->get_converted_prv(wmat_ptr, true, in_data[conv::kWeight]);
-    if (!param_.no_bias) {
-      Tensor<xpu, 1, DType> bias =
-        mkl_experimental_direct_get<xpu, 1, DType>(in_data[conv::kBias], s);
-      res_convolutionFwd[dnnResourceBias] =
-        fwd_bias_data->get_converted_prv(bias.dptr_, true, in_data[conv::kBias]);
-    }
-
-    res_convolutionFwd[dnnResourceDst] = fwd_top_data->get_output_ptr(out_ptr,
-      fwd_top_data, out_data[conv::kOut]);
-    status = dnnExecute<DType>(convolutionFwd, res_convolutionFwd);
-    CHECK_EQ(status, 0) << "Forward convolution failed with status " << status;
-#if MKL_EXPERIMENTAL == 0
-    if (fwd_top_data->conversion_needed()) {
-        fwd_top_data->convert_from_prv(out_ptr);
-    }
-#endif
-  }
-  void AddToModeAllocAndStoreBuffer(void *src, int blob_size, Storage::Handle *pws) {
-    int blob_byte_size = blob_size * sizeof(DType);
-    *pws = Storage::Get()->Alloc(blob_byte_size, Context::CPU());
-    memcpy(pws->dptr, src, blob_byte_size);
-  }
-  void AddToModeAddAndReleaseBuffer(Storage::Handle *pws, void *dst_, int blob_size) {
-    DType *dst = reinterpret_cast<DType*>(dst_);
-    DType *src = reinterpret_cast<DType*>(pws->dptr);
-#pragma omp parallel for
-    for (int i = 0; i < blob_size; i++) {
-      dst[i] += src[i];
-    }
-    if (pws->dptr)
-      Storage::Get()->Free(*pws);
-    pws->dptr = NULL;
-  }
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    if (param_.kernel.ndim() > 2) {
-      LOG(FATAL) << "Volume convolution is not implmented in mshadow";
-    }
-    CHECK_EQ(out_grad.size(), 1);
-    size_t expected = param_.no_bias == 0 ? 3 : 2;
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
-    CHECK_EQ(req.size(), expected);
-    CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> data =
-      mkl_experimental_direct_get<xpu, 4, DType>(in_data[conv::kData], s);
-    Shape<3> wmat_shape =
-      Shape3(param_.num_group,
-             param_.num_filter / param_.num_group,
-             data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]);
-    Tensor<xpu, 3, DType> wmat =
-      mkl_experimental_direct_get_with_shape<xpu, 3, DType>(
-      in_data[conv::kWeight], wmat_shape, s);
-    Tensor<xpu, 4, DType> grad =
-      mkl_experimental_direct_get<xpu, 4, DType>(out_grad[conv::kOut], s);
-    Tensor<xpu, 4, DType> gdata =
-      mkl_experimental_direct_get<xpu, 4, DType>(in_grad[conv::kData], s);
-    Tensor<xpu, 3, DType> gwmat =
-      mkl_experimental_direct_get_with_shape<xpu, 3, DType>(
-      in_grad[conv::kWeight], wmat_shape, s);
-
-    if (!init_mkldnn_) {
-      init_mkldnn_ = true;
-      LayerSetUp(data, grad);
-    }
-    int status;
-    if (req[0]) {
-      void *res_convolutionBwdData[dnnResourceNumber];
-      res_convolutionBwdData[dnnResourceDiffDst] =
-        bwdd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]);
-
-      res_convolutionBwdData[dnnResourceFilter] =
-        bwdd_filter_data->get_converted_prv(wmat.dptr_, false, in_data[conv::kWeight]);
-     Storage::Handle addtoWorkspace;
-     if (req[0] == kAddTo) {
-       // wait mkl support addto mode
-       AddToModeAllocAndStoreBuffer(gdata.dptr_, in_grad[conv::kData].Size(), &addtoWorkspace);
-     }
-
-     res_convolutionBwdData[dnnResourceDiffSrc] = bwdd_bottom_diff->get_output_ptr(gdata.dptr_,
-       bwdd_bottom_diff, in_grad[conv::kData]);
-     status = dnnExecute<DType>(convolutionBwdData, res_convolutionBwdData);
-     CHECK_EQ(status, 0) << "Backward Data conv failed with status " << status;
-#if MKL_EXPERIMENTAL == 0
-     if (bwdd_bottom_diff->conversion_needed()) {
-       bwdd_bottom_diff->convert_from_prv(gdata.dptr_);
-     }
-#endif
-     if (req[0] == kAddTo) {
-       if (bwdd_bottom_diff->conversion_needed()) {
-         bwdd_bottom_diff->convert_from_prv(gdata.dptr_);
-       }
-      AddToModeAddAndReleaseBuffer(&addtoWorkspace, gdata.dptr_, in_grad[conv::kData].Size());
-     }
-    }
-    if (req[1]) {
-      void *res_convolutionBwdFilter[dnnResourceNumber];
-
-      res_convolutionBwdFilter[dnnResourceDiffDst] =
-        bwdf_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]);
-
-      res_convolutionBwdFilter[dnnResourceSrc] =
-        bwdf_bottom_data->get_converted_prv(data.dptr_, false,
-          in_data[conv::kData]);
-     Storage::Handle addtoWorkspace;
-     if (req[1] == kAddTo) {
-       // wait mkl support addto mode
-       AddToModeAllocAndStoreBuffer(gwmat.dptr_, in_grad[conv::kWeight].Size(), &addtoWorkspace);
-     }
-
-     res_convolutionBwdFilter[dnnResourceDiffFilter] = bwdf_filter_diff->get_output_ptr(
-       gwmat.dptr_, bwdf_filter_diff, in_grad[conv::kWeight]);
-     status = dnnExecute<DType>(convolutionBwdFilter, res_convolutionBwdFilter);
-     CHECK_EQ(status, 0) << "Backward Filter conv failed with status " << status;
-#if MKL_EXPERIMENTAL == 0
-     if (bwdf_filter_diff->conversion_needed()) {
-       bwdf_filter_diff->convert_from_prv(gwmat.dptr_);
-     }
-#endif
-     if (req[1] == kAddTo) {
-       if (bwdf_filter_diff->conversion_needed()) {
-         bwdf_filter_diff->convert_from_prv(gwmat.dptr_);
-       }
-       AddToModeAddAndReleaseBuffer(&addtoWorkspace, gwmat.dptr_, in_grad[conv::kWeight].Size());
-     }
-    }
-    if (!param_.no_bias) {
-      Tensor<xpu, 1, DType> gbias =
-        mkl_experimental_direct_get<xpu, 1, DType>(in_grad[conv::kBias], s);
-      void *res_convolutionBwdBias[dnnResourceNumber];
-      res_convolutionBwdBias[dnnResourceDiffDst] =
-        bwdb_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]);
-
-      res_convolutionBwdBias[dnnResourceDiffBias] = bwdb_bias_diff->get_output_ptr(gbias.dptr_,
-        bwdb_bias_diff, in_grad[conv::kBias]);
-      status = dnnExecute<DType>(convolutionBwdBias, res_convolutionBwdBias);
-      CHECK_EQ(status, 0) << "Backward Bias failed with status " << status;
-#if MKL_EXPERIMENTAL == 0
-      if (bwdb_bias_diff->conversion_needed()) {
-        bwdb_bias_diff->convert_from_prv(gbias.dptr_);
-      }
-#endif
-    }
-  }
-
- private:
-  ConvolutionParam param_;
-  size_t width_,
-         height_,
-         width_out_,
-         height_out_,
-         kernel_w_,
-         kernel_h_,
-         stride_w_,
-         stride_h_;
-  int group_,
-      num_,
-      num_output_;
-  size_t channels_;
-  int pad_w_,
-      pad_h_;
-  bool init_mkldnn_;
-  dnnPrimitive_t convolutionFwd;
-  dnnPrimitive_t convolutionBwdData;
-  dnnPrimitive_t convolutionBwdFilter;
-  dnnPrimitive_t convolutionBwdBias;
-  /* Fwd step */
-  std::shared_ptr<MKLData<DType> > fwd_bottom_data, fwd_top_data, fwd_filter_data,
-                                   fwd_bias_data;
-  /* Bwd data step */
-  std::shared_ptr<MKLData<DType> > bwdd_top_diff, bwdd_bottom_diff;
-  std::shared_ptr<MKLData<DType> > bwdd_filter_data;
-  /* Bwd filter step */
-  std::shared_ptr<MKLData<DType> > bwdf_top_diff, bwdf_filter_diff;
-  std::shared_ptr<MKLData<DType> > bwdf_bottom_data;
-  std::shared_ptr<MKLData<DType> > bwdf_filter_diff_iter, bwdf2fwd_filter_diff,
-                                   bwdb_bias_diff_iter;
-  /* Bwd bias step */
-  std::shared_ptr<MKLData<DType> > bwdb_top_diff, bwdb_bias_diff;
-};  // class ConvolutionOp
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_
diff --git a/src/operator/mkl/mkl_cppwrapper.cc b/src/operator/mkl/mkl_cppwrapper.cc
deleted file mode 100644
index 507e5498c85b..000000000000
--- a/src/operator/mkl/mkl_cppwrapper.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_cppwrapper.cc
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-
-
-
-#include "mkl_cppwrapper.h"
-#include <stdio.h>
-#if MXNET_USE_MKL2017 == 1
-#include "mkl_service.h"
-
-int getMKLBuildDate() {
-    static int build = 0;
-    if (build == 0) {
-        MKLVersion v;
-        mkl_get_version(&v);
-        build = atoi(v.Build);
-        printf("MKL Build:%d\n", build);
-    }
-    return build;
-}
-
-bool enableMKLWarnGenerated() {
-  return false;
-}
-#endif  // MSHADOW_USE_MKL2017
diff --git a/src/operator/mkl/mkl_cppwrapper.h b/src/operator/mkl/mkl_cppwrapper.h
deleted file mode 100644
index 7d66f20ad308..000000000000
--- a/src/operator/mkl/mkl_cppwrapper.h
+++ /dev/null
@@ -1,1020 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_cppwrapper.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_
-#define MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_
-
-
-#include <stdarg.h>
-#include <stddef.h>
-#if MXNET_USE_MKL2017 == 1
-#include "mkl_dnn_types.h"
-#include "mkl_dnn.h"
-#include "mkl_version.h"
-
-
-extern int getMKLBuildDate();
-extern bool enableMKLWarnGenerated();
-
-
-template <typename Dtype> inline dnnError_t dnnLayoutCreate(
-    dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]);
-template <> inline dnnError_t dnnLayoutCreate<float>(
-    dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]) {
-    return dnnLayoutCreate_F32(pLayout, dimension, size, strides);
-}
-template <> inline dnnError_t dnnLayoutCreate<double>(
-    dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]) {
-    return dnnLayoutCreate_F64(pLayout, dimension, size, strides);
-}
-
-template <typename Dtype> inline dnnError_t dnnLayoutCreateFromPrimitive(
-    dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type);
-template <> inline dnnError_t dnnLayoutCreateFromPrimitive<float>(
-    dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type) {
-    return dnnLayoutCreateFromPrimitive_F32(pLayout, primitive, type);
-}
-template <> inline dnnError_t dnnLayoutCreateFromPrimitive<double>(
-    dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type) {
-    return dnnLayoutCreateFromPrimitive_F64(pLayout, primitive, type);
-}
-
-template <typename Dtype> inline size_t dnnLayoutGetMemorySize(
-    const dnnLayout_t layout);
-template <> inline size_t dnnLayoutGetMemorySize<float>(
-    const dnnLayout_t layout) {
-    return dnnLayoutGetMemorySize_F32(layout);
-}
-template <> inline size_t dnnLayoutGetMemorySize<double>(
-    const dnnLayout_t layout) {
-    return dnnLayoutGetMemorySize_F64(layout);
-}
-
-template <typename Dtype> inline int dnnLayoutCompare(
-    const dnnLayout_t l1, const dnnLayout_t l2);
-template <> inline int dnnLayoutCompare<float>(
-    const dnnLayout_t l1, const dnnLayout_t l2) {
-    return dnnLayoutCompare_F32(l1, l2);
-}
-template <> inline int dnnLayoutCompare<double>(
-    const dnnLayout_t l1, const dnnLayout_t l2) {
-    return dnnLayoutCompare_F64(l1, l2);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnAllocateBuffer(
-    void **pPtr, dnnLayout_t layout);
-template <> inline dnnError_t dnnAllocateBuffer<float>(
-    void **pPtr, dnnLayout_t layout) {
-    return dnnAllocateBuffer_F32(pPtr, layout);
-}
-template <> inline dnnError_t dnnAllocateBuffer<double>(
-    void **pPtr, dnnLayout_t layout) {
-    return dnnAllocateBuffer_F64(pPtr, layout);
-}
-
-template <typename Dtype> inline dnnError_t dnnReleaseBuffer(
-    void *ptr);
-template <> inline dnnError_t dnnReleaseBuffer<float>(
-    void *ptr) {
-    return dnnReleaseBuffer_F32(ptr);
-}
-template <> inline dnnError_t dnnReleaseBuffer<double>(
-    void *ptr) {
-    return dnnReleaseBuffer_F64(ptr);
-}
-
-template <typename Dtype> inline dnnError_t dnnLayoutDelete(
-    dnnLayout_t layout);
-template <> inline dnnError_t dnnLayoutDelete<float>(
-    dnnLayout_t layout) {
-    return dnnLayoutDelete_F32(layout);
-}
-template <> inline dnnError_t dnnLayoutDelete<double>(
-    dnnLayout_t layout) {
-    return dnnLayoutDelete_F64(layout);
-}
-
-template <typename Dtype> inline dnnError_t dnnPrimitiveAttributesCreate(
-    dnnPrimitiveAttributes_t *attributes);
-template <> inline dnnError_t dnnPrimitiveAttributesCreate<float>(
-    dnnPrimitiveAttributes_t *attributes) {
-    return dnnPrimitiveAttributesCreate_F32(attributes);
-}
-template <> inline dnnError_t dnnPrimitiveAttributesCreate<double>(
-    dnnPrimitiveAttributes_t *attributes) {
-    return dnnPrimitiveAttributesCreate_F64(attributes);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnPrimitiveAttributesDestroy(
-    dnnPrimitiveAttributes_t attributes);
-template <> inline dnnError_t dnnPrimitiveAttributesDestroy<float>(
-    dnnPrimitiveAttributes_t attributes) {
-    return dnnPrimitiveAttributesDestroy_F32(attributes);
-}
-template <> inline dnnError_t dnnPrimitiveAttributesDestroy<double>(
-    dnnPrimitiveAttributes_t attributes) {
-    return dnnPrimitiveAttributesDestroy_F64(attributes);
-}
-
-template <typename Dtype> inline dnnError_t dnnPrimitiveGetAttributes(
-    dnnPrimitive_t primitive,
-    dnnPrimitiveAttributes_t *attributes);
-template <> inline dnnError_t dnnPrimitiveGetAttributes<float>(
-    dnnPrimitive_t primitive,
-    dnnPrimitiveAttributes_t *attributes) {
-    return dnnPrimitiveGetAttributes_F32(primitive, attributes);
-}
-template <> inline dnnError_t dnnPrimitiveGetAttributes<double>(
-    dnnPrimitive_t primitive,
-    dnnPrimitiveAttributes_t *attributes) {
-    return dnnPrimitiveGetAttributes_F64(primitive, attributes);
-}
-
-template <typename Dtype> inline dnnError_t dnnExecute(
-    dnnPrimitive_t primitive, void *resources[]);
-template <> inline dnnError_t dnnExecute<float>(
-    dnnPrimitive_t primitive, void *resources[]) {
-    return dnnExecute_F32(primitive, resources);
-}
-template <> inline dnnError_t dnnExecute<double>(
-    dnnPrimitive_t primitive, void *resources[]) {
-    return dnnExecute_F64(primitive, resources);
-}
-
-template <typename Dtype> inline dnnError_t dnnExecuteAsync(
-    dnnPrimitive_t primitive, void *resources[]);
-template <> inline dnnError_t dnnExecuteAsync<float>(
-    dnnPrimitive_t primitive, void *resources[]) {
-    return dnnExecuteAsync_F32(primitive, resources);
-}
-template <> inline dnnError_t dnnExecuteAsync<double>(
-    dnnPrimitive_t primitive, void *resources[]) {
-    return dnnExecuteAsync_F64(primitive, resources);
-}
-
-template <typename Dtype> inline dnnError_t dnnWaitFor(
-    dnnPrimitive_t primitive);
-template <> inline dnnError_t dnnWaitFor<float>(
-    dnnPrimitive_t primitive) {
-    return dnnWaitFor_F32(primitive);
-}
-template <> inline dnnError_t dnnWaitFor<double>(
-    dnnPrimitive_t primitive) {
-    return dnnWaitFor_F64(primitive);
-}
-
-template <typename Dtype> inline dnnError_t dnnDelete(
-    dnnPrimitive_t primitive);
-template <> inline dnnError_t dnnDelete<float>(
-    dnnPrimitive_t primitive) {
-    return dnnDelete_F32(primitive);
-}
-template <> inline dnnError_t dnnDelete<double>(
-    dnnPrimitive_t primitive) {
-    return dnnDelete_F64(primitive);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnConversionCreate(
-    dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to);
-template <> inline dnnError_t dnnConversionCreate<float>(
-    dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to) {
-    return dnnConversionCreate_F32(pConversion, from, to);
-}
-template <> inline dnnError_t dnnConversionCreate<double>(
-    dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to) {
-    return dnnConversionCreate_F64(pConversion, from, to);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnConversionExecute(
-    dnnPrimitive_t conversion, void *from, void *to);
-template <> inline dnnError_t dnnConversionExecute<float>(
-    dnnPrimitive_t conversion, void *from, void *to) {
-    return dnnConversionExecute_F32(conversion, from, to);
-}
-template <> inline dnnError_t dnnConversionExecute<double>(
-    dnnPrimitive_t conversion, void *from, void *to) {
-    return dnnConversionExecute_F64(conversion, from, to);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnConvolutionCreateForward(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnConvolutionCreateForward<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateForward_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-template <> inline dnnError_t dnnConvolutionCreateForward<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateForward_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnConvolutionCreateForwardBias(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnConvolutionCreateForwardBias<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateForwardBias_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnConvolutionCreateForwardBias<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateForwardBias_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnConvolutionCreateBackwardData(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnConvolutionCreateBackwardData<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateBackwardData_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnConvolutionCreateBackwardData<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateBackwardData_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnConvolutionCreateBackwardFilter(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnConvolutionCreateBackwardFilter<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateBackwardFilter_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnConvolutionCreateBackwardFilter<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnConvolutionCreateBackwardFilter_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnConvolutionCreateBackwardBias(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t dstSize[]);
-template <> inline dnnError_t dnnConvolutionCreateBackwardBias<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t dstSize[]) {
-    return dnnConvolutionCreateBackwardBias_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, dstSize);
-}
-template <> inline dnnError_t dnnConvolutionCreateBackwardBias<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t dimension, const size_t dstSize[]) {
-    return dnnConvolutionCreateBackwardBias_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               dimension, dstSize);
-}
-
-template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateForward(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnGroupsConvolutionCreateForward<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateForward_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnGroupsConvolutionCreateForward<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateForward_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateForwardBias(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnGroupsConvolutionCreateForwardBias<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateForwardBias_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnGroupsConvolutionCreateForwardBias<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateForwardBias_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateBackwardData(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardData<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateBackwardData_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardData<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateBackwardData_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateBackwardFilter_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t srcSize[],
-    const size_t dstSize[], const size_t filterSize[],
-    const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnGroupsConvolutionCreateBackwardFilter_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, srcSize, dstSize, filterSize,
-               convolutionStrides, inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t dstSize[]);
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias<float>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t dstSize[]) {
-    return dnnGroupsConvolutionCreateBackwardBias_F32(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, dstSize);
-}
-template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias<double>(
-    dnnPrimitive_t* pConvolution,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t algorithm,
-    size_t groups, size_t dimension, const size_t dstSize[]) {
-    return dnnGroupsConvolutionCreateBackwardBias_F64(
-               pConvolution,
-               attributes,
-               algorithm,
-               groups, dimension, dstSize);
-}
-
-template <typename Dtype> inline dnnError_t dnnReLUCreateForward(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float negativeSlope);
-template <> inline dnnError_t dnnReLUCreateForward<float>(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float negativeSlope) {
-    return dnnReLUCreateForward_F32(
-               pRelu,
-               attributes,
-               dataLayout, negativeSlope);
-}
-template <> inline dnnError_t dnnReLUCreateForward<double>(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float negativeSlope) {
-    return dnnReLUCreateForward_F64(
-               pRelu,
-               attributes,
-               dataLayout, negativeSlope);
-}
-
-template <typename Dtype> inline dnnError_t dnnReLUCreateBackward(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope);
-template <> inline dnnError_t dnnReLUCreateBackward<float>(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope) {
-    return dnnReLUCreateBackward_F32(
-               pRelu,
-               attributes,
-               diffLayout, dataLayout, negativeSlope);
-}
-template <> inline dnnError_t dnnReLUCreateBackward<double>(
-    dnnPrimitive_t* pRelu,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope) {
-    return dnnReLUCreateBackward_F64(
-               pRelu,
-               attributes,
-               diffLayout, dataLayout, negativeSlope);
-}
-
-template <typename Dtype> inline dnnError_t dnnLRNCreateForward(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k);
-template <> inline dnnError_t dnnLRNCreateForward<float>(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k) {
-    return dnnLRNCreateForward_F32(
-               pLrn,
-               attributes,
-               dataLayout, kernel_size, alpha, beta, k);
-}
-template <> inline dnnError_t dnnLRNCreateForward<double>(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k) {
-    return dnnLRNCreateForward_F64(
-               pLrn,
-               attributes,
-               dataLayout, kernel_size, alpha, beta, k);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnLRNCreateBackward(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout,
-    size_t kernel_size, float alpha, float beta, float k);
-template <> inline dnnError_t dnnLRNCreateBackward<float>(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout,
-    size_t kernel_size, float alpha, float beta, float k) {
-    return dnnLRNCreateBackward_F32(
-               pLrn,
-               attributes,
-               diffLayout, dataLayout, kernel_size, alpha, beta, k);
-}
-template <> inline dnnError_t dnnLRNCreateBackward<double>(
-    dnnPrimitive_t* pLrn,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout,
-    size_t kernel_size, float alpha, float beta, float k) {
-    return dnnLRNCreateBackward_F64(
-               pLrn,
-               attributes,
-               diffLayout, dataLayout, kernel_size, alpha, beta, k);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnPoolingCreateForward(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnPoolingCreateForward<float>(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnPoolingCreateForward_F32(
-               pPooling,
-               attributes,
-               op,
-               srcLayout,
-               kernelSize, kernelStride,
-               inputOffset, border_type);
-}
-template <> inline dnnError_t dnnPoolingCreateForward<double>(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnPoolingCreateForward_F64(
-               pPooling,
-               attributes,
-               op,
-               srcLayout,
-               kernelSize, kernelStride,
-               inputOffset, border_type);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnPoolingCreateBackward(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type);
-template <> inline dnnError_t dnnPoolingCreateBackward<float>(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnPoolingCreateBackward_F32(
-               pPooling,
-               attributes,
-               op,
-               srcLayout,
-               kernelSize, kernelStride,
-               inputOffset, border_type);
-}
-template <> inline dnnError_t dnnPoolingCreateBackward<double>(
-    dnnPrimitive_t* pPooling,
-    dnnPrimitiveAttributes_t attributes,
-    dnnAlgorithm_t op,
-    const dnnLayout_t srcLayout,
-    const size_t kernelSize[], const size_t kernelStride[],
-    const int inputOffset[], const dnnBorder_t border_type) {
-    return dnnPoolingCreateBackward_F64(
-               pPooling,
-               attributes,
-               op,
-               srcLayout,
-               kernelSize, kernelStride,
-               inputOffset, border_type);
-}
-
-template <typename Dtype> inline dnnError_t dnnConcatCreate(
-    dnnPrimitive_t *pConcat,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src[]);
-template <> inline dnnError_t dnnConcatCreate<float>(
-    dnnPrimitive_t *pConcat,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src[]) {
-    return dnnConcatCreate_F32(
-               pConcat,
-               attributes,
-               N,
-               src);
-}
-template <> inline dnnError_t dnnConcatCreate<double>(
-    dnnPrimitive_t *pConcat,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src[]) {
-    return dnnConcatCreate_F64(
-               pConcat,
-               attributes,
-               N,
-               src);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnSplitCreate(
-    dnnPrimitive_t *pSplit,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src,
-    size_t dst[]);
-template <> inline dnnError_t dnnSplitCreate<float>(
-    dnnPrimitive_t *pSplit,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src,
-    size_t dst[]) {
-    return dnnSplitCreate_F32(
-               pSplit,
-               attributes,
-               N,
-               src,
-               dst);
-}
-template <> inline dnnError_t dnnSplitCreate<double>(
-    dnnPrimitive_t *pSplit,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t N,
-    dnnLayout_t src,
-    size_t dst[]) {
-    return dnnSplitCreate_F64(
-               pSplit,
-               attributes,
-               N,
-               src,
-               dst);
-}
-
-template <typename Dtype> inline dnnError_t dnnSumCreate(
-    dnnPrimitive_t *pSum,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t nSummands, dnnLayout_t layout, Dtype *coefficients);
-template <> inline dnnError_t dnnSumCreate<float>(
-    dnnPrimitive_t *pSum,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t nSummands, dnnLayout_t layout, float *coefficients) {
-    return dnnSumCreate_F32(
-               pSum,
-               attributes,
-               nSummands,
-               layout, coefficients);
-}
-template <> inline dnnError_t dnnSumCreate<double>(
-    dnnPrimitive_t *pSum,
-    dnnPrimitiveAttributes_t attributes,
-    const size_t nSummands, dnnLayout_t layout, double *coefficients) {
-    return dnnSumCreate_F64(
-               pSum,
-               attributes,
-               nSummands,
-               layout, coefficients);
-}
-
-template <typename Dtype> inline dnnError_t dnnBatchNormalizationCreateForward_v2(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags);
-
-template <> inline dnnError_t dnnBatchNormalizationCreateForward_v2<float>(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags) {
-    return dnnBatchNormalizationCreateForward_v2_F32(
-               pBatchNormalization,
-               attributes,
-               dataLayout, eps, flags);
-}
-template <> inline dnnError_t dnnBatchNormalizationCreateForward_v2<double>(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags) {
-    return dnnBatchNormalizationCreateForward_v2_F64(
-               pBatchNormalization,
-               attributes,
-               dataLayout, eps, flags);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnBatchNormalizationCreateBackward_v2(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags);
-
-template <> inline  dnnError_t dnnBatchNormalizationCreateBackward_v2<float>(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags) {
-    return dnnBatchNormalizationCreateBackward_v2_F32(
-               pBatchNormalization,
-               attributes,
-               dataLayout, eps, flags);
-}
-
-template <> inline dnnError_t dnnBatchNormalizationCreateBackward_v2<double>(
-    dnnPrimitive_t* pBatchNormalization,
-    dnnPrimitiveAttributes_t attributes,
-    const dnnLayout_t dataLayout, float eps,
-    int flags) {
-    return dnnBatchNormalizationCreateBackward_v2_F64(
-               pBatchNormalization,
-               attributes,
-               dataLayout, eps, flags);
-}
-
-template <typename Dtype> inline dnnError_t dnnInnerProductCreateForward(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels);
-template <> inline dnnError_t dnnInnerProductCreateForward<float>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateForward_F32(pInnerProduct,
-                                            attributes, dimensions,
-                                            srcSize, outputChannels);
-}
-template <> inline dnnError_t dnnInnerProductCreateForward<double>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateForward_F64(pInnerProduct,
-                                            attributes, dimensions,
-                                            srcSize, outputChannels);
-}
-
-template <typename Dtype> inline dnnError_t dnnInnerProductCreateForwardBias(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels);
-
-template <> inline dnnError_t dnnInnerProductCreateForwardBias<float>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateForwardBias_F32(pInnerProduct,
-            attributes, dimensions,
-            srcSize, outputChannels);
-}
-template <> inline dnnError_t dnnInnerProductCreateForwardBias<double>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateForwardBias_F64(pInnerProduct,
-            attributes, dimensions,
-            srcSize, outputChannels);
-}
-
-
-template <typename Dtype> inline dnnError_t dnnInnerProductCreateBackwardData(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels);
-
-template <> inline dnnError_t dnnInnerProductCreateBackwardData<float>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateBackwardData_F32(pInnerProduct,
-            attributes, dimensions,
-            srcSize, outputChannels);
-}
-template <> inline dnnError_t dnnInnerProductCreateBackwardData<double>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateBackwardData_F64(pInnerProduct,
-            attributes, dimensions,
-            srcSize, outputChannels);
-}
-
-
-
-
-template <typename Dtype> inline dnnError_t dnnInnerProductCreateBackwardFilter(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels);
-
-template <> inline dnnError_t dnnInnerProductCreateBackwardFilter<float>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateBackwardFilter_F32(pInnerProduct,
-            attributes, dimensions,
-            srcSize, outputChannels);
-}
-template <> inline dnnError_t dnnInnerProductCreateBackwardFilter<double>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t srcSize[],
-    size_t outputChannels) {
-    return dnnInnerProductCreateBackwardFilter_F64(pInnerProduct,
-            attributes, dimensions,
-            srcSize, outputChannels);
-}
-
-
-
-template <typename Dtype> inline dnnError_t dnnInnerProductCreateBackwardBias(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t dstSize[]);
-
-template <> inline dnnError_t dnnInnerProductCreateBackwardBias<float>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t dstSize[]) {
-    return dnnInnerProductCreateBackwardBias_F32(pInnerProduct,
-            attributes, dimensions,
-            dstSize);
-}
-template <> inline dnnError_t dnnInnerProductCreateBackwardBias<double>(
-    dnnPrimitive_t *pInnerProduct,
-    dnnPrimitiveAttributes_t attributes,
-    size_t dimensions,
-    const size_t dstSize[]) {
-    return dnnInnerProductCreateBackwardBias_F64(pInnerProduct,
-            attributes, dimensions,
-            dstSize);
-}
-#endif  // #MXNET_USE_MKL2017 == 1
-#endif  // MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_
diff --git a/src/operator/mkl/mkl_elementwise_copy-inl.h b/src/operator/mkl/mkl_elementwise_copy-inl.h
deleted file mode 100644
index 48c931291150..000000000000
--- a/src/operator/mkl/mkl_elementwise_copy-inl.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_elementwise-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <cstring>
-#include <map>
-#include <string>
-#include <vector>
-#include <utility>
-#include "../operator_common.h"
-#include "../mshadow_op.h"
-#include "./mkl_util-inl.h"
-
-
-namespace mxnet {
-namespace op {
-
-template<typename xpu, typename DType>
-void MKLIdentityCompute(const nnvm::NodeAttrs& attrs,
-  const OpContext& ctx,
-  const std::vector<TBlob>& inputs,
-  const std::vector<OpReqType>& req,
-  const std::vector<TBlob>& outputs) {
-  if (!req[0]) return;
-#if MKL_EXPERIMENTAL == 1
-  if (op::mkl_prv_data<DType>(inputs[0])) {
-    std::shared_ptr<MKLMemHolder> in_data_mem = inputs[0].Mkl_mem_;
-    // User copy to avoid potential problem
-    std::shared_ptr<MKLData<DType> > top_data = MKLData<DType>::create();
-    std::shared_ptr<MKLMemHolder> top_mem = outputs[0].Mkl_mem_;
-    top_data->copy_from(in_data_mem);
-    top_mem->set_prv_descriptor(top_data);
-    return;
-  }
-#endif
-  int in_blob_size = inputs[0].Size();
-  int out_blob_size = outputs[0].Size();
-  CHECK_EQ(in_blob_size, out_blob_size) << "MKLIdentityCompute CPU Size not Match ";
-  memcpy(outputs[0].dptr_, inputs[0].dptr_, in_blob_size * sizeof(DType));
-}
-
-
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_
diff --git a/src/operator/mkl/mkl_elementwise_sum-inl.h b/src/operator/mkl/mkl_elementwise_sum-inl.h
deleted file mode 100644
index d313fd15a5be..000000000000
--- a/src/operator/mkl/mkl_elementwise_sum-inl.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_elementwise-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <cstring>
-#include <map>
-#include <string>
-#include <vector>
-#include <utility>
-#include "../operator_common.h"
-#include "../mshadow_op.h"
-#include "./mkl_util-inl.h"
-
-
-namespace mxnet {
-namespace op {
-template<typename xpu, typename DType>
-static void LayerSetUp(const std::vector<mshadow::Tensor<xpu, 1, DType> > &data,
-  size_t data_shape_size,
-  std::shared_ptr<MKLData<DType> > fwd_top_data) {
-  // Whether to use an asymptotically slower (for >2 inputs) but stabler method
-  // of computing the gradient for the PROD operation. (No effect for SUM op.)
-  // stable_prod_grad_ = 1;
-  size_t dim_src = data_shape_size;
-  size_t *sizes_src = new size_t[dim_src];
-  size_t *strides_src = new size_t[dim_src];
-  for (size_t d = 0; d < dim_src; ++d) {
-    sizes_src[d] = data[0].shape_[dim_src - d - 1];
-    strides_src[d] = (d == 0) ? 1 : strides_src[d - 1] * sizes_src[d - 1];
-  }
-
-  fwd_top_data->create_user_layout(dim_src, sizes_src, strides_src);
-  delete[] sizes_src;
-  delete[] strides_src;
-}
-
-template<typename xpu, typename DType>
-void MKLElementWiseSumCompute_(const nnvm::NodeAttrs& attrs,
-  const OpContext& ctx,
-  const std::vector<TBlob>& in_data,
-  const std::vector<OpReqType>& req,
-  const std::vector<TBlob>& out_data) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  if (req[0] == kNullOp) return;
-  size_t size = in_data.size();
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  std::vector<Tensor<xpu, 1, DType> > data(size);
-  Tensor<xpu, 1, DType> out = out_data[0].FlatTo1D<xpu, DType>(s);
-  bool in_place_flag = false;
-  int in_place_idx = 0;
-
-  for (size_t i = 0; i < size; ++i) {
-    data[i]  = in_data[i].FlatTo1D<xpu, DType>(s);
-    if (data[i].dptr_ == out.dptr_) {
-      in_place_idx = i;
-      in_place_flag = true;
-    }
-  }
-  std::shared_ptr<MKLData<DType> > fwd_top_data = MKLData<DType>::create();
-  std::vector<DType> coeffs_  = std::vector<DType>(data.size(), 1);
-  LayerSetUp(data, 1, fwd_top_data);
-
-
-  dnnError_t e;
-  void *eltwise_res[dnnResourceNumber];
-  dnnPrimitive_t sumPrimitive = NULL;
-  e = dnnSumCreate<DType>(&sumPrimitive, NULL, size, fwd_top_data->layout_usr,
-    &coeffs_[0]);
-  CHECK_EQ(e, E_SUCCESS);
-
-  eltwise_res[dnnResourceDst] = reinterpret_cast<void*>(const_cast<DType*>(out.dptr_));
-  eltwise_res[dnnResourceMultipleSrc] =
-    reinterpret_cast<void *>(reinterpret_cast<void *>(in_data[in_place_idx].dptr_));
-  for (size_t i = 1; i < size; ++i) {
-    if (i == in_place_idx) continue;
-    eltwise_res[dnnResourceMultipleSrc + i] =
-      reinterpret_cast<void *>(reinterpret_cast<void *>(in_data[i].dptr_));
-  }
-
-  e = dnnExecute<DType>(sumPrimitive, eltwise_res);
-  CHECK_EQ(e, E_SUCCESS);
-
-  if (sumPrimitive != NULL) {
-    dnnDelete<DType>(sumPrimitive);
-    sumPrimitive = NULL;
-  }
-}
-
-
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_
diff --git a/src/operator/mkl/mkl_fully_connected-inl.h b/src/operator/mkl/mkl_fully_connected-inl.h
deleted file mode 100644
index 5e296704b6dd..000000000000
--- a/src/operator/mkl/mkl_fully_connected-inl.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_fully_connected-inl.h
-* \brief
-* \author zhenlin.luo@intel.com
-*          lingyan.guo@intel.com
-*         
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_
-#include <string>
-#include <algorithm>
-#include <vector>
-#include "../activation-inl.h"
-#include "./mkl_util-inl.h"
-
-namespace mxnet {
-namespace op {
-
-template<typename xpu, typename DType>
-class MKLFullyConnectedOp : public Operator {
- public:
-  explicit MKLFullyConnectedOp(const FullyConnectedParam& p,
-                               const std::vector<TShape>& in_shapes,
-                               const std::vector<TShape>& out_shapes):
-    param_(p) {
-    LayerSetUp(in_shapes, out_shapes);
-  }
-
-  ~MKLFullyConnectedOp() {
-    dnnDelete<DType>(fullyConnectedFwd);
-    dnnDelete<DType>(fullyConnectedBwdData);
-    dnnDelete<DType>(fullyConnectedBwdFilter);
-    dnnDelete<DType>(fullyConnectedBwdBias);
-  }
-  static std::string getName() {
-    return "MKLFullyConnectedOp";
-  }
-
- private:
-  void LayerSetUp(const std::vector<TShape>& in_shapes,
-                  const std::vector<TShape>& out_shapes) {
-    const TShape& ishape = in_shapes[fullc::kData];
-
-    const size_t dim = 4;
-    const size_t src_sizes[4] = {1, 1, ishape.ProdShape(1, ishape.ndim()), ishape[0]};
-    const size_t dst_sizes[2] = {param_.num_hidden, ishape[0]};
-    const size_t output_channels = param_.num_hidden;
-
-    dnnPrimitiveAttributes_t attributes = NULL;
-    MKLDNN_CALL(dnnPrimitiveAttributesCreate<DType>(&attributes));
-    if (!param_.no_bias) {
-      MKLDNN_CALL(dnnInnerProductCreateForwardBias<DType>(
-            &fullyConnectedFwd,
-            attributes,
-            dim,
-            src_sizes,
-            output_channels));
-    } else {
-      MKLDNN_CALL(dnnInnerProductCreateForward<DType>(
-            &fullyConnectedFwd,
-            attributes,
-            dim,
-            src_sizes,
-            output_channels));
-    }
-    MKLDNN_CALL(dnnInnerProductCreateBackwardData<DType>(
-          &fullyConnectedBwdData,
-          attributes,
-          dim,
-          src_sizes,
-          output_channels));
-    MKLDNN_CALL(dnnInnerProductCreateBackwardFilter<DType>(
-          &fullyConnectedBwdFilter,
-          attributes,
-          dim,
-          src_sizes,
-          output_channels));
-    if (!param_.no_bias) {
-      MKLDNN_CALL(dnnInnerProductCreateBackwardBias<DType>(
-            &fullyConnectedBwdBias,
-            attributes,
-            2,
-            dst_sizes));
-    }
-    // TODO(minjie): Shouldn't `attributes` be destroyed?
-  }
-
-
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-
-    void* res_fullyConnected[dnnResourceNumber];
-    if (req[fullc::kOut] == kNullOp) return;
-    CHECK_EQ(req[fullc::kOut], kWriteTo);
-    CHECK_EQ(in_data.size(), param_.no_bias ? 2 : 3);
-    CHECK_EQ(out_data.size(), 1);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-
-    const TShape& ishape = in_data[fullc::kData].shape_;
-    const TShape& oshape = out_data[fullc::kOut].shape_;
-
-    Tensor<xpu, 4, DType> data;
-    Tensor<xpu, 4, DType> out;
-
-    Shape4(in_data[fullc::kData].shape_[0], in_data[fullc::kData].shape_[1], 1, 1);
-
-    Shape<4> dshape = Shape4(ishape[0], ishape.ProdShape(1, ishape.ndim()), 1, 1);
-    Shape<4> odshape = Shape4(oshape[0], oshape.ProdShape(1, oshape.ndim()), 1, 1);
-
-    data = in_data[fullc::kData].get_with_shape<xpu, 4, DType>(dshape, s);
-    out = out_data[fullc::kOut].get_with_shape<xpu, 4, DType>(odshape, s);
-    res_fullyConnected[dnnResourceSrc] =
-      reinterpret_cast<void *>(in_data[fullc::kData].dptr_);
-    res_fullyConnected[dnnResourceDst] =
-      reinterpret_cast<void *>(out_data[fullc::kOut].dptr_);
-    res_fullyConnected[dnnResourceFilter] =
-      reinterpret_cast<void *>(in_data[fullc::kWeight].dptr_);
-    if (!param_.no_bias) {
-      res_fullyConnected[dnnResourceBias] = reinterpret_cast<void *>(in_data[fullc::kBias].dptr_);
-    }
-
-    MKLDNN_CALL(dnnExecute<DType>(fullyConnectedFwd, res_fullyConnected));
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-
-    void* res_fullyConnected[dnnResourceNumber];
-    CHECK_EQ(out_grad.size(), 1);
-    const size_t expected = param_.no_bias ? 2 : 3;
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
-    CHECK_EQ(req.size(), expected);
-    res_fullyConnected[dnnResourceSrc] =
-      reinterpret_cast<void *>(in_data[fullc::kData].dptr_);
-    res_fullyConnected[dnnResourceFilter] =
-      reinterpret_cast<void *>(in_data[fullc::kWeight].dptr_);
-
-    res_fullyConnected[dnnResourceDiffDst] =
-      reinterpret_cast<void *>(out_grad[fullc::kOut].dptr_);
-    res_fullyConnected[dnnResourceDiffSrc] =
-      reinterpret_cast<void *>(in_grad[fullc::kData].dptr_);
-    res_fullyConnected[dnnResourceDiffFilter] =
-      reinterpret_cast<void *>(in_grad[fullc::kWeight].dptr_);
-    if (!param_.no_bias) {
-      res_fullyConnected[dnnResourceDiffBias] =
-        reinterpret_cast<void *>(in_grad[fullc::kBias].dptr_);
-    }
-    MKLDNN_CALL(dnnExecute<DType>(fullyConnectedBwdFilter, res_fullyConnected));
-    if (!param_.no_bias) {
-      MKLDNN_CALL(dnnExecute<DType>(fullyConnectedBwdBias, res_fullyConnected));
-    }
-    MKLDNN_CALL(dnnExecute<DType>(fullyConnectedBwdData, res_fullyConnected));
-  }
-
- private:
-  dnnPrimitive_t fullyConnectedFwd{nullptr};
-  dnnPrimitive_t fullyConnectedBwdData{nullptr};
-  dnnPrimitive_t fullyConnectedBwdFilter{nullptr};
-  dnnPrimitive_t fullyConnectedBwdBias{nullptr};
-  const FullyConnectedParam param_;
-};  // class MKLFullyConnectedOp
-}  // namespace op
-}  // namespace mxnet
-
-#endif  // MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_
diff --git a/src/operator/mkl/mkl_lrn-inl.h b/src/operator/mkl/mkl_lrn-inl.h
deleted file mode 100644
index 90dfad50fa62..000000000000
--- a/src/operator/mkl/mkl_lrn-inl.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_lrn-inl.h
-* \brief
-* \author zhenlin.luo@intel.com
-*         lingyan.guo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_LRN_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_LRN_INL_H_
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "../operator_common.h"
-#include "../mshadow_op.h"
-#include "./mkl_util-inl.h"
-
-namespace mxnet {
-namespace op {
-
-template<typename xpu, typename DType>
-class MKLLRNOp : public Operator {
- public:
-  static std::string getName() {
-    return "MKLLRNOp";
-  }
-
-  explicit MKLLRNOp(LRNParam param) :
-    lrnFwd(static_cast<dnnPrimitive_t>(NULL)),
-    lrnBwd(static_cast<dnnPrimitive_t>(NULL)),
-    lrn_buffer_(NULL) {
-    this->param_ = param;
-    fwd_top_data_ = MKLData<DType>::create();
-    fwd_bottom_data_ = MKLData<DType>::create();
-    bwd_top_diff_ = MKLData<DType>::create();
-    bwd_bottom_diff_ = MKLData<DType>::create();
-    init_mkldnn_ = false;
-  }
-
-  virtual ~MKLLRNOp() {
-    if (lrnFwd != NULL) {
-      dnnDelete<DType>(lrnFwd);
-      lrnFwd = NULL;
-    }
-    if (lrnBwd != NULL) {
-      dnnDelete<DType>(lrnBwd);
-      lrnBwd = NULL;
-    }
-    dnnReleaseBuffer<DType>(lrn_buffer_);
-  }
-
- private:
-  void LayerSetup(const mshadow::Tensor<xpu, 4, DType> &data,
-                  const mshadow::Tensor<xpu, 4, DType> &out) {
-    size_ = param_.nsize;
-    CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local size";
-
-    alpha_ = param_.alpha;
-    beta_ = param_.beta;
-    k_ = param_.knorm;
-    size_t dim = 4, sizes[4], strides[4];
-    channels_ = data.shape_[1];
-    height_ = data.shape_[2];
-    width_ = data.shape_[3];
-    num_ = data.shape_[0];
-    sizes[0] = width_;
-    sizes[1] = height_;
-    sizes[2] = channels_;
-    sizes[3] = num_;
-
-    strides[0] = 1;
-    strides[1] = sizes[0];
-    strides[2] = sizes[0] * sizes[1];
-    strides[3] = sizes[0] * sizes[1] * sizes[2];
-
-    fwd_bottom_data_->name = "fwd_bottom_data_   @ " + getName();
-    fwd_top_data_->name = "fwd_top_data_      @ " + getName();
-    bwd_top_diff_->name = "bwd_top_diff_      @ " + getName();
-    bwd_bottom_diff_->name = "bwd_bottom_diff_   @ " + getName();
-
-    fwd_bottom_data_->create_user_layout(dim, sizes, strides);
-    fwd_top_data_->create_user_layout(dim, sizes, strides);
-    bwd_bottom_diff_->create_user_layout(dim, sizes, strides);
-    bwd_top_diff_->create_user_layout(dim, sizes, strides);
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 2U);
-    CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size";
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> data = mkl_experimental_direct_get<xpu, 4, DType>(
-      in_data[lrn_enum::kData], s);
-    Tensor<xpu, 4, DType> out = mkl_experimental_direct_get<xpu, 4, DType>(
-      out_data[lrn_enum::kOut], s);
-    if (!init_mkldnn_) {
-      LayerSetup(data, out);
-      init_mkldnn_ = true;
-    }
-
-    const void* bottom_data = NULL;
-#if MKL_EXPERIMENTAL == 1
-    bottom_data =
-          reinterpret_cast<void*>(mkl_prv_data<DType>(in_data[lrn_enum::kData]));
-#endif
-#if MKL_EXPERIMENTAL == 1
-    if (NULL != bottom_data) {
-      if (lrnFwd == NULL) {
-        std::shared_ptr<MKLMemHolder> bottom_data_mem =
-          in_data[lrn_enum::kData].Mkl_mem_;
-        std::shared_ptr<PrvMemDescr> bottom_prv_descriptor =
-          bottom_data_mem->get_prv_descriptor();
-        CHECK_EQ(bottom_prv_descriptor->get_descr_type(),
-            PrvMemDescr::PRV_DESCR_MKL2017);
-        std::shared_ptr<MKLData<DType> > mem_descr
-          = std::static_pointer_cast<MKLData<DType>>(bottom_prv_descriptor);
-        CHECK(mem_descr != nullptr);
-        fwd_bottom_data_ = mem_descr;
-
-        dnnError_t e;
-        dnnLayout_t lrn_buffer_l = NULL;
-
-        e = dnnLRNCreateForward<DType>(&lrnFwd, NULL, fwd_bottom_data_->layout_int,
-                                       size_, alpha_, beta_, k_);
-        CHECK_EQ(e, E_SUCCESS);
-
-        fwd_top_data_->create_internal_layout(lrnFwd, dnnResourceDst);
-
-        e = dnnLRNCreateBackward<DType>(&lrnBwd, NULL,
-                                        fwd_bottom_data_->layout_int, fwd_bottom_data_->layout_int,
-                                        size_, alpha_, beta_, k_);
-        CHECK_EQ(e, E_SUCCESS);
-
-        e = dnnLayoutCreateFromPrimitive<DType>(
-              &lrn_buffer_l, lrnFwd, dnnResourceWorkspace);
-        CHECK_EQ(e, E_SUCCESS);
-        e = dnnAllocateBuffer<DType>(
-              reinterpret_cast<void **>(&lrn_buffer_), lrn_buffer_l);
-        CHECK_EQ(e, E_SUCCESS);
-        dnnLayoutDelete<DType>(lrn_buffer_l);
-
-        bwd_top_diff_->create_internal_layout(lrnBwd, dnnResourceDiffDst);
-        bwd_bottom_diff_->create_internal_layout(lrnBwd, dnnResourceDiffSrc);
-      }
-    }
-#endif
-    if (bottom_data == NULL) {
-      if (lrnFwd == NULL) {
-        dnnError_t e;
-        dnnLayout_t lrn_buffer_l = NULL;
-        e = dnnLRNCreateForward<DType>(&lrnFwd, NULL, fwd_bottom_data_->layout_usr,
-                                       size_, alpha_, beta_, k_);
-        CHECK_EQ(e, E_SUCCESS);
-
-        e = dnnLayoutCreateFromPrimitive<DType>(
-              &lrn_buffer_l, lrnFwd, dnnResourceWorkspace);
-        CHECK_EQ(e, E_SUCCESS);
-        e = dnnAllocateBuffer<DType>(
-              reinterpret_cast<void **>(&lrn_buffer_), lrn_buffer_l);
-        CHECK_EQ(e, E_SUCCESS);
-        dnnLayoutDelete<DType>(lrn_buffer_l);
-
-        e = dnnLRNCreateBackward<DType>(&lrnBwd, NULL,
-                                        fwd_bottom_data_->layout_usr, fwd_bottom_data_->layout_usr,
-                                        size_, alpha_, beta_, k_);
-        CHECK_EQ(e, E_SUCCESS);
-      }
-      bottom_data = data.dptr_;
-    }
-
-    dnnError_t e;
-    void* lrn_res[dnnResourceNumber];
-    lrn_res[dnnResourceSrc] = const_cast<void*>(bottom_data);
-
-    lrn_res[dnnResourceDst] = fwd_top_data_->get_output_ptr(
-      out.dptr_, fwd_top_data_, out_data[lrn_enum::kOut]);
-    lrn_res[dnnResourceWorkspace] = lrn_buffer_;
-    e = dnnExecute<DType>(lrnFwd, lrn_res);
-    CHECK_EQ(e, E_SUCCESS);
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1);
-    CHECK_EQ(in_data.size(), 1);
-    CHECK_EQ(out_data.size(), 2);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> grad = mkl_experimental_direct_get<xpu, 4, DType>(
-      out_grad[lrn_enum::kOut], s);
-    Tensor<xpu, 4, DType> data = mkl_experimental_direct_get<xpu, 4, DType>(
-      in_data[lrn_enum::kData], s);
-    Tensor<xpu, 4, DType> grad_in = mkl_experimental_direct_get<xpu, 4, DType>(
-      in_grad[lrn_enum::kData], s);
-    dnnError_t e;
-    void* lrn_res[dnnResourceNumber];
-    lrn_res[dnnResourceDiffDst] =
-      bwd_top_diff_->get_converted_prv(grad.dptr_, true, out_grad[lrn_enum::kOut]);
-    lrn_res[dnnResourceWorkspace] = lrn_buffer_;
-    lrn_res[dnnResourceSrc] =
-      fwd_bottom_data_->get_converted_prv(data.dptr_, false, in_data[lrn_enum::kData]);
-
-    lrn_res[dnnResourceDiffSrc] = bwd_bottom_diff_->get_output_ptr(
-      grad_in.dptr_, bwd_bottom_diff_, in_grad[lrn_enum::kData]);
-    e = dnnExecute<DType>(lrnBwd, lrn_res);
-    CHECK_EQ(e, E_SUCCESS);
-  }
-
- private:
-  LRNParam param_;
-  int size_;
-  int pre_pad_;
-  DType alpha_;
-  DType beta_;
-  DType k_;
-  int num_;
-  int channels_;
-  int height_;
-  int width_;
-  bool init_mkldnn_;
-
- private:
-  dnnPrimitive_t lrnFwd, lrnBwd;
-  std::shared_ptr<MKLData<DType> > fwd_top_data_;
-  std::shared_ptr<MKLData<DType> > fwd_bottom_data_;
-
-  std::shared_ptr<MKLData<DType> > bwd_top_diff_;
-  std::shared_ptr<MKLData<DType> > bwd_bottom_diff_;
-
-  DType *lrn_buffer_;
-};  // class LocalResponseNormOp
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_LRN_INL_H_
-
diff --git a/src/operator/mkl/mkl_memory-inl.h b/src/operator/mkl/mkl_memory-inl.h
deleted file mode 100644
index 71af10254b2a..000000000000
--- a/src/operator/mkl/mkl_memory-inl.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_memory-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_
-
-
-#include <string>
-#include <vector>
-#include <memory>
-#include "mkl_cppwrapper.h"
-
-namespace mxnet {
-
-template <typename DType>
-struct MKLMemoryDescriptorBase : public PrvMemDescr,
- public std::enable_shared_from_this<MKLMemoryDescriptorBase<DType> > {
-    MKLMemoryDescriptorBase() : layout_usr(NULL), layout_int(NULL),
-    convert_to_int(NULL), convert_from_int(NULL), convert_prv2prv(NULL),
-    name("UNKNOWN"), internal_ptr(NULL) {}
-  virtual ~MKLMemoryDescriptorBase() {
-    dnnLayoutDelete<DType>(layout_usr);
-    dnnLayoutDelete<DType>(layout_int);
-    if (internal_ptr != NULL) {
-      dnnReleaseBuffer<DType>(internal_ptr);
-      internal_ptr = NULL;
-    }
-    if (convert_to_int != NULL) {
-      dnnDelete<DType>(convert_to_int);
-      convert_to_int = NULL;
-    }
-    if (convert_from_int != NULL) {
-      dnnDelete<DType>(convert_from_int);
-      convert_from_int = NULL;
-    }
-    if (convert_prv2prv != NULL) {
-      dnnDelete<DType>(convert_prv2prv);
-      convert_prv2prv = NULL;
-    }
-  }
-  std::shared_ptr<MKLMemoryDescriptorBase<DType> > get_shared_ptr() {
-    return this->shared_from_this();
-  }
-
-  dnnLayout_t layout_usr;
-  dnnLayout_t layout_int;
-  dnnPrimitive_t convert_to_int;
-  dnnPrimitive_t convert_from_int;
-  dnnPrimitive_t convert_prv2prv;
-  std::shared_ptr<MKLMemoryDescriptorBase<DType> > descr_prv2prv_conversion;
-
-
-  std::string name;  // for debugging purposes
-  void allocate() {
-    if (internal_ptr == NULL) {
-      int status = dnnAllocateBuffer<DType>(
-              reinterpret_cast<void **>(&internal_ptr), layout_int);
-      CHECK_EQ(status, E_SUCCESS)
-          << "Failed internal_ptr memory allocation with status "
-          << status << "\n";
-    }
-  }
-  virtual void* prv_ptr(bool allocate_when_uninit = true) {
-    if (internal_ptr == NULL && allocate_when_uninit)
-      allocate();
-    return internal_ptr;
-  }
-  inline bool conversion_needed() {
-    return (convert_to_int != NULL);
-  }
-  void create_conversions();
-  void create_internal_layout(const dnnPrimitive_t primitive,
-                dnnResourceType_t type);
-  void create_user_layout(size_t dimension, const size_t size[],
-              const size_t strides[]);
-  void create_layouts(
-    const dnnPrimitive_t primitive, dnnResourceType_t type,
-    size_t dimension, const size_t size[], const size_t strides[]);
-
-  virtual PrvDescrType get_descr_type() {
-    return PRV_DESCR_MKL2017;
-  }
-  virtual size_t prv_size() {
-    return dnnLayoutGetMemorySize<DType>(layout_int);
-  }
-  virtual size_t prv_count() {
-    return dnnLayoutGetMemorySize<DType>(layout_int) / sizeof(DType);
-  }
-  virtual void convert_from_prv(void* cpu_ptr);
-  virtual void convert_to_prv(void* cpu_ptr);
-  virtual bool layout_compare(std::shared_ptr<PrvMemDescr> other);
-  virtual void convert_from_other(std::shared_ptr<PrvMemDescr> other);
- protected:
-  DType* internal_ptr;
-};
-
-template <typename DType>
-struct MKLMemoryDescriptor : MKLMemoryDescriptorBase<DType> {
-  // The last get_converted_prv() argument is a hack for reusing
-  // in backward a conversion done already in the forward direction.
-  DType* get_converted_prv(DType *data_ptr, bool set_prv_ptr,
-      const TBlob &blob);
-  void* get_output_ptr(DType *data_ptr, std::shared_ptr<MKLMemoryDescriptor<DType> > self_ptr,
-    const TBlob &blob, bool in_place = false);
-  bool copy_from(std::shared_ptr<MKLMemHolder> dnn_chunk);
-  MKLMemoryDescriptor() {}
-};
-
-template <typename DType> struct MKLData : MKLMemoryDescriptor<DType> {
-  static std::shared_ptr<MKLData<DType> > create() {
-    return std::make_shared<MKLData<DType> >();
-  }
-};
-
-template struct MKLData<float>;
-template struct MKLData<double>;
-
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_
diff --git a/src/operator/mkl/mkl_memory.cc b/src/operator/mkl/mkl_memory.cc
deleted file mode 100644
index 7682fe1c1f37..000000000000
--- a/src/operator/mkl/mkl_memory.cc
+++ /dev/null
@@ -1,291 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_memory.cc
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#include "../operator_common.h"
-
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "mkl_memory-inl.h"
-#include "mkl_util-inl.h"
-
-namespace mxnet {
-
-template <typename Dtype>
-void MKLMemoryDescriptorBase<Dtype>::create_conversions() {
-  int status;
-  if (this->convert_from_int) {
-    status = dnnDelete<Dtype>(this->convert_from_int);
-    CHECK_EQ(status, E_SUCCESS);
-    this->convert_from_int = NULL;
-  }
-  if (this->convert_to_int) {
-    status = dnnDelete<Dtype>(this->convert_to_int);
-    CHECK_EQ(status, E_SUCCESS);
-    this->convert_to_int = NULL;
-  }
-  if (layout_int
-      && !dnnLayoutCompare<Dtype>(layout_usr, layout_int)) {
-    CHECK(layout_usr);
-    status = dnnConversionCreate<Dtype>(&convert_to_int, layout_usr,
-            layout_int);
-    CHECK_EQ(status, E_SUCCESS)
-            << "Failed creation convert_to_int with status "
-            << status << " for buffer: " << this->name << "\n";
-    status = dnnConversionCreate<Dtype>(&convert_from_int, layout_int,
-            layout_usr);
-    CHECK_EQ(status, E_SUCCESS)
-            << "Failed creation convert_from_int with status "
-            << status << " for buffer: " << this->name << "\n";
-  }
-}
-
-template <typename Dtype>
-void MKLMemoryDescriptorBase<Dtype>::create_internal_layout(
-    const dnnPrimitive_t primitive, dnnResourceType_t type) {
-  int status;
-  if (this->layout_int) {
-    status = dnnLayoutDelete<Dtype>(this->layout_int);
-    CHECK_EQ(status, E_SUCCESS);
-  }
-  status = dnnLayoutCreateFromPrimitive<Dtype>(
-      &this->layout_int, primitive, type);
-  CHECK_EQ(status, E_SUCCESS)
-      << "Failed dnnLayoutCreateFromPrimitive with status "
-      << status << " for buffer: " << this->name << "\n";
-
-  if (this->layout_usr)
-    this->create_conversions();
-}
-
-template <typename Dtype>
-void MKLMemoryDescriptorBase<Dtype>::create_user_layout(
-    size_t dimension, const size_t size[], const size_t strides[]) {
-  int status;
-  if (this->layout_usr) {
-    status = dnnLayoutDelete<Dtype>(this->layout_usr);
-    CHECK_EQ(status, E_SUCCESS);
-  }
-
-  status = dnnLayoutCreate<Dtype>(
-      &this->layout_usr, dimension, size, strides);
-  CHECK_EQ(status, E_SUCCESS) << "Failed dnnLayoutCreate with status "
-      << status << " for buffer: " << this->name << "\n";
-
-  if (this->layout_int)
-    this->create_conversions();
-}
-
-template <typename Dtype>
-void MKLMemoryDescriptorBase<Dtype>::create_layouts(
-    const dnnPrimitive_t primitive, dnnResourceType_t type,
-    size_t dimension, const size_t size[], const size_t strides[]) {
-  this->create_internal_layout(primitive, type);
-  this->create_user_layout(dimension, size, strides);
-}
-
-
-template <typename Dtype>
-void MKLMemoryDescriptorBase<Dtype>::convert_from_prv(void* cpu_ptr) {
-  CHECK(cpu_ptr);
-  CHECK(this->convert_from_int);
-  int status;
-  void *convert_resources[dnnResourceNumber];
-
-  convert_resources[dnnResourceFrom] = this->prv_ptr();
-  convert_resources[dnnResourceTo]   = cpu_ptr;
-  status = dnnExecute<Dtype>(this->convert_from_int, convert_resources);
-  CHECK_EQ(status, 0) << "Conversion from prv failed with status " << status;
-}
-
-template <typename Dtype>
-void MKLMemoryDescriptorBase<Dtype>::convert_to_prv(void* cpu_ptr) {
-  CHECK(cpu_ptr);
-  CHECK(this->convert_to_int);
-  int status;
-  void *convert_resources[dnnResourceNumber];
-
-  convert_resources[dnnResourceFrom] = cpu_ptr;
-  convert_resources[dnnResourceTo]   = this->prv_ptr();
-  status = dnnExecute<Dtype>(this->convert_to_int, convert_resources);
-  CHECK_EQ(status, 0) << "Conversion from prv failed with status " << status;
-}
-
-
-template <typename Dtype>
-bool MKLMemoryDescriptorBase<Dtype>::layout_compare(
-  std::shared_ptr<PrvMemDescr> other) {
-  CHECK_EQ(other->get_descr_type(),
-              PrvMemDescr::PRV_DESCR_MKL2017);
-  std::shared_ptr<MKLMemoryDescriptorBase<Dtype> >other_descr =
-    std::static_pointer_cast<MKLMemoryDescriptorBase<Dtype> >
-    (other);
-
-  if (dnnLayoutCompare<Dtype>(other_descr->layout_int,
-      this->layout_int))
-    return true;
-  else
-    return false;
-}
-
-template <typename Dtype>
-void MKLMemoryDescriptorBase<Dtype>::convert_from_other(
-  std::shared_ptr<PrvMemDescr> other) {
-    std::shared_ptr<MKLMemoryDescriptorBase<Dtype> > other_descr =
-        std::static_pointer_cast<MKLMemoryDescriptorBase<Dtype> >
-            (other);
-
-  int status;
-  dnnPrimitive_t convert;
-  status = dnnConversionCreate<Dtype>(&convert,
-    other_descr->layout_int, this->layout_int);
-
-  void *convert_resources[dnnResourceNumber];
-  convert_resources[dnnResourceFrom] = other_descr->prv_ptr();
-  convert_resources[dnnResourceTo]   = this->prv_ptr();
-  status = dnnExecute<Dtype>(convert, convert_resources);
-  CHECK_EQ(status, 0) << "Conversion from other failed with status "
-                      << status;
-
-  dnnDelete<Dtype>(convert);
-}
-
-
-template <typename Dtype>
-Dtype* MKLMemoryDescriptor<Dtype>::get_converted_prv(
-    Dtype *cpu_ptr, bool set_prv_ptr, const TBlob &blob) {
-  Dtype* prv_ptr = NULL;
-  std::shared_ptr<MKLMemHolder> dnn_chunk = NULL;
-#if MKL_EXPERIMENTAL == 1
-  dnn_chunk = blob.Mkl_mem_;
-#endif
-#if MKL_EXPERIMENTAL == 1
-  if (dnn_chunk != NULL)
-    prv_ptr = static_cast<Dtype*>(dnn_chunk->prv_data());
-#endif
-
-  if (this->convert_to_int != NULL) {
-#if MKL_EXPERIMENTAL == 1
-    int status;
-    void *convert_resources[dnnResourceNumber];
-#endif
-    if (prv_ptr == NULL) {
-      this->allocate();
-      this->convert_to_prv(cpu_ptr);
-#if MKL_EXPERIMENTAL == 1
-      if (set_prv_ptr) {
-        dnn_chunk->set_prv_descriptor(this->get_shared_ptr(), true);
-      }
-#endif
-      return this->internal_ptr;
-    }
-#if MKL_EXPERIMENTAL == 1
-    if (prv_ptr != NULL)  {
-      std::shared_ptr<MKLData<Dtype> > current_descr =
-        op::mkl_get_mem_desc<Dtype>(dnn_chunk);
-      if (!dnnLayoutCompare<Dtype>(current_descr->layout_int,
-        this->layout_int)) {
-        if (this->convert_prv2prv) {
-          CHECK_EQ(dnnLayoutCompare<Dtype>(
-            this->descr_prv2prv_conversion->layout_int,
-            this->layout_int), 0);
-          status = 0;
-        } else {
-          status = dnnConversionCreate<Dtype>(&this->convert_prv2prv,
-            current_descr->layout_int, this->layout_int);
-          if (status == 0)
-            this->descr_prv2prv_conversion = current_descr;
-        }
-        if (status != 0) {
-          this->allocate();
-          convert_resources[dnnResourceFrom] = cpu_ptr;
-          convert_resources[dnnResourceTo] =
-            reinterpret_cast<void*>(this->internal_ptr);
-          status = dnnExecute<Dtype>(this->convert_to_int, convert_resources);
-          CHECK_EQ(status, 0) << "Conversion failed with status " << status;
-        } else {
-          this->allocate();
-          convert_resources[dnnResourceFrom] = reinterpret_cast<void*>(prv_ptr);
-          convert_resources[dnnResourceTo] =
-            reinterpret_cast<void*>(this->internal_ptr);
-          status = dnnExecute<Dtype>(this->convert_prv2prv, convert_resources);
-          CHECK_EQ(status, 0) << "Conversion failed with status " << status;
-        }
-        if (set_prv_ptr) {
-          dnn_chunk->set_prv_descriptor(this->get_shared_ptr(), true);
-        }
-        return this->internal_ptr;
-      } else if (current_descr.get() != this) {
-        // MKL_DLOG(INFO) << "layout OK                 "
-        //  << current_descr->name << " == " << this->name;
-      }
-    }
-#endif
-    return const_cast<Dtype *>(prv_ptr);
-  } else {
-    if (prv_ptr != NULL) {
-#if MKL_EXPERIMENTAL == 1
-      std::shared_ptr<MKLMemoryDescriptorBase<float> > other_descr =
-        std::static_pointer_cast<MKLMemoryDescriptorBase<float> >
-        (dnn_chunk->prv_descriptor_);
-      dnn_chunk->check_and_prv_to_cpu(cpu_ptr);
-#endif
-      // printf("get_converted_prv release %s\n", other_descr->name.c_str());
-    }
-  }
-  return cpu_ptr;
-}
-
-template <typename Dtype>
-void* MKLMemoryDescriptor<Dtype>::get_output_ptr(Dtype *data_ptr,
-  std::shared_ptr<MKLMemoryDescriptor<Dtype> > self_ptr, const TBlob &blob, bool in_place) {
-#if MKL_EXPERIMENTAL == 1
-  std::shared_ptr<MKLMemHolder> dnn_chunk = blob.Mkl_mem_;
-#endif
-  if (this->conversion_needed()) {
-    void * prv_ptr =  this->prv_ptr();
-#if MKL_EXPERIMENTAL == 1
-    if (!in_place) {
-      dnn_chunk->set_prv_descriptor(self_ptr);
-    } else {
-      Dtype * blob_prv = op::mkl_prv_data<Dtype>(blob);
-      if (blob_prv != NULL)
-        return blob_prv;
-    }
-#endif
-    return prv_ptr;
-  } else {
-#if MKL_EXPERIMENTAL == 1
-    std::shared_ptr<MKLMemoryDescriptorBase<float> > other_descr =
-      std::static_pointer_cast<MKLMemoryDescriptorBase<float> >
-      (dnn_chunk->prv_descriptor_);
-    dnn_chunk->check_and_prv_to_cpu(data_ptr);
-#endif
-    return data_ptr;
-  }
-}
-
-template class MKLMemoryDescriptor<double>;
-template class MKLMemoryDescriptor<float>;
-
-template class MKLMemoryDescriptorBase<float>;
-template class MKLMemoryDescriptorBase<double>;
-}  // namespace mxnet
-#endif
diff --git a/src/operator/mkl/mkl_memory.h b/src/operator/mkl/mkl_memory.h
deleted file mode 100644
index 13f1fd27b12b..000000000000
--- a/src/operator/mkl/mkl_memory.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_memory.cc
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_MEMORY_H_
-#define MXNET_OPERATOR_MKL_MKL_MEMORY_H_
-
-#include <string>
-#include <vector>
-#include <memory>
-
-
-namespace mxnet {
-// Base class
-struct PrvMemDescr {
-  virtual void convert_from_prv(void* cpu_ptr) = 0;
-  virtual void convert_to_prv(void* cpu_ptr) = 0;
-  virtual void convert_from_other(std::shared_ptr<PrvMemDescr> other) = 0;
-  virtual void* prv_ptr(bool allocate_when_uninit = true) = 0;
-  // returns true for matching layouts
-  virtual bool layout_compare(std::shared_ptr<PrvMemDescr> other) = 0;
-  virtual size_t prv_count() = 0;
-  virtual size_t prv_size() = 0;
-  // This might help using prv_ptr_ by different accelerators/engines
-  enum PrvDescrType {
-    PRV_DESCR_MKL2017,
-    PRV_DESCR_MKLDNN
-  };
-  virtual PrvDescrType get_descr_type() = 0;
-};
-
-#if MKL_EXPERIMENTAL == 1
-// Currently HEAD_AT_PRV do not free CPU data
-enum SyncedHead {
-  HEAD_AT_CPU,
-  HEAD_AT_PRV,
-};
-struct MKLMemHolder {
-  SyncedHead head_;
-  std::shared_ptr<PrvMemDescr> prv_descriptor_;
-  bool  b_disable_prv_2_cpu;
-  bool  b_eager_mode;
-  void disable_prv_2_cpu(bool flag) {
-    b_disable_prv_2_cpu = flag;
-  }
-  void set_eager_mode(bool eager_mode) {
-    b_eager_mode = eager_mode;
-  }
-  void set_prv_descriptor(std::shared_ptr<PrvMemDescr> descriptor, bool same_data = false) {
-    head_ = HEAD_AT_PRV;
-    prv_descriptor_ = descriptor;
-  }
-  std::shared_ptr<PrvMemDescr> get_prv_descriptor() {
-    return  prv_descriptor_;
-  }
-  bool head_at_prv() {
-    return (head_ == HEAD_AT_PRV) ? true : false;
-  }
-  void* prv_data(bool allocate_when_uninit = true) {
-    if (head_ != HEAD_AT_PRV) {
-      return NULL;
-    }
-    if (prv_descriptor_ == NULL) {
-      LOG(FATAL) << " prv_descriptor_  is NULL";
-    }
-    CHECK(prv_descriptor_.get());
-    return reinterpret_cast<void*>(prv_descriptor_->prv_ptr(allocate_when_uninit));
-  }
-
-  int prv_count() {
-    if (head_ != HEAD_AT_PRV) {
-      return 0;
-    }
-    if (prv_descriptor_ == NULL) {
-      LOG(FATAL) << " prv_descriptor_  is NULL";
-    }
-    CHECK(prv_descriptor_.get());
-    return prv_descriptor_->prv_count();
-  }
-  static std::shared_ptr<MKLMemHolder> create() {
-    return std::make_shared<MKLMemHolder>();
-  }
-  void  check_and_prv_to_cpu(void *dptr_) {
-    if (!b_disable_prv_2_cpu && head_ == HEAD_AT_PRV) {
-      CHECK(prv_descriptor_ != nullptr);
-      prv_descriptor_->convert_from_prv(dptr_);
-      // Because operator use CPU & maybe change it, change to CPU Flag
-      head_ = HEAD_AT_CPU;
-    }
-    if (b_disable_prv_2_cpu) {
-      b_disable_prv_2_cpu = false;
-    }
-  }
-  MKLMemHolder() :
-    head_(HEAD_AT_CPU), prv_descriptor_(nullptr),
-    b_disable_prv_2_cpu(false), b_eager_mode(false) {}
-};
-#else
-struct MKLMemHolder {
- public:
-  virtual std::shared_ptr<PrvMemDescr> get_prv_descriptor() = 0;
-};
-#endif
-
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_MEMORY_H_
diff --git a/src/operator/mkl/mkl_pooling-inl.h b/src/operator/mkl/mkl_pooling-inl.h
deleted file mode 100644
index 5662a61aebd3..000000000000
--- a/src/operator/mkl/mkl_pooling-inl.h
+++ /dev/null
@@ -1,357 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_pooling-inl.h
-* \brief
-* \author zhenlin.luo@intel.com
-*         lingyan.guo@intel.com
-*
-*******************************************************************************/
-
-#ifndef MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_
-#include <vector>
-#include <string>
-#include <utility>
-#include "../operator_common.h"
-#include "../nn/pooling-inl.h"
-#include "./mkl_util-inl.h"
-
-namespace mxnet {
-namespace op {
-
-
-template<typename xpu, typename DType>
-class MKLPoolingOp : public Operator {
- public:
-  static std::string getName() {
-    return "MKLPoolingOp";
-  }
-  explicit MKLPoolingOp(PoolingParam p) {
-    poolingFwd = static_cast<dnnPrimitive_t>(NULL);
-    poolingBwd = static_cast<dnnPrimitive_t>(NULL);
-    max_idx_data = static_cast<DType*>(NULL);
-    fwd_top_data = MKLData<DType>::create();
-    fwd_bottom_data = MKLData<DType>::create();
-    bwd_top_diff = MKLData<DType>::create();
-    bwd_bottom_diff = MKLData<DType>::create();
-    this->param_ = p;
-    init_mkldnn_ = false;
-  }
-  virtual ~MKLPoolingOp() {
-    if (poolingFwd != NULL) {
-      dnnDelete<DType>(poolingFwd);
-      poolingFwd = NULL;
-    }
-    if (poolingBwd != NULL) {
-      dnnDelete<DType>(poolingBwd);
-      poolingBwd = NULL;
-    }
-    if (max_idx_data != NULL) {
-      dnnReleaseBuffer<DType>(max_idx_data);
-      max_idx_data = NULL;
-    }
-  }
-
- private:
-  void LayerSetUp(const mshadow::Tensor<xpu, 4, DType> &data,
-                  const mshadow::Tensor<xpu, 4, DType> &out) {
-    channels_ = data.shape_[1];
-    height_ = data.shape_[2];
-    width_ = data.shape_[3];
-    num_ = data.shape_[0];
-    global_pooling_ = param_.global_pool;
-    if (global_pooling_) {
-      kernel_h_ = height_;
-      kernel_w_ = width_;
-    } else {
-      kernel_h_ = param_.kernel[0];
-      kernel_w_ = param_.kernel[1];
-    }
-    CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
-    CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
-    pad_h_ = param_.pad[0];
-    pad_w_ = param_.pad[1];
-    if (global_pooling_) {
-      stride_h_ = stride_w_ = 1;
-    } else {
-      stride_h_ = param_.stride[0];
-      stride_w_ = param_.stride[1];
-    }
-    if (global_pooling_) {
-      CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1)
-        << "With Global_pooling: true; only pad = 0 and stride = 1";
-    }
-    if (pad_h_ != 0 || pad_w_ != 0) {
-      CHECK(param_.pool_type == pool_enum::kAvgPooling
-          || param_.pool_type == pool_enum::kMaxPooling)
-        << "Padding implemented only for average and max pooling.";
-      CHECK_LT(pad_h_, kernel_h_);
-      CHECK_LT(pad_w_, kernel_w_);
-    }
-    pooled_height_ = out.shape_[2];
-    pooled_width_ = out.shape_[3];
-
-    size_t dim = 4;
-    size_t src_sizes[4], src_strides[4];
-    size_t dst_sizes[4], dst_strides[4];
-    src_sizes[0] = width_;
-    src_sizes[1] = height_;
-    src_sizes[2] = channels_;
-    src_sizes[3] = num_;
-    src_strides[0] = 1;
-    src_strides[1] = src_sizes[0];
-    src_strides[2] = src_sizes[0] * src_sizes[1];
-    src_strides[3] = src_sizes[0] * src_sizes[1] * src_sizes[2];
-    dst_sizes[0] = pooled_width_;
-    dst_sizes[1] = pooled_height_;
-    dst_sizes[2] = src_sizes[2];
-    dst_sizes[3] = src_sizes[3];
-    dst_strides[0] = 1;
-    dst_strides[1] = dst_sizes[0];
-    dst_strides[2] = dst_sizes[0] * dst_sizes[1];
-    dst_strides[3] = dst_sizes[0] * dst_sizes[1] * dst_sizes[2];
-    src_offset[0] = -pad_w_;
-    src_offset[1] = -pad_h_;
-    src_offset[2] = -pad_w_;
-    src_offset[3] = -pad_h_;
-    kernel_stride[0] = stride_w_;
-    kernel_stride[1] = stride_h_;
-    kernel_size[0] = kernel_w_;
-    kernel_size[1] = kernel_h_;
-
-    // Names are for debugging only
-    fwd_bottom_data->name = "fwd_bottom_data   @ " + getName();
-    fwd_top_data->name = "fwd_top_data      @ " + getName();
-    bwd_top_diff->name = "bwd_top_diff      @ " + getName();
-    bwd_bottom_diff->name = "bwd_bottom_diff   @ " + getName();
-
-    fwd_bottom_data->create_user_layout(dim, src_sizes, src_strides);
-    fwd_top_data->create_user_layout(dim, dst_sizes, dst_strides);
-    bwd_bottom_diff->create_user_layout(dim, src_sizes, src_strides);
-    bwd_top_diff->create_user_layout(dim, dst_sizes, dst_strides);
-
-    // Primitives will be allocated during the first fwd pass
-    poolingFwd = NULL;
-    poolingBwd = NULL;
-    max_idx_data = NULL;
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1);
-    CHECK_EQ(out_data.size(), 1);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    if (param_.kernel.ndim() >= 3) {
-      LOG(FATAL) << "Not implmented";
-    }
-    Tensor<xpu, 4, DType> data = mkl_experimental_direct_get<xpu, 4, DType>(
-      in_data[pool_enum::kData], s);
-    Tensor<xpu, 4, DType> out = mkl_experimental_direct_get<xpu, 4, DType>(
-      out_data[pool_enum::kOut], s);
-    if (!init_mkldnn_) {
-      LayerSetUp(data, out);
-      init_mkldnn_ = true;
-    }
-    auto first_pass = false;
-    if (poolingFwd == NULL) first_pass = true;
-
-    dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax;
-
-    switch (param_.pool_type) {
-    case pool_enum::kMaxPooling:
-      algorithm = dnnAlgorithmPoolingMax;
-      break;
-    case pool_enum::kAvgPooling:
-      algorithm = dnnAlgorithmPoolingAvgIncludePadding;
-
-      break;
-    default:
-      LOG(FATAL) << "Unknown pooling method.";
-    }
-
-    dnnError_t status;
-    void* pooling_res[dnnResourceNumber];
-
-    void* bottom_data = NULL;
-#if MKL_EXPERIMENTAL == 1
-    bottom_data =
-          reinterpret_cast<void *>(mkl_prv_data<DType>(in_data[pool_enum::kData]));
-#endif
-    dnnBorder_t border_type = dnnBorderZerosAsymm;
-    switch (param_.pooling_convention) {
-    case pool_enum::kFull:
-      border_type = dnnBorderZeros;
-      break;
-    case pool_enum::kValid:
-      border_type = dnnBorderZerosAsymm;
-      break;
-    default:
-      border_type = dnnBorderZerosAsymm;
-      break;
-    }
-    if (NULL == bottom_data) {
-      bottom_data = data.dptr_;
-      if (NULL == poolingFwd) {
-        status = dnnPoolingCreateForward<DType>(&poolingFwd, NULL,
-                                                algorithm, fwd_bottom_data->layout_usr,
-                                                kernel_size, kernel_stride,
-                                                src_offset, border_type);
-      CHECK_EQ(status, E_SUCCESS);
-      // Now create poolingBwd
-      status = dnnPoolingCreateBackward<DType>(&poolingBwd, NULL,
-                                               algorithm, fwd_bottom_data->layout_usr,
-                                               kernel_size, kernel_stride,
-                                               src_offset, border_type);
-      CHECK_EQ(status, E_SUCCESS);
-      }
-    }
-#if MKL_EXPERIMENTAL == 1
-    if (NULL != bottom_data) {
-       if (NULL == poolingFwd) {
-          std::shared_ptr<MKLMemHolder> bottom_data_mem = in_data[pool_enum::kData].Mkl_mem_;
-          std::shared_ptr<PrvMemDescr> bottom_prv_descriptor =
-            bottom_data_mem->get_prv_descriptor();
-          CHECK_EQ(bottom_prv_descriptor->get_descr_type(),
-                   PrvMemDescr::PRV_DESCR_MKL2017);
-          std::shared_ptr<MKLData<DType> > mem_descr
-            = std::static_pointer_cast<MKLData<DType>>(bottom_prv_descriptor);
-          CHECK(mem_descr != nullptr);
-          fwd_bottom_data = mem_descr;
-
-          status = dnnPoolingCreateForward<DType>(&poolingFwd, NULL,
-                                                  algorithm, fwd_bottom_data->layout_int,
-                                                  kernel_size, kernel_stride,
-                                                  src_offset, border_type);
-          CHECK_EQ(status, E_SUCCESS);
-          fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst);
-
-          // Now create poolingBwd
-          status = dnnPoolingCreateBackward<DType>(&poolingBwd, NULL,
-                                                   algorithm, fwd_bottom_data->layout_int,
-                                                   kernel_size, kernel_stride,
-                                                   src_offset, border_type);
-          CHECK_EQ(status, E_SUCCESS);
-          bwd_top_diff->create_internal_layout(poolingFwd, dnnResourceDst);
-          bwd_bottom_diff->create_internal_layout(poolingFwd, dnnResourceSrc);
-        }
-    }
-#endif
-
-    if (first_pass) {
-      dnnLayout_t max_idx_datal = NULL;
-      status = dnnLayoutCreateFromPrimitive<DType>(
-          &max_idx_datal, poolingFwd, dnnResourceWorkspace);
-      CHECK_EQ(status, E_SUCCESS);
-      status = dnnAllocateBuffer<DType>(reinterpret_cast<void**>(&max_idx_data), max_idx_datal);
-      CHECK_EQ(status, E_SUCCESS);
-#if MKL_EXPERIMENTAL == 0
-      fwd_bottom_data->create_internal_layout(poolingFwd, dnnResourceSrc);
-      fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst);
-      bwd_top_diff->create_internal_layout(poolingBwd, dnnResourceDiffDst);
-      bwd_bottom_diff->create_internal_layout(poolingBwd, dnnResourceDiffSrc);
-#endif
-      dnnLayoutDelete<DType>(max_idx_datal);
-      first_pass = false;
-    }
-    pooling_res[dnnResourceSrc] = bottom_data;
-    pooling_res[dnnResourceWorkspace] = max_idx_data;
-
-    pooling_res[dnnResourceDst] = fwd_top_data->get_output_ptr(
-      out.dptr_, fwd_top_data, out_data[pool_enum::kOut]);
-    status = dnnExecute<DType>(poolingFwd, pooling_res);
-    CHECK_EQ(status, E_SUCCESS);
-#if MKL_EXPERIMENTAL == 0
-    if (fwd_top_data->conversion_needed()) {
-      fwd_top_data->convert_from_prv(out.dptr_);
-    }
-#endif
-  }
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    if (!req[0]) {
-      return;
-    }
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1);
-    CHECK_EQ(in_data.size(), 1);
-    CHECK_EQ(out_data.size(), 1);
-    CHECK_EQ(req.size(), 1);
-    CHECK_EQ(in_grad.size(), 1);
-    if (param_.kernel.ndim() >= 3) {
-      LOG(FATAL) << "Not implmented";
-    }
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> grad = mkl_experimental_direct_get<xpu, 4, DType>(
-      out_grad[pool_enum::kOut], s);
-    Tensor<xpu, 4, DType> input_grad = mkl_experimental_direct_get<xpu, 4, DType>(
-      in_grad[pool_enum::kData], s);
-    dnnError_t e;
-    void* pooling_res[dnnResourceNumber];
-    pooling_res[dnnResourceWorkspace] = reinterpret_cast<void *>(max_idx_data);
-
-    pooling_res[dnnResourceDiffDst] =
-      bwd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[pool_enum::kOut]);
-
-    pooling_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr(
-      input_grad.dptr_, bwd_bottom_diff, in_grad[pool_enum::kData]);
-    e = dnnExecute<DType>(poolingBwd, pooling_res);
-    CHECK_EQ(e, E_SUCCESS);
-#if MKL_EXPERIMENTAL == 0
-    if (bwd_bottom_diff->conversion_needed()) {
-      bwd_bottom_diff->convert_from_prv(input_grad.dptr_);
-    }
-#endif
-  }
-
- private:
-  PoolingParam param_;
-  int kernel_h_, kernel_w_;
-  int stride_h_, stride_w_;
-  int pad_h_, pad_w_;
-  int channels_, num_;
-  int height_, width_;
-  int pooled_height_, pooled_width_;
-  bool global_pooling_;
-
- private:
-  size_t kernel_size[2],
-         kernel_stride[4];
-  int src_offset[4];  // 2*(dimension-2)
-  dnnPrimitive_t poolingFwd, poolingBwd;
-  DType *max_idx_data;
-
-  std::shared_ptr<MKLData<DType> > fwd_top_data;
-  std::shared_ptr<MKLData<DType> > fwd_bottom_data;
-  std::shared_ptr<MKLData<DType> > bwd_top_diff;
-  std::shared_ptr<MKLData<DType> > bwd_bottom_diff;
-  bool init_mkldnn_;
-};  // class MKLPoolingOp
-}   // namespace op
-}   // namespace mxnet
-
-#endif  // MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_
diff --git a/src/operator/mkl/mkl_relu-inl.h b/src/operator/mkl/mkl_relu-inl.h
deleted file mode 100644
index 8d7ab5e1e2db..000000000000
--- a/src/operator/mkl/mkl_relu-inl.h
+++ /dev/null
@@ -1,272 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_relu-inl.h
-* \brief
-* \author zhenlin.luo@intel.com
-*         lingyan.guo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_RELU_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_RELU_INL_H_
-
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "../operator_common.h"
-#include "./mkl_util-inl.h"
-
-namespace mxnet {
-namespace op {
-
-template<typename xpu, typename DType>
-class MKLReluOp : public Operator {
- public:
-  static std::string getName() {
-    return "MKLReluOp";
-  }
-  MKLReluOp():
-      reluFwd_(NULL),
-      reluBwd_(NULL) {
-    init_mkldnn_ = false;
-    fwd_top_data_ = MKLData<DType>::create();
-    fwd_bottom_data_ = MKLData<DType>::create();
-    bwd_top_diff_ = MKLData<DType>::create();
-    bwd_bottom_diff_ = MKLData<DType>::create();
-  }
-
-  ~MKLReluOp() {
-    if (reluFwd_ != NULL) {
-      dnnDelete<DType>(reluFwd_);
-      reluFwd_ = NULL;
-    }
-    if (reluBwd_ != NULL) {
-      dnnDelete<DType>(reluBwd_);
-      reluBwd_ = NULL;
-    }
-  }
-
- private:
-  void LayerSetUp(const mshadow::Tensor<xpu, 4, DType> &data,
-                  const mshadow::Tensor<xpu, 4, DType> &out) {
-    size_t dim = 4;
-    size_t *sizes = new size_t[dim];
-    size_t *strides = new size_t[dim];
-    for (size_t d = 0; d < dim; ++d) {
-      (sizes)[d] = data.shape_[dim - 1 - d];
-      (strides)[d] = (d == 0) ? 1 : (strides)[d - 1] * (sizes)[d - 1];
-    }
-    // Names are for debugging only
-    fwd_bottom_data_->name = "fwd_bottom_data   @ " + getName();
-    fwd_top_data_->name = "fwd_top_data      @ " + getName();
-    bwd_bottom_diff_->name = "bwd_bottom_diff   @ " + getName();
-    bwd_top_diff_->name = "bwd_top_diff      @ " + getName();
-    fwd_bottom_data_->create_user_layout(dim, (sizes), (strides));
-    fwd_top_data_->create_user_layout(dim, (sizes), (strides));
-    bwd_bottom_diff_->create_user_layout(dim, (sizes), (strides));
-    bwd_top_diff_->create_user_layout(dim, (sizes), (strides));
-    delete[] sizes;
-    delete[] strides;
-  }
-
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1);
-    CHECK_EQ(out_data.size(), 1);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> data;
-    Tensor<xpu, 4, DType> out;
-    if (in_data[activation::kData].ndim() == 1) {
-      Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], 1, 1, 1);
-      data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_data[activation::kData], dshape, s);
-      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[activation::kOut], dshape, s);
-    } else if (in_data[activation::kData].ndim() == 2) {
-      Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0],
-      in_data[activation::kData].shape_[1], 1, 1);
-      data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_data[activation::kData], dshape, s);
-      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[activation::kOut], dshape, s);
-    } else if (in_data[activation::kData].ndim() == 3) {
-      Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0],
-        in_data[activation::kData].shape_[1],
-        in_data[activation::kData].shape_[2], 1);
-      data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_data[activation::kData], dshape, s);
-      out = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[activation::kOut], dshape, s);
-    } else {
-      data = mkl_experimental_direct_get<xpu, 4, DType>(in_data[activation::kData], s);
-      out = mkl_experimental_direct_get<xpu, 4, DType>(out_data[activation::kOut], s);
-    }
-    if (!init_mkldnn_) {
-      LayerSetUp(data, out);
-      init_mkldnn_ = true;
-    }
-    void* bottom_data = NULL;
-#if MKL_EXPERIMENTAL == 1
-    bottom_data =
-          reinterpret_cast<void *>(mkl_prv_data<DType>(in_data[activation::kData]));
-#endif
-#if MKL_EXPERIMENTAL == 1
-    if (bottom_data != NULL) {
-      if (reluFwd_ == NULL) {
-      std::shared_ptr<MKLData<DType> > mem_descr =
-        mkl_get_mem_desc<DType>(in_data[activation::kData].Mkl_mem_);
-      DType negative_slope = 0;
-      dnnError_t e;
-      e = dnnReLUCreateForward<DType>(&reluFwd_, NULL, mem_descr->layout_int,
-                                      negative_slope);
-      CHECK_EQ(e, E_SUCCESS);
-      e = dnnReLUCreateBackward<DType>(&reluBwd_, NULL, mem_descr->layout_int,
-                                       mem_descr->layout_int, negative_slope);
-      CHECK_EQ(e, E_SUCCESS);
-
-      fwd_bottom_data_ = mem_descr;
-      fwd_top_data_->create_internal_layout(reluFwd_, dnnResourceDst);
-      bwd_top_diff_->create_internal_layout(reluFwd_, dnnResourceDst);
-      bwd_bottom_diff_->create_internal_layout(reluFwd_, dnnResourceSrc);
-      }
-    }
-#endif
-    if (bottom_data  == NULL) {
-      bottom_data = data.dptr_;
-      if (reluFwd_ == NULL) {
-        dnnError_t e;
-        DType negative_slope = 0;
-        e = dnnReLUCreateForward<DType>(&reluFwd_, NULL,
-                                        fwd_bottom_data_->layout_usr, negative_slope);
-        CHECK_EQ(e, E_SUCCESS);
-        e = dnnReLUCreateBackward<DType>(&reluBwd_, NULL,
-                                         fwd_bottom_data_->layout_usr, fwd_bottom_data_->layout_usr,
-                                         negative_slope);
-        CHECK_EQ(e, E_SUCCESS);
-      }
-    }
-    dnnError_t e;
-    void* relu_res[dnnResourceNumber];
-    relu_res[dnnResourceSrc] = bottom_data;
-
-    relu_res[dnnResourceDst] = fwd_top_data_->get_output_ptr(
-      out.dptr_, fwd_top_data_, out_data[activation::kOut], (data.dptr_ == out.dptr_));
-    e = dnnExecute<DType>(reluFwd_, relu_res);
-    CHECK_EQ(e, E_SUCCESS);
-#if MKL_EXPERIMENTAL == 0
-    if (fwd_top_data_->conversion_needed()) {
-      fwd_top_data_->convert_from_prv(out.dptr_);
-    }
-#endif
-  }
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    if (!req[0]) {
-      return;
-    }
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1);
-    CHECK(in_data.size() == 1 && in_grad.size() == 1);
-    CHECK_EQ(req.size(), 1);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> m_out_grad;
-    Tensor<xpu, 4, DType> m_out_data;
-    Tensor<xpu, 4, DType> m_in_grad;
-
-    if (out_grad[activation::kOut].ndim() == 1) {
-      Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0], 1, 1, 1);
-      m_out_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_grad[activation::kOut], dshape, s);
-      m_out_data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[activation::kOut], dshape, s);
-      m_in_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_grad[activation::kData], dshape, s);
-    } else if (out_grad[activation::kOut].ndim() == 2) {
-      Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0],
-                               out_grad[activation::kOut].shape_[1], 1, 1);
-      m_out_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_grad[activation::kOut], dshape, s);
-      m_out_data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[activation::kOut], dshape, s);
-      m_in_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_grad[activation::kData], dshape, s);
-    } else if (out_grad[activation::kOut].ndim() == 3) {
-      Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0],
-        out_grad[activation::kOut].shape_[1],
-        out_grad[activation::kOut].shape_[2], 1);
-      m_out_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_grad[activation::kOut], dshape, s);
-      m_out_data = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        out_data[activation::kOut], dshape, s);
-      m_in_grad = mkl_experimental_direct_get_with_shape<xpu, 4, DType>(
-        in_grad[activation::kData], dshape, s);
-    } else {
-      m_out_grad = mkl_experimental_direct_get<xpu, 4, DType>(out_grad[activation::kOut], s);
-      m_out_data = mkl_experimental_direct_get<xpu, 4, DType>(out_data[activation::kOut], s);
-      m_in_grad = mkl_experimental_direct_get<xpu, 4, DType>(in_grad[activation::kData], s);
-    }
-    dnnError_t e;
-    void* relu_res[dnnResourceNumber];
-
-    void* bottom_data = NULL;
-#if MKL_EXPERIMENTAL == 1
-    bottom_data = reinterpret_cast<void *>(mkl_prv_data<DType>(out_data[activation::kOut]));
-#endif
-    if (NULL == bottom_data) {
-      bottom_data = reinterpret_cast<void *>(const_cast<DType*>(m_out_data.dptr_));
-    }
-    relu_res[dnnResourceSrc] = bottom_data;
-    relu_res[dnnResourceDiffDst] = bwd_top_diff_->get_converted_prv(m_out_grad.dptr_,
-                true, out_grad[activation::kOut]);
-    relu_res[dnnResourceDiffSrc] = bwd_bottom_diff_->get_output_ptr(
-      m_in_grad.dptr_, bwd_bottom_diff_, in_grad[activation::kData]);
-    e = dnnExecute<DType>(reluBwd_, relu_res);
-    CHECK_EQ(e, E_SUCCESS);
-#if MKL_EXPERIMENTAL == 0
-    if (bwd_bottom_diff_->conversion_needed()) {
-      bwd_bottom_diff_->convert_from_prv(m_in_grad.dptr_);
-    }
-#endif
-  }
-
- private:
-  bool init_mkldnn_;
-  std::shared_ptr<MKLData<DType> > fwd_top_data_;
-  std::shared_ptr<MKLData<DType> > fwd_bottom_data_;
-  std::shared_ptr<MKLData<DType> > bwd_top_diff_;
-  std::shared_ptr<MKLData<DType> > bwd_bottom_diff_;
-  dnnPrimitive_t reluFwd_, reluBwd_;
-};  // class MKLReluOp
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_RELU_INL_H_
diff --git a/src/operator/mkl/mkl_util-inl.h b/src/operator/mkl/mkl_util-inl.h
deleted file mode 100644
index 4ad786a2ce93..000000000000
--- a/src/operator/mkl/mkl_util-inl.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkl_util-inl.h
-* \brief
-* \author lingyan.guo@intel.com
-*         zhenlin.luo@intel.com
-*
-*******************************************************************************/
-#ifndef MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_
-#define MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_
-#include <vector>
-#define MKLDNN_CALL(func)                                                               \
-  {                                                                                     \
-    dnnError_t status = (func);                                                                \
-    CHECK_EQ(status, E_SUCCESS) << "MKL DNN call failed (status: " << status << ").";           \
-  }
-
-
-namespace mxnet {
-namespace op {
-
-#if MKL_EXPERIMENTAL == 1
-  template<typename DType>
-  inline DType * mkl_prv_data(const TBlob &b) {
-    std::shared_ptr<MKLMemHolder> bottom_data_mem = b.Mkl_mem_;
-    bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv();
-    if (mem_valid) {
-      return reinterpret_cast<DType*>(bottom_data_mem->prv_data());
-    }
-    return NULL;
-  }
-
-  template<typename DType>
-  inline int mkl_prv_count(const TBlob &b) {
-    std::shared_ptr<MKLMemHolder> bottom_data_mem = b.Mkl_mem_;
-    bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv();
-    if (mem_valid) {
-      return bottom_data_mem->prv_count();
-    }
-    return 0;
-  }
-#endif
-  inline void mkl_set_priv_flag(const TBlob &b) {
-#if MKL_EXPERIMENTAL == 1
-    std::shared_ptr<MKLMemHolder> bottom_data_mem = b.Mkl_mem_;
-    bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv();
-    if (mem_valid) {
-      bottom_data_mem->disable_prv_2_cpu(true);
-    }
-#endif
-  }
-#if MKL_EXPERIMENTAL == 1
-  template<typename DType>
-  inline std::shared_ptr<MKLData<DType> > mkl_get_mem_desc(
-    const std::shared_ptr<MKLMemHolder> data_mem) {
-    std::shared_ptr<PrvMemDescr> prv_descriptor =
-      data_mem->get_prv_descriptor();
-    CHECK_EQ(prv_descriptor->get_descr_type(),
-      PrvMemDescr::PRV_DESCR_MKL2017);
-    std::shared_ptr<MKLData<DType> > mem_descr
-      = std::static_pointer_cast<MKLData<DType>>
-      (prv_descriptor);
-    CHECK(mem_descr != NULL);
-    return mem_descr;
-  }
-#endif
-  template<typename xpu, int dim, typename DType>
-  inline  mshadow::Tensor<xpu, dim, DType> mkl_experimental_direct_get(
-    const TBlob &b, mshadow::Stream<xpu> *s) {
-    mkl_set_priv_flag(b);
-    return b.get<xpu, dim, DType>(s);
-  }
-  template<typename xpu, int dim, typename DType>
-  inline  mshadow::Tensor<xpu, dim, DType> mkl_experimental_direct_get_with_shape(
-    const TBlob &b, const mshadow::Shape<dim> &shape, mshadow::Stream<xpu> *s) {
-    mkl_set_priv_flag(b);
-    return b.get_with_shape<xpu, dim, DType>(shape, s);
-  }
-}  // namespace op
-#if MKL_EXPERIMENTAL == 1
-inline void mkl_tblobs_prv_to_cpu(const std::vector<TBlob> &data) {
-  for (size_t i = 0; i < data.size(); i++) {
-    std::shared_ptr<MKLMemHolder> mem_holder = data[i].Mkl_mem_;
-    if (mem_holder != nullptr && mem_holder->b_eager_mode) {
-      mem_holder->check_and_prv_to_cpu(data[i].dptr_);
-    }
-  }
-}
-inline void mkl_set_tblob_eager_mode(const TBlob &data) {
-  std::shared_ptr<MKLMemHolder> mem_holder = data.Mkl_mem_;
-  if (mem_holder != nullptr) {
-    mem_holder->set_eager_mode(true);
-  }
-}
-#endif
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_
diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h
index ac8b747f0f39..8368c9898502 100644
--- a/src/operator/nn/activation-inl.h
+++ b/src/operator/nn/activation-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file activation-inl.h
  * \brief Activation operator
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 
 #ifndef MXNET_OPERATOR_NN_ACTIVATION_INL_H_
@@ -37,6 +37,7 @@
 #include <utility>
 #include "../operator_common.h"
 #include "../mxnet_op.h"
+#include "../mshadow_op.h"
 
 namespace mxnet {
 namespace op {
@@ -45,6 +46,7 @@ namespace op {
 namespace activation {
 enum ActivationOpInputs {kData};
 enum ActivationOpOutputs {kOut};
+enum ActivationOpResource {kTempSpace};
 enum ActivationOpType {kReLU, kSigmoid, kTanh, kSoftReLU};
 }  // activation
 
@@ -59,160 +61,133 @@ struct ActivationParam : public dmlc::Parameter<ActivationParam> {
     .add_enum("softrelu", activation::kSoftReLU)
     .describe("Activation function to be applied.");
   }
+
+  bool operator==(const ActivationParam& other) const {
+    return this->act_type == other.act_type;
+  }
 };
 
-/**
- * \brief This is the implementation of activation operator.
- * \tparam xpu The device that the op will be executed on.
- */
 template<typename xpu, typename ForwardOp, typename BackwardOp, typename DType>
-class ActivationOp : public Operator {
- public:
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    const TBlob& input = in_data[activation::kData];
-    const size_t sz = input.shape_.Size();
-    if (sz) {
-      MXNET_ASSIGN_REQ_SWITCH(req[activation::kOut], Req, {
-        mxnet_op::Kernel<mxnet_op::op_with_req<ForwardOp, Req>, xpu>::Launch(
-          s, sz,
-          out_data[activation::kOut].dptr<DType>(),
-          input.dptr<DType>());
-      });
-    }
+void ActivationForward(const OpContext &ctx, const TBlob &in_data,
+                       const OpReqType &req, const TBlob &out_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const size_t sz = in_data.shape_.Size();
+  if (sz) {
+    MXNET_ASSIGN_REQ_SWITCH(req, Req, {
+      mxnet_op::Kernel<mxnet_op::op_with_req<ForwardOp, Req>, xpu>::Launch(
+        s, sz,
+        out_data.dptr<DType>(),
+        in_data.dptr<DType>());
+    });
   }
+}
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK(in_data.size() == 1 && in_grad.size() == 1);
-    CHECK_EQ(req.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    const TBlob& m_out_grad = out_grad[activation::kOut];
-    const TBlob& m_out_data = out_data[activation::kOut];
-    const TBlob&  m_in_grad = in_grad[activation::kData];
-    const size_t sz = m_out_data.shape_.Size();
-    if (sz) {
-      MXNET_ASSIGN_REQ_SWITCH(req[activation::kData], Req, {
-        mxnet_op::Kernel<mxnet_op::op_with_req<
-          mxnet::op::mxnet_op::backward_grad_tuned<BackwardOp>, Req>, xpu>::Launch(
-          s, sz,
-          m_in_grad.dptr<DType>(),
-          m_out_grad.dptr<DType>(),
-          m_out_data.dptr<DType>());
-      });
-    }
+template<typename xpu, typename ForwardOp, typename BackwardOp, typename DType>
+void ActivationBackward(const OpContext &ctx, const TBlob &out_grad,
+                        const TBlob &out_data, const OpReqType &req,
+                        const TBlob &in_grad) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const size_t sz = out_data.shape_.Size();
+  if (sz) {
+    MXNET_ASSIGN_REQ_SWITCH(req, Req, {
+      mxnet_op::Kernel<mxnet_op::op_with_req<
+        mxnet::op::mxnet_op::backward_grad_tuned<BackwardOp>, Req>, xpu>::Launch(
+        s, sz,
+        in_grad.dptr<DType>(),
+        out_grad.dptr<DType>(),
+        out_data.dptr<DType>());
+    });
   }
-};  // class ActivationOp
+}
 
-// Declare Factory function, used for dispatch specialization
 template<typename xpu>
-Operator* CreateOp(ActivationParam type, int dtype, const TShape& dshape);
-
-#if DMLC_USE_CXX11
-class ActivationProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 1U) << "Input:[data]";
-    const TShape &dshape = in_shape->at(activation::kData);
-    if (dshape.ndim() == 0) return false;
-    out_shape->clear();
-    out_shape->push_back(dshape);
-    return true;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_GE(in_type->size(), 1U);
-    int dtype = (*in_type)[0];
-    CHECK_NE(dtype, -1) << "First input must have specified type";
-    for (index_t i = 0; i < in_type->size(); ++i) {
-      if ((*in_type)[i] == -1) {
-          (*in_type)[i] = dtype;
-      } else {
-        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
-      }
+void ActivationComputeImpl(const ActivationParam &param, const OpContext &ctx,
+                           const TBlob &input, OpReqType req, const TBlob &output) {
+  MSHADOW_REAL_TYPE_SWITCH(input.type_flag_, DType, {
+    switch (param.act_type) {
+      case activation::kReLU:
+        ActivationForward<xpu, mshadow_op::relu, mshadow_op::relu_grad, DType>(
+            ctx, input, req, output);
+        break;
+      case activation::kSigmoid:
+        ActivationForward<xpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>(
+            ctx, input, req, output);
+        break;
+      case activation::kTanh:
+        ActivationForward<xpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>(
+            ctx, input, req, output);
+        break;
+      case activation::kSoftReLU:
+        ActivationForward<xpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(
+            ctx, input, req, output);
+        break;
+      default:
+        LOG(FATAL) << "unknown activation type";
     }
-    out_type->clear();
-    out_type->push_back(dtype);
-    return true;
-  }
+  });
+}
 
-  OperatorProperty* Copy() const override {
-    auto ptr = new ActivationProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
+template<typename xpu>
+void ActivationGradComputeImpl(const ActivationParam &param, const OpContext &ctx,
+                               const TBlob &out_grad, const TBlob &out_data,
+                               OpReqType req, const TBlob &output) {
+  MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, {
+    switch (param.act_type) {
+      case activation::kReLU:
+        ActivationBackward<xpu, mshadow_op::relu, mshadow_op::relu_grad, DType>(
+            ctx, out_grad, out_data, req, output);
+        break;
+      case activation::kSigmoid:
+        ActivationBackward<xpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>(
+            ctx, out_grad, out_data, req, output);
+        break;
+      case activation::kTanh:
+        ActivationBackward<xpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>(
+            ctx, out_grad, out_data, req, output);
+        break;
+      case activation::kSoftReLU:
+        ActivationBackward<xpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(
+            ctx, out_grad, out_data, req, output);
+        break;
+      default:
+        LOG(FATAL) << "unknown activation type";
+    }
+  });
+}
 
-  std::string TypeString() const override {
-    return "Activation";
-  }
+template<typename xpu>
+void ActivationCompute(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx,
+    const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  ActivationComputeImpl<xpu>(param, ctx, inputs[0], req[0], outputs[0]);
+}
 
-  // decalre dependency and inplace optimization options
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
+template<typename xpu>
+void ActivationGradCompute(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx,
+    const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
 #if MXNET_USE_CUDNN == 1
-    return {out_grad[activation::kOut], out_data[activation::kOut], in_data[activation::kData]};
+  CHECK_EQ(inputs.size(), 3U);
 #else
-    return {out_grad[activation::kOut], out_data[activation::kOut]};
-#endif  // MXNET_USE_CUDNN
-  }
-
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-    return {{out_grad[activation::kOut], in_grad[activation::kData]}};
-  }
-
-  std::vector<std::pair<int, void*> > ForwardInplaceOption(
-    const std::vector<int> &in_data,
-    const std::vector<void*> &out_data) const override {
-    return {{in_data[activation::kData], out_data[activation::kOut]}};
-  }
+  CHECK_EQ(inputs.size(), 2U);
+#endif
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  ActivationGradComputeImpl<xpu>(param, ctx, inputs[0], inputs[1], req[0], outputs[0]);
+}
 
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return NULL;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
-
- private:
-  ActivationParam param_;
-};
-#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_ACTIVATION_INL_H_
diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc
index 401a9e3eaa56..06cc2e7b5819 100644
--- a/src/operator/nn/activation.cc
+++ b/src/operator/nn/activation.cc
@@ -21,65 +21,123 @@
  * Copyright (c) 2015 by Contributors
  * \file activation.cc
  * \brief activation op
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 #include "./activation-inl.h"
 #include "../mshadow_op.h"
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "../mkl/mkl_memory-inl.h"
-#include "../mkl/mkl_relu-inl.h"
-#endif  // MXNET_USE_MKL2017
+#include "../tensor/elemwise_unary_op.h"
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn/mkldnn_base-inl.h"
+#include "./mkldnn/mkldnn_ops-inl.h"
+#endif  // MXNET_USE_MKLDNN
 
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<cpu>(ActivationParam param, int dtype, const TShape& dshape) {
-  Operator *op = NULL;
-#if MXNET_USE_MKL2017 == 1
-  if (param.act_type == activation::kReLU && dshape.ndim() <= 4) {
-      switch (dtype) {
-      case mshadow::kFloat32:
-          return new MKLReluOp<cpu, float>();
-      case mshadow::kFloat64:
-          return new MKLReluOp<cpu, double>();
-      default:
-          break;
-      }
+
+DMLC_REGISTER_PARAMETER(ActivationParam);
+
+// This will determine the order of the inputs for backward computation.
+struct ActivationGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
+    heads.emplace_back(nnvm::NodeEntry{n, activation::kOut, 0});
+#if MXNET_USE_CUDNN == 1
+    heads.push_back(n->inputs[activation::kData]);
+#endif
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
+
+static void ActivationComputeEx_CPU(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx,
+    const std::vector<NDArray>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<NDArray>& outputs) {
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+#if MXNET_USE_MKLDNN == 1
+  if (SupportMKLDNN(inputs[0])) {
+    MKLDNNActivationForward(attrs, ctx, inputs[0], req[0], outputs[0]);
+    return;
   }
-  if (enableMKLWarnGenerated())
-    LOG(INFO) << MKLReluOp<cpu, float>::getName() << " Skip MKL optimization";
 #endif
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    switch (param.act_type) {
-      case activation::kReLU:
-        op = new ActivationOp<cpu, mshadow_op::relu, mshadow_op::relu_grad, DType>();
-        break;
-      case activation::kSigmoid:
-        op = new ActivationOp<cpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>();
-        break;
-      case activation::kTanh:
-        op = new ActivationOp<cpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>();
-        break;
-      case activation::kSoftReLU:
-        op = new ActivationOp<cpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>();
-        break;
-      default:
-        LOG(FATAL) << "unknown activation type";
-    }
-  })
-  return op;
+  ActivationComputeImpl<cpu>(param, ctx, inputs[0].data(), req[0], outputs[0].data());
 }
 
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *ActivationProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                           std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_shape)[0]);
+void ActivationGradComputeEx_CPU(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx,
+    const std::vector<NDArray>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<NDArray>& outputs) {
+#if MXNET_USE_CUDNN == 1
+  CHECK_EQ(inputs.size(), 3U);
+#else
+  CHECK_EQ(inputs.size(), 2U);
+#endif
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+#if MXNET_USE_MKLDNN == 1
+  if (SupportMKLDNN(inputs[0])) {
+    MKLDNNActivationBackward(attrs, ctx, inputs[0], inputs[1], req[0],
+                             outputs[0]);
+    return;
+  }
+#endif
+  ActivationGradComputeImpl<cpu>(param, ctx, inputs[0].data(), inputs[1].data(),
+                                 req[0], outputs[0].data());
 }
 
-DMLC_REGISTER_PARAMETER(ActivationParam);
+inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs,
+                                         const int dev_mask,
+                                         DispatchMode* dispatch_mode,
+                                         std::vector<int> *in_attrs,
+                                         std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1);
+  CHECK_EQ(out_attrs->size(), 1);
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  bool ret = ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask,
+                                                            dispatch_mode,
+                                                            in_attrs, out_attrs);
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  return ret;
+}
+
+inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs,
+                                          const int dev_mask,
+                                          DispatchMode* dispatch_mode,
+                                          std::vector<int> *in_attrs,
+                                          std::vector<int> *out_attrs) {
+#if MXNET_USE_CUDNN == 1
+  CHECK_EQ(in_attrs->size(), 3U);
+#else
+  CHECK_EQ(in_attrs->size(), 2U);
+#endif
+  CHECK_EQ(out_attrs->size(), 1U);
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+#if MXNET_USE_CUDNN == 1
+  bool ret = ElemwiseStorageType<3, 1, false, false, false>(attrs, dev_mask,
+                                                            dispatch_mode,
+                                                            in_attrs, out_attrs);
+#else
+  bool ret = ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask,
+                                                            dispatch_mode,
+                                                            in_attrs, out_attrs);
+#endif
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  return ret;
+}
 
-MXNET_REGISTER_OP_PROPERTY(Activation, ActivationProp)
+MXNET_OPERATOR_REGISTER_UNARY(Activation)
 .describe(R"code(Applies an activation function element-wise to the input.
 
 The following activation functions are supported:
@@ -90,8 +148,31 @@ The following activation functions are supported:
 - `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))`
 
 )code" ADD_FILELINE)
-.add_argument("data", "NDArray-or-Symbol", "Input array to activation function.")
+.set_attr_parser(ParamParser<ActivationParam>)
+.set_attr<FInferStorageType>("FInferStorageType", ActivationStorageType)
+.set_attr<FCompute>("FCompute<cpu>", ActivationCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", ActivationComputeEx_CPU)
+.set_attr<nnvm::FGradient>("FGradient", ActivationGrad{"_backward_Activation"})
 .add_arguments(ActivationParam::__FIELDS__());
 
+NNVM_REGISTER_OP(_backward_Activation)
+.set_num_inputs(3)
+.set_num_outputs(1)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FInferStorageType>("FInferStorageType", BackwardActStorageType)
+.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<3, 1>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 1>)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
+  return std::vector<std::pair<int, int> >{{0, 0}};
+})
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
+.set_attr_parser(ParamParser<ActivationParam>)
+.set_attr<FCompute>("FCompute<cpu>", ActivationGradCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", ActivationGradComputeEx_CPU);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/activation.cu b/src/operator/nn/activation.cu
index c2f6be9f37c8..dc435b2acc17 100644
--- a/src/operator/nn/activation.cu
+++ b/src/operator/nn/activation.cu
@@ -31,39 +31,73 @@
 
 namespace mxnet {
 namespace op {
+
+#if MXNET_USE_CUDNN == 1
+
+template<typename DType>
+static CuDNNActivationOp<DType> &get_cudnn_op(const ActivationParam& param) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local CuDNNActivationOp<DType> cudnn_op;
+#else
+  static MX_THREAD_LOCAL CuDNNActivationOp<DType> cudnn_op;
+#endif
+  cudnn_op.Init(param);
+  return cudnn_op;
+}
+
 template<>
-Operator *CreateOp<gpu>(ActivationParam param, int dtype, const TShape& dshape) {
-  Operator *op = NULL;
+void ActivationCompute<gpu>(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx,
+    const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+
   // SoftReLU not supported by CUDNN yet
   if (param.act_type == activation::kSoftReLU) {
-    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      op = new ActivationOp<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>();
-    })
-    return op;
+    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+      ActivationForward<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(ctx,
+          inputs[0], req[0], outputs[0]);
+    });
+  } else {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+      get_cudnn_op<DType>(param).Forward(ctx, inputs[0], req[0], outputs[0]);
+    });
   }
+}
 
-#if MXNET_USE_CUDNN == 1
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new CuDNNActivationOp<DType>(param);
-  })
-#else
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    switch (param.act_type) {
-      case activation::kReLU:
-        op = new ActivationOp<gpu, mshadow_op::relu, mshadow_op::relu_grad, DType>();
-        break;
-      case activation::kSigmoid:
-        op = new ActivationOp<gpu, mshadow_op::sigmoid, mshadow_op::sigmoid_grad, DType>();
-        break;
-      case activation::kTanh:
-        op = new ActivationOp<gpu, mshadow_op::tanh, mshadow_op::tanh_grad, DType>();
-        break;
-      default:
-        LOG(FATAL) << "unknown activation";
-    }
-  })
-#endif  // MXNET_USE_CUDNN
-  return op;
+template<>
+void ActivationGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx,
+    const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 3U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+
+  // SoftReLU not supported by CUDNN yet
+  if (param.act_type == activation::kSoftReLU) {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+      ActivationBackward<gpu, mshadow_op::softrelu, mshadow_op::softrelu_grad, DType>(
+          ctx, inputs[0], inputs[1], req[0], outputs[0]);
+    });
+  } else {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+      get_cudnn_op<DType>(param).Backward(ctx, inputs[0], inputs[2], inputs[1], req[0], outputs[0]);
+    });
+  }
 }
+#endif
+
+NNVM_REGISTER_OP(Activation)
+.set_attr<FCompute>("FCompute<gpu>", ActivationCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_Activation)
+.set_attr<FCompute>("FCompute<gpu>", ActivationGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h
index 2a9dee2cf845..a07ff40a65f6 100644
--- a/src/operator/nn/batch_norm-inl.h
+++ b/src/operator/nn/batch_norm-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2017 by Contributors
  * \file batch_norm-inl.h
  * \brief
- * \author Bing Xu, Chris Olivier
+ * \author Bing Xu, Chris Olivier, Da Zheng
  */
 #ifndef MXNET_OPERATOR_NN_BATCH_NORM_INL_H_
 #define MXNET_OPERATOR_NN_BATCH_NORM_INL_H_
@@ -47,8 +47,10 @@ namespace mxnet {
 namespace op {
 
 namespace batchnorm {
-enum BatchNormOpInputs {kData, kGamma, kBeta};  // kGamma: weights, kBeta: biases
+enum BatchNormOpInputs {kData, kGamma, kBeta, kInMovingMean,
+  kInMovingVar};  // kGamma: weights, kBeta: biases
 enum BatchNormOpOutputs {kOut, kMean, kVar};  // req, out_data
+enum BatchNormOpResource {kTempSpace};
 enum BatchNormOpAuxiliary {kMovingMean, kMovingVar};  // aux_states
 
 /*! \brief Default channel axis if none specified int he params */
@@ -83,280 +85,182 @@ struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
     DMLC_DECLARE_FIELD(cudnn_off).set_default(false)
       .describe("Do not select CUDNN operator, if available");
   }
-};
-
-/*! \brief Batch normalization operator */
-template <typename xpu, typename DType, typename AccReal>
-class BatchNormOp : public Operator {
- public:
-  explicit BatchNormOp(BatchNormParam param) {
-    this->param_ = param;
-  }
 
-  static inline bool IsWriting(const OpReqType ort) {
-    return ort == kWriteTo || ort == kWriteInplace;
+  bool operator==(const BatchNormParam& other) const {
+    return this->eps == other.eps &&
+           this->momentum == other.momentum &&
+           this->fix_gamma == other.fix_gamma &&
+           this->use_global_stats == other.use_global_stats &&
+           this->output_mean_var == other.output_mean_var &&
+           this->axis == other.axis &&
+           this->cudnn_off == other.cudnn_off;
   }
+};
 
-  /*!
-   * \brief perform a forward operation of Operator, save the output to TBlob.
-   * \param ctx runtime context available to this call
-   * \param in_data array of input data, it is const
-   * \param req the request types of saving operation, can only be kWriteTo or kWriteInplace.
-   * \param out_data array of output data, pointer is used to indicate that this is holder
-   *        the space of TBlob in out_data must be pre-allocated with InferShape
-   * \param aux_states Auxiliary states of operator. Normally operator doesn't
-   *        need, epecial case like Batch Norm requires.
-   * \sa OpReqType, OpContext
-   */
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-
-    CHECK_EQ(in_data.size(), 3U);
-    CHECK_EQ(aux_states.size(), 2U);
-    if (ctx.is_train) {
-      CHECK_EQ(out_data.size(), 3U);
-      CHECK_EQ(req.size(), 3U);
-    } else {
-      CHECK_GE(out_data.size(), 1U);
-      CHECK_GE(req.size(), 1U);
-      CHECK_EQ(req[batchnorm::kOut], kWriteTo);
-    }
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    DoForward(s, ctx, in_data, req, out_data, aux_states);
-  }
+static inline bool IsBNWriting(const OpReqType ort) {
+  return ort == kWriteTo || ort == kWriteInplace;
+}
 
-  /*!
-   * \brief Perform a Backward Operation, write gradient to the in_grad.
-   *
-   * \note
-   * Convention:
-   *   out_grad.size() == OperatorProperty.NumVisibleOutputs()
-   *   out_data.size() == OperatorProperty.NumOutputs()
-   * out_data can contain additional invisible returns that remembers the
-   * state carried from the Forward pass. For example mask in the dropout.
-   * The gradients are passed from visible returns in this function.
-   *
-   * \par
-   * Not all the TBlobs in the arguments will be available
-   * if you override the DeclareBackwardDependency of corresponding OperatorProperty class.
-   * Only the dependencies you declared will be available at corresponding position,
-   * the rest of the parameters are simply dummy where you will get a nullptr.
-   * You will be safe if you use the default DeclareBackwardDependency.
-   * But only declare what you need will give engine more chance for optimization.
-   *
-   * \param ctx runtime context available to this call
-   * \param out_grad the gradient value we get from of the Operator.
-   * \param in_data the array of input data.
-   * \param out_data the array of output data.
-   * \param req request types of the saving operation, can be all types.
-   * \param in_grad the array of gradient we need to write to.
-   * \param aux_states Auxiliary states of operator. Normally operator doesn't need
-   * \sa OperatorProperty, OpReqType, OpContext
-   */
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_states) {
-    CHECK_EQ(out_grad.size(), param_.output_mean_var ? 3U : 1U);
-    CHECK_EQ(in_data.size(), 3U);
-    CHECK_EQ(out_data.size(), 3U);
-    CHECK_EQ(in_grad.size(), 3U);
-    mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-    DoBackward(s, ctx, out_grad, in_data,
-               out_data, req, in_grad, aux_states);
-  }
+template <typename xpu, typename DType, typename AccReal>
+void BatchNormForwardImpl(mshadow::Stream<cpu> *stream,
+                          const OpContext &ctx, const BatchNormParam& param,
+                          const std::vector<TBlob> &in_data,
+                          const std::vector<OpReqType> &req,
+                          const std::vector<TBlob> &out_data,
+                          const std::vector<TBlob> &aux_states);
 
- private:
-  void DoForward(mshadow::Stream<cpu> *stream,
-                 const OpContext &ctx,
-                 const std::vector<TBlob> &in_data,
-                 const std::vector<OpReqType> &req,
-                 const std::vector<TBlob> &out_data,
-                 const std::vector<TBlob> &aux_states);
-
-  void DoBackward(mshadow::Stream<cpu> *stream,
-                  const OpContext &ctx,
-                  const std::vector<TBlob> &out_grad,
-                  const std::vector<TBlob> &in_data,
-                  const std::vector<TBlob> &out_data,
-                  const std::vector<OpReqType> &req,
-                  const std::vector<TBlob> &in_grad,
-                  const std::vector<TBlob> &aux_states);
+template <typename xpu, typename DType, typename AccReal>
+void BatchNormBackwardImpl(mshadow::Stream<cpu> *stream,
+                           const OpContext &ctx, const BatchNormParam& param,
+                           const std::vector<TBlob> &out_grad,
+                           const std::vector<TBlob> &in_data,
+                           const std::vector<TBlob> &out_data,
+                           const std::vector<OpReqType> &req,
+                           const std::vector<TBlob> &in_grad,
+                           const std::vector<TBlob> &aux_states);
 
 #if MXNET_USE_CUDA
-  void DoForward(mshadow::Stream<gpu> *stream,
-                 const OpContext &ctx,
-                 const std::vector<TBlob> &in_data,
-                 const std::vector<OpReqType> &req,
-                 const std::vector<TBlob> &out_data,
-                 const std::vector<TBlob> &aux_states);
-  void DoBackward(mshadow::Stream<gpu> *stream,
-                  const OpContext &ctx,
-                  const std::vector<TBlob> &out_grad,
-                  const std::vector<TBlob> &in_data,
-                  const std::vector<TBlob> &out_data,
-                  const std::vector<OpReqType> &req,
-                  const std::vector<TBlob> &in_grad,
-                  const std::vector<TBlob> &aux_states);
+template <typename xpu, typename DType, typename AccReal>
+void BatchNormForwardImpl(mshadow::Stream<gpu> *stream,
+                          const OpContext &ctx, const BatchNormParam& param,
+                          const std::vector<TBlob> &in_data,
+                          const std::vector<OpReqType> &req,
+                          const std::vector<TBlob> &out_data,
+                          const std::vector<TBlob> &aux_states);
+template <typename xpu, typename DType, typename AccReal>
+void BatchNormBackwardImpl(mshadow::Stream<gpu> *stream,
+                           const OpContext &ctx, const BatchNormParam& param,
+                           const std::vector<TBlob> &out_grad,
+                           const std::vector<TBlob> &in_data,
+                           const std::vector<TBlob> &out_data,
+                           const std::vector<OpReqType> &req,
+                           const std::vector<TBlob> &in_grad,
+                           const std::vector<TBlob> &aux_states);
 #endif  // MXNET_USE_CUDA
 
-  /*! \brief Batch normalization operator parameters */
-  BatchNormParam param_;
-};  // class BatchNormOp
-
-template<typename xpu>
-Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape);
-
-#if DMLC_USE_CXX11
-class BatchNormProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]";
-    const TShape &dshape = in_shape->at(0);
-
-    const size_t channelAxis = static_cast<size_t>(param_.axis < 0
-                            ? static_cast<int>(dshape.ndim()) + param_.axis
-                            : param_.axis);
-    CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param_.axis;
-
-    const int channelCount = dshape[channelAxis];
-
-    if (dshape.ndim() == 0) {
-      return false;
-    }
-
-    in_shape->at(1) = TShape(Shape1(channelCount));
-    in_shape->at(2) = TShape(Shape1(channelCount));
-
-    out_shape->clear();
-    out_shape->push_back(dshape);                // kOut
-    out_shape->push_back(Shape1(channelCount));  // kMean
-    out_shape->push_back(Shape1(channelCount));  // kVar
-
-    aux_shape->clear();
-    aux_shape->push_back(Shape1(channelCount));  // kMovingMean
-    aux_shape->push_back(Shape1(channelCount));  // kMovingVar
-    return true;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    using namespace mshadow;
-    CHECK_GE(in_type->size(), 1U);
-    const int dtype = (*in_type)[0];
-    CHECK_NE(dtype, -1) << "First input must have specified type";
-    // For float16 input type beta, gamma, mean, and average are stored in float32.
-    // For other input types, these parameters have the same type as input
-    // NOTE: This requirement is from cuDNN (v. 4 and 5)
-    int dtype_param;
-    MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, {
-         dtype_param = mshadow::DataType<AccRealX>::kFlag; });
-    for (index_t i = 1; i < in_type->size(); ++i) {
-      if ((*in_type)[i] == -1) {
-        (*in_type)[i] = dtype_param;
-      } else {
-        UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, ListArguments()[i]);
-      }
-    }
-    for (index_t i = 0; i < aux_type->size(); ++i) {
-      if ((*aux_type)[i] != -1) {
-        UNIFORM_TYPE_CHECK((*aux_type)[i], dtype_param, ListArguments()[i]);
-      }
-    }
-    const size_t n_aux = this->ListAuxiliaryStates().size();
-    aux_type->clear();
-    for (size_t i = 0; i < n_aux; ++i) {
-      aux_type->push_back(dtype_param);
-    }
-    const size_t n_out = this->ListOutputs().size();
-    out_type->clear();
-    out_type->push_back(dtype);
-    for (size_t i = 1; i < n_out; ++i) {
-      out_type->push_back(dtype_param);
-    }
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new BatchNormProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "BatchNorm";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[batchnorm::kOut],
-            out_data[batchnorm::kMean],
-            out_data[batchnorm::kVar],
-            in_data[batchnorm::kData],
-            in_data[batchnorm::kGamma]
-           };
-  }
-
-  int NumVisibleOutputs() const override {
-    if (param_.output_mean_var) {
-      return 3;
-    }
-    return 1;
-  }
-
-  int NumOutputs() const override {
-    return 3;
-  }
-
-  std::vector<std::string> ListArguments() const override {
-    return {"data", "gamma", "beta"};
-  }
-
-  std::vector<std::string> ListOutputs() const override {
-    return {"output", "mean", "var"};
-  }
-
-  std::vector<std::string> ListAuxiliaryStates() const override {
-    return {"moving_mean", "moving_var"};
-  }
+/*!
+ * \brief perform a forward operation of Operator, save the output to TBlob.
+ * \param ctx runtime context available to this call
+ * \param in_data array of input data, it is const
+ * \param req the request types of saving operation, can only be kWriteTo or kWriteInplace.
+ * \param out_data array of output data, pointer is used to indicate that this is holder
+ *        the space of TBlob in out_data must be pre-allocated with InferShape
+ * \param aux_states Auxiliary states of operator. Normally operator doesn't
+ *        need, epecial case like Batch Norm requires.
+ * \sa OpReqType, OpContext
+ */
+template <typename xpu, typename DType, typename AccReal>
+void BatchNormForward(const OpContext &ctx, const BatchNormParam& param,
+                      const std::vector<TBlob> &in_data,
+                      const std::vector<OpReqType> &req,
+                      const std::vector<TBlob> &out_data,
+                      const std::vector<TBlob> &aux_states) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+
+  CHECK_EQ(in_data.size(), 3U);
+  CHECK_EQ(aux_states.size(), 2U);
+  if (ctx.is_train) {
+    CHECK_EQ(out_data.size(), 3U);
+    CHECK_EQ(req.size(), 3U);
+  } else {
+    CHECK_GE(out_data.size(), 1U);
+    CHECK_GE(req.size(), 1U);
+    CHECK_EQ(req[batchnorm::kOut], kWriteTo);
+  }
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  BatchNormForwardImpl<xpu, DType, AccReal>(s, ctx, param, in_data, req,
+                                            out_data, aux_states);
+}
 
-  Operator* CreateOperator(Context ctx) const override {
-      LOG(FATAL) << "Not Implemented.";
-      return NULL;
-  }
+/*!
+ * \brief Perform a Backward Operation, write gradient to the in_grad.
+ *
+ * \note
+ * Convention:
+ *   out_grad.size() == OperatorProperty.NumVisibleOutputs()
+ *   out_data.size() == OperatorProperty.NumOutputs()
+ * out_data can contain additional invisible returns that remembers the
+ * state carried from the Forward pass. For example mask in the dropout.
+ * The gradients are passed from visible returns in this function.
+ *
+ * \par
+ * Not all the TBlobs in the arguments will be available
+ * if you override the DeclareBackwardDependency of corresponding OperatorProperty class.
+ * Only the dependencies you declared will be available at corresponding position,
+ * the rest of the parameters are simply dummy where you will get a nullptr.
+ * You will be safe if you use the default DeclareBackwardDependency.
+ * But only declare what you need will give engine more chance for optimization.
+ *
+ * \param ctx runtime context available to this call
+ * \param out_grad the gradient value we get from of the Operator.
+ * \param in_data the array of input data.
+ * \param out_data the array of output data.
+ * \param req request types of the saving operation, can be all types.
+ * \param in_grad the array of gradient we need to write to.
+ * \param aux_states Auxiliary states of operator. Normally operator doesn't need
+ * \sa OperatorProperty, OpReqType, OpContext
+ */
+template <typename xpu, typename DType, typename AccReal>
+void BatchNormBackward(const OpContext &ctx, const BatchNormParam& param,
+                       const std::vector<TBlob> &out_grad,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &in_grad,
+                       const std::vector<TBlob> &aux_states) {
+  CHECK_EQ(out_grad.size(), param.output_mean_var ? 3U : 1U);
+  CHECK_EQ(in_data.size(), 3U);
+  CHECK_EQ(out_data.size(), 3U);
+  CHECK_EQ(in_grad.size(), 3U);
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  BatchNormBackwardImpl<xpu, DType, AccReal>(s, ctx, param, out_grad, in_data,
+                                             out_data, req, in_grad, aux_states);
+}
 
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-      std::vector<int> *in_type) const override;
+template<typename xpu>
+void BatchNormCompute(const nnvm::NodeAttrs& attrs,
+                      const OpContext& ctx, const std::vector<TBlob>& inputs,
+                      const std::vector<OpReqType>& req,
+                      const std::vector<TBlob>& outputs) {
+  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 5U);
+  std::vector<TBlob> in_data(inputs.begin(),
+                             inputs.begin() + batchnorm::kInMovingMean);
+  std::vector<TBlob> aux_states(inputs.begin() + batchnorm::kInMovingMean,
+                                inputs.end());
+  MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, {
+    BatchNormForward<xpu, DType, AccReal>(ctx, param, in_data, req, outputs,
+                                          aux_states);
+  });
+}
 
-  inline const BatchNormParam& getParam() const {
-    return param_;
-  }
+template<typename xpu>
+void BatchNormGradCompute(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx, const std::vector<TBlob>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 11U);
+  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+  int num_out_grads = param.output_mean_var ? 3U : 1U;
+  int in_data_start = 3;
+  int aux_states_start = in_data_start + batchnorm::kInMovingMean;
+  int out_data_start = in_data_start + batchnorm::kInMovingVar + 1;
+  std::vector<TBlob> out_grad(inputs.begin(), inputs.begin() + num_out_grads);
+  std::vector<TBlob> in_data(inputs.begin() + in_data_start,
+                             inputs.begin() + aux_states_start);
+  std::vector<TBlob> aux_states(inputs.begin() + aux_states_start,
+                                inputs.begin() + out_data_start);
+  std::vector<TBlob> out_data(inputs.begin() + out_data_start, inputs.end());
+  std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);
+
+  MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, {
+    BatchNormBackward<xpu, DType, AccReal>(ctx, param, out_grad, in_data, out_data, req,
+                                           in_grad, aux_states);
+  });
+}
 
- private:
-  BatchNormParam param_;
-};  // class BatchNormProp
+#if DMLC_USE_CXX11
 
 namespace batchnorm {
 
diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc
index ca2883239488..96c1444857f2 100644
--- a/src/operator/nn/batch_norm.cc
+++ b/src/operator/nn/batch_norm.cc
@@ -21,16 +21,20 @@
  * Copyright (c) 2015 by Contributors
  * \file batch_norm.cc
  * \brief
- * \author Bing Xu, Chris Olivier
+ * \author Bing Xu, Chris Olivier, Da Zheng
 */
 
 #include "batch_norm-inl.h"
-#include <nnvm/op_attr_types.h>
 #if MXNET_USE_MKL2017 == 1
 #include <mkl_memory.h>
 #include "../mkl/mkl_memory-inl.h"
 #include "../mkl/mkl_batch_norm-inl.h"
 #endif  // MXNET_USE_MKL2017
+#include <nnvm/op_attr_types.h>
+#include "../elemwise_op_common.h"
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn/mkldnn_batch_norm-inl.h"
+#endif
 
 /*! \brief inverse standard deviation <-> variance */
 #define VARIANCE_TO_INVSTD(__var$,    __eps$)   (1.0/sqrt((__var$) + DType(__eps$)))
@@ -89,12 +93,12 @@ static inline void ForEachFast(const BNTensor3<DType1> &in_data,
 
 /*! \brief Forward CPU */
 template <typename xpu, typename DType, typename AccReal>
-void BatchNormOp<xpu, DType, AccReal>::DoForward(mshadow::Stream<cpu> *,
-                                                 const OpContext &ctx,
-                                                 const std::vector<TBlob> &in_data,
-                                                 const std::vector<OpReqType> &req,
-                                                 const std::vector<TBlob> &out_data,
-                                                 const std::vector<TBlob> &aux_states) {
+void BatchNormForwardImpl(mshadow::Stream<cpu> *,
+                          const OpContext &ctx, const BatchNormParam& param_,
+                          const std::vector<TBlob> &in_data,
+                          const std::vector<OpReqType> &req,
+                          const std::vector<TBlob> &out_data,
+                          const std::vector<TBlob> &aux_states) {
   // Input
   batchnorm::BNTensor3<DType> inputData(in_data[batchnorm::kData], param_.axis);
   const TBlob &weights         = in_data[batchnorm::kGamma];
@@ -164,7 +168,7 @@ void BatchNormOp<xpu, DType, AccReal>::DoForward(mshadow::Stream<cpu> *,
 
     // note that var is still invstd
     if (!param_.fix_gamma) {
-      if (IsWriting(req[batchnorm::kData])) {
+      if (IsBNWriting(req[batchnorm::kData])) {
         ForEachFast(inputData, outputData, channel,
                     [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data,
                                                                  DType *out_data) {
@@ -173,10 +177,10 @@ void BatchNormOp<xpu, DType, AccReal>::DoForward(mshadow::Stream<cpu> *,
                     });
       }
     } else {
-      if (IsWriting(req[batchnorm::kGamma])) {
+      if (IsBNWriting(req[batchnorm::kGamma])) {
         w[channel] = AccReal(1);
       }
-      if (IsWriting(req[batchnorm::kData])) {
+      if (IsBNWriting(req[batchnorm::kData])) {
         ForEachFast(inputData, outputData, channel,
                     [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data,
                                                                  DType *out_data) {
@@ -189,14 +193,14 @@ void BatchNormOp<xpu, DType, AccReal>::DoForward(mshadow::Stream<cpu> *,
 }
 
 template <typename xpu, typename DType, typename AccReal>
-void BatchNormOp<xpu, DType, AccReal>::DoBackward(mshadow::Stream<cpu> *,
-                                                  const OpContext &ctx,
-                                                  const std::vector<TBlob> &out_grad,
-                                                  const std::vector<TBlob> &in_data,
-                                                  const std::vector<TBlob> &out_data,
-                                                  const std::vector<OpReqType> &req,
-                                                  const std::vector<TBlob> &in_grad,
-                                                  const std::vector<TBlob> &aux_states) {
+void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
+                           const OpContext &ctx, const BatchNormParam& param_,
+                           const std::vector<TBlob> &out_grad,
+                           const std::vector<TBlob> &in_data,
+                           const std::vector<TBlob> &out_data,
+                           const std::vector<OpReqType> &req,
+                           const std::vector<TBlob> &in_grad,
+                           const std::vector<TBlob> &aux_states) {
   // Input Data
   batchnorm::BNTensor3<DType> inputData(in_data[batchnorm::kData], param_.axis);
   const TBlob &weights   = in_data[batchnorm::kGamma];
@@ -264,7 +268,7 @@ void BatchNormOp<xpu, DType, AccReal>::DoBackward(mshadow::Stream<cpu> *,
                   dotp += (*thisInputData - mean) * (*gradOut_data);
                 });
 
-    if (!gradIn.IsEmpty() && IsWriting(req[batchnorm::kData])) {  // if there's a grad input
+    if (!gradIn.IsEmpty() && IsBNWriting(req[batchnorm::kData])) {  // if there's a grad input
       if (is_train_and_not_global_stats) {
         // when in training mode
         // Q(X) = X - E[x] ; i.e. input centered to zero mean
@@ -300,7 +304,7 @@ void BatchNormOp<xpu, DType, AccReal>::DoBackward(mshadow::Stream<cpu> *,
     // May want to make this a param eventually
     const AccReal scale = 1.0f;
 
-    if (IsWriting(req[batchnorm::kGamma])) {
+    if (IsBNWriting(req[batchnorm::kGamma])) {
       if (!param_.fix_gamma) {
         gradWeightData[channel] = scale * dotp * invstd;
       } else {
@@ -308,51 +312,207 @@ void BatchNormOp<xpu, DType, AccReal>::DoBackward(mshadow::Stream<cpu> *,
       }
     }
 
-    if (IsWriting(req[batchnorm::kBeta])) {
+    if (IsBNWriting(req[batchnorm::kBeta])) {
       gradBiasData[channel] = scale * sumGradOut;
     }
   }
 }
 
-template<>
-Operator *CreateOp<cpu>(BatchNormParam param, const int dtype, const TShape& shape) {
-  param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis);
-  Operator *op = nullptr;
-#if MXNET_USE_MKL2017 == 1
-  if (shape.ndim() == 4
+DMLC_REGISTER_PARAMETER(BatchNormParam);
+
+static bool BatchNormShape(const nnvm::NodeAttrs& attrs,
+                           std::vector<TShape> *in_shape,
+                           std::vector<TShape> *out_shape) {
+  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+  using namespace mshadow;
+  CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, MovingMean, MovingVar]";
+  const TShape &dshape = in_shape->at(batchnorm::kData);
+
+  const size_t channelAxis = static_cast<size_t>(param.axis < 0
+      ? static_cast<int>(dshape.ndim()) + param.axis
+      : param.axis);
+  CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param.axis;
+
+  const int channelCount = dshape[channelAxis];
+
+  if (dshape.ndim() == 0) {
+    return false;
+  }
+
+  in_shape->at(batchnorm::kGamma) = TShape(Shape1(channelCount));
+  in_shape->at(batchnorm::kBeta) = TShape(Shape1(channelCount));
+  in_shape->at(batchnorm::kInMovingMean) = TShape(Shape1(channelCount));  // kMovingMean
+  in_shape->at(batchnorm::kInMovingVar) = TShape(Shape1(channelCount));  // kMovingVar
+
+  out_shape->clear();
+  out_shape->push_back(dshape);                // kOut
+  out_shape->push_back(Shape1(channelCount));  // kMean
+  out_shape->push_back(Shape1(channelCount));  // kVar
+
+  return true;
+}
+
+static bool BatchNormType(const nnvm::NodeAttrs& attrs,
+                          std::vector<int> *in_type, std::vector<int> *out_type) {
+  using namespace mshadow;
+  CHECK_GE(in_type->size(), 1U);
+  const int dtype = (*in_type)[0];
+  CHECK_NE(dtype, -1) << "First input must have specified type";
+  // For float16 input type beta, gamma, mean, and average are stored in float32.
+  // For other input types, these parameters have the same type as input
+  // NOTE: This requirement is from cuDNN (v. 4 and 5)
+  int dtype_param;
+  MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, {
+      dtype_param = mshadow::DataType<AccRealX>::kFlag; });
+  std::vector<std::string> args{"data", "gamma", "beta", "mean", "var"};
+  CHECK_LE(in_type->size(), args.size());
+  for (index_t i = 1; i < in_type->size(); ++i) {
+    if ((*in_type)[i] == -1) {
+      (*in_type)[i] = dtype_param;
+    } else {
+      UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, args[i]);
+    }
+  }
+  const size_t n_out = 3;
+  out_type->clear();
+  out_type->push_back(dtype);
+  for (size_t i = 1; i < n_out; ++i) {
+    out_type->push_back(dtype_param);
+  }
+  return true;
+}
+
+#if MXNET_USE_MKLDNN == 1
+static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam &param) {
+  TShape shape = input.shape();
+  return SupportMKLDNN(input) && shape.ndim() == 4
       && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS
-      && !mxnet::op::batchnorm::disable_mkl) {
-    switch (dtype) {
+      && shape[param.axis] % 8 == 0;
+}
+#endif
+
+void BatchNormComputeCPU(const nnvm::NodeAttrs &attrs,
+                         const OpContext &ctx,
+                         const std::vector<NDArray> &inputs,
+                         const std::vector<OpReqType> &req,
+                         const std::vector<NDArray> &outputs) {
+  CHECK_EQ(inputs.size(), 5U);
+#if MXNET_USE_MKLDNN == 1
+  const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
+  // MKLDNN batchnorm only works well on the special MKLDNN layout.
+  if (SupportMKLDNNBN(inputs[0], param) && inputs[0].IsMKLDNN()) {
+    std::vector<NDArray> in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean);
+    std::vector<NDArray> aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end());
+
+    switch (inputs[0].dtype()) {
       case mshadow::kFloat32:
-        op = new MKLBatchNormOp<cpu, float>(param);
-        break;
-      case mshadow::kFloat64:
-        op = new MKLBatchNormOp<cpu, double>(param);
-        break;
-      default:
-        // MKL operator doesn't support half_t, so fall through
-        break;
+        MKLDNNBatchNormCompute<float>(ctx, param, in_data, req, outputs, aux_states);
+        return;
     }
   }
 #endif
-  if (!op) {
-    MSHADOW_REAL_TYPE_SWITCH_EX(dtype,
-                                DType,
-                                AccReal, {
-                                  op = new BatchNormOp<cpu, DType, AccReal>(param); });
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++) {
+    in_blobs[i] = inputs[i].data();
   }
-  return op;
+
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++) {
+    out_blobs[i] = outputs[i].data();
+  }
+  BatchNormCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
 }
 
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *BatchNormProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                          std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_shape)[0]);
+void BatchNormGradComputeCPU(const nnvm::NodeAttrs &attrs,
+                             const OpContext &ctx,
+                             const std::vector<NDArray> &inputs,
+                             const std::vector<OpReqType> &req,
+                             const std::vector<NDArray> &outputs) {
+  CHECK_EQ(inputs.size(), 11U);
+  const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
+  int num_out_grads = param.output_mean_var ? 3U : 1U;
+  int in_data_start = 3;
+  int aux_states_start = in_data_start + batchnorm::kInMovingMean;
+  int out_data_start = in_data_start + batchnorm::kInMovingVar + 1;
+
+  TShape shape = inputs[0].shape();
+#if MXNET_USE_MKLDNN == 1
+  // MKLDNN batchnorm only works well on the special MKLDNN layout.
+  if (SupportMKLDNNBN(inputs[0], param)
+      && (inputs[in_data_start].IsMKLDNN() || inputs[0].IsMKLDNN())) {
+    std::vector<NDArray> out_grad(inputs.begin(), inputs.begin() + num_out_grads);
+    std::vector<NDArray> in_data(inputs.begin() + in_data_start,
+                                 inputs.begin() + aux_states_start);
+    std::vector<NDArray> aux_states(inputs.begin() + aux_states_start,
+                                    inputs.begin() + out_data_start);
+    std::vector<NDArray> out_data(inputs.begin() + out_data_start, inputs.end());
+    std::vector<NDArray> in_grad(outputs.begin(), outputs.begin() + 3);
+
+    if (inputs[0].dtype() == mshadow::kFloat32) {
+      MKLDNNBatchNormGradCompute<float>(ctx, param, out_grad, in_data,
+                                        out_data, req, in_grad, aux_states);
+      return;
+    }
+  }
+#endif
+  // cast NDArray to TBlob, and call original implementation.
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++) {
+    in_blobs[i] = inputs[i].data();
+  }
+
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++) {
+    out_blobs[i] = outputs[i].data();
+  }
+  BatchNormGradCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
 }
 
-DMLC_REGISTER_PARAMETER(BatchNormParam);
+static inline bool BatchNormStorageType(const nnvm::NodeAttrs &attrs,
+                                        const int dev_mask,
+                                        DispatchMode *dispatch_mode,
+                                        std::vector<int> *in_attrs,
+                                        std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 5);
+  CHECK_EQ(out_attrs->size(), 3);
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask)
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    *dispatch_mode = DispatchMode::kFCompute;
+  for (int& v : *in_attrs) {
+    if (v == - 1) v = kDefaultStorage;
+  }
+  for (size_t i = 0; i < out_attrs->size(); i++) {
+    (*out_attrs)[i] = kDefaultStorage;
+  }
+  return true;
+}
+
+static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs,
+                                                 const int dev_mask,
+                                                 DispatchMode *dispatch_mode,
+                                                 std::vector<int> *in_attrs,
+                                                 std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 11);
+  CHECK_EQ(out_attrs->size(), 5);
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask)
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    *dispatch_mode = DispatchMode::kFCompute;
+  for (int& v : *in_attrs) {
+    if (v == - 1) v = kDefaultStorage;
+  }
+  for (size_t i = 0; i < out_attrs->size(); i++) {
+    (*out_attrs)[i] = kDefaultStorage;
+  }
+  return true;
+}
 
-MXNET_REGISTER_OP_PROPERTY(BatchNorm, BatchNormProp)
+NNVM_REGISTER_OP(BatchNorm)
 .describe(R"code(Batch normalization.
 
 Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as
@@ -398,14 +558,42 @@ Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is tr
 then set ``gamma`` to 1 and its gradient to 0.
 
 )code" ADD_FILELINE)
+.set_num_inputs(5)
+.set_num_outputs(3)
+.set_attr_parser(ParamParser<BatchNormParam>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"data", "gamma", "beta", "moving_mean", "moving_var"};
+})
+.set_attr<nnvm::FListOutputNames>("FListOutputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"output", "mean", "var"};
+})
+.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
+    [](const NodeAttrs& attrs) {
+  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+  return param.output_mean_var ? 3 : 1;
+})
+.set_attr<nnvm::FMutateInputs>("FMutateInputs", [](const nnvm::NodeAttrs& attrs) {
+  return std::vector<uint32_t>{3, 4};
+})
+.set_attr<nnvm::FInferShape>("FInferShape", BatchNormShape)
+.set_attr<nnvm::FInferType>("FInferType", BatchNormType)
+.set_attr<FInferStorageType>("FInferStorageType", BatchNormStorageType)
+.set_attr<FCompute>("FCompute<cpu>", BatchNormCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormComputeCPU)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseInOut{"_backward_BatchNorm"})
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
 .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization")
 .add_argument("gamma", "NDArray-or-Symbol", "gamma array")
 .add_argument("beta", "NDArray-or-Symbol", "beta array")
 .add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input")
 .add_argument("moving_var", "NDArray-or-Symbol", "running variance of input")
-.add_arguments(BatchNormParam::__FIELDS__());
-
-NNVM_REGISTER_OP(BatchNorm)
+.add_arguments(BatchNormParam::__FIELDS__())
 .set_attr<nnvm::FSetInputVarAttrOnCompose>(
   "FSetInputVarAttrOnCompose",
   [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) {
@@ -417,5 +605,18 @@ NNVM_REGISTER_OP(BatchNorm)
     }
   });
 
+NNVM_REGISTER_OP(_backward_BatchNorm)
+.set_num_outputs(5)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FInferStorageType>("FInferStorageType", backward_BatchNormStorageType)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
+.set_attr_parser(ParamParser<BatchNormParam>)
+.set_attr<FCompute>("FCompute<cpu>", BatchNormGradCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormGradComputeCPU);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/batch_norm.cu b/src/operator/nn/batch_norm.cu
index 59317b7fa837..80c15976b65f 100644
--- a/src/operator/nn/batch_norm.cu
+++ b/src/operator/nn/batch_norm.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2017 by Contributors
  * \file batch_norm.cu
  * \brief CUDA Batch Normalization code
- * \author Chris Olivier, Bing Xu
+ * \author Chris Olivier, Bing Xu, Da Zheng
  * Adapted from Torch
 */
 #include <cuda_runtime_api.h>
@@ -579,13 +579,13 @@ static inline uint32_t SetupFlags(const OpContext &ctx,
   flags |= ctx.is_train ? IS_TRAINING_FLAG : 0;
   flags |= params.fix_gamma ? FIX_GAMMA_FLAG : 0;
   flags |= params.use_global_stats ? USE_GLOBAL_STATS_FLAG : 0;
-  if (BatchNormOp<xpu, DType, AccReal>::IsWriting(req[batchnorm::kData])) {
+  if (IsBNWriting(req[batchnorm::kData])) {
     flags |= WRITE_DATA_FLAG;
   }
-  if (BatchNormOp<xpu, DType, AccReal>::IsWriting(req[batchnorm::kGamma])) {
+  if (IsBNWriting(req[batchnorm::kGamma])) {
     flags |= WRITE_GAMMA_FLAG;
   }
-  if (BatchNormOp<xpu, DType, AccReal>::IsWriting(req[batchnorm::kBeta])) {
+  if (IsBNWriting(req[batchnorm::kBeta])) {
     flags |= WRITE_BETA_FLAG;
   }
   return flags;
@@ -593,12 +593,12 @@ static inline uint32_t SetupFlags(const OpContext &ctx,
 
 /*! \brief Forward batch-norm pass on GPU */
 template<typename xpu, typename DType, typename AccReal>
-void BatchNormOp<xpu, DType, AccReal>::DoForward(mshadow::Stream<gpu> *stream,
-                                                 const OpContext &ctx,
-                                                 const std::vector<TBlob> &in_data,
-                                                 const std::vector<OpReqType> &req,
-                                                 const std::vector<TBlob> &out_data,
-                                                 const std::vector<TBlob> &aux_states) {
+void BatchNormForwardImpl(mshadow::Stream<gpu> *stream,
+                          const OpContext &ctx, const BatchNormParam& param_,
+                          const std::vector<TBlob> &in_data,
+                          const std::vector<OpReqType> &req,
+                          const std::vector<TBlob> &out_data,
+                          const std::vector<TBlob> &aux_states) {
   batchnorm::cuda::BatchNormalizationUpdateOutput<DType, AccReal>(
     stream,
     ctx,
@@ -614,14 +614,14 @@ void BatchNormOp<xpu, DType, AccReal>::DoForward(mshadow::Stream<gpu> *stream,
 
 /*! \brief Backward batch-norm pass on GPU */
 template<typename xpu, typename DType, typename AccReal>
-void BatchNormOp<xpu, DType, AccReal>::DoBackward(mshadow::Stream<gpu> *stream,
-                                                  const OpContext &ctx,
-                                                  const std::vector<TBlob> &out_grad,
-                                                  const std::vector<TBlob> &in_data,
-                                                  const std::vector<TBlob> &out_data,
-                                                  const std::vector<OpReqType> &req,
-                                                  const std::vector<TBlob> &in_grad,
-                                                  const std::vector<TBlob> &aux_states) {
+void BatchNormBackwardImpl(mshadow::Stream<gpu> *stream,
+                           const OpContext &ctx, const BatchNormParam& param_,
+                           const std::vector<TBlob> &out_grad,
+                           const std::vector<TBlob> &in_data,
+                           const std::vector<TBlob> &out_data,
+                           const std::vector<OpReqType> &req,
+                           const std::vector<TBlob> &in_grad,
+                           const std::vector<TBlob> &aux_states) {
   batchnorm::cuda::BatchNormalizationBackward<DType, AccReal>(
     stream,
     ctx,
@@ -637,30 +637,92 @@ void BatchNormOp<xpu, DType, AccReal>::DoBackward(mshadow::Stream<gpu> *stream,
   MSHADOW_CUDA_POST_KERNEL_CHECK(BatchNormOp_DoBackward_gpu);
 }
 
-/*! \brief Create GPU operator for batch normalization */
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4
+template<typename DType>
+static CuDNNBatchNormOp<DType> &GetCuDNNOp(const BatchNormParam& param) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local CuDNNBatchNormOp<DType> op;
+#else
+  static MX_THREAD_LOCAL CuDNNBatchNormOp<DType> op;
+#endif
+  op.Init(param);
+  return op;
+}
+#endif
+
 template<>
-Operator *CreateOp<gpu>(BatchNormParam param, const int dtype, const TShape& shape) {
+void BatchNormCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx, const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+  BatchNormParam param = nnvm::get<BatchNormParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 5U);
+  std::vector<TBlob> in_data(inputs.begin(), inputs.begin() + 3);
+  std::vector<TBlob> aux_states(inputs.begin() + 3, inputs.end());
+  int dtype = inputs[0].type_flag_;
+  TShape shape = inputs[0].shape_;
+
   param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis);
-  Operator *op = NULL;
 #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5
   if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4
       && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) {
     MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      op = new CuDNNBatchNormOp<DType>(param);
+      GetCuDNNOp<DType>(param).Forward(ctx, in_data, req, outputs, aux_states);
     })
   } else {
     MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, {
-      op = new BatchNormOp<gpu, DType, AccReal>(param);
+      BatchNormForward<gpu, DType, AccReal>(ctx, param, in_data, req, outputs, aux_states);
     })
   }
 #else
-  MSHADOW_REAL_TYPE_SWITCH_EX(dtype,
-                              DType,
-                              AccReal,
-                              { op = new BatchNormOp<gpu, DType, AccReal>(param); });
+  MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, {
+    BatchNormForward<gpu, DType, AccReal>(ctx, param, in_data, req, outputs, aux_states);
+  });
+#endif
+}
+
+template<>
+void BatchNormGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                               const OpContext& ctx, const std::vector<TBlob>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 11U);
+  BatchNormParam param = nnvm::get<BatchNormParam>(attrs.parsed);
+  std::vector<TBlob> out_grad(1, inputs[0]);
+  std::vector<TBlob> in_data(inputs.begin() + 3, inputs.begin() + 6);
+  std::vector<TBlob> aux_states(inputs.begin() + 6, inputs.begin() + 8);
+  std::vector<TBlob> out_data(inputs.begin() + 8, inputs.end());
+  std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);
+  int dtype = inputs[0].type_flag_;
+  TShape shape = inputs[0].shape_;
+
+  param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis);
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5
+  if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4
+      && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) {
+    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+      GetCuDNNOp<DType>(param).Backward(ctx, out_grad, in_data, out_data,
+        req, in_grad, aux_states);
+    })
+  } else {
+    MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, {
+      BatchNormBackward<gpu, DType, AccReal>(ctx, param, out_grad,
+          in_data, out_data, req, in_grad, aux_states);
+    })
+  }
+#else
+  MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, {
+    BatchNormBackward<gpu, DType, AccReal>(ctx, param, out_grad,
+        in_data, out_data, req, in_grad, aux_states);
+  });
 #endif
-  return op;
 }
 
+NNVM_REGISTER_OP(BatchNorm)
+.set_attr<FCompute>("FCompute<gpu>", BatchNormCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_BatchNorm)
+.set_attr<FCompute>("FCompute<gpu>", BatchNormGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/concat-inl.h b/src/operator/nn/concat-inl.h
new file mode 100644
index 000000000000..f4c9104e181a
--- /dev/null
+++ b/src/operator/nn/concat-inl.h
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file concat-inl.h
+ * \brief
+ * \author Bing Xu
+*/
+#ifndef MXNET_OPERATOR_NN_CONCAT_INL_H_
+#define MXNET_OPERATOR_NN_CONCAT_INL_H_
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "../operator_common.h"
+#include "../channel_op_common.h"
+
+namespace mxnet {
+namespace op {
+
+namespace concat_enum {
+enum ConcatOpInputs {kData0, kData1, kData2, kData3, kData4};
+enum ConcatOpResource {kTempSpace};
+enum ConcatOpOutputs {kOut};
+}  // namespace concat_enum
+
+struct ConcatParam : public dmlc::Parameter<ConcatParam> {
+  int num_args;
+  int dim;
+  DMLC_DECLARE_PARAMETER(ConcatParam) {
+    DMLC_DECLARE_FIELD(num_args).set_lower_bound(1)
+    .describe("Number of inputs to be concated.");
+    DMLC_DECLARE_FIELD(dim).set_range(0,  4).set_default(1)
+    .describe("the dimension to be concated.");
+  }
+};  // struct ConcatParam
+
+template<typename xpu, typename DType>
+class ConcatOp {
+ public:
+  void Init(const ConcatParam &param) {
+    this->size_ = param.num_args;
+    this->dimension_ = param.dim;
+  }
+
+  void Forward(const OpContext &ctx,
+               const std::vector<TBlob> &in_data,
+               const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &out_data) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(static_cast<int>(in_data.size()), size_);
+    CHECK_EQ(out_data.size(), 1U);
+    CHECK_LT(dimension_, in_data[concat_enum::kData0].ndim());
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    std::vector<Tensor<xpu, 3, DType> > data(size_);
+    Tensor<xpu, 3, DType> out;
+    size_t leading = 1, trailing = 1;
+    for (int i = 0; i < dimension_; ++i) {
+      leading *= out_data[concat_enum::kOut].shape_[i];
+    }
+    for (int i = dimension_ + 1; i < out_data[concat_enum::kOut].ndim(); ++i) {
+      trailing *= out_data[concat_enum::kOut].shape_[i];
+    }
+    size_t mid = out_data[concat_enum::kOut].shape_[dimension_];
+    Shape<3> oshape = Shape3(leading, mid, trailing);
+    out = out_data[concat_enum::kOut].get_with_shape<xpu, 3, DType>(oshape, s);
+
+    for (int i = 0; i < size_; ++i) {
+      Shape<3> dshape = Shape3(leading, in_data[i].shape_[dimension_], trailing);
+      data[i] = in_data[i].get_with_shape<xpu, 3, DType>(dshape, s);
+    }
+    Concatenate(data, &out, 1, req[concat_enum::kOut]);
+  }
+
+  void Backward(const OpContext &ctx, const TBlob &out_grad,
+                const std::vector<OpReqType> &req,
+                const std::vector<TBlob> &in_grad) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_grad.size(), static_cast<size_t>(size_));
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    std::vector<Tensor<xpu, 3, DType> > grad_in(size_);
+    Tensor<xpu, 3, DType> grad;
+    size_t leading = 1, trailing = 1;
+    for (int i = 0; i < dimension_; ++i) {
+      leading *= out_grad.shape_[i];
+    }
+    for (int i = dimension_ + 1; i < out_grad.ndim(); ++i) {
+      trailing *= out_grad.shape_[i];
+    }
+    size_t mid = out_grad.shape_[dimension_];
+    Shape<3> oshape = Shape3(leading, mid, trailing);
+    grad = out_grad.get_with_shape<xpu, 3, DType>(oshape, s);
+
+    for (int i = 0; i < size_; ++i) {
+      Shape<3> dshape = Shape3(leading, in_grad[i].shape_[dimension_], trailing);
+      grad_in[i] = in_grad[i].get_with_shape<xpu, 3, DType>(dshape, s);
+    }
+    Split(grad, &grad_in, 1, req);
+  }
+
+ private:
+  int size_;
+  int dimension_;
+};  // class ConcatOp
+
+template<typename xpu>
+void ConcatCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
+                   const std::vector<TBlob>& inputs,
+                   const std::vector<OpReqType>& req,
+                   const std::vector<TBlob>& outputs) {
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  MSHADOW_TYPE_SWITCH(inputs[concat_enum::kData0].type_flag_, DType, {
+    ConcatOp<xpu, DType> op;
+    op.Init(param);
+    op.Forward(ctx, inputs, req, outputs);
+  });
+}
+
+template<typename xpu>
+void ConcatGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
+                       const std::vector<TBlob>& inputs,
+                       const std::vector<OpReqType>& req,
+                       const std::vector<TBlob>& outputs) {
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  MSHADOW_TYPE_SWITCH(inputs[concat_enum::kOut].type_flag_, DType, {
+    ConcatOp<xpu, DType> op;
+    op.Init(param);
+    op.Backward(ctx, inputs[concat_enum::kOut], req, outputs);
+  });
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_NN_CONCAT_INL_H_
diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc
new file mode 100644
index 000000000000..ec89a3aae974
--- /dev/null
+++ b/src/operator/nn/concat.cc
@@ -0,0 +1,287 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file concat.cc
+ * \brief
+ * \author Bing Xu
+*/
+
+#include "./concat-inl.h"
+#include "./mkldnn/mkldnn_ops-inl.h"
+#include "../../common/utils.h"
+
+namespace mxnet {
+namespace op {
+
+static bool ConcatShape(const nnvm::NodeAttrs& attrs,
+                        std::vector<TShape> *in_shape,
+                        std::vector<TShape> *out_shape) {
+  using namespace mshadow;
+  const ConcatParam& param_ = nnvm::get<ConcatParam>(attrs.parsed);
+  CHECK_EQ(in_shape->size(), static_cast<size_t>(param_.num_args));
+  TShape dshape;
+  index_t size = 0;
+  bool has_zero = false;
+  for (int i = 0; i < param_.num_args; ++i) {
+    TShape tmp = (*in_shape)[i];
+    if (tmp.ndim()) {
+      CHECK_LT(static_cast<index_t>(param_.dim), tmp.ndim())
+          << "concat dim " << param_.dim << " out of range of input shape " << tmp;
+      has_zero = tmp[param_.dim] == 0 || has_zero;
+      size += tmp[param_.dim];
+      tmp[param_.dim] = 0;
+      shape_assign(&dshape, tmp);
+    }
+  }
+
+  TShape tmp = (*out_shape)[0];
+  if (tmp.ndim()) {
+    CHECK_LT(static_cast<index_t>(param_.dim), tmp.ndim())
+        << "concat dim " << param_.dim << " out of range of input shape " << tmp;
+    tmp[param_.dim] = 0;
+    shape_assign(&dshape, tmp);
+  }
+
+  if (dshape.ndim() == 0) return false;
+
+  for (int i = 0; i < param_.num_args; ++i) {
+    CHECK(shape_assign(&(*in_shape)[i], dshape))
+        << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i];
+  }
+
+  if (!has_zero) dshape[param_.dim] = size;
+  CHECK(shape_assign(&(*out_shape)[0], dshape))
+      << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0];
+
+  return dshape.Size() != 0;
+}
+
+static bool ConcatType(const nnvm::NodeAttrs& attrs,
+                       std::vector<int> *in_type,
+                       std::vector<int> *out_type) {
+  const ConcatParam& param_ = nnvm::get<ConcatParam>(attrs.parsed);
+  int dtype = -1;
+
+  for (size_t i = 0; i < in_type->size(); ++i) {
+    if (dtype == -1) {
+      dtype = in_type->at(i);
+    } else {
+      CHECK(in_type->at(i) == dtype ||
+            in_type->at(i) == -1) <<
+          "Non-uniform data type in Concat";
+    }
+  }
+
+  if (dtype == -1) {
+    LOG(FATAL) << "Not enough information to infer type in Concat.";
+    return false;
+  }
+
+  size_t nin = param_.num_args;
+  in_type->clear();
+  for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype);
+
+  out_type->clear();
+  out_type->push_back(dtype);
+
+  return true;
+}
+
+inline static bool ConcatForwardInferStorageType(const nnvm::NodeAttrs& attrs,
+                                 const int dev_mask,
+                                 DispatchMode* dispatch_mode,
+                                 std::vector<int> *in_attrs,
+                                 std::vector<int> *out_attrs) {
+  CHECK(!in_attrs->empty());
+  CHECK_EQ(out_attrs->size(), 1U);
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask
+      && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage))
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    *dispatch_mode = DispatchMode::kFCompute;
+  (*out_attrs)[0] = kDefaultStorage;
+  return true;
+}
+
+inline static bool BackwardConcatStorageType(const nnvm::NodeAttrs& attrs,
+                                             const int dev_mask,
+                                             DispatchMode* dispatch_mode,
+                                             std::vector<int> *in_attrs,
+                                             std::vector<int> *out_attrs) {
+#if MXNET_USE_MKLDNN == 1
+  CHECK_EQ(out_attrs->size(), in_attrs->size() - 1);
+  if (dev_mask == mshadow::cpu::kDevMask
+      && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage))
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    *dispatch_mode = DispatchMode::kFCompute;
+  for (size_t i = 0; i < out_attrs->size(); i++)
+    (*out_attrs)[i] = kDefaultStorage;
+  return true;
+}
+
+void ConcatComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                const OpContext& op_ctx,
+                                const std::vector<NDArray>& inputs,
+                                const std::vector<OpReqType>& req,
+                                const std::vector<NDArray>& outputs) {
+  CHECK(!inputs.empty());
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  if (req[0] == kNullOp) return;
+#if MXNET_USE_MKLDNN == 1
+  // MKLDNN support 2D and 4D concat
+  if ((inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4)
+      && inputs[0].dtype() == mshadow::kFloat32) {
+    MKLDNNConcatForward(attrs, op_ctx, inputs, req, outputs);
+  } else {
+    std::vector<TBlob> in_blobs(inputs.size());
+    for (size_t i = 0; i < in_blobs.size(); i++)
+      in_blobs[i] = inputs[i].data();
+    std::vector<TBlob> out_blobs(outputs.size());
+    for (size_t i = 0; i < out_blobs.size(); i++)
+      out_blobs[i] = outputs[i].data();
+    ConcatCompute<cpu>(attrs, op_ctx, in_blobs, req, out_blobs);
+  }
+#endif
+}
+
+static void ConcatGradComputeExCPU(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<NDArray>& inputs,
+    const std::vector<OpReqType>& req, const std::vector<NDArray>& outputs) {
+#if MXNET_USE_MKLDNN == 1
+  if ((inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4)
+      && inputs[0].dtype() == mshadow::kFloat32) {
+    MKLDNNConcatBackward(attrs, ctx, inputs, req, outputs);
+  } else {
+    std::vector<TBlob> in_blobs(1);
+    in_blobs[0] = inputs[0].data();
+    std::vector<TBlob> out_blobs(outputs.size());
+    for (size_t i = 0; i < out_blobs.size(); i++)
+      out_blobs[i] = outputs[i].data();
+    ConcatGradCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
+  }
+#endif
+}
+
+struct ConcatGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    CHECK_EQ(ograds.size(), 1);
+    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
+#if MXNET_USE_MKLDNN == 1
+    for (size_t i = 0; i < n->inputs.size(); i++) {
+      heads.push_back(n->inputs[i]);
+    }
+#endif
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
+
+DMLC_REGISTER_PARAMETER(ConcatParam);
+
+NNVM_REGISTER_OP(Concat)
+.describe(R"code(Joins input arrays along a given axis.
+
+.. note:: `Concat` is deprecated. Use `concat` instead.
+
+The dimensions of the input arrays should be the same except the axis along
+which they will be concatenated.
+The dimension of the output array along the concatenated axis will be equal
+to the sum of the corresponding dimensions of the input arrays.
+
+Example::
+
+   x = [[1,1],[2,2]]
+   y = [[3,3],[4,4],[5,5]]
+   z = [[6,6], [7,7],[8,8]]
+
+   concat(x,y,z,dim=0) = [[ 1.,  1.],
+                          [ 2.,  2.],
+                          [ 3.,  3.],
+                          [ 4.,  4.],
+                          [ 5.,  5.],
+                          [ 6.,  6.],
+                          [ 7.,  7.],
+                          [ 8.,  8.]]
+
+   Note that you cannot concat x,y,z along dimension 1 since dimension
+   0 is not the same for all the input arrays.
+
+   concat(y,z,dim=1) = [[ 3.,  3.,  6.,  6.],
+                         [ 4.,  4.,  7.,  7.],
+                         [ 5.,  5.,  8.,  8.]]
+
+)code" ADD_FILELINE)
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const ConcatParam& params = nnvm::get<ConcatParam>(attrs.parsed);
+  return params.num_args;
+})
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<ConcatParam>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  const ConcatParam& params = nnvm::get<ConcatParam>(attrs.parsed);
+  std::vector<std::string> ret;
+  for (int i = 0; i < params.num_args; ++i) {
+    ret.push_back(std::string("arg") + std::to_string(i));
+  }
+  return ret;
+})
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
+.set_attr<nnvm::FInferShape>("FInferShape", ConcatShape)
+.set_attr<nnvm::FInferType>("FInferType", ConcatType)
+.set_attr<FInferStorageType>("FInferStorageType", ConcatForwardInferStorageType)
+.set_attr<FCompute>("FCompute<cpu>", ConcatCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", ConcatComputeExCPU)
+.set_attr<nnvm::FGradient>("FGradient", ConcatGrad{"_backward_Concat"})
+.set_attr<std::string>("key_var_num_args", "num_args")
+.add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate")
+.add_arguments(ConcatParam::__FIELDS__());
+
+NNVM_REGISTER_OP(Concat).add_alias("concat");
+
+NNVM_REGISTER_OP(_backward_Concat)
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const ConcatParam& params = nnvm::get<ConcatParam>(attrs.parsed);
+  return params.num_args;
+})
+.set_attr_parser(ParamParser<ConcatParam>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FInferStorageType>("FInferStorageType", BackwardConcatStorageType)
+.set_attr<FCompute>("FCompute<cpu>", ConcatGradCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", ConcatGradComputeExCPU);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/concat.cu b/src/operator/nn/concat.cu
similarity index 81%
rename from src/operator/concat.cu
rename to src/operator/nn/concat.cu
index 394fa736ee84..f6bf5ece5c78 100644
--- a/src/operator/concat.cu
+++ b/src/operator/nn/concat.cu
@@ -28,14 +28,12 @@
 
 namespace mxnet {
 namespace op {
-template<>
-Operator* CreateOp<gpu>(ConcatParam param, int dtype, std::vector<TShape> *in_shape) {
-  Operator *op = NULL;
-  MSHADOW_TYPE_SWITCH(dtype, DType, {
-    op = new ConcatOp<gpu, DType>(param);
-  });
-  return op;
-}
+
+NNVM_REGISTER_OP(Concat)
+.set_attr<FCompute>("FCompute<gpu>", ConcatCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_Concat)
+.set_attr<FCompute>("FCompute<gpu>", ConcatGradCompute<gpu>);
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h
index 38971aefa2d3..bc54996c3ea4 100644
--- a/src/operator/nn/convolution-inl.h
+++ b/src/operator/nn/convolution-inl.h
@@ -22,7 +22,7 @@
  * \file convolution-inl.h
  * \brief
  * \ref: https://github.com/Yangqing/caffe/wiki/Convolution-in-Caffe:-a-memo
- * \author Bing Xu, Jun Wu
+ * \author Bing Xu, Jun Wu, Da Zheng
 */
 #ifndef MXNET_OPERATOR_NN_CONVOLUTION_INL_H_
 #define MXNET_OPERATOR_NN_CONVOLUTION_INL_H_
@@ -148,9 +148,9 @@ namespace mxnet {
 namespace op {
 
 template<typename xpu, typename DType>
-class ConvolutionOp : public Operator {
+class ConvolutionOp {
  public:
-  explicit ConvolutionOp(ConvolutionParam p) {
+  void Init(ConvolutionParam p) {
     this->param_ = p;
     // convert MBytes first to Bytes and then to elements.
     param_.workspace = (param_.workspace << 20) / sizeof(DType);
@@ -160,11 +160,10 @@ class ConvolutionOp : public Operator {
       << "Only support NCW, NCHW and NCDHW layout";
   }
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
+  void Forward(const OpContext &ctx,
+               const std::vector<TBlob> &in_data,
+               const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &out_data) {
     using namespace mshadow;
     using namespace mshadow::expr;
     CHECK_EQ(req[conv::kOut], kWriteTo);
@@ -233,18 +232,19 @@ class ConvolutionOp : public Operator {
     }
   }
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob>& out_grad,
-                        const std::vector<TBlob>& in_data,
-                        const std::vector<TBlob>& out_data,
-                        const std::vector<OpReqType>& req,
-                        const std::vector<TBlob>& in_grad,
-                        const std::vector<TBlob>& aux_args) {
+  void Backward(const OpContext &ctx,
+                const std::vector<TBlob>& out_grad,
+                const std::vector<TBlob>& in_data,
+                const std::vector<OpReqType>& req,
+                const std::vector<TBlob>& in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
     CHECK_EQ(out_grad.size(), 1U);
+    // We expect 2 inputs: in data and weight. We don't need bias for
+    // computing gradient.
     size_t expected = param_.no_bias == 0 ? 3 : 2;
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(in_data.size(), expected);
+    CHECK_EQ(in_grad.size(), expected);
     CHECK_EQ(req.size(), expected);
     CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true);
     LayerSetUp(in_grad[conv::kData].shape_, out_grad[conv::kOut].shape_);
@@ -386,299 +386,35 @@ class ConvolutionOp : public Operator {
 };  // class ConvolutionOp
 
 template<typename xpu>
-Operator* CreateOp(ConvolutionParam param, int dtype,
-                   std::vector<TShape> *in_shape,
-                   std::vector<TShape> *out_shape,
-                   Context ctx);
-
-#if DMLC_USE_CXX11
-class ConvolutionProp : public OperatorProperty {
- public:
-  std::vector<std::string> ListArguments() const override {
-    if (!param_.no_bias) {
-      return {"data", "weight", "bias"};
-    } else {
-      return {"data", "weight"};
-    }
-  }
-
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    using namespace mshadow;
-    param_.Init(kwargs);
-    if (param_.kernel.ndim() == 1) {
-      param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW;
-      if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
-      if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
-    } else if (param_.kernel.ndim() == 2) {
-      param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW;
-      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
-      if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
-    } else {
-      CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D convolution not supported";
-      param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW;
-      if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
-      if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
-    }
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    if (!param_.no_bias) {
-      CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
-    } else {
-      CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
-    }
-    // CHECK_EQ(out_shape->size(), 1) << "Output: [output]";
-    out_shape->resize(1, TShape());
-    const TShape &dshp = (*in_shape)[conv::kData];
-    if (dshp.ndim() ==  0) return false;
-
-    if (param_.kernel.ndim() == 1) {
-      // 1d conv
-      CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x";
-      Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW);
-      Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group,
-                               param_.kernel[0]);
-      wshape = ConvertLayout(wshape, kNCW, param_.layout.value());
-      wshape[0] *= param_.num_group;
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
-      if (!param_.no_bias) {
-        SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
-      }
-
-      const index_t dilated_ksize_x = param_.DilatedKernelSize(0);
-      CHECK_EQ(dshape[1] % param_.num_group, 0U) \
-          << "input num_filter must divide group size";
-      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
-          << "output num_filter must divide group size";
-      CHECK_GT(param_.kernel.Size(), 0U) \
-          << "incorrect kernel size: " << param_.kernel;
-      CHECK_GT(param_.stride.Size(), 0U) \
-          << "incorrect stride size: " << param_.stride;
-      CHECK_GT(param_.dilate.Size(), 0U) \
-          << "incorrect dilate size: " << param_.dilate;
-      Shape<3> oshape;
-      oshape[0] = dshape[0];
-      oshape[1] = param_.num_filter;
-      oshape[2] = dshape[2] ?
-          (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0;
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value()));
-      // Perform incomplete shape inference. Fill in the missing values in data shape.
-      // 1) We can always fill in the batch_size.
-      // 2) We can back-calculate the input height/width if the corresponding stride is 1.
-      oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW);
-      dshape[0] = oshape[0];
-      if (oshape[2] && param_.stride[0] == 1) {
-        dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0];
-      }
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
-                          ConvertLayout(dshape, kNCW, param_.layout.value()));
-      // Check whether the kernel sizes are valid
-      if (dshape[2] != 0) {
-        CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
-      }
-      return true;
-    } else if (param_.kernel.ndim() == 2) {
-      // 2d conv
-      CHECK_EQ(dshp.ndim(), 4U) \
-          << "Input data should be 4D in batch-num_filter-y-x";
-      Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW);
-      Shape<4> wshape = Shape4(param_.num_filter / param_.num_group,
-                               dshape[1] / param_.num_group,
-                               param_.kernel[0], param_.kernel[1]);
-      wshape = ConvertLayout(wshape, kNCHW, param_.layout.value());
-      wshape[0] *= param_.num_group;
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
-      if (!param_.no_bias) {
-        SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
-      }
-
-      const index_t dilated_ksize_y = param_.DilatedKernelSize(0);
-      const index_t dilated_ksize_x = param_.DilatedKernelSize(1);
-      CHECK_EQ(dshape[1] % param_.num_group, 0U) \
-          << "input num_filter must divide group size";
-      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
-          << "output num_filter must divide group size";
-      CHECK_GT(param_.kernel.Size(), 0U) \
-          << "incorrect kernel size: " << param_.kernel;
-      CHECK_GT(param_.stride.Size(), 0U) \
-          << "incorrect stride size: " << param_.stride;
-      CHECK_GT(param_.dilate.Size(), 0U) \
-          << "incorrect dilate size: " << param_.dilate;
-      Shape<4> oshape;
-      oshape[0] = dshape[0];
-      oshape[1] = param_.num_filter;
-      oshape[2] = dshape[2] ?
-        (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0;
-      oshape[3] = dshape[3] ?
-        (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0;
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value()));
-      // Perform incomplete shape inference. Fill in the missing values in data shape.
-      // 1) We can always fill in the batch_size.
-      // 2) We can back-calculate the input height/width if the corresponding stride is 1.
-      oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW);
-      dshape[0] = oshape[0];
-      if (oshape[2] && param_.stride[0] == 1) {
-        dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0];
-      }
-      if (oshape[3] && param_.stride[1] == 1) {
-        dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1];
-      }
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
-                          ConvertLayout(dshape, kNCHW, param_.layout.value()));
-      // Check whether the kernel sizes are valid
-      if (dshape[2] != 0) {
-        CHECK_LE(dilated_ksize_y, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
-      }
-      if (dshape[3] != 0) {
-        CHECK_LE(dilated_ksize_x, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input";
-      }
-      return true;
-    } else if (param_.kernel.ndim() == 3) {
-      // 3d conv
-      CHECK_EQ(dshp.ndim(), 5U) \
-        << "Input data should be 5D in batch-num_filter-depth-y-x";
-      Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW);
-      Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group,
-                               param_.kernel[0], param_.kernel[1], param_.kernel[2]);
-      wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value());
-      wshape[0] *= param_.num_group;
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
-      if (!param_.no_bias) {
-        SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
-      }
-
-      // Note: 3D dilation currently not supported.
-      // Calculations below done to preserve symmetry with 1D/2D code.
-      const index_t dilated_ksize_d = param_.DilatedKernelSize(0);
-      const index_t dilated_ksize_y = param_.DilatedKernelSize(1);
-      const index_t dilated_ksize_x = param_.DilatedKernelSize(2);
-      CHECK_EQ(dshape[1] % param_.num_group, 0U)
-        << "input num_filter must divide group size";
-      CHECK_EQ(param_.num_filter % param_.num_group, 0U)
-        << "output num_filter must divide group size";
-      CHECK_GT(param_.kernel.Size(), 0U) \
-        << "incorrect kernel size: " << param_.kernel;
-      CHECK_GT(param_.stride.Size(), 0U) \
-        << "incorrect stride size: " << param_.stride;
-      CHECK_GT(param_.dilate.Size(), 0U) \
-        << "incorrect dilate size: " << param_.dilate;
-      CHECK_EQ(param_.dilate.Size(), 1U)
-        << "Dilate is not supported in 3d convolution";
-      Shape<5> oshape;
-      oshape[0] = dshape[0];
-      oshape[1] = param_.num_filter;
-      oshape[2] = dshape[2] ?
-        (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0;
-      oshape[3] = dshape[3] ?
-        (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0;
-      oshape[4] = dshape[4] ?
-        (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0;
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value()));
-      // Perform incomplete shape inference. Fill in the missing values in data shape.
-      // 1) We can always fill in the batch_size.
-      // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1.
-      oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW);
-      dshape[0] = oshape[0];
-      if (oshape[2] && param_.stride[0] == 1) {
-        dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0];
-      }
-      if (oshape[3] && param_.stride[1] == 1) {
-        dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1];
-      }
-      if (oshape[4] && param_.stride[2] == 1) {
-        dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2];
-      }
-      SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
-                          ConvertLayout(dshape, kNCDHW, param_.layout.value()));
-      // Check whether the kernel sizes are valid
-      if (dshape[2] != 0) {
-        CHECK_LE(dilated_ksize_d, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
-      }
-      if (dshape[3] != 0) {
-        CHECK_LE(dilated_ksize_y, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input";
-      }
-      if (dshape[4] != 0) {
-        CHECK_LE(dilated_ksize_x, AddPad(dshape[4], param_.pad[2])) << "kernel size exceed input";
-      }
-      return true;
-    } else {
-      LOG(FATAL) << "Unknown convolution type";
-      return false;
-    }
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_GE(in_type->size(), 1U);
-    int dtype = (*in_type)[0];
-    CHECK_NE(dtype, -1) << "First input must have specified type";
-    for (index_t i = 0; i < in_type->size(); ++i) {
-      if ((*in_type)[i] == -1) {
-        (*in_type)[i] = dtype;
-      } else {
-        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
-      }
-    }
-    out_type->clear();
-    out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new ConvolutionProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "Convolution";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[conv::kOut], in_data[conv::kData], in_data[conv::kWeight]};
-  }
-
-  std::vector<ResourceRequest> ForwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    return {ResourceRequest::kTempSpace};
-  }
-
-  std::vector<ResourceRequest> BackwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    return {ResourceRequest::kTempSpace};
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return NULL;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
+void ConvolutionCompute(const nnvm::NodeAttrs& attrs,
+                        const OpContext& ctx, const std::vector<TBlob>& inputs,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<TBlob>& outputs) {
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  MSHADOW_REAL_TYPE_SWITCH(inputs[conv::kData].type_flag_, DType, {
+    ConvolutionOp<xpu, DType> op;
+    op.Init(param);
+    op.Forward(ctx, inputs, req, outputs);
+  });
+}
 
- private:
-  // Adds symmetric padding to a data input (in one dimension)
-  index_t AddPad(index_t dsize, index_t pad) const {
-    return dsize + 2 * pad;
-  }
+template<typename xpu>
+void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx, const std::vector<TBlob>& inputs,
+                            const std::vector<OpReqType>& req,
+                            const std::vector<TBlob>& outputs) {
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
+  const TBlob &out_grad = inputs[0];
+  const std::vector<TBlob> &in_grad = outputs;
+
+  MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, {
+    ConvolutionOp<xpu, DType> op;
+    op.Init(param);
+    op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+  });
+}
 
-  ConvolutionParam param_;
-};  // class ConvolutionProp
-#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_CONVOLUTION_INL_H_
diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc
index ef8ec9034db2..d5781c063f3b 100644
--- a/src/operator/nn/convolution.cc
+++ b/src/operator/nn/convolution.cc
@@ -21,15 +21,13 @@
  * Copyright (c) 2017 by Contributors
  * \file convolution.cc
  * \brief
- * \author Bing Xu, Jun Wu
+ * \author Bing Xu, Jun Wu, Da Zheng
 */
 
 #include "./convolution-inl.h"
-#if MXNET_USE_MKL2017 == 1
-#include <mkl_memory.h>
-#include "../mkl/mkl_memory-inl.h"
-#include "../mkl/mkl_convolution-inl.h"
-#endif  // MXNET_USE_MKL2017
+#include "../elemwise_op_common.h"
+#include "./mkldnn/mkldnn_ops-inl.h"
+#include "./mkldnn/mkldnn_base-inl.h"
 #if MXNET_USE_NNPACK == 1
 #include "./nnpack/nnpack_convolution-inl.h"
 #endif  // MXNET_USE_NNPACK
@@ -38,63 +36,356 @@ namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(ConvolutionParam);
 
-template<>
-Operator* CreateOp<cpu>(ConvolutionParam param, int dtype,
-                        std::vector<TShape> *in_shape,
-                        std::vector<TShape> *out_shape,
-                        Context ctx) {
-  Operator *op = NULL;
-  // If 1D convolution, use MXNet implementation
-  if (param.kernel.ndim() == 1) {
-    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      op = new ConvolutionOp<cpu, DType>(param);
-    })
-    return op;
+static inline index_t AddPad(index_t dsize, index_t pad) {
+  return dsize + 2 * pad;
+}
+
+static inline std::vector<std::string> ListArguments(const ConvolutionParam& param_) {
+  if (!param_.no_bias) {
+    return {"data", "weight", "bias"};
+  } else {
+    return {"data", "weight"};
   }
-#if MXNET_USE_MKL2017 == 1
-  if ((param.dilate[0] == 1 && param.dilate[1] == 1)
-      && param.kernel.ndim() == 2) {
-    switch (dtype) {
-    case mshadow::kFloat32:
-      return new MKLConvolutionOp<cpu, float>(param);
-    case mshadow::kFloat64:
-      return new MKLConvolutionOp<cpu, double>(param);
-    default:
-      break;
-    }
+}
+
+static void ConvolutionCompute_CPU(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<NDArray>& inputs,
+    const std::vector<OpReqType>& req, const std::vector<NDArray>& outputs) {
+#if MXNET_USE_MKLDNN == 1
+  if (SupportMKLDNNConv(inputs[0])) {
+    MKLDNNConvolutionForward(attrs, ctx, inputs, req, outputs);
+    return;
   }
 #endif
-#if MXNET_USE_NNPACK == 1
-  const size_t batch_size = (*in_shape)[0][0];
-  if ((param.dilate[0] == 1 && param.dilate[1] == 1)
-      && param.kernel.ndim() == 2 && (!param.no_bias)
-      && param.num_group == 1 && (batch_size == 1 ||
-      ((batch_size > 1) && (param.stride[0] == 1) &&
-      (param.stride[1] == 1)))) {
-    switch (dtype) {
-    case mshadow::kFloat32:
-      return new NNPACKConvolutionOp<cpu, float>(param);
-    default:
-      break;
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++)
+    in_blobs[i] = inputs[i].data();
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++)
+    out_blobs[i] = outputs[i].data();
+  ConvolutionCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
+}
+
+static void ConvolutionGradCompute_CPU(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<NDArray>& inputs,
+    const std::vector<OpReqType>& req, const std::vector<NDArray>& outputs) {
+#if MXNET_USE_MKLDNN == 1
+  if (SupportMKLDNNConv(inputs[0])) {
+    MKLDNNConvolutionBackward(attrs, ctx, inputs, req, outputs);
+    return;
+  }
+#endif
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++)
+    in_blobs[i] = inputs[i].data();
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++)
+    out_blobs[i] = outputs[i].data();
+  ConvolutionGradCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
+}
+
+static bool ConvolutionShape(const nnvm::NodeAttrs& attrs,
+                             std::vector<TShape> *in_shape,
+                             std::vector<TShape> *out_shape) {
+  using namespace mshadow;
+  const ConvolutionParam& param_ = nnvm::get<ConvolutionParam>(attrs.parsed);
+  if (!param_.no_bias) {
+    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
+  } else {
+    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
+  }
+  // CHECK_EQ(out_shape->size(), 1) << "Output: [output]";
+  out_shape->resize(1, TShape());
+  const TShape &dshp = (*in_shape)[conv::kData];
+  if (dshp.ndim() ==  0) return false;
+
+  if (param_.kernel.ndim() == 1) {
+    // 1d conv
+    CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x";
+    Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW);
+    Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group,
+        param_.kernel[0]);
+    wshape = ConvertLayout(wshape, kNCW, param_.layout.value());
+    wshape[0] *= param_.num_group;
+    SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
+    }
+
+    const index_t dilated_ksize_x = param_.DilatedKernelSize(0);
+    CHECK_EQ(dshape[1] % param_.num_group, 0U) \
+      << "input num_filter must divide group size";
+    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+      << "output num_filter must divide group size";
+    CHECK_GT(param_.kernel.Size(), 0U) \
+      << "incorrect kernel size: " << param_.kernel;
+    CHECK_GT(param_.stride.Size(), 0U) \
+      << "incorrect stride size: " << param_.stride;
+    CHECK_GT(param_.dilate.Size(), 0U) \
+      << "incorrect dilate size: " << param_.dilate;
+    Shape<3> oshape;
+    oshape[0] = dshape[0];
+    oshape[1] = param_.num_filter;
+    oshape[2] = dshape[2] ?
+      (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0;
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value()));
+    // Perform incomplete shape inference. Fill in the missing values in data shape.
+    // 1) We can always fill in the batch_size.
+    // 2) We can back-calculate the input height/width if the corresponding stride is 1.
+    oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW);
+    dshape[0] = oshape[0];
+    if (oshape[2] && param_.stride[0] == 1) {
+      dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0];
+    }
+    SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
+        ConvertLayout(dshape, kNCW, param_.layout.value()));
+    // Check whether the kernel sizes are valid
+    if (dshape[2] != 0) {
+      CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
+    }
+    return true;
+  } else if (param_.kernel.ndim() == 2) {
+    // 2d conv
+    CHECK_EQ(dshp.ndim(), 4U) \
+      << "Input data should be 4D in batch-num_filter-y-x";
+    Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW);
+    Shape<4> wshape = Shape4(param_.num_filter / param_.num_group,
+        dshape[1] / param_.num_group,
+        param_.kernel[0], param_.kernel[1]);
+    wshape = ConvertLayout(wshape, kNCHW, param_.layout.value());
+    wshape[0] *= param_.num_group;
+    SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
+    }
+
+    const index_t dilated_ksize_y = param_.DilatedKernelSize(0);
+    const index_t dilated_ksize_x = param_.DilatedKernelSize(1);
+    CHECK_EQ(dshape[1] % param_.num_group, 0U) \
+      << "input num_filter must divide group size";
+    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+      << "output num_filter must divide group size";
+    CHECK_GT(param_.kernel.Size(), 0U) \
+      << "incorrect kernel size: " << param_.kernel;
+    CHECK_GT(param_.stride.Size(), 0U) \
+      << "incorrect stride size: " << param_.stride;
+    CHECK_GT(param_.dilate.Size(), 0U) \
+      << "incorrect dilate size: " << param_.dilate;
+    Shape<4> oshape;
+    oshape[0] = dshape[0];
+    oshape[1] = param_.num_filter;
+    oshape[2] = dshape[2] ?
+      (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0;
+    oshape[3] = dshape[3] ?
+      (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0;
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value()));
+    // Perform incomplete shape inference. Fill in the missing values in data shape.
+    // 1) We can always fill in the batch_size.
+    // 2) We can back-calculate the input height/width if the corresponding stride is 1.
+    oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW);
+    dshape[0] = oshape[0];
+    if (oshape[2] && param_.stride[0] == 1) {
+      dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0];
+    }
+    if (oshape[3] && param_.stride[1] == 1) {
+      dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1];
+    }
+    SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
+        ConvertLayout(dshape, kNCHW, param_.layout.value()));
+    // Check whether the kernel sizes are valid
+    if (dshape[2] != 0) {
+      CHECK_LE(dilated_ksize_y, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
     }
+    if (dshape[3] != 0) {
+      CHECK_LE(dilated_ksize_x, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input";
+    }
+    return true;
+  } else if (param_.kernel.ndim() == 3) {
+    // 3d conv
+    CHECK_EQ(dshp.ndim(), 5U) \
+      << "Input data should be 5D in batch-num_filter-depth-y-x";
+    Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW);
+    Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group,
+        param_.kernel[0], param_.kernel[1], param_.kernel[2]);
+    wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value());
+    wshape[0] *= param_.num_group;
+    SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape);
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter));
+    }
+
+    // Note: 3D dilation currently not supported.
+    // Calculations below done to preserve symmetry with 1D/2D code.
+    const index_t dilated_ksize_d = param_.DilatedKernelSize(0);
+    const index_t dilated_ksize_y = param_.DilatedKernelSize(1);
+    const index_t dilated_ksize_x = param_.DilatedKernelSize(2);
+    CHECK_EQ(dshape[1] % param_.num_group, 0U)
+      << "input num_filter must divide group size";
+    CHECK_EQ(param_.num_filter % param_.num_group, 0U)
+      << "output num_filter must divide group size";
+    CHECK_GT(param_.kernel.Size(), 0U) \
+      << "incorrect kernel size: " << param_.kernel;
+    CHECK_GT(param_.stride.Size(), 0U) \
+      << "incorrect stride size: " << param_.stride;
+    CHECK_GT(param_.dilate.Size(), 0U) \
+      << "incorrect dilate size: " << param_.dilate;
+    CHECK_EQ(param_.dilate.Size(), 1U)
+      << "Dilate is not supported in 3d convolution";
+    Shape<5> oshape;
+    oshape[0] = dshape[0];
+    oshape[1] = param_.num_filter;
+    oshape[2] = dshape[2] ?
+      (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0;
+    oshape[3] = dshape[3] ?
+      (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0;
+    oshape[4] = dshape[4] ?
+      (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0;
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value()));
+    // Perform incomplete shape inference. Fill in the missing values in data shape.
+    // 1) We can always fill in the batch_size.
+    // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1.
+    oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW);
+    dshape[0] = oshape[0];
+    if (oshape[2] && param_.stride[0] == 1) {
+      dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0];
+    }
+    if (oshape[3] && param_.stride[1] == 1) {
+      dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1];
+    }
+    if (oshape[4] && param_.stride[2] == 1) {
+      dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2];
+    }
+    SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
+        ConvertLayout(dshape, kNCDHW, param_.layout.value()));
+    // Check whether the kernel sizes are valid
+    if (dshape[2] != 0) {
+      CHECK_LE(dilated_ksize_d, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input";
+    }
+    if (dshape[3] != 0) {
+      CHECK_LE(dilated_ksize_y, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input";
+    }
+    if (dshape[4] != 0) {
+      CHECK_LE(dilated_ksize_x, AddPad(dshape[4], param_.pad[2])) << "kernel size exceed input";
+    }
+    return true;
+  } else {
+    LOG(FATAL) << "Unknown convolution type";
+    return false;
   }
+}
+
+static bool ConvolutionType(const nnvm::NodeAttrs& attrs,
+                            std::vector<int> *in_type, std::vector<int> *out_type) {
+  const ConvolutionParam& param_ = nnvm::get<ConvolutionParam>(attrs.parsed);
+  CHECK_GE(in_type->size(), 1U);
+  int dtype = (*in_type)[0];
+  CHECK_NE(dtype, -1) << "First input must have specified type";
+  for (index_t i = 0; i < in_type->size(); ++i) {
+    if ((*in_type)[i] == -1) {
+      (*in_type)[i] = dtype;
+    } else {
+      UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]);
+    }
+  }
+  out_type->clear();
+  out_type->push_back(dtype);
+  return true;
+}
+
+inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs,
+                                 const int dev_mask,
+                                 DispatchMode* dispatch_mode,
+                                 std::vector<int> *in_attrs,
+                                 std::vector<int> *out_attrs) {
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  uint32_t in_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(in_attrs->size(), in_expected);
+  CHECK_EQ(out_attrs->size(), 1);
+
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask)
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    *dispatch_mode = DispatchMode::kFCompute;
+  (*out_attrs)[0] = kDefaultStorage;
+  return true;
+}
+
+inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs,
+                                           const int dev_mask,
+                                           DispatchMode* dispatch_mode,
+                                           std::vector<int> *in_attrs,
+                                           std::vector<int> *out_attrs) {
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  uint32_t in_expected = param.no_bias ? 3 : 4;
+  uint32_t out_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(in_attrs->size(), in_expected);
+  CHECK_EQ(out_attrs->size(), out_expected);
+
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask)
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  else
 #endif
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new ConvolutionOp<cpu, DType>(param);
-  })
-  return op;
+    *dispatch_mode = DispatchMode::kFCompute;
+  for (size_t i = 0; i < out_attrs->size(); i++)
+    (*out_attrs)[i] = kDefaultStorage;
+  return true;
 }
 
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *ConvolutionProp::CreateOperatorEx(Context ctx,
-                                            std::vector<TShape> *in_shape,
-                                            std::vector<int> *in_type) const {
-  std::vector<TShape> out_shape, aux_shape;
-  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx);
+static void ConvolutionParamParser(nnvm::NodeAttrs* attrs) {
+  using namespace mshadow;
+  ConvolutionParam param_;
+  try {
+    param_.Init(attrs->dict);
+  } catch (const dmlc::ParamError& e) {
+    std::ostringstream os;
+    os << e.what();
+    os << ", in operator " << attrs->op->name << "("
+       << "name=\"" << attrs->name << "\"";
+    for (const auto& k : attrs->dict) {
+      os << ", " << k.first << "=\"" << k.second << "\"";
+    }
+    os << ")";
+    throw dmlc::ParamError(os.str());
+  }
+
+  if (param_.kernel.ndim() == 1) {
+    param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW;
+    if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
+    if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1);
+    if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
+  } else if (param_.kernel.ndim() == 2) {
+    param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW;
+    if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
+    if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1);
+    if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
+  } else {
+    CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D convolution not supported";
+    param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW;
+    if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
+    if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1);
+    if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
+  }
+  attrs->parsed = std::move(param_);
 }
 
-MXNET_REGISTER_OP_PROPERTY(Convolution, ConvolutionProp)
+struct ConvolutionGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    const ConvolutionParam& param = nnvm::get<ConvolutionParam>(n->attrs.parsed);
+    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
+    heads.push_back(n->inputs[conv::kData]);
+    heads.push_back(n->inputs[conv::kWeight]);
+    if (!param.no_bias)
+      heads.push_back(n->inputs[conv::kBias]);
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
+
+NNVM_REGISTER_OP(Convolution)
 .describe(R"code(Compute *N*-D convolution on *(N+2)*-D input.
 
 In the 2-D convolution, given input data with shape *(batch_size,
@@ -168,10 +459,47 @@ There are other options to tune the performance.
   the performance.
 
 )code" ADD_FILELINE)
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
+  return params.no_bias ? 2 : 3;
+})
+.set_num_outputs(1)
+.set_attr_parser(ConvolutionParamParser)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
+  if (params.no_bias)
+    return std::vector<std::string>{"data", "weight"};
+  else
+    return std::vector<std::string>{"data", "weight", "bias"};
+})
+.set_attr<nnvm::FInferShape>("FInferShape", ConvolutionShape)
+.set_attr<nnvm::FInferType>("FInferType", ConvolutionType)
+.set_attr<FInferStorageType>("FInferStorageType", ConvStorageType)
+.set_attr<FCompute>("FCompute<cpu>", ConvolutionCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionCompute_CPU)
+.set_attr<nnvm::FGradient>("FGradient", ConvolutionGrad{"_backward_Convolution"})
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
 .add_argument("data", "NDArray-or-Symbol", "Input data to the ConvolutionOp.")
 .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.")
 .add_argument("bias", "NDArray-or-Symbol", "Bias parameter.")
 .add_arguments(ConvolutionParam::__FIELDS__());
 
+NNVM_REGISTER_OP(_backward_Convolution)
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
+  return params.no_bias ? 2 : 3;
+})
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FInferStorageType>("FInferStorageType", BackwardConvStorageType)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+.set_attr_parser(ConvolutionParamParser)
+.set_attr<FCompute>("FCompute<cpu>", ConvolutionGradCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionGradCompute_CPU);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu
index c31d78c226f4..d7f9e564a603 100644
--- a/src/operator/nn/convolution.cu
+++ b/src/operator/nn/convolution.cu
@@ -21,43 +21,136 @@
  * Copyright (c) 2017 by Contributors
  * \file convolution.cu
  * \brief
- * \author Bing Xu, Jun Wu
+ * \author Bing Xu, Jun Wu, Da Zheng
 */
 
 #include "./convolution-inl.h"
 #include <vector>
+#include "./depthwise_convolution-inl.h"
 #if MXNET_USE_CUDNN == 1
 #include "./cudnn/cudnn_convolution-inl.h"
 #endif  // MXNET_USE_CUDNN
 
-#include "./depthwise_convolution-inl.h"
-
 namespace mxnet {
 namespace op {
 
+#if MXNET_USE_CUDNN == 1
+template<typename DType>
+static CuDNNConvolutionOp<DType> &GetCuDNNConvOp(const ConvolutionParam& param,
+    int forward_compute_type, int backward_compute_type,
+    const std::vector<TShape>& in_shape, const std::vector<TShape>& out_shape,
+    const Context& ctx) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local CuDNNConvolutionOp<DType> op;
+#else
+  static MX_THREAD_LOCAL CuDNNConvolutionOp<DType> op;
+#endif
+  op.Init(param, forward_compute_type, backward_compute_type,
+      in_shape, out_shape, ctx);
+  return op;
+}
+#endif
+
 template<>
-Operator* CreateOp<gpu>(ConvolutionParam param, int dtype,
-                        std::vector<TShape> *in_shape,
-                        std::vector<TShape> *out_shape,
-                        Context ctx) {
-  Operator *op = NULL;
+void ConvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  int dtype = inputs[conv::kData].type_flag_;
+
   // If 1D convolution, use MXNet implementation
   if (param.kernel.ndim() == 1) {
     MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      op = new ConvolutionOp<gpu, DType>(param);
+      ConvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Forward(ctx, inputs, req, outputs);
     })
-    return op;
+    return;
+  } else if (param.num_filter == param.num_group &&
+      param.layout.value() == mshadow::kNCHW &&
+      param.num_filter == inputs[conv::kData].shape_[1] &&
+      param.kernel.ndim() == 2 &&
+      param.dilate == mshadow::Shape2(1, 1) &&
+      dtype == mshadow::kFloat32) {
+    std::vector<TShape> in_shape(inputs.size());
+    std::vector<TShape> out_shape(1, outputs[0].shape_);
+    for (size_t i = 0; i < in_shape.size(); i++)
+      in_shape[i] = inputs[i].shape_;
+    DepthwiseConvolutionOp<float> op;
+    op.Init(param, in_shape, out_shape);
+    op.Forward(ctx, inputs, req, outputs);
+    return;
   }
 
-  // depth wise conv
-  if (param.num_filter == param.num_group &&
+#if MXNET_USE_CUDNN == 1
+  // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16).
+  int compute_type = (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype;
+
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    if (param.cudnn_off) {
+      ConvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Forward(ctx, inputs, req, outputs);
+    } else if (!CuDNNConvolutionOp<DType>::Supports(param,
+          compute_type, compute_type, ctx.run_ctx.ctx)) {
+      LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied.";
+      ConvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Forward(ctx, inputs, req, outputs);
+    } else {
+      std::vector<TShape> in_shape(inputs.size());
+      std::vector<TShape> out_shape(1, outputs[0].shape_);
+      for (size_t i = 0; i < in_shape.size(); i++)
+        in_shape[i] = inputs[i].shape_;
+      CuDNNConvolutionOp<DType> &op = GetCuDNNConvOp<DType>(param,
+          compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx);
+      op.Forward(ctx, inputs, req, outputs);
+    }
+  })
+#else
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    ConvolutionOp<gpu, DType> op;
+    op.Init(param);
+    op.Forward(ctx, inputs, req, outputs);
+  })
+#endif  // MXNET_USE_CUDNN
+}
+
+template<>
+void ConvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
+  const TBlob &out_grad = inputs[0];
+  const std::vector<TBlob> &in_grad = outputs;
+  int dtype = out_grad.type_flag_;
+
+  // If 1D convolution, use MXNet implementation
+  if (param.kernel.ndim() == 1) {
+    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+      ConvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    })
+    return;
+  } else if (param.num_filter == param.num_group &&
       param.layout.value() == mshadow::kNCHW &&
-      param.num_filter == (*in_shape)[conv::kData][1] &&
+      param.num_filter == in_data[conv::kData].shape_[1] &&
       param.kernel.ndim() == 2 &&
       param.dilate == mshadow::Shape2(1, 1) &&
       dtype == mshadow::kFloat32) {
-    op = new DepthwiseConvolutionOp<float>(param, *in_shape, *out_shape);
-    return op;
+    // The first element stores out grad.
+    std::vector<TShape> in_shape(in_data.size());
+    std::vector<TShape> out_shape(1, out_grad.shape_);
+    for (size_t i = 0; i < in_shape.size(); i++)
+      in_shape[i] = in_data[i].shape_;
+    DepthwiseConvolutionOp<float> op;
+    op.Init(param, in_shape, out_shape);
+    op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    return;
   }
 
 #if MXNET_USE_CUDNN == 1
@@ -66,23 +159,41 @@ Operator* CreateOp<gpu>(ConvolutionParam param, int dtype,
 
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     if (param.cudnn_off) {
-      op = new ConvolutionOp<gpu, DType>(param);
-    } else if (!CuDNNConvolutionOp<DType>::Supports(param, compute_type, compute_type, ctx)) {
+      ConvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    } else if (!CuDNNConvolutionOp<DType>::Supports(param,
+          compute_type, compute_type, ctx.run_ctx.ctx)) {
       LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied.";
-      op = new ConvolutionOp<gpu, DType>(param);
+      ConvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
     } else {
-      op = new CuDNNConvolutionOp<DType>(param, compute_type, compute_type,
-                                         *in_shape, *out_shape, ctx);
+      // The first element stores out grad.
+      std::vector<TShape> in_shape(in_data.size());
+      std::vector<TShape> out_shape(1, out_grad.shape_);
+      for (size_t i = 0; i < in_shape.size(); i++)
+        in_shape[i] = in_data[i].shape_;
+      CuDNNConvolutionOp<DType> &op = GetCuDNNConvOp<DType>(param,
+          compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx);
+      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
     }
   })
 #else
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new ConvolutionOp<gpu, DType>(param);
+    ConvolutionOp<gpu, DType> op;
+    op.Init(param);
+    op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
   })
 #endif  // MXNET_USE_CUDNN
-  return op;
 }
 
+NNVM_REGISTER_OP(Convolution)
+.set_attr<FCompute>("FCompute<gpu>", ConvolutionCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_Convolution)
+.set_attr<FCompute>("FCompute<gpu>", ConvolutionGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
 
diff --git a/src/operator/nn/cudnn/cudnn_activation-inl.h b/src/operator/nn/cudnn/cudnn_activation-inl.h
index 888528309cdf..a89e7bfaf080 100644
--- a/src/operator/nn/cudnn/cudnn_activation-inl.h
+++ b/src/operator/nn/cudnn/cudnn_activation-inl.h
@@ -33,12 +33,19 @@
 namespace mxnet {
 namespace op {
 template<typename DType>
-class CuDNNActivationOp : public Operator {
+class CuDNNActivationOp {
  public:
-  explicit CuDNNActivationOp(ActivationParam param) {
-    param_ = param;
-    init_cudnn_ = false;
+  CuDNNActivationOp() {
     dtype_ = mshadow::DataType<DType>::kCudnnFlag;
+    #if CUDNN_MAJOR >= 5
+    nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
+    CUDNN_CALL(cudnnCreateActivationDescriptor(&desc_));
+    #endif
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_));
+  }
+
+  void Init(const ActivationParam &param) {
+    param_ = param;
     switch (param_.act_type) {
       case activation::kReLU:
         mode_ = CUDNN_ACTIVATION_RELU;
@@ -54,67 +61,54 @@ class CuDNNActivationOp : public Operator {
         break;
     }
     #if CUDNN_MAJOR >= 5
-    nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
-    CUDNN_CALL(cudnnCreateActivationDescriptor(&desc_));
     CUDNN_CALL(cudnnSetActivationDescriptor(desc_, mode_, nan_prop_, relu_ceil_));
     #endif
   }
 
   ~CuDNNActivationOp() {
-    if (init_cudnn_) {
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_));
-      #if CUDNN_MAJOR >= 5
-      CUDNN_CALL(cudnnDestroyActivationDescriptor(desc_));
-      #endif
-    }
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_));
+    #if CUDNN_MAJOR >= 5
+    CUDNN_CALL(cudnnDestroyActivationDescriptor(desc_));
+    #endif
   }
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
+  void Forward(const OpContext &ctx, const TBlob &in_data,
+      const OpReqType &req, const TBlob &out_data) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     Tensor<gpu, 4, DType> data;
     Tensor<gpu, 4, DType> out;
-    if (in_data[activation::kData].ndim() == 2) {
-      Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0],
-                               in_data[activation::kData].shape_[1], 1, 1);
-      data = in_data[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
-      out = out_data[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
+    if (in_data.ndim() == 2) {
+      Shape<4> dshape = Shape4(in_data.shape_[0],
+                               in_data.shape_[1], 1, 1);
+      data = in_data.get_with_shape<gpu, 4, DType>(dshape, s);
+      out = out_data.get_with_shape<gpu, 4, DType>(dshape, s);
     } else {
       Shape<4> dshape;
-      index_t size_left = in_data[activation::kData].Size();
+      index_t size_left = in_data.Size();
       for (int i = 0; i < 3; ++i) {
-        if (i < in_data[activation::kData].ndim()) {
-          dshape[i] = in_data[activation::kData].shape_[i];
+        if (i < in_data.ndim()) {
+          dshape[i] = in_data.shape_[i];
         } else {
           dshape[i] = 1;
         }
         size_left /= dshape[i];
       }
       dshape[3] = size_left;
-      data = in_data[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
-      out = out_data[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
+      data = in_data.get_with_shape<gpu, 4, DType>(dshape, s);
+      out = out_data.get_with_shape<gpu, 4, DType>(dshape, s);
     }
     typename DataType<DType>::ScaleType alpha = 1.0f;
     typename DataType<DType>::ScaleType beta = 0.0f;
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
-    if (!init_cudnn_) {
-      init_cudnn_ = true;
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_));
-      CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
-                                            CUDNN_TENSOR_NCHW,
-                                            dtype_,
-                                            data.shape_[0],
-                                            data.shape_[1],
-                                            data.shape_[2],
-                                            data.shape_[3]));
-    }
+    CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
+                                          CUDNN_TENSOR_NCHW,
+                                          dtype_,
+                                          data.shape_[0],
+                                          data.shape_[1],
+                                          data.shape_[2],
+                                          data.shape_[3]));
     #if CUDNN_MAJOR <= 4
     CUDNN_CALL(cudnnActivationForward(s->dnn_handle_,
                                       mode_,
@@ -136,20 +130,11 @@ class CuDNNActivationOp : public Operator {
     #endif
   }
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
+  void Backward(const OpContext &ctx, const TBlob &out_grad,
+      const TBlob &in_data, const TBlob &out_data,
+      const OpReqType &req, const TBlob &in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    CHECK_EQ(req.size(), 1U);
-    CHECK_EQ(in_grad.size(), 1U);
     typename DataType<DType>::ScaleType alpha = 1.0f;
     typename DataType<DType>::ScaleType beta = 0.0f;
     Stream<gpu> *s = ctx.get_stream<gpu>();
@@ -157,31 +142,38 @@ class CuDNNActivationOp : public Operator {
     Tensor<gpu, 4, DType> data;
     Tensor<gpu, 4, DType> output_data;
     Tensor<gpu, 4, DType> input_grad;
-    if (in_grad[activation::kData].ndim() == 2) {
-      Shape<4> dshape = Shape4(in_grad[activation::kData].shape_[0],
-                               in_grad[activation::kData].shape_[1], 1, 1);
-      data = in_data[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
-      grad = out_grad[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
-      output_data = out_data[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
-      input_grad = in_grad[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
+    if (in_grad.ndim() == 2) {
+      Shape<4> dshape = Shape4(in_grad.shape_[0],
+                               in_grad.shape_[1], 1, 1);
+      data = in_data.get_with_shape<gpu, 4, DType>(dshape, s);
+      grad = out_grad.get_with_shape<gpu, 4, DType>(dshape, s);
+      output_data = out_data.get_with_shape<gpu, 4, DType>(dshape, s);
+      input_grad = in_grad.get_with_shape<gpu, 4, DType>(dshape, s);
     } else {
       Shape<4> dshape;
-      index_t size_left = in_grad[activation::kData].Size();
+      index_t size_left = in_grad.Size();
       for (int i = 0; i < 3; ++i) {
-        if (i < in_grad[activation::kData].ndim()) {
-          dshape[i] = in_grad[activation::kData].shape_[i];
+        if (i < in_grad.ndim()) {
+          dshape[i] = in_grad.shape_[i];
         } else {
           dshape[i] = 1;
         }
         size_left /= dshape[i];
       }
       dshape[3] = size_left;
-      data = in_data[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
-      output_data = out_data[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
-      grad = out_grad[activation::kOut].get_with_shape<gpu, 4, DType>(dshape, s);
-      input_grad = in_grad[activation::kData].get_with_shape<gpu, 4, DType>(dshape, s);
+      data = in_data.get_with_shape<gpu, 4, DType>(dshape, s);
+      output_data = out_data.get_with_shape<gpu, 4, DType>(dshape, s);
+      grad = out_grad.get_with_shape<gpu, 4, DType>(dshape, s);
+      input_grad = in_grad.get_with_shape<gpu, 4, DType>(dshape, s);
     }
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
+    CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
+                                          CUDNN_TENSOR_NCHW,
+                                          dtype_,
+                                          data.shape_[0],
+                                          data.shape_[1],
+                                          data.shape_[2],
+                                          data.shape_[3]));
     #if CUDNN_MAJOR <= 4
     CUDNN_CALL(cudnnActivationBackward(s->dnn_handle_,
                                        mode_,
@@ -212,7 +204,6 @@ class CuDNNActivationOp : public Operator {
   }
 
  private:
-  bool init_cudnn_;
   cudnnDataType_t dtype_;
   cudnnActivationMode_t mode_;
   cudnnTensorDescriptor_t shape_desc_;
diff --git a/src/operator/nn/cudnn/cudnn_batch_norm-inl.h b/src/operator/nn/cudnn/cudnn_batch_norm-inl.h
index 3dc9c8353a35..e2337049060e 100644
--- a/src/operator/nn/cudnn/cudnn_batch_norm-inl.h
+++ b/src/operator/nn/cudnn/cudnn_batch_norm-inl.h
@@ -43,28 +43,30 @@ enum CuDNNBatchNormOpAuxiliary {kMovingMean, kMovingInvVar};
 
 #if defined(__CUDACC__)
 template<typename DType>
-class CuDNNBatchNormOp : public Operator {
+class CuDNNBatchNormOp {
  public:
-  explicit CuDNNBatchNormOp(BatchNormParam param) {
+  CuDNNBatchNormOp() {
     using namespace mshadow;
-    CHECK_GE(param.eps, CUDNN_BN_MIN_EPSILON)
-     << "CuDNN requires eps to be no less than " << CUDNN_BN_MIN_EPSILON;
-    this->param_ = param;
-    init_cudnn_ = false;
     dtype_ = DataType<DType>::kCudnnFlag;
     // For float16 input type beta, gamma, mean, and average are stored in float32.
     // For other input types, these parameters have the same type as input
     dtype_param_ = (dtype_ == CUDNN_DATA_HALF) ? kFloat32 : DataType<DType>::kFlag;
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&io_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&mean_desc_));
+  }
+
+  void Init(const BatchNormParam &param) {
+    CHECK_GE(param.eps, CUDNN_BN_MIN_EPSILON)
+     << "CuDNN requires eps to be no less than " << CUDNN_BN_MIN_EPSILON;
+    this->param_ = param;
   }
 
   ~CuDNNBatchNormOp() {
-    if (init_cudnn_) {
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(io_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(mean_desc_));
-    }
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(io_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(mean_desc_));
   }
 
-  virtual void Forward(const OpContext &ctx,
+  void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &out_data,
@@ -84,29 +86,7 @@ class CuDNNBatchNormOp : public Operator {
     CHECK_GE(in_data[cudnnbatchnorm::kData].ndim(), 2);
     CHECK_LE(in_data[cudnnbatchnorm::kData].ndim(), 4);
 
-    if (!init_cudnn_) {
-      for (int i = 0; i < 4; ++i) {
-        if (i < in_data[cudnnbatchnorm::kData].ndim()) {
-          shape_[i] = in_data[cudnnbatchnorm::kData].shape_[i];
-        } else {
-          shape_[i] = 1;
-        }
-      }
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&io_desc_));
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&mean_desc_));
-      CUDNN_CALL(cudnnSetTensor4dDescriptor(io_desc_,
-                                            CUDNN_TENSOR_NCHW,
-                                            dtype_,
-                                            shape_[0],
-                                            shape_[1],
-                                            shape_[2],
-                                            shape_[3]));
-      CUDNN_CALL(cudnnDeriveBNTensorDescriptor(mean_desc_,
-                                               io_desc_,
-                                               CUDNN_BATCHNORM_SPATIAL));
-      init_cudnn_  = true;
-    }
-
+    Init(in_data[cudnnbatchnorm::kData]);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     Tensor<gpu, 4, DType> x =
       in_data[cudnnbatchnorm::kData].get_with_shape<gpu, 4, DType>(shape_, s);
@@ -177,7 +157,7 @@ class CuDNNBatchNormOp : public Operator {
     })
   }
 
-  virtual void Backward(const OpContext &ctx,
+  void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
                         const std::vector<TBlob> &out_data,
@@ -193,6 +173,7 @@ class CuDNNBatchNormOp : public Operator {
     CHECK(ctx.is_train && !param_.use_global_stats)
         << "use global statistics is not yet supported in CuDNNBatchNorm";
 
+    Init(in_data[cudnnbatchnorm::kData]);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     Tensor<gpu, 4, DType> x =
       in_data[cudnnbatchnorm::kData].get_with_shape<gpu, 4, DType>(shape_, s);
@@ -290,7 +271,27 @@ class CuDNNBatchNormOp : public Operator {
   }
 
  private:
-  bool init_cudnn_;
+  void Init(const TBlob &in_data) {
+    for (int i = 0; i < 4; ++i) {
+      if (i < in_data.ndim()) {
+        shape_[i] = in_data.shape_[i];
+      } else {
+        shape_[i] = 1;
+      }
+    }
+
+    CUDNN_CALL(cudnnSetTensor4dDescriptor(io_desc_,
+                                          CUDNN_TENSOR_NCHW,
+                                          dtype_,
+                                          shape_[0],
+                                          shape_[1],
+                                          shape_[2],
+                                          shape_[3]));
+    CUDNN_CALL(cudnnDeriveBNTensorDescriptor(mean_desc_,
+                                             io_desc_,
+                                             CUDNN_BATCHNORM_SPATIAL));
+  }
+
   cudnnDataType_t dtype_;
   int dtype_param_;
   cudnnTensorDescriptor_t io_desc_, mean_desc_;
@@ -299,91 +300,6 @@ class CuDNNBatchNormOp : public Operator {
 };
 #endif  // defined(__CUDACC__)
 
-template<typename xpu>
-Operator *CreateOp_CuDNNv4(BatchNormParam param);
-
-
-#if DMLC_USE_CXX11
-class CuDNNBatchNormProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]";
-    const TShape &dshape = in_shape->at(0);
-    if (dshape.ndim() == 0) return false;
-    in_shape->at(1) = TShape(Shape1(dshape[1]));
-    in_shape->at(2) = TShape(Shape1(dshape[1]));
-
-    out_shape->clear();
-    out_shape->push_back(dshape);
-    out_shape->push_back(Shape1(dshape[1]));
-    out_shape->push_back(Shape1(dshape[1]));
-
-    aux_shape->clear();
-    aux_shape->push_back(Shape1(dshape[1]));
-    aux_shape->push_back(Shape1(dshape[1]));
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new CuDNNBatchNormProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "CuDNNBatchNorm";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[cudnnbatchnorm::kOut],
-            out_data[cudnnbatchnorm::kMean],
-            out_data[cudnnbatchnorm::kInvVar],
-            in_data[cudnnbatchnorm::kData],
-            in_data[cudnnbatchnorm::kGamma]
-           };
-  }
-
-  int NumVisibleOutputs() const override {
-    return 1;
-  }
-
-  int NumOutputs() const override {
-    return 3;
-  }
-
-  std::vector<std::string> ListArguments() const override {
-    return {"data", "gamma", "beta"};
-  }
-
-  std::vector<std::string> ListOutputs() const override {
-    return {"output", "mean", "inv_var"};
-  }
-
-  std::vector<std::string> ListAuxiliaryStates() const override {
-    return {"moving_mean", "moving_inv_var"};
-  }
-
-  Operator* CreateOperator(Context ctx) const override;
-
- private:
-  BatchNormParam param_;
-};  // class CuDNNBatchNormProp
-
-#endif  // DMLC_USE_CXX11
 #endif  // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cc b/src/operator/nn/cudnn/cudnn_batch_norm.cc
index e1e0c999b1fb..f1d229dd5421 100644
--- a/src/operator/nn/cudnn/cudnn_batch_norm.cc
+++ b/src/operator/nn/cudnn/cudnn_batch_norm.cc
@@ -21,46 +21,100 @@
  * Copyright (c) 2015 by Contributors
  * \file cudnn_batch_norm.cc
  * \brief
- * \author Junyuan Xie
+ * \author Junyuan Xie, Da Zheng
 */
 
 #include "./cudnn_batch_norm-inl.h"
 #include <nnvm/op_attr_types.h>
+#include "../../elemwise_op_common.h"
 
 namespace mxnet {
 namespace op {
-#if CUDNN_MAJOR >= 4
-template<>
-Operator *CreateOp_CuDNNv4<cpu>(BatchNormParam param) {
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4
+
+static bool BatchNormShape(const nnvm::NodeAttrs& attrs, std::vector<TShape> *in_shape,
+    std::vector<TShape> *out_shape) {
+  using namespace mshadow;
+  CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, moving_mean, moving_var]";
+  const TShape &dshape = in_shape->at(0);
+  if (dshape.ndim() == 0) return false;
+  in_shape->at(1) = TShape(Shape1(dshape[1]));
+  in_shape->at(2) = TShape(Shape1(dshape[1]));
+  in_shape->at(3) = TShape(Shape1(dshape[1]));
+  in_shape->at(4) = TShape(Shape1(dshape[1]));
+
+  out_shape->clear();
+  out_shape->push_back(dshape);
+  out_shape->push_back(Shape1(dshape[1]));
+  out_shape->push_back(Shape1(dshape[1]));
+
+  return true;
+}
+
+static void BatchNormCompute_CPU(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
   LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu.";
-  return NULL;
 }
 
-Operator *CuDNNBatchNormProp::CreateOperator(Context ctx) const {
-#if CUDNN_MAJOR >= 5
-  LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5."
-                "Use the later instead.";
-  return nullptr;
-#else
-  DO_BIND_DISPATCH(CreateOp_CuDNNv4, param_);
-#endif
+static void BatchNormGradCompute_CPU(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+  LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu.";
 }
 
-MXNET_REGISTER_OP_PROPERTY(CuDNNBatchNorm, CuDNNBatchNormProp)
+NNVM_REGISTER_OP(CuDNNBatchNorm)
 .describe("Apply batch normalization to input.")
+.set_num_inputs(5)
+.set_num_outputs(3)
+.set_attr_parser(ParamParser<BatchNormParam>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"data", "gamma", "beta", "moving_mean", "moving_var"};
+})
+.set_attr<nnvm::FListOutputNames>("FListOutputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"output", "mean", "var"};
+})
+.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
+    [](const NodeAttrs& attrs) {
+  return 1;
+})
+.set_attr<nnvm::FMutateInputs>("FMutateInputs", [](const nnvm::NodeAttrs& attrs) {
+  return std::vector<uint32_t>{3, 4};
+})
+.set_attr<nnvm::FInferShape>("FInferShape", BatchNormShape)
+.set_attr<FCompute>("FCompute<cpu>", BatchNormCompute_CPU)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseInOut{"_backward_CuDNNBatchNorm"})
 .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization")
-.add_arguments(BatchNormParam::__FIELDS__());
+.add_argument("gamma", "NDArray-or-Symbol", "gamma array")
+.add_argument("beta", "NDArray-or-Symbol", "beta array")
+.add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input")
+.add_argument("moving_var", "NDArray-or-Symbol", "running variance of input")
+.add_arguments(BatchNormParam::__FIELDS__())
+.set_attr<nnvm::FSetInputVarAttrOnCompose>(
+  "FSetInputVarAttrOnCompose",
+  [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) {
+    if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return;
+    if (index == 3) {
+      var->attrs.dict["__init__"] = "[\"zero\", {}]";
+    } else if (index == 4) {
+      var->attrs.dict["__init__"] = "[\"one\", {}]";
+    }
+  });
+
+NNVM_REGISTER_OP(_backward_CuDNNBatchNorm)
+.set_num_outputs(5)
+.set_attr<nnvm::FMutateInputs>("FMutateInputs", [](const nnvm::NodeAttrs& attrs) {
+  return std::vector<uint32_t>{6, 7};
+})
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr_parser(ParamParser<BatchNormParam>)
+.set_attr<FCompute>("FCompute<cpu>", BatchNormGradCompute_CPU);
 
-NNVM_REGISTER_OP(CuDNNBatchNorm)
-.set_attr<nnvm::FSetInputVarAttrOnCompose>("FSetInputVarAttrOnCompose",
-    [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) {
-      if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return;
-      if (index == 3) {
-        var->attrs.dict["__init__"] = "[\"zero\", {}]";
-      } else if (index == 4) {
-        var->attrs.dict["__init__"] = "[\"zero\", {}]";
-      }
-    });
 #endif  // CUDNN_MAJOR >= 4
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cu b/src/operator/nn/cudnn/cudnn_batch_norm.cu
index e96db2e5e73f..e07cd1e6c8f6 100644
--- a/src/operator/nn/cudnn/cudnn_batch_norm.cu
+++ b/src/operator/nn/cudnn/cudnn_batch_norm.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file cudnn_batch_norm.cu
  * \brief
- * \author Junyuan Xie
+ * \author Junyuan Xie, Da Zheng
 */
 
 #include "./cudnn_batch_norm-inl.h"
@@ -30,10 +30,60 @@
 namespace mxnet {
 namespace op {
 #if CUDNN_MAJOR == 4
-template<>
-Operator *CreateOp_CuDNNv4<gpu>(BatchNormParam param) {
-  return new CuDNNBatchNormOp<float>(param);
+
+template<typename DType>
+static CuDNNBatchNormOp<DType> &GetCuDNNOp(const BatchNormParam& param) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local CuDNNBatchNormOp<DType> op;
+#else
+  static MX_THREAD_LOCAL CuDNNBatchNormOp<DType> op;
+#endif
+  op.Init(param);
+  return op;
+}
+
+static void BatchNormCompute_CuDNNv4(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+#if CUDNN_MAJOR >= 5
+  LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5."
+    "Use the later instead.";
+#else
+  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 5U);
+  std::vector<TBlob> in_data(inputs.begin(), inputs.begin() + 3);
+  std::vector<TBlob> aux_states(inputs.begin() + 3, inputs.end());
+  GetCuDNNOp<float>(param).Forward(ctx, in_data, req, outputs, aux_states);
+#endif
+}
+
+static void BatchNormGradCompute_CuDNNv4(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs) {
+#if CUDNN_MAJOR >= 5
+  LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5."
+    "Use the later instead.";
+#else
+  CHECK_EQ(inputs.size(), 11U);
+  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+  std::vector<TBlob> out_grad(1, inputs[0]);
+  std::vector<TBlob> in_data(inputs.begin() + 3, inputs.begin() + 6);
+  std::vector<TBlob> aux_states(inputs.begin() + 6, inputs.begin() + 8);
+  std::vector<TBlob> out_data(inputs.begin() + 8, inputs.end());
+  std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);
+  GetCuDNNOp<float>(param).Backward(ctx, out_grad, in_data, out_data,
+      req, in_grad, aux_states);
+#endif
 }
+
+NNVM_REGISTER_OP(CuDNNBatchNorm)
+.set_attr<FCompute>("FCompute<gpu>", BatchNormCompute_CuDNNv4);
+
+NNVM_REGISTER_OP(_backward_CuDNNBatchNorm)
+.set_attr<FCompute>("FCompute<gpu>", BatchNormGradCompute_CuDNNv4);
+
 #endif  // CUDNN_MAJOR == 4
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h
index f37203998e0a..fe10c42aaea7 100644
--- a/src/operator/nn/cudnn/cudnn_convolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h
@@ -42,9 +42,19 @@ namespace op {
  * \brief The Operator used to perform convolution using cuDNN kernels.
  */
 template<typename DType>
-class CuDNNConvolutionOp : public Operator {
+class CuDNNConvolutionOp {
  public:
-  explicit CuDNNConvolutionOp(const ConvolutionParam& param,
+  CuDNNConvolutionOp() {
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_));
+    CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_));
+  }
+
+  void Init(const ConvolutionParam& param,
                               int forward_compute_type,
                               int backward_compute_type,
                               const std::vector<TShape>& in_shape,
@@ -57,8 +67,6 @@ class CuDNNConvolutionOp : public Operator {
     auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type);
     // convert MB to words
     param_.workspace = (param_.workspace << 20) / sizeof(DType);
-    init_cudnn_ = false;
-    init_temp_size_ = false;
     dtype_ = DataType<DType>::kCudnnFlag;
     // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy.
     cudnn_tensor_core_ = DataType<DType>::kFlag == kFloat16 && GetEnvAllowTensorCore();
@@ -92,22 +100,19 @@ class CuDNNConvolutionOp : public Operator {
   }
 
   ~CuDNNConvolutionOp() {
-    if (init_cudnn_) {
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_));
-      CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_));
-      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_));
-      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_));
-      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_));
-    }
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_));
+    CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_));
+    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_));
+    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_));
+    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_));
   }
 
-  virtual void Forward(const OpContext &ctx,
+  void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
+                       const std::vector<TBlob> &out_data) {
     using namespace mshadow;
     size_t expected = param_.no_bias ? 2 : 3;
     DType *data_ptr = NULL;
@@ -183,13 +188,11 @@ class CuDNNConvolutionOp : public Operator {
     }
   }
 
-  virtual void Backward(const OpContext &ctx,
+  void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
                         const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
+                        const std::vector<TBlob> &in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
     size_t expected = param_.no_bias == 0 ? 3 : 2;
@@ -199,7 +202,8 @@ class CuDNNConvolutionOp : public Operator {
     DType *data_ptr = NULL;
     DType *gdata_ptr = NULL;
     CHECK_EQ(out_grad.size(), 1U);
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(in_data.size(), expected);
+    CHECK_EQ(in_grad.size(), expected);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     if (param_.kernel.ndim() == 2) {
       Tensor<gpu, 4, DType> grad = out_grad[conv::kOut].get<gpu, 4, DType>(s);
@@ -224,6 +228,7 @@ class CuDNNConvolutionOp : public Operator {
       data_ptr = data.dptr_;
       gdata_ptr = gdata.dptr_;
     }
+    GetTempSize(ctx);
     Tensor<gpu, 1, DType> workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_);
     size_t workspace_size = TensorSizeBytes(workspace);
     for (uint32_t g = 0; g < param_.num_group; ++g) {
@@ -361,13 +366,6 @@ class CuDNNConvolutionOp : public Operator {
     size_t expected = param_.no_bias ? 2 : 3;
     CHECK_EQ(in_shape.size(), expected);
     CHECK_EQ(out_shape.size(), 1U);
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_));
-    CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_));
 
     TShape dshape = in_shape[conv::kData];
     TShape wshape = in_shape[conv::kWeight];
@@ -573,7 +571,6 @@ class CuDNNConvolutionOp : public Operator {
                                           &bias_shape[0],
                                           &bias_stride[0]));
     }
-    init_cudnn_ = true;
   }
 
   void SelectAlgo(const Context& ctx,
@@ -817,7 +814,6 @@ class CuDNNConvolutionOp : public Operator {
   }
 
   void GetTempSize(const OpContext& ctx) {
-    if (init_temp_size_) return;
     mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
     size_t back_size = 0, back_size_w = 0;
     CUDNN_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_,
@@ -842,8 +838,6 @@ class CuDNNConvolutionOp : public Operator {
                out_desc_,
                forward_algo_.AlgoNumber(),
                &forward_workspace_byte_));
-
-    init_temp_size_ = true;
   }
 
   int *CastTShapeToIntPtr(const TShape& s, std::vector<int> *buffer) {
@@ -876,8 +870,6 @@ class CuDNNConvolutionOp : public Operator {
   std::vector<int> param_dilate_;
   std::vector<int> param_pad_;
 
-  bool init_cudnn_;
-  bool init_temp_size_;
   // Temp workspace size in bytes needed for Forward() operation.
   size_t forward_workspace_byte_;
   // Temp workspace size in bytes needed for Backward() operation.
diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
index 09e89c27bbaf..2172ec0b4fe0 100644
--- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
@@ -39,9 +39,19 @@ namespace op {
 #if MXNET_USE_CUDNN == 1
 
 template<typename DType>
-class CuDNNDeconvolutionOp : public Operator {
+class CuDNNDeconvolutionOp {
  public:
-  explicit CuDNNDeconvolutionOp(DeconvolutionParam param,
+  CuDNNDeconvolutionOp() {
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_));
+    CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_));
+    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_));
+  }
+
+  void Init(DeconvolutionParam param,
                                 int forward_compute_type,
                                 int backward_compute_type,
                                 const std::vector<TShape>& in_shape,
@@ -54,8 +64,6 @@ class CuDNNDeconvolutionOp : public Operator {
     auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type);
     // convert MB to words
     param_.workspace = (param_.workspace << 20) / sizeof(DType);
-    init_cudnn_ = false;
-    init_temp_size_ = false;
     dtype_ = mshadow::DataType<DType>::kCudnnFlag;
     // TensorCore algos only allowed on fp16-I/O deconvolutions if permitted by the global policy.
     cudnn_tensor_core_ = DataType<DType>::kFlag == kFloat16 && GetEnvAllowTensorCore();
@@ -89,22 +97,19 @@ class CuDNNDeconvolutionOp : public Operator {
   }
 
   ~CuDNNDeconvolutionOp() {
-    if (init_cudnn_) {
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_));
-      CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_));
-      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_));
-      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_));
-      CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_));
-    }
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_));
+    CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_));
+    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_));
+    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_));
+    CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_));
   }
 
-  virtual void Forward(const OpContext &ctx,
+  void Forward(const OpContext &ctx,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
+                       const std::vector<TBlob> &out_data) {
     using namespace mshadow;
     size_t expected = param_.no_bias ? 2 : 3;
     DType *data_ptr = NULL;
@@ -197,13 +202,11 @@ class CuDNNDeconvolutionOp : public Operator {
     }
   }
 
-  virtual void Backward(const OpContext &ctx,
+  void Backward(const OpContext &ctx,
                         const std::vector<TBlob> &out_grad,
                         const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
                         const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
+                        const std::vector<TBlob> &in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
     size_t expected = param_.no_bias == 0 ? 3 : 2;
@@ -213,7 +216,8 @@ class CuDNNDeconvolutionOp : public Operator {
     DType *data_ptr = NULL;
     DType *gdata_ptr = NULL;
     CHECK_EQ(out_grad.size(), 1U);
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(in_data.size(), param_.no_bias ? 2U : 3U);
+    CHECK_EQ(in_grad.size(), expected);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     if (param_.kernel.ndim() == 2) {
       Tensor<gpu, 4, DType> grad = out_grad[deconv::kOut].get<gpu, 4, DType>(s);
@@ -243,6 +247,7 @@ class CuDNNDeconvolutionOp : public Operator {
       CHECK_NE(req[deconv::kBias], kWriteInplace);
     }
     CHECK_NE(req[deconv::kData], kWriteInplace);
+    GetTempSize(ctx);
     Tensor<gpu, 1, DType> workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_);
     size_t workspace_size = TensorSizeBytes(workspace);
     for (uint32_t g = 0; g < param_.num_group; ++g) {
@@ -380,13 +385,6 @@ class CuDNNDeconvolutionOp : public Operator {
     size_t expected = param_.no_bias ? 2 : 3;
     CHECK_EQ(in_shape.size(), expected);
     CHECK_EQ(out_shape.size(), 1U);
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
-    CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_));
-    CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_));
-    CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_));
 
     TShape dshape = in_shape[deconv::kData];
     TShape wshape = in_shape[deconv::kWeight];
@@ -591,7 +589,6 @@ class CuDNNDeconvolutionOp : public Operator {
                                             &bias_shape[0],
                                             &bias_stride[0]));
     }
-    init_cudnn_ = true;
   }
 
   void SelectAlgo(const Context& ctx,
@@ -844,7 +841,6 @@ class CuDNNDeconvolutionOp : public Operator {
   }
 
   void GetTempSize(const OpContext& ctx) {
-    if (init_temp_size_) return;
     mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
     size_t back_data_algo_workspace_size = 0;
     size_t back_filter_algo_workspace_size = 0;
@@ -874,7 +870,6 @@ class CuDNNDeconvolutionOp : public Operator {
     forward_workspace_byte_ = back_data_algo_workspace_size;
     backward_workspace_byte_ = std::max(forward_algo_workspace_size,
                                         back_filter_algo_workspace_size);
-    init_temp_size_ = true;
   }
 
   int *CastTShapeToIntPtr(const TShape& s, std::vector<int> *buffer) {
@@ -905,8 +900,11 @@ class CuDNNDeconvolutionOp : public Operator {
   std::vector<int> param_stride_;
   std::vector<int> param_dilate_;
 
-  bool init_cudnn_;
-  bool init_temp_size_;
+  int forward_compute_type_;
+  int backward_compute_type_;
+  const std::vector<TShape> in_shapes_;
+  const std::vector<TShape> out_shapes_;
+
   // Temp workspace size in bytes needed for Forward() operation.  Note that
   // in deconvolution, this is handled by the cuDNN backprop-to-data kernel.
   size_t forward_workspace_byte_;
diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h
index 104ed8546dca..8442b37058d4 100644
--- a/src/operator/nn/cudnn/cudnn_pooling-inl.h
+++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h
@@ -34,13 +34,18 @@ namespace mxnet {
 namespace op {
 
 template<typename DType>
-class CuDNNPoolingOp : public Operator {
+class CuDNNPoolingOp {
  public:
-  explicit CuDNNPoolingOp(PoolingParam p) {
-    param_ = p;
-    init_cudnn_ = false;
+  CuDNNPoolingOp() {
     // TODO(xxx): fp16
     dtype_ = mshadow::DataType<DType>::kCudnnFlag;
+    CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
+  }
+
+  void Init(const PoolingParam &p) {
+    param_ = p;
     switch (param_.pool_type) {
       case pool_enum::kMaxPooling:
         mode_ = CUDNN_POOLING_MAX;
@@ -54,33 +59,24 @@ class CuDNNPoolingOp : public Operator {
   }
 
   ~CuDNNPoolingOp() {
-    if (init_cudnn_) {
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
-      CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_));
-    }
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_));
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_));
+    CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_));
   }
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
+  void Forward(const OpContext &ctx, const TBlob &in_data,
+      const OpReqType &req, const TBlob &out_data) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
     typename DataType<DType>::ScaleType alpha = 1.0f;
     typename DataType<DType>::ScaleType beta = 0.0f;
+    this->Init(s, in_data, out_data);
     if (param_.kernel.ndim() == 2) {
       // 2d pool
-      Tensor<gpu, 4, DType> data = in_data[pool_enum::kData].get<gpu, 4, DType>(s);
-      Tensor<gpu, 4, DType> out = out_data[pool_enum::kOut].get<gpu, 4, DType>(s);
-      if (!init_cudnn_) {
-        this->Init(s, in_data, out_data);
-      }
+      Tensor<gpu, 4, DType> data = in_data.get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> out = out_data.get<gpu, 4, DType>(s);
       CHECK_EQ(data.CheckContiguous(), true);
       CHECK_EQ(out.CheckContiguous(), true);
       CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_,
@@ -93,11 +89,8 @@ class CuDNNPoolingOp : public Operator {
                                      out.dptr_));
     } else if (param_.kernel.ndim() == 3) {
       // 3d pool
-      Tensor<gpu, 5, DType> data = in_data[pool_enum::kData].get<gpu, 5, DType>(s);
-      Tensor<gpu, 5, DType> out = out_data[pool_enum::kOut].get<gpu, 5, DType>(s);
-      if (!init_cudnn_) {
-        this->Init(s, in_data, out_data);
-      }
+      Tensor<gpu, 5, DType> data = in_data.get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> out = out_data.get<gpu, 5, DType>(s);
       CHECK_EQ(data.CheckContiguous(), true);
       CHECK_EQ(out.CheckContiguous(), true);
       CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_,
@@ -113,31 +106,23 @@ class CuDNNPoolingOp : public Operator {
     }
   }
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
+  void Backward(const OpContext &ctx, const TBlob &out_grad,
+      const TBlob &in_data, const TBlob &out_data,
+      const OpReqType &req, const TBlob &in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    CHECK_EQ(req.size(), 1U);
-    CHECK_EQ(in_grad.size(), 1U);
 
     Stream<gpu> *s = ctx.get_stream<gpu>();
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
     typename DataType<DType>::ScaleType alpha = 1.0f;
     typename DataType<DType>::ScaleType beta = 0.0f;
+    this->Init(s, in_data, out_data);
     if (param_.kernel.ndim() == 2) {
       // 2d pool
-      Tensor<gpu, 4, DType> m_out_grad = out_grad[pool_enum::kOut].get<gpu, 4, DType>(s);
-      Tensor<gpu, 4, DType> m_in_data = in_data[pool_enum::kData].get<gpu, 4, DType>(s);
-      Tensor<gpu, 4, DType> m_out_data = out_data[pool_enum::kOut].get<gpu, 4, DType>(s);
-      Tensor<gpu, 4, DType> m_in_grad = in_grad[pool_enum::kData].get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> m_out_grad = out_grad.get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> m_in_data = in_data.get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> m_out_data = out_data.get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> m_in_grad = in_grad.get<gpu, 4, DType>(s);
       CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_,
                                       pooling_desc_,
                                       &alpha,
@@ -152,10 +137,10 @@ class CuDNNPoolingOp : public Operator {
                                       m_in_grad.dptr_));
     } else if (param_.kernel.ndim() == 3) {
       // 3d pool
-      Tensor<gpu, 5, DType> m_out_grad = out_grad[pool_enum::kOut].get<gpu, 5, DType>(s);
-      Tensor<gpu, 5, DType> m_in_data = in_data[pool_enum::kData].get<gpu, 5, DType>(s);
-      Tensor<gpu, 5, DType> m_out_data = out_data[pool_enum::kOut].get<gpu, 5, DType>(s);
-      Tensor<gpu, 5, DType> m_in_grad = in_grad[pool_enum::kData].get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> m_out_grad = out_grad.get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> m_in_data = in_data.get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> m_out_data = out_data.get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> m_in_grad = in_grad.get<gpu, 5, DType>(s);
       CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_,
                                       pooling_desc_,
                                       &alpha,
@@ -174,129 +159,115 @@ class CuDNNPoolingOp : public Operator {
   }
 
  private:
-  inline void Init(mshadow::Stream<gpu> *s,
-                   const std::vector<TBlob> &in_data,
-                   const std::vector<TBlob> &out_data) {
+  inline void Init(mshadow::Stream<gpu> *s, const TBlob &in_data,
+      const TBlob &out_data) {
     using namespace mshadow;
     #if CUDNN_MAJOR >= 5
     nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
     #endif
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    if (!init_cudnn_) {
-      init_cudnn_ = true;
-      if (param_.kernel.ndim() == 2) {
-        // 2d conv
-        Tensor<gpu, 4, DType> data = in_data[pool_enum::kData].get<gpu, 4, DType>(s);
-        Tensor<gpu, 4, DType> out = out_data[pool_enum::kOut].get<gpu, 4, DType>(s);
-        mshadow::Shape<4> dshape = data.shape_;
-        CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_));
-        CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
-        CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
-        CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_,
-                                              CUDNN_TENSOR_NCHW,
-                                              dtype_,
-                                              data.shape_[0],
-                                              data.shape_[1],
-                                              data.shape_[2],
-                                              data.shape_[3]));
-        CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_,
-                                              CUDNN_TENSOR_NCHW,
-                                              dtype_,
-                                              out.shape_[0],
-                                              out.shape_[1],
-                                              out.shape_[2],
-                                              out.shape_[3]));
-        #if CUDNN_MAJOR >= 5
-        CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_,
-                                               mode_,
-                                               nan_prop_,
-                                               param_.global_pool ? dshape[2] : param_.kernel[0],
-                                               param_.global_pool ? dshape[3] : param_.kernel[1],
-                                               param_.pad[0],
-                                               param_.pad[1],
-                                               param_.global_pool ? 1 : param_.stride[0],
-                                               param_.global_pool ? 1 :param_.stride[1]));
-        #else
-        CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_,
-                                               mode_,
-                                               param_.global_pool ? dshape[2] : param_.kernel[0],
-                                               param_.global_pool ? dshape[3] : param_.kernel[1],
-                                               param_.pad[0],
-                                               param_.pad[1],
-                                               param_.global_pool ? 1 : param_.stride[0],
-                                               param_.global_pool ? 1 : param_.stride[1]));
-        #endif
-      } else {
-        Tensor<gpu, 5, DType> data = in_data[pool_enum::kData].get<gpu, 5, DType>(s);
-        Tensor<gpu, 5, DType> out = out_data[pool_enum::kOut].get<gpu, 5, DType>(s);
-        CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_));
-        CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_));
-        CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_));
-        std::vector<int> ishape = {static_cast<int>(data.shape_[0]),
-                                   static_cast<int>(data.shape_[1]),
-                                   static_cast<int>(data.shape_[2]),
-                                   static_cast<int>(data.shape_[3]),
-                                   static_cast<int>(data.shape_[4])};
+    if (param_.kernel.ndim() == 2) {
+      // 2d conv
+      Tensor<gpu, 4, DType> data = in_data.get<gpu, 4, DType>(s);
+      Tensor<gpu, 4, DType> out = out_data.get<gpu, 4, DType>(s);
+      mshadow::Shape<4> dshape = data.shape_;
+      CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_,
+                                            CUDNN_TENSOR_NCHW,
+                                            dtype_,
+                                            data.shape_[0],
+                                            data.shape_[1],
+                                            data.shape_[2],
+                                            data.shape_[3]));
+      CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_,
+                                            CUDNN_TENSOR_NCHW,
+                                            dtype_,
+                                            out.shape_[0],
+                                            out.shape_[1],
+                                            out.shape_[2],
+                                            out.shape_[3]));
+      #if CUDNN_MAJOR >= 5
+      CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_,
+                                             mode_,
+                                             nan_prop_,
+                                             param_.global_pool ? dshape[2] : param_.kernel[0],
+                                             param_.global_pool ? dshape[3] : param_.kernel[1],
+                                             param_.pad[0],
+                                             param_.pad[1],
+                                             param_.global_pool ? 1 : param_.stride[0],
+                                             param_.global_pool ? 1 :param_.stride[1]));
+      #else
+      CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_,
+                                             mode_,
+                                             param_.global_pool ? dshape[2] : param_.kernel[0],
+                                             param_.global_pool ? dshape[3] : param_.kernel[1],
+                                             param_.pad[0],
+                                             param_.pad[1],
+                                             param_.global_pool ? 1 : param_.stride[0],
+                                             param_.global_pool ? 1 : param_.stride[1]));
+      #endif
+    } else {
+      Tensor<gpu, 5, DType> data = in_data.get<gpu, 5, DType>(s);
+      Tensor<gpu, 5, DType> out = out_data.get<gpu, 5, DType>(s);
+      std::vector<int> ishape = {static_cast<int>(data.shape_[0]),
+                                 static_cast<int>(data.shape_[1]),
+                                 static_cast<int>(data.shape_[2]),
+                                 static_cast<int>(data.shape_[3]),
+                                 static_cast<int>(data.shape_[4])};
 
-        std::vector<int> istride = {static_cast<int>(ishape[1] * ishape[2] * ishape[3] * ishape[4]),
-                                    static_cast<int>(ishape[2] * ishape[3] * ishape[4]),
-                                    static_cast<int>(ishape[3] * ishape[4]),
-                                    static_cast<int>(ishape[4]),
-                                    1};
+      std::vector<int> istride = {static_cast<int>(ishape[1] * ishape[2] * ishape[3] * ishape[4]),
+                                  static_cast<int>(ishape[2] * ishape[3] * ishape[4]),
+                                  static_cast<int>(ishape[3] * ishape[4]),
+                                  static_cast<int>(ishape[4]), 1};
 
-        std::vector<int> oshape = {static_cast<int>(out.shape_[0]),
-                                   static_cast<int>(out.shape_[1]),
-                                   static_cast<int>(out.shape_[2]),
-                                   static_cast<int>(out.shape_[3]),
-                                   static_cast<int>(out.shape_[4])};
+      std::vector<int> oshape = {static_cast<int>(out.shape_[0]),
+                                 static_cast<int>(out.shape_[1]),
+                                 static_cast<int>(out.shape_[2]),
+                                 static_cast<int>(out.shape_[3]),
+                                 static_cast<int>(out.shape_[4])};
 
-        std::vector<int> ostride = {static_cast<int>(oshape[1] * oshape[2] * oshape[3] * oshape[4]),
-                                    static_cast<int>(oshape[2] * oshape[3] * oshape[4]),
-                                    static_cast<int>(oshape[3] * oshape[4]),
-                                    static_cast<int>(oshape[4]),
-                                    1};
+      std::vector<int> ostride = {static_cast<int>(oshape[1] * oshape[2] * oshape[3] * oshape[4]),
+                                  static_cast<int>(oshape[2] * oshape[3] * oshape[4]),
+                                  static_cast<int>(oshape[3] * oshape[4]),
+                                  static_cast<int>(oshape[4]), 1};
 
-        std::vector<int> kernel_vec = {param_.global_pool ? ishape[2] :
-                                                            static_cast<int>(param_.kernel[0]),
-                                       param_.global_pool ? ishape[3] :
-                                                            static_cast<int>(param_.kernel[1]),
-                                       param_.global_pool ? ishape[4] :
-                                                            static_cast<int>(param_.kernel[2])};
+      std::vector<int> kernel_vec = {param_.global_pool ? ishape[2] :
+                                                          static_cast<int>(param_.kernel[0]),
+                                     param_.global_pool ? ishape[3] :
+                                                          static_cast<int>(param_.kernel[1]),
+                                     param_.global_pool ? ishape[4] :
+                                                          static_cast<int>(param_.kernel[2])};
 
-        std::vector<int> pad_vec = {param_.global_pool ? 0 : static_cast<int>(param_.pad[0]),
-                                    param_.global_pool ? 0 : static_cast<int>(param_.pad[1]),
-                                    param_.global_pool ? 0 : static_cast<int>(param_.pad[2])};
+      std::vector<int> pad_vec = {param_.global_pool ? 0 : static_cast<int>(param_.pad[0]),
+                                  param_.global_pool ? 0 : static_cast<int>(param_.pad[1]),
+                                  param_.global_pool ? 0 : static_cast<int>(param_.pad[2])};
 
-        std::vector<int> stride_vec = {param_.global_pool ? 1 : static_cast<int>(param_.stride[0]),
-                                       param_.global_pool ? 1 : static_cast<int>(param_.stride[1]),
-                                       param_.global_pool ? 1 : static_cast<int>(param_.stride[2])};
+      std::vector<int> stride_vec = {param_.global_pool ? 1 : static_cast<int>(param_.stride[0]),
+                                     param_.global_pool ? 1 : static_cast<int>(param_.stride[1]),
+                                     param_.global_pool ? 1 : static_cast<int>(param_.stride[2])};
 
-        CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_,
-                                              dtype_,
-                                              static_cast<int>(ishape.size()),
-                                              &ishape[0],
-                                              &istride[0]));
-        CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_,
-                                              dtype_,
-                                              static_cast<int>(oshape.size()),
-                                              &oshape[0],
-                                              &ostride[0]));
-        #if CUDNN_MAJOR >= 5
-        CUDNN_CALL(cudnnSetPoolingNdDescriptor(pooling_desc_,
-                                               mode_,
-                                               nan_prop_,
-                                               static_cast<int>(kernel_vec.size()),
-                                               &(kernel_vec[0]),
-                                               &(pad_vec[0]),
-                                               &(stride_vec[0])));
-        #else
-        LOG(FATAL) << "3D pooling only support CUDNN v5 and abouve";
-        #endif
-      }
+      CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_,
+                                            dtype_,
+                                            static_cast<int>(ishape.size()),
+                                            &ishape[0],
+                                            &istride[0]));
+      CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_,
+                                            dtype_,
+                                            static_cast<int>(oshape.size()),
+                                            &oshape[0],
+                                            &ostride[0]));
+      #if CUDNN_MAJOR >= 5
+      CUDNN_CALL(cudnnSetPoolingNdDescriptor(pooling_desc_,
+                                             mode_,
+                                             nan_prop_,
+                                             static_cast<int>(kernel_vec.size()),
+                                             &(kernel_vec[0]),
+                                             &(pad_vec[0]),
+                                             &(stride_vec[0])));
+      #else
+      LOG(FATAL) << "3D pooling only support CUDNN v5 and abouve";
+      #endif
     }
   }
-  bool init_cudnn_;
+
   cudnnDataType_t dtype_;
   cudnnHandle_t handle_;
   cudnnPoolingMode_t mode_;
diff --git a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h
index 5afdb4844364..239da023668d 100644
--- a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h
+++ b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h
@@ -32,73 +32,64 @@
 
 namespace mxnet {
 namespace op {
-class CuDNNSoftmaxActivationOp : public Operator {
+class CuDNNSoftmaxActivationOp {
  public:
-  explicit CuDNNSoftmaxActivationOp(SoftmaxActivationParam param) {
-    this->param_ = param;
-    init_cudnn_ = false;
+  CuDNNSoftmaxActivationOp() {
     dtype_ = CUDNN_DATA_FLOAT;
+    CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_));
+  }
+
+  void Init(SoftmaxActivationParam param) {
+    this->param_ = param;
   }
 
   ~CuDNNSoftmaxActivationOp() {
-    if (init_cudnn_) {
-      CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_));
-    }
+    CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_));
   }
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
+  void Forward(const OpContext &ctx, const TBlob &in_data,
+      const OpReqType &req, const TBlob &out_data) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
     Stream<gpu> *s = ctx.get_stream<gpu>();
     Tensor<gpu, 4> data;
     Tensor<gpu, 4> out;
     cudnnSoftmaxMode_t softmax_mode;
     if (param_.mode == softmax_activation::kInstance) {
-      CHECK_EQ(in_data[softmax_activation::kData].ndim(), 2)
+      CHECK_EQ(in_data.ndim(), 2)
         << "Input need to have 2 dimensions when mode=instance.";
-      Shape<4> dshape = Shape4(in_data[softmax_activation::kData].shape_[0],
-                               in_data[softmax_activation::kData].shape_[1], 1, 1);
-      data = in_data[softmax_activation::kData].get_with_shape<gpu, 4, real_t>(dshape, s);
-      out = out_data[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
+      Shape<4> dshape = Shape4(in_data.shape_[0], in_data.shape_[1], 1, 1);
+      data = in_data.get_with_shape<gpu, 4, real_t>(dshape, s);
+      out = out_data.get_with_shape<gpu, 4, real_t>(dshape, s);
       softmax_mode = CUDNN_SOFTMAX_MODE_INSTANCE;
     } else {
-      CHECK_GE(in_data[softmax_activation::kData].ndim(), 3)
+      CHECK_GE(in_data.ndim(), 3)
         << "Input need to have a least 3 dimensions when mode=channel";
       Shape<4> dshape;
-      index_t size_left = in_data[softmax_activation::kData].Size();
+      index_t size_left = in_data.Size();
       for (int i = 0; i < 3; ++i) {
-        if (i < in_data[softmax_activation::kData].ndim()) {
-          dshape[i] = in_data[softmax_activation::kData].shape_[i];
+        if (i < in_data.ndim()) {
+          dshape[i] = in_data.shape_[i];
         } else {
           dshape[i] = 1;
         }
         size_left /= dshape[i];
       }
       dshape[3] = size_left;
-      data = in_data[softmax_activation::kData].get_with_shape<gpu, 4, real_t>(dshape, s);
-      out = out_data[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
+      data = in_data.get_with_shape<gpu, 4, real_t>(dshape, s);
+      out = out_data.get_with_shape<gpu, 4, real_t>(dshape, s);
       softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL;
     }
     float alpha = 1.0f;
     float beta = 0.0f;
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
-    if (!init_cudnn_) {
-      init_cudnn_ = true;
-      CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_));
-      CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
-                                            CUDNN_TENSOR_NCHW,
-                                            dtype_,
-                                            data.shape_[0],
-                                            data.shape_[1],
-                                            data.shape_[2],
-                                            data.shape_[3]));
-    }
+    CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
+                                          CUDNN_TENSOR_NCHW,
+                                          dtype_,
+                                          data.shape_[0],
+                                          data.shape_[1],
+                                          data.shape_[2],
+                                          data.shape_[3]));
     CUDNN_CALL(cudnnSoftmaxForward(s->dnn_handle_,
                                    CUDNN_SOFTMAX_ACCURATE,
                                    softmax_mode,
@@ -110,19 +101,10 @@ class CuDNNSoftmaxActivationOp : public Operator {
                                    out.dptr_));
   }
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
+  void Backward(const OpContext &ctx, const TBlob &out_grad,
+      const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    CHECK_EQ(req.size(), 1U);
-    CHECK_EQ(in_grad.size(), 1U);
     float alpha = 1.0f;
     float beta = 0.0f;
     Stream<gpu> *s = ctx.get_stream<gpu>();
@@ -132,31 +114,30 @@ class CuDNNSoftmaxActivationOp : public Operator {
     Tensor<gpu, 4> input_grad;
     cudnnSoftmaxMode_t softmax_mode;
     if (param_.mode == softmax_activation::kInstance) {
-      CHECK_EQ(in_grad[softmax_activation::kData].ndim(), 2)
+      CHECK_EQ(in_grad.ndim(), 2)
         << "Input need to have 2 dimensions when mode=instance.";
-      Shape<4> dshape = Shape4(in_grad[softmax_activation::kData].shape_[0],
-                               in_grad[softmax_activation::kData].shape_[1], 1, 1);
-      grad = out_grad[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
-      output_data = out_data[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
-      input_grad = in_grad[softmax_activation::kData].get_with_shape<gpu, 4, real_t>(dshape, s);
+      Shape<4> dshape = Shape4(in_grad.shape_[0], in_grad.shape_[1], 1, 1);
+      grad = out_grad.get_with_shape<gpu, 4, real_t>(dshape, s);
+      output_data = out_data.get_with_shape<gpu, 4, real_t>(dshape, s);
+      input_grad = in_grad.get_with_shape<gpu, 4, real_t>(dshape, s);
       softmax_mode = CUDNN_SOFTMAX_MODE_INSTANCE;
     } else {
-      CHECK_GE(in_grad[softmax_activation::kData].ndim(), 3)
+      CHECK_GE(in_grad.ndim(), 3)
         << "Input need to have a least 3 dimensions when mode=channel";
       Shape<4> dshape;
-      index_t size_left = in_grad[softmax_activation::kData].Size();
+      index_t size_left = in_grad.Size();
       for (int i = 0; i < 3; ++i) {
-        if (i < in_grad[softmax_activation::kData].ndim()) {
-          dshape[i] = in_grad[softmax_activation::kData].shape_[i];
+        if (i < in_grad.ndim()) {
+          dshape[i] = in_grad.shape_[i];
         } else {
           dshape[i] = 1;
         }
         size_left /= dshape[i];
       }
       dshape[3] = size_left;
-      output_data = out_data[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
-      grad = out_grad[softmax_activation::kOut].get_with_shape<gpu, 4, real_t>(dshape, s);
-      input_grad = in_grad[softmax_activation::kData].get_with_shape<gpu, 4, real_t>(dshape, s);
+      output_data = out_data.get_with_shape<gpu, 4, real_t>(dshape, s);
+      grad = out_grad.get_with_shape<gpu, 4, real_t>(dshape, s);
+      input_grad = in_grad.get_with_shape<gpu, 4, real_t>(dshape, s);
       softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL;
     }
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
@@ -174,7 +155,6 @@ class CuDNNSoftmaxActivationOp : public Operator {
   }
 
  private:
-  bool init_cudnn_;
   cudnnDataType_t dtype_;
   cudnnTensorDescriptor_t shape_desc_;
   SoftmaxActivationParam param_;
diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h
index b7d2676fadf3..f217779f6458 100644
--- a/src/operator/nn/deconvolution-inl.h
+++ b/src/operator/nn/deconvolution-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file deconvolution-inl.h
  * \brief
- * \author Wei Wu
+ * \author Wei Wu, Da Zheng
 */
 #ifndef MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_
 #define MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_
@@ -195,19 +195,18 @@ namespace mxnet {
 namespace op {
 
 template<typename xpu, typename DType>
-class DeconvolutionOp : public Operator {
+class DeconvolutionOp {
  public:
-  explicit DeconvolutionOp(DeconvolutionParam p) {
+  void Init(DeconvolutionParam p) {
     this->param_ = p;
     // convert MBytes first to Bytes and then to elements.
     param_.workspace = (param_.workspace << 20) / sizeof(real_t);
   }
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
+  void Forward(const OpContext &ctx,
+               const std::vector<TBlob> &in_data,
+               const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &out_data) {
     using namespace mshadow;
     using namespace mshadow::expr;
 
@@ -311,19 +310,18 @@ class DeconvolutionOp : public Operator {
     }
   }
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
+  void Backward(const OpContext &ctx,
+                const std::vector<TBlob> &out_grad,
+                const std::vector<TBlob> &in_data,
+                const std::vector<OpReqType> &req,
+                const std::vector<TBlob> &in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
     // TODO(bing): check the BLAS Handle, be careful
     CHECK_EQ(out_grad.size(), 1U);
     size_t expected = param_.no_bias == 0 ? 3 : 2;
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(in_data.size(), expected);
+    CHECK_EQ(in_grad.size(), expected);
     CHECK_EQ(req.size(), expected);
     CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true);
     // get data
@@ -456,300 +454,52 @@ class DeconvolutionOp : public Operator {
 };  // class DeconvolutionOp
 
 template<typename xpu>
-Operator* CreateOp(DeconvolutionParam param, int dtype,
-                   std::vector<TShape> *in_shape,
-                   std::vector<TShape> *out_shape,
-                   Context ctx);
-
-#if DMLC_USE_CXX11
-class DeconvolutionProp : public OperatorProperty {
- public:
-  std::vector<std::string> ListArguments() const override {
-    if (!param_.no_bias) {
-      return {"data", "weight", "bias"};
-    } else {
-      return {"data", "weight"};
-    }
-  }
-
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    using namespace mshadow;
-    param_.Init(kwargs);
-    if (param_.kernel.ndim() == 1) {
-      param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW;
-      if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
-      if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
-      if (param_.adj.ndim() == 0) param_.adj = Shape1(0);
-    } else if (param_.kernel.ndim() == 2) {
-      param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW;
-      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
-      if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
-      if (param_.adj.ndim() == 0) param_.adj = Shape2(0, 0);
-    } else {
-      CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D deconvolution not supported";
-      param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW;
-      if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
-      if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
-      if (param_.adj.ndim() == 0) param_.adj = Shape3(0, 0, 0);
-    }
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-#if MXNET_USE_CUDNN == 0
-    if (param_.kernel.ndim() != 2) {
-      LOG(FATAL) << "If not using CUDNN only 2D-Deconvolution is supported";
-      return false;
-    }
-#endif  // CUDNN
-
-    using namespace mshadow;
-    if (!param_.no_bias) {
-      CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
-    } else {
-      CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
-    }
-    out_shape->resize(1, TShape());
-    const TShape &dshape = (*in_shape)[deconv::kData];
-    if (dshape.ndim() ==  0) return false;
-
-    if (param_.kernel.ndim() == 1) {
-      // 1d conv
-      CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x";
-      Shape<3> dshape_ncw = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW);
-      Shape<3> wshape = Shape3(dshape_ncw[1], param_.num_filter / param_.num_group,
-                               param_.kernel[0]);
-      wshape = ConvertLayout(wshape, kNCW, param_.layout.value());
-      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
-      if (!param_.no_bias) {
-        SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
-      }
-
-      const index_t dilated_ksize_x = param_.DilatedKernelSize(0);
-
-      index_t o_pad[1];
-      index_t o_adj[1];
-      param_.InferPad(dshape_ncw, o_pad, o_adj);
-
-      CHECK_EQ(dshape_ncw[1] % param_.num_group, 0U) \
-        << "input num_filter must divide group size";
-      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
-        << "output num_filter must divide group size";
-      CHECK_GT(param_.kernel.Size(), 0U) \
-        << "incorrect kernel size: " << param_.kernel;
-      CHECK_GT(param_.stride.Size(), 0U) \
-        << "incorrect stride size: " << param_.stride;
-      CHECK_GT(param_.dilate.Size(), 0U) \
-        << "incorrect dilate size: " << param_.dilate;
-
-      CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(x) must be samller than stride[0]";
-
-      Shape<3> oshape;
-      oshape[0] = dshape_ncw[0];
-      oshape[1] = param_.num_filter;
-      oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) +
-        dilated_ksize_x - 2 * o_pad[0] + o_adj[0];
-
-      if (param_.target_shape[0] > 0) {
-        CHECK_EQ(param_.target_shape[0], oshape[2]) \
-          << "param_.target_shape[0] was not reasonable, please set it carefully";
-      }
-
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value()));
-
-      return true;
-    } else if (param_.kernel.ndim() == 2) {
-      // 2d conv
-      CHECK_EQ(dshape.ndim(), 4U) \
-        << "Input data should be 4D in batch-num_filter-y-x";
-      Shape<4> dshape_nchw = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW);
-      Shape<4> wshape = Shape4(dshape_nchw[1],
-                               param_.num_filter / param_.num_group,
-                               param_.kernel[0], param_.kernel[1]);
-      wshape = ConvertLayout(wshape, kNCHW, param_.layout.value());
-      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
-      if (!param_.no_bias) {
-        SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
-      }
-
-      const index_t dilated_ksize_y = param_.DilatedKernelSize(0);
-      const index_t dilated_ksize_x = param_.DilatedKernelSize(1);
-
-      index_t o_pad[2];
-      index_t o_adj[2];
-      param_.InferPad(dshape_nchw, o_pad, o_adj);
-
-      CHECK_EQ(dshape_nchw[1] % param_.num_group, 0U) \
-        << "input num_filter must divide group size";
-      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
-        << "output num_filter must divide group size";
-      CHECK_GT(param_.kernel.Size(), 0U) \
-        << "incorrect kernel size: " << param_.kernel;
-      CHECK_GT(param_.stride.Size(), 0U) \
-        << "incorrect stride size: " << param_.stride;
-      CHECK_GT(param_.dilate.Size(), 0U) \
-          << "incorrect dilate size: " << param_.dilate;
-
-      CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(y) must be samller than stride[0]";
-      CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(x) must be samller than stride[1]";
-
-      Shape<4> oshape;
-      oshape[0] = dshape_nchw[0];
-      oshape[1] = param_.num_filter;
-      oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) +
-        dilated_ksize_y - 2 * o_pad[0] + o_adj[0];
-      oshape[3] = param_.stride[1] * (dshape_nchw[3] - 1) +
-        dilated_ksize_x - 2 * o_pad[1] + o_adj[1];
-
-      if (param_.target_shape[0] > 0) {
-        CHECK_EQ(param_.target_shape[0], oshape[2]) \
-          << "param_.target_shape[0] was not reasonable, please set it carefully";
-      }
-      if (param_.target_shape[1] > 0) {
-        CHECK_EQ(param_.target_shape[1], oshape[3]) \
-          << "param_.target_shape[1] was not reasonable, please set it carefully";
-      }
-
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value()));
-
-      return true;
-    } else if (param_.kernel.ndim() == 3) {
-      // 3d conv
-      CHECK_EQ(dshape.ndim(), 5U) \
-        << "Input data should be 5D in batch-num_filter-depth-y-x";
-      Shape<5> dshape_ncdhw = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW);
-      Shape<5> wshape = Shape5(dshape_ncdhw[1], param_.num_filter / param_.num_group,
-                               param_.kernel[0], param_.kernel[1], param_.kernel[2]);
-      wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value());
-      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
-      if (!param_.no_bias) {
-        SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
-      }
-
-      // Note: 3D dilation currently not supported.
-      // Calculations below done to preserve symmetry with 1D/2D code.
-      const index_t dilated_ksize_d = param_.DilatedKernelSize(0);
-      const index_t dilated_ksize_y = param_.DilatedKernelSize(1);
-      const index_t dilated_ksize_x = param_.DilatedKernelSize(2);
-
-      index_t o_pad[3];
-      index_t o_adj[3];
-      param_.InferPad(dshape_ncdhw, o_pad, o_adj);
-
-      CHECK_EQ(dshape_ncdhw[1] % param_.num_group, 0U) \
-        << "input num_filter must divide group size";
-      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
-        << "output num_filter must divide group size";
-      CHECK_GT(param_.kernel.Size(), 0U) \
-        << "incorrect kernel size: " << param_.kernel;
-      CHECK_GT(param_.stride.Size(), 0U) \
-        << "incorrect stride size: " << param_.stride;
-      CHECK_GT(param_.dilate.Size(), 0U) \
-        << "incorrect dilate size: " << param_.dilate;
-      CHECK_EQ(param_.dilate.Size(), 1U)
-        << "Dilate is not supported in 3d deconvolution";
-
-      CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(d) must be samller than stride[0]";
-      CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(y) must be samller than stride[1]";
-      CHECK_GE(param_.stride[2]-1, o_adj[2]) << "adj(x) must be samller than stride[2]";
-
-      Shape<5> oshape;
-      oshape[0] = dshape_ncdhw[0];
-      oshape[1] = param_.num_filter;
-      oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) +
-        dilated_ksize_d - 2 * o_pad[0] + o_adj[0];
-      oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) +
-        dilated_ksize_y - 2 * o_pad[1] + o_adj[1];
-      oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) +
-        dilated_ksize_x - 2 * o_pad[2] + o_adj[2];
-
-      if (param_.target_shape[0] > 0) {
-        CHECK_EQ(param_.target_shape[0], oshape[2]) \
-          << "param_.target_shape[0] was not reasonable, please it carefully";
-      }
-      if (param_.target_shape[1] > 0) {
-        CHECK_EQ(param_.target_shape[1], oshape[3]) \
-          << "param_.target_shape[1] was not reasonable, please set it carefully";
-      }
-      if (param_.target_shape[2] > 0) {
-        CHECK_EQ(param_.target_shape[2], oshape[4]) \
-          << "param_.target_shape[2] was not reasonable, please set it carefully";
-      }
-
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value()));
-
-      return true;
-    } else {
-      LOG(FATAL) << "Unknown convolution type";
-      return false;
-    }
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_GE(in_type->size(), 1U);
-    int dtype = (*in_type)[0];
-    CHECK_NE(dtype, -1) << "First input must have specified type";
-    for (index_t i = 0; i < in_type->size(); ++i) {
-      if ((*in_type)[i] == -1) {
-        (*in_type)[i] = dtype;
-      } else {
-        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
-      }
-    }
-    out_type->clear();
-    out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new DeconvolutionProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "Deconvolution";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[deconv::kOut], in_data[deconv::kData], in_data[deconv::kWeight]};
-  }
+void _DeconvolutionCompute(const DeconvolutionParam& param,
+                           const OpContext& ctx, const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+  MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, {
+    DeconvolutionOp<xpu, DType> op;
+    op.Init(param);
+    op.Forward(ctx, inputs, req, outputs);
+  });
+}
 
-  std::vector<ResourceRequest> ForwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    return {ResourceRequest::kTempSpace};
-  }
+template<typename xpu>
+void DeconvolutionCompute(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx, const std::vector<TBlob>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<TBlob>& outputs) {
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  _DeconvolutionCompute<xpu>(param, ctx, inputs, req, outputs);
+}
 
-  std::vector<ResourceRequest> BackwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    return {ResourceRequest::kTempSpace};
-  }
+template<typename xpu>
+void _DeconvolutionGradCompute(const DeconvolutionParam& param,
+                               const OpContext& ctx, const std::vector<TBlob>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<TBlob>& outputs) {
+  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
+  const TBlob &out_grad = inputs[0];
+  const std::vector<TBlob> &in_grad = outputs;
+
+  MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, {
+    DeconvolutionOp<xpu, DType> op;
+    op.Init(param);
+    op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+  });
+}
 
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented";
-    return NULL;
-  }
 
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
+template<typename xpu>
+void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs,
+                              const OpContext& ctx, const std::vector<TBlob>& inputs,
+                              const std::vector<OpReqType>& req,
+                              const std::vector<TBlob>& outputs) {
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  _DeconvolutionGradCompute<xpu>(param, ctx, inputs, req, outputs);
+}
 
- private:
-  DeconvolutionParam param_;
-};  // class DeconvolutionProp
-#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_
diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc
index 45867f78593c..f4cba13cb147 100644
--- a/src/operator/nn/deconvolution.cc
+++ b/src/operator/nn/deconvolution.cc
@@ -21,45 +21,405 @@
  * Copyright (c) 2015 by Contributors
  * \file deconvolution.cc
  * \brief
- * \author Wei Wu
+ * \author Wei Wu, Da Zheng
 */
 
 #include "./deconvolution-inl.h"
+#include "./mkldnn/mkldnn_ops-inl.h"
+#include "./mkldnn/mkldnn_base-inl.h"
 
 namespace mxnet {
 namespace op {
-template<>
-Operator* CreateOp<cpu>(DeconvolutionParam param, int dtype,
-                        std::vector<TShape> *in_shape,
-                        std::vector<TShape> *out_shape,
-                        Context ctx) {
-  Operator *op = NULL;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new DeconvolutionOp<cpu, DType>(param);
-  });
-  return op;
+
+static bool DeconvolutionShape(const nnvm::NodeAttrs& attrs,
+                               std::vector<TShape> *in_shape,
+                               std::vector<TShape> *out_shape) {
+  const DeconvolutionParam& param_ = nnvm::get<DeconvolutionParam>(attrs.parsed);
+#if MXNET_USE_CUDNN == 0
+  if (param_.kernel.ndim() != 2) {
+    LOG(FATAL) << "If not using CUDNN only 2D-Deconvolution is supported";
+    return false;
+  }
+#endif  // CUDNN
+
+  using namespace mshadow;
+  if (!param_.no_bias) {
+    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
+  } else {
+    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
+  }
+  out_shape->resize(1, TShape());
+  const TShape &dshape = (*in_shape)[deconv::kData];
+  if (dshape.ndim() ==  0) return false;
+
+  if (param_.kernel.ndim() == 1) {
+    // 1d conv
+    CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x";
+    Shape<3> dshape_ncw = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW);
+    Shape<3> wshape = Shape3(dshape_ncw[1], param_.num_filter / param_.num_group,
+        param_.kernel[0]);
+    wshape = ConvertLayout(wshape, kNCW, param_.layout.value());
+    SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
+    }
+
+    const index_t dilated_ksize_x = param_.DilatedKernelSize(0);
+
+    index_t o_pad[1];
+    index_t o_adj[1];
+    param_.InferPad(dshape_ncw, o_pad, o_adj);
+
+    CHECK_EQ(dshape_ncw[1] % param_.num_group, 0U) \
+      << "input num_filter must divide group size";
+    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+      << "output num_filter must divide group size";
+    CHECK_GT(param_.kernel.Size(), 0U) \
+      << "incorrect kernel size: " << param_.kernel;
+    CHECK_GT(param_.stride.Size(), 0U) \
+      << "incorrect stride size: " << param_.stride;
+    CHECK_GT(param_.dilate.Size(), 0U) \
+      << "incorrect dilate size: " << param_.dilate;
+
+    CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(x) must be samller than stride[0]";
+
+    Shape<3> oshape;
+    oshape[0] = dshape_ncw[0];
+    oshape[1] = param_.num_filter;
+    oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) +
+      dilated_ksize_x - 2 * o_pad[0] + o_adj[0];
+
+    if (param_.target_shape.ndim() > 0) {
+      if (param_.target_shape[0] > 0) {
+        CHECK_EQ(param_.target_shape[0], oshape[2]) \
+          << "param_.target_shape[0] was not reasonable, please set it carefully";
+      }
+    }
+
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value()));
+
+    return true;
+  } else if (param_.kernel.ndim() == 2) {
+    // 2d conv
+    CHECK_EQ(dshape.ndim(), 4U) \
+      << "Input data should be 4D in batch-num_filter-y-x";
+    Shape<4> dshape_nchw = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW);
+    Shape<4> wshape = Shape4(dshape_nchw[1],
+        param_.num_filter / param_.num_group,
+        param_.kernel[0], param_.kernel[1]);
+    wshape = ConvertLayout(wshape, kNCHW, param_.layout.value());
+    SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
+    }
+
+    const index_t dilated_ksize_y = param_.DilatedKernelSize(0);
+    const index_t dilated_ksize_x = param_.DilatedKernelSize(1);
+
+    index_t o_pad[2];
+    index_t o_adj[2];
+    param_.InferPad(dshape_nchw, o_pad, o_adj);
+
+    CHECK_EQ(dshape_nchw[1] % param_.num_group, 0U) \
+      << "input num_filter must divide group size";
+    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+      << "output num_filter must divide group size";
+    CHECK_GT(param_.kernel.Size(), 0U) \
+      << "incorrect kernel size: " << param_.kernel;
+    CHECK_GT(param_.stride.Size(), 0U) \
+      << "incorrect stride size: " << param_.stride;
+    CHECK_GT(param_.dilate.Size(), 0U) \
+      << "incorrect dilate size: " << param_.dilate;
+
+    CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(y) must be samller than stride[0]";
+    CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(x) must be samller than stride[1]";
+
+    Shape<4> oshape;
+    oshape[0] = dshape_nchw[0];
+    oshape[1] = param_.num_filter;
+    oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) +
+      dilated_ksize_y - 2 * o_pad[0] + o_adj[0];
+    oshape[3] = param_.stride[1] * (dshape_nchw[3] - 1) +
+      dilated_ksize_x - 2 * o_pad[1] + o_adj[1];
+
+    if (param_.target_shape.ndim() > 1) {
+      if (param_.target_shape[0] > 0) {
+        CHECK_EQ(param_.target_shape[0], oshape[2]) \
+          << "param_.target_shape[0] was not reasonable, please set it carefully";
+      }
+      if (param_.target_shape[1] > 0) {
+        CHECK_EQ(param_.target_shape[1], oshape[3]) \
+          << "param_.target_shape[1] was not reasonable, please set it carefully";
+      }
+    }
+
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value()));
+
+    return true;
+  } else if (param_.kernel.ndim() == 3) {
+    // 3d conv
+    CHECK_EQ(dshape.ndim(), 5U) \
+      << "Input data should be 5D in batch-num_filter-depth-y-x";
+    Shape<5> dshape_ncdhw = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW);
+    Shape<5> wshape = Shape5(dshape_ncdhw[1], param_.num_filter / param_.num_group,
+        param_.kernel[0], param_.kernel[1], param_.kernel[2]);
+    wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value());
+    SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape);
+    if (!param_.no_bias) {
+      SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter));
+    }
+
+    // Note: 3D dilation currently not supported.
+    // Calculations below done to preserve symmetry with 1D/2D code.
+    const index_t dilated_ksize_d = param_.DilatedKernelSize(0);
+    const index_t dilated_ksize_y = param_.DilatedKernelSize(1);
+    const index_t dilated_ksize_x = param_.DilatedKernelSize(2);
+
+    index_t o_pad[3];
+    index_t o_adj[3];
+    param_.InferPad(dshape_ncdhw, o_pad, o_adj);
+
+    CHECK_EQ(dshape_ncdhw[1] % param_.num_group, 0U) \
+      << "input num_filter must divide group size";
+    CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+      << "output num_filter must divide group size";
+    CHECK_GT(param_.kernel.Size(), 0U) \
+      << "incorrect kernel size: " << param_.kernel;
+    CHECK_GT(param_.stride.Size(), 0U) \
+      << "incorrect stride size: " << param_.stride;
+    CHECK_GT(param_.dilate.Size(), 0U) \
+      << "incorrect dilate size: " << param_.dilate;
+    CHECK_EQ(param_.dilate.Size(), 1U)
+      << "Dilate is not supported in 3d deconvolution";
+
+    CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(d) must be samller than stride[0]";
+    CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(y) must be samller than stride[1]";
+    CHECK_GE(param_.stride[2]-1, o_adj[2]) << "adj(x) must be samller than stride[2]";
+
+    Shape<5> oshape;
+    oshape[0] = dshape_ncdhw[0];
+    oshape[1] = param_.num_filter;
+    oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) +
+      dilated_ksize_d - 2 * o_pad[0] + o_adj[0];
+    oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) +
+      dilated_ksize_y - 2 * o_pad[1] + o_adj[1];
+    oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) +
+      dilated_ksize_x - 2 * o_pad[2] + o_adj[2];
+
+    if (param_.target_shape.ndim() > 2) {
+      if (param_.target_shape[0] > 0) {
+        CHECK_EQ(param_.target_shape[0], oshape[2]) \
+          << "param_.target_shape[0] was not reasonable, please it carefully";
+      }
+      if (param_.target_shape[1] > 0) {
+        CHECK_EQ(param_.target_shape[1], oshape[3]) \
+          << "param_.target_shape[1] was not reasonable, please set it carefully";
+      }
+      if (param_.target_shape[2] > 0) {
+        CHECK_EQ(param_.target_shape[2], oshape[4]) \
+          << "param_.target_shape[2] was not reasonable, please set it carefully";
+      }
+    }
+
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value()));
+
+    return true;
+  } else {
+    LOG(FATAL) << "Unknown convolution type";
+    return false;
+  }
+}
+
+static inline std::vector<std::string> ListArguments(const DeconvolutionParam& param_) {
+  if (!param_.no_bias) {
+    return {"data", "weight", "bias"};
+  } else {
+    return {"data", "weight"};
+  }
+}
+
+static bool DeconvolutionType(const nnvm::NodeAttrs& attrs,
+                              std::vector<int> *in_type, std::vector<int> *out_type) {
+  const DeconvolutionParam& param_ = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  CHECK_GE(in_type->size(), 1U);
+  int dtype = (*in_type)[0];
+  CHECK_NE(dtype, -1) << "First input must have specified type";
+  for (index_t i = 0; i < in_type->size(); ++i) {
+    if ((*in_type)[i] == -1) {
+      (*in_type)[i] = dtype;
+    } else {
+      UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]);
+    }
+  }
+  out_type->clear();
+  out_type->push_back(dtype);
+  return true;
+}
+
+inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs,
+                                     const int dev_mask,
+                                     DispatchMode* dispatch_mode,
+                                     std::vector<int> *in_attrs,
+                                     std::vector<int> *out_attrs) {
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  uint32_t in_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(in_attrs->size(), in_expected);
+  CHECK_EQ(out_attrs->size(), 1);
+
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask)
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    *dispatch_mode = DispatchMode::kFCompute;
+  (*out_attrs)[0] = kDefaultStorage;
+  return true;
+}
+
+inline static bool BackwardDeconvStorageType(const nnvm::NodeAttrs& attrs,
+                                             const int dev_mask,
+                                             DispatchMode* dispatch_mode,
+                                             std::vector<int> *in_attrs,
+                                             std::vector<int> *out_attrs) {
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  uint32_t out_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(in_attrs->size(), param.no_bias ? 3U : 4U);
+  CHECK_EQ(out_attrs->size(), out_expected);
+
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask)
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    *dispatch_mode = DispatchMode::kFCompute;
+  for (size_t i = 0; i < out_attrs->size(); i++)
+    (*out_attrs)[i] = kDefaultStorage;
+  return true;
+}
+
+static void DeconvolutionCompute_CPU(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<NDArray>& inputs,
+    const std::vector<OpReqType>& req, const std::vector<NDArray>& outputs) {
+#if MXNET_USE_MKLDNN == 1
+  if (SupportMKLDNNConv(inputs[0])) {
+    MKLDNNDeconvolutionForward(attrs, ctx, inputs, req, outputs);
+    return;
+  }
+#endif
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++)
+    in_blobs[i] = inputs[i].data();
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++)
+    out_blobs[i] = outputs[i].data();
+  DeconvolutionCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
 }
 
-Operator* DeconvolutionProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                              std::vector<int> *in_type) const {
-  std::vector<TShape> out_shape, aux_shape;
-  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
-  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0), in_shape, &out_shape, ctx);
+static void DeconvolutionGradCompute_CPU(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<NDArray>& inputs,
+    const std::vector<OpReqType>& req, const std::vector<NDArray>& outputs) {
+#if MXNET_USE_MKLDNN == 1
+  if (SupportMKLDNNConv(inputs[0])) {
+    MKLDNNDeconvolutionBackward(attrs, ctx, inputs, req, outputs);
+    return;
+  }
+#endif
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++)
+    in_blobs[i] = inputs[i].data();
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++)
+    out_blobs[i] = outputs[i].data();
+  DeconvolutionGradCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
 }
 
+static void DeconvolutionParamParser(nnvm::NodeAttrs* attrs) {
+  using namespace mshadow;
+  DeconvolutionParam param_;
+  param_.Init(attrs->dict);
+  if (param_.kernel.ndim() == 1) {
+    param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW;
+    if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
+    if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1);
+    if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
+    if (param_.adj.ndim() == 0) param_.adj = Shape1(0);
+  } else if (param_.kernel.ndim() == 2) {
+    param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW;
+    if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
+    if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1);
+    if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
+    if (param_.adj.ndim() == 0) param_.adj = Shape2(0, 0);
+  } else {
+    CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D deconvolution not supported";
+    param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW;
+    if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
+    if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1);
+    if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
+    if (param_.adj.ndim() == 0) param_.adj = Shape3(0, 0, 0);
+  }
+  attrs->parsed = std::move(param_);
+}
+
+struct DeconvolutionGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
+    heads.push_back(n->inputs[deconv::kData]);
+    heads.push_back(n->inputs[deconv::kWeight]);
+    const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(n->attrs.parsed);
+    if (!param.no_bias)
+      heads.push_back(n->inputs[deconv::kBias]);
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
+
 DMLC_REGISTER_PARAMETER(DeconvolutionParam);
 
-MXNET_REGISTER_OP_PROPERTY(Deconvolution, DeconvolutionProp)
-.add_argument("data", "NDArray-or-Symbol", "Input tensor to the deconvolution operation.")
-.add_argument("weight", "NDArray-or-Symbol", "Weights representing the kernel.")
-.add_argument("bias", "NDArray-or-Symbol", "Bias added to the result after the deconvolution "
-    "operation.")
-.add_arguments(DeconvolutionParam::__FIELDS__())
+NNVM_REGISTER_OP(Deconvolution)
 .describe("Computes 2D transposed convolution (aka fractionally strided convolution) of the "
     "input tensor. This operation can be seen as the gradient of Convolution operation with "
     "respect to its input. Convolution usually reduces the size of the input. Transposed "
     "convolution works the other way, going from a smaller input to a larger output while "
-    "preserving the connectivity pattern.");
+    "preserving the connectivity pattern.")
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const DeconvolutionParam& params = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  return params.no_bias ? 2 : 3;
+})
+.set_num_outputs(1)
+.set_attr_parser(DeconvolutionParamParser)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  return ListArguments(nnvm::get<DeconvolutionParam>(attrs.parsed));
+})
+.set_attr<nnvm::FInferShape>("FInferShape", DeconvolutionShape)
+.set_attr<nnvm::FInferType>("FInferType", DeconvolutionType)
+.set_attr<FInferStorageType>("FInferStorageType", DeconvStorageType)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+.set_attr<FCompute>("FCompute<cpu>", DeconvolutionCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", DeconvolutionCompute_CPU)
+.set_attr<nnvm::FGradient>("FGradient", DeconvolutionGrad{"_backward_Deconvolution"})
+.add_argument("data", "NDArray-or-Symbol", "Input tensor to the deconvolution operation.")
+.add_argument("weight", "NDArray-or-Symbol", "Weights representing the kernel.")
+.add_argument("bias", "NDArray-or-Symbol", "Bias added to the result after the deconvolution "
+    "operation.")
+.add_arguments(DeconvolutionParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_backward_Deconvolution)
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const DeconvolutionParam& params = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  return params.no_bias ? 2 : 3;
+})
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FInferStorageType>("FInferStorageType", BackwardDeconvStorageType)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+.set_attr_parser(DeconvolutionParamParser)
+.set_attr<FCompute>("FCompute<cpu>", DeconvolutionGradCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", DeconvolutionGradCompute_CPU);
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu
index 6d0787662c64..9e8840cade85 100644
--- a/src/operator/nn/deconvolution.cu
+++ b/src/operator/nn/deconvolution.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file deconvolution.cu
  * \brief
- * \author Wei Wu
+ * \author Wei Wu, Da Zheng
 */
 
 #include "./deconvolution-inl.h"
@@ -31,19 +31,37 @@
 
 namespace mxnet {
 namespace op {
+
+#if MXNET_USE_CUDNN == 1
+template<typename DType>
+static CuDNNDeconvolutionOp<DType> &GetCuDNNDeconvOp(const DeconvolutionParam& param,
+                                                     int forward_compute_type,
+                                                     int backward_compute_type,
+                                                     const std::vector<TShape>& in_shape,
+                                                     const std::vector<TShape>& out_shape,
+                                                     const Context& ctx) {
+  static thread_local CuDNNDeconvolutionOp<DType> op;
+  op.Init(param, forward_compute_type, backward_compute_type, in_shape, out_shape, ctx);
+  return op;
+}
+#endif
+
 template<>
-Operator* CreateOp<gpu>(DeconvolutionParam param, int dtype,
-                        std::vector<TShape> *in_shape,
-                        std::vector<TShape> *out_shape,
-                        Context ctx) {
-  // Logic here parallels that in Convolution.cu
-  Operator *op = NULL;
+void DeconvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                               const OpContext& ctx,
+                               const std::vector<TBlob>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<TBlob>& outputs) {
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  int dtype = inputs[0].type_flag_;
   // If 1D deconvolution, use MXNet implementation
   if (param.kernel.ndim() == 1) {
     MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      op = new DeconvolutionOp<gpu, DType>(param);
+      DeconvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Forward(ctx, inputs, req, outputs);
     })
-    return op;
+    return;
   }
 #if MXNET_USE_CUDNN == 1
   // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16).
@@ -51,23 +69,97 @@ Operator* CreateOp<gpu>(DeconvolutionParam param, int dtype,
 
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     if (param.cudnn_off) {
-      op = new DeconvolutionOp<gpu, DType>(param);
-    } else if (!CuDNNDeconvolutionOp<DType>::Supports(param, compute_type, compute_type, ctx)) {
+      DeconvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Forward(ctx, inputs, req, outputs);
+    } else if (!CuDNNDeconvolutionOp<DType>::Supports(param,
+          compute_type, compute_type, ctx.run_ctx.ctx)) {
       LOG(WARNING) <<
         "This deconvolution is not supported by cudnn, MXNET deconvolution is applied.";
-      op = new DeconvolutionOp<gpu, DType>(param);
+      DeconvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Forward(ctx, inputs, req, outputs);
     } else {
-      op = new CuDNNDeconvolutionOp<DType>(param, compute_type, compute_type,
-                                           *in_shape, *out_shape, ctx);
+      std::vector<TShape> in_shape(inputs.size());
+      std::vector<TShape> out_shape(1, outputs[0].shape_);
+      for (size_t i = 0; i < in_shape.size(); i++) {
+        in_shape[i] = inputs[i].shape_;
+      }
+      GetCuDNNDeconvOp<DType>(param, compute_type, compute_type,
+          in_shape, out_shape, ctx.run_ctx.ctx).Forward(ctx, inputs, req, outputs);
     }
   })
 #else
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new DeconvolutionOp<gpu, DType>(param);
+    DeconvolutionOp<gpu, DType> op;
+    op.Init(param);
+    op.Forward(ctx, inputs, req, outputs);
+  })
+#endif  // MXNET_USE_CUDNN
+}
+
+template<>
+void DeconvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                                   const OpContext& ctx,
+                                   const std::vector<TBlob>& inputs,
+                                   const std::vector<OpReqType>& req,
+                                   const std::vector<TBlob>& outputs) {
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
+  const TBlob &out_grad = inputs[0];
+  const std::vector<TBlob> &in_grad = outputs;
+  int dtype = out_grad.type_flag_;
+
+  // If 1D deconvolution, use MXNet implementation
+  if (param.kernel.ndim() == 1) {
+    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+      DeconvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    })
+    return;
+  }
+#if MXNET_USE_CUDNN == 1
+  // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16).
+  int compute_type = (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype;
+
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    if (param.cudnn_off) {
+      DeconvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    } else if (!CuDNNDeconvolutionOp<DType>::Supports(param,
+          compute_type, compute_type, ctx.run_ctx.ctx)) {
+      LOG(WARNING) <<
+        "This deconvolution is not supported by cudnn, MXNET deconvolution is applied.";
+      DeconvolutionOp<gpu, DType> op;
+      op.Init(param);
+      op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    } else {
+      std::vector<TShape> in_shape(in_data.size());
+      std::vector<TShape> out_shape(1, out_grad.shape_);
+      for (size_t i = 0; i < in_shape.size(); i++) {
+        in_shape[i] = in_data[i].shape_;
+      }
+      GetCuDNNDeconvOp<DType>(param, compute_type, compute_type,
+          in_shape, out_shape, ctx.run_ctx.ctx).Backward(ctx,
+            std::vector<TBlob>{out_grad}, in_data, req, in_grad);
+    }
+  })
+#else
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    DeconvolutionOp<gpu, DType> op;
+    op.Init(param);
+    op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
   })
 #endif  // MXNET_USE_CUDNN
-  return op;
 }
 
+NNVM_REGISTER_OP(Deconvolution)
+.set_attr<FCompute>("FCompute<gpu>", DeconvolutionCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_Deconvolution)
+.set_attr<FCompute>("FCompute<gpu>", DeconvolutionGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/depthwise_convolution-inl.h b/src/operator/nn/depthwise_convolution-inl.h
index c4b7a4787554..0af8cae51c84 100644
--- a/src/operator/nn/depthwise_convolution-inl.h
+++ b/src/operator/nn/depthwise_convolution-inl.h
@@ -39,11 +39,11 @@ namespace mxnet {
 namespace op {
 using namespace tf::depthwise_conv;
 template<typename DType>
-class DepthwiseConvolutionOp : public Operator {
+class DepthwiseConvolutionOp {
  public:
-  explicit DepthwiseConvolutionOp(const ConvolutionParam& param,
-                                  const std::vector<TShape>& in_shape,
-                                  const std::vector<TShape>& out_shape) {
+  void Init(const ConvolutionParam& param,
+            const std::vector<TShape>& in_shape,
+            const std::vector<TShape>& out_shape) {
     args_.batch = in_shape[conv::kData][0];
     args_.in_channel = in_shape[conv::kData][1];
     args_.in_height = in_shape[conv::kData][2];
@@ -62,19 +62,16 @@ class DepthwiseConvolutionOp : public Operator {
 
   ~DepthwiseConvolutionOp() {}
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args);
+  void Forward(const OpContext &ctx,
+               const std::vector<TBlob> &in_data,
+               const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &out_data);
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args);
+  void Backward(const OpContext &ctx,
+                const std::vector<TBlob> &out_grad,
+                const std::vector<TBlob> &in_data,
+                const std::vector<OpReqType> &req,
+                const std::vector<TBlob> &in_grad);
 
  private:
   DepthwiseArgs args_;
@@ -282,8 +279,7 @@ template<typename DType>
 void DepthwiseConvolutionOp<DType>::Forward(const OpContext &ctx,
                                             const std::vector<TBlob> &in_data,
                                             const std::vector<OpReqType> &req,
-                                            const std::vector<TBlob> &out_data,
-                                            const std::vector<TBlob> &aux_states) {
+                                            const std::vector<TBlob> &out_data) {
   using namespace mshadow;
   using namespace mshadow::expr;
   auto stream = ctx.get_stream<gpu>();
@@ -305,10 +301,8 @@ template<typename DType>
 void DepthwiseConvolutionOp<DType>::Backward(const OpContext &ctx,
                                              const std::vector<TBlob> &out_grad,
                                              const std::vector<TBlob> &in_data,
-                                             const std::vector<TBlob> &out_data,
                                              const std::vector<OpReqType> &req,
-                                             const std::vector<TBlob> &in_grad,
-                                             const std::vector<TBlob> &aux_states) {
+                                             const std::vector<TBlob> &in_grad) {
   using namespace mshadow;
   using namespace mshadow::expr;
   auto stream = ctx.get_stream<gpu>();
diff --git a/src/operator/nn/depthwise_convolution_tf.cuh b/src/operator/nn/depthwise_convolution_tf.cuh
index c7f48e686136..e4dfd8292d2d 100644
--- a/src/operator/nn/depthwise_convolution_tf.cuh
+++ b/src/operator/nn/depthwise_convolution_tf.cuh
@@ -24,8 +24,8 @@
  *        are different with origin version.
  * \author shuqian.qu@hobot.cc
 */
-#ifndef MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_
-#define MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_
+#ifndef MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_
+#define MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_
 #include "../../common/cuda_utils.h"
 #include "../mxnet_op.h"
 
@@ -730,4 +730,4 @@ bool TryLaunchDepthwiseConv2dBackwardFilterGPUSmall(mshadow::Stream<mxnet::gpu>
 }  // namespace depthwise_conv
 }  // namespace tf
 
-#endif  // MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_
+#endif  // MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_
diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h
index 4c8a5eee68ce..1d2c9eaeb456 100644
--- a/src/operator/nn/dropout-inl.h
+++ b/src/operator/nn/dropout-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file dropout-inl.h
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 
 #ifndef MXNET_OPERATOR_NN_DROPOUT_INL_H_
@@ -38,13 +38,6 @@
 #include "../operator_common.h"
 #include "../mshadow_op.h"
 
-#if defined(USE_MKL) && defined(_OPENMP)
-#include <omp.h>
-
-#include <mkl_vml_functions.h>
-#include <mkl_vsl.h>
-#endif  // USE_MKL && _OPENMP
-
 namespace dropout {
 enum DropoutOpInputs {kData};
 enum DropoutOpOutputs {kOut, kMask};
@@ -55,28 +48,6 @@ enum DropoutOpMode {kTraining, kAlways};
 namespace mxnet {
 namespace op {
 
-#if defined(USE_MKL) && defined(_OPENMP)
-static void bernoulli_generate(int n, double p, int* r) {
-  const int seed = 17 + rand() % 4096;  // NOLINT(runtime/threadsafe_fn)
-  const int nthr = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
-# pragma omp parallel num_threads(nthr)
-  {
-    const int ithr = omp_get_thread_num();
-    const int avg_amount = (n + nthr - 1) / nthr;
-    const int my_offset = ithr * avg_amount;
-    const int my_amount = std::min(my_offset + avg_amount, n) - my_offset;
-    if (my_amount > 0) {
-      VSLStreamStatePtr stream;
-      vslNewStream(&stream, VSL_BRNG_MCG31, seed);
-      vslSkipAheadStream(stream, my_offset);
-      viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, stream, my_amount,
-        r + my_offset, p);
-      vslDeleteStream(&stream);
-    }
-  }
-}
-#endif  // USE_MKL && _OPENMP
-
 struct DropoutParam : public dmlc::Parameter<DropoutParam> {
   float p;
   int mode;
@@ -93,196 +64,78 @@ struct DropoutParam : public dmlc::Parameter<DropoutParam> {
 };  // struct DropoutParam
 
 template<typename xpu, typename DType>
-class DropoutOp : public Operator {
- public:
-  explicit DropoutOp(DropoutParam param) {
-    this->pkeep_ = 1.0f - param.p;
-    this->mode_ = param.mode;
-  }
-
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1U);
-    if (ctx.is_train) {
-      CHECK_EQ(out_data.size(), 2U);
-    }
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 2, DType> data = in_data[dropout::kData].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> out = out_data[dropout::kOut].FlatTo2D<xpu, DType>(s);
-    if (ctx.is_train || mode_ == dropout::kAlways) {
-      Tensor<xpu, 2, DType> mask = out_data[dropout::kMask].FlatTo2D<xpu, DType>(s);
-#if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP)
-      DType* outptr = out.dptr_;
-      DType* dataptr = data.dptr_;
-      auto maskptr = reinterpret_cast<int*>(mask.dptr_);
-      int count = mask.shape_[0]*mask.shape_[1];
-      bernoulli_generate(count, this->pkeep_, maskptr);
-      const float pk_1 = 1.0f / pkeep_;
-      #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
-      for (int i = 0; i < count; ++i) {
-        outptr[i] = dataptr[i] * maskptr[i] * pk_1;
-      }
-#else
-      Random<xpu> *prnd = ctx.requested[dropout::kRandom].get_random<xpu, real_t>(s);
-      mask = tcast<DType>(F<mshadow_op::threshold>(
-             prnd->uniform(mask.shape_), pkeep_) * (1.0f / pkeep_));
-      Assign(out, req[dropout::kOut], data * mask);
-#endif  // USE_MKL && _OPENMP
-    } else {
-      Assign(out, req[dropout::kOut], F<mshadow_op::identity>(data));
-    }
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_states) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_grad.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 2, DType> grad = out_grad[dropout::kOut].FlatTo2D<xpu, DType>(s);
+void DropoutForward(const OpContext &ctx, const DropoutParam &param,
+                    const std::vector<TBlob> &in_data, const std::vector<OpReqType> &req,
+                    const std::vector<TBlob> &out_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  real_t pkeep_ = 1.0f - param.p;
+  int mode_ = param.mode;
+  CHECK_EQ(in_data.size(), 1U);
+  if (ctx.is_train) {
+    CHECK_EQ(out_data.size(), 2U);
+  }
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  Tensor<xpu, 2, DType> data = in_data[dropout::kData].FlatTo2D<xpu, DType>(s);
+  Tensor<xpu, 2, DType> out = out_data[dropout::kOut].FlatTo2D<xpu, DType>(s);
+  if (ctx.is_train || mode_ == dropout::kAlways) {
     Tensor<xpu, 2, DType> mask = out_data[dropout::kMask].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> gdata = in_grad[dropout::kData].FlatTo2D<xpu, DType>(s);
-    if (ctx.is_train || mode_ == dropout::kAlways) {
-#if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP)
-      DType* ingradptr = gdata.dptr_;
-      DType* outgradptr = grad.dptr_;
-      auto maskptr = reinterpret_cast<int*>(mask.dptr_);
-      int count = mask.shape_[0]*mask.shape_[1];
-      const float pk_1 = 1.0f / pkeep_;
-      #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
-      for (int i = 0; i < count; ++i) {
-        ingradptr[i] = outgradptr[i] * maskptr[i] * pk_1;
-      }
-#else  // USE_MKL && _OPENMP
-      CHECK_EQ(grad.shape_.Size(), mask.shape_.Size());
-      Assign(gdata, req[dropout::kData], grad * mask);
-#endif  // USE_MKL && _OPENMP
-    } else {
-      Assign(gdata, req[dropout::kData], F<mshadow_op::identity>(grad));
-    }
-  }
-
- private:
-  real_t pkeep_;
-  int mode_;
-};  // class DropoutOp
-
-
-template<typename xpu>
-Operator *CreateOp(DropoutParam param, int dtype);
-
-#if DMLC_USE_CXX11
-class DropoutProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 1U);
-    const TShape &dshape = in_shape->at(0);
-    if (dshape.ndim() == 0) return false;
-    out_shape->clear();
-    out_shape->push_back(dshape);
-    out_shape->push_back(dshape);
-    return true;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_EQ(in_type->size(), 1U);
-    int dtype = in_type->at(0);
-
-    if (dtype == -1) {
-      LOG(FATAL) << "input type to dropout is not specified.";
-      return false;
-    }
-
-    size_t nout = this->ListOutputs().size();
-    out_type->clear();
-    for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new DropoutProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "Dropout";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[dropout::kOut], out_data[dropout::kMask]};
-  }
-
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-    return {{out_grad[dropout::kOut], in_grad[dropout::kData]}};
-  }
-
-  std::vector<std::pair<int, void*> > ForwardInplaceOption(
-    const std::vector<int> &in_data,
-    const std::vector<void*> &out_data) const override {
-    return {{in_data[dropout::kData], out_data[dropout::kOut]}};
-  }
-
-  std::vector<ResourceRequest> ForwardResource(
-    const std::vector<TShape> &in_shape) const override {
-    return {ResourceRequest::kRandom};
-  }
-
-  int NumVisibleOutputs() const override {
-    return 1;
-  }
-
-  int NumOutputs() const override {
-    return 2;
+    Random<xpu> *prnd = ctx.requested[dropout::kRandom].get_random<xpu, real_t>(s);
+    mask = tcast<DType>(F<mshadow_op::threshold>(
+            prnd->uniform(mask.shape_), pkeep_) * (1.0f / pkeep_));
+    Assign(out, req[dropout::kOut], data * mask);
+  } else {
+    Assign(out, req[dropout::kOut], F<mshadow_op::identity>(data));
   }
+}
 
-  std::vector<std::string> ListOutputs() const override {
-    return {"output", "mask"};
+template<typename xpu, typename DType>
+void DropoutBackward(const OpContext &ctx, const DropoutParam &param,
+                     const TBlob &out_grad, const TBlob &out_data_mask,
+                     const OpReqType &req, const TBlob &in_grad) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  int mode_ = param.mode;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  Tensor<xpu, 2, DType> grad = out_grad.FlatTo2D<xpu, DType>(s);
+  Tensor<xpu, 2, DType> mask = out_data_mask.FlatTo2D<xpu, DType>(s);
+  Tensor<xpu, 2, DType> gdata = in_grad.FlatTo2D<xpu, DType>(s);
+  if (ctx.is_train || mode_ == dropout::kAlways) {
+    Assign(gdata, req, grad * mask);
+  } else {
+    Assign(gdata, req, F<mshadow_op::identity>(grad));
   }
+}
 
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented";
-    return NULL;
-  }
+template<typename xpu>
+void DropoutCompute(const nnvm::NodeAttrs& attrs,
+                    const OpContext& ctx,
+                    const std::vector<TBlob>& inputs,
+                    const std::vector<OpReqType>& req,
+                    const std::vector<TBlob>& outputs) {
+  const DropoutParam& param = nnvm::get<DropoutParam>(attrs.parsed);
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+    DropoutForward<xpu, DType>(ctx, param, inputs, req, outputs);
+  });
+}
 
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
+template<typename xpu>
+void DropoutGradCompute(const nnvm::NodeAttrs& attrs,
+                        const OpContext& ctx,
+                        const std::vector<TBlob>& inputs,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<TBlob>& outputs) {
+  const DropoutParam& param = nnvm::get<DropoutParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1);
+  CHECK_EQ(req.size(), 1);
+
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+    DropoutBackward<xpu, DType>(ctx, param, inputs[0], inputs[1], req[0],
+                                outputs[0]);
+  });
+}
 
- private:
-  DropoutParam param_;
-};  // class DropoutProp
-#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_DROPOUT_INL_H_
diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc
index bbf5e2dea25b..9facb6d26680 100644
--- a/src/operator/nn/dropout.cc
+++ b/src/operator/nn/dropout.cc
@@ -21,31 +21,28 @@
  * Copyright (c) 2015 by Contributors
  * \file dropout.cc
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 
 #include "./dropout-inl.h"
 
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<cpu>(DropoutParam param, int dtype) {
-  Operator *op = NULL;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new DropoutOp<cpu, DType>(param);
-  });
-  return op;
-}
-
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *DropoutProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                              std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
-}
+
+struct DropoutGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    std::vector<nnvm::NodeEntry> heads;
+    heads.push_back(ograds[0]);
+    heads.emplace_back(nnvm::NodeEntry{n, dropout::kMask, 0});
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
 
 DMLC_REGISTER_PARAMETER(DropoutParam);
 
-MXNET_REGISTER_OP_PROPERTY(Dropout, DropoutProp)
+NNVM_REGISTER_OP(Dropout)
 .describe(R"(Applies dropout operation to input array.
 
 - During training, each element of the input is set to zero with probability p.
@@ -76,8 +73,66 @@ Example::
   [[ 3.     0.5   -0.5    2.     7.   ]
    [ 2.    -0.4    7.     3.     0.2  ]]
 )" ADD_FILELINE)
+.set_num_inputs(1)
+.set_num_outputs(2)
+.set_attr_parser(ParamParser<DropoutParam>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"data"};
+})
+.set_attr<nnvm::FListOutputNames>("FListOutputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"output", "mask"};
+})
+.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
+    [](const NodeAttrs& attrs) {
+  return 1;
+})
+.set_attr<nnvm::FInferShape>("FInferShape", [](const nnvm::NodeAttrs& attrs,
+      std::vector<TShape> *in_shape, std::vector<TShape> *out_shape){
+  using namespace mshadow;
+  CHECK_EQ(in_shape->size(), 1U);
+  const TShape &dshape = in_shape->at(0);
+  if (dshape.ndim() == 0) return false;
+  out_shape->clear();
+  out_shape->push_back(dshape);
+  out_shape->push_back(dshape);
+  return true;
+})
+.set_attr<nnvm::FInferType>("FInferType", [](const nnvm::NodeAttrs& attrs,
+      std::vector<int> *in_type, std::vector<int> *out_type) {
+  CHECK_EQ(in_type->size(), 1U);
+  int dtype = in_type->at(0);
+
+  if (dtype == -1) {
+    LOG(FATAL) << "input type to dropout is not specified.";
+    return false;
+  }
+
+  size_t nout = 2;
+  out_type->clear();
+  for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype);
+  return true;
+})
+.set_attr<FCompute>("FCompute<cpu>", DropoutCompute<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", DropoutGrad{"_backward_Dropout"})
+.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
+  return std::vector<std::pair<int, int> >{{0, 0}};
+})
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kRandom};
+})
 .add_argument("data", "NDArray-or-Symbol", "Input array to which dropout will be applied.")
 .add_arguments(DropoutParam::__FIELDS__());
 
+NNVM_REGISTER_OP(_backward_Dropout)
+.set_num_outputs(1)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr_parser(ParamParser<DropoutParam>)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
+  return std::vector<std::pair<int, int> >{{0, 0}};
+})
+.set_attr<FCompute>("FCompute<cpu>", DropoutGradCompute<cpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu
index f416c5883203..e655278822a4 100644
--- a/src/operator/nn/dropout.cu
+++ b/src/operator/nn/dropout.cu
@@ -21,21 +21,20 @@
  * Copyright (c) 2015 by Contributors
  * \file dropout.cc
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 
 #include "./dropout-inl.h"
 
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<gpu>(DropoutParam param, int dtype) {
-  Operator *op = NULL;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new DropoutOp<gpu, DType>(param);
-  });
-  return op;
-}
+
+NNVM_REGISTER_OP(Dropout)
+.set_attr<FCompute>("FCompute<gpu>", DropoutCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_Dropout)
+.set_attr<FCompute>("FCompute<gpu>", DropoutGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
 
diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h
index 9f3deec2449f..e7d1a25b79ab 100644
--- a/src/operator/nn/fully_connected-inl.h
+++ b/src/operator/nn/fully_connected-inl.h
@@ -43,6 +43,7 @@ namespace op {
 // These enums are only visible within this header
 namespace fullc {
 enum FullyConnectedOpInputs {kData, kWeight, kBias};
+enum FullyConnectedOpResource {kTempSpace};
 enum FullyConnectedOpOutputs {kOut};
 }  // fullc
 
@@ -59,242 +60,167 @@ struct FullyConnectedParam : public dmlc::Parameter<FullyConnectedParam> {
     DMLC_DECLARE_FIELD(flatten).set_default(true)
     .describe("Whether to collapse all but the first axis of the input data tensor.");
   }
+  bool operator==(const FullyConnectedParam& other) const {
+    return this->num_hidden == other.num_hidden &&
+           this->no_bias == other.no_bias &&
+           this->flatten == other.flatten;
+  }
 };
 
-/**
- * \brief This is the implementation of fully connected operator.
- * \tparam xpu The device that the op will be executed on.
- */
 template<typename xpu, typename DType>
-class FullyConnectedOp : public Operator {
- public:
-  explicit FullyConnectedOp(FullyConnectedParam p) {
-    this->param_ = p;
-  }
-
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    if (req[fullc::kOut] == kNullOp) return;
-    CHECK_EQ(req[fullc::kOut], kWriteTo);
-    size_t expected = param_.no_bias ? 2 : 3;
-    CHECK_EQ(in_data.size(), expected);
-    CHECK_EQ(out_data.size(), 1U);
-    // TODO(bing): check the BLAS Handle, be careful
-    // maybe need blas handle from context
-    // TODO(bing): judge shape to remove flatten op
-    Stream<xpu> *s = ctx.get_stream<xpu>();
+void FCForward(const OpContext &ctx, const FullyConnectedParam &param,
+               const std::vector<TBlob> &in_data, const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &out_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  if (req[fullc::kOut] == kNullOp) return;
+  CHECK_EQ(req[fullc::kOut], kWriteTo);
+  // TODO(bing): check the BLAS Handle, be careful
+  // maybe need blas handle from context
+  // TODO(bing): judge shape to remove flatten op
+  Stream<xpu> *s = ctx.get_stream<xpu>();
 #if defined(__CUDACC__)
-    CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
-        << "Must init CuBLAS handle in stream";
+  CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
+      << "Must init CuBLAS handle in stream";
 #endif  // __CUDACC__
-    const TShape& ishape = in_data[fullc::kData].shape_;
-    const TShape& oshape = out_data[fullc::kOut].shape_;
-
-    Tensor<xpu, 2, DType> wmat = in_data[fullc::kWeight].get<xpu, 2, DType>(s);
-    Tensor<xpu, 2, DType> data, out;
-    if (!param_.flatten) {
-      data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
-          Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
-      out = out_data[fullc::kOut].get_with_shape<xpu, 2, DType>(
-          Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s);
-    } else {
-      data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
-          Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
-      out = out_data[fullc::kOut].get_with_shape<xpu, 2, DType>(
-          Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s);
-    }
-
-    // Legacy approach shown here for comparison:
-    //   out = dot(data, wmat.T());
-    linalg_gemm(data, wmat, out, false, true, s);
-    if (!param_.no_bias) {
-      Tensor<xpu, 1, DType> bias = in_data[fullc::kBias].get<xpu, 1, DType>(s);
-      out += repmat(bias, data.size(0));
-    }
+  const TShape& ishape = in_data[fullc::kData].shape_;
+  const TShape& oshape = out_data[fullc::kOut].shape_;
+
+  Tensor<xpu, 2, DType> wmat = in_data[fullc::kWeight].get<xpu, 2, DType>(s);
+  Tensor<xpu, 2, DType> data, out;
+  if (!param.flatten) {
+    data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
+        Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
+    out = out_data[fullc::kOut].get_with_shape<xpu, 2, DType>(
+        Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s);
+  } else {
+    data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
+        Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
+    out = out_data[fullc::kOut].get_with_shape<xpu, 2, DType>(
+        Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s);
   }
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    size_t expected = param_.no_bias ? 2 : 3;
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
-    CHECK_EQ(req.size(), expected);
-    // TODO(bing): check the BLAS Handle, be careful
-    //  maybe need blas handle from context
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    const TShape& ishape = in_data[fullc::kData].shape_;
-    const TShape& oshape = out_grad[fullc::kOut].shape_;
+  // Legacy approach shown here for comparison:
+  //   out = dot(data, wmat.T());
+  linalg_gemm(data, wmat, out, false, true, s);
+  if (!param.no_bias) {
+    Tensor<xpu, 1, DType> bias = in_data[fullc::kBias].get<xpu, 1, DType>(s);
+    out += repmat(bias, data.size(0));
+  }
+}
 
-    Tensor<xpu, 2, DType> wmat = in_data[fullc::kWeight].get<xpu, 2, DType>(s);
-    Tensor<xpu, 2, DType> data, grad, gdata;
-    if (!param_.flatten) {
-      data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
-          Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
-      grad = out_grad[fullc::kOut].get_with_shape<xpu, 2, DType>(
-          Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s);
-      gdata = in_grad[fullc::kData].get_with_shape<xpu, 2, DType>(
-          Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
-    } else {
-      data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
-          Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
-      grad = out_grad[fullc::kOut].get_with_shape<xpu, 2, DType>(
-          Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s);
-      gdata = in_grad[fullc::kData].get_with_shape<xpu, 2, DType>(
-          Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
-    }
+template<typename xpu, typename DType>
+void FCBackward(const OpContext &ctx, const FullyConnectedParam &param,
+                const std::vector<TBlob> &out_grad, const std::vector<TBlob> &in_data,
+                const std::vector<OpReqType> &req, const std::vector<TBlob> &in_grad) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  // TODO(bing): check the BLAS Handle, be careful
+  //  maybe need blas handle from context
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const TShape& ishape = in_data[fullc::kData].shape_;
+  const TShape& oshape = out_grad[fullc::kOut].shape_;
+
+  Tensor<xpu, 2, DType> wmat = in_data[fullc::kWeight].get<xpu, 2, DType>(s);
+  Tensor<xpu, 2, DType> data, grad, gdata;
+  if (!param.flatten) {
+    data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
+        Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
+    grad = out_grad[fullc::kOut].get_with_shape<xpu, 2, DType>(
+        Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s);
+    gdata = in_grad[fullc::kData].get_with_shape<xpu, 2, DType>(
+        Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s);
+  } else {
+    data = in_data[fullc::kData].get_with_shape<xpu, 2, DType>(
+        Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
+    grad = out_grad[fullc::kOut].get_with_shape<xpu, 2, DType>(
+        Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s);
+    gdata = in_grad[fullc::kData].get_with_shape<xpu, 2, DType>(
+        Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s);
+  }
 
 #if defined(__CUDACC__)
-    CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
-        << "Must init CuBLAS handle in stream";
+  CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
+      << "Must init CuBLAS handle in stream";
 #endif
-    //  backprop
-    CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace";
-    // gradient of weight
-    Tensor<xpu, 2, DType> gwmat = in_grad[fullc::kWeight].get<xpu, 2, DType>(s);
-    // Legacy approach shown here for comparison:
-    //   out = Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data));
-    linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]);
-    // gradient of bias
-    if (!param_.no_bias) {
-      Tensor<xpu, 1, DType> gbias = in_grad[fullc::kBias].get<xpu, 1, DType>(s);
-      Assign(gbias, req[fullc::kBias], sum_rows(grad));
-    }
-    // gradient of data
-    // Legacy approach shown here for comparison:
-    //   Assign(gdata, req[fullc::kData], dot(grad, wmat));
-    linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]);
+  //  backprop
+  CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace";
+  // gradient of weight
+  Tensor<xpu, 2, DType> gwmat = in_grad[fullc::kWeight].get<xpu, 2, DType>(s);
+  // Legacy approach shown here for comparison:
+  //   out = Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data));
+  linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]);
+  // gradient of bias
+  if (!param.no_bias) {
+    Tensor<xpu, 1, DType> gbias = in_grad[fullc::kBias].get<xpu, 1, DType>(s);
+    Assign(gbias, req[fullc::kBias], sum_rows(grad));
   }
+  // gradient of data
+  // Legacy approach shown here for comparison:
+  //   Assign(gdata, req[fullc::kData], dot(grad, wmat));
+  linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]);
+}
 
- private:
-  FullyConnectedParam param_;
-};  // class FullyConnectedOp
-
-// Decalre Factory function, used for dispatch specialization
 template<typename xpu>
-Operator* CreateOp(FullyConnectedParam param, int dtype,
-                   std::vector<TShape> *in_shape,
-                   std::vector<TShape> *out_shape,
-                   Context ctx);
-
-#if DMLC_USE_CXX11
-class FullyConnectedProp : public OperatorProperty {
- public:
-  std::vector<std::string> ListArguments() const override {
-    if (!param_.no_bias) {
-      return {"data", "weight", "bias"};
-    } else {
-      return {"data", "weight"};
-    }
-  }
-
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    if (!param_.no_bias) {
-      CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
-    } else {
-      CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
-    }
-    CHECK_EQ(out_shape->size(), 1U);
-    TShape dshape = (*in_shape)[fullc::kData];
-    TShape oshape = (*out_shape)[0];
-    // require data to be known
-    if (dshape.ndim() ==  0) return false;
-
-    index_t num_input;
-    if (!param_.flatten) {
-      num_input = dshape[dshape.ndim()-1];
-    } else {
-      num_input = dshape.ProdShape(1, dshape.ndim());
-    }
-    SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param_.num_hidden, num_input));
-    if (!param_.no_bias) {
-      SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param_.num_hidden));
-    }
-
-    if (!param_.flatten) {
-      TShape result_shape(dshape);
-      result_shape[dshape.ndim()-1] = param_.num_hidden;
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape);
-    } else {
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param_.num_hidden));
-    }
-    if (oshape.ndim() != 0) {
-      dshape[0] = oshape[0];
-      SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape);
-    }
-    return true;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_GE(in_type->size(), 1U);
-    nnvm::NodeAttrs attrs;
-    attrs.name = "FullyConnected";
-    return ElemwiseAttr<int, type_is_none, type_assign, true, type_string>(
-      attrs, in_type, out_type, -1);
-  }
-
-  OperatorProperty* Copy() const override {
-    FullyConnectedProp* fc_sym = new FullyConnectedProp();
-    fc_sym->param_ = this->param_;
-    return fc_sym;
-  }
-
-  std::string TypeString() const override {
-    return "FullyConnected";
-  }
-
-  // decalre dependency and inplace optimization options
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[fullc::kOut], in_data[fullc::kData], in_data[fullc::kWeight]};
-  }
-
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-    return {{in_data[fullc::kData], in_grad[fullc::kData]}};
+void FullyConnectedCompute(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  uint32_t in_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(inputs.size(), in_expected);
+  CHECK_EQ(outputs.size(), 1U);
+  int dtype = inputs[0].type_flag_;
+
+  switch (dtype) {
+  case mshadow::kFloat32:
+    FCForward<xpu, float>(ctx, param, inputs, req, outputs);
+    break;
+  case mshadow::kFloat64:
+    FCForward<xpu, double>(ctx, param, inputs, req, outputs);
+    break;
+  case mshadow::kFloat16:
+    LOG(FATAL) << "float16 fully connected layer is currently"
+                  "only supported by CuDNN version.";
+    break;
+  default:
+    LOG(FATAL) << "Unsupported type " << dtype;
   }
+}
 
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return NULL;
+template<typename xpu>
+void FullyConnectedGradCompute(const nnvm::NodeAttrs& attrs,
+                               const OpContext& ctx,
+                               const std::vector<TBlob>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<TBlob>& outputs) {
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  uint32_t out_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(inputs.size(), 3U);
+  CHECK_EQ(outputs.size(), out_expected);
+  CHECK_EQ(req.size(), out_expected);
+
+  std::vector<TBlob> out_grad{inputs[0]};
+  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
+  int dtype = inputs[0].type_flag_;
+
+  switch (dtype) {
+  case mshadow::kFloat32:
+    FCBackward<xpu, float>(ctx, param, out_grad, in_data, req, outputs);
+    break;
+  case mshadow::kFloat64:
+    FCBackward<xpu, double>(ctx, param, out_grad, in_data, req, outputs);
+    break;
+  case mshadow::kFloat16:
+    LOG(FATAL) << "float16 fully connected layer is currently"
+                  "only supported by CuDNN version.";
+    break;
+  default:
+    LOG(FATAL) << "Unsupported type " << dtype;
   }
+}
 
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
-
- private:
-  FullyConnectedParam param_;
-};  // class FullyConnectedSymbol
-#endif
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_FULLY_CONNECTED_INL_H_
diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc
index 9a978160297d..abca564c4319 100644
--- a/src/operator/nn/fully_connected.cc
+++ b/src/operator/nn/fully_connected.cc
@@ -23,58 +23,158 @@
  * \brief fully connect operator
 */
 #include "./fully_connected-inl.h"
+#include "./mkldnn/mkldnn_ops-inl.h"
+#include "./mkldnn/mkldnn_base-inl.h"
 #if MXNET_USE_NNPACK == 1
 #include "./nnpack/nnpack_fully_connected-inl.h"
 #endif  // MXNET_USE_NNPACK
 
 namespace mxnet {
 namespace op {
-template<>
-Operator* CreateOp<cpu>(FullyConnectedParam param, int dtype,
-                        std::vector<TShape> *in_shape,
-                        std::vector<TShape> *out_shape,
-                        Context ctx) {
-  Operator *op = NULL;
-#if MXNET_USE_NNPACK == 1
-  const size_t batch_size = (*in_shape)[0][0];
-  // nnp_fully_connected_inference will do optimization for batch-size = 1
-  // nnp_fully_connected_output will do optimization for batch-size > 1
-  switch (dtype) {
-  case mshadow::kFloat32:
-    return new NNPACKFullyConnectedOp<cpu, float>(param);
-  default:
-    break;
+
+static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs,
+                                std::vector<TShape> *in_shape,
+                                std::vector<TShape> *out_shape) {
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  using namespace mshadow;
+  if (!param.no_bias) {
+    CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
+  } else {
+    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
+  }
+  CHECK_EQ(out_shape->size(), 1U);
+  TShape dshape = (*in_shape)[fullc::kData];
+  TShape oshape = (*out_shape)[0];
+  // require data to be known
+  if (dshape.ndim() ==  0) return false;
+
+  index_t num_input;
+  if (!param.flatten) {
+    num_input = dshape[dshape.ndim()-1];
+  } else {
+    num_input = dshape.ProdShape(1, dshape.ndim());
+  }
+  SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param.num_hidden, num_input));
+  if (!param.no_bias) {
+    SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param.num_hidden));
+  }
+
+  if (!param.flatten) {
+    TShape result_shape(dshape);
+    result_shape[dshape.ndim()-1] = param.num_hidden;
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape);
+  } else {
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param.num_hidden));
+  }
+  if (oshape.ndim() != 0) {
+    dshape[0] = oshape[0];
+    SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape);
+  }
+  return true;
+}
+
+void FullyConnectedComputeCPU(const nnvm::NodeAttrs& attrs,
+                              const OpContext &ctx,
+                              const std::vector<NDArray> &inputs,
+                              const std::vector<OpReqType> &req,
+                              const std::vector<NDArray> &outputs) {
+#if MXNET_USE_MKLDNN == 1
+  if (SupportMKLDNN(inputs[0])) {
+    MKLDNNFullyConnectedCompute(attrs, ctx, inputs, req, outputs);
+    return;
+  }
+#endif
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++)
+    in_blobs[i] = inputs[i].data();
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++)
+    out_blobs[i] = outputs[i].data();
+  FullyConnectedCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
+}
+
+void FullyConnectedGradComputeCPU(const nnvm::NodeAttrs& attrs,
+                                  const OpContext &ctx,
+                                  const std::vector<NDArray> &inputs,
+                                  const std::vector<OpReqType> &req,
+                                  const std::vector<NDArray> &outputs) {
+#if MXNET_USE_MKLDNN == 1
+  if (SupportMKLDNN(inputs[0])) {
+    MKLDNNFullyConnectedGradCompute(attrs, ctx, inputs, req, outputs);
+    return;
   }
 #endif
-  switch (dtype) {
-  case mshadow::kFloat32:
-    op = new FullyConnectedOp<cpu, float>(param);
-    break;
-  case mshadow::kFloat64:
-    op = new FullyConnectedOp<cpu, double>(param);
-    break;
-  case mshadow::kFloat16:
-    LOG(FATAL) << "float16 fully connected layer is currently"
-                  "only supported by CuDNN version.";
-    break;
-  default:
-    LOG(FATAL) << "Unsupported type " << dtype;
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++)
+    in_blobs[i] = inputs[i].data();
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++)
+    out_blobs[i] = outputs[i].data();
+  FullyConnectedGradCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
+}
+
+static bool FullyConnectedType(const nnvm::NodeAttrs& attrs,
+                               std::vector<int> *in_type, std::vector<int> *out_type) {
+  CHECK_GE(in_type->size(), 1U);
+  return ElemwiseAttr<int, type_is_none, type_assign, true, type_string>(
+      attrs, in_type, out_type, -1);
+}
+
+struct FullyConnectedGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
+    heads.push_back(n->inputs[fullc::kData]);
+    heads.push_back(n->inputs[fullc::kWeight]);
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
   }
+};
+
+inline static bool FCStorageType(const nnvm::NodeAttrs& attrs,
+                                 const int dev_mask,
+                                 DispatchMode* dispatch_mode,
+                                 std::vector<int> *in_attrs,
+                                 std::vector<int> *out_attrs) {
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  uint32_t in_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(in_attrs->size(), in_expected);
+  CHECK_EQ(out_attrs->size(), 1);
 
-  return op;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask)
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    *dispatch_mode = DispatchMode::kFCompute;
+  (*out_attrs)[0] = kDefaultStorage;
+  return true;
 }
 
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *FullyConnectedProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                     std::vector<int> *in_type) const {
-  std::vector<TShape> out_shape(1, TShape()), aux_shape;
-  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx);
+inline static bool BackwardFCStorageType(const nnvm::NodeAttrs& attrs,
+                                         const int dev_mask,
+                                         DispatchMode* dispatch_mode,
+                                         std::vector<int> *in_attrs,
+                                         std::vector<int> *out_attrs) {
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  uint32_t out_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(in_attrs->size(), 3U);
+  CHECK_EQ(out_attrs->size(), out_expected);
+
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask)
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    *dispatch_mode = DispatchMode::kFCompute;
+  for (size_t i = 0; i < out_attrs->size(); i++)
+    (*out_attrs)[i] = kDefaultStorage;
+  return true;
 }
 
 DMLC_REGISTER_PARAMETER(FullyConnectedParam);
 
-MXNET_REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedProp)
+NNVM_REGISTER_OP(FullyConnected)
 .describe(R"code(Applies a linear transformation: :math:`Y = XW^T + b`.
 
 If ``flatten`` is set to be true, then the shapes are:
@@ -96,9 +196,55 @@ The learnable parameters include both ``weight`` and ``bias``.
 If ``no_bias`` is set to be true, then the ``bias`` term is ignored.
 
 )code" ADD_FILELINE)
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const FullyConnectedParam& params = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  return params.no_bias ? 2 : 3;
+})
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<FullyConnectedParam>)
+.set_attr<FInferStorageType>("FInferStorageType", FCStorageType)
+.set_attr<nnvm::FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) {
+  const FullyConnectedParam& params = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  if (!params.no_bias) {
+    return std::vector<std::string>{"data", "weight", "bias"};
+  } else {
+    return std::vector<std::string>{"data", "weight"};
+  }
+})
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
+.set_attr<nnvm::FInferShape>("FInferShape", FullyConnectedShape)
+.set_attr<nnvm::FInferType>("FInferType", FullyConnectedType)
+.set_attr<FCompute>("FCompute<cpu>", FullyConnectedCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", FullyConnectedComputeCPU)
+.set_attr<nnvm::FGradient>("FGradient", FullyConnectedGrad{"_backward_FullyConnected"})
 .add_argument("data", "NDArray-or-Symbol", "Input data.")
 .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.")
 .add_argument("bias", "NDArray-or-Symbol", "Bias parameter.")
 .add_arguments(FullyConnectedParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_backward_FullyConnected)
+.set_num_inputs(3)
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const FullyConnectedParam& params = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  return params.no_bias ? 2 : 3;
+})
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
+  return std::vector<std::pair<int, int> >{{1, 0}};
+})
+.set_attr<FInferStorageType>("FInferStorageType", BackwardFCStorageType)
+.set_attr_parser(ParamParser<FullyConnectedParam>)
+.set_attr<FCompute>("FCompute<cpu>", FullyConnectedGradCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", FullyConnectedGradComputeCPU);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/fully_connected.cu b/src/operator/nn/fully_connected.cu
index 279a378e2ad4..c89d37767c4a 100644
--- a/src/operator/nn/fully_connected.cu
+++ b/src/operator/nn/fully_connected.cu
@@ -25,16 +25,50 @@
 #include "./fully_connected-inl.h"
 namespace mxnet {
 namespace op {
+
 template<>
-Operator* CreateOp<gpu>(FullyConnectedParam param, int dtype,
-                        std::vector<TShape> *in_shape,
-                        std::vector<TShape> *out_shape,
-                        Context ctx) {
-  Operator *op = NULL;
+void FullyConnectedCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                                const OpContext& ctx,
+                                const std::vector<TBlob>& inputs,
+                                const std::vector<OpReqType>& req,
+                                const std::vector<TBlob>& outputs) {
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  uint32_t in_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(inputs.size(), in_expected);
+  CHECK_EQ(outputs.size(), 1U);
+  int dtype = inputs[0].type_flag_;
+
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new FullyConnectedOp<gpu, DType>(param);
-  })
-  return op;
+    FCForward<gpu, DType>(ctx, param, inputs, req, outputs);
+  });
 }
+
+template<>
+void FullyConnectedGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                                    const OpContext& ctx,
+                                    const std::vector<TBlob>& inputs,
+                                    const std::vector<OpReqType>& req,
+                                    const std::vector<TBlob>& outputs) {
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  uint32_t out_expected = param.no_bias ? 2 : 3;
+  CHECK_EQ(inputs.size(), 3U);
+  CHECK_EQ(outputs.size(), out_expected);
+  CHECK_EQ(req.size(), out_expected);
+
+  std::vector<TBlob> out_grad{inputs[0]};
+  std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
+  int dtype = inputs[0].type_flag_;
+
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    FCBackward<gpu, DType>(ctx, param, out_grad, in_data, req, outputs);
+  });
+}
+
+NNVM_REGISTER_OP(FullyConnected)
+.set_attr<FCompute>("FCompute<gpu>", FullyConnectedCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_FullyConnected)
+.set_attr<FCompute>("FCompute<gpu>", FullyConnectedGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/lrn-inl.h b/src/operator/nn/lrn-inl.h
new file mode 100644
index 000000000000..fdae1eca0aef
--- /dev/null
+++ b/src/operator/nn/lrn-inl.h
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file lrn-inl.h
+ * \brief
+ * \author Bing Xu
+*/
+#ifndef MXNET_OPERATOR_NN_LRN_INL_H_
+#define MXNET_OPERATOR_NN_LRN_INL_H_
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../operator_common.h"
+#include "../mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+
+namespace lrn_enum {
+enum LRNInputs {kData};
+enum LRNOutputs {kOut, kTmpNorm};
+}  // namespace lrn_enum
+
+struct LRNParam : public dmlc::Parameter<LRNParam> {
+  float alpha;
+  float beta;
+  float knorm;
+  uint32_t nsize;
+  DMLC_DECLARE_PARAMETER(LRNParam) {
+    DMLC_DECLARE_FIELD(alpha).set_default(1e-4f)
+    .describe("The variance scaling parameter :math:`\alpha` in the LRN expression.");
+    DMLC_DECLARE_FIELD(beta).set_default(0.75f)
+    .describe("The power parameter :math:`\beta` in the LRN expression.");
+    DMLC_DECLARE_FIELD(knorm).set_default(2.0f)
+    .describe("The parameter :math:`k` in the LRN expression.");
+    DMLC_DECLARE_FIELD(nsize)
+    .describe("normalization window width in elements.");
+  }
+};  // struct LRNParam
+
+template<typename xpu>
+void LRNForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                const std::vector<TBlob> &in_data,
+                const std::vector<OpReqType> &req,
+                const std::vector<TBlob> &out_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  const LRNParam& param_ = nnvm::get<LRNParam>(attrs.parsed);
+  // TODO(xxx): Test with gradient chceker
+  CHECK_EQ(in_data.size(), 1U);
+  CHECK_EQ(out_data.size(), 2U);
+  // CHECK_EQ(req.size(), 2);
+  CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size";
+  const real_t salpha = param_.alpha / param_.nsize;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  Tensor<xpu, 4> data = in_data[lrn_enum::kData].get<xpu, 4, real_t>(s);
+  Tensor<xpu, 4> out = out_data[lrn_enum::kOut].get<xpu, 4, real_t>(s);
+  Tensor<xpu, 4> tmp_norm = out_data[lrn_enum::kTmpNorm].get<xpu, 4, real_t>(s);
+  tmp_norm = chpool<red::sum>(F<mshadow_op::square>(data) , param_.nsize) * salpha + param_.knorm;
+  Assign(out, req[lrn_enum::kOut], data *  F<mshadow_op::power>(tmp_norm, -param_.beta));
+}
+
+template<typename xpu>
+void LRNBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                 const TBlob &out_grad, const TBlob &in_data,
+                 const TBlob &out_norm, const OpReqType &req,
+                 const TBlob &in_grad) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  const LRNParam& param_ = nnvm::get<LRNParam>(attrs.parsed);
+  const real_t salpha = param_.alpha / param_.nsize;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  Tensor<xpu, 4> grad = out_grad.get<xpu, 4, real_t>(s);
+  Tensor<xpu, 4> tmp_norm = out_norm.get<xpu, 4, real_t>(s);
+  Tensor<xpu, 4> data = in_data.get<xpu, 4, real_t>(s);
+  Tensor<xpu, 4> grad_in = in_grad.get<xpu, 4, real_t>(s);
+  grad_in = grad * F<mshadow_op::power>(tmp_norm, -param_.beta);
+  grad_in += (- 2.0f * param_.beta * salpha) *
+      chpool<red::sum>(grad * data *
+                       F<mshadow_op::power>(tmp_norm, -param_.beta - 1.0f),
+                       param_.nsize)  * data;
+}
+
+template<typename xpu>
+void LRNCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
+                const std::vector<TBlob>& inputs,
+                const std::vector<OpReqType>& req,
+                const std::vector<TBlob>& outputs) {
+  LRNForward<xpu>(attrs, ctx, inputs, req, outputs);
+}
+
+template<typename xpu>
+void LRNGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
+                    const std::vector<TBlob>& inputs,
+                    const std::vector<OpReqType>& req,
+                    const std::vector<TBlob>& outputs) {
+  LRNBackward<xpu>(attrs, ctx, inputs[0],  // out_grad
+                   inputs[1],              // in_data
+                   inputs[2],              // out_norm
+                   req[lrn_enum::kData], outputs[lrn_enum::kData]);
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_NN_LRN_INL_H_
diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc
new file mode 100644
index 000000000000..330a4550f04f
--- /dev/null
+++ b/src/operator/nn/lrn.cc
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file lrn.cc
+ * \brief
+ * \author Bing Xu
+*/
+
+#include "./lrn-inl.h"
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn/mkldnn_lrn-inl.h"
+#endif
+
+namespace mxnet {
+namespace op {
+
+static bool LRNShape(const nnvm::NodeAttrs& attrs,
+                     std::vector<TShape> *in_shape,
+                     std::vector<TShape> *out_shape) {
+  using namespace mshadow;
+  CHECK_EQ(in_shape->size(), 1U) << "Input:[data]";
+  const TShape &dshape = in_shape->at(0);
+  if (dshape.ndim() == 0) return false;
+  out_shape->clear();
+  out_shape->push_back(dshape);
+  out_shape->push_back(dshape);
+  return true;
+}
+
+static inline std::vector<std::string> ListArguments() {
+  return {"data"};
+}
+
+static bool LRNType(const nnvm::NodeAttrs& attrs,
+                    std::vector<int> *in_type,
+                    std::vector<int> *out_type) {
+  CHECK_GE(in_type->size(), 1U);
+  int dtype = (*in_type)[0];
+  CHECK_NE(dtype, -1) << "First input must have specified type";
+  for (index_t i = 0; i < in_type->size(); ++i) {
+    if ((*in_type)[i] == -1) {
+      (*in_type)[i] = dtype;
+    } else {
+      UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
+    }
+  }
+  int n_out = 2;
+  out_type->clear();
+  for (int i = 0; i < n_out; ++i ) out_type->push_back(dtype);
+  return true;
+}
+
+struct LRNGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                const std::vector<nnvm::NodeEntry>& ograds) const {
+    std::vector<nnvm::NodeEntry> heads;
+    heads.push_back(ograds[0]);  // out_grad
+    heads.push_back(n->inputs[lrn_enum::kData]);
+    heads.emplace_back(nnvm::NodeEntry{n, lrn_enum::kTmpNorm, 0});
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
+
+inline static bool LRNForwardInferStorageType(const nnvm::NodeAttrs& attrs,
+                                              const int dev_mask,
+                                              DispatchMode* dispatch_mode,
+                                              std::vector<int> *in_attrs,
+                                              std::vector<int> *out_attrs) {
+  *dispatch_mode = DispatchMode::kFCompute;
+#if MXNET_USE_MKLDNN == 1
+  CHECK(!in_attrs->empty());
+  if (dev_mask == mshadow::cpu::kDevMask) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  for (size_t i = 0; i < out_attrs->size(); i++)
+    (*out_attrs)[i] = kDefaultStorage;
+  return true;
+}
+
+inline static bool LRNBackwardInferStorageType(const nnvm::NodeAttrs& attrs,
+                                               const int dev_mask,
+                                               DispatchMode* dispatch_mode,
+                                               std::vector<int> *in_attrs,
+                                               std::vector<int> *out_attrs) {
+  *dispatch_mode = DispatchMode::kFCompute;
+#if MXNET_USE_MKLDNN == 1
+  CHECK(!in_attrs->empty());
+  if (dev_mask == mshadow::cpu::kDevMask) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  for (size_t i = 0; i < out_attrs->size(); i++)
+    (*out_attrs)[i] = kDefaultStorage;
+  return true;
+}
+
+void LRNCompute_CPU(const nnvm::NodeAttrs &attrs,
+                    const OpContext &ctx,
+                    const std::vector<NDArray> &inputs,
+                    const std::vector<OpReqType> &req,
+                    const std::vector<NDArray> &outputs) {
+#if MXNET_USE_MKLDNN == 1
+  const LRNParam &param = nnvm::get<LRNParam>(attrs.parsed);
+  if (SupportMKLDNN(inputs[0])) {
+    MKLDNNLRN_Forward(ctx, param, inputs[0], req[0], outputs[0]);
+    return;
+  }
+#endif
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++)
+    in_blobs[i] = inputs[i].data();
+
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++)
+    out_blobs[i] = outputs[i].data();
+  LRNCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
+}
+
+void LRNGradCompute_CPU(const nnvm::NodeAttrs &attrs,
+                        const OpContext &ctx,
+                        const std::vector<NDArray> &inputs,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<NDArray> &outputs) {
+#if MXNET_USE_MKLDNN == 1
+  const LRNParam &param = nnvm::get<LRNParam>(attrs.parsed);
+  const NDArray &out_grad = inputs[0];
+  const NDArray &in_data = inputs[1];
+  const NDArray &in_grad = outputs[0];
+
+  if (SupportMKLDNN(inputs[0])) {
+    MKLDNNLRN_Backward(ctx, param, out_grad, in_data,
+                           req[0], in_grad);
+    return;
+  }
+#endif
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++) {
+    in_blobs[i] = inputs[i].data();
+  }
+
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++)
+    out_blobs[i] = outputs[i].data();
+
+  LRNGradCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
+}
+
+DMLC_REGISTER_PARAMETER(LRNParam);
+
+NNVM_REGISTER_OP(LRN)
+.describe(R"code(Applies local response normalization to the input.
+
+The local response normalization layer performs "lateral inhibition" by normalizing
+over local input regions.
+
+If :math:`a_{x,y}^{i}` is the activity of a neuron computed by applying kernel :math:`i` at position
+:math:`(x, y)` and then applying the ReLU nonlinearity, the response-normalized
+activity :math:`b_{x,y}^{i}` is given by the expression:
+
+.. math::
+   b_{x,y}^{i} = \frac{a_{x,y}^{i}}{\Bigg({k + \alpha \sum_{j=max(0, i-\frac{n}{2})}^{min(N-1, i+\frac{n}{2})} (a_{x,y}^{j})^{2}}\Bigg)^{\beta}}
+
+where the sum runs over :math:`n` "adjacent" kernel maps at the same spatial position, and :math:`N` is the total
+number of kernels in the layer.
+
+)code" ADD_FILELINE)
+.set_num_inputs(1)
+.set_num_outputs(2)
+.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
+                                    [](const NodeAttrs& attrs) { return 1; })
+.set_attr_parser(ParamParser<LRNParam>)
+.set_attr<nnvm::FInferShape>("FInferShape", LRNShape)
+.set_attr<nnvm::FInferType>("FInferType", LRNType)
+.set_attr<FInferStorageType>("FInferStorageType", LRNForwardInferStorageType)
+.set_attr<FCompute>("FCompute<cpu>", LRNCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", LRNCompute_CPU)
+.set_attr<nnvm::FGradient>("FGradient", LRNGrad{"_backward_LRN"})
+.add_argument("data", "NDArray-or-Symbol", "Input data to LRN")
+.add_arguments(LRNParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_backward_LRN)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<LRNParam>)
+.set_attr<FInferStorageType>("FInferStorageType", LRNBackwardInferStorageType)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FCompute>("FCompute<cpu>", LRNGradCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", LRNGradCompute_CPU);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/lrn.cu b/src/operator/nn/lrn.cu
similarity index 64%
rename from src/operator/lrn.cu
rename to src/operator/nn/lrn.cu
index ba872f1d26d0..4c31ca96025c 100644
--- a/src/operator/lrn.cu
+++ b/src/operator/nn/lrn.cu
@@ -25,29 +25,15 @@
 */
 
 #include "./lrn-inl.h"
-#if MXNET_USE_CUDNN == 1
-#include "./cudnn_lrn-inl.h"
-#endif
 
 namespace mxnet {
 namespace op {
-template<>
-Operator* CreateOp<gpu>(LRNParam param, int dtype) {
-  Operator *op = NULL;
-#if MXNET_USE_CUDNN == 1
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new CuDNNLocalResponseNormOp<DType>(param);
-  })
-#else
-#if CUDA_VERSION == 7000
-  LOG(FATAL) << "Due to old CUDA compiler bug, LRN is disabled."
-             << "Please upgrade CUDA to 7.5+ or use CUDNN";
-#else
-  op = new LocalResponseNormOp<gpu>(param);
-#endif  // CUDA_VERSION
-#endif  // MXNET_USE_CUDNN
-  return op;
-}
+
+NNVM_REGISTER_OP(LRN)
+.set_attr<FCompute>("FCompute<gpu>", LRNCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_LRN)
+.set_attr<FCompute>("FCompute<gpu>", LRNGradCompute<gpu>);
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc
new file mode 100644
index 000000000000..7783ddd5fa4e
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_act.cc
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_act.cc
+ * \brief
+ * \author Da Zheng
+*/
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../../operator_common.h"
+#include "../activation-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+
+#include <mkldnn.hpp>
+
+namespace mxnet {
+namespace op {
+
+bool SupportMKLDNNAct(const ActivationParam& param) {
+  // We only enable ReLU for now. It seems other activations have some precision
+  // problems.
+  return param.act_type == activation::kReLU;
+#if 0
+      || param.act_type == activation::kSigmoid
+      || param.act_type == activation::kSoftReLU;
+#endif
+}
+
+static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) {
+  switch (param.act_type) {
+    case activation::kReLU:
+      return mkldnn::algorithm::eltwise_relu;
+    case activation::kSigmoid:
+      return mkldnn::algorithm::eltwise_logistic;
+    case activation::kTanh:
+      return mkldnn::algorithm::eltwise_tanh;
+    case activation::kSoftReLU:
+      return mkldnn::algorithm::eltwise_soft_relu;
+    default:
+      LOG(FATAL) << "unknown activation type";
+      return mkldnn::algorithm::eltwise_relu;
+  }
+}
+
+typedef std::shared_ptr<mkldnn::eltwise_forward::primitive_desc> mkldnn_act_pdesc_ptr;
+
+static mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(
+    const ActivationParam& param, bool is_train,
+    const mkldnn::memory &input_mem, int dtype) {
+  mkldnn::memory::primitive_desc data_mpd = input_mem.get_primitive_desc();
+  mkldnn::memory::desc data_md = data_mpd.desc();
+  auto cpu_engine = data_mpd.get_engine();
+
+  auto alg = GetMKLDNNActAlgo(param);
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    DType alpha = 0;
+    mkldnn::eltwise_forward::desc desc = is_train
+        ? mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_training,
+                                        alg, data_md, alpha)
+        : mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_scoring,
+                                        alg, data_md, alpha);
+    return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine);
+  });
+  LOG(INFO) << "Unsupported data type for MKLDNN activation";
+  mkldnn::eltwise_forward::desc desc = mkldnn::eltwise_forward::desc(
+      mkldnn::prop_kind::forward_training, alg, data_md, 0.0);
+  return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine);
+}
+
+typedef MKLDNNParamOpSign<ActivationParam> MKLDNNActSignature;
+
+class MKLDNNActForward {
+  std::shared_ptr<mkldnn::eltwise_forward> fwd;
+  std::shared_ptr<mkldnn::memory> data;
+  std::shared_ptr<mkldnn::memory> out;
+
+ public:
+  const mkldnn::eltwise_forward::primitive_desc fwd_pd;
+
+  MKLDNNActForward(const ActivationParam& param, bool is_train,
+                   const NDArray &data, const mkldnn::memory &mem): fwd_pd(
+                       GetActFwdDescImpl(param, is_train, mem, data.dtype())) {
+  }
+
+  void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &output) {
+    if (this->data == nullptr)
+      this->data = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+              data.get_primitive_desc(), data.get_data_handle()));
+    else
+      this->data->set_data_handle(data.get_data_handle());
+
+    CHECK(fwd_pd.dst_primitive_desc() == output.get_primitive_desc());
+    if (this->out == nullptr)
+      this->out = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+              fwd_pd.dst_primitive_desc(), output.get_data_handle()));
+    else
+      this->out->set_data_handle(output.get_data_handle());
+
+    if (this->fwd == nullptr) {
+      this->fwd = std::shared_ptr<mkldnn::eltwise_forward>(
+          new mkldnn::eltwise_forward(fwd_pd, mkldnn::primitive::at(*this->data),
+                                      *this->out));
+    }
+  }
+
+  const mkldnn::eltwise_forward &GetFwd() const {
+    return *fwd;
+  }
+};
+
+static MKLDNNActForward &GetActForward(const ActivationParam& param,
+                                       const OpContext &ctx, const NDArray &in_data,
+                                       const mkldnn::memory &in_mem) {
+  static thread_local std::unordered_map<MKLDNNActSignature, MKLDNNActForward, MKLDNNOpHash> fwds;
+  MKLDNNActSignature key(param);
+  key.AddSign(ctx.is_train);
+  key.AddSign(param.act_type);
+  key.AddSign(in_mem);
+
+  auto it = fwds.find(key);
+  if (it == fwds.end()) {
+    MKLDNNActForward fwd(param, ctx.is_train, in_data, in_mem);
+    auto ins_ret = fwds.insert(std::pair<MKLDNNActSignature, MKLDNNActForward>(
+            key, fwd));
+    CHECK(ins_ret.second);
+    it = ins_ret.first;
+  }
+  return it->second;
+}
+
+void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                             const NDArray &in_data, const OpReqType &req,
+                             const NDArray &out_data) {
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  auto input_mem = in_data.GetMKLDNNData();
+  MKLDNNActForward &fwd = GetActForward(param, ctx, in_data, *input_mem);
+  auto out_mem = const_cast<NDArray &>(out_data).CreateMKLDNNData(
+      fwd.fwd_pd.dst_primitive_desc());
+  fwd.SetNewMem(*input_mem, *out_mem);
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  stream->RegisterPrim(fwd.GetFwd());
+  stream->Submit();
+}
+
+void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                              const NDArray &out_grad, const NDArray &in_data,
+                              const OpReqType &req, const NDArray &in_grad) {
+  if (req == kNullOp) {
+    return;
+  }
+
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
+  TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]);
+  auto diff_dst_memory = out_grad.GetMKLDNNData();
+  auto input_mem = in_data.GetMKLDNNData();
+  // We need to make sure the two inputs to eltwise_backward has the same memory
+  // descriptor. Otherwise, the perf will suffer.
+  if (input_mem->get_primitive_desc() != diff_dst_memory->get_primitive_desc())
+    input_mem = in_data.GetMKLDNNDataReorder(diff_dst_memory->get_primitive_desc());
+  mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc();
+  mkldnn::memory::desc data_md = data_mpd.desc();
+  mkldnn::memory::desc diff_md = diff_dst_memory->get_primitive_desc().desc();
+  auto cpu_engine = data_mpd.get_engine();
+
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  auto alg = GetMKLDNNActAlgo(param);
+  mkldnn_output_t diff_src_memory;
+
+  MSHADOW_REAL_TYPE_SWITCH(in_data.dtype(), DType, {
+    DType alpha = 0;
+    mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training,
+                                          alg, data_md, alpha);
+    mkldnn::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine);
+    mkldnn::eltwise_backward::desc bw_desc(alg, diff_md, data_md, alpha);
+    mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine,
+                                                      fw_pdesc);
+
+    diff_src_memory = CreateMKLDNNMem(in_grad,
+                                      bw_pdesc.diff_src_primitive_desc(), req);
+    stream->RegisterPrim(mkldnn::eltwise_backward(bw_pdesc, *input_mem,
+                                                  *diff_dst_memory,
+                                                  *diff_src_memory.second));
+  });
+  CommitOutput(in_grad, diff_src_memory);
+  stream->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
new file mode 100644
index 000000000000..846f9596eea6
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -0,0 +1,406 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*******************************************************************************
+* Copyright 2016-2017 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*
+* \file mkldnn_base-inl.h
+* \brief
+* \author young.jin.kim@intel.com
+*         ashok.emani@intel.com
+*         deepthi.karkada@intel.com
+*         louis.feng@intel.com
+*         adam.d.straw@intel.com
+*         zhengda1936@gmail.com
+*
+*******************************************************************************/
+
+#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
+#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+#include <iterator>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include <utility>
+#include <algorithm>
+#include <memory>
+#include "mkldnn.hpp"
+#include "mxnet/ndarray.h"
+#include "mxnet/resource.h"
+#include "mxnet/op_attr_types.h"
+using namespace mkldnn;
+namespace mxnet {
+extern bool EnableMkldnnWarnGenerated();
+// =====  CpuEngine =======================================
+// cpu_engine singleton
+class CpuEngine {
+ public:
+  static CpuEngine *Get() {
+    // I's thread-safe in C++11.
+    static thread_local CpuEngine myInstance;
+    return &myInstance;
+  }
+  CpuEngine(CpuEngine const &) = delete;             // Copy construct
+  CpuEngine(CpuEngine &&) = delete;                  // Move construct
+  CpuEngine &operator=(CpuEngine const &) = delete;  // Copy assign
+  CpuEngine &operator=(CpuEngine &&) = delete;       // Move assign
+
+  mkldnn::engine &get_engine() { return _cpu_engine; }
+
+ protected:
+  CpuEngine() : _cpu_engine(mkldnn::engine::cpu, 0) {}
+  ~CpuEngine() {}
+
+ private:
+  mkldnn::engine _cpu_engine;
+};
+
+// type enumerator
+template <typename T>
+struct data_type_enum {};
+
+template <>
+struct data_type_enum<float> {
+  enum { type = mkldnn::memory::data_type::f32 };
+};
+
+template <>
+struct data_type_enum<int32_t> {
+  enum { type = mkldnn::memory::data_type::s32 };
+};
+
+template <>
+struct data_type_enum<int16_t> {
+  enum { type = mkldnn::memory::data_type::s16 };
+};
+
+template <>
+struct data_type_enum<int8_t> {
+  enum { type = mkldnn::memory::data_type::s8 };
+};
+
+template <>
+struct data_type_enum<uint8_t> {
+  enum { type = mkldnn::memory::data_type::u8 };
+};
+
+static inline bool SupportMKLDNNArray(int dtype, const TShape &shape) {
+  int ndim = shape.ndim();
+  bool support = ndim == 1 || ndim == 2 || ndim == 4;
+  support = support && (dtype == mshadow::kFloat32 || dtype == mshadow::kInt32
+                        || dtype == mshadow::kInt8 || dtype == mshadow::kUint8);
+  return support;
+}
+
+static inline bool SupportStorageMKLDNN(int stype) {
+  return stype == kMKLDNNStorage || stype == kDefaultStorage;
+}
+
+static inline bool SupportMKLDNN(int dtype, const TShape &shape) {
+  int ndim = shape.ndim();
+  return dtype == mshadow::kFloat32 && (ndim == 1 || ndim == 2 || ndim == 4);
+}
+
+static inline bool SupportMKLDNN(const NDArray &input) {
+  return SupportMKLDNN(input.dtype(), input.shape())
+      && SupportStorageMKLDNN(input.storage_type());
+}
+
+static inline bool SupportMKLDNNConv(const NDArray &input) {
+  return input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 4;
+}
+
+namespace op {
+struct ActivationParam;
+bool SupportMKLDNNAct(const op::ActivationParam& param);
+}
+
+static int GetTypeSize(int dtype) {
+  MSHADOW_TYPE_SWITCH(dtype, DType, {
+    return sizeof(DType);
+  });
+  return -1;
+}
+
+static inline size_t GetArraySize(const NDArray &arr) {
+  return arr.shape().Size() * GetTypeSize(arr.dtype());
+}
+
+static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) {
+  switch (dtype) {
+    case mshadow::kFloat32:
+      return mkldnn::memory::data_type::f32;
+    case mshadow::kInt32:
+      return mkldnn::memory::data_type::s32;
+    case mshadow::kInt8:
+      return mkldnn::memory::data_type::s8;
+    case mshadow::kUint8:
+      return mkldnn::memory::data_type::u8;
+    default:
+      LOG(FATAL) << "unknown type for MKLDNN";
+      return mkldnn::memory::data_type::data_undef;
+  }
+}
+
+inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int ndim) {
+  mkldnn::memory::dims dims(ndim);
+  for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i];
+  return mkldnn::memory::desc{dims, get_mkldnn_type(arr.dtype()),
+                              mkldnn::memory::format::any};
+}
+
+inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) {
+  return GetMemDesc(arr, arr.shape().ndim());
+}
+
+inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
+                                                 int num_groups) {
+  if (num_groups == 1) {
+    return GetMemDesc(arr);
+  } else {
+    CHECK_EQ(arr.shape().ndim(), 4U);
+    mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups,
+      static_cast<int>(arr.shape()[0] / num_groups),
+      static_cast<int>(arr.shape()[1]),
+      static_cast<int>(arr.shape()[2]),
+      static_cast<int>(arr.shape()[3])};
+    return mkldnn::memory::desc{tz, get_mkldnn_type(arr.dtype()),
+                                mkldnn::memory::format::any};
+  }
+}
+
+typedef std::shared_ptr<mkldnn::memory> mkldnn_mem_ptr;
+typedef std::shared_ptr<const mkldnn::memory> mkldnn_mem_const_ptr;
+
+class TmpMemMgr {
+  // This points to the memory buffer where we can allocate temp memory.
+  char *curr_mem;
+  // The total size of the temp memory.
+  size_t mem_size;
+  // This contains the current available memory size.
+  size_t curr_size;
+  // This estimate the required temp memory size in an operator.
+  size_t est_size;
+  const size_t alignment = 4096;
+
+ public:
+  static TmpMemMgr *Get() {
+    static thread_local TmpMemMgr mgr;
+    return &mgr;
+  }
+
+  TmpMemMgr() {
+    Reset();
+    est_size = 0;
+    mem_size = 0;
+  }
+
+  void Reset() {
+    curr_mem = nullptr;
+    curr_size = 0;
+    // We don't reset est_size and mem_size because est_size contains the
+    // estimated temp memory size from the last run and mem_size contains the
+    // memroy size allocated in the last run.
+  }
+
+  void Init(const Resource &r) {
+    // If the last time, if we estimate that we need more memory, we should the
+    // larger memory size.
+    mem_size = std::max(mem_size, est_size);
+    if (mem_size > 0) {
+      // Let's allocate some extra memory. If we don't use some of them all the time,
+      // the OS won't physically allocate pages for them any way.
+      this->curr_size = mem_size * 2;
+      this->curr_mem = static_cast<char *>(r.get_host_space_internal(this->curr_size));
+    }
+    // reset est_size, so we can start to estimate the temp memory size.
+    this->est_size = 0;
+  }
+
+  mkldnn::memory *Alloc(const mkldnn::memory::primitive_desc &pd);
+};
+
+class MKLDNNStream {
+  std::vector<mkldnn::primitive> net;
+  // Here we hold all memory related to the operators in the stream.
+  std::vector<std::shared_ptr<const mkldnn::memory> > mem_holder;
+
+ public:
+  static MKLDNNStream *Get() {
+    static thread_local MKLDNNStream stream;
+    return &stream;
+  }
+
+  void RegisterPrim(const mkldnn::primitive &prim) { net.push_back(prim); }
+
+  void RegisterMem(std::shared_ptr<const mkldnn::memory> mem) {
+    mem_holder.push_back(mem);
+  }
+
+  void Submit() {
+    if (!net.empty())
+      mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+    net.clear();
+    mem_holder.clear();
+    TmpMemMgr::Get()->Reset();
+  }
+};
+
+class MKLDNNOpSignature {
+  std::vector<int> eles;
+  uint64_t hash;
+
+ public:
+  MKLDNNOpSignature() {
+    hash = 0;
+  }
+
+  /*
+   * We provide different methods to add signature to an op.
+   * For operations, such as convolutin and fully connected, which determines
+   * the optimal data layout for the op, we only need to use the shape and data
+   * type to sign the op. For other operations, such as activation, which uses
+   * whatever layout in the input array, we have to use the shape, the data type
+   * and the layout to sign the op.
+   */
+
+  void AddSign(const mkldnn::memory &mem) {
+    auto desc = mem.get_primitive_desc().desc();
+    hash = hash * 2 + desc.data.format;
+    eles.push_back(desc.data.format);
+    hash = hash * 2 + desc.data.data_type;
+    eles.push_back(desc.data.data_type);
+    for (int i = 0; i < desc.data.ndims; i++) {
+      hash = hash * 2 + desc.data.dims[i];
+      eles.push_back(desc.data.dims[i]);
+    }
+  }
+
+  void AddSign(const std::vector<NDArray> &arrs) {
+    for (auto &arr : arrs) {
+      AddSign(arr);
+    }
+  }
+
+  void AddSign(const NDArray &arr) {
+    hash = hash * 2 + arr.dtype();
+    eles.push_back(arr.dtype());
+    AddSign(arr.shape());
+  }
+
+  void AddSign(const TShape &shape) {
+    for (size_t i = 0; i < shape.ndim(); i++) {
+      hash = hash * 2 + shape[i];
+      eles.push_back(shape[i]);
+    }
+  }
+
+  void AddSign(int val) {
+    hash = hash * 2 + val;
+    eles.push_back(val);
+  }
+
+  bool operator==(const MKLDNNOpSignature &sign) const {
+    if (hash != sign.hash)
+      return false;
+    if (eles.size() != sign.eles.size())
+      return false;
+    for (size_t i = 0; i < eles.size(); i++)
+      if (eles[i] != sign.eles[i])
+        return false;
+    return true;
+  }
+
+  uint64_t GetHash() const {
+    return hash;
+  }
+};
+
+struct MKLDNNOpHash {
+  size_t operator()(const MKLDNNOpSignature &sign) const {
+    return sign.GetHash();
+  }
+};
+
+template<typename ParamType>
+class MKLDNNParamOpSign: public MKLDNNOpSignature {
+  const ParamType param;
+
+ public:
+  explicit MKLDNNParamOpSign(const ParamType &_param): param(_param) {
+  }
+
+  bool operator==(const MKLDNNParamOpSign<ParamType> &sign) const {
+    const MKLDNNOpSignature &this_upper = *this;
+    const MKLDNNOpSignature &other_upper = sign;
+    return this_upper == other_upper && param == sign.param;
+  }
+};
+
+enum OutDataOp {
+  Noop,
+  CopyBack,
+  AddBack,
+};
+
+typedef std::pair<OutDataOp, mkldnn::memory *> mkldnn_output_t;
+
+/*
+ * These two functions try to create MKLDNN memory in an NDArray based on `req'.
+ * The difference is that the first function can create MKLDNN memory with
+ * special layouts in an NDArray, while the second one can only create MKLDNN
+ * memory with default layouts.
+ * If these two functions are used, we have to call CommitOutput to write
+ * the output back to the output NDArray.
+ */
+mkldnn_output_t CreateMKLDNNMem(const NDArray &arr,
+                                const mkldnn::memory::primitive_desc &desc,
+                                OpReqType req);
+mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr,
+                                       const mkldnn::memory::primitive_desc &desc,
+                                       OpReqType req);
+/* This function has to be used with one of the functions above. */
+void CommitOutput(const NDArray &arr, const mkldnn_output_t &res);
+
+const mkldnn::memory *GetWeights(const NDArray &arr,
+                                 const mkldnn::memory::primitive_desc &target_pd,
+                                 int num_groups);
+
+mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc);
+mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd,
+                                                mkldnn_memory_format_t format);
+
+
+}  // namespace mxnet
+#endif
+#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
new file mode 100644
index 000000000000..7b5d9cb6706a
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#if MXNET_USE_MKLDNN == 1
+
+#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
+
+namespace mxnet {
+
+mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) {
+  // We need to include the size of the memory used for alignment.
+  this->est_size += pd.get_size() + alignment;
+  void *this_mem = this->curr_mem;
+  void *mem = std::align(alignment, pd.get_size(), this_mem, this->curr_size);
+  if (mem) {
+    // The memory is allocated from the temporary memory space in the
+    // operator. It'll only become invalid after we exit from the operator.
+    mkldnn_mem_ptr ret(new mkldnn::memory(pd, this_mem));
+    MKLDNNStream::Get()->RegisterMem(ret);
+    CHECK_EQ(this_mem, mem);
+    this->curr_size -= pd.get_size();
+    this->curr_mem = static_cast<char *>(this_mem) + pd.get_size();
+    return ret.get();
+  } else {
+    LOG(WARNING) << "Allocate " << pd.get_size()
+        << " bytes with malloc directly";
+    mkldnn_mem_ptr ret(new mkldnn::memory(pd));
+    MKLDNNStream::Get()->RegisterMem(ret);
+    return ret.get();
+  }
+}
+
+mkldnn_output_t CreateMKLDNNMem(const NDArray &arr,
+                                const mkldnn::memory::primitive_desc &desc,
+                                OpReqType req) {
+  if (kAddTo == req) {
+    auto tmp = TmpMemMgr::Get()->Alloc(desc);
+    return mkldnn_output_t(OutDataOp::AddBack, tmp);
+  } else if (kWriteInplace == req) {
+    // MKLDNN ops may not support the case that the input and the output uses
+    // the same memory. Let's use an extra copy to make sure it always works.
+    auto tmp = TmpMemMgr::Get()->Alloc(desc);
+    return mkldnn_output_t(OutDataOp::CopyBack, tmp);
+  } else {
+    mkldnn::memory *mem = const_cast<NDArray &>(arr).CreateMKLDNNData(desc);
+    if (mem == nullptr) {
+      auto tmp = TmpMemMgr::Get()->Alloc(desc);
+      return mkldnn_output_t(OutDataOp::CopyBack, tmp);
+    } else {
+      return mkldnn_output_t(OutDataOp::Noop, mem);
+    }
+  }
+}
+
+mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr,
+                                       const mkldnn::memory::primitive_desc &desc,
+                                       OpReqType req) {
+  if (kAddTo == req) {
+    auto tmp = TmpMemMgr::Get()->Alloc(desc);
+    return mkldnn_output_t(OutDataOp::AddBack, tmp);
+  } else if (kWriteInplace == req) {
+    auto tmp = TmpMemMgr::Get()->Alloc(desc);
+    return mkldnn_output_t(OutDataOp::CopyBack, tmp);
+  } else {
+    auto _desc = desc;
+    auto def_format = GetDefaultFormat(_desc.desc());
+    mkldnn::memory *mem = nullptr;
+    if (def_format == _desc.desc().data.format) {
+      mem = const_cast<NDArray &>(arr).CreateMKLDNNData(desc);
+    }
+    if (mem == nullptr) {
+      auto tmp = TmpMemMgr::Get()->Alloc(desc);
+      return mkldnn_output_t(OutDataOp::CopyBack, tmp);
+    } else {
+      return mkldnn_output_t(OutDataOp::Noop, mem);
+    }
+  }
+}
+
+void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) {
+  if (res.first == CopyBack) {
+    const_cast<NDArray &>(arr).CopyFrom(*res.second);
+  } else if (res.first == AddBack) {
+    auto mem = arr.GetMKLDNNData(res.second->get_primitive_desc());
+    CHECK(mem != nullptr);
+    // We have to allocate new memory for the sum result.
+    auto sum_res = TmpMemMgr::Get()->Alloc(
+        res.second->get_primitive_desc());
+    op::Sum(*res.second, *mem, *sum_res);
+    const_cast<NDArray &>(arr).CopyFrom(*sum_res);
+  }
+}
+
+const mkldnn::memory *GetWeights(const NDArray &arr,
+                                 const mkldnn::memory::primitive_desc &target_pd,
+                                 int num_groups) {
+  const mkldnn::memory *mem = arr.GetMKLDNNData(target_pd);
+  // If the weight array already uses the target layout, simply return it
+  // directly.
+  if (mem)
+    return mem;
+
+  mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype());
+  auto engine = CpuEngine::Get()->get_engine();
+  if (arr.shape().ndim() == 2) {
+    mkldnn::memory::dims tz = mkldnn::memory::dims{
+      static_cast<int>(arr.shape()[0]), static_cast<int>(arr.shape()[1])};
+    mkldnn::memory::desc md =
+        mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi};
+    mkldnn::memory::primitive_desc pd =
+        mkldnn::memory::primitive_desc{md, engine};
+    mem = arr.GetMKLDNNData(pd);
+  } else if (arr.shape().ndim() == 4 && num_groups == 1) {
+    mkldnn::memory::dims tz = mkldnn::memory::dims{
+      static_cast<int>(arr.shape()[0]), static_cast<int>(arr.shape()[1]),
+          static_cast<int>(arr.shape()[2]), static_cast<int>(arr.shape()[3])};
+    mkldnn::memory::desc md =
+        mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw};
+    mkldnn::memory::primitive_desc pd =
+        mkldnn::memory::primitive_desc{md, engine};
+    mem = arr.GetMKLDNNData(pd);
+  } else if (arr.shape().ndim() == 4) {
+    mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups,
+      static_cast<int>(arr.shape()[0] / num_groups),
+      static_cast<int>(arr.shape()[1]),
+      static_cast<int>(arr.shape()[2]),
+      static_cast<int>(arr.shape()[3])};
+    mkldnn::memory::desc md =
+        mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw};
+    mkldnn::memory::primitive_desc pd =
+        mkldnn::memory::primitive_desc{md, engine};
+    mem = arr.GetMKLDNNData(pd);
+  } else {
+    LOG(FATAL) << "The weight array has an unsupported number of dimensions";
+    return nullptr;
+  }
+  if (mem == nullptr)
+    mem = arr.GetMKLDNNDataReorder(target_pd);
+  if (mem->get_primitive_desc() == target_pd) return mem;
+
+  auto ret = TmpMemMgr::Get()->Alloc(target_pd);
+  MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*mem, *ret));
+  return ret;
+}
+
+mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) {
+  if (desc.data.ndims == 1) {
+    return desc.data.format;
+  } else if (desc.data.ndims == 2) {
+    if (desc.data.format == mkldnn_io)
+      return mkldnn_oi;
+    else
+      return desc.data.format;
+  } else if (desc.data.ndims == 4) {
+    switch (desc.data.format) {
+      case mkldnn_nchw:
+      case mkldnn_nhwc:
+      case mkldnn_chwn:
+      case mkldnn_nChw8c:
+      case mkldnn_nChw16c:
+        return mkldnn_nchw;
+      case mkldnn_oihw:
+      case mkldnn_ihwo:
+      case mkldnn_hwio:
+      case mkldnn_OIhw8i8o:
+      case mkldnn_OIhw16i16o:
+      case mkldnn_OIhw8i16o2i:
+      case mkldnn_OIhw8o16i2o:
+      case mkldnn_OIhw8o8i:
+      case mkldnn_OIhw16o16i:
+      case mkldnn_IOhw16o16i:
+      case mkldnn_Oihw8o:
+      case mkldnn_Oihw16o:
+      case mkldnn_Ohwi8o:
+      case mkldnn_Ohwi16o:
+      case mkldnn_OhIw16o4i:
+        return mkldnn_oihw;
+      default:
+        LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format;
+        return mkldnn_format_undef;
+    }
+  } else if (desc.data.ndims == 5) {
+    switch (desc.data.format) {
+      case mkldnn_goihw:
+      case mkldnn_gOIhw8i8o:
+      case mkldnn_gOIhw16i16o:
+      case mkldnn_gOIhw8i16o2i:
+      case mkldnn_gOIhw8o16i2o:
+      case mkldnn_gOIhw8o8i:
+      case mkldnn_gOIhw16o16i:
+      case mkldnn_gIOhw16o16i:
+      case mkldnn_gOihw8o:
+      case mkldnn_gOihw16o:
+      case mkldnn_gOhwi8o:
+      case mkldnn_gOhwi16o:
+      case mkldnn_gOhIw16o4i:
+        return mkldnn_goihw;
+      default:
+        LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format;
+        return mkldnn_format_undef;
+    }
+  } else {
+    LOG(FATAL) << "Unsupported dimensions: " << desc.data.ndims;
+    return mkldnn_format_undef;
+  }
+}
+
+mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd,
+                                                mkldnn_memory_format_t format) {
+  mkldnn::memory::dims dims(pd.desc().data.ndims);
+  for (size_t i = 0; i < dims.size(); i++)
+    dims[i] = pd.desc().data.dims[i];
+  mkldnn::memory::format cpp_format = static_cast<mkldnn::memory::format>(format);
+  mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(
+      pd.desc().data.data_type);
+  mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
+  return mkldnn::memory::primitive_desc(data_md, pd.get_engine());
+}
+
+}  // namespace mxnet
+
+#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
new file mode 100644
index 000000000000..9f5dc5bc65fd
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
@@ -0,0 +1,335 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_batch_norm-inl.h
+ * \brief
+ * \author Tao Lv (tao.a.lv@intel.com)
+*/
+
+#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
+#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+#include <vector>
+#include <utility>
+#include <mkldnn.hpp>
+#include "../batch_norm-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#define VARIANCE_TO_INVSTD(__var$,    __eps$)   (1.0/sqrt((__var$) + DType(__eps$)))
+#define INVSTD_TO_VARIANCE(__invstd$, __eps$)   ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$))
+namespace mxnet {
+namespace op {
+
+typedef mkldnn::batch_normalization_forward::primitive_desc     t_bn_f_pdesc;
+typedef mkldnn::batch_normalization_forward::desc               t_bn_f_desc;
+typedef mkldnn::batch_normalization_backward::primitive_desc    t_bn_b_pdesc;
+typedef mkldnn::batch_normalization_backward::desc              t_bn_b_desc;
+typedef MKLDNNParamOpSign<BatchNormParam>                       MKLDNNBNSignature;
+
+using mkldnn::use_global_stats;
+using mkldnn::use_scale_shift;
+using mkldnn::forward_training;
+using mkldnn::forward_inference;
+
+inline static unsigned _GetFlags(const std::vector<NDArray> &in_data,
+                                 const std::vector<NDArray> &aux_states,
+                                 const BatchNormParam &param, bool is_train) {
+  unsigned flags = 0U;
+  if (in_data.size() == 3U) {
+    flags |= use_scale_shift;
+  }
+
+  if (aux_states.size() == 2U && !is_train) {
+    flags |= use_global_stats;
+  }
+  return flags;
+}
+
+template <typename DType>
+inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory &data_mem,
+                                   bool is_train,
+                                   DType eps,
+                                   unsigned flags) {
+  auto data_mpd   = data_mem.get_primitive_desc();
+  auto data_md    = data_mpd.desc();
+  auto engine     = CpuEngine::Get()->get_engine();
+
+  if (is_train) {
+    t_bn_f_desc bnFwd_desc(forward_training, data_md, eps, flags);
+    return t_bn_f_pdesc(bnFwd_desc, engine);
+  } else {
+    t_bn_f_desc bnFwd_desc(forward_inference, data_md, eps, flags);
+    return t_bn_f_pdesc(bnFwd_desc, engine);
+  }
+}
+
+template <typename DType>
+inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory &data_mem,
+                                   const mkldnn::memory &diff_mem,
+                                   DType eps,
+                                   unsigned flags) {
+  auto data_mpd   = data_mem.get_primitive_desc();
+  auto data_md    = data_mpd.desc();
+  auto diff_mpd   = diff_mem.get_primitive_desc();
+  auto diff_md    = diff_mpd.desc();
+  auto engine     = CpuEngine::Get()->get_engine();
+
+  t_bn_b_desc  bnBwd_desc(mkldnn::prop_kind::backward, diff_md, data_md, eps, flags);
+  return t_bn_b_pdesc(bnBwd_desc, engine, _GetFwd(data_mem, true, eps, flags));
+}
+
+template <typename DType>
+class MKLDNNBatchNormFwd {
+ public:
+  MKLDNNBatchNormFwd(const mxnet::NDArray &data, DType eps,
+                     bool is_train, bool scale_shift,
+                     bool global_stats, bool fix_gamma) :
+                     _out_mean(nullptr), _out_var(nullptr),
+                     _flag(0U), _fix_gamma(fix_gamma), _is_train(is_train),
+                     _channels(data.shape()[1]), _eps(eps),
+                     fwd(nullptr), data(nullptr), weight(nullptr),
+                     out(nullptr), mean(nullptr), variance(nullptr) {
+    _Init(data, scale_shift, global_stats);
+  }
+
+  ~MKLDNNBatchNormFwd() {}
+
+  void SetDataHandle(const std::vector<OpReqType> &req,
+                     const mxnet::NDArray         &data,
+                     const mxnet::NDArray         &output,
+                     const mxnet::TBlob           &moving_mean,
+                     const mxnet::TBlob           &moving_var,
+                     const mxnet::TBlob           &out_mean,
+                     const mxnet::TBlob           &out_var,
+                     const mxnet::TBlob           *gamma        = nullptr,
+                     const mxnet::TBlob           *beta         = nullptr);
+
+  void Execute();
+
+ private:
+  DType *_out_mean;
+  DType *_out_var;
+  unsigned _flag;
+  bool _fix_gamma;
+  bool _is_train;
+  nnvm::dim_t _channels;
+  DType _eps;
+
+  std::shared_ptr<mkldnn::batch_normalization_forward> fwd;
+  std::shared_ptr<mkldnn::memory> data;
+  std::shared_ptr<mkldnn::memory> weight;
+  std::shared_ptr<mkldnn::memory> out;
+  std::shared_ptr<mkldnn::memory> mean;
+  std::shared_ptr<mkldnn::memory> variance;
+
+ private:
+  void _Init(const mxnet::NDArray &data, bool scale_shift, bool global_stats);
+  void _SetWeight(const mxnet::TBlob &gamma,
+                  const mxnet::TBlob &beta,
+                  const OpReqType    &req);
+  void _SetMeanVar(const DType *imean,
+                   const DType *ivar,
+                   DType *omean,
+                   DType *ovar);
+};
+
+template <typename DType>
+static inline MKLDNNBatchNormFwd<DType> &GetBatchNormFwd(const BatchNormParam &param,
+                                                         bool is_train,
+                                                         const NDArray &data) {
+  static thread_local std::unordered_map<MKLDNNBNSignature,
+                                         MKLDNNBatchNormFwd<DType>,
+                                         MKLDNNOpHash> bn_fwds;
+  MKLDNNBNSignature key(param);
+  key.AddSign(is_train);
+  key.AddSign(*(data.GetMKLDNNData()));
+
+  auto it = bn_fwds.find(key);
+  if (it == bn_fwds.end()) {
+    MKLDNNBatchNormFwd<DType> fwd(data, param.eps, is_train, true,
+                               param.use_global_stats, param.fix_gamma);
+    auto ins_ret = bn_fwds.insert(
+                std::pair<MKLDNNBNSignature, MKLDNNBatchNormFwd<DType> >(key, fwd));
+    CHECK(ins_ret.second);
+    it = ins_ret.first;
+  }
+  return it->second;
+}
+
+template <typename DType>
+void MKLDNNBatchNormCompute(const OpContext &ctx, const BatchNormParam &param,
+                            const std::vector<NDArray>   &in_data,
+                            const std::vector<OpReqType> &req,
+                            const std::vector<NDArray>   &out_data,
+                            const std::vector<NDArray>   &aux_states) {
+  TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
+  const NDArray &data  = in_data[batchnorm::kData];
+  auto gamma           = in_data[batchnorm::kGamma].data();
+  auto beta            = in_data[batchnorm::kBeta].data();
+  auto moving_mean     = aux_states[batchnorm::kMovingMean].data();
+  auto moving_var      = aux_states[batchnorm::kMovingVar].data();
+  const NDArray &out   = out_data[batchnorm::kOut];
+  auto out_mean        = out_data[batchnorm::kMean].data();
+  auto out_var         = out_data[batchnorm::kVar].data();
+
+  MKLDNNBatchNormFwd<DType> &fwd = GetBatchNormFwd<DType>(param, ctx.is_train, data);
+  fwd.SetDataHandle(req, data, out, moving_mean, moving_var,
+                    out_mean, out_var, &gamma, &beta);
+  fwd.Execute();
+}
+
+template <typename DType>
+void MKLDNNBatchNormGradCompute(const OpContext &ctx, const BatchNormParam &param,
+                                const std::vector<NDArray>    &out_grad,
+                                const std::vector<NDArray>    &in_data,
+                                const std::vector<NDArray>    &out_data,
+                                const std::vector<OpReqType>  &req,
+                                const std::vector<NDArray>    &in_grad,
+                                const std::vector<NDArray>    &aux_states) {
+  TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
+  CHECK_EQ(out_grad.size(), param.output_mean_var ? 3U : 1U);
+  CHECK_EQ(in_data.size(), 3U);
+  CHECK_EQ(out_data.size(), 3U);
+  CHECK_EQ(in_grad.size(), 3U);
+  unsigned flags = _GetFlags(in_data, aux_states, param, ctx.is_train);
+
+  const NDArray &data         = in_data[batchnorm::kData];
+  const NDArray &diff         = out_grad[batchnorm::kOut];
+  const NDArray &gradIn       = in_grad[batchnorm::kData];
+  const NDArray &moving_mean  = aux_states[batchnorm::kMovingMean];
+  const NDArray &moving_var   = aux_states[batchnorm::kMovingVar];
+  const NDArray &out_mean     = out_data[batchnorm::kMean];
+  const NDArray &out_var      = out_data[batchnorm::kVar];
+
+  CHECK(out_mean.IsDefault());
+  CHECK(out_var.IsDefault());
+  CHECK(moving_mean.IsDefault());
+  CHECK(moving_var.IsDefault());
+
+  auto data_mem  = data.GetMKLDNNData();
+  auto diff_mem  = diff.GetMKLDNNData();
+  // MKLDNN batchnorm should run on special layouts. If one of them isn't, we
+  // should reorder them.
+  if (data.IsDefault())
+    data_mem = data.GetMKLDNNDataReorder(diff_mem->get_primitive_desc());
+  else if (diff.IsDefault())
+    diff_mem = diff.GetMKLDNNDataReorder(data_mem->get_primitive_desc());
+  auto bwd_pd = _GetBwd(*data_mem, *diff_mem, param.eps, flags);
+  auto gradi_mem = const_cast<NDArray &>(gradIn).CreateMKLDNNData(data_mem->get_primitive_desc());
+
+  if (flags & use_scale_shift) {
+    const NDArray &gamma    = in_data[batchnorm::kGamma];
+    const NDArray &beta     = in_data[batchnorm::kBeta];
+    // TODO(tao): how to reuse this memory?
+    std::shared_ptr<const mkldnn::memory> weight_mem(
+                    new mkldnn::memory(bwd_pd.weights_primitive_desc()));
+
+    DType* weight_buf = reinterpret_cast<DType *>(weight_mem->get_data_handle());
+    nnvm::dim_t channels_ = data.shape()[1];
+    for (int i = 0; i < channels_; i++) {
+      if (!param.fix_gamma)
+        weight_buf[i] = (gamma.data().dptr<DType>())[i];   // weight
+      else
+        weight_buf[i] = (DType)1.0f;
+    }
+
+    for (int i = 0; i < channels_; i++) {
+      weight_buf[channels_ + i] = (beta.data().dptr<DType>())[i];  // bias
+    }
+
+    std::shared_ptr<const mkldnn::memory> gradw_mem(
+                    new mkldnn::memory(bwd_pd.diff_weights_primitive_desc()));
+    // training but no input mean and variance
+    if (ctx.is_train && !param.use_global_stats) {
+      DType* moving_mean_ptr  = reinterpret_cast<DType *>(moving_mean.data().dptr<DType>());
+      DType* moving_var_ptr   = reinterpret_cast<DType *>(moving_var.data().dptr<DType>());
+      DType* out_mean_ptr     = reinterpret_cast<DType *>(out_mean.data().dptr<DType>());
+      DType* out_var_ptr      = reinterpret_cast<DType *>(out_var.data().dptr<DType>());
+      mkldnn::memory var_mem(bwd_pd.variance_primitive_desc());
+      DType *tmp_var_ptr = reinterpret_cast<DType *>(var_mem.get_data_handle());
+
+      DType minus_mom = (1.0f - param.momentum);
+      for (int i = 0; i < channels_; i++) {
+        moving_mean_ptr[i] = moving_mean_ptr[i] * param.momentum +
+                             out_mean_ptr[i] * minus_mom;
+        float variance = INVSTD_TO_VARIANCE(out_var_ptr[i], param.eps);
+        tmp_var_ptr[i] = variance;
+        moving_var_ptr[i] = moving_var_ptr[i] * param.momentum +
+                            variance * minus_mom;
+      }
+
+      std::shared_ptr<const mkldnn::memory> out_mean_mem(
+                      new mkldnn::memory(bwd_pd.mean_primitive_desc(), out_mean_ptr));
+      std::shared_ptr<const mkldnn::memory> out_var_mem(
+                      new mkldnn::memory(bwd_pd.variance_primitive_desc(), out_var_ptr));
+
+      auto bn_bwd = mkldnn::batch_normalization_backward(bwd_pd,
+                                                         *data_mem,
+                                                         mkldnn::primitive::at(*out_mean_mem),
+                                                         mkldnn::primitive::at(var_mem),
+                                                         *diff_mem,
+                                                         *weight_mem,
+                                                         *gradi_mem,
+                                                         *gradw_mem);
+
+      MKLDNNStream::Get()->RegisterPrim(bn_bwd);
+      MKLDNNStream::Get()->Submit();
+    } else {
+      std::shared_ptr<const mkldnn::memory> imean_mem(
+                      new mkldnn::memory(bwd_pd.mean_primitive_desc(),
+                      moving_mean.data().dptr<DType>()));
+      std::shared_ptr<const mkldnn::memory> ivar_mem(
+                      new mkldnn::memory(bwd_pd.variance_primitive_desc(),
+                      moving_var.data().dptr<DType>()));
+      auto bn_bwd = mkldnn::batch_normalization_backward(bwd_pd,
+                                                         *data_mem,
+                                                         mkldnn::primitive::at(*imean_mem),
+                                                         mkldnn::primitive::at(*ivar_mem),
+                                                         *diff_mem,
+                                                         *weight_mem,
+                                                         *gradi_mem,
+                                                         *gradw_mem);
+
+      MKLDNNStream::Get()->RegisterPrim(bn_bwd);
+      MKLDNNStream::Get()->Submit();
+    }
+
+    // copy data from gradw_mem to in_grad[1] and in_grad[2]
+    DType* gw_buf = reinterpret_cast<DType *>(gradw_mem->get_data_handle());
+    for (int i = 0; i < channels_; i++) {
+      if (!param.fix_gamma)
+        (in_grad[1].data().dptr<DType>())[i] = gw_buf[i];
+      else
+        (in_grad[1].data().dptr<DType>())[i] = 0.0f;
+    }
+
+    for (int i = 0; i < channels_; i++) {
+      (in_grad[2].data().dptr<DType>())[i] = gw_buf[i + channels_];
+    }
+  } else {
+    LOG(FATAL) << "MKLDNN batch normalization backward: should not reach here ...";
+  }
+}
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN
+#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm.cc b/src/operator/nn/mkldnn/mkldnn_batch_norm.cc
new file mode 100644
index 000000000000..b40a9f3b290e
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_batch_norm.cc
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_batch_norm.cc
+ * \brief
+ * \author Tao Lv (tao.a.lv@intel.com)
+*/
+
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn_batch_norm-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template <typename DType>
+void MKLDNNBatchNormFwd<DType>::_Init(const mxnet::NDArray &src,
+                                      bool scale_shift,
+                                      bool global_stats) {
+    this->_flag |= scale_shift ? use_scale_shift : 0U;
+    // this->_flag |= global_stats ? use_global_stats : 0U;
+    if (!(this->_is_train))
+        this->_flag |= use_global_stats;
+
+    auto src_md = src.GetMKLDNNData()->get_primitive_desc().desc();
+    auto engine = CpuEngine::Get()->get_engine();
+
+    mkldnn::prop_kind prop = forward_training;
+    if (this->_is_train) {
+        prop = forward_training;
+    } else {
+        prop = forward_inference;
+    }
+
+    auto fwd_desc = t_bn_f_desc(prop, src_md, this->_eps, this->_flag);
+    auto fwd_pd   = t_bn_f_pdesc(fwd_desc, engine);
+
+    this->data.reset(new mkldnn::memory(src.GetMKLDNNData()->get_primitive_desc()));
+    this->out.reset(new mkldnn::memory(fwd_pd.dst_primitive_desc()));
+
+    if (this->_flag & use_scale_shift) {
+        this->weight.reset(new memory(fwd_pd.weights_primitive_desc()));
+    }
+
+    if (this->_is_train || (this->_flag & use_global_stats)) {
+        this->mean.reset(new mkldnn::memory(fwd_pd.mean_primitive_desc()));
+        this->variance.reset(new mkldnn::memory(fwd_pd.variance_primitive_desc()));
+    }
+
+    // for mxnet, there always has weight
+    CHECK_EQ(this->_flag & use_scale_shift, use_scale_shift);
+    if (!(this->_is_train)) {
+        this->fwd.reset(
+                new mkldnn::batch_normalization_forward(fwd_pd,
+                                                        *(this->data),
+                                                        mkldnn::primitive::at(*(this->mean)),
+                                                        mkldnn::primitive::at(*(this->variance)),
+                                                        mkldnn::primitive::at(*(this->weight)),
+                                                        *(this->out)));
+    } else {
+        this->fwd.reset(
+                new mkldnn::batch_normalization_forward(fwd_pd,
+                                                        *(this->data),
+                                                        mkldnn::primitive::at(*(this->weight)),
+                                                        *(this->out),
+                                                        *(this->mean),
+                                                        *(this->variance)));
+    }
+    return;
+}
+
+template <typename DType>
+void MKLDNNBatchNormFwd<DType>::SetDataHandle(const std::vector<OpReqType> &req,
+                                              const mxnet::NDArray         &data,
+                                              const mxnet::NDArray         &output,
+                                              const mxnet::TBlob           &moving_mean,
+                                              const mxnet::TBlob           &moving_var,
+                                              const mxnet::TBlob           &out_mean,
+                                              const mxnet::TBlob           &out_var,
+                                              const mxnet::TBlob           *gamma,
+                                              const mxnet::TBlob           *beta) {
+    auto data_mem = data.GetMKLDNNData();
+    auto out_mem = const_cast<NDArray&>(output).CreateMKLDNNData(this->out->get_primitive_desc());
+    this->data->set_data_handle(data_mem->get_data_handle());
+    this->out->set_data_handle(out_mem->get_data_handle());
+
+    // weights
+    if (gamma != nullptr && beta != nullptr && (this->_flag | use_scale_shift)) {
+      _SetWeight(*gamma, *beta, req[batchnorm::kGamma]);
+    }
+
+    // mean and variance
+    this->_out_mean = out_mean.dptr<DType>();
+    this->_out_var  = out_var.dptr<DType>();
+    if (!(this->_is_train)) {
+      this->mean->set_data_handle(moving_mean.dptr<DType>());
+      this->variance->set_data_handle(moving_var.dptr<DType>());
+    } else {
+      this->mean->set_data_handle(this->_out_mean);
+      this->variance->set_data_handle(this->_out_var);
+    }
+}
+
+template <typename DType>
+void MKLDNNBatchNormFwd<DType>::Execute() {
+    MKLDNNStream::Get()->RegisterPrim(*(this->fwd));
+    MKLDNNStream::Get()->Submit();
+    _SetMeanVar(reinterpret_cast<DType*>(this->mean->get_data_handle()),
+                reinterpret_cast<DType*>(this->variance->get_data_handle()),
+                this->_out_mean, this->_out_var);
+}
+
+template <typename DType>
+void MKLDNNBatchNormFwd<DType>::_SetWeight(const mxnet::TBlob &gamma,
+                                           const mxnet::TBlob &beta,
+                                           const OpReqType    &req) {
+    // CHECK_NE(this->weight, nullptr);
+    DType *gamma_ptr  = gamma.dptr<DType>();
+    DType *beta_ptr   = beta.dptr<DType>();
+    DType *weight_ptr = reinterpret_cast<DType*>(this->weight->get_data_handle());
+    // std::cout << "_SetWeight: channel size: " << this->_channels << std::endl;
+    if (!(this->_fix_gamma)) {
+#pragma omp parallel for simd
+      for (int i = 0; i < this->_channels; i++) {
+        weight_ptr[i] = gamma_ptr[i];
+        weight_ptr[this->_channels + i] = beta_ptr[i];  // bias
+      }
+    } else if (IsBNWriting(req)) {
+#pragma omp parallel for simd
+      for (int i = 0; i < this->_channels; i++) {
+        weight_ptr[i] = (DType)1.0f;
+        weight_ptr[this->_channels + i] = beta_ptr[i];  // bias
+        gamma_ptr[i] = (DType)1.0f;
+      }
+    } else {
+#pragma omp parallel for simd
+      for (int i = 0; i < this->_channels; i++) {
+        weight_ptr[i] = (DType)1.0f;
+        weight_ptr[this->_channels + i] = beta_ptr[i];  // bias
+      }
+    }
+}
+
+template <typename DType>
+void MKLDNNBatchNormFwd<DType>::_SetMeanVar(const DType *imean,
+                                            const DType *ivar,
+                                            DType *omean,
+                                            DType *ovar) {
+    float e = this->_eps;
+#pragma omp parallel for firstprivate(e)
+    for (int i = 0; i < this->_channels; i++) {
+      omean[i] = imean[i];
+      ovar[i] = VARIANCE_TO_INVSTD(ivar[i], e);
+    }
+}
+
+template class MKLDNNBatchNormFwd<float>;
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN
diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc
new file mode 100644
index 000000000000..d3e6e775020d
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_concat.cc
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_concat.cc
+ * \brief
+ * \author Wenting Jiang
+*/
+#include "../concat-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                         const std::vector<NDArray> &in_data,
+                         const std::vector<OpReqType> &req,
+                         const std::vector<NDArray> &out_data) {
+  TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]);
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  int num_in_data = param.num_args;
+  int concat_dim = param.dim;
+  std::vector<mkldnn::memory::primitive_desc> data_md;
+  std::vector<mkldnn::primitive::at> data_mem;
+  for (int i =0; i < num_in_data; i++) {
+      auto tmp_mem = in_data[i].GetMKLDNNData();
+      auto tmp_pd = tmp_mem->get_primitive_desc();
+      data_md.push_back(tmp_pd);
+      data_mem.push_back(*tmp_mem);
+  }
+  mkldnn::concat::primitive_desc fwd_pd(concat_dim, data_md);
+  auto engine = CpuEngine::Get()->get_engine();
+  auto out_mem = CreateMKLDNNMem(out_data[concat_enum::kOut],
+      fwd_pd.dst_primitive_desc(), req[concat_enum::kOut]);
+  MKLDNNStream::Get()->RegisterPrim(mkldnn::concat(fwd_pd, data_mem, *out_mem.second));
+  CommitOutput(out_data[concat_enum::kOut], out_mem);
+  MKLDNNStream::Get()->Submit();
+}
+
+void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                          const std::vector<NDArray>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<NDArray>& outputs) {
+  TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]);
+  const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
+  int num_in_data = param.num_args;
+  int axis_ = param.dim;
+  auto engine = CpuEngine::Get()->get_engine();
+  auto gz_mem = inputs[0].GetMKLDNNData();
+  mkldnn::memory::primitive_desc gz_pd = gz_mem->get_primitive_desc();
+  /* init the offset */
+  mkldnn::memory::dims offsets = {0, 0, 0, 0};
+  for (int i = 0; i < num_in_data; i++) {
+    mkldnn::memory::dims diff_src_tz
+        = {static_cast<int>(inputs[i+1].shape()[0]),
+          static_cast<int>(inputs[i+1].shape()[1]),
+          static_cast<int>(inputs[i+1].shape()[2]),
+          static_cast<int>(inputs[i+1].shape()[3])};
+    auto diff_src_mpd = inputs[i+1].GetMKLDNNData()->get_primitive_desc();
+    auto gradi_mem_ = CreateMKLDNNMem(outputs[i], diff_src_mpd, req[i]);
+    // create view from gy to gxs[i]
+    std::shared_ptr<mkldnn::view::primitive_desc> view_pd;
+    view_pd.reset(new mkldnn::view::primitive_desc(gz_pd, diff_src_tz, offsets));
+    // create reorder primitive from gy to gxs[i]
+    mkldnn::reorder::primitive_desc reorder_pd(
+        view_pd.get()->dst_primitive_desc(), diff_src_mpd);
+    offsets[axis_] += diff_src_tz[axis_];
+    MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(
+            reorder_pd, *gz_mem, *gradi_mem_.second));
+    CommitOutput(outputs[i], gradi_mem_);
+  }
+  MKLDNNStream::Get()->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
new file mode 100644
index 000000000000..f10ff0f674a2
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -0,0 +1,357 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_convolution.cc
+ * \brief
+ * \author Da Zheng
+*/
+
+#include "../convolution-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+static mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
+    const ConvolutionParam& param, bool is_train, const NDArray &data,
+    const NDArray &weights, const NDArray *bias, const NDArray &output) {
+  auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetWeightDesc(weights, param.num_group);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  mkldnn::memory::dims strides{0, 0};
+  if (param.stride.ndim() == 2) {
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+  }
+  mkldnn::memory::dims padding{0, 0};
+  if (param.pad.ndim() == 2) {
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  }
+  if (param.dilate.ndim() == 0 && bias == nullptr) {
+    mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
+        data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero);
+    return mkldnn::convolution_forward::primitive_desc(desc, engine);
+  } else if (param.dilate.ndim() == 0) {
+    auto bias_md = GetMemDesc(*bias);
+    mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
+        data_md, weight_md, bias_md, out_md, strides, padding, padding,
+        mkldnn::padding_kind::zero);
+    return mkldnn::convolution_forward::primitive_desc(desc, engine);
+  } else {
+    mkldnn::memory::dims dilates{0, 0};
+    if (param.dilate.ndim() == 2) {
+      dilates[0] = param.dilate[0] - 1;
+      dilates[1] = param.dilate[1] - 1;
+    }
+    if (bias == nullptr) {
+      mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
+          data_md, weight_md, out_md, strides, dilates, padding, padding,
+          mkldnn::padding_kind::zero);
+      return mkldnn::convolution_forward::primitive_desc(desc, engine);
+    } else {
+      auto bias_md = GetMemDesc(*bias);
+      mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
+                                             data_md, weight_md, bias_md, out_md, strides,
+                                             dilates, padding, padding,
+                                             mkldnn::padding_kind::zero);
+      return mkldnn::convolution_forward::primitive_desc(desc, engine);
+    }
+  }
+}
+
+static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData(
+    const ConvolutionParam& param, const NDArray &data, const NDArray &weights,
+    const NDArray &output, const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetWeightDesc(weights, param.num_group);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  mkldnn::memory::dims strides{0, 0};
+  if (param.stride.ndim() == 2) {
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+  }
+  mkldnn::memory::dims padding{0, 0};
+  if (param.pad.ndim() == 2) {
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  }
+  if (param.dilate.ndim() == 0) {
+    mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
+        data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero);
+    return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd);
+  } else {
+    mkldnn::memory::dims dilates{0, 0};
+    if (param.dilate.ndim() == 2) {
+      dilates[0] = param.dilate[0] - 1;
+      dilates[1] = param.dilate[1] - 1;
+    }
+    mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
+        data_md, weight_md, out_md, strides, dilates, padding, padding,
+        mkldnn::padding_kind::zero);
+    return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd);
+  }
+}
+
+static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights(
+    const ConvolutionParam& param, const NDArray &data,
+    const NDArray &weights, const NDArray *bias, const NDArray &output,
+    const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetWeightDesc(weights, param.num_group);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  mkldnn::memory::dims strides{0, 0};
+  if (param.stride.ndim() == 2) {
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+  }
+  mkldnn::memory::dims padding{0, 0};
+  if (param.pad.ndim() == 2) {
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  }
+  if (param.dilate.ndim() == 0 && bias == nullptr) {
+    mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+        data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero);
+    return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
+  } else if (param.dilate.ndim() == 0) {
+    auto bias_md = GetMemDesc(*bias);
+    mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+        data_md, weight_md, bias_md, out_md, strides, padding, padding,
+        mkldnn::padding_kind::zero);
+    return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
+  } else {
+    mkldnn::memory::dims dilates{0, 0};
+    if (param.dilate.ndim() == 2) {
+      dilates[0] = param.dilate[0] - 1;
+      dilates[1] = param.dilate[1] - 1;
+    }
+    if (bias == nullptr) {
+      mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+          data_md, weight_md, out_md, strides, dilates, padding, padding,
+          mkldnn::padding_kind::zero);
+      return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
+    } else {
+      auto bias_md = GetMemDesc(*bias);
+      mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+                                                      data_md, weight_md, bias_md, out_md,
+                                                      strides, dilates, padding, padding,
+                                                      mkldnn::padding_kind::zero);
+      return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
+    }
+  }
+}
+
+class MKLDNNConvForward {
+  std::shared_ptr<mkldnn::convolution_forward> fwd;
+  std::shared_ptr<mkldnn::memory> data;
+  std::shared_ptr<mkldnn::memory> weight;
+  std::shared_ptr<mkldnn::memory> bias;
+  std::shared_ptr<mkldnn::memory> out;
+
+ public:
+  mkldnn::convolution_forward::primitive_desc fwd_pd;
+
+  MKLDNNConvForward(const ConvolutionParam& param, bool is_train,
+                    const NDArray &data, const NDArray &weights,
+                    const NDArray *bias, const NDArray &output): fwd_pd(
+                        GetConvFwdImpl(param, is_train, data, weights, bias, output)) {
+  }
+
+  void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &weight,
+                 const mkldnn::memory *bias, const mkldnn::memory &output) {
+    if (this->data == nullptr)
+      this->data = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+              fwd_pd.src_primitive_desc(), data.get_data_handle()));
+    else
+      this->data->set_data_handle(data.get_data_handle());
+
+    if (this->weight == nullptr)
+      this->weight = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+              fwd_pd.weights_primitive_desc(), weight.get_data_handle()));
+    else
+      this->weight->set_data_handle(weight.get_data_handle());
+
+    if (this->out == nullptr)
+      this->out = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+              fwd_pd.dst_primitive_desc(), output.get_data_handle()));
+    else
+      this->out->set_data_handle(output.get_data_handle());
+
+    if (bias != nullptr) {
+      if (this->bias == nullptr)
+        this->bias = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+                fwd_pd.bias_primitive_desc(), bias->get_data_handle()));
+      else
+        this->bias->set_data_handle(bias->get_data_handle());
+      if (this->fwd == nullptr)
+        this->fwd = std::shared_ptr<mkldnn::convolution_forward>(
+            new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data),
+                                            mkldnn::primitive::at(*this->weight),
+                                            mkldnn::primitive::at(*this->bias),
+                                            *this->out));
+    } else if (this->fwd == nullptr) {
+      this->fwd = std::shared_ptr<mkldnn::convolution_forward>(
+          new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data),
+                                          mkldnn::primitive::at(*this->weight),
+                                          *this->out));
+    }
+  }
+
+  const mkldnn::convolution_forward &GetFwd() const {
+    return *fwd;
+  }
+};
+
+typedef MKLDNNParamOpSign<ConvolutionParam> MKLDNNConvSignature;
+
+static inline MKLDNNConvForward &GetConvFwd(
+    const nnvm::NodeAttrs& attrs, bool is_train,
+    const NDArray &data, const NDArray &weights,
+    const NDArray *bias, const NDArray &output) {
+  static thread_local std::unordered_map<MKLDNNConvSignature, MKLDNNConvForward, MKLDNNOpHash> fwds;
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  MKLDNNConvSignature key(param);
+  key.AddSign(is_train);
+  // Here we can sign the conv op with NDArray because conv primitive will
+  // decide the right layout for the, so we only need to get the shape and the
+  // data type of the arrays.
+  key.AddSign(data);
+  key.AddSign(weights);
+  key.AddSign(output);
+  if (bias)
+    key.AddSign(*bias);
+
+  auto it = fwds.find(key);
+  if (it == fwds.end()) {
+    MKLDNNConvForward fwd(param, is_train, data, weights, bias, output);
+    auto ins_ret = fwds.insert(
+        std::pair<MKLDNNConvSignature, MKLDNNConvForward>(key, fwd));
+    CHECK(ins_ret.second);
+    it = ins_ret.first;
+  }
+  return it->second;
+}
+
+void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                               const std::vector<NDArray> &in_data,
+                               const std::vector<OpReqType> &req,
+                               const std::vector<NDArray> &out_data) {
+  TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  MKLDNNConvForward &fwd = GetConvFwd(attrs,
+      ctx.is_train, in_data[conv::kData], in_data[conv::kWeight],
+      param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]);
+
+  auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd.fwd_pd.src_primitive_desc());
+  const mkldnn::memory *weight_mem;
+  if (ctx.is_train) {
+    // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it
+    // to the default format for now.
+    if (in_data[conv::kWeight].IsMKLDNN())
+      const_cast<NDArray &>(in_data[conv::kWeight]).Reorder2Default();
+    weight_mem = GetWeights(in_data[conv::kWeight], fwd.fwd_pd.weights_primitive_desc(),
+                            param.num_group);
+  } else {
+    // For inference, we want to reorder the weight array so we don't need to
+    // reorder data every time.
+    const_cast<NDArray &>(in_data[conv::kWeight]).Reorder(
+        fwd.fwd_pd.weights_primitive_desc());
+    weight_mem = in_data[conv::kWeight].GetMKLDNNData();
+  }
+  auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd.fwd_pd.dst_primitive_desc(),
+                                 req[conv::kOut]);
+  const mkldnn::memory *bias_mem = nullptr;
+  if (!param.no_bias)
+    bias_mem = in_data[conv::kBias].GetMKLDNNDataReorder(fwd.fwd_pd.bias_primitive_desc());
+  fwd.SetNewMem(*data_mem, *weight_mem, bias_mem, *out_mem.second);
+  MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd());
+
+  CommitOutput(out_data[conv::kOut], out_mem);
+  MKLDNNStream::Get()->Submit();
+}
+
+void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                               const std::vector<NDArray>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<NDArray>& outputs) {
+  TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
+  const std::vector<NDArray> &in_grad = outputs;
+  const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
+  mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwdImpl(param, ctx.is_train,
+      inputs[conv::kData + 1], inputs[conv::kWeight + 1],
+      param.no_bias ? nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut]);
+
+  CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace";
+  mkldnn::convolution_backward_data::primitive_desc bwdData_pd
+    = GetConvBwdData(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1],
+        inputs[conv::kOut], fwd_pd);
+  auto out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder(
+      bwdData_pd.diff_dst_primitive_desc());
+  if (req[conv::kData]) {
+    auto weight_mem = GetWeights(inputs[conv::kWeight + 1],
+        bwdData_pd.weights_primitive_desc(), param.num_group);
+    auto in_grad_mem = CreateMKLDNNMem(in_grad[conv::kData],
+        bwdData_pd.diff_src_primitive_desc(), req[conv::kData]);
+    MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_data(bwdData_pd,
+          *out_grad_mem, *weight_mem, *in_grad_mem.second));
+    CommitOutput(in_grad[conv::kData], in_grad_mem);
+  }
+  if (req[conv::kWeight]) {
+    mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd
+        = GetConvBwdWeights(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1],
+                            param.no_bias ? nullptr : &inputs[conv::kBias + 1],
+                            inputs[conv::kOut], fwd_pd);
+    if (bwdData_pd.diff_dst_primitive_desc() != bwdWeights_pd.diff_dst_primitive_desc())
+      out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder(
+          bwdWeights_pd.diff_dst_primitive_desc());
+    auto data_mem = inputs[conv::kData + 1].GetMKLDNNDataReorder(
+        bwdWeights_pd.src_primitive_desc());
+    auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[conv::kWeight],
+                                                 bwdWeights_pd.diff_weights_primitive_desc(),
+                                                 req[conv::kWeight]);
+    mkldnn_output_t in_grad_bias;
+    if (param.no_bias) {
+      MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights(
+              bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second));
+    } else {
+      in_grad_bias = CreateMKLDNNMem(in_grad[conv::kBias],
+                                     bwdWeights_pd.diff_bias_primitive_desc(),
+                                     req[conv::kBias]);
+      MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights(
+              bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second,
+              *in_grad_bias.second));
+      CommitOutput(in_grad[conv::kBias], in_grad_bias);
+    }
+    CommitOutput(in_grad[conv::kWeight], in_grad_weight);
+  }
+  MKLDNNStream::Get()->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc
new file mode 100644
index 000000000000..71d540c969cd
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_copy.cc
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_softmax.cc
+ * \brief
+ * \author Da Zheng
+*/
+
+#include "../softmax-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                const NDArray &in_data, const OpReqType &req,
+                const NDArray &out_data) {
+  TmpMemMgr::Get()->Init(ctx.requested[0]);
+  auto in_mem = in_data.GetMKLDNNData();
+  if (req == kAddTo) {
+    TmpMemMgr::Get()->Init(ctx.requested[0]);
+    // We should try and force the output memory has the same format
+    // as the input memory. If not, we'll have to reorder memory.
+    auto out_mem = out_data.GetMKLDNNData(in_mem->get_primitive_desc());
+    if (out_mem == nullptr)
+      out_mem = out_data.GetMKLDNNData();
+    auto sum_res = TmpMemMgr::Get()->Alloc(out_mem->get_primitive_desc());
+    Sum(*in_mem, *out_mem, *sum_res);
+    const_cast<NDArray &>(out_data).CopyFrom(*sum_res);
+  } else {
+    const_cast<NDArray &>(out_data).CopyFrom(*in_mem);
+  }
+  MKLDNNStream::Get()->Submit();
+}
+
+}   // namespace op
+}   // namespace mxnet
+#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
new file mode 100644
index 000000000000..dc22437e68df
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_deconvolution.cc
+ * \brief
+ * \author Da Zheng
+*/
+
+#include "../deconvolution-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+static inline mkldnn::memory::desc GetBiasDesc(mkldnn::memory::desc md) {
+  mkldnn::memory::dims dims(1);
+  // This is convolution on 4D data. The second dimension is the channel.
+  dims[0] = md.data.dims[1];
+  return mkldnn::memory::desc(dims,
+      static_cast<mkldnn::memory::data_type>(md.data.data_type),
+      mkldnn::memory::format::any);
+}
+
+static mkldnn::convolution_forward::primitive_desc GetDeconvBwd_(
+    const mkldnn::memory::desc &data_md, const mkldnn::memory::desc &weights_md,
+    bool has_bias, const mkldnn::memory::desc &out_md,
+    const mkldnn::engine &engine, const mkldnn::memory::dims &strides,
+    const mkldnn::memory::dims &padding, const mkldnn::memory::dims &dilates) {
+  if (!has_bias) {
+    mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training,
+        mkldnn::algorithm::convolution_direct, out_md, weights_md, data_md, strides,
+        dilates, padding, padding, mkldnn::padding_kind::zero);
+    return mkldnn::convolution_forward::primitive_desc(desc, engine);
+  } else {
+    auto bias_md = GetBiasDesc(data_md);
+    mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training,
+        mkldnn::algorithm::convolution_direct, out_md, weights_md, bias_md,
+        data_md, strides, dilates, padding, padding, mkldnn::padding_kind::zero);
+    return mkldnn::convolution_forward::primitive_desc(desc, engine);
+  }
+}
+
+static mkldnn::convolution_backward_data::primitive_desc GetDeconvFwd(
+    const DeconvolutionParam& param, const NDArray &data, const NDArray &weights,
+    bool has_bias, const NDArray &output) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetWeightDesc(weights, param.num_group);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  mkldnn::memory::dims strides{0, 0};
+  if (param.stride.ndim() == 2) {
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+  }
+  mkldnn::memory::dims padding{0, 0};
+  if (param.pad.ndim() == 2) {
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  }
+  mkldnn::memory::dims dilate{0, 0};
+  if (param.dilate.ndim() == 2) {
+    dilate[0] = param.dilate[0] - 1;
+    dilate[1] = param.dilate[1] - 1;
+  }
+  auto bwd_pd = GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine,
+      strides, padding, dilate);
+  mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
+      out_md, weight_md, data_md, strides, dilate, padding, padding,
+      mkldnn::padding_kind::zero);
+  return mkldnn::convolution_backward_data::primitive_desc(desc, engine, bwd_pd);
+}
+
+static mkldnn::convolution_forward::primitive_desc GetDeconvBwdData(
+    const DeconvolutionParam &param, const NDArray &data, const NDArray &weights,
+    bool has_bias, const NDArray &output) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetWeightDesc(weights, param.num_group);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  mkldnn::memory::dims strides{0, 0};
+  if (param.stride.ndim() == 2) {
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+  }
+  mkldnn::memory::dims padding{0, 0};
+  if (param.pad.ndim() == 2) {
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  }
+  mkldnn::memory::dims dilate{0, 0};
+  if (param.dilate.ndim() == 2) {
+    dilate[0] = param.dilate[0] - 1;
+    dilate[1] = param.dilate[1] - 1;
+  }
+  return GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine,
+      strides, padding, dilate);
+}
+
+static mkldnn::convolution_backward_weights::primitive_desc GetDeconvBwdWeights(
+    const DeconvolutionParam& param, const NDArray &data, const NDArray &weights,
+    bool has_bias, const NDArray &output,
+    const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetWeightDesc(weights, param.num_group);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  mkldnn::memory::dims strides{0, 0};
+  if (param.stride.ndim() == 2) {
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+  }
+  mkldnn::memory::dims padding{0, 0};
+  if (param.pad.ndim() == 2) {
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  }
+  mkldnn::memory::dims dilate{0, 0};
+  if (param.dilate.ndim() == 2) {
+    dilate[0] = param.dilate[0] - 1;
+    dilate[1] = param.dilate[1] - 1;
+  }
+  if (!has_bias) {
+    mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+        out_md, weight_md, data_md, strides, dilate, padding, padding, mkldnn::padding_kind::zero);
+    return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
+  } else {
+    auto bias_md = GetBiasDesc(data_md);
+    mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+        out_md, weight_md, bias_md, data_md, strides, dilate, padding, padding,
+        mkldnn::padding_kind::zero);
+    return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
+  }
+}
+
+void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                                const std::vector<NDArray> &in_data,
+                                const std::vector<OpReqType> &req,
+                                const std::vector<NDArray> &out_data) {
+  TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]);
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+
+  mkldnn::convolution_backward_data::primitive_desc deconvFwd_pd = GetDeconvFwd(
+      param, in_data[deconv::kData], in_data[deconv::kWeight], false,
+      out_data[deconv::kOut]);
+  auto data_mem = in_data[deconv::kData].GetMKLDNNDataReorder(
+      deconvFwd_pd.diff_dst_primitive_desc());
+  const mkldnn::memory *weight_mem;
+  if (ctx.is_train) {
+    // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it
+    // to the default format for now.
+    if (in_data[deconv::kWeight].IsMKLDNN())
+      const_cast<NDArray &>(in_data[deconv::kWeight]).Reorder2Default();
+    weight_mem = GetWeights(in_data[deconv::kWeight],
+                            deconvFwd_pd.weights_primitive_desc(),
+                            param.num_group);
+  } else {
+    // For inference, we want to reorder the weight array so we don't need to
+    // reorder data every time.
+    const_cast<NDArray &>(in_data[deconv::kWeight]).Reorder(
+        deconvFwd_pd.weights_primitive_desc());
+    weight_mem = in_data[deconv::kWeight].GetMKLDNNData();
+  }
+  auto out_mem = CreateMKLDNNMem(out_data[deconv::kOut],
+                                 deconvFwd_pd.diff_src_primitive_desc(),
+                                 req[deconv::kOut]);
+
+  MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_data(
+        deconvFwd_pd, *data_mem, *weight_mem, *out_mem.second));
+  CommitOutput(out_data[deconv::kOut], out_mem);
+  MKLDNNStream::Get()->Submit();
+  // add bias, broadcast bias to dim 1: channel
+  if (!param.no_bias) {
+    // MKLDNN only supports float right now.
+    typedef float DType;
+    Stream<cpu> *s = ctx.get_stream<cpu>();
+    Tensor<cpu, 1, DType> bias = in_data[deconv::kBias].data().get<cpu, 1, DType>(s);
+    // If the output data is stored in a special MKLDNN format, data()
+    // automatically converts its format to the default format.
+    // Unfortunately, MKLDNN doesn't support broadcast.
+    Tensor<cpu, 4, DType> out_cpu = out_data[deconv::kOut].data().get<cpu, 4, DType>(s);
+    out_cpu += mshadow::expr::broadcast<1>(bias, out_cpu.shape_);
+  }
+}
+
+void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                                 const std::vector<NDArray>& inputs,
+                                 const std::vector<OpReqType>& req,
+                                 const std::vector<NDArray>& outputs) {
+  TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]);
+  const std::vector<NDArray> &in_grad = outputs;
+  const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+  CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace";
+  mkldnn::convolution_forward::primitive_desc bwdData_pd = GetDeconvBwdData(
+      param, inputs[deconv::kData + 1], inputs[deconv::kWeight + 1], false,
+      inputs[deconv::kOut]);
+  auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder(
+      bwdData_pd.src_primitive_desc());
+  if (req[deconv::kData]) {
+    auto weight_mem = GetWeights(inputs[deconv::kWeight + 1],
+                                 bwdData_pd.weights_primitive_desc(),
+                                 param.num_group);
+    auto in_grad_mem = CreateMKLDNNMem(in_grad[deconv::kData],
+                                       bwdData_pd.dst_primitive_desc(),
+                                       req[deconv::kData]);
+    MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_forward(bwdData_pd,
+          *out_grad_mem, *weight_mem, *in_grad_mem.second));
+    CommitOutput(in_grad[deconv::kData], in_grad_mem);
+  }
+  if (req[deconv::kWeight]) {
+    mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd
+      = GetDeconvBwdWeights(param, inputs[deconv::kData + 1],
+          inputs[deconv::kWeight + 1], false, inputs[deconv::kOut], bwdData_pd);
+    if (bwdData_pd.src_primitive_desc() != bwdWeights_pd.src_primitive_desc())
+      out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder(
+          bwdWeights_pd.src_primitive_desc());
+    auto data_mem = inputs[deconv::kData + 1].GetMKLDNNDataReorder(
+        bwdWeights_pd.diff_dst_primitive_desc());
+    auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[deconv::kWeight],
+                                                 bwdWeights_pd.diff_weights_primitive_desc(),
+                                                 req[deconv::kWeight]);
+    MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights(
+          bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight.second));
+    CommitOutput(in_grad[deconv::kWeight], in_grad_weight);
+  }
+  MKLDNNStream::Get()->Submit();
+  if (!param.no_bias) {
+    typedef float DType;
+    Stream<cpu> *s = ctx.get_stream<cpu>();
+    Tensor<cpu, 1, DType> gbias = in_grad[deconv::kBias].data().get<cpu, 1, DType>(s);
+    // If there is bias, the out grad has already been converted to the default
+    // format, so this shouldn't cause any performance issues.
+    Tensor<cpu, 4, DType> grad = inputs[deconv::kOut].data().get<cpu, 4, DType>(s);
+    Assign(gbias, req[deconv::kBias], mshadow::expr::sumall_except_dim<1>(grad));
+  }
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
new file mode 100644
index 000000000000..d396276719e8
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
@@ -0,0 +1,337 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_fully_connected.cc
+ * \brief
+ * \author Da Zheng
+*/
+
+#include "../fully_connected-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+inline static mkldnn::inner_product_forward::primitive_desc GetIPFwd(
+    const NDArray &data, const NDArray &weight, const NDArray *bias,
+    const mkldnn::memory::desc &out_md) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetMemDesc(weight);
+  auto engine = CpuEngine::Get()->get_engine();
+  if (bias) {
+    auto bias_md = GetMemDesc(*bias);
+    mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training,
+        data_md, weight_md, bias_md, out_md);
+    return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine);
+  } else {
+    mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training,
+        data_md, weight_md, out_md);
+    return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine);
+  }
+}
+
+inline static mkldnn::inner_product_backward_data::primitive_desc GetIpBwdData(
+    const NDArray &data, const NDArray &weight, const NDArray &output,
+    mkldnn::inner_product_forward::primitive_desc ipFwd_pd) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetMemDesc(weight);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  mkldnn::inner_product_backward_data::desc desc(data_md, weight_md, out_md);
+  return mkldnn::inner_product_backward_data::primitive_desc(desc, engine, ipFwd_pd);
+}
+
+inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwdWeights(
+    const NDArray &data, const NDArray &weight, const NDArray *bias,
+    const NDArray &output, mkldnn::inner_product_forward::primitive_desc ipFwd_pd) {
+  auto data_md = GetMemDesc(data);
+  auto weight_md = GetMemDesc(weight);
+  auto out_md = GetMemDesc(output);
+  auto engine = CpuEngine::Get()->get_engine();
+  if (bias) {
+    auto bias_md = GetMemDesc(*bias);
+    mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md,
+        weight_md, bias_md, out_md);
+    return mkldnn::inner_product_backward_weights::primitive_desc(
+        ipBwdWeights_desc, engine, ipFwd_pd);
+  } else {
+    mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md,
+        weight_md, out_md);
+    return mkldnn::inner_product_backward_weights::primitive_desc(
+        ipBwdWeights_desc, engine, ipFwd_pd);
+  }
+}
+
+typedef MKLDNNParamOpSign<FullyConnectedParam> MKLDNNFullyConnectedSignature;
+
+class MKLDNNFullyConnectedFwd {
+ public:
+  std::shared_ptr<mkldnn::inner_product_forward::primitive_desc> fwd_pd;
+  MKLDNNFullyConnectedFwd(const FullyConnectedParam &param,
+                          NDArray *data, const NDArray &weights,
+                          const NDArray *bias, const NDArray &output,
+                          const OpReqType &req_out) {
+    _Init(param, data, weights, bias, output, req_out);
+  }
+  ~MKLDNNFullyConnectedFwd() {}
+  void SetDataHandle(const FullyConnectedParam &param,
+                     NDArray *data,
+                     const NDArray &weights,
+                     const NDArray *bias,
+                     const NDArray &output,
+                     const OpReqType &req_out);
+  void Execute(const NDArray &output);
+
+ private:
+  void _Init(const FullyConnectedParam &param, NDArray *data,
+             const NDArray &weights, const NDArray *bias,
+             const NDArray &output, const OpReqType &req_out);
+
+ private:
+  std::shared_ptr<mkldnn::inner_product_forward> fwd;
+  std::shared_ptr<mkldnn::memory> data;
+  std::shared_ptr<mkldnn::memory> weights;
+  std::shared_ptr<mkldnn::memory> bias;
+  std::shared_ptr<mkldnn::memory> out;
+  OutDataOp data_op;
+};
+
+void MKLDNNFullyConnectedFwd::_Init(const FullyConnectedParam &param,
+                                    NDArray *data,
+                                    const NDArray &weights,
+                                    const NDArray *bias,
+                                    const NDArray &output,
+                                    const OpReqType &req_out) {
+  const TShape& ishape = data->shape();
+  const TShape& oshape = output.shape();
+  auto out_md = GetMemDesc(output);
+  if (data->shape().ndim() != 2 && !param.flatten) {
+    *data = data->ReshapeMKLDNN(Shape2(ishape.ProdShape(0, ishape.ndim()-1),
+                                     ishape[ishape.ndim()-1]));
+    mkldnn::memory::dims out_dims{static_cast<int>(oshape.ProdShape(0,
+                                                   oshape.ndim()-1)),
+                                  static_cast<int>(oshape[ishape.ndim()-1])};
+    out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(output.dtype()),
+                                  mkldnn::memory::format::any);
+  } else if (data->shape().ndim() != 2) {
+    *data = data->ReshapeMKLDNN(Shape2(ishape[0], ishape.ProdShape(1,
+                                                              ishape.ndim())));
+    mkldnn::memory::dims out_dims{static_cast<int>(oshape[0]),
+                                  static_cast<int>(oshape.ProdShape(1,
+                                                             oshape.ndim()))};
+    out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(output.dtype()),
+                                  mkldnn::memory::format::any);
+  }
+
+  auto data_md = GetMemDesc(*data);
+  auto weight_md = GetMemDesc(weights);
+  auto engine = CpuEngine::Get()->get_engine();
+  if (bias) {
+    auto bias_md = GetMemDesc(*bias);
+    mkldnn::inner_product_forward::desc
+                 ipFwd_desc(mkldnn::prop_kind::forward_training,
+                            data_md, weight_md, bias_md, out_md);
+    this->fwd_pd.reset(new mkldnn::inner_product_forward::primitive_desc(
+                                            ipFwd_desc, engine));
+  } else {
+    mkldnn::inner_product_forward::desc
+                 ipFwd_desc(mkldnn::prop_kind::forward_training,
+                            data_md, weight_md, out_md);
+    this->fwd_pd.reset(new mkldnn::inner_product_forward::primitive_desc(
+                                            ipFwd_desc, engine));
+  }
+
+  this->data.reset(new mkldnn::memory(this->fwd_pd->src_primitive_desc()));
+  this->weights.reset(new mkldnn::memory(
+                                    this->fwd_pd->weights_primitive_desc()));
+  this->out.reset(new mkldnn::memory(this->fwd_pd->dst_primitive_desc()));
+  if (param.no_bias) {
+    this->fwd.reset(new mkldnn::inner_product_forward(
+        *(this->fwd_pd), *(this->data), *(this->weights), *(this->out)));
+  } else {
+    this->bias.reset(new mkldnn::memory(this->fwd_pd->bias_primitive_desc()));
+    this->fwd.reset(new mkldnn::inner_product_forward(*(this->fwd_pd),
+        *(this->data), *(this->weights), *(this->bias), *(this->out)));
+  }
+}
+
+void MKLDNNFullyConnectedFwd::SetDataHandle(const FullyConnectedParam &param,
+                                            NDArray *data,
+                                            const NDArray &weights,
+                                            const NDArray *bias,
+                                            const NDArray &output,
+                                            const OpReqType &req_out) {
+  const TShape& ishape = data->shape();
+  if (data->shape().ndim() != 2 && !param.flatten) {
+    *data = data->ReshapeMKLDNN(Shape2(ishape.ProdShape(0, ishape.ndim()-1),
+                                     ishape[ishape.ndim()-1]));
+  } else if (data->shape().ndim() != 2) {
+    *data = data->ReshapeMKLDNN(Shape2(ishape[0], ishape.ProdShape(1,
+                                                              ishape.ndim())));
+  }
+  auto data_mem = data->GetMKLDNNDataReorder(this->fwd_pd->src_primitive_desc());
+  auto weight_mem = weights.GetMKLDNNDataReorder(
+                                      this->fwd_pd->weights_primitive_desc());
+  auto out = CreateMKLDNNMem(output,
+                             this->fwd_pd->dst_primitive_desc(), req_out);
+
+  this->data->set_data_handle(data_mem->get_data_handle());
+  this->weights->set_data_handle(weight_mem->get_data_handle());
+  if (bias) {
+    auto bias_mem = bias->GetMKLDNNDataReorder(
+                        this->fwd_pd->bias_primitive_desc());
+    this->bias->set_data_handle(bias_mem->get_data_handle());
+  }
+  this->out->set_data_handle(out.second->get_data_handle());
+  this->data_op = out.first;
+}
+
+void MKLDNNFullyConnectedFwd::Execute(const NDArray &output) {
+  MKLDNNStream::Get()->RegisterPrim(*(this->fwd));
+  CommitOutput(output, mkldnn_output_t(this->data_op, this->out.get()));
+  MKLDNNStream::Get()->Submit();
+}
+
+static MKLDNNFullyConnectedFwd
+&GetFullyConnectedFwd(const FullyConnectedParam &param, const OpContext &ctx,
+                      NDArray *data, const NDArray &weights,
+                      const NDArray *bias, const NDArray &output,
+                      const OpReqType &req_out) {
+  static thread_local std::unordered_map<MKLDNNFullyConnectedSignature,
+                                         MKLDNNFullyConnectedFwd,
+                                         MKLDNNOpHash> fc_fwds;
+  MKLDNNFullyConnectedSignature key(param);
+  key.AddSign(ctx.is_train);
+  key.AddSign(req_out);
+  key.AddSign(*data);
+  key.AddSign(weights);
+  key.AddSign(output);
+  if (bias) {
+    key.AddSign(*bias);
+  }
+
+  auto it = fc_fwds.find(key);
+  if (it == fc_fwds.end()) {
+    MKLDNNFullyConnectedFwd fwd(param, data, weights, bias, output, req_out);
+    auto ins_ret = fc_fwds.insert(std::pair<MKLDNNFullyConnectedSignature,
+                                            MKLDNNFullyConnectedFwd>(key, fwd));
+    CHECK(ins_ret.second);
+    it = ins_ret.first;
+  }
+  return it->second;
+}
+
+void MKLDNNFullyConnectedCompute(const nnvm::NodeAttrs& attrs,
+                                 const OpContext &ctx,
+                                 const std::vector<NDArray> &in_data,
+                                 const std::vector<OpReqType> &req,
+                                 const std::vector<NDArray> &out_data) {
+  TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]);
+  const FullyConnectedParam &param
+                    = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  auto data = in_data[fullc::kData];
+  auto weights = in_data[fullc::kWeight];
+  auto output = out_data[fullc::kOut];
+  OpReqType req_out = req[fullc::kOut];
+  MKLDNNFullyConnectedFwd &fwd = GetFullyConnectedFwd(param, ctx, &data,
+          weights, param.no_bias ? nullptr : &in_data[fullc::kBias],
+          output, req_out);
+  fwd.SetDataHandle(param, &data, weights,
+                    param.no_bias ? nullptr : &in_data[fullc::kBias],
+                    output, req_out);
+  fwd.Execute(output);
+}
+
+void MKLDNNFullyConnectedGradCompute(const nnvm::NodeAttrs& attrs,
+                                     const OpContext &ctx,
+                                     const std::vector<NDArray> &inputs,
+                                     const std::vector<OpReqType> &req,
+                                     const std::vector<NDArray> &outputs) {
+  TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]);
+  const std::vector<NDArray> &in_grad = outputs;
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  const TShape& ishape = inputs[fullc::kData + 1].shape();
+  const TShape& oshape = inputs[fullc::kOut].shape();
+
+  NDArray weight = inputs[fullc::kWeight + 1];
+  NDArray data = inputs[fullc::kData + 1];
+  if (data.shape().ndim() != 2 && !param.flatten)
+    data = data.ReshapeMKLDNN(Shape2(ishape.ProdShape(0, ishape.ndim()-1),
+                                     ishape[ishape.ndim()-1]));
+  else if (data.shape().ndim() != 2)
+    data = data.ReshapeMKLDNN(Shape2(ishape[0],
+                                     ishape.ProdShape(1, ishape.ndim())));
+  NDArray out_grad = inputs[fullc::kOut];
+  if (out_grad.shape().ndim() != 2 && !param.flatten)
+    out_grad = out_grad.ReshapeMKLDNN(Shape2(oshape.ProdShape(0, oshape.ndim()-1),
+                                             oshape[oshape.ndim()-1]));
+  else if (out_grad.shape().ndim() != 2)
+    out_grad = out_grad.ReshapeMKLDNN(Shape2(oshape[0],
+                                             oshape.ProdShape(1, oshape.ndim())));
+
+  mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight,
+      param.no_bias ? nullptr : &in_grad[fullc::kBias], GetMemDesc(out_grad));
+
+  CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace";
+  if (req[fullc::kData]) {
+    mkldnn::inner_product_backward_data::primitive_desc ipBwdData_pd = GetIpBwdData(
+        data, weight, out_grad, ipFwd_pd);
+    auto out_grad_mem = out_grad.GetMKLDNNDataReorder(
+        ipBwdData_pd.diff_dst_primitive_desc());
+    auto weight_mem = weight.GetMKLDNNDataReorder(ipBwdData_pd.weights_primitive_desc());
+    auto in_grad_mem = CreateMKLDNNMem(in_grad[fullc::kData],
+                                       ipBwdData_pd.diff_src_primitive_desc(),
+                                       req[fullc::kData]);
+    MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_data(
+          ipBwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem.second));
+    CommitOutput(in_grad[fullc::kData], in_grad_mem);
+  }
+  if (req[fullc::kWeight]) {
+    mkldnn::inner_product_backward_weights::primitive_desc ipBwdWeights_pd
+      = GetIPBwdWeights(data, weight, param.no_bias ? nullptr : &in_grad[fullc::kBias],
+          out_grad, ipFwd_pd);
+    auto out_grad_mem = out_grad.GetMKLDNNDataReorder(
+        ipBwdWeights_pd.diff_dst_primitive_desc());
+    auto data_mem = data.GetMKLDNNDataReorder(ipBwdWeights_pd.src_primitive_desc());
+    auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[fullc::kWeight],
+                                                 ipBwdWeights_pd.diff_weights_primitive_desc(),
+                                                 req[fullc::kWeight]);
+    mkldnn_output_t in_grad_bias;
+    if (param.no_bias) {
+      MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_weights(
+            ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second));
+    } else {
+      in_grad_bias = CreateMKLDNNMem(in_grad[fullc::kBias],
+                                     ipBwdWeights_pd.diff_bias_primitive_desc(),
+                                     req[fullc::kBias]);
+      MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_weights(
+            ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second,
+            *in_grad_bias.second));
+    }
+    CommitOutput(in_grad[fullc::kWeight], in_grad_weight);
+    CommitOutput(in_grad[fullc::kBias], in_grad_bias);
+  }
+  MKLDNNStream::Get()->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/nn/mkldnn/mkldnn_lrn-inl.h b/src/operator/nn/mkldnn/mkldnn_lrn-inl.h
new file mode 100644
index 000000000000..e0ecc1873d96
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_lrn-inl.h
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_lrn-inl.h 
+ * \brief
+ * \Author: Patric Zhao, patric.zhao@intel.com
+*/
+#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_
+#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+#include <mkldnn.hpp>
+#include "../lrn-inl.h"
+#include "./mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+static inline algorithm GetMKLDNNLRNAlgo(const LRNParam &param) {
+  // TODO(Patric): lrn_within_channel will cause core dump in MKLDNN backward
+  //               Need to confirm with MKLDNN team and fix later
+  return algorithm::lrn_across_channels;
+}
+
+inline static lrn_forward::primitive_desc GetLRNFwd(
+    const LRNParam &param, bool is_train, const memory::desc &src_md) {
+  auto engine = CpuEngine::Get()->get_engine();
+  auto alg_ = GetMKLDNNLRNAlgo(param);
+  auto alpha_ = param.alpha;
+  auto beta_ = param.beta;
+  auto nsize_ = param.nsize;
+  auto k_ = param.knorm;
+  auto kind_ = prop_kind::forward_training;
+  if (is_train) {
+    kind_ = prop_kind::forward_training;
+  } else {
+    kind_ = prop_kind::forward_scoring;
+  }
+  lrn_forward::desc fwd_desc_(kind_, alg_, src_md, nsize_, alpha_, beta_, k_);
+  return mkldnn::lrn_forward::primitive_desc(fwd_desc_, engine);
+}
+
+inline static mkldnn::lrn_backward::primitive_desc GetLRNBwd(
+    const LRNParam &param, const mkldnn::memory::desc &diff_in_md,
+    const mkldnn::memory::desc &diff_md,
+    const lrn_forward::primitive_desc &lrnFwd_desc) {
+  auto engine = CpuEngine::Get()->get_engine();
+  auto alg_ = GetMKLDNNLRNAlgo(param);
+  auto alpha_ = param.alpha;
+  auto  beta_ = param.beta;
+  int nsize_ = param.nsize;
+  auto k_ = param.knorm;
+
+  lrn_backward::desc lrnBwd_desc(alg_, diff_in_md,
+                diff_md, nsize_, alpha_, beta_, k_);
+  return mkldnn::lrn_backward::primitive_desc(lrnBwd_desc,
+                               engine, lrnFwd_desc);
+}
+
+void MKLDNNLRN_Forward(const OpContext &ctx, const LRNParam &param,
+                       const NDArray &in_data, const OpReqType &req,
+                       const NDArray &out_data) {
+  auto src_mem = in_data.GetMKLDNNData();
+  auto src_md =  src_mem->get_primitive_desc().desc();
+  auto pdesc = GetLRNFwd(param, ctx.is_train, src_md);
+  auto dst_mem =  const_cast<NDArray &>(out_data).CreateMKLDNNData(
+          pdesc.dst_primitive_desc());
+  if (ctx.is_train) {
+    std::shared_ptr<const mkldnn::memory> ws_mem(
+            new mkldnn::memory(pdesc.workspace_primitive_desc()));
+    MKLDNNStream::Get()->RegisterPrim(
+        lrn_forward(pdesc, mkldnn::primitive::at(*src_mem),
+        *ws_mem, *dst_mem));
+    MKLDNNStream::Get()->Submit();
+  } else {
+    MKLDNNStream::Get()->RegisterPrim(
+        lrn_forward(pdesc, mkldnn::primitive::at(*src_mem), *dst_mem));
+    MKLDNNStream::Get()->Submit();
+  }
+}
+
+void MKLDNNLRN_Backward(const OpContext &ctx, const LRNParam &param,
+                            const NDArray &out_grad,
+                            const NDArray &in_data,
+                            const OpReqType &req,
+                            const NDArray &in_grad) {
+  if (req == kNullOp) {
+    return;
+  }
+  // Repeat FW for getting workspace
+  auto data_mem = in_data.GetMKLDNNData();
+  auto data_md = data_mem->get_primitive_desc().desc();
+  auto pdesc_fwd = GetLRNFwd(param, ctx.is_train, data_md);
+
+  // TODO(Patric): To keep the function stateless, we can't pass workspace
+  //               from LRN forward to backward. We have to re-compute
+  //               LRN forward to get the workspace.
+  //               Will refine this code later.
+  std::shared_ptr<const mkldnn::memory> ws_mem(
+          new mkldnn::memory(pdesc_fwd.workspace_primitive_desc()));
+  std::shared_ptr<const mkldnn::memory> dst_temp(
+          new mkldnn::memory(pdesc_fwd.dst_primitive_desc()));
+  MKLDNNStream::Get()->RegisterPrim(
+          lrn_forward(pdesc_fwd, mkldnn::primitive::at(*data_mem),
+          *ws_mem, *dst_temp));
+
+  auto data_in_md = pdesc_fwd.src_primitive_desc().desc();
+  auto diff_mem = out_grad.GetMKLDNNData();
+  auto diff_md = diff_mem->get_primitive_desc().desc();
+  auto pdesc_bwd = GetLRNBwd(param, data_in_md, diff_md, pdesc_fwd);
+  auto diff_src_mem = CreateMKLDNNMem(in_grad,
+          pdesc_bwd.diff_src_primitive_desc(), req);
+
+  MKLDNNStream::Get()->RegisterPrim(
+        lrn_backward(pdesc_bwd, mkldnn::primitive::at(*data_mem),
+        mkldnn::primitive::at(*diff_mem), *ws_mem, *diff_src_mem.second));
+  MKLDNNStream::Get()->Submit();
+}
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H__
diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
new file mode 100644
index 000000000000..a54fceddbc9e
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_ops-inl.h
+ * \brief
+ * \author Da Zheng
+*/
+
+#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_
+#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+
+#include <mxnet/io.h>
+#include <mxnet/base.h>
+#include <mxnet/ndarray.h>
+#include <mxnet/operator.h>
+#include <mxnet/operator_util.h>
+#include <dmlc/logging.h>
+#include <dmlc/optional.h>
+#include <vector>
+#include <mkldnn.hpp>
+
+namespace mxnet {
+namespace op {
+
+/* For fully connected. */
+void MKLDNNFullyConnectedCompute(
+                     const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                     const std::vector<NDArray> &in_data,
+                     const std::vector<OpReqType> &req,
+                     const std::vector<NDArray> &out_data);
+void MKLDNNFullyConnectedGradCompute(
+                      const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                      const std::vector<NDArray> &inputs,
+                      const std::vector<OpReqType> &req,
+                      const std::vector<NDArray> &outputs);
+
+/* For convolution. */
+void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                              const std::vector<NDArray> &in_data,
+                              const std::vector<OpReqType> &req,
+                              const std::vector<NDArray> &out_data);
+void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                               const std::vector<NDArray>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<NDArray>& outputs);
+
+/* For deconvolution */
+void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                                const std::vector<NDArray> &in_data,
+                                const std::vector<OpReqType> &req,
+                                const std::vector<NDArray> &out_data);
+void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                                 const std::vector<NDArray>& inputs,
+                                 const std::vector<OpReqType>& req,
+                                 const std::vector<NDArray>& outputs);
+
+/* For softmax */
+void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                          const NDArray &in_data, const OpReqType &req,
+                          const NDArray &out_data);
+
+/* For sum */
+void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                      const std::vector<NDArray> &inputs, const OpReqType &req,
+                      const NDArray &out_data);
+
+/* For copy */
+void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+    const NDArray &in_data, const OpReqType &req,
+    const NDArray &out_data);
+
+/* For concat */
+void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                         const std::vector<NDArray> &in_data,
+                         const std::vector<OpReqType> &req,
+                         const std::vector<NDArray> &out_data);
+void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                          const std::vector<NDArray>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<NDArray>& outputs);
+
+/* For activation */
+void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                             const NDArray &in_data, const OpReqType &req,
+                             const NDArray &out_data);
+void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                              const NDArray &out_grad, const NDArray &in_data,
+                              const OpReqType &req, const NDArray &in_grad);
+
+void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2,
+         const mkldnn::memory &out);
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN == 1
+
+#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
new file mode 100644
index 000000000000..6bae480b8c30
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
@@ -0,0 +1,226 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_pooling.cc
+ * \brief
+*/
+#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_
+#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+
+#include <mkldnn.hpp>
+#include "../pooling-inl.h"
+#include "./mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+static inline bool SupportMKLDNNPooling(const PoolingParam &param) {
+  return param.kernel.ndim() == 2
+      && (param.pool_type == pool_enum::kMaxPooling);
+#if 0
+      // It seems average pooling has precision problems.
+          || param.pool_type == pool_enum::kAvgPooling);
+#endif
+}
+
+static inline bool SupportMKLDNNPooling(const PoolingParam &param,
+                                        const TShape &dshape) {
+  auto ret = SupportMKLDNNPooling(param);
+  if (!ret)
+    return false;
+  if (param.pooling_convention == pool_enum::kValid)
+    return true;
+  if ((dshape[2] + 2 * param.pad[0] - param.kernel[0]) % param.stride[0] == 0
+      && (dshape[3] + 2 * param.pad[1] - param.kernel[1]) % param.stride[1] == 0)
+    return true;
+  else
+    return false;
+}
+
+static inline algorithm GetMKLDNNPoolAlgo(const PoolingParam &param) {
+  switch (param.pool_type) {
+    case pool_enum::kMaxPooling:
+      return algorithm::pooling_max;
+      break;
+    case pool_enum::kAvgPooling:
+      return algorithm::pooling_avg;
+      break;
+    default:
+      LOG(FATAL) << "Unknown pooling method.";
+      return algorithm::pooling_max;
+  }
+}
+
+inline static pooling_forward::primitive_desc GetPoolingFwd(
+    const PoolingParam &param, bool is_train, const memory::desc &data_md,
+    const memory::desc &out_md) {
+  CHECK_EQ(param.kernel.ndim(), 2) << "Not Implemented";
+  int kernel_h_, kernel_w_;
+  if (param.global_pool) {
+    kernel_h_ = data_md.data.dims[2];
+    kernel_w_ = data_md.data.dims[3];
+  } else {
+    kernel_h_ = param.kernel[0];
+    kernel_w_ = param.kernel[1];
+  }
+  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+
+  auto pad_t_ = param.pad[0], pad_b_ = param.pad[0];
+  auto pad_l_ = param.pad[1], pad_r_ = param.pad[1];
+  auto stride_h_ = param.stride[0], stride_w_ = param.stride[1];
+
+  auto engine = CpuEngine::Get()->get_engine();
+  if (param.global_pool) {
+    CHECK(pad_t_ == 0 && pad_l_ == 0 && stride_h_ == 1 && stride_w_ == 1)
+        << "With Global_pooling: true; only pad = 0 and stride = 1";
+  }
+  if (pad_t_ != 0 || pad_l_ != 0) {
+    CHECK(param.pool_type == pool_enum::kAvgPooling ||
+          param.pool_type == pool_enum::kMaxPooling)
+        << "Padding implemented only for average and max pooling.";
+    CHECK_LT(pad_l_, kernel_w_);
+    CHECK_LT(pad_t_, kernel_h_);
+  }
+  auto alg = GetMKLDNNPoolAlgo(param);
+  auto kind = prop_kind::forward_scoring;
+  if (is_train && alg != algorithm::pooling_avg) {
+    kind = prop_kind::forward_training;
+  }
+  pooling_forward::desc poolingFwd_desc(kind, alg, data_md, out_md,
+                                        {static_cast<int>(stride_h_),
+                                        static_cast<int>(stride_w_)},
+                                        {kernel_h_, kernel_w_},
+                                        {static_cast<int>(pad_t_),
+                                        static_cast<int>(pad_l_)},
+                                        {static_cast<int>(pad_b_),
+                                        static_cast<int>(pad_r_)},
+                                        padding_kind::zero);
+  return mkldnn::pooling_forward::primitive_desc(poolingFwd_desc, engine);
+}
+
+inline bool MKLDNNRequireWorkspace(const PoolingParam &param) {
+  return param.pool_type != pool_enum::kAvgPooling;
+}
+
+void MKLDNNPoolingForward(const OpContext &ctx, const PoolingParam &param,
+                          const NDArray &in_data, const OpReqType &req,
+                          const NDArray &out_data, const NDArray *workspace) {
+  auto input_mem = in_data.GetMKLDNNData();
+  auto data_mpd = input_mem->get_primitive_desc();
+  auto data_md = data_mpd.desc();
+
+  memory::dims dims = {data_md.data.dims[0], data_md.data.dims[1],
+                       static_cast<int>(out_data.shape()[2]),
+                       static_cast<int>(out_data.shape()[3])};
+  memory::desc out_md({dims},
+                      static_cast<memory::data_type>(data_md.data.data_type),
+                      static_cast<memory::format>(data_md.data.format));
+
+  auto pdesc = GetPoolingFwd(param, ctx.is_train, data_md, out_md);
+
+  auto output_memory = const_cast<NDArray &>(out_data).CreateMKLDNNData(
+      pdesc.dst_primitive_desc());
+
+  if (ctx.is_train && MKLDNNRequireWorkspace(param)) {
+    CHECK(workspace != nullptr);
+    auto workspace_mem = workspace->GetMKLDNNData();
+    MKLDNNStream::Get()->RegisterPrim(
+        pooling_forward(pdesc, *input_mem, *output_memory, *workspace_mem));
+  } else {
+    MKLDNNStream::Get()->RegisterPrim(
+        pooling_forward(pdesc, *input_mem, *output_memory));
+  }
+  MKLDNNStream::Get()->Submit();
+}
+
+void MKLDNNPoolingBackward(const OpContext &ctx, const PoolingParam &param,
+                           const NDArray &out_grad, const NDArray &in_data,
+                           const NDArray *workspace, const OpReqType &req,
+                           const NDArray &in_grad) {
+  if (req == kNullOp) {
+    return;
+  }
+
+  TmpMemMgr::Get()->Init(ctx.requested[0]);
+  auto diff_dst_mem = out_grad.GetMKLDNNData();
+  auto input_mem = in_data.GetMKLDNNData();
+  mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc();
+  mkldnn::memory::desc data_md = data_mpd.desc();
+  memory::dims dims = {data_md.data.dims[0], data_md.data.dims[1],
+                       static_cast<int>(out_grad.shape()[2]),
+                       static_cast<int>(out_grad.shape()[3])};
+  memory::desc out_md({dims},
+                      static_cast<memory::data_type>(data_md.data.data_type),
+                      static_cast<memory::format>(data_md.data.format));
+  auto pdesc_fwd = GetPoolingFwd(param, ctx.is_train, data_md, out_md);
+
+  mkldnn::memory::desc diff_md = diff_dst_mem->get_primitive_desc().desc();
+  memory::dims dims1 = {diff_md.data.dims[0], diff_md.data.dims[1],
+                        static_cast<int>(in_grad.shape()[2]),
+                        static_cast<int>(in_grad.shape()[3])};
+  memory::desc diff_in_md(
+      {dims1}, static_cast<memory::data_type>(diff_md.data.data_type),
+      static_cast<memory::format>(diff_md.data.format));
+  auto cpu_engine = data_mpd.get_engine();
+
+  auto alg = GetMKLDNNPoolAlgo(param);
+
+  int kernel_h_, kernel_w_;
+  if (param.global_pool) {
+    kernel_h_ = data_md.data.dims[2];
+    kernel_w_ = data_md.data.dims[3];
+  } else {
+    kernel_h_ = param.kernel[0];
+    kernel_w_ = param.kernel[1];
+  }
+  pooling_backward::desc desc(alg, diff_in_md, diff_md,
+                              {static_cast<int>(param.stride[0]),
+                              static_cast<int>(param.stride[1])},
+                              {kernel_h_, kernel_w_},
+                              {static_cast<int>(param.pad[0]),
+                              static_cast<int>(param.pad[1])},
+                              {static_cast<int>(param.pad[0]),
+                              static_cast<int>(param.pad[1])},
+                              padding_kind::zero);
+  pooling_backward::primitive_desc pdesc(desc, cpu_engine, pdesc_fwd);
+
+  auto diff_src_mem =
+      CreateMKLDNNMem(in_grad, pdesc.diff_src_primitive_desc(), req);
+
+  if (MKLDNNRequireWorkspace(param)) {
+    CHECK(workspace != nullptr);
+    auto workspace_mem = workspace->GetMKLDNNData();
+    MKLDNNStream::Get()->RegisterPrim(
+        pooling_backward(pdesc, *diff_dst_mem, primitive::at(*workspace_mem),
+                         *diff_src_mem.second));
+  } else {
+    MKLDNNStream::Get()->RegisterPrim(
+        pooling_backward(pdesc, *diff_dst_mem, *diff_src_mem.second));
+  }
+  CommitOutput(in_grad, diff_src_mem);
+  MKLDNNStream::Get()->Submit();
+}
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_softmax.cc b/src/operator/nn/mkldnn/mkldnn_softmax.cc
new file mode 100644
index 000000000000..aa59f13d06da
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_softmax.cc
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_softmax.cc
+ * \brief
+ * \author Da Zheng
+*/
+
+#include "../softmax-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                          const NDArray &in_data, const OpReqType &req,
+                          const NDArray &out_data) {
+  const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
+  auto input_mem = in_data.GetMKLDNNData();
+  mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc();
+  mkldnn::memory::desc data_md = data_mpd.desc();
+  auto cpu_engine = data_mpd.get_engine();
+  auto prop = ctx.is_train
+    ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
+  mkldnn::softmax_forward::desc desc = mkldnn::softmax_forward::desc(prop,
+      data_md, param.axis);
+  mkldnn::softmax_forward::primitive_desc pdesc(desc, cpu_engine);
+
+  auto output_memory = out_data.GetMKLDNNData();
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  stream->RegisterPrim(mkldnn::softmax_forward(pdesc, *input_mem, *output_memory));
+  stream->Submit();
+}
+
+}   // namespace op
+}   // namespace mxnet
+#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc
new file mode 100644
index 000000000000..1efc285b808f
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_sum.cc
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_sum.cc
+ * \brief
+ * \author Da Zheng
+*/
+#include <iostream>
+
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2,
+         const mkldnn::memory &out) {
+  std::vector<mkldnn::memory::primitive_desc> input_pds(2);
+  std::vector<float> scales(2);
+  std::vector<mkldnn::primitive::at> inputs;
+  input_pds[0] = arr1.get_primitive_desc();
+  input_pds[1] = arr2.get_primitive_desc();
+  CHECK(input_pds[0] == input_pds[1]);
+  scales[0] = 1;
+  scales[1] = 1;
+  inputs.push_back(arr1);
+  inputs.push_back(arr2);
+  // TODO(zhengda) I need to reorder memory here.
+  mkldnn::sum::primitive_desc sum_pd(scales, input_pds);
+  MKLDNNStream::Get()->RegisterPrim(mkldnn::sum(sum_pd, inputs, out));
+}
+
+void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                      const std::vector<NDArray> &inputs, const OpReqType &req,
+                      const NDArray &out_data) {
+  TmpMemMgr::Get()->Init(ctx.requested[0]);
+  std::vector<mkldnn::primitive::at> in_prims;
+  std::vector<mkldnn::memory::primitive_desc> in_pds(inputs.size());
+  std::vector<float> scales(inputs.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    auto in_mem = inputs[i].GetMKLDNNData();
+    in_prims.push_back(*in_mem);
+    in_pds[i] = in_mem->get_primitive_desc();
+    scales[i] = 1;
+  }
+  mkldnn::sum::primitive_desc pdesc(scales, in_pds);
+
+  auto out_mem = CreateMKLDNNMem(out_data, pdesc.dst_primitive_desc(), req);
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *out_mem.second));
+  CommitOutput(out_data, out_mem);
+  stream->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif
diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h
index a32aaa2152e9..de98bc4981af 100644
--- a/src/operator/nn/pooling-inl.h
+++ b/src/operator/nn/pooling-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2017 by Contributors
  * \file pooling-inl.h
  * \brief
- * \author Bing Xu, Jun Wu
+ * \author Bing Xu, Jun Wu, Da Zheng
 */
 
 #ifndef MXNET_OPERATOR_NN_POOLING_INL_H_
@@ -80,255 +80,103 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
   }
 };
 
-template<typename xpu, typename DType>
-class PoolingOp : public Operator {
- public:
-  explicit PoolingOp(PoolingParam p) {
-    this->param_ = p;
-  }
-
-  virtual void Forward(const OpContext& ctx,
-                       const std::vector<TBlob>& in_data,
-                       const std::vector<OpReqType>& req,
-                       const std::vector<TBlob>& out_data,
-                       const std::vector<TBlob>& aux_args) {
-    using namespace mshadow;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    const TShape& ishape = in_data[pool_enum::kData].shape_;
-
-    pool(s, in_data[pool_enum::kData].dptr<DType>(),
-         in_data[pool_enum::kData].shape_,
-         out_data[pool_enum::kOut].shape_,
-         param_.global_pool?
-           TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim())
-           : param_.kernel,
-         param_.pad,
-         param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride,
-         param_.pool_type,
-         req[pool_enum::kOut],
-         out_data[pool_enum::kOut].dptr<DType>());
-  }
-
-  virtual void Backward(const OpContext& ctx,
-                        const std::vector<TBlob>& out_grad,
-                        const std::vector<TBlob>& in_data,
-                        const std::vector<TBlob>& out_data,
-                        const std::vector<OpReqType>& req,
-                        const std::vector<TBlob>& in_grad,
-                        const std::vector<TBlob>& aux_args) {
-    using namespace mshadow;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    CHECK_EQ(req.size(), 1U);
-    CHECK_EQ(in_grad.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    const TShape& ishape = in_data[pool_enum::kData].shape_;
+/*
+ * When MKLDNN is enabled, we might want 2 outputs instead of one inputs, which
+ * also changes the number of inputs for backward.
+ */
+int GetNumOutputs(const PoolingParam &param);
+int GetNumBackInputs(const PoolingParam &param);
 
-    unpool(s, out_grad[pool_enum::kOut].dptr<DType>(),
-           in_data[pool_enum::kData].dptr<DType>(),
-           out_data[pool_enum::kOut].dptr<DType>(),
-           in_grad[pool_enum::kData].shape_,
-           out_grad[pool_enum::kOut].shape_,
-           param_.global_pool?
-             TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim())
-             : param_.kernel,
-           param_.pad,
-           param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride,
-           param_.pool_type,
-           req[pool_enum::kData],
-           in_grad[pool_enum::kData].dptr<DType>());
-  }
+template<typename xpu, typename DType>
+void PoolingForward(const OpContext& ctx, const PoolingParam &param,
+                    const TBlob& in_data, const OpReqType& req,
+                    const TBlob& out_data) {
+  using namespace mshadow;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const TShape& ishape = in_data.shape_;
+
+  pool(s, in_data.dptr<DType>(), in_data.shape_, out_data.shape_,
+       param.global_pool?
+       TShape(ishape.data()+ishape.ndim()-param.kernel.ndim(), ishape.data()+ishape.ndim())
+       : param.kernel,
+       param.pad,
+       param.global_pool? TShape(param.kernel.ndim()) : param.stride,
+       param.pool_type, req, out_data.dptr<DType>());
+}
 
- private:
-  PoolingParam param_;
-};  // class PoolingOp
+template<typename xpu, typename DType>
+void PoolingBackward(const OpContext& ctx, const PoolingParam &param,
+                     const TBlob& out_grad, const TBlob& in_data,
+                     const TBlob& out_data, const OpReqType& req,
+                     const TBlob& in_grad) {
+  using namespace mshadow;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const TShape& ishape = in_data.shape_;
+
+  unpool(s, out_grad.dptr<DType>(), in_data.dptr<DType>(), out_data.dptr<DType>(),
+         in_grad.shape_, out_grad.shape_,
+         param.global_pool?
+         TShape(ishape.data()+ishape.ndim()-param.kernel.ndim(), ishape.data()+ishape.ndim())
+         : param.kernel,
+         param.pad,
+         param.global_pool? TShape(param.kernel.ndim()) : param.stride,
+         param.pool_type, req, in_grad.dptr<DType>());
+}
 
 template<typename xpu>
-Operator* CreateOp(PoolingParam param, int dtype);
-
-
-#if DMLC_USE_CXX11
-class PoolingProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    using namespace mshadow;
-    param_.Init(kwargs);
-    if (param_.kernel.ndim() == 1) {
-      if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
-    } else if (param_.kernel.ndim() == 2) {
-      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
+void PoolingCompute(const nnvm::NodeAttrs& attrs,
+                    const OpContext& ctx,
+                    const std::vector<TBlob>& inputs,
+                    const std::vector<OpReqType>& req,
+                    const std::vector<TBlob>& outputs) {
+  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), GetNumOutputs(param));
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+    if (pool_enum::kMaxPooling == param.pool_type
+        || pool_enum::kAvgPooling == param.pool_type
+        || pool_enum::kSumPooling == param.pool_type) {
+      PoolingForward<xpu, DType>(ctx, param, inputs[0], req[0], outputs[0]);
     } else {
-      CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D pooling not supported";
-      if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
+      LOG(FATAL) << "unknown pooling type";
     }
-    CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim())
-      << "stride and kernel should have the same length";
-    CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim())
-      << "pad and kernel should have the same length";
-  }
+  });
+}
 
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    CHECK_EQ(in_shape->size(), 1U);
-    const TShape &dshape = (*in_shape)[0];
-    CHECK_GE(dshape.ndim(), 3U) << "Pooling: Input data should be  3D in (batch, channel, x)"
-                                << " Or 4D in (batch, channel, y, x) "
-                                << " Or 5D in (batch, channel, d, y, x)";
-    TShape oshape = dshape;
-    if (dshape.ndim() ==  0) return false;
-    if (param_.kernel.ndim() == 1) {
-      CHECK_EQ(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)";
-      if (param_.global_pool) {
-        oshape[2] = 1;
-      } else {
-        CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0])
-            << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2]
-            << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")";
-        if (param_.pooling_convention == pool_enum::kValid) {
-          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
-                              param_.stride[0];
-        } else {
-          oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
-                              dshape[2] + 2 * param_.pad[0] -
-                              param_.kernel[0]) / param_.stride[0]));
-        }
-      }
-      out_shape->clear();
-      out_shape->push_back(oshape);  // save output shape
-    } else if (param_.kernel.ndim() == 2) {
-      CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)";
-      if (param_.global_pool) {
-        oshape[2] = 1;
-        oshape[3] = 1;
-      } else {
-        CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0])
-            << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2]
-            << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")";
-        CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1])
-            << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3]
-            << " padded to " << (dshape[3] + 2*param_.pad[1]) << ")";
-        if (param_.pooling_convention == pool_enum::kValid) {
-          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
-                              param_.stride[0];
-          oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) /
-                              param_.stride[1];
-        } else {
-          oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
-                              dshape[2] + 2 * param_.pad[0] -
-                              param_.kernel[0]) / param_.stride[0]));
-          oshape[3] = 1 + static_cast<int>(ceil(static_cast<float>(
-                              dshape[3] + 2 * param_.pad[1] -
-                              param_.kernel[1]) / param_.stride[1]));
-        }
-      }
-      out_shape->clear();
-      out_shape->push_back(oshape);  // save output shape
-    } else if (param_.kernel.ndim() == 3) {
-      CHECK_EQ(dshape.ndim(), 5U)
-        << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
-      CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input";
-      CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input";
-      CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input";
-      if (param_.global_pool) {
-        oshape[2] = 1;
-        oshape[3] = 1;
-        oshape[4] = 1;
-      } else {
-        if (param_.pooling_convention == pool_enum::kValid) {
-          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
-                              param_.stride[0];
-          oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) /
-                              param_.stride[1];
-          oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) /
-                              param_.stride[2];
-        } else {
-          oshape[2] = 1 + static_cast<int>(ceil(static_cast<float>(
-                              dshape[2] + 2 * param_.pad[0] -
-                              param_.kernel[0]) / param_.stride[0]));
-          oshape[3] = 1 + static_cast<int>(ceil(static_cast<float>(
-                              dshape[3] + 2 * param_.pad[1] -
-                              param_.kernel[1]) / param_.stride[1]));
-          oshape[4] = 1 + static_cast<int>(ceil(static_cast<float>(
-                              dshape[4] + 2 * param_.pad[2] -
-                              param_.kernel[2]) / param_.stride[2]));
-        }
-      }
-
-      out_shape->clear();
-      out_shape->push_back(oshape);  // save output shape
-    }
-    return true;
+template<typename xpu>
+void PoolingGradCompute(const nnvm::NodeAttrs& attrs,
+                        const OpContext& ctx,
+                        const std::vector<TBlob>& inputs,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<TBlob>& outputs) {
+  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), GetNumBackInputs(param));
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  off_t ograd_idx, in_data_idx, out_data_idx;
+  // When MKLDNN is enabled, the input data may contains arrays for workspace.
+  if (GetNumBackInputs(param) == 5) {
+    ograd_idx = 0;
+    in_data_idx = 2;
+    out_data_idx = 3;
+  } else {
+    ograd_idx = 0;
+    in_data_idx = 1;
+    out_data_idx = 2;
   }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_EQ(in_type->size(), 1U);
-    int dtype = (*in_type)[0];
-
-    if (dtype == -1) {
-      LOG(FATAL) << "Input type to pooling is not specified.";
-      return false;
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+    if (pool_enum::kMaxPooling == param.pool_type
+        || pool_enum::kAvgPooling == param.pool_type
+        || pool_enum::kSumPooling == param.pool_type) {
+      PoolingBackward<xpu, DType>(ctx, param, inputs[ograd_idx],
+                                  inputs[in_data_idx], inputs[out_data_idx],
+                                  req[0], outputs[0]);
+    } else {
+      LOG(FATAL) << "unknown pooling type";
     }
+  });
+}
 
-    out_type->clear();
-    out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    PoolingProp *prop_sym = new PoolingProp();
-    prop_sym->param_ = this->param_;
-    return prop_sym;
-  }
-
-  std::string TypeString() const override {
-    return "Pooling";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[pool_enum::kOut], in_data[pool_enum::kData],
-            out_data[pool_enum::kOut]};
-  }
-
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-#if MXNET_USE_CUDNN == 1
-    return {};
-#else
-    return {{in_data[pool_enum::kData], in_grad[pool_enum::kData]}};
-#endif
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return NULL;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
-
- private:
-  PoolingParam param_;
-};  // class PoolingProp
-#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 
diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc
index 8345ea3886d4..f757473f3c96 100644
--- a/src/operator/nn/pooling.cc
+++ b/src/operator/nn/pooling.cc
@@ -21,8 +21,9 @@
  * Copyright (c) 2017 by Contributors
  * \file pooling.cc
  * \brief
- * \author Bing Xu, Jun Wu
+ * \author Bing Xu, Jun Wu, Da Zheng
 */
+#include "../elemwise_op_common.h"
 #include "./pooling-inl.h"
 #if MXNET_USE_MKL2017 == 1
 #include <mkl_memory.h>
@@ -32,67 +33,317 @@
 #if MXNET_USE_NNPACK == 1
 #include "./nnpack/nnpack_pooling-inl.h"
 #endif  // MXNET_USE_NNPACK
+#if MXNET_USE_MKLDNN == 1
+#include "./mkldnn/mkldnn_pooling-inl.h"
+#endif  // MXNET_USE_MKLDNN
 
 namespace mxnet {
 namespace op {
 
-template<>
-Operator *CreateOp<cpu>(PoolingParam param, int dtype) {
-  Operator *op = NULL;
-#if MXNET_USE_MKL2017 == 1
-    if (param.kernel.ndim() == 2
-      && ((param.pool_type == pool_enum::kMaxPooling)
-      || (param.pool_type == pool_enum::kAvgPooling))) {
-      switch (dtype) {
-      case mshadow::kFloat32:
-        return new MKLPoolingOp<cpu, float>(param);
-      case mshadow::kFloat64:
-        return new MKLPoolingOp<cpu, double>(param);
-      default:
-        break;
+static void PoolingParamParser(nnvm::NodeAttrs *attrs) {
+  using namespace mshadow;
+  PoolingParam param_;
+  param_.Init(attrs->dict);
+  if (param_.kernel.ndim() == 1) {
+    if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
+    if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
+  } else if (param_.kernel.ndim() == 2) {
+    if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
+    if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
+  } else {
+    CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim()
+                                       << "D pooling not supported";
+    if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
+    if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
+  }
+  CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim())
+      << "stride and kernel should have the same length";
+  CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim())
+      << "pad and kernel should have the same length";
+  attrs->parsed = std::move(param_);
+}
+
+int GetNumOutputs(const PoolingParam &param) {
+#if MXNET_USE_MKLDNN == 1
+  return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2 : 1;
+#else
+  return 1;
+#endif
+}
+
+int GetNumBackInputs(const PoolingParam &param) {
+#if MXNET_USE_MKLDNN == 1
+  return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 5 : 3;
+#else
+  return 3;
+#endif
+}
+
+static bool PoolingType(const nnvm::NodeAttrs& attrs,
+                        std::vector<int> *in_attrs,
+                        std::vector<int> *out_attrs) {
+  out_attrs->at(0) = in_attrs->at(0);
+#if MXNET_USE_MKLDNN == 1
+  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
+  if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) {
+    CHECK_GT(out_attrs->size(), 1U);
+    out_attrs->at(1) = mshadow::kInt32;
+  }
+#endif
+  return true;
+}
+
+static bool PoolingShape(const nnvm::NodeAttrs &attrs,
+                         std::vector<TShape> *in_shape,
+                         std::vector<TShape> *out_shape) {
+  const PoolingParam &param_ = nnvm::get<PoolingParam>(attrs.parsed);
+  CHECK_EQ(in_shape->size(), 1U);
+  const TShape &dshape = (*in_shape)[0];
+  CHECK_GE(dshape.ndim(), 3U)
+      << "Pooling: Input data should be  3D in (batch, channel, x)"
+      << " Or 4D in (batch, channel, y, x) "
+      << " Or 5D in (batch, channel, d, y, x)";
+  TShape oshape = dshape;
+  if (dshape.ndim() == 0) return false;
+  if (param_.kernel.ndim() == 1) {
+    CHECK_EQ(dshape.ndim(), 3U)
+        << "Pooling: Input data should be 3D in (batch, channel, x)";
+    if (param_.global_pool) {
+      oshape[2] = 1;
+    } else {
+      CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0])
+          << "kernel size (" << param_.kernel[0] << ") exceeds input ("
+          << dshape[2] << " padded to " << (dshape[2] + 2 * param_.pad[0])
+          << ")";
+      if (param_.pooling_convention == pool_enum::kValid) {
+        oshape[2] = 1 +
+                    (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
+                        param_.stride[0];
+      } else {
+        oshape[2] = 1 + static_cast<int>(ceil(
+                            static_cast<float>(dshape[2] + 2 * param_.pad[0] -
+                                               param_.kernel[0]) /
+                            param_.stride[0]));
       }
     }
+    out_shape->clear();
+    out_shape->push_back(oshape);  // save output shape
+#if MXNET_USE_MKLDNN == 1
+    if (MKLDNNRequireWorkspace(param_) && SupportMKLDNNPooling(param_))
+      out_shape->push_back(oshape);   // for workspace
 #endif
-#if MXNET_USE_NNPACK == 1
-  // NNPACK only support max-pooling with kernel = 2, stride = 2, pooling_convention
-  // = kFull(note that the default value is kValid in MXNet)
-  if ((param.pool_type == pool_enum::kMaxPooling)
-    && (param.pooling_convention == pool_enum::kFull)
-    && (param.kernel.ndim() == 2) && (param.stride.ndim() == 2)
-    && (param.kernel[0] == 2) && (param.kernel[1] == 2)
-    && (param.stride[0] == 2) && (param.stride[1] == 2)) {
-    switch (dtype) {
-    case mshadow::kFloat32:
-      return new NNPACKPoolingOp<cpu, float>(param);
-    default:
-      break;
+  } else if (param_.kernel.ndim() == 2) {
+    CHECK_EQ(dshape.ndim(), 4U)
+        << "Pooling: Input data should be 4D in (batch, channel, y, x)";
+    if (param_.global_pool) {
+      oshape[2] = 1;
+      oshape[3] = 1;
+    } else {
+      CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0])
+          << "kernel size (" << param_.kernel[0] << ") exceeds input ("
+          << dshape[2] << " padded to " << (dshape[2] + 2 * param_.pad[0])
+          << ")";
+      CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1])
+          << "kernel size (" << param_.kernel[1] << ") exceeds input ("
+          << dshape[3] << " padded to " << (dshape[3] + 2 * param_.pad[1])
+          << ")";
+      if (param_.pooling_convention == pool_enum::kValid) {
+        oshape[2] = 1 +
+                    (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
+                        param_.stride[0];
+        oshape[3] = 1 +
+                    (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) /
+                        param_.stride[1];
+      } else {
+        oshape[2] = 1 + static_cast<int>(ceil(
+                            static_cast<float>(dshape[2] + 2 * param_.pad[0] -
+                                               param_.kernel[0]) /
+                            param_.stride[0]));
+        oshape[3] = 1 + static_cast<int>(ceil(
+                            static_cast<float>(dshape[3] + 2 * param_.pad[1] -
+                                               param_.kernel[1]) /
+                            param_.stride[1]));
+      }
     }
-  }
+    out_shape->clear();
+    out_shape->push_back(oshape);  // save output shape
+#if MXNET_USE_MKLDNN == 1
+    if (MKLDNNRequireWorkspace(param_) && SupportMKLDNNPooling(param_))
+      out_shape->push_back(oshape);   // for workspace
 #endif
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    if (pool_enum::kMaxPooling == param.pool_type
-        || pool_enum::kAvgPooling == param.pool_type
-        || pool_enum::kSumPooling == param.pool_type) {
-      op = new PoolingOp<cpu, DType>(param);
+  } else if (param_.kernel.ndim() == 3) {
+    CHECK_EQ(dshape.ndim(), 5U)
+        << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
+    CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0])
+        << "kernel size exceeds input";
+    CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1])
+        << "kernel size exceeds input";
+    CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2])
+        << "kernel size exceeds input";
+    if (param_.global_pool) {
+      oshape[2] = 1;
+      oshape[3] = 1;
+      oshape[4] = 1;
     } else {
-      LOG(FATAL) << "unknown pooling type";
-      return NULL;
+      if (param_.pooling_convention == pool_enum::kValid) {
+        oshape[2] = 1 +
+                    (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
+                        param_.stride[0];
+        oshape[3] = 1 +
+                    (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) /
+                        param_.stride[1];
+        oshape[4] = 1 +
+                    (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) /
+                        param_.stride[2];
+      } else {
+        oshape[2] = 1 + static_cast<int>(ceil(
+                            static_cast<float>(dshape[2] + 2 * param_.pad[0] -
+                                               param_.kernel[0]) /
+                            param_.stride[0]));
+        oshape[3] = 1 + static_cast<int>(ceil(
+                            static_cast<float>(dshape[3] + 2 * param_.pad[1] -
+                                               param_.kernel[1]) /
+                            param_.stride[1]));
+        oshape[4] = 1 + static_cast<int>(ceil(
+                            static_cast<float>(dshape[4] + 2 * param_.pad[2] -
+                                               param_.kernel[2]) /
+                            param_.stride[2]));
+      }
     }
-  });
 
-  return op;
+    out_shape->clear();
+    out_shape->push_back(oshape);  // save output shape
+#if MXNET_USE_MKLDNN == 1
+    if (MKLDNNRequireWorkspace(param_) && SupportMKLDNNPooling(param_))
+      out_shape->push_back(oshape);   // for workspace
+#endif
+  }
+  return true;
+}
+
+void PoolingCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
+                        const std::vector<NDArray> &inputs,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<NDArray> &outputs) {
+#if MXNET_USE_MKLDNN == 1
+  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
+  const NDArray *workspace = nullptr;
+  if (MKLDNNRequireWorkspace(param)) {
+    CHECK_GT(outputs.size(), 1U);
+    workspace = &outputs[1];
+  }
+  if (SupportMKLDNN(inputs[0])
+      && SupportMKLDNNPooling(param, inputs[0].shape())) {
+    MKLDNNPoolingForward(ctx, param, inputs[0], req[0], outputs[0],
+                         workspace);
+    return;
+  }
+#endif
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data();
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++)
+    out_blobs[i] = outputs[i].data();
+  PoolingCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
 }
 
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator* PoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                     std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+void PoolingGradCompute_CPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
+                            const std::vector<NDArray> &inputs,
+                            const std::vector<OpReqType> &req,
+                            const std::vector<NDArray> &outputs) {
+#if MXNET_USE_MKLDNN == 1
+  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
+  const NDArray &out_grad = inputs[0];
+  const NDArray *workspace = nullptr;
+  const NDArray *in_data = nullptr;
+  if (MKLDNNRequireWorkspace(param)) {
+    // The first two elements are the gradient of the outputs in forward.
+    // The third is the input of forward.
+    // The fourth and the fifth are the outputs of forward.
+    CHECK_EQ(inputs.size(), 5U);
+    in_data = &inputs[2];
+    workspace = &inputs[4];
+  } else {
+    CHECK_EQ(inputs.size(), 3U);
+    in_data = &inputs[1];
+  }
+  const NDArray &in_grad = outputs[0];
+  if (SupportMKLDNN(inputs[0])
+      && SupportMKLDNNPooling(param, inputs[0].shape())) {
+    MKLDNNPoolingBackward(ctx, param, out_grad, *in_data, workspace,
+                          req[0], in_grad);
+    return;
+  }
+#endif
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++)
+    in_blobs[i] = inputs[i].data();
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++)
+    out_blobs[i] = outputs[i].data();
+  PoolingGradCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
+}
+
+struct PoolingGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(
+      const nnvm::NodePtr &n,
+      const std::vector<nnvm::NodeEntry> &ograds) const {
+    std::vector<nnvm::NodeEntry> heads;
+    heads.push_back(ograds[pool_enum::kOut]);
+    heads.push_back(n->inputs[pool_enum::kData]);
+    heads.emplace_back(nnvm::NodeEntry{n, pool_enum::kOut, 0});
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
+
+inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs,
+                                      const int dev_mask,
+                                      DispatchMode *dispatch_mode,
+                                      std::vector<int> *in_attrs,
+                                      std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1);
+
+  *dispatch_mode = DispatchMode::kFCompute;
+#if MXNET_USE_MKLDNN == 1
+  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
+  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#else
+  CHECK_EQ(out_attrs->size(), 1);
+#endif
+  for (size_t i = 0; i < out_attrs->size(); i++)
+    (*out_attrs)[i] = kDefaultStorage;
+  return true;
+}
+
+inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs &attrs,
+                                              const int dev_mask,
+                                              DispatchMode *dispatch_mode,
+                                              std::vector<int> *in_attrs,
+                                              std::vector<int> *out_attrs) {
+  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
+  CHECK_EQ(in_attrs->size(), GetNumBackInputs(param));
+  CHECK_EQ(out_attrs->size(), 1);
+
+  *dispatch_mode = DispatchMode::kFCompute;
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#else
+  CHECK_EQ(in_attrs->size(), 3);
+#endif
+  for (size_t i = 0; i < out_attrs->size(); i++)
+    (*out_attrs)[i] = kDefaultStorage;
+  return true;
 }
 
 DMLC_REGISTER_PARAMETER(PoolingParam);
 
-MXNET_REGISTER_OP_PROPERTY(Pooling, PoolingProp)
-.describe(R"code(Performs pooling on the input.
+NNVM_REGISTER_OP(Pooling)
+    .describe(R"code(Performs pooling on the input.
 
 The shapes for 1-D pooling are
 
@@ -131,8 +382,53 @@ For 3-D pooling, an additional *depth* dimension is added before
 height, width)*.
 
 )code" ADD_FILELINE)
-.add_argument("data", "NDArray-or-Symbol", "Input data to the pooling operator.")
+.set_num_inputs(1)
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const PoolingParam &param = nnvm::get<PoolingParam>(attrs.parsed);
+  return GetNumOutputs(param);
+})
+#if MXNET_USE_MKLDNN == 1
+.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
+                                    [](const NodeAttrs& attrs) { return 1; })
+#endif
+.set_attr<nnvm::FListOutputNames>("FListOutputNames",
+    [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"output"};
+})
+.set_attr_parser(PoolingParamParser)
+.set_attr<FInferStorageType>("FInferStorageType", PoolingStorageType)
+.set_attr<nnvm::FInferType>("FInferType", PoolingType)
+.set_attr<nnvm::FInferShape>("FInferShape", PoolingShape)
+.set_attr<FCompute>("FCompute<cpu>", PoolingCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", PoolingCompute_CPU)
+.set_attr<nnvm::FGradient>("FGradient",
+                           ElemwiseGradUseInOut{"_backward_Pooling"})
+.add_argument("data", "NDArray-or-Symbol",
+              "Input data to the pooling operator.")
 .add_arguments(PoolingParam::__FIELDS__());
 
+NNVM_REGISTER_OP(_backward_Pooling)
+.set_num_outputs(1)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<nnvm::FInplaceOption>(
+    "FInplaceOption",
+    [](const NodeAttrs &attrs) {
+#if MXNET_USE_CUDNN == 1
+  return std::vector<std::pair<int, int> >();
+#else
+  return std::vector<std::pair<int, int> >{{1, 0}};
+#endif
+})
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
+.set_attr<FInferStorageType>("FInferStorageType",
+                             BackwardPoolingStorageType)
+.set_attr_parser(PoolingParamParser)
+.set_attr<FCompute>("FCompute<cpu>", PoolingGradCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", PoolingGradCompute_CPU);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/pooling.cu b/src/operator/nn/pooling.cu
index dcebe6798263..c3bcecfc77b7 100644
--- a/src/operator/nn/pooling.cu
+++ b/src/operator/nn/pooling.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2017 by Contributors
  * \file pooling.cu
  * \brief
- * \author Bing Xu, Jun Wu
+ * \author Bing Xu, Jun Wu, Da Zheng
 */
 #include <vector>
 #include "./pooling-inl.h"
@@ -32,38 +32,112 @@
 namespace mxnet {
 namespace op {
 
+#if MXNET_USE_CUDNN == 1
+template<typename DType>
+static CuDNNPoolingOp<DType> &GetCuDNNPoolingOp(const PoolingParam &param) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local CuDNNPoolingOp<DType> op;
+#else
+  static MX_THREAD_LOCAL CuDNNPoolingOp<DType> op;
+#endif
+  op.Init(param);
+  return op;
+}
+#endif
+
 template<>
-Operator *CreateOp<gpu>(PoolingParam param, int dtype) {
-  Operator *op = NULL;
+void PoolingCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                         const OpContext& ctx,
+                         const std::vector<TBlob>& inputs,
+                         const std::vector<OpReqType>& req,
+                         const std::vector<TBlob>& outputs) {
+  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), GetNumOutputs(param));
+
 #if MXNET_USE_CUDNN == 1
   if (!param.cudnn_off && param.kernel.ndim() > 1) {
-    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
       switch (param.pool_type) {
         case pool_enum::kMaxPooling:
-          op = new CuDNNPoolingOp<DType>(param);
-          break;
         case pool_enum::kAvgPooling:
-          op = new CuDNNPoolingOp<DType>(param);
+          GetCuDNNPoolingOp<DType>(param).Forward(ctx, inputs[0], req[0], outputs[0]);
+          return;
+        case pool_enum::kSumPooling:
+          LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied.";
           break;
+      }
+    });
+  }
+#endif  // MXNET_USE_CUDNN
+
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+    if (pool_enum::kMaxPooling == param.pool_type
+        || pool_enum::kAvgPooling == param.pool_type
+        || pool_enum::kSumPooling == param.pool_type) {
+      PoolingForward<gpu, DType>(ctx, param, inputs[0], req[0], outputs[0]);
+    } else {
+      LOG(FATAL) << "unknown pooling type";
+    }
+  });
+}
+
+template<>
+void PoolingGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                             const OpContext& ctx,
+                             const std::vector<TBlob>& inputs,
+                             const std::vector<OpReqType>& req,
+                             const std::vector<TBlob>& outputs) {
+  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), GetNumBackInputs(param));
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  off_t ograd_idx, in_data_idx, out_data_idx;
+  // When MKLDNN is enabled, the input data may contains arrays for workspace.
+  if (GetNumBackInputs(param) == 5) {
+    ograd_idx = 0;
+    in_data_idx = 2;
+    out_data_idx = 3;
+  } else {
+    ograd_idx = 0;
+    in_data_idx = 1;
+    out_data_idx = 2;
+  }
+
+#if MXNET_USE_CUDNN == 1
+  if (!param.cudnn_off && param.kernel.ndim() > 1) {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+      switch (param.pool_type) {
+        case pool_enum::kMaxPooling:
+        case pool_enum::kAvgPooling:
+          GetCuDNNPoolingOp<DType>(param).Backward(ctx, inputs[ograd_idx],
+              inputs[in_data_idx], inputs[out_data_idx], req[0], outputs[0]);
+          return;
         case pool_enum::kSumPooling:
           LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied.";
           break;
       }
     });
   }
-  if (op) return op;
 #endif  // MXNET_USE_CUDNN
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
     if (pool_enum::kMaxPooling == param.pool_type
         || pool_enum::kAvgPooling == param.pool_type
         || pool_enum::kSumPooling == param.pool_type) {
-      op = new PoolingOp<gpu, DType>(param);
+      PoolingBackward<gpu, DType>(ctx, param, inputs[ograd_idx],
+          inputs[in_data_idx], inputs[out_data_idx], req[0], outputs[0]);
     } else {
       LOG(FATAL) << "unknown pooling type";
     }
   });
-  return op;
 }
 
+NNVM_REGISTER_OP(Pooling)
+.set_attr<FCompute>("FCompute<gpu>", PoolingCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_Pooling)
+.set_attr<FCompute>("FCompute<gpu>", PoolingGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc
index 4686fb8c0dc1..5a2071a1a53f 100644
--- a/src/operator/nn/softmax.cc
+++ b/src/operator/nn/softmax.cc
@@ -25,11 +25,53 @@
 #include "./softmax-inl.h"
 #include "../tensor/elemwise_unary_op.h"
 #include "../tensor/elemwise_binary_op.h"
+#include "mkldnn/mkldnn_base-inl.h"
+#include "mkldnn/mkldnn_ops-inl.h"
 
 namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(SoftmaxParam);
 
+static void SoftmaxCompute_CPU(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx, const std::vector<NDArray>& inputs,
+    const std::vector<OpReqType>& req, const std::vector<NDArray>& outputs) {
+#if MXNET_USE_MKLDNN == 1
+  const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
+  // It seems MKLDNN softmax doesn't support training.
+  // and it only supports non-negative axis.
+  if (SupportMKLDNN(inputs[0]) && !ctx.is_train && param.axis >= 0) {
+    MKLDNNSoftmaxForward(attrs, ctx, inputs[0], req[0], outputs[0]);
+    return;
+  }
+#endif
+  std::vector<TBlob> in_blobs(inputs.size());
+  for (size_t i = 0; i < in_blobs.size(); i++)
+    in_blobs[i] = inputs[i].data();
+  std::vector<TBlob> out_blobs(outputs.size());
+  for (size_t i = 0; i < out_blobs.size(); i++)
+    out_blobs[i] = outputs[i].data();
+  SoftmaxCompute<cpu, mxnet_op::softmax_fwd>(attrs, ctx, in_blobs, req, out_blobs);
+}
+
+inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs,
+                                 const int dev_mask,
+                                 DispatchMode* dispatch_mode,
+                                 std::vector<int> *in_attrs,
+                                 std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1);
+  CHECK_EQ(out_attrs->size(), 1);
+
+#if MXNET_USE_MKLDNN == 1
+  // We only run MKLDNN op if it runs on CPU.
+  if (dev_mask == mshadow::cpu::kDevMask)
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  else
+#endif
+    *dispatch_mode = DispatchMode::kFCompute;
+  (*out_attrs)[0] = (*in_attrs)[0];
+  return true;
+}
+
 MXNET_OPERATOR_REGISTER_UNARY(softmax)
 .describe(R"code(Applies the softmax function.
 
@@ -54,6 +96,8 @@ Example::
 )code" ADD_FILELINE)
 .set_attr_parser(ParamParser<SoftmaxParam>)
 .set_attr<FCompute>("FCompute<cpu>", SoftmaxCompute<cpu, mxnet_op::softmax_fwd>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", SoftmaxCompute_CPU)
+.set_attr<FInferStorageType>("FInferStorageType", SoftmaxStorageType)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_softmax"})
 .add_arguments(SoftmaxParam::__FIELDS__());
 
diff --git a/src/operator/nn/softmax_activation-inl.h b/src/operator/nn/softmax_activation-inl.h
index 500bf51ccd1f..b1d542e4068c 100644
--- a/src/operator/nn/softmax_activation-inl.h
+++ b/src/operator/nn/softmax_activation-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file softmax_activation-inl.h
  * \brief SoftmaxActivation operator
- * \author Junyuan Xie
+ * \author Junyuan Xie, Da Zheng
 */
 #ifndef MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_
 #define MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_
@@ -61,153 +61,74 @@ struct SoftmaxActivationParam : public dmlc::Parameter<SoftmaxActivationParam> {
   }
 };
 
-/**
- * \brief This is the implementation of softmax_activation operator.
- * \tparam xpu The device that the op will be executed on.
- */
 template<typename xpu>
-class SoftmaxActivationOp : public Operator {
- public:
-  explicit SoftmaxActivationOp(SoftmaxActivationParam p) {
-    this->param_ = p;
-  }
-
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1U);
-    CHECK_EQ(out_data.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    if (param_.mode == softmax_activation::kInstance) {
-      Tensor<xpu, 2> data = in_data[softmax_activation::kData].FlatTo2D<xpu, real_t>(s);
-      Tensor<xpu, 2> out = out_data[softmax_activation::kOut].FlatTo2D<xpu, real_t>(s);
-      Softmax(out, data);
-    } else {
-      CHECK_GE(in_data[softmax_activation::kData].ndim(), 3)
+void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs,
+                              const OpContext& ctx,
+                              const std::vector<TBlob>& inputs,
+                              const std::vector<OpReqType>& reqs,
+                              const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  const SoftmaxActivationParam& param = nnvm::get<SoftmaxActivationParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  const TBlob &in_data = inputs[softmax_activation::kData];
+  const OpReqType &req = reqs[softmax_activation::kOut];
+  const TBlob &out_data = outputs[softmax_activation::kOut];
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  if (param.mode == softmax_activation::kInstance) {
+    Tensor<xpu, 2> data = in_data.FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2> out = out_data.FlatTo2D<xpu, real_t>(s);
+    Softmax(out, data);
+  } else {
+    CHECK_GE(in_data.ndim(), 3)
         << "Input need to have a least 3 dimensions when mode=channel";
-      int n = in_data[softmax_activation::kData].size(0);
-      int k = in_data[softmax_activation::kData].size(1);
-      Shape<3> s3 = Shape3(n, k, static_cast<int>(in_data[softmax_activation::kData].Size()/n/k));
-      Tensor<xpu, 3, real_t> data =
-        in_data[softmax_activation::kData].get_with_shape<xpu, 3, real_t>(s3, s);
-      Tensor<xpu, 3, real_t> out =
-        out_data[softmax_activation::kOut].get_with_shape<xpu, 3, real_t>(s3, s);
-      Softmax(out, data);
-    }
+    int n = in_data.size(0);
+    int k = in_data.size(1);
+    Shape<3> s3 = Shape3(n, k, static_cast<int>(in_data.Size()/n/k));
+    Tensor<xpu, 3, real_t> data = in_data.get_with_shape<xpu, 3, real_t>(s3, s);
+    Tensor<xpu, 3, real_t> out = out_data.get_with_shape<xpu, 3, real_t>(s3, s);
+    Softmax(out, data);
   }
+}
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK(in_data.size() == 1 && in_grad.size() == 1);
-    CHECK_EQ(req.size(), 1U);
-    // Use 3d tensor for both mode -> {instance, channel}. Get shapes
-    int total_size = in_grad[softmax_activation::kData].Size();
-    int batch_size = in_grad[softmax_activation::kData].shape_[0];
-    int channel_num = in_grad[softmax_activation::kData].shape_[1];
-    int rest_size = total_size / (batch_size * channel_num);
-    const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size);
-    // Get tensors
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 3> m_out_grad =
-      out_grad[softmax_activation::kOut].get_with_shape<xpu, 3, real_t>(data_shape, s);
-    Tensor<xpu, 3> m_out_data =
-      out_data[softmax_activation::kOut].get_with_shape<xpu, 3, real_t>(data_shape, s);
-    Tensor<xpu, 3> m_in_grad =
-      in_grad[softmax_activation::kData].get_with_shape<xpu, 3, real_t>(data_shape, s);
-    // get requested temp space
-    Tensor<xpu, 2> workspace = ctx.requested[softmax_activation::kTempSpace].get_space<xpu>(
-        Shape2(batch_size, rest_size), s);
-    workspace = reduce_with_axis<red::sum, false>(m_out_grad * m_out_data, 1);
-    Assign(m_in_grad, req[softmax_activation::kData],
-        m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num)));
-  }
-
- private:
-  SoftmaxActivationParam param_;
-};  // class SoftmaxActivationOp
-
-// Decalre Factory function, used for dispatch specialization
 template<typename xpu>
-Operator* CreateOp(SoftmaxActivationParam type);
-
-#if DMLC_USE_CXX11
-class SoftmaxActivationProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 1U) << "Input:[data]";
-    const TShape &dshape = in_shape->at(softmax_activation::kData);
-    if (dshape.ndim() == 0) return false;
-    out_shape->clear();
-    out_shape->push_back(dshape);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new SoftmaxActivationProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "SoftmaxActivation";
-  }
-
-  // decalre dependency and inplace optimization options
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[softmax_activation::kOut], out_data[softmax_activation::kOut]};
-  }
-
-  std::vector<ResourceRequest> BackwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    return {ResourceRequest::kTempSpace};
-  }
+void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs,
+                                  const OpContext& ctx,
+                                  const std::vector<TBlob>& inputs,
+                                  const std::vector<OpReqType>& reqs,
+                                  const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1);
+  CHECK_EQ(reqs.size(), 1);
+  const TBlob &out_grad = inputs[0];
+  const TBlob &out_data = inputs[1];
+  const OpReqType &req = reqs[0];
+  const TBlob &in_grad = outputs[0];
+  // Use 3d tensor for both mode -> {instance, channel}. Get shapes
+  int total_size = in_grad.Size();
+  int batch_size = in_grad.shape_[0];
+  int channel_num = in_grad.shape_[1];
+  int rest_size = total_size / (batch_size * channel_num);
+  const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size);
+  // Get tensors
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  Tensor<xpu, 3> m_out_grad =
+      out_grad.get_with_shape<xpu, 3, real_t>(data_shape, s);
+  Tensor<xpu, 3> m_out_data =
+      out_data.get_with_shape<xpu, 3, real_t>(data_shape, s);
+  Tensor<xpu, 3> m_in_grad =
+      in_grad.get_with_shape<xpu, 3, real_t>(data_shape, s);
+  // get requested temp space
+  Tensor<xpu, 2> workspace = ctx.requested[softmax_activation::kTempSpace].get_space<xpu>(
+      Shape2(batch_size, rest_size), s);
+  workspace = reduce_with_axis<red::sum, false>(m_out_grad * m_out_data, 1);
+  Assign(m_in_grad, req,
+         m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num)));
+}
 
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-    return {{out_grad[softmax_activation::kOut], in_grad[softmax_activation::kData]}};
-  }
-
-  std::vector<std::pair<int, void*> > ForwardInplaceOption(
-    const std::vector<int> &in_data,
-    const std::vector<void*> &out_data) const override {
-    return {{in_data[softmax_activation::kData], out_data[softmax_activation::kOut]}};
-  }
-
-  Operator* CreateOperator(Context ctx) const override;
-
- private:
-  SoftmaxActivationParam param_;
-};
-#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_
diff --git a/src/operator/nn/softmax_activation.cc b/src/operator/nn/softmax_activation.cc
index 657b382c6e03..bdfd8b065de1 100644
--- a/src/operator/nn/softmax_activation.cc
+++ b/src/operator/nn/softmax_activation.cc
@@ -21,26 +21,18 @@
  * Copyright (c) 2015 by Contributors
  * \file activation.cc
  * \brief softmax_activation op
- * \author Junyuan Xie
+ * \author Junyuan Xie, Da Zheng
 */
 #include "./softmax_activation-inl.h"
+#include "../tensor/elemwise_unary_op.h"
 #include "../mshadow_op.h"
 
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<cpu>(SoftmaxActivationParam param) {
-  return new SoftmaxActivationOp<cpu>(param);
-}
-
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *SoftmaxActivationProp::CreateOperator(Context ctx) const {
-  DO_BIND_DISPATCH(CreateOp, param_);
-}
 
 DMLC_REGISTER_PARAMETER(SoftmaxActivationParam);
 
-MXNET_REGISTER_OP_PROPERTY(SoftmaxActivation, SoftmaxActivationProp)
+MXNET_OPERATOR_REGISTER_UNARY(SoftmaxActivation)
 .describe(R"code(Applies softmax activation to input. This is intended for internal layers.
 
 .. note::
@@ -65,8 +57,22 @@ Example::
    [  6.56221947e-03   5.95310994e-04   9.73919690e-01   1.78379621e-02   1.08472735e-03]]
 
 )code" ADD_FILELINE)
-.add_argument("data", "NDArray-or-Symbol", "Input array to activation function.")
+.set_attr_parser(ParamParser<SoftmaxActivationParam>)
+.set_attr<FCompute>("FCompute<cpu>", SoftmaxActivationCompute<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_SoftmaxActivation"})
 .add_arguments(SoftmaxActivationParam::__FIELDS__());
 
+NNVM_REGISTER_OP(_backward_SoftmaxActivation)
+.set_num_outputs(1)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
+  return std::vector<std::pair<int, int> >{{0, 0}};
+})
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+.set_attr_parser(ParamParser<SoftmaxActivationParam>)
+.set_attr<FCompute>("FCompute<cpu>", SoftmaxActivationGradCompute<cpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/softmax_activation.cu b/src/operator/nn/softmax_activation.cu
index 0810483e1262..f3997e00052e 100644
--- a/src/operator/nn/softmax_activation.cu
+++ b/src/operator/nn/softmax_activation.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file softmax_activation.cu
  * \brief
- * \author Junyuan Xie
+ * \author Junyuan Xie, Da Zheng
 */
 #include "./softmax_activation-inl.h"
 #include "../mshadow_op.h"
@@ -31,14 +31,51 @@
 
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<gpu>(SoftmaxActivationParam param) {
+
 #if MXNET_USE_CUDNN == 1
-  return new CuDNNSoftmaxActivationOp(param);
+
+static inline CuDNNSoftmaxActivationOp &GetCuDNNSoftmaxActOp(const SoftmaxActivationParam& param) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local CuDNNSoftmaxActivationOp op;
 #else
-  return new SoftmaxActivationOp<gpu>(param);
-#endif  // MXNET_USE_CUDNN
+  static MX_THREAD_LOCAL CuDNNSoftmaxActivationOp op;
+#endif
+  op.Init(param);
+  return op;
+}
+
+template<>
+void SoftmaxActivationCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                                   const OpContext& ctx,
+                                   const std::vector<TBlob>& inputs,
+                                   const std::vector<OpReqType>& req,
+                                   const std::vector<TBlob>& outputs) {
+  const SoftmaxActivationParam& param = nnvm::get<SoftmaxActivationParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  GetCuDNNSoftmaxActOp(param).Forward(ctx, inputs[0], req[0], outputs[0]);
 }
+
+template<>
+void SoftmaxActivationGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                                       const OpContext& ctx,
+                                       const std::vector<TBlob>& inputs,
+                                       const std::vector<OpReqType>& req,
+                                       const std::vector<TBlob>& outputs) {
+  const SoftmaxActivationParam& param = nnvm::get<SoftmaxActivationParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1);
+  CHECK_EQ(req.size(), 1);
+  GetCuDNNSoftmaxActOp(param).Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]);
+}
+#endif
+
+NNVM_REGISTER_OP(SoftmaxActivation)
+.set_attr<FCompute>("FCompute<gpu>", SoftmaxActivationCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_SoftmaxActivation)
+.set_attr<FCompute>("FCompute<gpu>", SoftmaxActivationGradCompute<gpu>);
+
 }  // namespace op
 }  // namespace mxnet
 
diff --git a/src/operator/nn/upsampling-inl.h b/src/operator/nn/upsampling-inl.h
index f660609ace28..4b9159edd174 100644
--- a/src/operator/nn/upsampling-inl.h
+++ b/src/operator/nn/upsampling-inl.h
@@ -35,6 +35,7 @@
 #include <string>
 #include <utility>
 #include "../operator_common.h"
+#include "./deconvolution-inl.h"
 
 namespace mxnet {
 namespace op {
@@ -82,253 +83,147 @@ struct UpSamplingParam : public dmlc::Parameter<UpSamplingParam> {
 };  // struct UpSamplingParam
 
 template<typename xpu, typename DType>
-class UpSamplingNearestOp : public Operator {
- public:
-  explicit UpSamplingNearestOp(UpSamplingParam p) {
-    this->param_ = p;
-  }
-
-  virtual void Forward(const OpContext &ctx,
+void UpSamplingForward(const OpContext &ctx, const UpSamplingParam &param,
                        const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), static_cast<size_t>(param_.num_args));
-    CHECK_EQ(out_data.size(), 1U);
-    if (req[up_enum::kOut] == kNullOp) {
-      return;
-    }
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> out = out_data[up_enum::kOut].get<xpu, 4, DType>(s);
-    if (param_.num_args > 1) {
-      int begin = 0;
-      for (int i = 0; i < param_.num_args; ++i) {
-        Tensor<xpu, 4, DType> data = in_data[i].get<xpu, 4, DType>(s);
-        int end = begin + data.size(1);
-        int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2);
-        if (param_.multi_input_mode == up_enum::kSum) {
-          if (i == 0) {
-            Assign(out, req[up_enum::kOut], upsampling_nearest(data, scale));
-          } else {
-            out += upsampling_nearest(data, scale);
-          }
+                       const std::vector<TBlob> &out_data) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  CHECK_EQ(in_data.size(), static_cast<size_t>(param.num_args));
+  CHECK_EQ(out_data.size(), 1U);
+  if (req[up_enum::kOut] == kNullOp) {
+    return;
+  }
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  Tensor<xpu, 4, DType> out = out_data[up_enum::kOut].get<xpu, 4, DType>(s);
+  if (param.num_args > 1) {
+    int begin = 0;
+    for (int i = 0; i < param.num_args; ++i) {
+      Tensor<xpu, 4, DType> data = in_data[i].get<xpu, 4, DType>(s);
+      int end = begin + data.size(1);
+      int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2);
+      if (param.multi_input_mode == up_enum::kSum) {
+        if (i == 0) {
+          Assign(out, req[up_enum::kOut], upsampling_nearest(data, scale));
         } else {
-          Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale));
+          out += upsampling_nearest(data, scale);
         }
-        begin = end;
+      } else {
+        Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale));
       }
-    } else {
-      Tensor<xpu, 4, DType> data = in_data[up_enum::kData].get<xpu, 4, DType>(s);
-      Assign(out, req[up_enum::kOut], upsampling_nearest(data, param_.scale));
+      begin = end;
     }
+  } else {
+    Tensor<xpu, 4, DType> data = in_data[up_enum::kData].get<xpu, 4, DType>(s);
+    Assign(out, req[up_enum::kOut], upsampling_nearest(data, param.scale));
   }
+}
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_grad.size(), static_cast<size_t>(param_.num_args));
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> grad = out_grad[up_enum::kOut].get<xpu, 4, DType>(s);
-    if (param_.num_args > 1) {
-      int begin = 0;
-      for (int i = 0; i < param_.num_args; ++i) {
-        Tensor<xpu, 4, DType> input_grad = in_grad[i].get<xpu, 4, DType>(s);
-        mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]);
-        int end = begin + input_grad.size(1);
-        int scale = grad.size(2)/in_shape[0];
-        if (param_.multi_input_mode == up_enum::kSum) {
-          Assign(input_grad, req[i],
-                 pool<mshadow::red::sum>(grad,
-                                         in_shape,
-                                         scale,
-                                         scale,
-                                         scale,
-                                         scale));
-        } else {
-          Assign(input_grad, req[i],
-                 pool<mshadow::red::sum>(slice<1>(grad, begin, end),
-                                         in_shape,
-                                         scale,
-                                         scale,
-                                         scale,
-                                         scale));
-        }
-        begin = end;
-      }
-    } else {
-      Tensor<xpu, 4, DType> input_grad = in_grad[up_enum::kData].get<xpu, 4, DType>(s);
+template<typename xpu, typename DType>
+void UpSamplingBackward(const OpContext &ctx, const UpSamplingParam &param,
+                        const TBlob &out_grad, const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  CHECK_EQ(in_grad.size(), static_cast<size_t>(param.num_args));
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  Tensor<xpu, 4, DType> grad = out_grad.get<xpu, 4, DType>(s);
+  if (param.num_args > 1) {
+    int begin = 0;
+    for (int i = 0; i < param.num_args; ++i) {
+      Tensor<xpu, 4, DType> input_grad = in_grad[i].get<xpu, 4, DType>(s);
       mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]);
-      Assign(input_grad, req[up_enum::kData],
-             pool<mshadow::red::sum>(grad,
-                                     in_shape,
-                                     param_.scale,
-                                     param_.scale,
-                                     param_.scale,
-                                     param_.scale));
-    }
-  }
-
- private:
-  UpSamplingParam param_;
-};  // class UpSamplingNearestOp
-
-template<typename xpu>
-Operator *CreateOp(UpSamplingParam param, int dtype);
-
-
-#if DMLC_USE_CXX11
-class UpSamplingProp : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  std::vector<std::string> ListArguments() const override {
-    if (param_.sample_type == up_enum::kNearest) {
-      std::vector<std::string> ret;
-      for (int i = 0; i < param_.num_args; ++i) {
-        ret.push_back(std::string("arg") + std::to_string(i));
-      }
-      return ret;
-    } else {
-      return {"data", "weight"};
-    }
-  }
-
-  bool InferShape(std::vector<TShape> *in_shape,
-                  std::vector<TShape> *out_shape,
-                  std::vector<TShape> *aux_shape) const override {
-    CHECK_GE(in_shape->size(), 1U);
-    const TShape &dshape = (*in_shape)[0];
-    TShape oshape = dshape;
-    if (param_.sample_type == up_enum::kNearest) {
-      CHECK_EQ(in_shape->size(), static_cast<size_t>(param_.num_args));
-      oshape[1] = 0;
-      for (auto& shape : *in_shape) {
-        CHECK_EQ(shape.ndim(), 4U) << \
-          "UpSamplingNearest: Input data should be 4D in (batch, channel, y, x)";
-        int oh = dshape[2]*param_.scale, ow = dshape[3]*param_.scale;
-        CHECK_EQ(oh%shape[2], 0U) << "UpSamplingNearest: input height of " << shape[2] << \
-          "does not divide output height of " << oh;
-        CHECK_EQ(ow%shape[3], 0U) << "UpSamplingNearest: input width of " << shape[3] << \
-          "does not divide output width of " << ow;
-        if (param_.multi_input_mode == up_enum::kSum) {
-          CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \
-            "Number of channels must be the same when multi_input_mode==sum";
-          oshape[1] = shape[1];
-        } else {
-          oshape[1] += shape[1];
-        }
-      }
-    } else {
-      CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
-      CHECK_EQ(dshape.ndim(), 4U) << \
-        "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)";
-      if (dshape.ndim() ==  0) return false;
-      int kernel = 2 * param_.scale - param_.scale % 2;
-      SHAPE_ASSIGN_CHECK(*in_shape,
-                         up_enum::kWeight,
-                         mshadow::Shape4(dshape[1], 1, kernel, kernel));
-      oshape = dshape;
-    }
-    oshape[2] = dshape[2] * param_.scale;
-    oshape[3] = dshape[3] * param_.scale;
-    out_shape->clear();
-    out_shape->push_back(oshape);
-    return true;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_GE(in_type->size(), 1U);
-    int dtype = (*in_type)[0];
-    CHECK_NE(dtype, -1) << "First input must have specified type";
-    for (index_t i = 0; i < in_type->size(); ++i) {
-      if ((*in_type)[i] == -1) {
-        (*in_type)[i] = dtype;
+      int end = begin + input_grad.size(1);
+      int scale = grad.size(2)/in_shape[0];
+      if (param.multi_input_mode == up_enum::kSum) {
+        Assign(input_grad, req[i],
+               pool<mshadow::red::sum>(grad,
+                                       in_shape,
+                                       scale,
+                                       scale,
+                                       scale,
+                                       scale));
       } else {
-        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
+        Assign(input_grad, req[i],
+               pool<mshadow::red::sum>(slice<1>(grad, begin, end),
+                                       in_shape,
+                                       scale,
+                                       scale,
+                                       scale,
+                                       scale));
       }
+      begin = end;
     }
-    out_type->clear();
-    out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new UpSamplingProp();
-    ptr->param_ = this->param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "UpSampling";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    if (param_.sample_type == up_enum::kNearest) {
-      return {out_grad[up_enum::kOut]};
-    } else {
-      return {out_grad[up_enum::kOut], in_data[up_enum::kData], in_data[up_enum::kWeight]};
-    }
-  }
-
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-    return {};
-  }
-
-  std::vector<ResourceRequest> ForwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    if (param_.sample_type == up_enum::kNearest) {
-      return {};
-    } else {
-      return {ResourceRequest::kTempSpace};
-    }
-  }
-
-  std::vector<ResourceRequest> BackwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    if (param_.sample_type == up_enum::kNearest) {
-      return {};
-    } else {
-      return {ResourceRequest::kTempSpace};
-    }
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented";
-    return NULL;
-  }
+  } else {
+    Tensor<xpu, 4, DType> input_grad = in_grad[up_enum::kData].get<xpu, 4, DType>(s);
+    mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]);
+    Assign(input_grad, req[up_enum::kData],
+           pool<mshadow::red::sum>(grad,
+                                   in_shape,
+                                   param.scale,
+                                   param.scale,
+                                   param.scale,
+                                   param.scale));
+  }
+}
+
+static inline DeconvolutionParam GetDeconvolutionParam(const UpSamplingParam& param) {
+  DeconvolutionParam p = DeconvolutionParam();
+  int kernel = 2 * param.scale - param.scale % 2;
+  int stride = param.scale;
+  int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
+  p.workspace = param.workspace;
+  p.num_group = param.num_filter;
+  p.num_filter = param.num_filter;
+  p.no_bias =  true;
+  int shape[] = {1, 1};
+  p.dilate = TShape(shape, shape + 2);
+  shape[0] = shape[1] = kernel;
+  p.kernel = TShape(shape, shape + 2);
+  shape[0] = shape[1] = stride;
+  p.stride = TShape(shape, shape + 2);
+  shape[0] = shape[1] = pad;
+  p.pad = TShape(shape, shape + 2);
+  return p;
+}
 
-  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                             std::vector<int> *in_type) const override;
+template<typename xpu>
+void UpSamplingCompute(const nnvm::NodeAttrs& attrs,
+                       const OpContext& ctx, const std::vector<TBlob>& inputs,
+                       const std::vector<OpReqType>& req,
+                       const std::vector<TBlob>& outputs) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(attrs.parsed);
+  if (param.sample_type == up_enum::kNearest) {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, {
+      UpSamplingForward<xpu, DType>(ctx, param, inputs, req, outputs);
+    });
+  } else if (param.sample_type == up_enum::kBilinear) {
+    DeconvolutionParam p = GetDeconvolutionParam(param);
+    _DeconvolutionCompute<xpu>(p, ctx, inputs, req, outputs);
+  } else {
+    LOG(FATAL) << "Unknown sample type";
+  }
+}
 
+template<typename xpu>
+void UpSamplingGradCompute(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx, const std::vector<TBlob>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<TBlob>& outputs) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(attrs.parsed);
+  if (param.sample_type == up_enum::kNearest) {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, {
+      CHECK_EQ(inputs.size(), 1U);
+      UpSamplingBackward<xpu, DType>(ctx, param, inputs[0], req, outputs);
+    });
+  } else if (param.sample_type == up_enum::kBilinear) {
+    DeconvolutionParam p = GetDeconvolutionParam(param);
+    _DeconvolutionGradCompute<xpu>(p, ctx, inputs, req, outputs);
+  } else {
+    LOG(FATAL) << "Unknown sample type";
+  }
+}
 
- private:
-  UpSamplingParam param_;
-};  // class UpSamplingProp
-#endif  // DMLC_USE_CXX11
 }  // namespace op
 }  // namespace mxnet
 
diff --git a/src/operator/nn/upsampling.cc b/src/operator/nn/upsampling.cc
index 8942e35ab325..44b619ac9516 100644
--- a/src/operator/nn/upsampling.cc
+++ b/src/operator/nn/upsampling.cc
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file upsampling_nearest.cc
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 
 #include "./upsampling-inl.h"
@@ -30,51 +30,123 @@
 
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<cpu>(UpSamplingParam param, int dtype) {
-  Operator *op = NULL;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    if (param.sample_type == up_enum::kNearest) {
-      op = new UpSamplingNearestOp<cpu, DType>(param);
-    } else if (param.sample_type == up_enum::kBilinear) {
-      DeconvolutionParam p = DeconvolutionParam();
-      int kernel = 2 * param.scale - param.scale % 2;
-      int stride = param.scale;
-      int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
-      p.workspace = param.workspace;
-      p.num_group = param.num_filter;
-      p.num_filter = param.num_filter;
-      p.no_bias =  true;
-      int shape[] = {1, 1};
-      p.dilate = TShape(shape, shape + 2);
-      shape[0] = shape[1] = kernel;
-      p.kernel = TShape(shape, shape + 2);
-      shape[0] = shape[1] = stride;
-      p.stride = TShape(shape, shape + 2);
-      shape[0] = shape[1] = pad;
-      p.pad = TShape(shape, shape + 2);
-      op = new DeconvolutionOp<cpu, DType>(p);
-    } else {
-      LOG(FATAL) << "Unknown sample type";
+
+static bool UpSamplingShape(const nnvm::NodeAttrs& attrs,
+                            std::vector<TShape> *in_shape, std::vector<TShape> *out_shape) {
+  const UpSamplingParam& param_ = nnvm::get<UpSamplingParam>(attrs.parsed);
+  CHECK_GE(in_shape->size(), 1U);
+  const TShape &dshape = (*in_shape)[0];
+  TShape oshape = dshape;
+  if (param_.sample_type == up_enum::kNearest) {
+    CHECK_EQ(in_shape->size(), static_cast<size_t>(param_.num_args));
+    oshape[1] = 0;
+    for (auto& shape : *in_shape) {
+      CHECK_EQ(shape.ndim(), 4U) << \
+        "UpSamplingNearest: Input data should be 4D in (batch, channel, y, x)";
+      int oh = dshape[2]*param_.scale, ow = dshape[3]*param_.scale;
+      CHECK_EQ(oh%shape[2], 0U) << "UpSamplingNearest: input height of " << shape[2] << \
+        "does not divide output height of " << oh;
+      CHECK_EQ(ow%shape[3], 0U) << "UpSamplingNearest: input width of " << shape[3] << \
+        "does not divide output width of " << ow;
+      if (param_.multi_input_mode == up_enum::kSum) {
+        CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \
+                         "Number of channels must be the same when multi_input_mode==sum";
+        oshape[1] = shape[1];
+      } else {
+        oshape[1] += shape[1];
+      }
+    }
+  } else {
+    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
+    CHECK_EQ(dshape.ndim(), 4U) << \
+      "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)";
+    if (dshape.ndim() ==  0) return false;
+    int kernel = 2 * param_.scale - param_.scale % 2;
+    SHAPE_ASSIGN_CHECK(*in_shape,
+        up_enum::kWeight,
+        mshadow::Shape4(dshape[1], 1, kernel, kernel));
+    oshape = dshape;
+  }
+  oshape[2] = dshape[2] * param_.scale;
+  oshape[3] = dshape[3] * param_.scale;
+  out_shape->clear();
+  out_shape->push_back(oshape);
+  return true;
+}
+
+static inline std::vector<std::string> ListArguments(const UpSamplingParam& param) {
+  if (param.sample_type == up_enum::kNearest) {
+    std::vector<std::string> ret;
+    for (int i = 0; i < param.num_args; ++i) {
+      ret.push_back(std::string("arg") + std::to_string(i));
     }
-  });
-  return op;
+    return ret;
+  } else {
+    return {"data", "weight"};
+  }
 }
 
-Operator* UpSamplingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                           std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
+static bool UpSamplingType(const nnvm::NodeAttrs& attrs,
+                           std::vector<int> *in_type, std::vector<int> *out_type) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(attrs.parsed);
+  CHECK_GE(in_type->size(), 1U);
+  int dtype = (*in_type)[0];
+  CHECK_NE(dtype, -1) << "First input must have specified type";
+  for (index_t i = 0; i < in_type->size(); ++i) {
+    if ((*in_type)[i] == -1) {
+      (*in_type)[i] = dtype;
+    } else {
+      UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param)[i]);
+    }
+  }
+  out_type->clear();
+  out_type->push_back(dtype);
+  return true;
 }
 
+struct UpSamplingGrad {
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    const UpSamplingParam& param_ = nnvm::get<UpSamplingParam>(n->attrs.parsed);
+    std::vector<nnvm::NodeEntry> heads(ograds.begin(), ograds.end());
+    if (param_.sample_type != up_enum::kNearest) {
+      heads.push_back(n->inputs[up_enum::kData]);
+      heads.push_back(n->inputs[up_enum::kWeight]);
+    }
+    return MakeGradNode(op_name, n, heads, n->attrs.dict);
+  }
+};
+
 DMLC_REGISTER_PARAMETER(UpSamplingParam);
 
-MXNET_REGISTER_OP_PROPERTY(UpSampling, UpSamplingProp)
+NNVM_REGISTER_OP(UpSampling)
 .describe("Performs nearest neighbor/bilinear up sampling to inputs.")
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const UpSamplingParam& params = nnvm::get<UpSamplingParam>(attrs.parsed);
+  return params.sample_type == up_enum::kNearest ? params.num_args : 2;
+})
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<UpSamplingParam>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+  return ListArguments(nnvm::get<UpSamplingParam>(attrs.parsed));
+})
+.set_attr<nnvm::FInferShape>("FInferShape", UpSamplingShape)
+.set_attr<nnvm::FInferType>("FInferType", UpSamplingType)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(n.parsed);
+  if (param.sample_type == up_enum::kNearest) {
+    return std::vector<ResourceRequest>();
+  } else {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  }
+})
+.set_attr<FCompute>("FCompute<cpu>", UpSamplingCompute<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", UpSamplingGrad{"_backward_UpSampling"})
+.set_attr<std::string>("key_var_num_args", "num_args")
 .add_argument("data", "NDArray-or-Symbol[]", "Array of tensors to upsample")
 .add_arguments(UpSamplingParam::__FIELDS__())
-.set_key_var_num_args("num_args");
-
-NNVM_REGISTER_OP(UpSampling)
 .set_attr<nnvm::FSetInputVarAttrOnCompose>("FSetInputVarAttrOnCompose",
     [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) {
       if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return;
@@ -82,5 +154,23 @@ NNVM_REGISTER_OP(UpSampling)
         var->attrs.dict["__init__"] = "[\"bilinear\", {}]";
       }
     });
+
+NNVM_REGISTER_OP(_backward_UpSampling)
+.set_num_outputs([](const NodeAttrs& attrs) {
+  const UpSamplingParam& params = nnvm::get<UpSamplingParam>(attrs.parsed);
+  return params.sample_type == up_enum::kNearest ? params.num_args : 2;
+})
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  const UpSamplingParam& param = nnvm::get<UpSamplingParam>(n.parsed);
+  if (param.sample_type == up_enum::kNearest) {
+    return std::vector<ResourceRequest>();
+  } else {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  }
+})
+.set_attr_parser(ParamParser<UpSamplingParam>)
+.set_attr<FCompute>("FCompute<cpu>", UpSamplingGradCompute<cpu>);
+
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/upsampling.cu b/src/operator/nn/upsampling.cu
index f83535a2b2e6..c5ff2fafd64a 100644
--- a/src/operator/nn/upsampling.cu
+++ b/src/operator/nn/upsampling.cu
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file upsampling_nearest.cc
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Da Zheng
 */
 
 #include "./deconvolution-inl.h"
@@ -29,36 +29,12 @@
 
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<gpu>(UpSamplingParam param, int dtype) {
-  Operator *op = NULL;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    if (param.sample_type == up_enum::kNearest) {
-      op = new UpSamplingNearestOp<gpu, DType>(param);
-    } else if (param.sample_type == up_enum::kBilinear) {
-      DeconvolutionParam p = DeconvolutionParam();
-      int kernel = 2 * param.scale - param.scale % 2;
-      int stride = param.scale;
-      int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
-      p.workspace = param.workspace;
-      p.num_group = param.num_filter;
-      p.num_filter = param.num_filter;
-      p.no_bias =  true;
-      int shape[] = {1, 1};
-      p.dilate = TShape(shape, shape + 2);
-      shape[0] = shape[1] = kernel;
-      p.kernel = TShape(shape, shape + 2);
-      shape[0] = shape[1] = stride;
-      p.stride = TShape(shape, shape + 2);
-      shape[0] = shape[1] = pad;
-      p.pad = TShape(shape, shape + 2);
-      op = new DeconvolutionOp<gpu, DType>(p);
-    } else {
-      LOG(FATAL) << "Unknown sample type";
-    }
-  });
-  return op;
-}
+
+NNVM_REGISTER_OP(UpSampling)
+.set_attr<FCompute>("FCompute<gpu>", UpSamplingCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_UpSampling)
+.set_attr<FCompute>("FCompute<gpu>", UpSamplingGradCompute<gpu>);
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h
index ebe19d41bbc4..1510766639bc 100644
--- a/src/operator/tensor/cast_storage-inl.h
+++ b/src/operator/tensor/cast_storage-inl.h
@@ -32,6 +32,9 @@
 #ifdef __CUDACC__
 #include "./cast_storage-inl.cuh"
 #endif  // __CUDACC__
+#if MXNET_USE_MKLDNN == 1
+#include "../nn/mkldnn/mkldnn_base-inl.h"
+#endif
 
 
 namespace mxnet {
@@ -342,8 +345,16 @@ void CastStorageComputeImpl(const OpContext& ctx,
   } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) {
     TBlob ret = output.data();
     CastStorageCsrDnsImpl<xpu>(ctx, input, &ret);
+#if MXNET_USE_MKLDNN == 1
+  } else if (src_stype == kDefaultStorage && dst_stype == kDefaultStorage) {
+    // In this case, one of the arrays must use non-default layout.
+    CHECK(input.IsMKLDNN() || output.IsMKLDNN());
+    auto in_mem = input.GetMKLDNNData();
+    const_cast<NDArray &>(output).CopyFrom(*in_mem);
+    MKLDNNStream::Get()->Submit();
+#endif
   } else {
-    LOG(FATAL) << "Not implemented";
+    LOG(FATAL) << "Not implemented from " << src_stype << " to " << dst_stype;
   }
 }
 
diff --git a/src/operator/tensor/cast_storage.cc b/src/operator/tensor/cast_storage.cc
index 9f257b140f7b..81abcc7dc955 100644
--- a/src/operator/tensor/cast_storage.cc
+++ b/src/operator/tensor/cast_storage.cc
@@ -25,10 +25,50 @@
 #include "./cast_storage-inl.h"
 #include "../elemwise_op_common.h"
 #include "../tensor/elemwise_unary_op.h"
+#include "../nn/mkldnn/mkldnn_base-inl.h"
 
 namespace mxnet {
 namespace op {
 
+#if MXNET_USE_MKLDNN == 1
+
+static inline int get_type_size(int dtype) {
+  MSHADOW_TYPE_SWITCH(dtype, DType, {return sizeof(DType);});
+  return -1;
+}
+
+void CastStorageMKLDnsImpl(const OpContext& ctx, const NDArray& src, const NDArray &dst_arr) {
+  TBlob dns = dst_arr.data();
+  CHECK_EQ(ctx.run_ctx.ctx.dev_mask(), Context::kCPU);
+  CHECK(src.shape() == dns.shape_);
+  if (src.dtype() != dns.type_flag_) {
+    // If the input and output have different data types, we have to convert
+    // the source array into the default layout, cast the data type and copy
+    // data to the destination array.
+    const TBlob &src_blob = src.data();
+    CHECK(src.ctx() == dst_arr.ctx());
+    ndarray::Copy<cpu, cpu>(src.data(), &dns, src.ctx(), dst_arr.ctx(), ctx.run_ctx);
+  } else {
+    // This converts the source data to the default format and write the data to
+    // the destination directly.
+    std::vector<mkldnn::primitive> net;
+    auto src_mkldnn = src.GetMKLDNNData();
+    auto src_pd = src_mkldnn->get_primitive_desc();
+    auto def_format = GetDefaultFormat(src_pd.desc());
+    if (def_format != src_pd.desc().data.format) {
+      auto dst_pd = GetPrimitiveDesc(src_pd, def_format);
+      mkldnn::memory dst_mkldnn(dst_pd, dns.dptr_);
+      net.push_back(mkldnn::reorder(*src_mkldnn, dst_mkldnn));
+      mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+    } else {
+      const TBlob &src_blob = src.data();
+      memcpy(dns.dptr_, src_blob.dptr_, src.shape().Size() * get_type_size(dns.type_flag_));
+    }
+  }
+}
+
+#endif
+
 DMLC_REGISTER_PARAMETER(CastStorageParam);
 NNVM_REGISTER_OP(cast_storage)
 .add_alias("_sparse_cast_storage")
diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc
index d7e5e04ce87a..93b8d4687453 100644
--- a/src/operator/tensor/elemwise_binary_op_basic.cc
+++ b/src/operator/tensor/elemwise_binary_op_basic.cc
@@ -24,11 +24,68 @@
  */
 #include "./elemwise_unary_op.h"
 #include "./elemwise_binary_op-inl.h"
+#include "../nn/mkldnn/mkldnn_ops-inl.h"
+#include "../nn/mkldnn/mkldnn_base-inl.h"
 
 namespace mxnet {
 namespace op {
 
-MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(elemwise_add, op::mshadow_op::plus)
+static void ElemwiseAddEx(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<NDArray>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1U);
+#if MXNET_USE_MKLDNN == 1
+  if (SupportMKLDNN(inputs[0]) && SupportMKLDNN(inputs[1])) {
+    MKLDNNSumForward(attrs, ctx, inputs, req[0], outputs[0]);
+    return;
+  } else if (inputs[0].storage_type() == kDefaultStorage
+             && inputs[1].storage_type() == kDefaultStorage) {
+    // This happens if inputs are supposed to be in MKLDNN format
+    // but MKLDNN doesn't support the data type or the shape. We're
+    // forced to convert it to the default format.
+    std::vector<TBlob> in_blobs(2);
+    std::vector<TBlob> out_blobs(1);
+    in_blobs[0] = inputs[0].data();
+    in_blobs[1] = inputs[1].data();
+    out_blobs[0] = outputs[0].data();
+    ElemwiseBinaryOp::Compute<cpu, op::mshadow_op::plus>(attrs, ctx, in_blobs,
+                                                         req, out_blobs);
+    return;
+  }
+#endif
+  ElemwiseBinaryOp::ComputeEx<cpu, op::mshadow_op::plus>(attrs, ctx, inputs,
+                                                         req, outputs);
+}
+
+static inline bool ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs,
+                                          const int dev_mask,
+                                          DispatchMode* dispatch_mode,
+                                          std::vector<int> *in_attrs,
+                                          std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2);
+  CHECK_EQ(out_attrs->size(), 1);
+  bool ret = ElemwiseStorageType<2, 1, true, true, true>(attrs, dev_mask, dispatch_mode,
+                                                         in_attrs, out_attrs);
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask
+      && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)
+      && out_attrs->at(0) == kDefaultStorage) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  return ret;
+}
+
+MXNET_OPERATOR_REGISTER_BINARY(elemwise_add)
+.set_attr<FInferStorageType>("FInferStorageType", ElemwiseAddStorageType)
+.set_attr<FCompute>("FCompute<cpu>", ElemwiseBinaryOp::Compute<cpu, op::mshadow_op::plus>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", ElemwiseAddEx)
+.set_attr<FResourceRequest>("FResourceRequest",  /* For Sparse CSR */
+                            [](const NodeAttrs& attrs) {
+                            return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};})
 MXNET_ADD_SPARSE_OP_ALIAS(elemwise_add)
 .add_alias("_add").add_alias("_plus").add_alias("_Plus")
 .describe(R"code(Adds arguments element-wise.
@@ -46,6 +103,41 @@ The storage type of ``elemwise_add`` output depends on storage types of inputs
 // this must differ from elemwise_add to prevent add to optimization in forward pass.
 MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_grad_add, op::mshadow_op::plus);
 
+static void _backward_ElemwiseAddEx(const nnvm::NodeAttrs& attrs,
+                                    const OpContext& ctx,
+                                    const std::vector<NDArray>& inputs,
+                                    const std::vector<OpReqType>& req,
+                                    const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 2U);
+#if MXNET_USE_MKLDNN == 1
+  if (inputs[0].IsMKLDNN()) {
+    MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]);
+    MKLDNNCopy(attrs, ctx, inputs[0], req[1], outputs[1]);
+    return;
+  }
+#endif
+  ElemwiseBinaryOp::BackwardUseNoneEx<cpu, mshadow_op::identity, mshadow_op::identity>(
+      attrs, ctx, inputs, req, outputs);
+}
+
+static inline bool _backward_ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs,
+                                                    const int dev_mask,
+                                                    DispatchMode* dispatch_mode,
+                                                    std::vector<int> *in_attrs,
+                                                    std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1);
+  CHECK_EQ(out_attrs->size(), 2);
+  bool ret = ElemwiseStorageType<1, 2, true, true, true>(attrs, dev_mask, dispatch_mode,
+                                                         in_attrs, out_attrs);
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  return ret;
+}
+
 NNVM_REGISTER_OP(_backward_add)
 .set_num_inputs(1)
 .set_num_outputs(2)
@@ -55,13 +147,15 @@ NNVM_REGISTER_OP(_backward_add)
                                   return std::vector<std::pair<int, int> >{{0, 0},
                                                                            {0, 1}};
                                 })
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
 .set_attr<FCompute>("FCompute<cpu>", ElemwiseBinaryOp::BackwardUseNone<
   cpu, mshadow_op::identity, mshadow_op::identity>)
-.set_attr<FComputeEx>("FComputeEx<cpu>",
-                      ElemwiseBinaryOp::BackwardUseNoneEx<cpu, mshadow_op::identity,
-                      mshadow_op::identity>)
-.set_attr<FInferStorageType>("FInferStorageType",
-                             ElemwiseStorageType<1, 2, true, true, true>);
+.set_attr<FComputeEx>("FComputeEx<cpu>", _backward_ElemwiseAddEx)
+.set_attr<FInferStorageType>("FInferStorageType", _backward_ElemwiseAddStorageType);
 
 MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(elemwise_sub, op::mshadow_op::minus)
 MXNET_ADD_SPARSE_OP_ALIAS(elemwise_sub)
diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc
index 9a278d8c97a0..226a3b6fb7d4 100644
--- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc
+++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc
@@ -81,7 +81,7 @@ static bool BinaryScalarStorageType(const nnvm::NodeAttrs& attrs,
   const auto in_stype = in_attrs->at(0);
   auto &out_stype = out_attrs->at(0);
   bool dispatched = false;
-  if (!dispatched && in_stype == kDefaultStorage) {
+  if (!dispatched && (in_stype == kDefaultStorage)) {
     // dns -> dns
     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
                                      dispatch_mode, DispatchMode::kFCompute);
diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc
index 041a0be00796..59a284aea6e1 100644
--- a/src/operator/tensor/elemwise_sum.cc
+++ b/src/operator/tensor/elemwise_sum.cc
@@ -24,6 +24,8 @@
 */
 #include "./elemwise_sum.h"
 #include "../../ndarray/ndarray_function.h"
+#include "../nn/mkldnn/mkldnn_ops-inl.h"
+#include "../../common/utils.h"
 
 namespace mxnet {
 namespace op {
@@ -79,9 +81,28 @@ bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs,
                                            std::vector<int> *out_attrs) {
   CHECK(!in_attrs->empty());
   CHECK_EQ(out_attrs->size(), 1U);
-  return ElemwiseStorageAttr<false, true, false>(attrs, dev_mask, dispatch_mode,
-                                                 in_attrs, out_attrs);
+  bool ret = ElemwiseStorageAttr<false, true, false>(attrs, dev_mask, dispatch_mode,
+                                                     in_attrs, out_attrs);
+#if MXNET_USE_MKLDNN == 1
+  // We should always use FComputeEx.
+  if (dev_mask == mshadow::cpu::kDevMask
+      && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)
+      && out_attrs->at(0) == kDefaultStorage) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  return ret;
+}
+
+#if MXNET_USE_MKLDNN == 1
+static inline bool IsMKLDNN(const std::vector<NDArray> &arrs) {
+  for (auto &arr : arrs) {
+    if (!arr.IsMKLDNN())
+      return false;
+  }
+  return true;
 }
+#endif
 
 void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs,
                                 const OpContext& op_ctx,
@@ -92,13 +113,30 @@ void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
   if (req[0] == kNullOp) return;
-  CHECK_EQ(req[0], kWriteTo) << "ElementWiseSumComputeExCPU only supports req = kWriteTo";
   if (inputs[0].storage_type() == kRowSparseStorage) {
+    CHECK_EQ(req[0], kWriteTo)
+        << "ElementWiseSumComputeExCPU only supports req = kWriteTo";
     mshadow::Stream<cpu>* s = op_ctx.get_stream<cpu>();
     Resource rsc = ResourceManager::Get()->Request(op_ctx.run_ctx.get_ctx(),
         ResourceRequest(ResourceRequest::kTempSpace));
     NDArray out_nd = outputs[0];
     mxnet::ndarray::ElementwiseSum<cpu>(s, rsc, inputs, &out_nd);
+#if MXNET_USE_MKLDNN == 1
+  } else if (IsMKLDNN(inputs)) {
+    MKLDNNSumForward(attrs, op_ctx, inputs, req[0], outputs[0]);
+#endif
+  } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) {
+    // This case happens when we want to create an MKLDNN NDArray but the type
+    // or the shape isn't supported by MKLDNN. In this case, NDArray falls back
+    // to the default storage type and, thus, we have to handle the default
+    // storage in FComputeEx.
+    std::vector<TBlob> in_blobs(inputs.size());
+    std::vector<TBlob> out_blobs(outputs.size());
+    for (size_t i = 0; i < in_blobs.size(); i++)
+      in_blobs[i] = inputs[i].data();
+    for (size_t i = 0; i < out_blobs.size(); i++)
+      out_blobs[i] = outputs[i].data();
+    ElementWiseSumCompute<cpu>(attrs, op_ctx, in_blobs, req, out_blobs);
   } else {
     LOG(FATAL) << "Not implemented: " << operator_string(attrs, op_ctx, inputs, req, outputs);
   }
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc
index 079a33e87548..8b32199f313a 100644
--- a/src/operator/tensor/elemwise_unary_op_basic.cc
+++ b/src/operator/tensor/elemwise_unary_op_basic.cc
@@ -24,6 +24,7 @@
 #include <mxnet/base.h>
 #include "elemwise_unary_op.h"
 #include "./elemwise_binary_op-inl.h"
+#include "../nn/mkldnn/mkldnn_ops-inl.h"
 
 namespace mxnet {
 namespace op {
@@ -108,12 +109,66 @@ MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_backward_sigmoid,
                                                unary_bwd<mshadow_op::sigmoid_grad>);
 
 // copy
+static void CopyEx(const nnvm::NodeAttrs& attrs,
+                   const OpContext& ctx,
+                   const std::vector<NDArray>& inputs,
+                   const std::vector<OpReqType>& req,
+                   const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  const auto in_stype = inputs[0].storage_type();
+  const auto out_stype = outputs[0].storage_type();
+#if MXNET_USE_MKLDNN == 1
+  if (inputs[0].IsMKLDNN()) {
+    MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]);
+    return;
+  } else if (in_stype == kDefaultStorage && out_stype == kDefaultStorage) {
+    // This happens if inputs are supposed to be in MKLDNN format
+    // but MKLDNN doesn't support the data type or the shape. We're
+    // forced to convert it to the default format.
+    std::vector<TBlob> in_blobs(1);
+    std::vector<TBlob> out_blobs(1);
+    in_blobs[0] = inputs[0].data();
+    out_blobs[0] = outputs[0].data();
+    UnaryOp::IdentityCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
+    return;
+  }
+#endif
+  UnaryOp::IdentityComputeEx<cpu>(attrs, ctx, inputs, req, outputs);
+}
+
+static inline bool CopyStorageType(const nnvm::NodeAttrs& attrs,
+                                   const int dev_mask,
+                                   DispatchMode* dispatch_mode,
+                                   std::vector<int> *in_attrs,
+                                   std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1);
+  CHECK_EQ(out_attrs->size(), 1);
+  bool ret = ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode,
+                                                          in_attrs, out_attrs);
+#if MXNET_USE_MKLDNN == 1
+  // We have to make sure all inputs are default layouts. Otherwise, we might
+  // want to fallback.
+  if (dev_mask == mshadow::cpu::kDevMask
+      && in_attrs->at(0) == kDefaultStorage
+      && out_attrs->at(0) == kDefaultStorage) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  return ret;
+}
+
 MXNET_OPERATOR_REGISTER_UNARY(_copy)
 .MXNET_DESCRIBE("Returns a copy of the input.")
 .add_alias("identity")
-.set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>)
+.set_attr<FInferStorageType>("FInferStorageType", CopyStorageType)
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", UnaryOp::IdentityComputeEx<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", CopyEx)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
 .set_attr<nnvm::FInplaceIdentity>("FInplaceIdentity",
   [](const NodeAttrs& attrs){
     return std::vector<bool>{true};
@@ -128,9 +183,14 @@ NNVM_REGISTER_OP(_backward_copy)
   [](const NodeAttrs& attrs){
     return std::vector<std::pair<int, int> >{{0, 0}};
   })
-.set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>)
+.set_attr<FInferStorageType>("FInferStorageType", CopyStorageType)
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", UnaryOp::IdentityComputeEx<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", CopyEx)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
 .set_attr<nnvm::FInplaceIdentity>("FInplaceIdentity",
   [](const NodeAttrs& attrs){
     return std::vector<bool>{true};
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index e8fdce491484..83451db38dd6 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -25,6 +25,7 @@
 // this will be invoked by gcc and compile CPU version
 #include "./matrix_op-inl.h"
 #include "./elemwise_unary_op.h"
+#include "../nn/mkldnn/mkldnn_ops-inl.h"
 
 namespace mxnet {
 namespace op {
@@ -180,6 +181,55 @@ If the argument `reverse` is set to 1, then the special values are inferred from
 .add_argument("data", "NDArray-or-Symbol", "Input data to reshape.")
 .add_arguments(ReshapeParam::__FIELDS__());
 
+static void FlattenEx(const nnvm::NodeAttrs& attrs,
+                      const OpContext& ctx,
+                      const std::vector<NDArray>& inputs,
+                      const std::vector<OpReqType>& req,
+                      const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+#if MXNET_USE_MKLDNN == 1
+  const auto in_stype = inputs[0].storage_type();
+  const auto out_stype = outputs[0].storage_type();
+  if (inputs[0].IsMKLDNN()) {
+    MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]);
+    // If the output is a special MKLDNN layout and the number of dimensions
+    // is larger than 2, we should use the default layout.
+    if (outputs[0].IsMKLDNN() && inputs[0].shape().ndim() > 2)
+      const_cast<NDArray &>(outputs[0]).Reorder2Default();
+    return;
+  } else {
+    // This happens if inputs are supposed to be in MKLDNN format
+    // but MKLDNN doesn't support the data type or the shape. We're
+    // forced to convert it to the default format.
+    std::vector<TBlob> in_blobs(1);
+    std::vector<TBlob> out_blobs(1);
+    in_blobs[0] = inputs[0].data();
+    out_blobs[0] = outputs[0].data();
+    UnaryOp::IdentityCompute<cpu>(attrs, ctx, in_blobs, req, out_blobs);
+    return;
+  }
+#endif
+}
+
+static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs,
+                                   const int dev_mask,
+                                   DispatchMode* dispatch_mode,
+                                   std::vector<int> *in_attrs,
+                                   std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1);
+  CHECK_EQ(out_attrs->size(), 1);
+  bool ret = ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode,
+                                                          in_attrs, out_attrs);
+#if MXNET_USE_MKLDNN == 1
+  if (dev_mask == mshadow::cpu::kDevMask
+      && in_attrs->at(0) == kDefaultStorage
+      && out_attrs->at(0) == kDefaultStorage) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+#endif
+  return ret;
+}
 
 NNVM_REGISTER_OP(Flatten)
 .add_alias("flatten")
@@ -210,8 +260,15 @@ Example::
 .set_num_outputs(1)
 .set_attr<nnvm::FInferShape>("FInferShape", FlattenShape)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<FInferStorageType>("FInferStorageType", FlattenStorageType)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{ "_backward_copy" })
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", FlattenEx)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
 .set_attr<nnvm::FInplaceOption>("FInplaceOption",
   [](const NodeAttrs& attrs) {
     return std::vector<std::pair<int, int> >{{0, 0}};
diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h
index f0dd61f01ac0..fcea4ef74cdf 100644
--- a/src/storage/cpu_device_storage.h
+++ b/src/storage/cpu_device_storage.h
@@ -54,7 +54,11 @@ class CPUDeviceStorage {
   /*!
    * \brief Alignment of allocation.
    */
+#if MXNET_USE_MKLDNN == 1
+  static constexpr size_t alignment_ = 4096;
+#else
   static constexpr size_t alignment_ = 16;
+#endif
 };  // class CPUDeviceStorage
 
 inline void* CPUDeviceStorage::Alloc(size_t size) {
diff --git a/tests/ci_build/Dockerfile.cpu_mkldnn b/tests/ci_build/Dockerfile.cpu_mkldnn
new file mode 100644
index 000000000000..591f11365014
--- /dev/null
+++ b/tests/ci_build/Dockerfile.cpu_mkldnn
@@ -0,0 +1,18 @@
+FROM ubuntu:16.04
+
+COPY install/ubuntu_install_core.sh /install/
+RUN /install/ubuntu_install_core.sh
+COPY install/ubuntu_install_python.sh /install/
+RUN /install/ubuntu_install_python.sh
+COPY install/ubuntu_install_scala.sh /install/
+RUN /install/ubuntu_install_scala.sh
+COPY install/ubuntu_install_r.sh /install/
+RUN /install/ubuntu_install_r.sh
+COPY install/ubuntu_install_perl.sh /install/
+RUN /install/ubuntu_install_perl.sh
+
+# Add MKLML library, compatiable with Ubuntu16.04
+#RUN wget --no-check-certificate -O /tmp/mklml.tgz https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz
+#RUN tar -zxvf /tmp/mklml.tgz && cp -rf mklml_*/* /usr/local/ && rm -rf mklml_*
+
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
diff --git a/tests/ci_build/Dockerfile.gpu_mkldnn b/tests/ci_build/Dockerfile.gpu_mkldnn
new file mode 100644
index 000000000000..0b6bd2e44505
--- /dev/null
+++ b/tests/ci_build/Dockerfile.gpu_mkldnn
@@ -0,0 +1,18 @@
+FROM nvidia/cuda:8.0-cudnn5-devel
+# cuda8.0 has to be used because this is the first ubuntu16.04 container
+# # which is required due to OpenBLAS being incompatible with ubuntu14.04
+# the reason we used a gpu base container because we are going to test MKLDNN
+# operator implementation against GPU implementation
+
+COPY install/ubuntu_install_core.sh /install/
+RUN /install/ubuntu_install_core.sh
+COPY install/ubuntu_install_python.sh /install/
+RUN /install/ubuntu_install_python.sh
+COPY install/ubuntu_install_scala.sh /install/
+RUN /install/ubuntu_install_scala.sh
+
+# Add MKLML library, compatible with Ubuntu16.04
+#RUN wget --no-check-certificate -O /tmp/mklml.tgz https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz
+#RUN tar -zxvf /tmp/mklml.tgz && cp -rf mklml_*/* /usr/local/ && rm -rf mklml_*
+
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
diff --git a/tests/ci_build/install/ubuntu_install_core.sh b/tests/ci_build/install/ubuntu_install_core.sh
index eefd7590cdbc..71721744859a 100755
--- a/tests/ci_build/install/ubuntu_install_core.sh
+++ b/tests/ci_build/install/ubuntu_install_core.sh
@@ -21,7 +21,7 @@
 
 apt-get update && apt-get install -y \
     build-essential git libopenblas-dev liblapack-dev libopencv-dev \
-    libcurl4-openssl-dev libgtest-dev cmake wget unzip sudo  
+    libcurl4-openssl-dev libgtest-dev cmake wget unzip sudo curl
 
 # Link Openblas to Cblas as this link does not exist on ubuntu16.04
 ln -s /usr/lib/libopenblas.so /usr/lib/libcblas.so
diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h
index 6a220bdad6d7..1074d0182198 100644
--- a/tests/cpp/include/test_core_op.h
+++ b/tests/cpp/include/test_core_op.h
@@ -314,7 +314,9 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer<DType>
       // Set up forward
       attrs_ = ParseAttrs(op_, args);
 
-      const int num_inputs = op_->num_inputs;
+      int num_inputs = op_->num_inputs;
+      if (op_->get_num_inputs)
+        num_inputs = op_->get_num_inputs(attrs_);
 
       if (!inputs.empty()) {
         CHECK_EQ(inputs.size(), static_cast<size_t>(num_inputs));
diff --git a/tests/python/unittest/test_executor.py b/tests/python/unittest/test_executor.py
index e3d977df65de..04a0ef3f8600 100644
--- a/tests/python/unittest/test_executor.py
+++ b/tests/python/unittest/test_executor.py
@@ -136,22 +136,20 @@ def test_dot():
 
 def test_reshape():
     x = mx.sym.Variable('x')
-    y = mx.sym.FullyConnected(x, num_hidden=4)
+    y = mx.sym.Dropout(x, p=0.2)
 
     exe = y.simple_bind(mx.cpu(), x=(5,4), grad_req='null')
     exe.arg_arrays[0][:] = 1
-    exe.arg_arrays[1][:] = mx.nd.ones((4,4))
-    exe.arg_arrays[2][:] = 0
 
     new_exe = exe.reshape(x=(3,4))
     new_exe.forward(is_train=False)
     # test sub exec forward
-    assert np.all(new_exe.outputs[0].asnumpy() == 4)
+    assert np.all(new_exe.outputs[0].asnumpy() == 1)
     # test shared memory
-    assert np.all(exe.outputs[0].asnumpy()[:3] == 4)
+    assert np.all(exe.outputs[0].asnumpy()[:3] == 1)
     # test base exec forward
     exe.forward(is_train=False)
-    assert np.all(exe.outputs[0].asnumpy() == 4)
+    assert np.all(exe.outputs[0].asnumpy() == 1)
 
 if __name__ == "__main__":
     test_bind(disable_bulk_exec=False)