[MKLDNN] mkldnn RNN operator enhancement (#17075)
* mkldnn rnn operator enhancement

`add` operation support

Rename AddTo

Add MXNET_USE_MKLDNN_RNN env

Add Env var for switching to naive RNN impl and naive add/copy impl

* Re-run CI, op:test_reduce failed on Unix-CPU

* Rerun CI, Python2 CPU on Unix-CPU timeout
zixuanweeei authored and pengzhao-intel committed Dec 16, 2019
1 parent 843daf5 commit 897f4fa
Showing 6 changed files with 264 additions and 146 deletions.
12 changes: 8 additions & 4 deletions docs/static_site/src/pages/api/faq/env_var.md
@@ -289,11 +289,11 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.
 If no such algorithm exists given other constraints, MXNet will error out. This variable affects the choice
 of CUDNN convolution algorithms. Please see [CUDNN developer guide](https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html) for more details.
 
-* MXNET_CPU_PARALLEL_COPY_SIZE
+* MXNET_CPU_PARALLEL_SIZE
   - Values: Int ```(default=200000)```
-  - The minimum size to call parallel copy by OpenMP in CPU2CPU mode.
-  - When the array size is bigger than or equal to this threshold, NDArray::Copy(from, to) is implemented by OpenMP with the Recommended OMP Thread Count.
-  - When the array size is less than this threshold, NDArray::Copy(from , to)) is implemented by memcpy in single thread.
+  - The minimum size to call parallel operations by OpenMP for CPU context.
+  - When the array size is bigger than or equal to this threshold, the operation implemented by OpenMP is executed with the Recommended OMP Thread Count.
+  - When the array size is less than this threshold, the operation is implemented naively in single thread.
 
 * MXNET_OPTIMIZER_AGGREGATION_SIZE
   - Values: Int ```(default=4)```
@@ -349,6 +349,10 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.
   - Values: 0(false) or 1(true) ```(default=1)```
   - If this variable is set, MXNet will simplify the computation graph, eliminating duplicated operations on the same inputs.
 
+* MXNET_USE_MKLDNN_RNN
+  - Values: 0(false) or 1(true) ```(default=1)```
+  - This variable controls whether to use the MKL-DNN backend in the fused RNN operator for CPU context. There are two fused implementations of the RNN operator in MXNet. The MKL-DNN implementation has better performance than the naive one, but the latter is currently more stable in the backward operation.
+
 Settings for Minimum Memory Usage
 ---------------------------------
 - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1```
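Both switches documented in this file are read at runtime through `dmlc::GetEnv`. The sketch below is only an illustration of that pattern: the helper name `UseMkldnnRnnFusion` is hypothetical, and the operator's real dispatch code is not part of this diff.

```c++
#include <dmlc/parameter.h>  // dmlc::GetEnv, as used in src/common/utils.h

// Hypothetical helper: read MXNET_USE_MKLDNN_RNN once and cache the result.
// A non-zero value (the default) selects the MKL-DNN fused RNN path; zero
// falls back to the naive fused implementation.
inline bool UseMkldnnRnnFusion() {
  static const int use_mkldnn = dmlc::GetEnv("MXNET_USE_MKLDNN_RNN", 1);
  return use_mkldnn != 0;
}
```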
20 changes: 19 additions & 1 deletion src/common/utils.h
@@ -769,7 +769,7 @@ inline void EmplaceBackZeros(const NDArrayStorageType stype, const mxnet::TShape
  */
 template<typename DType>
 inline void ParallelCopy(DType* dst, const DType* src, index_t size) {
-  static index_t copy_block_size = dmlc::GetEnv("MXNET_CPU_PARALLEL_COPY_SIZE", 200000);
+  static index_t copy_block_size = dmlc::GetEnv("MXNET_CPU_PARALLEL_SIZE", 200000);
   if (size >= copy_block_size) {
 #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
     for (index_t i = 0; i < size; ++i) {
@@ -780,6 +780,24 @@ inline void ParallelCopy(DType* dst, const DType* src, index_t size) {
   }
 }
 
+/*!
+ * \brief parallelize add by OpenMP
+ */
+template<typename DType>
+inline void ParallelAdd(DType* dst, const DType* src, index_t size) {
+  static index_t add_block_size = dmlc::GetEnv("MXNET_CPU_PARALLEL_SIZE", 200000);
+  if (size >= add_block_size) {
+#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
+    for (index_t i = 0; i < size; ++i) {
+      dst[i] += src[i];
+    }
+  } else {
+    for (index_t i = 0; i < size; ++i) {
+      dst[i] += src[i];
+    }
+  }
+}
+
 /*!
  * \brief If numpy compatibility is turned off (default), the shapes passed in
  * by users follow the legacy shape definition:
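For orientation, the new `ParallelAdd` mirrors `ParallelCopy` but accumulates instead of overwriting, switching between the OpenMP loop and the plain loop at the `MXNET_CPU_PARALLEL_SIZE` threshold. A minimal usage sketch, assuming the helpers sit in `mxnet::common` like the other utilities in this header; `AccumulateGradient` is an illustrative caller, not part of the commit.

```c++
#include <vector>
#include "src/common/utils.h"  // ParallelAdd; path relative to the MXNet source tree

void AccumulateGradient(std::vector<float>* acc, const std::vector<float>& grad) {
  // dst[i] += src[i]; parallelized with OpenMP once the size reaches
  // MXNET_CPU_PARALLEL_SIZE (default 200000), otherwise a plain loop.
  mxnet::common::ParallelAdd(acc->data(), grad.data(),
                             static_cast<mxnet::index_t>(acc->size()));
}
```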
35 changes: 18 additions & 17 deletions src/operator/nn/mkldnn/mkldnn_rnn-inl.h
@@ -120,33 +120,32 @@ class RnnPrimitive {
   template<typename rnn_fwd, typename... Args>
   static RnnPrimitive Create(Args&&... args) {
     RnnPrimitive rnn_fwd_prim;
-    rnn_fwd_prim.pd_.reset(
-        new typename rnn_fwd::desc(std::forward<Args>(args)...),
-        [](typename rnn_fwd::desc* pd) {
-          delete reinterpret_cast<typename rnn_fwd::desc*>(pd);
+    auto fwd_desc = typename rnn_fwd::desc(std::forward<Args>(args)...);
+    rnn_fwd_prim.fwd_pd_.reset(
+        new typename rnn_fwd::primitive_desc(fwd_desc, CpuEngine::Get()->get_engine()),
+        [](typename rnn_fwd::primitive_desc* pd) {
+          delete reinterpret_cast<typename rnn_fwd::primitive_desc*>(pd);
         });
-    const typename rnn_fwd::desc& fwd_desc =
-        *(reinterpret_cast<typename rnn_fwd::desc*>(rnn_fwd_prim.pd_.get()));
-    typename rnn_fwd::primitive_desc fwd_pd(fwd_desc, CpuEngine::Get()->get_engine());
-    rnn_fwd_prim.weights_layer_desc_ = fwd_pd.weights_layer_desc();
-    rnn_fwd_prim.weights_iter_desc_ = fwd_pd.weights_iter_desc();
-    rnn_fwd_prim.workspace_desc_ = fwd_pd.workspace_desc();
+    auto fwd_pd = reinterpret_cast<typename rnn_fwd::primitive_desc*>(rnn_fwd_prim.fwd_pd_.get());
+    rnn_fwd_prim.weights_layer_desc_ = fwd_pd->weights_layer_desc();
+    rnn_fwd_prim.weights_iter_desc_ = fwd_pd->weights_iter_desc();
+    rnn_fwd_prim.workspace_desc_ = fwd_pd->workspace_desc();
 
-    rnn_fwd_prim.primitive_ = std::shared_ptr<mkldnn::primitive>(new rnn_fwd(fwd_pd));
+    rnn_fwd_prim.primitive_ = std::shared_ptr<mkldnn::primitive>(new rnn_fwd(*fwd_pd));
 
     return rnn_fwd_prim;
   }
 
   RnnPrimitive() {
-    this->pd_ = nullptr;
+    this->fwd_pd_ = nullptr;
     this->primitive_ = nullptr;
     this->weights_layer_desc_ = mkldnn::memory::desc();
     this->weights_iter_desc_ = mkldnn::memory::desc();
     this->workspace_desc_ = mkldnn::memory::desc();
   }
 
   RnnPrimitive(const RnnPrimitive& rnn_fwd_prim) {
-    this->pd_ = rnn_fwd_prim.pd_;
+    this->fwd_pd_ = rnn_fwd_prim.fwd_pd_;
     this->primitive_ = rnn_fwd_prim.primitive_;
     this->weights_layer_desc_ = rnn_fwd_prim.weights_layer_desc_;
     this->weights_iter_desc_ = rnn_fwd_prim.weights_iter_desc_;
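The rework above keeps the forward `primitive_desc` alive in a type-erased `std::shared_ptr<void>` whose deleter knows the concrete type, instead of storing the op descriptor and rebuilding the primitive descriptor from it. A standalone sketch of that ownership pattern, with a made-up `FakePrimitiveDesc` standing in for `rnn_fwd::primitive_desc`:

```c++
#include <iostream>
#include <memory>

// Stand-in for mkldnn's rnn_fwd::primitive_desc; the concrete type is
// irrelevant to the ownership pattern being shown here.
struct FakePrimitiveDesc {
  int num_gates;
};

int main() {
  // Type-erased owner: the void pointer hides the type, the deleter restores
  // it, so destruction stays correct even though callers only see void*.
  std::shared_ptr<void> fwd_pd(
      new FakePrimitiveDesc{4},
      [](void* pd) { delete static_cast<FakePrimitiveDesc*>(pd); });

  // Code that knows the concrete type casts back before use, just as
  // RnnPrimitive::Create does with fwd_pd_.get().
  auto* pd = static_cast<FakePrimitiveDesc*>(fwd_pd.get());
  std::cout << "gates: " << pd->num_gates << "\n";
  return 0;
}
```

Erasing the type this way lets `GetPrimDesc()` keep returning a plain `const void*` while ownership and correct destruction remain with the shared pointer.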
@@ -155,7 +154,7 @@ class RnnPrimitive {
 
   RnnPrimitive& operator=(const RnnPrimitive& rnn_fwd_prim) {
     if (this != &rnn_fwd_prim) {
-      this->pd_ = rnn_fwd_prim.pd_;
+      this->fwd_pd_ = rnn_fwd_prim.fwd_pd_;
       this->primitive_ = rnn_fwd_prim.primitive_;
       this->weights_layer_desc_ = rnn_fwd_prim.weights_layer_desc_;
       this->weights_iter_desc_ = rnn_fwd_prim.weights_iter_desc_;
@@ -165,7 +164,7 @@ class RnnPrimitive {
     return *this;
   }
 
-  const void* GetPrimDesc() const { return pd_.get(); }
+  const void* GetPrimDesc() const { return fwd_pd_.get(); }
   const mkldnn::primitive& GetPrim() const { return *primitive_; }
 
   const mkldnn::memory::desc& GetLayerDesc() const {
@@ -181,7 +180,7 @@ }
   }
 
  private:
-  std::shared_ptr<void> pd_;
+  std::shared_ptr<void> fwd_pd_;
   std::shared_ptr<mkldnn::primitive> primitive_;
   mkldnn::memory::desc weights_layer_desc_;
   mkldnn::memory::desc weights_iter_desc_;
@@ -370,7 +369,9 @@ class MKLDNNRnnBackward {
   void SetDataGradsMem(void* diff_src, void* diff_state, void* diff_statecell,
                        void* diff_out, void* diff_state_out, void* diff_statecell_out,
                        const int dtype = mshadow::kFloat32);
-  void CommitWeightsDiff(void* diff_weights, void* diff_bias, const int dtype = mshadow::kFloat32);
+  void CommitWeightsDiff(void* diff_weights, void* diff_bias,
+                         const OpReqType req,
+                         const int dtype = mshadow::kFloat32);
 
   const mkldnn::primitive& GetBwd() const { return *bwd_.primitive_; }
   const mkldnn_args_map_t& GetArgsMap() const { return net_args_; }
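The extra `req` argument on `CommitWeightsDiff` ties this header change to the `add` operation support named in the commit message: the write request decides whether weight gradients are copied into or accumulated onto the destination buffers. Below is a hedged sketch of that dispatch using the `ParallelCopy`/`ParallelAdd` helpers from `src/common/utils.h`; `CommitDiff` is an illustrative free function, not the member's actual body.

```c++
#include <mxnet/op_attr_types.h>   // OpReqType: kNullOp, kWriteTo, kWriteInplace, kAddTo
#include "src/common/utils.h"      // ParallelCopy / ParallelAdd; path relative to the source tree

// Illustrative only: commit a gradient buffer according to the request type.
template <typename DType>
void CommitDiff(DType* dst, const DType* src, mxnet::index_t size,
                mxnet::OpReqType req) {
  switch (req) {
    case mxnet::kWriteTo:
    case mxnet::kWriteInplace:
      mxnet::common::ParallelCopy(dst, src, size);  // overwrite destination
      break;
    case mxnet::kAddTo:
      mxnet::common::ParallelAdd(dst, src, size);   // accumulate into destination
      break;
    case mxnet::kNullOp:
    default:
      break;                                        // nothing requested
  }
}
```

With `kAddTo`, a caller can request accumulation directly into an existing gradient buffer instead of copying through an intermediate one.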
