[mkldnn-v1.0] Add MKL-DNN Convolution (#16141)
* add mkldnn conv

* revert unnecessary change

* fix failing CPU testcase: test_convolution_independent_gradients

* fix failed testcases: test_reshape_transpose_6d && test_weight_async_reorder

* fix comments

* change variable name from weights to weight in mkldnn_conv
rongzha1 authored and pengzhao-intel committed Sep 18, 2019
1 parent 99b4961 commit 1ff9429
Showing 13 changed files with 349 additions and 434 deletions.
4 changes: 2 additions & 2 deletions include/mxnet/ndarray.h
@@ -761,8 +761,8 @@ class NDArray {
* It changes the layout of this NDArray, but it happens after all accesses to
* the array are complete.
*/
-  void Reorder2DefaultAsync();
-  void MKLDNNDataReorderAsync(const mkldnn::memory::desc &md);
+  void Reorder2DefaultAsync() const;
+  void MKLDNNDataReorderAsync(const mkldnn::memory::desc &md) const;

/*
* This creates a new NDArray with the reordered data.
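The only change to ndarray.h is the const qualifier on the two asynchronous reorder entry points, so operator code can call them on const NDArray references. The methods never modify the handle itself; they only enqueue work that rewrites the shared data chunk. The sketch below is not MXNet code (AsyncQueue and Array are invented stand-ins), but it shows why such a method can be logically const.

#include <algorithm>
#include <functional>
#include <memory>
#include <queue>
#include <vector>

// Invented stand-in for the execution engine: runs queued tasks later.
struct AsyncQueue {
  std::queue<std::function<void()>> tasks;
  void Push(std::function<void()> fn) { tasks.push(std::move(fn)); }
  void Drain() { while (!tasks.empty()) { tasks.front()(); tasks.pop(); } }
};

// Invented stand-in for NDArray: a cheap handle sharing its data buffer.
class Array {
 public:
  explicit Array(std::vector<float> data)
      : chunk_(std::make_shared<std::vector<float>>(std::move(data))) {}
  // Logically const: the handle is untouched; we only schedule a mutation of
  // the shared buffer, which is the reasoning that lets Reorder2DefaultAsync()
  // and MKLDNNDataReorderAsync() become const methods.
  void ReorderAsync(AsyncQueue *q) const {
    std::shared_ptr<std::vector<float>> chunk = chunk_;  // capture the buffer by value
    q->Push([chunk]() { std::reverse(chunk->begin(), chunk->end()); });
  }

 private:
  std::shared_ptr<std::vector<float>> chunk_;
};

int main() {
  AsyncQueue q;
  const Array a({1.f, 2.f, 3.f});
  a.ReorderAsync(&q);  // legal on a const handle
  q.Drain();           // the layout change actually happens here
  return 0;
}

A function that only holds a const reference can still request the asynchronous layout change, which is exactly the calling pattern the operator-side code in the following files relies on.
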
8 changes: 4 additions & 4 deletions src/common/exec_utils.h
@@ -59,15 +59,15 @@ inline bool SetupDefaultBlobsIn(const std::vector<NDArray>& src,
for (size_t i = 0; i < src.size(); i++) {
auto& nd = src[i];
bool is_default = nd.storage_type() == kDefaultStorage;
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
// We have to make sure it's default storage and default layout.
is_default = nd.IsDefaultData();
#endif
if (!is_default) {
(*idx_map)[i] = temp_dst->size();
NDArray temp = bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(),
true, nd.dtype());
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
CHECK(temp.IsDefaultData());
#endif
temp_src->emplace_back(nd);
@@ -91,7 +91,7 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
for (size_t i = 0; i < src.size(); i++) {
auto& nd = src[i];
bool is_default = nd.storage_type() == kDefaultStorage;
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
if (req->at(i) == kWriteInplace && nd.IsMKLDNNData())
// If it's write inplace and the output array doesn't use the default
// layout, we'll generate a temporary output array below, which means
@@ -102,7 +102,7 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
is_default = nd.IsDefaultData();
#endif
if (!is_default) {
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
NDArray temp;
if (bufs != nullptr) {
temp = bufs->at(i);
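SetupDefaultBlobsIn/SetupDefaultBlobsOut only change their guard value here, but the surrounding logic is what makes the fallback path work: any input that is not plain default storage and default layout gets a scratch copy, and idx_map records which scratch slot belongs to which input so the caller can bind the fallback kernel's blobs correctly. A toy sketch of that bookkeeping, using invented names (Tensor, CollectFallbackInputs) rather than MXNet's types:

#include <cstddef>
#include <unordered_map>
#include <vector>

// Invented stand-in for an input that may or may not already be in the plain
// (default) layout a fallback kernel expects.
struct Tensor {
  bool is_default_layout;
  std::vector<float> data;
};

// Mirrors the shape of SetupDefaultBlobsIn: every non-default input gets a
// default-layout scratch copy, and idx_map remembers which copy belongs where.
inline bool CollectFallbackInputs(const std::vector<Tensor> &src,
                                  std::vector<Tensor> *temp_src,
                                  std::vector<Tensor> *temp_dst,
                                  std::unordered_map<std::size_t, std::size_t> *idx_map) {
  bool required = false;
  for (std::size_t i = 0; i < src.size(); ++i) {
    if (!src[i].is_default_layout) {
      required = true;
      (*idx_map)[i] = temp_dst->size();
      temp_src->push_back(src[i]);                      // original layout
      temp_dst->push_back(Tensor{true, src[i].data});   // default-layout copy
    }
  }
  return required;
}

int main() {
  std::vector<Tensor> inputs = {{true, {1.f}}, {false, {2.f}}};
  std::vector<Tensor> temp_src, temp_dst;
  std::unordered_map<std::size_t, std::size_t> idx_map;
  return CollectFallbackInputs(inputs, &temp_src, &temp_dst, &idx_map) ? 0 : 1;
}
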
8 changes: 4 additions & 4 deletions src/executor/attach_op_execs_pass.cc
@@ -116,7 +116,7 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor {
public:
void Run(RunContext rctx, bool is_gpu) override {
op_ctx.run_ctx = rctx;
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
InvalidateOutputs(out_array, req);
#endif
PreFCompute(is_gpu);
@@ -155,7 +155,7 @@ class StatefulComputeExExecutor : public OpExecutor {
public:
void Run(RunContext rctx, bool is_gpu) override {
op_ctx.run_ctx = rctx;
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
InvalidateOutputs(out_array, req);
// TODO(alex): (MXNET-847) Remove this fallback feature after subgraph implemented
const auto is_mkldnn = Op::GetAttr<bool>("TIsMKLDNN");
@@ -202,7 +202,7 @@ class FComputeExecutor : public StorageFallbackOpExecutor {
void Run(RunContext rctx, bool is_gpu) override {
using namespace common;
op_ctx.run_ctx = rctx;
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
InvalidateOutputs(out_array, req);
#endif
PreFCompute(is_gpu);
@@ -231,7 +231,7 @@ class FComputeExExecutor : public OpExecutor {
public:
void Run(RunContext rctx, bool is_gpu) override {
op_ctx.run_ctx = rctx;
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
InvalidateOutputs(out_array, req);
// TODO(alex): (MXNET-847) Remove this fallback feature after subgraph implemented
const auto is_mkldnn = Op::GetAttr<bool>("TIsMKLDNN");
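Every edit in this file is the same one-line change: the guard moves from MXNET_USE_MKLDNN == 1 to == 100, which this mkldnn-v1.0 branch uses to gate the new MKL-DNN 1.0 integration separately from the older 0.x one. A small stand-alone illustration of how such a versioned compile-time flag behaves; only the macro name MXNET_USE_MKLDNN is taken from the diff, the rest is invented:

#include <cstdio>

#ifndef MXNET_USE_MKLDNN
#define MXNET_USE_MKLDNN 100   // e.g. set by the build system for the MKL-DNN 1.0 build
#endif

int main() {
#if MXNET_USE_MKLDNN == 100
  std::printf("compiled against the MKL-DNN 1.0 code path\n");
#elif MXNET_USE_MKLDNN == 1
  std::printf("compiled against the legacy MKL-DNN 0.x code path\n");
#else
  std::printf("MKL-DNN support disabled\n");
#endif
  return 0;
}
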
20 changes: 15 additions & 5 deletions src/imperative/imperative_utils.h
@@ -418,7 +418,7 @@ inline void PushFCompute(const FCompute& fn,
std::vector<NDArray> pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src;
// mapping from index in input_blobs to index in pre_temp_dst
std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
if (exec_type != ExecType::kCrossDeviceCopy) {
// kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in
// its FCcomputeEx, but AsyncPush the copy operation to engine.
@@ -467,7 +467,7 @@ inline void PushFComputeEx(const FComputeEx& fn,
DerefInputOutput(p_inputs, p_outputs, &inputs, &outputs);
const auto& run = [=](RunContext rctx) {
OpContext opctx{need_grad, is_train, rctx, engine::CallbackOnComplete(), requested};
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
if (exec_type != ExecType::kCrossDeviceCopy) {
// kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in
// its FCcomputeEx, but AsyncPush the copy operation to engine.
@@ -476,8 +476,18 @@
// copying A to B may not happen, and will corrupt A's memory.
InvalidateOutputs(outputs, req);
}
+    // add for mkldnn OP + no mkldnn OP
+    const auto is_mkldnn = Op::GetAttr<bool>("TIsMKLDNN");
+    if (!is_mkldnn.get(attrs.op, false)) {
+      std::vector<NDArray> inputs_fallback;
+      CreateDefaultInputs(inputs, &inputs_fallback);
+      fn(attrs, opctx, inputs_fallback, req, outputs);
+    } else {
 #endif
-    fn(attrs, opctx, inputs, req, outputs);
+      fn(attrs, opctx, inputs, req, outputs);
+#if MXNET_USE_MKLDNN == 100
+    }
+#endif
if (ctx.dev_mask() == gpu::kDevMask && exec_type == ExecType::kSync && !rctx.is_bulk) {
rctx.get_stream<gpu>()->Wait();
}
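The block added to PushFComputeEx above makes layout safety explicit: an operator that is not registered with TIsMKLDNN never sees MKL-DNN-formatted inputs, because CreateDefaultInputs substitutes default-layout copies before fn runs. A standalone sketch of that dispatch shape, with invented types (an MKL-DNN layout is modelled as a plain bool flag):

#include <string>
#include <unordered_map>
#include <vector>

// Invented stand-in for an input array: MKL-DNN layout modelled as a bool.
struct Arr {
  bool mkldnn_layout;
  std::vector<float> data;
};

// Stand-in for Op::GetAttr<bool>("TIsMKLDNN"): which ops understand MKL-DNN layouts.
inline bool OpIsMKLDNN(const std::string &op,
                       const std::unordered_map<std::string, bool> &registry) {
  auto it = registry.find(op);
  return it != registry.end() && it->second;
}

// Mirrors the new fallback: an op without the attribute never sees MKL-DNN inputs.
template <typename Fn>
void Dispatch(const std::string &op,
              const std::unordered_map<std::string, bool> &registry,
              const std::vector<Arr> &inputs, Fn fn) {
  if (!OpIsMKLDNN(op, registry)) {
    std::vector<Arr> fallback;
    fallback.reserve(inputs.size());
    for (const auto &in : inputs)   // "reorder" MKL-DNN inputs to the default layout
      fallback.push_back(in.mkldnn_layout ? Arr{false, in.data} : in);
    fn(fallback);
  } else {
    fn(inputs);
  }
}

int main() {
  const std::unordered_map<std::string, bool> registry = {{"Convolution", true}};
  const std::vector<Arr> inputs = {{true, {1.f, 2.f}}};
  Dispatch("Convolution", registry, inputs, [](const std::vector<Arr> &) {});  // MKL-DNN path
  Dispatch("SomeOtherOp", registry, inputs, [](const std::vector<Arr> &) {});  // fallback path
  return 0;
}
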
@@ -521,7 +531,7 @@ inline void PushOperator(const OpStatePtr& state,
const auto& run = [=](RunContext rctx,
engine::CallbackOnComplete on_complete) {
OpContext opctx{need_grad, is_train, rctx, on_complete, requested};
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
if (exec_type != ExecType::kCrossDeviceCopy) {
// kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in
// its FCcomputeEx, but AsyncPush the copy operation to engine.
@@ -567,7 +577,7 @@ inline void PushOperator(const OpStatePtr& state,
std::vector<NDArray> pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src;
// mapping from index in input_blobs to index in pre_temp_dst
std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
if (exec_type != ExecType::kCrossDeviceCopy) {
// kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in
// its FCcomputeEx, but AsyncPush the copy operation to engine.
16 changes: 11 additions & 5 deletions src/ndarray/ndarray.cc
@@ -474,7 +474,7 @@ void NDArray::Chunk::SetMKLMem(const mxnet::TShape &shape, int dtype) {

mkldnn::memory::dims dims;
// These are shapes supprted by MKLDNN.
-  if (shape.ndim() >= 1 && shape.ndim() <= 5) {
+  if (shape.ndim() >= 1 && shape.ndim() <= 6) {
dims.resize(shape.ndim());
for (size_t i = 0; i < dims.size(); i++)
dims[i] = shape[i];
@@ -488,6 +488,7 @@ void NDArray::Chunk::SetMKLMem(const mxnet::TShape &shape, int dtype) {
case 3: layout = mkldnn::memory::format_tag::abc; break;
case 4: layout = mkldnn::memory::format_tag::abcd; break;
case 5: layout = mkldnn::memory::format_tag::abcde; break;
+      case 6: layout = mkldnn::memory::format_tag::abcdef; break;
default:
LOG(FATAL) << "Not implemented dimension (" << dims.size() << ") for MKLDNN";
}
@@ -592,7 +593,7 @@ NDArray NDArray::Reorder2Default() const {
return ret;
}

-void NDArray::Reorder2DefaultAsync() {
+void NDArray::Reorder2DefaultAsync() const {
std::vector<Engine::VarHandle> const_vars;
std::vector<Engine::VarHandle> mutable_vars(1, this->var());
NDArray tmp = *this;
@@ -604,13 +605,18 @@ void NDArray::Reorder2DefaultAsync() {
FnProperty::kNormal, 0, "Reorder2Default");
}

-void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc &desc) {
+void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc &desc) const {
std::vector<Engine::VarHandle> const_vars;
std::vector<Engine::VarHandle> mutable_vars(1, this->var());
NDArray tmp = *this;
+  const auto version = this->version();
Engine::Get()->PushAsync(
-      [tmp, desc](RunContext ctx, Engine::CallbackOnComplete on_complete) {
-        tmp.ptr_->MKLDNNDataReorder(desc);
+      [tmp, version, desc](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+        // MXNet will try to reuse NDArray from memory planning, so we need to ensure
+        // the NDArray is still holding the original trunk data.
+        if (tmp.version() == version) {
+          tmp.ptr_->MKLDNNDataReorder(desc);
+        }
on_complete();
}, ctx(), const_vars, mutable_vars,
FnProperty::kNormal, 0, "Reorder");
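Besides the const qualifiers and the new 6-D format tag, the key change in ndarray.cc is the version guard inside MKLDNNDataReorderAsync: the array's version is captured when the reorder is queued, and the callback skips the reorder if memory planning has reused the chunk in the meantime. A minimal standalone sketch of the same guard; Chunk and Handle are invented, and MXNet's engine and version counter differ in detail:

#include <algorithm>
#include <atomic>
#include <functional>
#include <memory>
#include <vector>

struct Chunk {
  std::vector<float> data;
  std::atomic<int> version{0};   // bumped whenever the buffer is reassigned or reused
};

struct Handle {
  std::shared_ptr<Chunk> chunk;

  // Queue a layout change that only runs if the chunk was not recycled meanwhile.
  std::function<void()> MakeGuardedReorder() const {
    std::shared_ptr<Chunk> c = chunk;
    const int expected = c->version.load();
    return [c, expected]() {
      if (c->version.load() == expected) {
        // Still the same trunk data that was current when the work was queued.
        std::reverse(c->data.begin(), c->data.end());
      }
      // Otherwise the buffer now belongs to someone else: do nothing.
    };
  }
};

int main() {
  Handle h{std::make_shared<Chunk>()};
  h.chunk->data = {1.f, 2.f, 3.f};
  auto reorder = h.MakeGuardedReorder();
  h.chunk->version++;   // simulate the memory planner recycling the buffer
  reorder();            // the guard fires and the stale reorder is skipped
  return 0;
}
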
28 changes: 19 additions & 9 deletions src/operator/nn/convolution.cc
@@ -30,7 +30,7 @@
#if MXNET_USE_NNPACK == 1
#include "../nnpack/nnpack_pooling-inl.h"
#endif // MXNET_USE_NNPACK
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
#include "./mkldnn/mkldnn_base-inl.h"
#include "./mkldnn/mkldnn_ops-inl.h"
#endif // MXNET_USE_MKLDNN
@@ -51,7 +51,7 @@ static inline std::vector<std::string> ListArguments(const ConvolutionParam& par
}
}

-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
static void ConvolutionComputeExCPU(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<NDArray>& inputs,
@@ -60,7 +60,12 @@ static void ConvolutionComputeExCPU(const nnvm::NodeAttrs& attrs,
const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
if (SupportMKLDNNConv(params, inputs[0])) {
MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-    MKLDNNConvolutionForward(attrs, ctx, inputs, req, outputs);
+    if (CheckMKLDNNInputArrayIsView(inputs)) {
+      const auto mkldnn_inputs = GetMKLDNNInputArray(inputs);
+      MKLDNNConvolutionForward(attrs, ctx, mkldnn_inputs, req, outputs);
+    } else {
+      MKLDNNConvolutionForward(attrs, ctx, inputs, req, outputs);
+    }
MKLDNN_OPCHECK_RUN(ConvolutionCompute<cpu>, attrs, ctx, inputs, req, outputs);
return;
}
@@ -75,7 +80,12 @@ static void ConvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs,
const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
if (SupportMKLDNNConv(params, inputs[0])) {
MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
-    MKLDNNConvolutionBackward(attrs, ctx, inputs, req, outputs);
+    if (CheckMKLDNNInputArrayIsView(inputs)) {
+      const auto mkldnn_inputs = GetMKLDNNInputArray(inputs);
+      MKLDNNConvolutionBackward(attrs, ctx, mkldnn_inputs, req, outputs);
+    } else {
+      MKLDNNConvolutionBackward(attrs, ctx, inputs, req, outputs);
+    }
MKLDNN_OPCHECK_RUN(ConvolutionGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
return;
}
@@ -302,7 +312,7 @@ static bool ConvolutionType(const nnvm::NodeAttrs& attrs,
return true;
}

-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
DispatchMode* dispatch_mode,
@@ -491,11 +501,11 @@ There are other options to tune the performance.
})
.set_attr<mxnet::FInferShape>("FInferShape", ConvolutionShape)
.set_attr<nnvm::FInferType>("FInferType", ConvolutionType)
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
.set_attr<FInferStorageType>("FInferStorageType", ConvStorageType)
#endif
.set_attr<FCompute>("FCompute<cpu>", ConvolutionCompute<cpu>)
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
.set_attr<bool>("TIsMKLDNN", true)
.set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionComputeExCPU)
#endif
@@ -514,14 +524,14 @@ NNVM_REGISTER_OP(_backward_Convolution)
return params.no_bias ? 2 : 3;
})
.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
.set_attr<FInferStorageType>("FInferStorageType", BackwardConvStorageType)
#endif
.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
})
.set_attr_parser(ConvolutionParamParser)
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
.set_attr<bool>("TIsMKLDNN", true)
.set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionGradComputeExCPU)
#endif
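Both new convolution entry points first ask whether any input is simultaneously a view and MKL-DNN-formatted; such inputs share storage with a parent array, and the primitives here want plain contiguous default-layout buffers, so GetMKLDNNInputArray substitutes Reorder2Default copies for just those inputs. A self-contained sketch of that filter-and-copy idea, with invented types (Buffer, NormalizeInputs) rather than MXNet's NDArray:

#include <cstddef>
#include <vector>

// Invented stand-in: a "view" shares a buffer with a parent array and starts at an offset.
struct Buffer {
  std::vector<float> storage;
  std::size_t offset = 0;        // nonzero means this handle is a view
  bool mkldnn_layout = false;    // data kept in an MKL-DNN blocked layout
};

inline bool NeedsCopy(const Buffer &b) { return b.offset != 0 && b.mkldnn_layout; }

// Mirror of the new helpers: ordinary inputs pass through untouched, the awkward
// ones are copied out into their own contiguous default-layout buffers.
inline std::vector<Buffer> NormalizeInputs(const std::vector<Buffer> &inputs) {
  std::vector<Buffer> ret;
  ret.reserve(inputs.size());
  for (const auto &in : inputs) {
    if (NeedsCopy(in)) {
      Buffer copy;
      copy.storage.assign(in.storage.begin() + static_cast<std::ptrdiff_t>(in.offset),
                          in.storage.end());
      ret.push_back(copy);       // offset == 0 and default layout: safe for any kernel
    } else {
      ret.push_back(in);
    }
  }
  return ret;
}

int main() {
  std::vector<Buffer> inputs(1);
  inputs[0].storage = {0.f, 1.f, 2.f, 3.f};
  inputs[0].offset = 2;
  inputs[0].mkldnn_layout = true;
  return NormalizeInputs(inputs)[0].offset == 0 ? 0 : 1;
}
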
22 changes: 22 additions & 0 deletions src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -277,6 +277,28 @@ inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
}
}

+inline static bool CheckMKLDNNInputArrayIsView(const std::vector<NDArray> &inputs) {
+  for (const auto &in : inputs) {
+    if (in.IsView() && in.IsMKLDNNData()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+inline static const std::vector<NDArray> GetMKLDNNInputArray(const std::vector<NDArray> &inputs) {
+  std::vector<NDArray> ret;
+  ret.reserve(inputs.size());
+  for (const auto &in : inputs) {
+    if (in.IsView() && in.IsMKLDNNData()) {
+      ret.push_back(in.Reorder2Default());
+    } else {
+      ret.push_back(in);
+    }
+  }
+  return ret;
+}
+
typedef std::shared_ptr<mkldnn::memory> mkldnn_mem_ptr;
typedef std::shared_ptr<const mkldnn::memory> mkldnn_mem_const_ptr;

3 changes: 2 additions & 1 deletion src/operator/nn/mkldnn/mkldnn_base.cc
@@ -312,6 +312,7 @@ mkldnn_format_tag_t GetDefaultFormat(int num_dims) {
case 3: return mkldnn_abc;
case 4: return mkldnn_abcd;
case 5: return mkldnn_abcde;
+    case 6: return mkldnn_abcdef;
default:
LOG(FATAL) << "Not implemented dimension (" << num_dims << ") for MKLDNN";
return mkldnn_format_tag_undef;
@@ -530,7 +531,7 @@ bool MKLDNNStorageType(const nnvm::NodeAttrs &attrs,
if (v == - 1) v = kDefaultStorage;

DispatchMode wanted_mode;
-#if MXNET_USE_MKLDNN == 1
+#if MXNET_USE_MKLDNN == 100
if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet())
wanted_mode = DispatchMode::kFComputeFallback;
else if (dev_mask == mshadow::cpu::kDevMask && support_mkldnn)
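GetDefaultFormat now also covers rank-6 tensors, matching the 6-D shape support added to SetMKLMem in ndarray.cc and the test_reshape_transpose_6d case mentioned in the commit message. A toy analogue of the mapping, with an invented name (DefaultTagName) standing in for the real function:

#include <cstdio>
#include <string>

// Map tensor rank to the plain row-major MKL-DNN tag name, now including rank 6.
inline std::string DefaultTagName(int num_dims) {
  switch (num_dims) {
    case 1: return "a";
    case 2: return "ab";
    case 3: return "abc";
    case 4: return "abcd";
    case 5: return "abcde";
    case 6: return "abcdef";
    default: return "undef";   // anything else stays unsupported, as in the diff above
  }
}

int main() {
  std::printf("6-D default tag: %s\n", DefaultTagName(6).c_str());
  return 0;
}
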