diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
index 1415c0b45140..9cf1b71880a8 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -85,6 +85,11 @@ mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
     attr.set_int_output_round_mode(round_nearest);
   }
 
+  // MKL-DNN introduced padded formats since 0.15, which require more memory
+  // for computation than the actual tensor size. Currently, MKL-DNN operators
+  // still reuse the memory from memory planning, and that memory may be
+  // smaller than what the MKL-DNN kernels require. So here we need to select
+  // a suboptimal kernel for computation according to the tensor sizes.
   if (param.conv_param.dilate.ndim() == 0 && bias == nullptr) {
     mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
         data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero);
@@ -155,6 +160,12 @@ static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData(
   mkldnn::memory::dims padding{0, 0};
   padding[0] = param.pad[0];
   padding[1] = param.pad[1];
+
+  // MKL-DNN introduced padded formats since 0.15, which require more memory
+  // for computation than the actual tensor size. Currently, MKL-DNN operators
+  // still reuse the memory from memory planning, and that memory may be
+  // smaller than what the MKL-DNN kernels require. So here we need to select
+  // a suboptimal kernel for computation according to the tensor sizes.
   if (param.dilate.ndim() == 0) {
     mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
         data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero);
@@ -199,6 +210,12 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights(
   mkldnn::memory::dims padding{0, 0};
   padding[0] = param.pad[0];
   padding[1] = param.pad[1];
+
+  // MKL-DNN introduced padded formats since 0.15, which require more memory
+  // for computation than the actual tensor size. Currently, MKL-DNN operators
+  // still reuse the memory from memory planning, and that memory may be
+  // smaller than what the MKL-DNN kernels require. So here we need to select
+  // a suboptimal kernel for computation according to the tensor sizes.
   if (param.dilate.ndim() == 0 && bias == nullptr) {
     mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
         data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero);
@@ -229,26 +246,26 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights(
       mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
           data_md, weight_md, out_md, strides, dilates, padding, padding, mkldnn::padding_kind::zero);
-    auto conv_pd = mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
-    while (conv_pd.diff_dst_primitive_desc().get_size() != GetArraySize(output) ||
-           conv_pd.src_primitive_desc().get_size() != GetArraySize(data) ||
-           conv_pd.diff_weights_primitive_desc().get_size() != GetArraySize(weights)) {
-      CHECK(conv_pd.next_impl()) << "No implementation";
-    }
-    return conv_pd;
+      auto conv_pd = mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
+      while (conv_pd.diff_dst_primitive_desc().get_size() != GetArraySize(output) ||
+             conv_pd.src_primitive_desc().get_size() != GetArraySize(data) ||
+             conv_pd.diff_weights_primitive_desc().get_size() != GetArraySize(weights)) {
+        CHECK(conv_pd.next_impl()) << "No implementation";
+      }
+      return conv_pd;
     } else {
       auto bias_md = GetMemDesc(*bias);
       mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
           data_md, weight_md, bias_md, out_md, strides, dilates, padding, padding, mkldnn::padding_kind::zero);
-    auto conv_pd = mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
-    while (conv_pd.diff_dst_primitive_desc().get_size() != GetArraySize(output) ||
-           conv_pd.src_primitive_desc().get_size() != GetArraySize(data) ||
-           conv_pd.diff_weights_primitive_desc().get_size() != GetArraySize(weights)) {
-      CHECK(conv_pd.next_impl()) << "No implementation";
-    }
-    return conv_pd;
+      auto conv_pd = mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd);
+      while (conv_pd.diff_dst_primitive_desc().get_size() != GetArraySize(output) ||
+             conv_pd.src_primitive_desc().get_size() != GetArraySize(data) ||
+             conv_pd.diff_weights_primitive_desc().get_size() != GetArraySize(weights)) {
+        CHECK(conv_pd.next_impl()) << "No implementation";
+      }
+      return conv_pd;
     }
   }
 }
diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
index 87e89646d7fe..93032f7c92d6 100644
--- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
@@ -52,6 +52,11 @@ static mkldnn::convolution_forward::primitive_desc GetDeconvBwd_(
     bool has_bias, const mkldnn::memory::desc &out_md,
     const mkldnn::engine &engine, const mkldnn::memory::dims &strides,
     const mkldnn::memory::dims &padding, const mkldnn::memory::dims &dilates) {
+  // MKL-DNN introduced padded formats since 0.15, which require more memory
+  // for computation than the actual tensor size. Currently, MKL-DNN operators
+  // still reuse the memory from memory planning, and that memory may be
+  // smaller than what the MKL-DNN kernels require. So here we need to select
+  // a suboptimal kernel for computation according to the tensor sizes.
   if (!has_bias) {
     mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training,
         mkldnn::algorithm::convolution_direct, out_md, weights_md, data_md, strides,
@@ -103,6 +108,11 @@ static mkldnn::convolution_backward_data::primitive_desc GetDeconvFwdImpl(
       out_md, weight_md, data_md, strides, dilate, padding, padding, mkldnn::padding_kind::zero);
   auto deconv_pd = mkldnn::convolution_backward_data::primitive_desc(desc, engine, bwd_pd);
+  // MKL-DNN introduced padded formats since 0.15, which require more memory
+  // for computation than the actual tensor size. Currently, MKL-DNN operators
+  // still reuse the memory from memory planning, and that memory may be
+  // smaller than what the MKL-DNN kernels require. So here we need to select
+  // a suboptimal kernel for computation according to the tensor sizes.
   while (deconv_pd.diff_dst_primitive_desc().get_size() != GetMemDescSize(data_md) ||
          deconv_pd.diff_src_primitive_desc().get_size() != GetMemDescSize(out_md) ||
          deconv_pd.weights_primitive_desc().get_size() != GetMemDescSize(weight_md)) {
@@ -155,6 +165,12 @@ GetDeconvBwdWeightsImpl(
   mkldnn::memory::dims dilate{0, 0};
   dilate[0] = param.dilate[0] - 1;
   dilate[1] = param.dilate[1] - 1;
+
+  // MKL-DNN introduced padded formats since 0.15, which require more memory
+  // for computation than the actual tensor size. Currently, MKL-DNN operators
+  // still reuse the memory from memory planning, and that memory may be
+  // smaller than what the MKL-DNN kernels require. So here we need to select
+  // a suboptimal kernel for computation according to the tensor sizes.
   if (!has_bias) {
     mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
         out_md, weight_md, data_md, strides, dilate, padding, padding, mkldnn::padding_kind::zero);
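
Every hunk above applies the same fallback pattern: create the primitive descriptor, then walk MKL-DNN's list of candidate implementations with next_impl() until one reports memory sizes equal to the buffers that MXNet's memory planner actually allocated. As a rough sketch of that pattern only (the helper name SelectConvFwdImpl is invented for illustration; GetMemDescSize is the size helper already used in the deconvolution hunks, CHECK is the usual MXNet logging macro, and the MKL-DNN 0.x C++ API is assumed), a forward-convolution variant could look like this:

  // Sketch only: pick the first convolution_forward implementation whose
  // src/weights/dst memory sizes equal the plain (unpadded) tensor sizes,
  // falling back to slower kernels whenever a padded-format kernel would
  // need more memory than was planned for the tensors.
  static mkldnn::convolution_forward::primitive_desc SelectConvFwdImpl(
      const mkldnn::convolution_forward::desc &desc, const mkldnn::engine &engine,
      const mkldnn::memory::desc &data_md, const mkldnn::memory::desc &weight_md,
      const mkldnn::memory::desc &out_md) {
    auto conv_pd = mkldnn::convolution_forward::primitive_desc(desc, engine);
    while (conv_pd.src_primitive_desc().get_size() != GetMemDescSize(data_md) ||
           conv_pd.weights_primitive_desc().get_size() != GetMemDescSize(weight_md) ||
           conv_pd.dst_primitive_desc().get_size() != GetMemDescSize(out_md)) {
      // next_impl() returns false once all candidate kernels are exhausted.
      CHECK(conv_pd.next_impl()) << "No implementation matches the tensor sizes";
    }
    return conv_pd;
  }

If no candidate matches, the CHECK fires, mirroring the CHECK(conv_pd.next_impl()) << "No implementation" calls in the patch; the trade-off is accepting a slower kernel rather than letting a padded-format kernel write past the smaller buffer produced by memory planning.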