From c8e867cc2168c6422b724f206f29aaa7bfe7cc50 Mon Sep 17 00:00:00 2001
From: huangzhiyuan <zhiyuan.huang@intel.com>
Date: Wed, 26 Dec 2018 19:12:32 +0800
Subject: [PATCH 1/8] add mkldnn slice

---
 src/operator/nn/mkldnn/mkldnn_slice-inl.h |  66 ++++++++++++++
 src/operator/nn/mkldnn/mkldnn_slice.cc    | 103 ++++++++++++++++++++++
 src/operator/tensor/matrix_op-inl.h       |  69 ++++++++-------
 src/operator/tensor/matrix_op.cc          |   3 +
 src/operator/tensor/slice-inl.h           |  71 +++++++++++++++
 5 files changed, 280 insertions(+), 32 deletions(-)
 create mode 100644 src/operator/nn/mkldnn/mkldnn_slice-inl.h
 create mode 100644 src/operator/nn/mkldnn/mkldnn_slice.cc
 create mode 100644 src/operator/tensor/slice-inl.h
diff --git a/src/operator/nn/mkldnn/mkldnn_slice-inl.h b/src/operator/nn/mkldnn/mkldnn_slice-inl.h
new file mode 100644
index 000000000000..1778efc42f7c
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_slice-inl.h
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_slice-inl.h
+ * \brief Declarations for the MKLDNN-based forward implementation of the slice operator
+ * \author Zhiyuan Huang
+*/
+
+#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_SLICE_INL_H_
+#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_SLICE_INL_H_
+
+#if MXNET_USE_MKLDNN == 1
+
+#include <mkldnn.hpp>
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <utility>
+#include "../../operator_common.h"
+#include "../../tensor/slice-inl.h"
+#include "./mkldnn_base-inl.h"
+
+namespace mxnet {
+namespace op {
+
+class MKLDNNSliceFwd {
+ public:
+  MKLDNNSliceFwd(const SliceParam &param,
+                 const NDArray &in,
+                 const NDArray &out);
+  void SetNewMem(const mkldnn::memory &input, const mkldnn::memory &output);
+  const mkldnn::reorder &GetPd() const;
+
+  std::shared_ptr<mkldnn::memory> data_;
+  std::shared_ptr<mkldnn::memory> out_;
+  std::shared_ptr<mkldnn::reorder> fwd_;
+};
+
+typedef ParamOpSign<SliceParam> MKLDNNSliceSignature;
+MKLDNNSliceFwd &GetSliceForward(const SliceParam &param,
+    const NDArray &in_data, const NDArray &out_data);
+
+void MKLDNNSlice(const SliceParam &param, const OpContext& ctx,
+                 const NDArray &in, OpReqType req, const NDArray &out);
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN == 1
+#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_SLICE_INL_H_
diff --git a/src/operator/nn/mkldnn/mkldnn_slice.cc b/src/operator/nn/mkldnn/mkldnn_slice.cc
new file mode 100644
index 000000000000..c1e41e1ee7ee
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_slice.cc
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_slice.cc
+ * \brief MKLDNN slice forward: a view of the input is reordered into the output
+ * \author Zhiyuan Huang
+*/
+
+#if MXNET_USE_MKLDNN == 1
+
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+#include "./mkldnn_slice-inl.h"
+
+namespace mxnet {
+namespace op {
+
+MKLDNNSliceFwd::MKLDNNSliceFwd(const SliceParam &param,
+                               const NDArray &in,
+                               const NDArray &out) {
+  const TShape ishape = in.shape();
+  const TShape oshape = out.shape();
+  uint32_t N = ishape.ndim();
+  mkldnn::memory::dims dims(N);     // per-axis extent of the view, filled from oshape
+  mkldnn::memory::dims offsets(N);  // per-axis start of the view inside the input
+  for (uint32_t i = 0; i < N; ++i) {
+    int s = 0;  // an axis with no begin value starts at 0
+    if (i < param.begin.ndim() && param.begin[i]) {  // never index past the end of begin
+      s = *param.begin[i];
+      if (s < 0) s += ishape[i];  // negative begin is relative to the end of the axis
+    }
+    dims[i] = oshape[i];
+    offsets[i] = s;
+  }
+  auto in_mem = in.GetMKLDNNData();
+  auto in_mem_pd = in_mem->get_primitive_desc();
+  auto out_mem_pd = out.GetMKLDNNData()->get_primitive_desc();
+  auto view_pd = mkldnn::view::primitive_desc(in_mem_pd, dims, offsets);
+  auto reorder_pd = reorder::primitive_desc(view_pd.dst_primitive_desc(), out_mem_pd);
+  this->data_ = std::make_shared<mkldnn::memory>(view_pd.dst_primitive_desc(), nullptr);
+  this->out_ = std::make_shared<mkldnn::memory>(view_pd.dst_primitive_desc(), nullptr);
+  fwd_.reset(new mkldnn::reorder(reorder_pd, *data_, *out_));
+}
+
+void MKLDNNSliceFwd::SetNewMem(const mkldnn::memory &input, const mkldnn::memory &output) {
+  this->data_->set_data_handle(input.get_data_handle());
+  this->out_->set_data_handle(output.get_data_handle());
+}
+
+const mkldnn::reorder &MKLDNNSliceFwd::GetPd() const {
+  return *fwd_;
+}
+
+MKLDNNSliceFwd &GetSliceForward(const SliceParam &param,
+    const NDArray &in_data, const NDArray &out_data) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<MKLDNNSliceSignature, MKLDNNSliceFwd, OpHash> fwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<MKLDNNSliceSignature, MKLDNNSliceFwd, OpHash> fwds;
+#endif
+  MKLDNNSliceSignature key(param);
+  key.AddSign(in_data);
+
+  auto it = fwds.find(key);
+  if (it == fwds.end()) {
+    MKLDNNSliceFwd fwd(param, in_data, out_data);
+    it = AddToCache(&fwds, key, fwd);
+  }
+  return it->second;
+}
+
+void MKLDNNSlice(const SliceParam &param, const OpContext& ctx,
+                 const NDArray &in, OpReqType req, const NDArray &out) {
+  MKLDNNSliceFwd &fwd = GetSliceForward(param, in, out);
+  auto in_mem = in.GetMKLDNNData();
+  auto out_mem_pd = out.GetMKLDNNData()->get_primitive_desc();
+  auto out_mem = CreateMKLDNNMem(out, out_mem_pd, req);
+  fwd.SetNewMem(*in_mem, *out_mem.second);
+  MKLDNNStream::Get()->RegisterPrim(fwd.GetPd());
+  CommitOutput(out, out_mem);
+  MKLDNNStream::Get()->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
index 3b229cf38eba..d7ad7c0cdf33 100644
--- a/src/operator/tensor/matrix_op-inl.h
+++ b/src/operator/tensor/matrix_op-inl.h
@@ -37,6 +37,8 @@
 #include "broadcast_reduce_op.h"
 #include "./init_op.h"
 #include "../../common/static_array.h"
+#include "./slice-inl.h"
+#include "../nn/mkldnn/mkldnn_slice-inl.h"
 
 #if MXNET_USE_CUDA
 #include <thrust/device_vector.h>
@@ -398,20 +400,6 @@ inline bool ExpandDimShape(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
-struct SliceParam : public dmlc::Parameter<SliceParam> {
-  nnvm::Tuple<dmlc::optional<int>> begin, end;
-  nnvm::Tuple<dmlc::optional<int>> step;
-  DMLC_DECLARE_PARAMETER(SliceParam) {
-    DMLC_DECLARE_FIELD(begin)
-    .describe("starting indices for the slice operation, supports negative indices.");
-    DMLC_DECLARE_FIELD(end)
-    .describe("ending indices for the slice operation, supports negative indices.");
-    DMLC_DECLARE_FIELD(step)
-    .set_default(nnvm::Tuple<dmlc::optional<int>>())
-    .describe("step for the slice operation, supports negative values.");
-  }
-};
-
 inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs,
                                          const int dev_mask,
                                          DispatchMode* dispatch_mode,
@@ -432,7 +420,16 @@ inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs,
       && (!param.step[0].has_value() || param.step[0].value() == 1)) {
     trivial_step = true;
   }
-  if (!dispatched && in_stype == kDefaultStorage) {
+  if (!dispatched && in_stype == kDefaultStorage && trivial_step) {
+#if MXNET_USE_MKLDNN == 1
+    dispatched = storage_type_assign(&out_stype, kDefaultStorage,
+                                     dispatch_mode, dispatch_ex);
+#else
+    dispatched = storage_type_assign(&out_stype, kDefaultStorage,
+                                     dispatch_mode, DispatchMode::kFCompute);
+#endif
+  }
+  else if (!dispatched && in_stype == kDefaultStorage) {
     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
                                      dispatch_mode, DispatchMode::kFCompute);
   }
@@ -604,23 +601,6 @@ void SliceCsrImpl(const SliceParam &param, const OpContext& ctx,
   }
 }
 
-template<typename xpu>
-void SliceEx(const nnvm::NodeAttrs& attrs,
-             const OpContext& ctx,
-             const std::vector<NDArray>& inputs,
-             const std::vector<OpReqType>& req,
-             const std::vector<NDArray>& outputs) {
-  CHECK_EQ(inputs.size(), 1);
-  CHECK_EQ(outputs.size(), 1);
-  const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed);
-  auto in_stype = inputs[0].storage_type();
-  if (in_stype == kCSRStorage) {
-    SliceCsrImpl<xpu>(param, ctx, inputs[0], req[0], outputs[0]);
-  } else {
-    LOG(FATAL) << "Slice not implemented for storage type" << in_stype;
-  }
-}
-
 template<int ndim>
 inline void GetIndexRange(const TShape& dshape,
                           const nnvm::Tuple<dmlc::optional<int>>& param_begin,
@@ -829,6 +809,31 @@ void SliceOpForward(const nnvm::NodeAttrs& attrs,
   })
 }
 
+template<typename xpu>
+void SliceEx(const nnvm::NodeAttrs& attrs,
+                const OpContext& ctx,
+                const std::vector<NDArray>& inputs,
+                const std::vector<OpReqType>& req,
+                const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 1);
+  CHECK_EQ(outputs.size(), 1);
+  const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed);
+  auto in_stype = inputs[0].storage_type();
+  if (in_stype == kCSRStorage) {
+    SliceCsrImpl<xpu>(param, ctx, inputs[0], req[0], outputs[0]);
+#if MXNET_USE_MKLDNN == 1
+  } else if(in_stype == kDefaultStorage){ // For default storage, detect whether we are using MKLDNN or not
+    if (SupportMKLDNN(inputs[0])) {
+      MKLDNNSlice(param, ctx, inputs[0], req[0], outputs[0]);
+    } else {
+      FallBackCompute(SliceOpForward<xpu>, attrs, ctx, inputs, req, outputs);
+    }
+#endif
+  } else {
+    LOG(FATAL) << "Slice not implemented for storage type" << in_stype;
+  }
+}
+
 template<int ndim, int req, typename xpu>
 struct slice_assign;
 
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index db8efa454385..c75e8ad7419e 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -479,6 +479,9 @@ Example::
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_slice"})
 .set_attr<FCompute>("FCompute<cpu>", SliceOpForward<cpu>)
 .set_attr<FComputeEx>("FComputeEx<cpu>", SliceEx<cpu>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
+#endif
 .add_argument("data", "NDArray-or-Symbol", "Source input")
 .add_arguments(SliceParam::__FIELDS__());
 
diff --git a/src/operator/tensor/slice-inl.h b/src/operator/tensor/slice-inl.h
new file mode 100644
index 000000000000..4e94cbeda46c
--- /dev/null
+++ b/src/operator/tensor/slice-inl.h
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file slice-inl.h
+ * \brief Definition of SliceParam and its std::hash specialization
+ * \author Zhiyuan Huang
+*/
+
+#ifndef MXNET_OPERATOR_TENSOR_SLICE_INL_H_
+#define MXNET_OPERATOR_TENSOR_SLICE_INL_H_
+
+#include <utility>
+#include <vector>
+#include <string>
+
+namespace mxnet {
+namespace op {
+
+struct SliceParam : public dmlc::Parameter<SliceParam> {  // parameters of the slice operator
+  nnvm::Tuple<dmlc::optional<int>> begin, end;  // slice bounds; each entry is optional
+  nnvm::Tuple<dmlc::optional<int>> step;        // slice step; defaults to an empty tuple
+  DMLC_DECLARE_PARAMETER(SliceParam) {
+    DMLC_DECLARE_FIELD(begin)
+    .describe("starting indices for the slice operation, supports negative indices.");
+    DMLC_DECLARE_FIELD(end)
+    .describe("ending indices for the slice operation, supports negative indices.");
+    DMLC_DECLARE_FIELD(step)
+    .set_default(nnvm::Tuple<dmlc::optional<int>>())
+    .describe("step for the slice operation, supports negative values.");
+  }
+  bool operator==(const SliceParam& other) const {  // equal when begin, end and step all match
+    return this->begin == other.begin &&
+           this->end == other.end &&
+           this->step == other.step;
+  }
+};
+
+}  // namespace op
+}  // namespace mxnet
+
+namespace std {
+template<>  // std::hash for SliceParam; combines begin, end and step
+struct hash<mxnet::op::SliceParam> {
+  size_t operator()(const mxnet::op::SliceParam& val) const {  // const: usable via const hasher
+    size_t ret = 0;
+    ret = dmlc::HashCombine(ret, val.begin);  // fold each field into the running hash
+    ret = dmlc::HashCombine(ret, val.end);
+    ret = dmlc::HashCombine(ret, val.step);
+    return ret;
+  }
+};
+}  // namespace std
+
+#endif  // MXNET_OPERATOR_TENSOR_SLICE_INL_H_

From 3123e0ce4ee82f5655ff42e2cfd4d6ee2854028e Mon Sep 17 00:00:00 2001
From: huangzhiyuan <zhiyuan.huang@intel.com>
Date: Thu, 27 Dec 2018 21:00:18 +0800
Subject: [PATCH 2/8] fix lint

---
 src/operator/tensor/matrix_op-inl.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
index d7ad7c0cdf33..212d6817de3a 100644
--- a/src/operator/tensor/matrix_op-inl.h
+++ b/src/operator/tensor/matrix_op-inl.h
@@ -420,7 +420,8 @@ inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs,
       && (!param.step[0].has_value() || param.step[0].value() == 1)) {
     trivial_step = true;
   }
-  if (!dispatched && in_stype == kDefaultStorage && trivial_step) {
+  if (!dispatched && in_stype == kDefaultStorage
+                  && trivial_step && dev_mask == Context::kCPU) {
 #if MXNET_USE_MKLDNN == 1
     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
                                      dispatch_mode, dispatch_ex);
@@ -428,8 +429,7 @@ inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs,
     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
                                      dispatch_mode, DispatchMode::kFCompute);
 #endif
-  }
-  else if (!dispatched && in_stype == kDefaultStorage) {
+  } else if (!dispatched && in_stype == kDefaultStorage) {
     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
                                      dispatch_mode, DispatchMode::kFCompute);
   }
@@ -822,7 +822,7 @@ void SliceEx(const nnvm::NodeAttrs& attrs,
   if (in_stype == kCSRStorage) {
     SliceCsrImpl<xpu>(param, ctx, inputs[0], req[0], outputs[0]);
 #if MXNET_USE_MKLDNN == 1
-  } else if(in_stype == kDefaultStorage){ // For default storage, detect whether we are using MKLDNN or not
+  } else if (in_stype == kDefaultStorage) {
     if (SupportMKLDNN(inputs[0])) {
       MKLDNNSlice(param, ctx, inputs[0], req[0], outputs[0]);
     } else {

From c2e6d26875841d0703d25f363282b52388de6391 Mon Sep 17 00:00:00 2001
From: huangzhiyuan <zhiyuan.huang@intel.com>
Date: Thu, 27 Dec 2018 22:50:26 +0800
Subject: [PATCH 3/8] fix lint

---
 src/operator/nn/mkldnn/mkldnn_slice-inl.h | 1 +
 src/operator/nn/mkldnn/mkldnn_slice.cc    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/operator/nn/mkldnn/mkldnn_slice-inl.h b/src/operator/nn/mkldnn/mkldnn_slice-inl.h
index 1778efc42f7c..b66e7045d101 100644
--- a/src/operator/nn/mkldnn/mkldnn_slice-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_slice-inl.h
@@ -48,6 +48,7 @@ class MKLDNNSliceFwd {
   void SetNewMem(const mkldnn::memory &input, const mkldnn::memory &output);
   const mkldnn::reorder &GetPd() const;
 
+ private:
   std::shared_ptr<mkldnn::memory> data_;
   std::shared_ptr<mkldnn::memory> out_;
   std::shared_ptr<mkldnn::reorder> fwd_;
diff --git a/src/operator/nn/mkldnn/mkldnn_slice.cc b/src/operator/nn/mkldnn/mkldnn_slice.cc
index c1e41e1ee7ee..8f2bdf24886d 100644
--- a/src/operator/nn/mkldnn/mkldnn_slice.cc
+++ b/src/operator/nn/mkldnn/mkldnn_slice.cc
@@ -56,7 +56,7 @@ MKLDNNSliceFwd::MKLDNNSliceFwd(const SliceParam &param,
   auto reorder_pd = reorder::primitive_desc(view_pd.dst_primitive_desc(), out_mem_pd);
   this->data_ = std::make_shared<mkldnn::memory>(view_pd.dst_primitive_desc(), nullptr);
   this->out_ = std::make_shared<mkldnn::memory>(view_pd.dst_primitive_desc(), nullptr);
-  fwd_.reset(new mkldnn::reorder(reorder_pd, *data_, *out_));
+  this->fwd_ = std::make_shared<mkldnn::reorder>(reorder_pd, *this->data_, *this->out_);
 }
 
 void MKLDNNSliceFwd::SetNewMem(const mkldnn::memory &input, const mkldnn::memory &output) {

From 609843ff9be50504a1588998641a2d8761607490 Mon Sep 17 00:00:00 2001
From: huangzhiyuan <zhiyuan.huang@intel.com>
Date: Wed, 2 Jan 2019 09:45:15 +0800
Subject: [PATCH 4/8] mv SliceEx to matrix_op.cc

---
 src/operator/nn/mkldnn/mkldnn_slice-inl.h |  1 -
 src/operator/tensor/matrix_op-inl.h       | 55 +++++++++++------------
 src/operator/tensor/matrix_op.cc          | 27 ++++++++++-
 3 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/src/operator/nn/mkldnn/mkldnn_slice-inl.h b/src/operator/nn/mkldnn/mkldnn_slice-inl.h
index b66e7045d101..b5bb30000b82 100644
--- a/src/operator/nn/mkldnn/mkldnn_slice-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_slice-inl.h
@@ -28,7 +28,6 @@
 
 #if MXNET_USE_MKLDNN == 1
 
-#include <mkldnn.hpp>
 #include <dmlc/logging.h>
 #include <dmlc/parameter.h>
 #include <mxnet/operator.h>
diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
index 212d6817de3a..9363b488ba9b 100644
--- a/src/operator/tensor/matrix_op-inl.h
+++ b/src/operator/tensor/matrix_op-inl.h
@@ -38,7 +38,6 @@
 #include "./init_op.h"
 #include "../../common/static_array.h"
 #include "./slice-inl.h"
-#include "../nn/mkldnn/mkldnn_slice-inl.h"
 
 #if MXNET_USE_CUDA
 #include <thrust/device_vector.h>
@@ -420,18 +419,22 @@ inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs,
       && (!param.step[0].has_value() || param.step[0].value() == 1)) {
     trivial_step = true;
   }
-  if (!dispatched && in_stype == kDefaultStorage
-                  && trivial_step && dev_mask == Context::kCPU) {
+  if (!dispatched && in_stype == kDefaultStorage && trivial_step) {
 #if MXNET_USE_MKLDNN == 1
+  if (!MKLDNNEnvSet()) {
+    dispatched = storage_type_assign(&out_stype, kDefaultStorage,
+                                     dispatch_mode, DispatchMode::kFCompute);
+  } else {
     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
                                      dispatch_mode, dispatch_ex);
+  }
 #else
     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
                                      dispatch_mode, DispatchMode::kFCompute);
 #endif
   } else if (!dispatched && in_stype == kDefaultStorage) {
-    dispatched = storage_type_assign(&out_stype, kDefaultStorage,
-                                     dispatch_mode, DispatchMode::kFCompute);
+     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
+                                      dispatch_mode, DispatchMode::kFCompute);
   }
 
   if (!dispatched && in_stype == kCSRStorage && trivial_step) {
@@ -601,6 +604,23 @@ void SliceCsrImpl(const SliceParam &param, const OpContext& ctx,
   }
 }
 
+template<typename xpu>
+void SliceEx(const nnvm::NodeAttrs& attrs,
+             const OpContext& ctx,
+             const std::vector<NDArray>& inputs,
+             const std::vector<OpReqType>& req,
+             const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 1);
+  CHECK_EQ(outputs.size(), 1);
+  const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed);
+  auto in_stype = inputs[0].storage_type();
+  if (in_stype == kCSRStorage) {
+    SliceCsrImpl<xpu>(param, ctx, inputs[0], req[0], outputs[0]);
+  } else {
+    LOG(FATAL) << "Slice not implemented for storage type" << in_stype;
+  }
+}
+
 template<int ndim>
 inline void GetIndexRange(const TShape& dshape,
                           const nnvm::Tuple<dmlc::optional<int>>& param_begin,
@@ -809,31 +829,6 @@ void SliceOpForward(const nnvm::NodeAttrs& attrs,
   })
 }
 
-template<typename xpu>
-void SliceEx(const nnvm::NodeAttrs& attrs,
-                const OpContext& ctx,
-                const std::vector<NDArray>& inputs,
-                const std::vector<OpReqType>& req,
-                const std::vector<NDArray>& outputs) {
-  CHECK_EQ(inputs.size(), 1);
-  CHECK_EQ(outputs.size(), 1);
-  const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed);
-  auto in_stype = inputs[0].storage_type();
-  if (in_stype == kCSRStorage) {
-    SliceCsrImpl<xpu>(param, ctx, inputs[0], req[0], outputs[0]);
-#if MXNET_USE_MKLDNN == 1
-  } else if (in_stype == kDefaultStorage) {
-    if (SupportMKLDNN(inputs[0])) {
-      MKLDNNSlice(param, ctx, inputs[0], req[0], outputs[0]);
-    } else {
-      FallBackCompute(SliceOpForward<xpu>, attrs, ctx, inputs, req, outputs);
-    }
-#endif
-  } else {
-    LOG(FATAL) << "Slice not implemented for storage type" << in_stype;
-  }
-}
-
 template<int ndim, int req, typename xpu>
 struct slice_assign;
 
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index c75e8ad7419e..ed8912f7b7be 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -27,6 +27,7 @@
 #include "./elemwise_unary_op.h"
 #include "../nn/mkldnn/mkldnn_ops-inl.h"
 #include "../nn/mkldnn/mkldnn_base-inl.h"
+#include "../nn/mkldnn/mkldnn_slice-inl.h"
 
 namespace mxnet {
 namespace op {
@@ -420,6 +421,30 @@ will return a new array with shape ``(2,1,3,4)``.
 .add_argument("data", "NDArray-or-Symbol", "Source input")
 .add_arguments(ExpandDimParam::__FIELDS__());
 
+void SliceExCPU(const nnvm::NodeAttrs& attrs,
+                const OpContext& ctx,
+                const std::vector<NDArray>& inputs,
+                const std::vector<OpReqType>& req,
+                const std::vector<NDArray>& outputs) {  // registered as FComputeEx<cpu> of slice
+  CHECK_EQ(inputs.size(), 1);   // slice takes exactly one input ...
+  CHECK_EQ(outputs.size(), 1);  // ... and produces exactly one output
+  const SliceParam& param = nnvm::get<SliceParam>(attrs.parsed);
+  auto in_stype = inputs[0].storage_type();  // dispatch below is keyed on the input storage type
+  if (in_stype == kCSRStorage) {
+    SliceCsrImpl<cpu>(param, ctx, inputs[0], req[0], outputs[0]);
+#if MXNET_USE_MKLDNN == 1
+  } else if (in_stype == kDefaultStorage) {  // branch exists only in MKLDNN builds
+    if (SupportMKLDNN(inputs[0])) {
+      MKLDNNSlice(param, ctx, inputs[0], req[0], outputs[0]);
+    } else {  // input rejected by SupportMKLDNN: hand SliceOpForward<cpu> to FallBackCompute
+      FallBackCompute(SliceOpForward<cpu>, attrs, ctx, inputs, req, outputs);
+    }
+#endif
+  } else {  // every other storage type is a fatal error
+    LOG(FATAL) << "Slice not implemented for storage type" << in_stype;
+  }
+}
+
 NNVM_REGISTER_OP(slice)
 MXNET_ADD_SPARSE_OP_ALIAS(slice)
 .add_alias("crop")
@@ -478,7 +503,7 @@ Example::
 .set_attr<FInferStorageType>("FInferStorageType", SliceForwardInferStorageType)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_slice"})
 .set_attr<FCompute>("FCompute<cpu>", SliceOpForward<cpu>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", SliceEx<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", SliceExCPU)
 #if MXNET_USE_MKLDNN == 1
 .set_attr<bool>("TIsMKLDNN", true)
 #endif

From ddc3b9d2c3ca47a7f35e930c5fe3d8f8e0d7d147 Mon Sep 17 00:00:00 2001
From: huangzhiyuan <zhiyuan.huang@intel.com>
Date: Wed, 2 Jan 2019 13:59:54 +0800
Subject: [PATCH 5/8] restore CPU-only check for MKLDNN slice dispatch, fix lint

---
 src/operator/tensor/matrix_op-inl.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
index 9363b488ba9b..42fb561f0e55 100644
--- a/src/operator/tensor/matrix_op-inl.h
+++ b/src/operator/tensor/matrix_op-inl.h
@@ -421,20 +421,20 @@ inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs,
   }
   if (!dispatched && in_stype == kDefaultStorage && trivial_step) {
 #if MXNET_USE_MKLDNN == 1
-  if (!MKLDNNEnvSet()) {
+  if (MKLDNNEnvSet() && dev_mask == Context::kCPU) {
     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
-                                     dispatch_mode, DispatchMode::kFCompute);
+                                     dispatch_mode, dispatch_ex);
   } else {
     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
-                                     dispatch_mode, dispatch_ex);
+                                     dispatch_mode, DispatchMode::kFCompute);
   }
 #else
     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
                                      dispatch_mode, DispatchMode::kFCompute);
 #endif
   } else if (!dispatched && in_stype == kDefaultStorage) {
-     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
-                                      dispatch_mode, DispatchMode::kFCompute);
+    dispatched = storage_type_assign(&out_stype, kDefaultStorage,
+                                     dispatch_mode, DispatchMode::kFCompute);
   }
 
   if (!dispatched && in_stype == kCSRStorage && trivial_step) {

From a66ac0a029eacc73b72afac2c8552031d1bdaf16 Mon Sep 17 00:00:00 2001
From: huangzhiyuan <zhiyuan.huang@intel.com>
Date: Tue, 8 Jan 2019 16:30:18 +0800
Subject: [PATCH 6/8] optimize dispatch_mode

---
 src/operator/tensor/matrix_op-inl.h | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
index 42fb561f0e55..ceafe6249764 100644
--- a/src/operator/tensor/matrix_op-inl.h
+++ b/src/operator/tensor/matrix_op-inl.h
@@ -399,6 +399,16 @@ inline bool ExpandDimShape(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+// The MKLDNN slice path handles unit steps only: each step must be unset or equal to 1.
+inline bool SupportMKLDNNSlice(const SliceParam& param) {
+  const uint32_t num_steps = param.step.ndim();  // 0 when no step was given
+  for (uint32_t axis = 0; axis < num_steps; ++axis) {
+    const auto& stride = param.step[axis];
+    if (stride.has_value() && stride.value() != 1) return false;
+  }
+  return true;
+}
+
 inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs,
                                          const int dev_mask,
                                          DispatchMode* dispatch_mode,
@@ -419,22 +429,18 @@ inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs,
       && (!param.step[0].has_value() || param.step[0].value() == 1)) {
     trivial_step = true;
   }
-  if (!dispatched && in_stype == kDefaultStorage && trivial_step) {
+
+  if (in_stype == kDefaultStorage) {
 #if MXNET_USE_MKLDNN == 1
-  if (MKLDNNEnvSet() && dev_mask == Context::kCPU) {
+  if (dev_mask == Context::kCPU && MKLDNNEnvSet() && SupportMKLDNNSlice(param)) {
     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
                                      dispatch_mode, dispatch_ex);
-  } else {
-    dispatched = storage_type_assign(&out_stype, kDefaultStorage,
-                                     dispatch_mode, DispatchMode::kFCompute);
   }
-#else
-    dispatched = storage_type_assign(&out_stype, kDefaultStorage,
-                                     dispatch_mode, DispatchMode::kFCompute);
 #endif
-  } else if (!dispatched && in_stype == kDefaultStorage) {
-    dispatched = storage_type_assign(&out_stype, kDefaultStorage,
-                                     dispatch_mode, DispatchMode::kFCompute);
+    if (!dispatched) {
+      dispatched = storage_type_assign(&out_stype, kDefaultStorage,
+                                      dispatch_mode, DispatchMode::kFCompute);
+    }
   }
 
   if (!dispatched && in_stype == kCSRStorage && trivial_step) {

From 9a4ab21bec3cd441253307fb4072b875ca27411d Mon Sep 17 00:00:00 2001
From: huangzhiyuan <zhiyuan.huang@intel.com>
Date: Sat, 12 Jan 2019 19:58:45 +0800
Subject: [PATCH 7/8] add is_train and out_data to the slice primitive cache key

---
 src/operator/nn/mkldnn/mkldnn_slice-inl.h |  4 ++--
 src/operator/nn/mkldnn/mkldnn_slice.cc    | 11 ++++++-----
 src/operator/tensor/matrix_op-inl.h       |  5 +++--
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/operator/nn/mkldnn/mkldnn_slice-inl.h b/src/operator/nn/mkldnn/mkldnn_slice-inl.h
index b5bb30000b82..f41db01a9837 100644
--- a/src/operator/nn/mkldnn/mkldnn_slice-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_slice-inl.h
@@ -54,8 +54,8 @@ class MKLDNNSliceFwd {
 };
 
 typedef ParamOpSign<SliceParam> MKLDNNSliceSignature;
-MKLDNNSliceFwd &GetSliceForward(const SliceParam &param,
-    const NDArray &in_data, const NDArray &out_data);
+MKLDNNSliceFwd &GetSliceForward(const SliceParam &param, const bool is_train,
+                 const NDArray &in_data, const NDArray &out_data);
 
 void MKLDNNSlice(const SliceParam &param, const OpContext& ctx,
                  const NDArray &in, OpReqType req, const NDArray &out);
diff --git a/src/operator/nn/mkldnn/mkldnn_slice.cc b/src/operator/nn/mkldnn/mkldnn_slice.cc
index 8f2bdf24886d..f3c8a14e0c63 100644
--- a/src/operator/nn/mkldnn/mkldnn_slice.cc
+++ b/src/operator/nn/mkldnn/mkldnn_slice.cc
@@ -49,8 +49,7 @@ MKLDNNSliceFwd::MKLDNNSliceFwd(const SliceParam &param,
     dims[i] = oshape[i];
     offsets[i] = s;
   }
-  auto in_mem = in.GetMKLDNNData();
-  auto in_mem_pd = in_mem->get_primitive_desc();
+  auto in_mem_pd = in.GetMKLDNNData()->get_primitive_desc();
   auto out_mem_pd = out.GetMKLDNNData()->get_primitive_desc();
   auto view_pd = mkldnn::view::primitive_desc(in_mem_pd, dims, offsets);
   auto reorder_pd = reorder::primitive_desc(view_pd.dst_primitive_desc(), out_mem_pd);
@@ -68,15 +67,17 @@ const mkldnn::reorder &MKLDNNSliceFwd::GetPd() const {
   return *fwd_;
 }
 
-MKLDNNSliceFwd &GetSliceForward(const SliceParam &param,
-    const NDArray &in_data, const NDArray &out_data) {
+MKLDNNSliceFwd &GetSliceForward(const SliceParam &param, const bool is_train,
+                                const NDArray &in_data, const NDArray &out_data) {
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<MKLDNNSliceSignature, MKLDNNSliceFwd, OpHash> fwds;
 #else
   static MX_THREAD_LOCAL std::unordered_map<MKLDNNSliceSignature, MKLDNNSliceFwd, OpHash> fwds;
 #endif
   MKLDNNSliceSignature key(param);
+  key.AddSign(is_train);
   key.AddSign(in_data);
+  key.AddSign(out_data);
 
   auto it = fwds.find(key);
   if (it == fwds.end()) {
@@ -88,7 +89,7 @@ MKLDNNSliceFwd &GetSliceForward(const SliceParam &param,
 
 void MKLDNNSlice(const SliceParam &param, const OpContext& ctx,
                  const NDArray &in, OpReqType req, const NDArray &out) {
-  MKLDNNSliceFwd &fwd = GetSliceForward(param, in, out);
+  MKLDNNSliceFwd &fwd = GetSliceForward(param, ctx.is_train, in, out);
   auto in_mem = in.GetMKLDNNData();
   auto out_mem_pd = out.GetMKLDNNData()->get_primitive_desc();
   auto out_mem = CreateMKLDNNMem(out, out_mem_pd, req);
diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
index ceafe6249764..299ea384a6f7 100644
--- a/src/operator/tensor/matrix_op-inl.h
+++ b/src/operator/tensor/matrix_op-inl.h
@@ -432,14 +432,15 @@ inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs,
 
   if (in_stype == kDefaultStorage) {
 #if MXNET_USE_MKLDNN == 1
-  if (dev_mask == Context::kCPU && MKLDNNEnvSet() && SupportMKLDNNSlice(param)) {
+  if (dev_mask == Context::kCPU && MKLDNNEnvSet()
+      && SupportMKLDNNSlice(param)) {
     dispatched = storage_type_assign(&out_stype, kDefaultStorage,
                                      dispatch_mode, dispatch_ex);
   }
 #endif
     if (!dispatched) {
       dispatched = storage_type_assign(&out_stype, kDefaultStorage,
-                                      dispatch_mode, DispatchMode::kFCompute);
+                                       dispatch_mode, DispatchMode::kFCompute);
     }
   }
 

From cc7a13abbdfd5e4c7cade713a9330bdf3b92991b Mon Sep 17 00:00:00 2001
From: huangzhiyuan <zhiyuan.huang@intel.com>
Date: Mon, 14 Jan 2019 09:31:28 +0800
Subject: [PATCH 8/8] fix indent

---
 src/operator/tensor/matrix_op-inl.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
index 299ea384a6f7..8b575ca75365 100644
--- a/src/operator/tensor/matrix_op-inl.h
+++ b/src/operator/tensor/matrix_op-inl.h
@@ -432,11 +432,11 @@ inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs,
 
   if (in_stype == kDefaultStorage) {
 #if MXNET_USE_MKLDNN == 1
-  if (dev_mask == Context::kCPU && MKLDNNEnvSet()
-      && SupportMKLDNNSlice(param)) {
-    dispatched = storage_type_assign(&out_stype, kDefaultStorage,
-                                     dispatch_mode, dispatch_ex);
-  }
+    if (dev_mask == Context::kCPU && MKLDNNEnvSet()
+        && SupportMKLDNNSlice(param)) {
+      dispatched = storage_type_assign(&out_stype, kDefaultStorage,
+                                       dispatch_mode, dispatch_ex);
+    }
 #endif
     if (!dispatched) {
       dispatched = storage_type_assign(&out_stype, kDefaultStorage,