From 04a7589f60d4c48a17998e82f067169f9dd5d9f0 Mon Sep 17 00:00:00 2001
From: Zhennan Qin
Date: Mon, 8 Apr 2019 12:54:28 +0800
Subject: [PATCH 1/6] stateful_quantize

---
 src/operator/quantization/dequantize-inl.h    |  74 ++++--
 src/operator/quantization/dequantize.cc       |   7 +-
 src/operator/quantization/dequantize.cu       |   2 +-
 .../mkldnn/mkldnn_dequantize-inl.h            | 143 +++++++-----
 .../mkldnn/mkldnn_quantize_v2-inl.h           | 212 ++++++++++--------
 src/operator/quantization/quantize_v2-inl.h   | 197 +++++++++-------
 src/operator/quantization/quantize_v2.cc      |   7 +-
 src/operator/quantization/quantize_v2.cu      |   2 +-
 8 files changed, 376 insertions(+), 268 deletions(-)

diff --git a/src/operator/quantization/dequantize-inl.h b/src/operator/quantization/dequantize-inl.h
index dcda5a8b4bef..86dbeb13abec 100644
--- a/src/operator/quantization/dequantize-inl.h
+++ b/src/operator/quantization/dequantize-inl.h
@@ -68,30 +68,6 @@ struct dequantize_zero_centered {
   }
 };
 
-template<typename xpu>
-void DequantizeCompute(const nnvm::NodeAttrs& attrs,
-                       const OpContext& ctx,
-                       const std::vector<TBlob>& inputs,
-                       const std::vector<OpReqType>& req,
-                       const std::vector<TBlob>& outputs) {
-  using namespace mshadow;
-  using namespace mxnet_op;
-  using mshadow::red::limits::MinValue;
-  using mshadow::red::limits::MaxValue;
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  if (inputs[0].type_flag_ == mshadow::kUint8) {
-    Kernel<dequantize_unsigned, xpu>::Launch(s, outputs[0].Size(), outputs[0].dptr<float>(),
-        inputs[0].dptr<uint8_t>(), inputs[1].dptr<float>(), inputs[2].dptr<float>(),
-        MinValue<uint8_t>(), MaxValue<uint8_t>());
-  } else if (inputs[0].type_flag_ == mshadow::kInt8) {
-    Kernel<dequantize_zero_centered, xpu>::Launch(s, outputs[0].Size(), outputs[0].dptr<float>(),
-        inputs[0].dptr<int8_t>(), inputs[1].dptr<float>(), inputs[2].dptr<float>(),
-        MinAbs(MaxValue<int8_t>(), MinValue<int8_t>()));
-  } else {
-    LOG(FATAL) << "dequantize op only supports input type int8 or uint8";
-  }
-}
-
 inline bool DequantizeShape(const nnvm::NodeAttrs& attrs,
                             mxnet::ShapeVector *in_attrs,
                             mxnet::ShapeVector *out_attrs) {
@@ -119,6 +95,56 @@ inline bool DequantizeType(const nnvm::NodeAttrs& attrs,
   return (*in_attrs)[0] != -1;
 }
 
+template <typename xpu>
+class DequantizeOperator {
+ public:
+  DequantizeOperator(const nnvm::NodeAttrs &attrs) : attrs_(attrs) {}
+  void Forward(const OpContext &ctx, const std::vector<TBlob> &inputs,
+               const std::vector<OpReqType> &req, const std::vector<TBlob> &outputs) {
+    using namespace mshadow;
+    using namespace mxnet_op;
+    using mshadow::red::limits::MaxValue;
+    using mshadow::red::limits::MinValue;
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    if (inputs[0].type_flag_ == mshadow::kUint8) {
+      Kernel<dequantize_unsigned, xpu>::Launch(s, outputs[0].Size(), outputs[0].dptr<float>(),
+                                               inputs[0].dptr<uint8_t>(), inputs[1].dptr<float>(),
+                                               inputs[2].dptr<float>(), MinValue<uint8_t>(),
+                                               MaxValue<uint8_t>());
+    } else if (inputs[0].type_flag_ == mshadow::kInt8) {
+      Kernel<dequantize_zero_centered, xpu>::Launch(
+          s, outputs[0].Size(), outputs[0].dptr<float>(), inputs[0].dptr<int8_t>(),
+          inputs[1].dptr<float>(), inputs[2].dptr<float>(),
+          MinAbs(MaxValue<int8_t>(), MinValue<int8_t>()));
+    } else {
+      LOG(FATAL) << "dequantize op only supports input type int8 or uint8";
+    }
+  }
+
+ private:
+  nnvm::NodeAttrs attrs_;
+};
+
+static OpStatePtr CreateDequantizeState(const nnvm::NodeAttrs &attrs, Context ctx,
+                                        const std::vector<TShape> &in_shapes,
+                                        const std::vector<int> &in_types) {
+  OpStatePtr state;
+  if (ctx.dev_type == kGPU) {
+    state = OpStatePtr::Create<DequantizeOperator<gpu>>(attrs);
+  } else {
+    state = OpStatePtr::Create<DequantizeOperator<cpu>>(attrs);
+  }
+  return state;
+}
+
+template <typename xpu>
+static void DequantizeForward(const OpStatePtr &state_ptr, const OpContext &ctx,
+                              const std::vector<TBlob> &inputs, const std::vector<OpReqType> &req,
+                              const std::vector<TBlob> &outputs) {
+  auto &op = state_ptr.get_state<DequantizeOperator<xpu>>();
+  op.Forward(ctx, inputs, req, outputs);
+}
+
 }  // namespace op
 }  // namespace mxnet
#endif // MXNET_OPERATOR_QUANTIZATION_DEQUANTIZE_INL_H_ diff --git a/src/operator/quantization/dequantize.cc b/src/operator/quantization/dequantize.cc index 7c84673095f0..ed3bbdcbb845 100644 --- a/src/operator/quantization/dequantize.cc +++ b/src/operator/quantization/dequantize.cc @@ -76,9 +76,12 @@ by keep zero centered for the quantized value: .set_attr("FGradient", MakeZeroGradNodes) #if MXNET_USE_MKLDNN == 1 .set_attr("TIsMKLDNN", true) -.set_attr("FComputeEx", MKLDNNDequantizeCompute) +.set_attr("FCreateOpState", CreateSgMKLDNNDequantizeState) +.set_attr("FStatefulComputeEx", SgMKLDNNDequantizeForward) +#else +.set_attr("FCreateOpState", CreateDequantizeState) +.set_attr("FStatefulCompute", DequantizeForward) #endif -.set_attr("FCompute", DequantizeCompute) .add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type `uint8`") .add_argument("min_range", "NDArray-or-Symbol", "The minimum scalar value " "possibly produced for the input in float32") diff --git a/src/operator/quantization/dequantize.cu b/src/operator/quantization/dequantize.cu index ca5f91c5def9..41b6e7d20494 100644 --- a/src/operator/quantization/dequantize.cu +++ b/src/operator/quantization/dequantize.cu @@ -28,7 +28,7 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_contrib_dequantize) -.set_attr("FCompute", DequantizeCompute); +.set_attr("FStatefulCompute", DequantizeForward); } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h index b66adf787fef..c142d3832004 100644 --- a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h +++ b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h @@ -26,80 +26,105 @@ #ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_DEQUANTIZE_INL_H_ #define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_DEQUANTIZE_INL_H_ #if MXNET_USE_MKLDNN == 1 -#include #include +#include #include #include "../../nn/mkldnn/mkldnn_base-inl.h" namespace mxnet { namespace op { -template -static void MKLDNNDequantizeComputeKer(const std::vector &inputs, - const std::vector &outputs, - const std::vector &req) { - using namespace mshadow; - using namespace mxnet_op; - using red::limits::MaxValue; - using red::limits::MinValue; - float real_range = 0.0; - float quantized_range = 0.0; - if (inputs[0].dtype() == mshadow::kUint8) { - quantized_range = MaxAbs(MaxValue(), MinValue()); - real_range = MaxAbs(*inputs[1].data().dptr(), *inputs[2].data().dptr()); - } else if (inputs[0].dtype() == mshadow::kInt8) { - quantized_range = MinAbs(MaxValue(), MinValue()); - real_range = MaxAbs(*inputs[1].data().dptr(), *inputs[2].data().dptr()); - } else { - LOG(FATAL) << "mkldnn dequantize op only supports int8 and uint8 as output type"; - } - float scale = real_range / quantized_range; - primitive_attr attr; - const int mask = 0; - std::vector scales = {scale}; - attr.set_output_scales(mask, scales); - attr.set_int_output_round_mode(round_nearest); - mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); +class SgMKLDNNDequantizeOperator : public DequantizeOperator { + public: + explicit SgMKLDNNDequantizeOperator(const nnvm::NodeAttrs &attrs) + : DequantizeOperator(attrs), param_(nnvm::get(attrs.parsed)) {} - NDArray in_buffer = inputs[0]; - if (inputs[0].IsView() && inputs[0].IsMKLDNNData()) - in_buffer = inputs[0].Reorder2Default(); + void Forward(const OpContext &ctx, const std::vector &inputs, + const std::vector &req, const std::vector &outputs); + + private: + bool 
initalized_{false}; + DequantizeParam param_; + float cached_data_min_{0.f}; + float cached_data_max_{0.f}; + std::shared_ptr i_mem_; + std::shared_ptr o_mem_; + std::shared_ptr fwd_pd_; +}; +void SgMKLDNNDequantizeOperator::Forward(const OpContext &ctx, const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + NDArray in_buffer = inputs[0]; + if (inputs[0].IsView() && inputs[0].IsMKLDNNData()) in_buffer = inputs[0].Reorder2Default(); auto i_mem = in_buffer.GetMKLDNNData(); - auto i_mpd = i_mem->get_primitive_desc(); - auto i_desc = i_mpd.desc(); - size_t i_ndim = in_buffer.shape().ndim(); - mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim); - for (size_t i = 0; i < i_ndim; i++) { - i_dims[i] = static_cast(in_buffer.shape()[i]); - } - mkldnn::memory::format i_fmt = static_cast(i_desc.data.format); - if (i_fmt == mkldnn::memory::format::nhwc) { - // For 4d tensor, nchw is the default format - i_fmt = mkldnn::memory::format::nchw; + float data_min = *inputs[1].data().dptr(); + float data_max = *inputs[2].data().dptr(); + + if (initalized_ && (cached_data_min_ != data_min || cached_data_max_ != data_max)) + initalized_ = false; + + if (!initalized_) { + cached_data_min_ = data_min; + cached_data_max_ = data_max; + float real_range = MaxAbs(cached_data_min_, cached_data_max_); + float quantized_range = 0.0; + if (inputs[0].dtype() == mshadow::kUint8) { + quantized_range = kUint8Range; + } else if (inputs[0].dtype() == mshadow::kInt8) { + quantized_range = kInt8Range; + real_range = MaxAbs(*inputs[1].data().dptr(), *inputs[2].data().dptr()); + } else { + LOG(FATAL) << "mkldnn dequantize op only supports int8 and uint8 as output type"; + } + float scale = real_range / quantized_range; + primitive_attr attr; + const int mask = 0; + std::vector scales = {scale}; + attr.set_output_scales(mask, scales); + attr.set_int_output_round_mode(round_nearest); + mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); + auto i_mpd = i_mem->get_primitive_desc(); + auto i_desc = i_mpd.desc(); + size_t i_ndim = in_buffer.shape().ndim(); + mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim); + for (size_t i = 0; i < i_ndim; i++) { + i_dims[i] = static_cast(in_buffer.shape()[i]); + } + mkldnn::memory::format o_fmt = static_cast(i_desc.data.format); + if (o_fmt == mkldnn::memory::format::nhwc) { + // For 4d tensor, nchw is the default format + o_fmt = mkldnn::memory::format::nchw; + } + auto o_desc = + mkldnn::memory::desc(i_dims, (mkldnn::memory::data_type)data_type_enum::type, o_fmt); + auto o_mpd = memory::primitive_desc(o_desc, cpu_engine); + auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr); + i_mem_ = std::make_shared(i_mpd, nullptr); + o_mem_ = std::make_shared(o_mpd, nullptr); + fwd_pd_ = std::make_shared(reorder_pd, *i_mem_, *o_mem_); + initalized_ = true; } - auto o_desc = mkldnn::memory::desc(i_dims, - (mkldnn::memory::data_type)data_type_enum::type, - i_fmt); - auto o_mpd = memory::primitive_desc(o_desc, cpu_engine); - auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr); - auto o_mem = CreateMKLDNNMem(outputs[0], o_mpd, req[0]); - MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *i_mem, *o_mem.second)); + auto o_mem = CreateMKLDNNMem(outputs[0], o_mem_->get_primitive_desc(), req[0]); + i_mem_->set_data_handle(i_mem->get_data_handle()); + o_mem_->set_data_handle(o_mem.second->get_data_handle()); + MKLDNNStream::Get()->RegisterPrim(*fwd_pd_); CommitOutput(outputs[0], o_mem); MKLDNNStream::Get()->Submit(); } -static 
void MKLDNNDequantizeCompute(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - if (inputs[0].dtype() == mshadow::kUint8) { - MKLDNNDequantizeComputeKer(inputs, outputs, req); - } else if (inputs[0].dtype() == mshadow::kInt8) { - MKLDNNDequantizeComputeKer(inputs, outputs, req); - } else { - LOG(FATAL) << "mkldnn dequantize op only supports int8 and uint8 as input type"; - } +static void SgMKLDNNDequantizeForward(const OpStatePtr &state_ptr, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + SgMKLDNNDequantizeOperator &op = state_ptr.get_state(); + op.Forward(ctx, inputs, req, outputs); +} + +static OpStatePtr CreateSgMKLDNNDequantizeState(const nnvm::NodeAttrs &attrs, Context ctx, + const std::vector &in_shapes, + const std::vector &in_types) { + return OpStatePtr::Create(attrs); } } // namespace op diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h index d6060e54a82c..117195584b68 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h +++ b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h @@ -34,99 +34,37 @@ namespace mxnet { namespace op { -template -static void MKLDNNQuantizeComputeKer(const std::vector& inputs, - const std::vector& outputs, - const QuantizeV2Param& param, - const std::vector& req) { - using namespace mshadow; - using namespace mxnet_op; - using red::limits::MaxValue; - using red::limits::MinValue; - SrcType real_range = 0.f; - DstType quantized_range = 0; - NDArray in_buffer = inputs[0]; - SrcType data_min = red::limits::MaxValue(); - SrcType data_max = red::limits::MinValue(); - if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { - data_min = param.min_calib_range.value(); - data_max = param.max_calib_range.value(); - } else { - // no calib info - in_buffer = inputs[0].Reorder2Default(); - auto in_ptr = in_buffer.data().dptr(); - auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - std::vector data_maxs(nthreads, data_max); - std::vector data_mins(nthreads, data_min); -#pragma omp parallel for num_threads(nthreads) - for (index_t i = 0; i < static_cast(in_buffer.shape().Size()); i++) { - int tid = omp_get_thread_num(); - if (in_ptr[i] > data_maxs[tid]) data_maxs[tid] = in_ptr[i]; - if (in_ptr[i] < data_mins[tid]) data_mins[tid] = in_ptr[i]; - } - for (index_t i = 0; i < nthreads; i++) { - if (data_maxs[i] > data_max) data_max = data_maxs[i]; - if (data_mins[i] < data_min) data_min = data_mins[i]; - } - } +class SgMKLDNNQuantizeOperator : public QuantizeV2Operator { + public: + explicit SgMKLDNNQuantizeOperator(const nnvm::NodeAttrs &attrs) + : QuantizeV2Operator(attrs), param_(nnvm::get(attrs.parsed)) {} - auto out_type = GetOutputType(param); - if (out_type == mshadow::kUint8) { - real_range = std::max(0.f, data_max); - quantized_range = MaxValue(); - *outputs[1].data().dptr() = 0.f; - *outputs[2].data().dptr() = real_range; - } else if (out_type == mshadow::kInt8) { - real_range = MaxAbs(data_min, data_max); - quantized_range = MinAbs(MaxValue(), MinValue()); - *outputs[1].data().dptr() = -real_range; - *outputs[2].data().dptr() = real_range; - } else { - LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output type"; - } - float scale = static_cast(quantized_range) / real_range; - - primitive_attr attr; - const int mask = 0; - std::vector scales = 
{scale}; - attr.set_output_scales(mask, scales); - attr.set_int_output_round_mode(round_nearest); - mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); - - if (in_buffer.IsView() && in_buffer.IsMKLDNNData()) in_buffer = inputs[0].Reorder2Default(); - auto i_mem = in_buffer.GetMKLDNNData(); - auto i_mpd = i_mem->get_primitive_desc(); - auto i_desc = i_mpd.desc(); - mkldnn::memory::format i_fmt = static_cast(i_desc.data.format); - if (i_fmt == mkldnn::memory::format::nchw || - i_fmt == mkldnn::memory::format::nChw8c || - i_fmt == mkldnn_nChw16c) { - i_fmt = mkldnn::memory::format::nhwc; - } - size_t i_ndim = in_buffer.shape().ndim(); - mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim); - for (size_t i = 0; i < i_ndim; i++) { - i_dims[i] = static_cast(in_buffer.shape()[i]); - } - auto o_desc = - mkldnn::memory::desc(i_dims, (mkldnn::memory::data_type)data_type_enum::type, i_fmt); - auto o_mpd = memory::primitive_desc(o_desc, cpu_engine); - auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr); - auto o_mem = CreateMKLDNNMem(outputs[0], o_mpd, req[0]); - MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *i_mem, *o_mem.second)); - CommitOutput(outputs[0], o_mem); - MKLDNNStream::Get()->Submit(); -} + void Forward(const OpContext &ctx, const std::vector &inputs, + const std::vector &req, const std::vector &outputs); + + private: + bool initalized_{false}; + QuantizeV2Param param_; + float cached_data_min_{0.f}; + float cached_data_max_{0.f}; + std::shared_ptr i_mem_; + std::shared_ptr o_mem_; + std::shared_ptr fwd_pd_; +}; -static void MKLDNNQuantizeV2Compute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const QuantizeV2Param& param = nnvm::get(attrs.parsed); +void SgMKLDNNQuantizeOperator::Forward(const OpContext &ctx, const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + float quantized_range = 0.0; + NDArray in_buffer = inputs[0]; + float data_min = mshadow::red::limits::MaxValue(); + float data_max = mshadow::red::limits::MinValue(); + + // Pass through quantized data if (inputs[0].dtype() == mshadow::kUint8 || inputs[0].dtype() == mshadow::kInt8) { - if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { - *outputs[1].data().dptr() = param.min_calib_range.value(); - *outputs[2].data().dptr() = param.max_calib_range.value(); + if (param_.min_calib_range.has_value() && param_.max_calib_range.has_value()) { + *outputs[1].data().dptr() = param_.min_calib_range.value(); + *outputs[2].data().dptr() = param_.max_calib_range.value(); } else { if (inputs[0].dtype() == mshadow::kUint8) { *outputs[1].data().dptr() = 0; @@ -137,21 +75,107 @@ static void MKLDNNQuantizeV2Compute(const nnvm::NodeAttrs& attrs, const OpContex } } if (req[0] != kWriteInplace) { - const_cast(outputs[0]).CopyFrom(*inputs[0].GetMKLDNNData()); + const_cast(outputs[0]).CopyFrom(*inputs[0].GetMKLDNNData()); MKLDNNStream::Get()->Submit(); } } else { - auto out_type = GetOutputType(param); + if (in_buffer.IsView() && in_buffer.IsMKLDNNData()) in_buffer = inputs[0].Reorder2Default(); + auto i_mem = in_buffer.GetMKLDNNData(); + + if (param_.min_calib_range.has_value() && param_.max_calib_range.has_value()) { + data_min = param_.min_calib_range.value(); + data_max = param_.max_calib_range.value(); + } else { + // no calib info + in_buffer = inputs[0].Reorder2Default(); + auto in_ptr = in_buffer.data().dptr(); + auto nthreads = 
engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); + std::vector data_maxs(nthreads, data_max); + std::vector data_mins(nthreads, data_min); +#pragma omp parallel for num_threads(nthreads) + for (index_t i = 0; i < static_cast(in_buffer.shape().Size()); i++) { + int tid = omp_get_thread_num(); + if (in_ptr[i] > data_maxs[tid]) data_maxs[tid] = in_ptr[i]; + if (in_ptr[i] < data_mins[tid]) data_mins[tid] = in_ptr[i]; + } + for (index_t i = 0; i < nthreads; i++) { + if (data_maxs[i] > data_max) data_max = data_maxs[i]; + if (data_mins[i] < data_min) data_min = data_mins[i]; + } + } + + // Write output min/max + auto out_type = GetOutputType(param_); if (out_type == mshadow::kUint8) { - MKLDNNQuantizeComputeKer(inputs, outputs, param, req); + quantized_range = kUint8Range; + *outputs[1].data().dptr() = data_min; + *outputs[2].data().dptr() = data_max; } else if (out_type == mshadow::kInt8) { - MKLDNNQuantizeComputeKer(inputs, outputs, param, req); + float real_range = MaxAbs(data_min, data_max); + quantized_range = kInt8Range; + *outputs[1].data().dptr() = -real_range; + *outputs[2].data().dptr() = real_range; } else { LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output type"; } + + if (initalized_ && (cached_data_min_ != data_min || cached_data_max_ != data_max)) + initalized_ = false; + + if (!initalized_) { + cached_data_min_ = data_min; + cached_data_max_ = data_max; + float real_range = MaxAbs(data_min, data_max); + float scale = quantized_range / real_range; + primitive_attr attr; + const int mask = 0; + std::vector scales = {scale}; + attr.set_output_scales(mask, scales); + attr.set_int_output_round_mode(round_nearest); + mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); + auto i_mpd = i_mem->get_primitive_desc(); + auto i_desc = i_mpd.desc(); + mkldnn::memory::format i_fmt = static_cast(i_desc.data.format); + if (i_fmt == mkldnn::memory::format::nchw || i_fmt == mkldnn::memory::format::nChw8c || + i_fmt == mkldnn_nChw16c) { + i_fmt = mkldnn::memory::format::nhwc; + } + size_t i_ndim = in_buffer.shape().ndim(); + mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim); + for (size_t i = 0; i < i_ndim; i++) { + i_dims[i] = static_cast(in_buffer.shape()[i]); + } + auto o_desc = mkldnn::memory::desc(i_dims, get_mkldnn_type(out_type), i_fmt); + auto o_mpd = memory::primitive_desc(o_desc, cpu_engine); + auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr); + i_mem_ = std::make_shared(i_mpd, nullptr); + o_mem_ = std::make_shared(o_mpd, nullptr); + fwd_pd_ = std::make_shared(reorder_pd, *i_mem_, *o_mem_); + initalized_ = true; + } + auto o_mem = CreateMKLDNNMem(outputs[0], o_mem_->get_primitive_desc(), req[0]); + i_mem_->set_data_handle(i_mem->get_data_handle()); + o_mem_->set_data_handle(o_mem.second->get_data_handle()); + MKLDNNStream::Get()->RegisterPrim(*fwd_pd_); + CommitOutput(outputs[0], o_mem); + MKLDNNStream::Get()->Submit(); } } +static void SgMKLDNNQuantizeForward(const OpStatePtr &state_ptr, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + SgMKLDNNQuantizeOperator &op = state_ptr.get_state(); + op.Forward(ctx, inputs, req, outputs); +} + +static OpStatePtr CreateSgMKLDNNQuantizeState(const nnvm::NodeAttrs &attrs, Context ctx, + const std::vector &in_shapes, + const std::vector &in_types) { + return OpStatePtr::Create(attrs); +} + } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/quantize_v2-inl.h 
b/src/operator/quantization/quantize_v2-inl.h
index 02ace6c39fac..31a10fd54ec0 100644
--- a/src/operator/quantization/quantize_v2-inl.h
+++ b/src/operator/quantization/quantize_v2-inl.h
@@ -125,95 +125,14 @@ struct quantize_v2_zero_centered {
   }
 };
 
-template <typename xpu>
-void QuantizeV2Compute(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
-                       const std::vector<TBlob> &inputs, const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &outputs) {
-  using namespace mshadow;
-  using namespace mxnet_op;
-  typedef float SrcDType;
-  using mshadow::red::limits::MaxValue;
-  using mshadow::red::limits::MinValue;
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  const QuantizeV2Param &param = nnvm::get<QuantizeV2Param>(attrs.parsed);
-  auto out_type = GetOutputType(param);
-  if (out_type == mshadow::kUint8 && std::is_same<xpu, gpu>::value) {
-    LOG(FATAL) << "currently, uint8 quantization is only supported by CPU, "
-                  "please switch to the context of CPU or int8 data type for GPU.";
-  }
-
-  if (inputs[0].type_flag_ == mshadow::kUint8 || inputs[0].type_flag_ == mshadow::kInt8) {
-    if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
-      *outputs[1].dptr<float>() = param.min_calib_range.value();
-      *outputs[2].dptr<float>() = param.max_calib_range.value();
-    } else {
-      if (inputs[0].type_flag_ == mshadow::kUint8) {
-        *outputs[1].dptr<float>() = 0;
-        *outputs[2].dptr<float>() = 255;
-      } else {
-        *outputs[1].dptr<float>() = -127;
-        *outputs[2].dptr<float>() = 127;
-      }
-    }
-    UnaryOp::IdentityCompute<xpu>(attrs, ctx, {inputs[0]}, req, outputs);
-  } else {
-    if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
-      if (out_type == mshadow::kUint8) {
-        Kernel<quantize_v2_unsigned, xpu>::Launch(
-            s, outputs[0].Size(), outputs[0].dptr<uint8_t>(), outputs[1].dptr<float>(),
-            outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), param.min_calib_range.value(),
-            param.max_calib_range.value(), MinValue<uint8_t>(), MaxValue<uint8_t>());
-      } else if (out_type == mshadow::kInt8) {  // zero-centered quantization
-        Kernel<quantize_v2_zero_centered, xpu>::Launch(
-            s, outputs[0].Size(), outputs[0].dptr<int8_t>(), outputs[1].dptr<float>(),
-            outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), param.min_calib_range.value(),
-            param.max_calib_range.value(), MinAbs(MaxValue<int8_t>(), MinValue<int8_t>()));
-      } else {
-        LOG(FATAL) << "quantize op only supports int8 and uint8 as output type";
-      }
-    } else {  // model is not calibrated
-      mxnet::TShape src_shape, dst_shape;
-      const size_t actual_float_size = sizeof(float);
-      const size_t temp_reduce_size = ConfigReduce<xpu, SrcDType>(
-          s, inputs[0].shape_, mxnet::TShape({1}), &src_shape, &dst_shape);
-      Tensor<xpu, 1, char> temp_space = ctx.requested[0].get_space_typed<xpu, 1, char>(
-          Shape1(2 * actual_float_size + temp_reduce_size), s);
-      const int dev_id = ctx.run_ctx.ctx.dev_id;
-      TBlob in_min_t(reinterpret_cast<SrcDType *>(temp_space.dptr_), Shape1(1), xpu::kDevMask,
-                     dev_id);
-      TBlob in_max_t(reinterpret_cast<SrcDType *>(temp_space.dptr_) + 1, Shape1(1), xpu::kDevMask,
-                     dev_id);
-      Tensor<xpu, 1, char> workspace(temp_space.dptr_ + 2 * actual_float_size,
-                                     Shape1(temp_reduce_size), s);
-      broadcast::Reduce<red::minimum, 2, SrcDType, mshadow::op::identity>(
-          s, in_min_t.reshape(dst_shape), kWriteTo, workspace, inputs[0].reshape(src_shape));
-      broadcast::Reduce<red::maximum, 2, SrcDType, mshadow::op::identity>(
-          s, in_max_t.reshape(dst_shape), kWriteTo, workspace, inputs[0].reshape(src_shape));
-      if (out_type == mshadow::kUint8) {
-        Kernel<quantize_v2_unsigned, xpu>::Launch(
-            s, outputs[0].Size(), outputs[0].dptr<uint8_t>(), outputs[1].dptr<float>(),
-            outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), in_min_t.dptr<float>(),
-            in_max_t.dptr<float>(), MinValue<uint8_t>(), MaxValue<uint8_t>());
-      } else if (out_type == mshadow::kInt8) {  // zero-centered quantization
-        Kernel<quantize_v2_zero_centered, xpu>::Launch(
-            s, outputs[0].Size(), outputs[0].dptr<int8_t>(), outputs[1].dptr<float>(),
-            outputs[2].dptr<float>(), inputs[0].dptr<SrcDType>(), in_min_t.dptr<float>(),
-            in_max_t.dptr<float>(), MinAbs(MaxValue<int8_t>(), MinValue<int8_t>()));
-      } else {
-
LOG(FATAL) << "quantize op only supports int8 and uint8 as output type"; - } - } - } -} - -static inline bool QuantizeV2Shape(const nnvm::NodeAttrs &attrs, mxnet::ShapeVector *in_attrs, - mxnet::ShapeVector *out_attrs) { +static inline bool QuantizeV2Shape(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, + std::vector *out_attrs) { CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 3U); SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); - SHAPE_ASSIGN_CHECK(*out_attrs, 1, mxnet::TShape{1}); - SHAPE_ASSIGN_CHECK(*out_attrs, 2, mxnet::TShape{1}); + SHAPE_ASSIGN_CHECK(*out_attrs, 1, TShape{1}); + SHAPE_ASSIGN_CHECK(*out_attrs, 2, TShape{1}); return !shape_is_none(out_attrs->at(0)); } @@ -237,6 +156,114 @@ static inline bool QuantizeV2Type(const nnvm::NodeAttrs &attrs, std::vector return (*in_attrs)[0] != -1; } +template +class QuantizeV2Operator { + public: + QuantizeV2Operator(const nnvm::NodeAttrs &attrs) : attrs_(attrs) {} + + void Forward(const OpContext &ctx, const std::vector &inputs, + const std::vector &req, const std::vector &outputs) { + using namespace mshadow; + using namespace mxnet_op; + typedef float SrcDType; + using mshadow::red::limits::MaxValue; + using mshadow::red::limits::MinValue; + Stream *s = ctx.get_stream(); + const QuantizeV2Param ¶m = nnvm::get(attrs_.parsed); + auto out_type = GetOutputType(param); + if (out_type == mshadow::kUint8 && std::is_same::value) { + LOG(FATAL) << "currently, uint8 quantization is only supported by CPU, " + "please switch to the context of CPU or int8 data type for GPU."; + } + + if (inputs[0].type_flag_ == mshadow::kUint8 || inputs[0].type_flag_ == mshadow::kInt8) { + if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { + *outputs[1].dptr() = param.min_calib_range.value(); + *outputs[2].dptr() = param.max_calib_range.value(); + } else { + if (inputs[0].type_flag_ == mshadow::kUint8) { + *outputs[1].dptr() = 0; + *outputs[2].dptr() = 255; + } else { + *outputs[1].dptr() = -127; + *outputs[2].dptr() = 127; + } + } + UnaryOp::IdentityCompute(attrs_, ctx, {inputs[0]}, req, outputs); + } else { + if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { + if (out_type == mshadow::kUint8) { + Kernel::Launch( + s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), + outputs[2].dptr(), inputs[0].dptr(), param.min_calib_range.value(), + param.max_calib_range.value(), MinValue(), MaxValue()); + } else if (out_type == mshadow::kInt8) { // zero-centered quantization + Kernel::Launch( + s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), + outputs[2].dptr(), inputs[0].dptr(), param.min_calib_range.value(), + param.max_calib_range.value(), MinAbs(MaxValue(), MinValue())); + } else { + LOG(FATAL) << "quantize op only supports int8 and uint8 as output type"; + } + } else { // model is not calibrated + mxnet::TShape src_shape, dst_shape; + const size_t actual_float_size = sizeof(float); + const size_t temp_reduce_size = ConfigReduce( + s, inputs[0].shape_, mxnet::TShape({1}), &src_shape, &dst_shape); + Tensor temp_space = ctx.requested[0].get_space_typed( + Shape1(2 * actual_float_size + temp_reduce_size), s); + const int dev_id = ctx.run_ctx.ctx.dev_id; + TBlob in_min_t(reinterpret_cast(temp_space.dptr_), Shape1(1), xpu::kDevMask, + dev_id); + TBlob in_max_t(reinterpret_cast(temp_space.dptr_) + 1, Shape1(1), xpu::kDevMask, + dev_id); + Tensor workspace(temp_space.dptr_ + 2 * actual_float_size, + Shape1(temp_reduce_size), s); + broadcast::Reduce( + s, in_min_t.reshape(dst_shape), 
kWriteTo, workspace, inputs[0].reshape(src_shape)); + broadcast::Reduce( + s, in_max_t.reshape(dst_shape), kWriteTo, workspace, inputs[0].reshape(src_shape)); + if (out_type == mshadow::kUint8) { + Kernel::Launch( + s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), + outputs[2].dptr(), inputs[0].dptr(), in_min_t.dptr(), + in_max_t.dptr(), MinValue(), MaxValue()); + } else if (out_type == mshadow::kInt8) { // zero-centered quantization + Kernel::Launch( + s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), + outputs[2].dptr(), inputs[0].dptr(), in_min_t.dptr(), + in_max_t.dptr(), MinAbs(MaxValue(), MinValue())); + } else { + LOG(FATAL) << "quantize op only supports int8 and uint8 as output type"; + } + } + } + } + + private: + nnvm::NodeAttrs attrs_; +}; + +static OpStatePtr CreateQuantizeV2State(const nnvm::NodeAttrs &attrs, Context ctx, + const std::vector &in_shapes, + const std::vector &in_types) { + OpStatePtr state; + if (ctx.dev_type == kGPU) { + state = OpStatePtr::Create>(attrs); + } else { + state = OpStatePtr::Create>(attrs); + } + return state; +} + +template +static void QuantizeV2Forward(const OpStatePtr &state_ptr, const OpContext &ctx, + const std::vector &inputs, const std::vector &req, + const std::vector &outputs) { + auto &op = state_ptr.get_state>(); + op.Forward(ctx, inputs, req, outputs); +} + } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_QUANTIZATION_QUANTIZE_V2_INL_H_ diff --git a/src/operator/quantization/quantize_v2.cc b/src/operator/quantization/quantize_v2.cc index 920100bc9f8b..95858826a4e0 100644 --- a/src/operator/quantization/quantize_v2.cc +++ b/src/operator/quantization/quantize_v2.cc @@ -88,9 +88,12 @@ If min_calib_range isn't presented, the output type will be int8. .set_attr("FGradient", MakeZeroGradNodes) #if MXNET_USE_MKLDNN == 1 .set_attr("TIsMKLDNN", true) -.set_attr("FComputeEx", MKLDNNQuantizeV2Compute) +.set_attr("FCreateOpState", CreateSgMKLDNNQuantizeState) +.set_attr("FStatefulComputeEx", SgMKLDNNQuantizeForward) +#else +.set_attr("FCreateOpState", CreateQuantizeV2State) +.set_attr("FStatefulCompute", QuantizeV2Forward) #endif -.set_attr("FCompute", QuantizeV2Compute) .set_attr("FInplaceOption", [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; }) diff --git a/src/operator/quantization/quantize_v2.cu b/src/operator/quantization/quantize_v2.cu index ab0cf9c5ad0e..0707f41ded94 100644 --- a/src/operator/quantization/quantize_v2.cu +++ b/src/operator/quantization/quantize_v2.cu @@ -28,7 +28,7 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_contrib_quantize_v2) -.set_attr("FCompute", QuantizeV2Compute); +.set_attr("FStatefulCompute", QuantizeV2Forward); } // namespace op } // namespace mxnet From e543979e2e743509e3f683ee25aa28aa6dea3906 Mon Sep 17 00:00:00 2001 From: Zhennan Qin Date: Mon, 8 Apr 2019 16:48:07 +0800 Subject: [PATCH 2/6] fix lint --- src/operator/quantization/dequantize-inl.h | 2 +- .../quantization/mkldnn/mkldnn_dequantize-inl.h | 13 ++++++++++--- .../quantization/mkldnn/mkldnn_quantize_v2-inl.h | 12 +++++++++--- src/operator/quantization/quantize_v2-inl.h | 2 +- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/operator/quantization/dequantize-inl.h b/src/operator/quantization/dequantize-inl.h index 86dbeb13abec..5cb1c3ac917d 100644 --- a/src/operator/quantization/dequantize-inl.h +++ b/src/operator/quantization/dequantize-inl.h @@ -98,7 +98,7 @@ inline bool DequantizeType(const nnvm::NodeAttrs& attrs, template class DequantizeOperator { public: - 
DequantizeOperator(const nnvm::NodeAttrs &attrs) : attrs_(attrs) {} + explicit DequantizeOperator(const nnvm::NodeAttrs &attrs) : attrs_(attrs) {} void Forward(const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { using namespace mshadow; diff --git a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h index c142d3832004..093a9582be43 100644 --- a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h +++ b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h @@ -34,10 +34,11 @@ namespace mxnet { namespace op { -class SgMKLDNNDequantizeOperator : public DequantizeOperator { + +class SgMKLDNNDequantizeOperator { public: explicit SgMKLDNNDequantizeOperator(const nnvm::NodeAttrs &attrs) - : DequantizeOperator(attrs), param_(nnvm::get(attrs.parsed)) {} + : param_(nnvm::get(attrs.parsed)) {} void Forward(const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs); @@ -124,7 +125,13 @@ static void SgMKLDNNDequantizeForward(const OpStatePtr &state_ptr, const OpConte static OpStatePtr CreateSgMKLDNNDequantizeState(const nnvm::NodeAttrs &attrs, Context ctx, const std::vector &in_shapes, const std::vector &in_types) { - return OpStatePtr::Create(attrs); + OpStatePtr state; + if (ctx.dev_type == kGPU) { + state = OpStatePtr::Create>(attrs); + } else { + state = OpStatePtr::Create(attrs); + } + return state; } } // namespace op diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h index 117195584b68..d93a541bcaa0 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h +++ b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h @@ -34,10 +34,10 @@ namespace mxnet { namespace op { -class SgMKLDNNQuantizeOperator : public QuantizeV2Operator { +class SgMKLDNNQuantizeOperator { public: explicit SgMKLDNNQuantizeOperator(const nnvm::NodeAttrs &attrs) - : QuantizeV2Operator(attrs), param_(nnvm::get(attrs.parsed)) {} + : param_(nnvm::get(attrs.parsed)) {} void Forward(const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs); @@ -173,7 +173,13 @@ static void SgMKLDNNQuantizeForward(const OpStatePtr &state_ptr, const OpContext static OpStatePtr CreateSgMKLDNNQuantizeState(const nnvm::NodeAttrs &attrs, Context ctx, const std::vector &in_shapes, const std::vector &in_types) { - return OpStatePtr::Create(attrs); + OpStatePtr state; + if (ctx.dev_type == kGPU) { + state = OpStatePtr::Create>(attrs); + } else { + state = OpStatePtr::Create(attrs); + } + return state; } } // namespace op diff --git a/src/operator/quantization/quantize_v2-inl.h b/src/operator/quantization/quantize_v2-inl.h index 31a10fd54ec0..b87495daa22c 100644 --- a/src/operator/quantization/quantize_v2-inl.h +++ b/src/operator/quantization/quantize_v2-inl.h @@ -159,7 +159,7 @@ static inline bool QuantizeV2Type(const nnvm::NodeAttrs &attrs, std::vector template class QuantizeV2Operator { public: - QuantizeV2Operator(const nnvm::NodeAttrs &attrs) : attrs_(attrs) {} + explicit QuantizeV2Operator(const nnvm::NodeAttrs &attrs) : attrs_(attrs) {} void Forward(const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { From b904f0dc038c91424701da6a02de1b5b6b2d0296 Mon Sep 17 00:00:00 2001 From: Zhennan Qin Date: Tue, 9 Apr 2019 09:14:30 +0800 Subject: [PATCH 3/6] Fix build --- src/operator/quantization/dequantize-inl.h 
| 12 ---------- src/operator/quantization/dequantize.cc | 22 +++++++++++++++---- .../mkldnn/mkldnn_dequantize-inl.h | 12 +--------- .../mkldnn/mkldnn_quantize_v2-inl.h | 12 ---------- src/operator/quantization/quantize_v2-inl.h | 12 ---------- src/operator/quantization/quantize_v2.cc | 22 +++++++++++++++---- 6 files changed, 37 insertions(+), 55 deletions(-) diff --git a/src/operator/quantization/dequantize-inl.h b/src/operator/quantization/dequantize-inl.h index 5cb1c3ac917d..fd6da3f89b5e 100644 --- a/src/operator/quantization/dequantize-inl.h +++ b/src/operator/quantization/dequantize-inl.h @@ -125,18 +125,6 @@ class DequantizeOperator { nnvm::NodeAttrs attrs_; }; -static OpStatePtr CreateDequantizeState(const nnvm::NodeAttrs &attrs, Context ctx, - const std::vector &in_shapes, - const std::vector &in_types) { - OpStatePtr state; - if (ctx.dev_type == kGPU) { - state = OpStatePtr::Create>(attrs); - } else { - state = OpStatePtr::Create>(attrs); - } - return state; -} - template static void DequantizeForward(const OpStatePtr &state_ptr, const OpContext &ctx, const std::vector &inputs, const std::vector &req, diff --git a/src/operator/quantization/dequantize.cc b/src/operator/quantization/dequantize.cc index ed3bbdcbb845..dd433e41f69c 100644 --- a/src/operator/quantization/dequantize.cc +++ b/src/operator/quantization/dequantize.cc @@ -48,6 +48,22 @@ bool DequantizeStorageType(const nnvm::NodeAttrs& attrs, return true; } +static OpStatePtr CreateDequantizeState(const nnvm::NodeAttrs &attrs, Context ctx, + const std::vector &in_shapes, + const std::vector &in_types) { + OpStatePtr state; + if (ctx.dev_type == kGPU) { + state = OpStatePtr::Create>(attrs); + } else { +#if MXNET_USE_MKLDNN == 1 + state = OpStatePtr::Create(attrs); +#else + state = OpStatePtr::Create>(attrs); +#endif + } + return state; +} + NNVM_REGISTER_OP(_contrib_dequantize) .describe(R"code(Dequantize the input tensor into a float tensor. min_range and max_range are scalar floats that specify the range for @@ -74,14 +90,12 @@ by keep zero centered for the quantized value: // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, // will be reverted after the improvement of CachedOP is done. 
.set_attr("FGradient", MakeZeroGradNodes) +.set_attr("FCreateOpState", CreateDequantizeState) #if MXNET_USE_MKLDNN == 1 .set_attr("TIsMKLDNN", true) -.set_attr("FCreateOpState", CreateSgMKLDNNDequantizeState) .set_attr("FStatefulComputeEx", SgMKLDNNDequantizeForward) -#else -.set_attr("FCreateOpState", CreateDequantizeState) -.set_attr("FStatefulCompute", DequantizeForward) #endif +.set_attr("FStatefulCompute", DequantizeForward) .add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type `uint8`") .add_argument("min_range", "NDArray-or-Symbol", "The minimum scalar value " "possibly produced for the input in float32") diff --git a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h index 093a9582be43..5600961df77c 100644 --- a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h +++ b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h @@ -122,17 +122,7 @@ static void SgMKLDNNDequantizeForward(const OpStatePtr &state_ptr, const OpConte op.Forward(ctx, inputs, req, outputs); } -static OpStatePtr CreateSgMKLDNNDequantizeState(const nnvm::NodeAttrs &attrs, Context ctx, - const std::vector &in_shapes, - const std::vector &in_types) { - OpStatePtr state; - if (ctx.dev_type == kGPU) { - state = OpStatePtr::Create>(attrs); - } else { - state = OpStatePtr::Create(attrs); - } - return state; -} + } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h index d93a541bcaa0..544f3deeace7 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h +++ b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h @@ -170,18 +170,6 @@ static void SgMKLDNNQuantizeForward(const OpStatePtr &state_ptr, const OpContext op.Forward(ctx, inputs, req, outputs); } -static OpStatePtr CreateSgMKLDNNQuantizeState(const nnvm::NodeAttrs &attrs, Context ctx, - const std::vector &in_shapes, - const std::vector &in_types) { - OpStatePtr state; - if (ctx.dev_type == kGPU) { - state = OpStatePtr::Create>(attrs); - } else { - state = OpStatePtr::Create(attrs); - } - return state; -} - } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/quantize_v2-inl.h b/src/operator/quantization/quantize_v2-inl.h index b87495daa22c..cfbf3da8921f 100644 --- a/src/operator/quantization/quantize_v2-inl.h +++ b/src/operator/quantization/quantize_v2-inl.h @@ -244,18 +244,6 @@ class QuantizeV2Operator { nnvm::NodeAttrs attrs_; }; -static OpStatePtr CreateQuantizeV2State(const nnvm::NodeAttrs &attrs, Context ctx, - const std::vector &in_shapes, - const std::vector &in_types) { - OpStatePtr state; - if (ctx.dev_type == kGPU) { - state = OpStatePtr::Create>(attrs); - } else { - state = OpStatePtr::Create>(attrs); - } - return state; -} - template static void QuantizeV2Forward(const OpStatePtr &state_ptr, const OpContext &ctx, const std::vector &inputs, const std::vector &req, diff --git a/src/operator/quantization/quantize_v2.cc b/src/operator/quantization/quantize_v2.cc index 95858826a4e0..e9017a58a82c 100644 --- a/src/operator/quantization/quantize_v2.cc +++ b/src/operator/quantization/quantize_v2.cc @@ -47,6 +47,22 @@ static bool QuantizeV2StorageType(const nnvm::NodeAttrs& attrs, const int dev_ma return true; } +static OpStatePtr CreateQuantizeV2State(const nnvm::NodeAttrs& attrs, Context ctx, + const std::vector& in_shapes, + const std::vector& in_types) { + OpStatePtr state; + if (ctx.dev_type == kGPU) { + state = 
OpStatePtr::Create>(attrs); + } else { +#if MXNET_USE_MKLDNN == 1 + state = OpStatePtr::Create(attrs); +#else + state = OpStatePtr::Create>(attrs); +#endif + } + return state; +} + NNVM_REGISTER_OP(_contrib_quantize_v2) .describe(R"code(Quantize a input tensor from float to `out_type`, with user-specified `min_calib_range` and `max_calib_range` or the input range collected at runtime. @@ -86,14 +102,12 @@ If min_calib_range isn't presented, the output type will be int8. // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, // will be reverted after the improvement of CachedOP is done. .set_attr("FGradient", MakeZeroGradNodes) +.set_attr("FCreateOpState", CreateQuantizeV2State) #if MXNET_USE_MKLDNN == 1 .set_attr("TIsMKLDNN", true) -.set_attr("FCreateOpState", CreateSgMKLDNNQuantizeState) .set_attr("FStatefulComputeEx", SgMKLDNNQuantizeForward) -#else -.set_attr("FCreateOpState", CreateQuantizeV2State) -.set_attr("FStatefulCompute", QuantizeV2Forward) #endif +.set_attr("FStatefulCompute", QuantizeV2Forward) .set_attr("FInplaceOption", [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; }) From b695486b1827f69c55019201208ffd3efd102a65 Mon Sep 17 00:00:00 2001 From: Zhennan Qin Date: Tue, 9 Apr 2019 09:30:17 +0800 Subject: [PATCH 4/6] fix gpu build --- src/operator/quantization/dequantize.cu | 2 +- src/operator/quantization/quantize_v2.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/quantization/dequantize.cu b/src/operator/quantization/dequantize.cu index 41b6e7d20494..dee8b2207e01 100644 --- a/src/operator/quantization/dequantize.cu +++ b/src/operator/quantization/dequantize.cu @@ -28,7 +28,7 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_contrib_dequantize) -.set_attr("FStatefulCompute", DequantizeForward); +.set_attr("FStatefulCompute", DequantizeForward); } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/quantize_v2.cu b/src/operator/quantization/quantize_v2.cu index 0707f41ded94..7acdf56082e5 100644 --- a/src/operator/quantization/quantize_v2.cu +++ b/src/operator/quantization/quantize_v2.cu @@ -28,7 +28,7 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_contrib_quantize_v2) -.set_attr("FStatefulCompute", QuantizeV2Forward); +.set_attr("FStatefulCompute", QuantizeV2Forward); } // namespace op } // namespace mxnet From 0d78ae92cca53db115f75aed1ddda3bace5798f0 Mon Sep 17 00:00:00 2001 From: Zhennan Qin Date: Wed, 17 Apr 2019 09:20:16 +0800 Subject: [PATCH 5/6] Fix typo --- .../quantization/mkldnn/mkldnn_dequantize-inl.h | 10 +++++----- src/operator/subgraph/mkldnn/mkldnn_conv.cc | 16 ++++++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h index 5600961df77c..27fa070afbe0 100644 --- a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h +++ b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h @@ -44,7 +44,7 @@ class SgMKLDNNDequantizeOperator { const std::vector &req, const std::vector &outputs); private: - bool initalized_{false}; + bool initialized_{false}; DequantizeParam param_; float cached_data_min_{0.f}; float cached_data_max_{0.f}; @@ -62,10 +62,10 @@ void SgMKLDNNDequantizeOperator::Forward(const OpContext &ctx, const std::vector float data_min = *inputs[1].data().dptr(); float data_max = *inputs[2].data().dptr(); - if (initalized_ && (cached_data_min_ != data_min || cached_data_max_ != data_max)) - initalized_ = false; + if (initialized_ && 
(cached_data_min_ != data_min || cached_data_max_ != data_max)) + initialized_ = false; - if (!initalized_) { + if (!initialized_) { cached_data_min_ = data_min; cached_data_max_ = data_max; float real_range = MaxAbs(cached_data_min_, cached_data_max_); @@ -104,7 +104,7 @@ void SgMKLDNNDequantizeOperator::Forward(const OpContext &ctx, const std::vector i_mem_ = std::make_shared(i_mpd, nullptr); o_mem_ = std::make_shared(o_mpd, nullptr); fwd_pd_ = std::make_shared(reorder_pd, *i_mem_, *o_mem_); - initalized_ = true; + initialized_ = true; } auto o_mem = CreateMKLDNNMem(outputs[0], o_mem_->get_primitive_desc(), req[0]); i_mem_->set_data_handle(i_mem->get_data_handle()); diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv.cc b/src/operator/subgraph/mkldnn/mkldnn_conv.cc index d61b4613602a..e142fae90e97 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_conv.cc +++ b/src/operator/subgraph/mkldnn/mkldnn_conv.cc @@ -175,7 +175,7 @@ class SgMKLDNNConvOperator { const std::vector &outputs); private: - bool initalized_{false}; + bool initialized_{false}; bool inplace_{false}; bool post_requantize_{false}; nnvm::Symbol subgraph_sym_; @@ -235,7 +235,7 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, // Copy inputs[in_sum] into outputs[kOut] in case inplace optimization failed. if (mkldnn_param.with_sum) { - if (!initalized_) { + if (!initialized_) { // TODO(zhennan): Currently, mkldnn fallback mechanism will break inplace option, // which make check (req[kOut] == kWriteInplace) useless. auto in_mkl_mem = inputs[in_sum].GetMKLDNNData(); @@ -257,23 +257,23 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, // Check input change // TODO(zhennan): Only update cached_* changed. - if (initalized_) { + if (initialized_) { if (mkldnn_param.with_bn) { if (weight_ver_ != inputs[in_weight].version() || ((!conv_param.no_bias) && bias_ver_ != inputs[in_bias].version())) { - initalized_ = false; + initialized_ = false; } } - if (initalized_ && mkldnn_param.quantized) { + if (initialized_ && mkldnn_param.quantized) { if (cached_data_min_ != data_min || cached_data_max_ != data_max || cached_sum_min_ != sum_min || cached_sum_max_ != sum_max || weight_ver_ != inputs[in_weight].version() || ((!conv_param.no_bias) && bias_ver_ != inputs[in_bias].version())) { - initalized_ = false; + initialized_ = false; } } } - if (!initalized_) { + if (!initialized_) { cached_data_min_ = data_min; cached_data_max_ = data_max; cached_sum_min_ = sum_min; @@ -353,7 +353,7 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, fwd_->SetNewMem(*data.GetMKLDNNData(), *cached_weight_.GetMKLDNNData(), has_bias ? 
cached_bias_.GetMKLDNNData() : nullptr,
                     *output.GetMKLDNNData());
-    initalized_ = true;
+    initialized_ = true;
   }
 
   if (mkldnn_param.quantized) {

From b976a050474e1b8ae64a03756d88232c3141890f Mon Sep 17 00:00:00 2001
From: Zhennan Qin
Date: Wed, 17 Apr 2019 13:32:02 +0800
Subject: [PATCH 6/6] Move check to online calibration

---
 src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
index 544f3deeace7..2da415877b8b 100644
--- a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
+++ b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
@@ -102,6 +102,9 @@ void SgMKLDNNQuantizeOperator::Forward(const OpContext &ctx, const std::vector<
       if (data_maxs[i] > data_max) data_max = data_maxs[i];
       if (data_mins[i] < data_min) data_min = data_mins[i];
     }
+
+    if (initalized_ && (cached_data_min_ != data_min || cached_data_max_ != data_max))
+      initalized_ = false;
   }

   // Write output min/max
   auto out_type = GetOutputType(param_);
   if (out_type == mshadow::kUint8) {
     quantized_range = kUint8Range;
     *outputs[1].data().dptr<float>() = data_min;
     *outputs[2].data().dptr<float>() = data_max;
   } else if (out_type == mshadow::kInt8) {
     float real_range = MaxAbs(data_min, data_max);
     quantized_range = kInt8Range;
     *outputs[1].data().dptr<float>()
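
Every patch in this series applies the same conversion: delete the stateless FCompute/FComputeEx kernel, move its body into an operator class that caches whatever is worth keeping across forward calls (calibration min/max, MKL-DNN memory objects, the reorder primitive), then register a state constructor and a stateful forward that unpacks the state. The sketch below distills that pattern; the operator name and registration are hypothetical stand-ins, and the surrounding types (OpStatePtr, OpContext, TBlob) come from the MXNet v1.x operator headers, so it compiles only inside the MXNet source tree — it is not code from this series.

// Minimal sketch of the stateful-operator pattern used throughout this series.
// `MyStatefulOp` and `_contrib_my_stateful_op` are hypothetical names.
class MyStatefulOp {
 public:
  explicit MyStatefulOp(const nnvm::NodeAttrs &attrs) : attrs_(attrs) {}

  void Forward(const OpContext &ctx, const std::vector<TBlob> &inputs,
               const std::vector<OpReqType> &req, const std::vector<TBlob> &outputs) {
    const float data_min = *inputs[1].dptr<float>();
    const float data_max = *inputs[2].dptr<float>();
    // Rebuild cached resources only when the input range changed; otherwise
    // reuse whatever the previous forward call set up.
    if (!initialized_ || cached_min_ != data_min || cached_max_ != data_max) {
      cached_min_ = data_min;
      cached_max_ = data_max;
      // ... expensive one-time setup (e.g. an MKL-DNN reorder primitive) ...
      initialized_ = true;
    }
    // ... cheap per-call work that reuses the cached resources ...
  }

 private:
  nnvm::NodeAttrs attrs_;
  bool initialized_{false};
  float cached_min_{0.f};
  float cached_max_{0.f};
};

// FCreateOpState: called once per graph node to build the state object.
static OpStatePtr CreateMyStatefulOpState(const nnvm::NodeAttrs &attrs, Context ctx,
                                          const std::vector<TShape> &in_shapes,
                                          const std::vector<int> &in_types) {
  return OpStatePtr::Create<MyStatefulOp>(attrs);
}

// FStatefulCompute: called on every forward pass with the persistent state.
static void MyStatefulOpForward(const OpStatePtr &state_ptr, const OpContext &ctx,
                                const std::vector<TBlob> &inputs,
                                const std::vector<OpReqType> &req,
                                const std::vector<TBlob> &outputs) {
  state_ptr.get_state<MyStatefulOp>().Forward(ctx, inputs, req, outputs);
}

NNVM_REGISTER_OP(_contrib_my_stateful_op)
.set_num_inputs(3)   // data, min_range, max_range
.set_num_outputs(1)
.set_attr<FCreateOpState>("FCreateOpState", CreateMyStatefulOpState)
.set_attr<FStatefulCompute>("FStatefulCompute<cpu>", MyStatefulOpForward);

The payoff is what patch 6 adjusts: because the state persists, the cache-invalidation check (comparing the cached min/max against the freshly computed range) decides whether the reorder primitive must be rebuilt, so the expensive setup runs only when the calibration range actually changes rather than on every forward call.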