From 49b189fac234312ef91f5443196901c62850ab80 Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Tue, 23 Oct 2018 15:40:37 +0800
Subject: [PATCH 01/23] add quantized fully connected support

---
 .../quantized_fully_connected-inl.h           | 157 ++++++++++++++++++
 .../quantization/quantized_fully_connected.cc |  17 +-
 .../python/quantization/test_quantization.py  |  21 +--
 3 files changed, 184 insertions(+), 11 deletions(-)
 create mode 100644 src/operator/quantization/quantized_fully_connected-inl.h

diff --git a/src/operator/quantization/quantized_fully_connected-inl.h b/src/operator/quantization/quantized_fully_connected-inl.h
new file mode 100644
index 000000000000..8c4455cd15b0
--- /dev/null
+++ b/src/operator/quantization/quantized_fully_connected-inl.h
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef MXNET_OPERATOR_QUANTIZATION_QUANTIZED_FULLY_CONNECTED_INL_H_
+#define MXNET_OPERATOR_QUANTIZATION_QUANTIZED_FULLY_CONNECTED_INL_H_
+
+#include <vector>
+#include "quantization_utils.h"
+#include "../nn/fully_connected-inl.h"
+
+namespace mxnet {
+namespace op {
+
+namespace quantized_fc {
+enum QuantilizedfcOpResource {kTempSpace};
+}
+
+struct QuantizedSumInitKernelWithBias {
+  // init sum data with bias for matrix b (n)
+  MSHADOW_XINLINE static void Map(int i, int32_t *out,
+                                  const int8_t *bias, const float *min_out,
+                                  const float *max_out, const float *min_bias,
+                                  const float *max_bias) {
+    typedef int32_t T1;
+    typedef int8_t T2;
+    using mshadow::red::limits::MinValue;
+    using mshadow::red::limits::MaxValue;
+    float float_for_one_out_quant =
+      MaxAbs(*min_out, *max_out) / static_cast<float>(MaxValue<T1>());
+    float float_for_one_bias_quant =
+      MaxAbs(*min_bias, *max_bias) / static_cast<float>(MaxValue<T2>());
+    if (float_for_one_out_quant != 0) {
+      out[i] = bias[i] * float_for_one_bias_quant /
+          float_for_one_out_quant;
+    } else {
+      LOG(INFO) << "WARNING: QuantizedBiasAddKernel float_for_one_out_quant is 0 !";
+      out[i] = 0;
+    }
+  }
+};
+
+template<typename SrcType>
+void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs,
+                                          const OpContext &ctx,
+                                          const std::vector<NDArray> &in_data,
+                                          const std::vector<OpReqType> &req,
+                                          const std::vector<NDArray> &out_data) {
+#if MSHADOW_USE_MKL == 1
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  using namespace mshadow;
+  using namespace mxnet_op;
+  size_t num_inputs = param.no_bias ? 2 : 3;
+  CHECK_EQ(in_data.size(), num_inputs * 3);
+  CHECK_EQ(out_data.size(), 3U);
+  const NDArray& data = in_data[0];
+  const NDArray& weight = in_data[1];
+  const NDArray& out = out_data[0];
+  TShape dshape = data.shape();
+  TShape wshape = weight.shape();
+  TShape oshape = out.shape();
+  auto output_temp = out.data().dptr<int32_t>();
+  auto weight_temp = weight.data().dptr<SrcType>();
+  auto data_temp = data.data().dptr<SrcType>();
+  const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+  const float alpha = 1.0f;
+  const float beta = 1.0f;
+  const CBLAS_OFFSET offsetc = CblasFixOffset;
+  const MKL_INT8 oa = 0;
+  const MKL_INT8 ob = 0;
+  MKL_INT32 oc = 0;
+  const int m = dshape[0], n = wshape[0], k = dshape.ProdShape(1, dshape.ndim());
+  Stream<cpu> *s = ctx.get_stream<cpu>();
+  // cblas_gemm_s8u8s32 requires the first matrix to be uint8, so
+  // shift data from int8 (from -128 to 127) to uint8 (from 0 to 255)
+  int shift = 128;
+  Tensor<cpu, 1, uint8_t> shiftdata =
+    ctx.requested[quantized_fc::kTempSpace].get_space_typed<cpu, 1, uint8_t>(
+      Shape1(m * k), s);
+  #pragma omp parallel for num_threads(omp_threads)
+  for (int i = 0; i < m * k; ++i) {
+    shiftdata.dptr_[i] = data_temp[i] + shift;
+  }
+
+  Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1,
+    out_data[1].data().dptr<float>(), out_data[2].data().dptr<float>(),
+    in_data[num_inputs].data().dptr<float>(), in_data[num_inputs+1].data().dptr<float>(),
+    in_data[num_inputs+2].data().dptr<float>(), in_data[num_inputs+3].data().dptr<float>());
+  if (!param.no_bias) {
+    const NDArray& bias = in_data[2];
+    Kernel<QuantizedSumInitKernelWithBias, cpu>::Launch(s, n, out.data().dptr<int32_t>(),
+      bias.data().dptr<int8_t>(), out_data[1].data().dptr<float>(),
+      out_data[2].data().dptr<float>(), in_data[7].data().dptr<float>(),
+      in_data[8].data().dptr<float>());
+  } else {
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int i = 0; i < m * n; ++i) {
+      output_temp[i] = 0;
+    }
+  }
+  #pragma omp parallel for num_threads(omp_threads)
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < k; ++j) {
+      output_temp[i] -= shift * weight_temp[i * k + j];
+    }
+  }
+  #pragma omp parallel for num_threads(omp_threads)
+  for (int i = n; i < m * n; ++i) {
+    output_temp[i] = output_temp[i % n];
+  }
+  cblas_gemm_s8u8s32(CblasRowMajor,
+                     CblasNoTrans,
+                     CblasTrans,
+                     offsetc,
+                     m,
+                     n,
+                     k,
+                     alpha,
+                     shiftdata.dptr_,
+                     k,
+                     oa,
+                     weight.data().dptr<SrcType>(),
+                     k,
+                     ob,
+                     beta,
+                     out.data().dptr<int32_t>(),
+                     n,
+                     &oc);
+#else
+  LOG(FATAL) << "s8u8s32 is only supported by MKL BLAS library";
+#endif
+}
+
+NNVM_REGISTER_OP(_contrib_quantized_fully_connected)
+.set_attr<FComputeEx>("FComputeEx",
+                      MKLDNNQuantizedFullyConnectedForward<int8_t>)
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  });
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_QUANTIZATION_QUANTIZED_FULLY_CONNECTED_INL_H_
diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index e334fe7ec9b2..8abeddcf89d6 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -24,6 +24,7 @@
  * \author Ziheng Jiang, Jun Wu
  */
 #include "../nn/fully_connected-inl.h"
+#include "./quantized_fully_connected-inl.h"
 
 namespace mxnet {
 namespace op {
@@ -79,6 +80,20 @@ bool QuantizedFullyConnectedType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
+                                        const int dev_mask,
+                                        DispatchMode* dispatch_mode,
+                                        std::vector<int> *in_attrs,
+                                        std::vector<int> *out_attrs) {
+  *dispatch_mode = DispatchMode::kFCompute;
+  if (dev_mask == mshadow::cpu::kDevMask) {
+    *dispatch_mode = DispatchMode::kFComputeEx;
+  }
+  for (size_t i = 0; i < out_attrs->size(); i++)
+    (*out_attrs)[i] = kDefaultStorage;
+  return true;
+}
+
 NNVM_REGISTER_OP(_contrib_quantized_fully_connected)
 .describe(R"code(Fully Connected operator for input, weight and bias data type of int8,
 and accumulates in type int32 for the output. For each argument, two more arguments of type
@@ -112,6 +127,7 @@ and max thresholds representing the thresholds for quantizing the float32 output
   })
 .set_attr<nnvm::FInferShape>("FInferShape", QuantizedFullyConnectedShape)
 .set_attr<nnvm::FInferType>("FInferType", QuantizedFullyConnectedType)
+.set_attr<FInferStorageType>("FInferStorageType", QuantizedFullyConnectedStorageType)
 .set_attr<FNeedRequantize>("FNeedRequantize", [](const NodeAttrs& attrs) { return true; })
 .add_argument("data", "NDArray-or-Symbol", "Input data.")
 .add_argument("weight", "NDArray-or-Symbol", "weight.")
@@ -135,6 +151,5 @@ NNVM_REGISTER_OP(FullyConnected)
     }
     return node;
   });
-
 }  // namespace op
 }  // namespace mxnet
diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 5ae2c6c398e9..8d33a30f2f73 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -269,10 +269,7 @@ def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_p
 @with_seed()
 def test_quantized_fc():
     def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
-        if mx.current_context().device_type != 'gpu':
-            print('skipped testing quantized_fc on cpu since it is not supported yet')
-            return
-        elif qdtype == 'uint8' and is_test_for_gpu():
+        if qdtype == 'uint8' and is_test_for_gpu():
             print('skipped testing quantized_fc for gpu uint8 since it is not supported yet')
             return
 
@@ -283,16 +280,16 @@ def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
         fc_fp32_exe = fc_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
         if qdtype == 'uint8':
             data_low = 0.0
-            data_high = 127.0
+            data_high = 63.0
         else:
-            data_low = -127.0
-            data_high = 127.0
+            data_low = -63.0
+            data_high = 63.0
         fc_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                      shape=data_shape).astype('int32')
-        fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+        fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                      shape=arg_shapes[1]).astype('int32')
         if not no_bias:
-            fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+            fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                          shape=arg_shapes[2]).astype('int32')
 
         output = fc_fp32_exe.forward()[0]
@@ -335,6 +332,10 @@ def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
         check_quantized_fc((32, 111, 2, 2), 100, True, qdtype)
         check_quantized_fc((32, 512, 2, 2), 100, False, qdtype)
         check_quantized_fc((32, 111, 2, 2), 100, False, qdtype)
+        check_quantized_fc((256, 2048, 2, 2), 800, False, qdtype)
+        check_quantized_fc((256, 111, 2, 2), 800, False, qdtype)
+        check_quantized_fc((256, 2048, 2, 2), 800, True, qdtype)
+        check_quantized_fc((256, 111, 2, 2), 800, True, qdtype)
 
 @with_seed()
 def test_quantized_flatten():
@@ -632,4 +633,4 @@ def get_threshold(nd):
 
 if __name__ == "__main__":
     import nose
-    nose.runmodule()
+    nose.runmodule()
\ No newline at end of file

From a2bfef468f654c9610692b7cf6aabd753a71c60f Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Tue, 23 Oct 2018 20:49:57 +0800
Subject: [PATCH 02/23] disable qfc cpu case since s8u8s32 is only supported by MKL BLAS library

---
 tests/python/quantization/test_quantization.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 8d33a30f2f73..15ed3c8363bf 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -269,7 +269,10 @@ def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_p
 @with_seed()
 def test_quantized_fc():
     def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
-        if qdtype == 'uint8' and is_test_for_gpu():
+        if mx.current_context().device_type != 'gpu':
+            print('skipped testing quantized_fc on cpu since s8u8s32 is only supported by MKL BLAS library')
+            return
+        elif qdtype == 'uint8' and is_test_for_gpu():
             print('skipped testing quantized_fc for gpu uint8 since it is not supported yet')
             return
 

From 91f1a9b2abb4ca19560e2a605dc55c867ef2e658 Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Wed, 24 Oct 2018 08:54:56 +0800
Subject: [PATCH 03/23] retrigger the ci testing

---
 tests/python/quantization/test_quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 15ed3c8363bf..ed2bb3ad4410 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -636,4 +636,4 @@ def get_threshold(nd):
 
 if __name__ == "__main__":
     import nose
-    nose.runmodule()
\ No newline at end of file
+    nose.runmodule()

From b8e82574d38ac1ff45ce08b27ce425ea84909868 Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Mon, 29 Oct 2018 10:46:00 +0800
Subject: [PATCH 04/23] move implementation to cc file and add
 STORAGE_TYPE_ASSIGN_CHECK

---
 .../quantized_fully_connected-inl.h           | 157 ------------------
 .../quantization/quantized_fully_connected.cc | 143 +++++++++++++++-
 2 files changed, 141 insertions(+), 159 deletions(-)
 delete mode 100644 src/operator/quantization/quantized_fully_connected-inl.h

diff --git a/src/operator/quantization/quantized_fully_connected-inl.h b/src/operator/quantization/quantized_fully_connected-inl.h
deleted file mode 100644
index 8c4455cd15b0..000000000000
--- a/src/operator/quantization/quantized_fully_connected-inl.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-#ifndef MXNET_OPERATOR_QUANTIZATION_QUANTIZED_FULLY_CONNECTED_INL_H_
-#define MXNET_OPERATOR_QUANTIZATION_QUANTIZED_FULLY_CONNECTED_INL_H_
-
-#include <vector>
-#include "quantization_utils.h"
-#include "../nn/fully_connected-inl.h"
-
-namespace mxnet {
-namespace op {
-
-namespace quantized_fc {
-enum QuantilizedfcOpResource {kTempSpace};
-}
-
-struct QuantizedSumInitKernelWithBias {
-  // init sum data with bias for matrix b (n)
-  MSHADOW_XINLINE static void Map(int i, int32_t *out,
-                                  const int8_t *bias, const float *min_out,
-                                  const float *max_out, const float *min_bias,
-                                  const float *max_bias) {
-    typedef int32_t T1;
-    typedef int8_t T2;
-    using mshadow::red::limits::MinValue;
-    using mshadow::red::limits::MaxValue;
-    float float_for_one_out_quant =
-      MaxAbs(*min_out, *max_out) / static_cast<float>(MaxValue<T1>());
-    float float_for_one_bias_quant =
-      MaxAbs(*min_bias, *max_bias) / static_cast<float>(MaxValue<T2>());
-    if (float_for_one_out_quant != 0) {
-      out[i] = bias[i] * float_for_one_bias_quant /
-          float_for_one_out_quant;
-    } else {
-      LOG(INFO) << "WARNING: QuantizedBiasAddKernel float_for_one_out_quant is 0 !";
-      out[i] = 0;
-    }
-  }
-};
-
-template<typename SrcType>
-void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs,
-                                          const OpContext &ctx,
-                                          const std::vector<NDArray> &in_data,
-                                          const std::vector<OpReqType> &req,
-                                          const std::vector<NDArray> &out_data) {
-#if MSHADOW_USE_MKL == 1
-  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
-  using namespace mshadow;
-  using namespace mxnet_op;
-  size_t num_inputs = param.no_bias ? 2 : 3;
-  CHECK_EQ(in_data.size(), num_inputs * 3);
-  CHECK_EQ(out_data.size(), 3U);
-  const NDArray& data = in_data[0];
-  const NDArray& weight = in_data[1];
-  const NDArray& out = out_data[0];
-  TShape dshape = data.shape();
-  TShape wshape = weight.shape();
-  TShape oshape = out.shape();
-  auto output_temp = out.data().dptr<int32_t>();
-  auto weight_temp = weight.data().dptr<SrcType>();
-  auto data_temp = data.data().dptr<SrcType>();
-  const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
-  const float alpha = 1.0f;
-  const float beta = 1.0f;
-  const CBLAS_OFFSET offsetc = CblasFixOffset;
-  const MKL_INT8 oa = 0;
-  const MKL_INT8 ob = 0;
-  MKL_INT32 oc = 0;
-  const int m = dshape[0], n = wshape[0], k = dshape.ProdShape(1, dshape.ndim());
-  Stream<cpu> *s = ctx.get_stream<cpu>();
-  // cblas_gemm_s8u8s32 requires the first matrix to be uint8, so
-  // shift data from int8 (from -128 to 127) to uint8 (from 0 to 255)
-  int shift = 128;
-  Tensor<cpu, 1, uint8_t> shiftdata =
-    ctx.requested[quantized_fc::kTempSpace].get_space_typed<cpu, 1, uint8_t>(
-      Shape1(m * k), s);
-  #pragma omp parallel for num_threads(omp_threads)
-  for (int i = 0; i < m * k; ++i) {
-    shiftdata.dptr_[i] = data_temp[i] + shift;
-  }
-
-  Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1,
-    out_data[1].data().dptr<float>(), out_data[2].data().dptr<float>(),
-    in_data[num_inputs].data().dptr<float>(), in_data[num_inputs+1].data().dptr<float>(),
-    in_data[num_inputs+2].data().dptr<float>(), in_data[num_inputs+3].data().dptr<float>());
-  if (!param.no_bias) {
-    const NDArray& bias = in_data[2];
-    Kernel<QuantizedSumInitKernelWithBias, cpu>::Launch(s, n, out.data().dptr<int32_t>(),
-      bias.data().dptr<int8_t>(), out_data[1].data().dptr<float>(),
-      out_data[2].data().dptr<float>(), in_data[7].data().dptr<float>(),
-      in_data[8].data().dptr<float>());
-  } else {
-    #pragma omp parallel for num_threads(omp_threads)
-    for (int i = 0; i < m * n; ++i) {
-      output_temp[i] = 0;
-    }
-  }
-  #pragma omp parallel for num_threads(omp_threads)
-  for (int i = 0; i < n; ++i) {
-    for (int j = 0; j < k; ++j) {
-      output_temp[i] -= shift * weight_temp[i * k + j];
-    }
-  }
-  #pragma omp parallel for num_threads(omp_threads)
-  for (int i = n; i < m * n; ++i) {
-    output_temp[i] = output_temp[i % n];
-  }
-  cblas_gemm_s8u8s32(CblasRowMajor,
-                     CblasNoTrans,
-                     CblasTrans,
-                     offsetc,
-                     m,
-                     n,
-                     k,
-                     alpha,
-                     shiftdata.dptr_,
-                     k,
-                     oa,
-                     weight.data().dptr<SrcType>(),
-                     k,
-                     ob,
-                     beta,
-                     out.data().dptr<int32_t>(),
-                     n,
-                     &oc);
-#else
-  LOG(FATAL) << "s8u8s32 is only supported by MKL BLAS library";
-#endif
-}
-
-NNVM_REGISTER_OP(_contrib_quantized_fully_connected)
-.set_attr<FComputeEx>("FComputeEx",
-                      MKLDNNQuantizedFullyConnectedForward<int8_t>)
-.set_attr<FResourceRequest>("FResourceRequest",
-  [](const NodeAttrs& attrs) {
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-  });
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_QUANTIZATION_QUANTIZED_FULLY_CONNECTED_INL_H_
diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 8abeddcf89d6..16222fff20d3 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -23,12 +23,17 @@
  * \brief
  * \author Ziheng Jiang, Jun Wu
  */
+#include <vector>
+#include "quantization_utils.h"
 #include "../nn/fully_connected-inl.h"
-#include "./quantized_fully_connected-inl.h"
 
 namespace mxnet {
 namespace op {
 
+namespace quantized_fc {
+enum QuantilizedfcOpResource {kTempSpace};
+}
+
 bool QuantizedFullyConnectedShape(const nnvm::NodeAttrs& attrs,
                                   std::vector<TShape> *in_shape,
                                   std::vector<TShape> *out_shape) {
@@ -89,16 +94,139 @@ bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
                                         const int dev_mask,
                                         DispatchMode* dispatch_mode,
                                         std::vector<int> *in_attrs,
                                         std::vector<int> *out_attrs) {
   *dispatch_mode = DispatchMode::kFCompute;
   if (dev_mask == mshadow::cpu::kDevMask) {
     *dispatch_mode = DispatchMode::kFComputeEx;
   }
-  for (size_t i = 0; i < out_attrs->size(); i++)
+  for (size_t i = 0; i < out_attrs->size(); i++) {
     (*out_attrs)[i] = kDefaultStorage;
+    STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, i, kDefaultStorage);
+    if (common::stype_string((*out_attrs)[i]).compare("unknown") == 0) {
+      return false;
+    }
+  }
+
+  for (size_t i = 0; i < in_attrs->size(); i++) {
+    (*in_attrs)[i] = kDefaultStorage;
+    STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, i, kDefaultStorage);
+    if (common::stype_string((*in_attrs)[i]).compare("unknown") == 0) {
+      return false;
+    }
+  }
+
   return true;
 }
 
+struct QuantizedSumInitKernelWithBias {
+  // init sum data with bias for matrix b (n)
+  MSHADOW_XINLINE static void Map(int i, int32_t *out,
+                                  const int8_t *bias, const float *min_out,
+                                  const float *max_out, const float *min_bias,
+                                  const float *max_bias) {
+    typedef int32_t T1;
+    typedef int8_t T2;
+    using mshadow::red::limits::MinValue;
+    using mshadow::red::limits::MaxValue;
+    float float_for_one_out_quant =
+      MaxAbs(*min_out, *max_out) / static_cast<float>(MaxValue<T1>());
+    float float_for_one_bias_quant =
+      MaxAbs(*min_bias, *max_bias) / static_cast<float>(MaxValue<T2>());
+    if (float_for_one_out_quant != 0) {
+      out[i] = bias[i] * float_for_one_bias_quant /
+          float_for_one_out_quant;
+    } else {
+      LOG(INFO) << "WARNING: QuantizedBiasAddKernel float_for_one_out_quant is 0 !";
+      out[i] = 0;
+    }
+  }
+};
+
+template<typename SrcType>
+void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs,
+                                          const OpContext &ctx,
+                                          const std::vector<NDArray> &in_data,
+                                          const std::vector<OpReqType> &req,
+                                          const std::vector<NDArray> &out_data) {
+#if MSHADOW_USE_MKL == 1
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  using namespace mshadow;
+  using namespace mxnet_op;
+  size_t num_inputs = param.no_bias ? 2 : 3;
+  CHECK_EQ(in_data.size(), num_inputs * 3);
+  CHECK_EQ(out_data.size(), 3U);
+  const NDArray& data = in_data[0];
+  const NDArray& weight = in_data[1];
+  const NDArray& out = out_data[0];
+  TShape dshape = data.shape();
+  TShape wshape = weight.shape();
+  TShape oshape = out.shape();
+  auto output_temp = out.data().dptr<int32_t>();
+  auto weight_temp = weight.data().dptr<SrcType>();
+  auto data_temp = data.data().dptr<SrcType>();
+  const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+  const float alpha = 1.0f;
+  const float beta = 1.0f;
+  const CBLAS_OFFSET offsetc = CblasFixOffset;
+  const MKL_INT8 oa = 0;
+  const MKL_INT8 ob = 0;
+  MKL_INT32 oc = 0;
+  const int m = dshape[0], n = wshape[0], k = dshape.ProdShape(1, dshape.ndim());
+  Stream<cpu> *s = ctx.get_stream<cpu>();
+  // cblas_gemm_s8u8s32 requires the first matrix to be uint8, so
+  // shift data from int8 (from -128 to 127) to uint8 (from 0 to 255)
+  int shift = 128;
+  Tensor<cpu, 1, uint8_t> shiftdata =
+    ctx.requested[quantized_fc::kTempSpace].get_space_typed<cpu, 1, uint8_t>(
+      Shape1(m * k), s);
+  #pragma omp parallel for num_threads(omp_threads)
+  for (int i = 0; i < m * k; ++i) {
+    shiftdata.dptr_[i] = data_temp[i] + shift;
+  }
+
+  Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1,
+    out_data[1].data().dptr<float>(), out_data[2].data().dptr<float>(),
+    in_data[num_inputs].data().dptr<float>(), in_data[num_inputs+1].data().dptr<float>(),
+    in_data[num_inputs+2].data().dptr<float>(), in_data[num_inputs+3].data().dptr<float>());
+  if (!param.no_bias) {
+    const NDArray& bias = in_data[2];
+    Kernel<QuantizedSumInitKernelWithBias, cpu>::Launch(s, n, out.data().dptr<int32_t>(),
+      bias.data().dptr<int8_t>(), out_data[1].data().dptr<float>(),
+      out_data[2].data().dptr<float>(), in_data[7].data().dptr<float>(),
+      in_data[8].data().dptr<float>());
+  } else {
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int i = 0; i < m * n; ++i) {
+      output_temp[i] = 0;
+    }
+  }
+  #pragma omp parallel for num_threads(omp_threads)
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < k; ++j) {
+      output_temp[i] -= shift * weight_temp[i * k + j];
+    }
+  }
+  #pragma omp parallel for num_threads(omp_threads)
+  for (int i = n; i < m * n; ++i) {
+    output_temp[i] = output_temp[i % n];
+  }
+  cblas_gemm_s8u8s32(CblasRowMajor,
+                     CblasNoTrans,
+                     CblasTrans,
+                     offsetc,
+                     m,
+                     n,
+                     k,
+                     alpha,
+                     shiftdata.dptr_,
+                     k,
+                     oa,
+                     weight.data().dptr<SrcType>(),
+                     k,
+                     ob,
+                     beta,
+                     out.data().dptr<int32_t>(),
+                     n,
+                     &oc);
+#else
+  LOG(FATAL) << "s8u8s32 is only supported by MKL BLAS library";
+#endif
+}
+
 NNVM_REGISTER_OP(_contrib_quantized_fully_connected)
 .describe(R"code(Fully Connected operator for input, weight and bias data type of int8,
 and accumulates in type int32 for the output. For each argument, two more arguments of type
@@ -129,6 +262,12 @@ and max thresholds representing the thresholds for quantizing the float32 output
 .set_attr<nnvm::FInferType>("FInferType", QuantizedFullyConnectedType)
 .set_attr<FInferStorageType>("FInferStorageType", QuantizedFullyConnectedStorageType)
 .set_attr<FNeedRequantize>("FNeedRequantize", [](const NodeAttrs& attrs) { return true; })
+.set_attr<FComputeEx>("FComputeEx",
+                      MKLDNNQuantizedFullyConnectedForward<int8_t>)
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
 .add_argument("data", "NDArray-or-Symbol", "Input data.")
 .add_argument("weight", "NDArray-or-Symbol", "weight.")
 .add_argument("bias", "NDArray-or-Symbol", "bias.")

From 471a2dced98974928e7b435656d308fc80b1c3d2 Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Tue, 30 Oct 2018 09:35:03 +0800
Subject: [PATCH 05/23] fix typo bug

---
 src/operator/quantization/quantized_fully_connected.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 16222fff20d3..587c16e3b470 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -95,7 +95,6 @@ bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
     *dispatch_mode = DispatchMode::kFComputeEx;
   }
   for (size_t i = 0; i < out_attrs->size(); i++) {
-    (*out_attrs)[i] = kDefaultStorage;
     STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, i, kDefaultStorage);
     if (common::stype_string((*out_attrs)[i]).compare("unknown") == 0) {
       return false;
@@ -103,7 +102,6 @@ bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
   }
 
   for (size_t i = 0; i < in_attrs->size(); i++) {
-    (*in_attrs)[i] = kDefaultStorage;
     STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, i, kDefaultStorage);
     if (common::stype_string((*in_attrs)[i]).compare("unknown") == 0) {
       return false;

From 7b64226ffd532215e37427c49c47b8cf80ce3fc0 Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Tue, 30 Oct 2018 12:12:05 +0800
Subject: [PATCH 06/23] retrigger the ci test

---
 src/operator/quantization/quantized_fully_connected.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 587c16e3b470..621d1b34bb00 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -100,7 +100,6 @@ bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
       return false;
     }
   }
-
   for (size_t i = 0; i < in_attrs->size(); i++) {
     STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, i, kDefaultStorage);
     if (common::stype_string((*in_attrs)[i]).compare("unknown") == 0) {

From 1dbc10611ffaa161f91d9bbcc786a67c18d9e0da Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Sat, 3 Nov 2018 13:17:54 +0800
Subject: [PATCH 07/23] fix typo bug

---
 .../quantization/quantized_fully_connected.cc | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 621d1b34bb00..91d2ef0433ee 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -31,7 +31,7 @@ namespace mxnet {
 namespace op {
 
 namespace quantized_fc {
-enum QuantilizedfcOpResource {kTempSpace};
+enum QuantizedfcOpResource {kTempSpace};
 }
 
 bool QuantizedFullyConnectedShape(const nnvm::NodeAttrs& attrs,
@@ -121,20 +121,21 @@ struct QuantizedSumInitKernelWithBias {
     using mshadow::red::limits::MinValue;
     using mshadow::red::limits::MaxValue;
     float float_for_one_out_quant =
-      MaxAbs(*min_out, *max_out) / static_cast<float>(MaxValue<T1>());
+        MaxAbs(*min_out, *max_out) / static_cast<float>(MaxValue<T1>());
     float float_for_one_bias_quant =
-      MaxAbs(*min_bias, *max_bias) / static_cast<float>(MaxValue<T2>());
+        MaxAbs(*min_bias, *max_bias) / static_cast<float>(MaxValue<T2>());
     if (float_for_one_out_quant != 0) {
       out[i] = bias[i] * float_for_one_bias_quant /
-          float_for_one_out_quant;
+        float_for_one_out_quant;
     } else {
       LOG(INFO) << "WARNING: QuantizedBiasAddKernel float_for_one_out_quant is 0 !";
       out[i] = 0;
     }
   }
 };
+
 template<typename SrcType>
-void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs,
+void QuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs,
                                           const OpContext &ctx,
@@ -260,7 +261,7 @@ and max thresholds representing the thresholds for quantizing the float32 output
 .set_attr<FInferStorageType>("FInferStorageType", QuantizedFullyConnectedStorageType)
 .set_attr<FNeedRequantize>("FNeedRequantize", [](const NodeAttrs& attrs) { return true; })
 .set_attr<FComputeEx>("FComputeEx",
-                      MKLDNNQuantizedFullyConnectedForward<int8_t>)
+                      QuantizedFullyConnectedForward<int8_t>)
 .set_attr<FResourceRequest>("FResourceRequest",
   [](const NodeAttrs& attrs) {
     return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};

From babc764add79983181296d2e23a0c2cc42e3636d Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Sat, 3 Nov 2018 15:03:46 +0800
Subject: [PATCH 08/23] retrigger ci

---
 src/operator/quantization/quantized_fully_connected.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 91d2ef0433ee..494d782b3e05 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -106,7 +106,6 @@ bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
       return false;
     }
   }
-
   return true;
 }

From d365b64e437929a66e02e323c2b22e3234be6cc8 Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Mon, 5 Nov 2018 08:51:32 +0800
Subject: [PATCH 09/23] retrigger the ci test

---
 src/operator/quantization/quantized_fully_connected.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 494d782b3e05..91d2ef0433ee 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -106,6 +106,7 @@ bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
       return false;
     }
   }
+
   return true;
 }

From 1010deb787c8d41b8d66c66cbf960aab4197d488 Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Tue, 6 Nov 2018 08:46:03 +0800
Subject: [PATCH 10/23] retrigger the ci

---
 src/operator/quantization/quantized_fully_connected.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 91d2ef0433ee..494d782b3e05 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -106,7 +106,6 @@ bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
       return false;
     }
   }
-
   return true;
 }

From 818021d2b6074c40b63c8a76d492ea5741edb24c Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Wed, 7 Nov 2018 09:49:53 +0800
Subject: [PATCH 11/23] retrigger the ci test

---
 src/operator/quantization/quantized_fully_connected.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 494d782b3e05..91d2ef0433ee 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -106,6 +106,7 @@ bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
       return false;
     }
   }
+
   return true;
 }

From b3df5a6284ba6c4539611e67f5bce66f72932098 Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Thu, 8 Nov 2018 08:51:42 +0800
Subject: [PATCH 12/23] retrigger ci test

---
 src/operator/quantization/quantized_fully_connected.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 91d2ef0433ee..494d782b3e05 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -106,7 +106,6 @@ bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
       return false;
     }
   }
-
   return true;
 }

From b3bf9f7ee9b9a61049ef140c82b4228b77f7bd3a Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Wed, 14 Nov 2018 09:58:27 +0800
Subject: [PATCH 13/23] fix indent issue

---
 src/operator/quantization/quantized_fully_connected.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 494d782b3e05..25643dae6426 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -135,10 +135,10 @@ struct QuantizedSumInitKernelWithBias {
 
 template<typename SrcType>
 void QuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs,
-                                          const OpContext &ctx,
-                                          const std::vector<NDArray> &in_data,
-                                          const std::vector<OpReqType> &req,
-                                          const std::vector<NDArray> &out_data) {
+                                    const OpContext &ctx,
+                                    const std::vector<NDArray> &in_data,
+                                    const std::vector<OpReqType> &req,
+                                    const std::vector<NDArray> &out_data) {
 #if MSHADOW_USE_MKL == 1
   const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
   using namespace mshadow;

From e537fc19a723efb390c56dbf9f8f868784d49fe8 Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Wed, 14 Nov 2018 12:09:28 +0800
Subject: [PATCH 14/23] retrigger the ci

---
 src/operator/quantization/quantized_fully_connected.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 25643dae6426..4a8ea2a92819 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -106,6 +106,7 @@ bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
       return false;
     }
   }
+
   return true;
 }

From 72b81d9aafb0a8b58fb7ff62a616691734f92d3c Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Thu, 15 Nov 2018 09:01:00 +0800
Subject: [PATCH 15/23] retrigger the ci test

---
 src/operator/quantization/quantized_fully_connected.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 4a8ea2a92819..25643dae6426 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -106,7 +106,6 @@ bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
       return false;
     }
   }
-
   return true;
 }

From 1f98f6345c0bf56ffeb6e56933a06f4881fb7ecf Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Mon, 3 Dec 2018 09:53:46 +0800
Subject: [PATCH 16/23] add verbose message

---
 src/operator/quantization/quantized_fully_connected.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 25643dae6426..2a3b430f677b 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -127,7 +127,7 @@ struct QuantizedSumInitKernelWithBias {
       out[i] = bias[i] * float_for_one_bias_quant /
         float_for_one_out_quant;
     } else {
-      LOG(INFO) << "WARNING: QuantizedBiasAddKernel float_for_one_out_quant is 0 !";
+      LOG(INFO) << "WARNING: float_for_one_out_quant is 0, need to check min/max data !";
       out[i] = 0;
     }
   }
@@ -220,7 +220,7 @@ void QuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs,
       n,
       &oc);
 #else
-  LOG(FATAL) << "s8u8s32 is only supported by MKL BLAS library";
+  LOG(FATAL) << "Quantized INT8 cblas_gemm_s8u8s32 is only supported by MKL BLAS library";
 #endif
 }

From 9171b1a486584ea89df132285cd1f53d087bf5c2 Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Tue, 11 Dec 2018 10:07:54 +0800
Subject: [PATCH 17/23] update log message

---
 src/operator/quantization/quantized_fully_connected.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 2a3b430f677b..3e0574f4de6f 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -127,7 +127,8 @@ struct QuantizedSumInitKernelWithBias {
       out[i] = bias[i] * float_for_one_bias_quant /
         float_for_one_out_quant;
     } else {
-      LOG(INFO) << "WARNING: float_for_one_out_quant is 0, need to check min/max data !";
+      LOG(INFO) << "float_for_one_out_quant is 0,"
+          << " need to check why MaxAbs(*min_out, *max_out) of out_data is 0!";
       out[i] = 0;
     }
   }
@@ -220,7 +221,9 @@ void QuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs,
       n,
       &oc);
 #else
-  LOG(FATAL) << "Quantized INT8 cblas_gemm_s8u8s32 is only supported by MKL BLAS library";
+  LOG(FATAL) << "Quantized fully connected operator relies on cblas_gemm_s8u8s32"
+             << " which is only supported by MKL BLAS."
+             << " Please build MXNet with USE_BLAS=mkl to leverage this operator.";
 #endif
 }

From daf75e6eb0e48aacb7b7d66361834c532f97525f Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Tue, 11 Dec 2018 13:05:16 +0800
Subject: [PATCH 18/23] using range for loop

---
 .../quantization/quantized_fully_connected.cc | 30 +++++++++++--------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 3e0574f4de6f..cf302167d48f 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -54,8 +54,9 @@ bool QuantizedFullyConnectedShape(const nnvm::NodeAttrs& attrs,
     SHAPE_ASSIGN_CHECK(*in_shape, 2, bshape);
   }
 
-  for (size_t i = num_inputs; i < 3 * num_inputs; ++i) {
-    SHAPE_ASSIGN_CHECK(*in_shape, i, TShape{1});
+  std::vector<TShape>::iterator in_s;
+  for (in_s = in_shape->begin() + num_inputs; in_s < in_shape->begin() + 3 * num_inputs; in_s++) {
+    *in_s = TShape{1};
   }
 
   SHAPE_ASSIGN_CHECK(*out_shape, 0, TShape({dshape[0], wshape[0]}));
@@ -72,11 +73,12 @@ bool QuantizedFullyConnectedType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_type->size(), num_inputs * 3);
   CHECK_EQ(out_type->size(), 3U);
 
-  for (size_t i = 0; i < num_inputs; ++i) {
-    TYPE_ASSIGN_CHECK(*in_type, i, mshadow::kInt8);
+  std::vector<int>::iterator in_t;
+  for (in_t = in_type->begin(); in_t < in_type->begin() + num_inputs; in_t++) {
+    *in_t = mshadow::kInt8;
   }
-  for (size_t i = num_inputs; i < 3 * num_inputs; ++i) {
-    TYPE_ASSIGN_CHECK(*in_type, i, mshadow::kFloat32);
+  for (in_t = in_type->begin() + num_inputs; in_t < in_type->begin() + 3 * num_inputs; in_t++) {
+    *in_t = mshadow::kFloat32;
   }
 
   TYPE_ASSIGN_CHECK(*out_type, 0, mshadow::kInt32);
@@ -94,18 +96,22 @@ bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
   if (dev_mask == mshadow::cpu::kDevMask) {
     *dispatch_mode = DispatchMode::kFComputeEx;
   }
-  for (size_t i = 0; i < out_attrs->size(); i++) {
-    STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, i, kDefaultStorage);
-    if (common::stype_string((*out_attrs)[i]).compare("unknown") == 0) {
+  std::vector<int>::iterator out_attr;
+  for (out_attr = out_attrs->begin(); out_attr != out_attrs->end(); out_attr++) {
+    *out_attr = kDefaultStorage;
+    if (common::stype_string(*out_attr).compare("unknown") == 0) {
       return false;
     }
  }
-  for (size_t i = 0; i < in_attrs->size(); i++) {
-    STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, i, kDefaultStorage);
-    if (common::stype_string((*in_attrs)[i]).compare("unknown") == 0) {
+
+  std::vector<int>::iterator in_attr;
+  for (in_attr = in_attrs->begin(); in_attr != in_attrs->end(); in_attr++) {
+    *in_attr = kDefaultStorage;
+    if (common::stype_string(*in_attr).compare("unknown") == 0) {
       return false;
     }
   }
+
   return true;
 }

From 1ea0675ca6f484cf44f4eb1a844962c0f4461896 Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Tue, 11 Dec 2018 14:06:59 +0800
Subject: [PATCH 19/23] using for auto range

---
 .../quantization/quantized_fully_connected.cc | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index cf302167d48f..2912a2e06998 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -96,22 +96,20 @@ bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
   if (dev_mask == mshadow::cpu::kDevMask) {
     *dispatch_mode = DispatchMode::kFComputeEx;
   }
-  std::vector<int>::iterator out_attr;
-  for (out_attr = out_attrs->begin(); out_attr != out_attrs->end(); out_attr++) {
-    *out_attr = kDefaultStorage;
-    if (common::stype_string(*out_attr).compare("unknown") == 0) {
+
+  for (auto &v : *out_attrs) {
+    v = kDefaultStorage;
+    if (common::stype_string(v).compare("unknown") == 0) {
       return false;
     }
   }
 
-  std::vector<int>::iterator in_attr;
-  for (in_attr = in_attrs->begin(); in_attr != in_attrs->end(); in_attr++) {
-    *in_attr = kDefaultStorage;
-    if (common::stype_string(*in_attr).compare("unknown") == 0) {
+  for (auto &v : *in_attrs) {
+    v = kDefaultStorage;
+    if (common::stype_string(v).compare("unknown") == 0) {
       return false;
     }
   }
-
   return true;
 }

From c87402eac2c88278c60c8cfdb4cbfcb5ba88951b Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Wed, 12 Dec 2018 10:09:33 +0800
Subject: [PATCH 20/23] enable MKL BLAS ci test

---
 tests/python/quantization/test_quantization.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index ed2bb3ad4410..67a84910d334 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -26,6 +26,7 @@
 from mxnet.module import Module
 from mxnet.io import NDArrayIter
 import unittest
+import operator
 
 def is_test_for_gpu():
     return mx.current_context().device_type == 'gpu'
@@ -270,8 +271,15 @@ def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_p
 def test_quantized_fc():
     def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
         if mx.current_context().device_type != 'gpu':
-            print('skipped testing quantized_fc on cpu since s8u8s32 is only supported by MKL BLAS library')
-            return
+            hasMKL = False
+            for key in os.environ.keys():
+                if operator.eq(key, "BUILD_TAG"):
+                    if os.environ['BUILD_TAG'].find("MKL") != -1:
+                        hasMKL = True
+                    break
+            if not hasMKL:
+                print('skipped testing quantized_fc on cpu since s8u8s32 is only supported by MKL BLAS library')
+                return
         elif qdtype == 'uint8' and is_test_for_gpu():
             print('skipped testing quantized_fc for gpu uint8 since it is not supported yet')
             return

From 88562b9af1282c4ff5ea2ee18cc70debe71414e3 Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Wed, 12 Dec 2018 13:08:35 +0800
Subject: [PATCH 21/23] fix typo issue

---
 src/operator/quantization/quantized_fully_connected.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index 2912a2e06998..dd526ee6ba45 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -132,7 +132,7 @@ struct QuantizedSumInitKernelWithBias {
         float_for_one_out_quant;
     } else {
       LOG(INFO) << "float_for_one_out_quant is 0,"
-          << " need to check why MaxAbs(*min_out, *max_out) of out_data is 0!";
+                << " need to check why MaxAbs(*min_out, *max_out) of out_data is 0!";
       out[i] = 0;
     }
   }
@@ -226,8 +226,8 @@ void QuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs,
       &oc);
 #else
   LOG(FATAL) << "Quantized fully connected operator relies on cblas_gemm_s8u8s32"
-             << " which is only supported by MKL BLAS."
-             << " Please build MXNet with USE_BLAS=mkl to leverage this operator.";
+              << " which is only supported by MKL BLAS."
+              << " Please build MXNet with USE_BLAS=mkl to leverage this operator.";
 #endif
 }

From 54ee001748e1957a102d01542a324b46a220d15c Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Thu, 13 Dec 2018 09:15:02 +0800
Subject: [PATCH 22/23] use TYPE_ASSIGN_CHECK

---
 .../quantization/quantized_fully_connected.cc | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index dd526ee6ba45..d9123d984af2 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -54,9 +54,8 @@ bool QuantizedFullyConnectedShape(const nnvm::NodeAttrs& attrs,
     SHAPE_ASSIGN_CHECK(*in_shape, 2, bshape);
   }
 
-  std::vector<TShape>::iterator in_s;
-  for (in_s = in_shape->begin() + num_inputs; in_s < in_shape->begin() + 3 * num_inputs; in_s++) {
-    *in_s = TShape{1};
+  for (size_t i = num_inputs; i < 3 * num_inputs; ++i) {
+    SHAPE_ASSIGN_CHECK(*in_shape, i, TShape{1});
   }
 
   SHAPE_ASSIGN_CHECK(*out_shape, 0, TShape({dshape[0], wshape[0]}));
@@ -73,12 +72,11 @@ bool QuantizedFullyConnectedType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_type->size(), num_inputs * 3);
   CHECK_EQ(out_type->size(), 3U);
 
-  std::vector<int>::iterator in_t;
-  for (in_t = in_type->begin(); in_t < in_type->begin() + num_inputs; in_t++) {
-    *in_t = mshadow::kInt8;
+  for (size_t i = 0; i < num_inputs; ++i) {
+    TYPE_ASSIGN_CHECK(*in_type, i, mshadow::kInt8);
   }
-  for (in_t = in_type->begin() + num_inputs; in_t < in_type->begin() + 3 * num_inputs; in_t++) {
-    *in_t = mshadow::kFloat32;
+  for (size_t i = num_inputs; i < 3 * num_inputs; ++i) {
+    TYPE_ASSIGN_CHECK(*in_type, i, mshadow::kFloat32);
   }
 
   TYPE_ASSIGN_CHECK(*out_type, 0, mshadow::kInt32);

From d2dde15901aaf6583aaa459b916862fdc1e57605 Mon Sep 17 00:00:00 2001
From: "Li, Hao H"
Date: Fri, 14 Dec 2018 09:14:08 +0800
Subject: [PATCH 23/23] retrigger the ci

---
 src/operator/quantization/quantized_fully_connected.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index d9123d984af2..64ce73ba1cf7 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -136,6 +136,7 @@ struct QuantizedSumInitKernelWithBias {
   }
 };
 
+
 template<typename SrcType>
 void QuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs,
                                     const OpContext &ctx,
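
Note (not part of the patch series): the core trick in the forward kernel above
is the +128 shift. cblas_gemm_s8u8s32 requires its first matrix to be uint8, so
the int8 input is shifted into uint8 range and the spurious contribution
128 * sum_k(weight[j][k]) is subtracted from every int32 output element before
the GEMM runs with beta = 1. Below is a minimal NumPy sketch of that
compensation; the shapes, seed, and variable names are illustrative only, not
taken from the patches.

    import numpy as np

    m, n, k = 4, 3, 5
    rng = np.random.RandomState(0)
    data = rng.randint(-128, 128, size=(m, k)).astype(np.int8)    # int8 input
    weight = rng.randint(-128, 128, size=(n, k)).astype(np.int8)  # int8 weight

    # Reference result: plain int8 x int8 -> int32 product, data * weight^T.
    ref = data.astype(np.int32).dot(weight.astype(np.int32).T)

    # MKL-style path: shift the input into uint8, multiply, then subtract the
    # per-output-channel correction 128 * rowsum(weight), as the kernel does.
    shift = 128
    shifted = (data.astype(np.int32) + shift).astype(np.uint8)    # values in [0, 255]
    acc = shifted.astype(np.int32).dot(weight.astype(np.int32).T)
    acc -= shift * weight.astype(np.int32).sum(axis=1)

    assert np.array_equal(acc, ref)

The identity behind it: (a + 128) * W^T - 128 * rowsum(W) = a * W^T, which is
why the operator can pre-fill the int32 output with the (requantized) bias
minus the correction term and let the u8/s8 GEMM accumulate on top of it.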