From 3b5276ced9170d1a20798e12fc92615e3b8f13a8 Mon Sep 17 00:00:00 2001
From: Xinyu Chen
Date: Tue, 19 Mar 2019 23:13:59 -0700
Subject: [PATCH] [MKL-DNN] Enable s8 support for inner product and 3d input with flatten=false (#14466)

* support 3d input innerproduct with flatten=false

* simplify test case and improve error msg

* enable s8s8 inner product

* improve code style

* add tests and improve error msg
---
 src/operator/nn/fully_connected.cc            | 10 ++++--
 .../mkldnn_quantized_fully_connected.cc       | 11 -------
 .../quantization/quantized_fully_connected.cc | 33 +++++++++++--------
 .../quantization/quantized_fully_connected.cu |  5 +++
 src/operator/subgraph/mkldnn/mkldnn_fc.cc     | 12 +++----
 tests/python/mkl/test_subgraph.py             | 13 ++------
 .../python/quantization/test_quantization.py  |  8 +++--
 7 files changed, 44 insertions(+), 48 deletions(-)

diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc
index 2e02de300e8f..2bc321832af6 100644
--- a/src/operator/nn/fully_connected.cc
+++ b/src/operator/nn/fully_connected.cc
@@ -32,6 +32,12 @@
 namespace mxnet {
 namespace op {
 
+bool SupportMKLDNNFC(const NDArray& input) {
+  int ndim = input.shape().ndim();
+  return input.dtype() == mshadow::kFloat32 && (ndim >= 1 && ndim <= 4) &&
+         input.storage_type() == kDefaultStorage;
+}
+
 static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs,
                                 mxnet::ShapeVector *in_shape,
                                 mxnet::ShapeVector *out_shape) {
@@ -94,7 +100,7 @@ void FullyConnectedComputeExCPU(const nnvm::NodeAttrs& attrs,
 #if MXNET_USE_MKLDNN == 1
   if (common::ContainsOnlyStorage(inputs, kDefaultStorage) &&
       common::ContainsOnlyStorage(outputs, kDefaultStorage)) {
-    if (SupportMKLDNN(inputs[0])) {
+    if (SupportMKLDNNFC(inputs[0])) {
       MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
       MKLDNNFCForward(attrs, ctx, inputs, req, outputs);
       MKLDNN_OPCHECK_RUN(FullyConnectedCompute, attrs, ctx, inputs, req,
@@ -141,7 +147,7 @@ void FullyConnectedGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                     const std::vector<NDArray> &inputs,
                                     const std::vector<OpReqType> &req,
                                     const std::vector<NDArray> &outputs) {
-  if (SupportMKLDNN(inputs[0])) {
+  if (SupportMKLDNNFC(inputs[0])) {
     MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
     MKLDNNFCBackward(attrs, ctx, inputs, req, outputs);
     MKLDNN_OPCHECK_RUN(FullyConnectedGradCompute, attrs, ctx, inputs, req,
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc
index 4176ebe89f2f..801d68d73311 100644
--- a/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc
@@ -50,17 +50,6 @@ void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs &attrs,
 
   NDArray data = in_data[fullc::kData];
   NDArray weight = in_data[fullc::kWeight];
-  const TShape &ishape = data.shape();
-
-  CHECK(data.dtype() == mshadow::kUint8)
-    << "MKLDNNQuantizedFullyConnected Op only supports uint8 for now, but got "
-    << mxnet::op::type_string(data.dtype());
-
-  if (ishape.ndim() != 2) {
-    CHECK(param.flatten)
-      << "QuantizedFullyConnected Op only supports flatten=true when ishape.ndim()!=2 for now.";
-    data = data.MKLDNNDataReshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())));
-  }
 
   const float min_data =
       in_data[num_inputs + quantized_fc_enum::kDataMin].data().dptr<float>()[0];
diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index cd62ebce559f..4d6142559bd4 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -50,11 +50,14 @@ bool QuantizedFullyConnectedShape(const nnvm::NodeAttrs& attrs,
   CHECK(!shape_is_none(in_shape->at(0)))
     << "QuantizedFullyConnectedOp input data shape must be given";
   const mxnet::TShape& dshape = in_shape->at(0);
-  mxnet::TShape wshape = Shape2(param.num_hidden, dshape.ProdShape(1, dshape.ndim()));
-  if (dshape.ndim() != 2) {
-    CHECK(param.flatten)
-      << "QuantizedFullyConnectedOp only supports flatten=true when ishape.ndim()!=2 for now. ";
+  index_t num_input;
+  if (!param.flatten) {
+    num_input = dshape[dshape.ndim() - 1];
+  } else {
+    num_input = dshape.ProdShape(1, dshape.ndim());
   }
+
+  TShape wshape = Shape2(param.num_hidden, num_input);
   SHAPE_ASSIGN_CHECK(*in_shape, 1, wshape);
   if (!param.no_bias) {
     mxnet::TShape bshape = Shape1(param.num_hidden);
@@ -65,7 +68,13 @@ bool QuantizedFullyConnectedShape(const nnvm::NodeAttrs& attrs,
     SHAPE_ASSIGN_CHECK(*in_shape, i, mxnet::TShape{1});
   }
 
-  SHAPE_ASSIGN_CHECK(*out_shape, 0, mxnet::TShape({dshape[0], wshape[0]}));
+  if (!param.flatten) {
+    TShape result_shape(dshape);
+    result_shape[dshape.ndim() - 1] = param.num_hidden;
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape);
+  } else {
+    SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param.num_hidden));
+  }
   SHAPE_ASSIGN_CHECK(*out_shape, 1, mxnet::TShape({1}));
   SHAPE_ASSIGN_CHECK(*out_shape, 2, mxnet::TShape({1}));
   return true;
@@ -80,9 +89,9 @@ bool QuantizedFullyConnectedType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(out_type->size(), 3U);
 
 #if MXNET_USE_MKLDNN == 1
-  // TODO(ciyong): currently, only uint8 fully_connected is upported,
-  // int8 fully_connected will be supported after mkldnn v0.18
-  TYPE_ASSIGN_CHECK(*in_type, 0, mshadow::kUint8);
+  CHECK(in_type->at(0) == mshadow::kInt8 || in_type->at(0) == mshadow::kUint8)
+    << "QuantizedFullyConnected only supports int8/uint8 input, while "
+    << in_type->at(0) << " is given.";
 #else
   TYPE_ASSIGN_CHECK(*in_type, 0, mshadow::kInt8);
 #endif
@@ -182,7 +191,8 @@ void QuantizedFullyConnectedForwardCPU(const nnvm::NodeAttrs& attrs,
 
   if (dshape.ndim() != 2)
     CHECK(param.flatten)
-      << "QuantizedFullyConnectedOp only supports flatten=true when input_shape!=2 for now. ";
"; + << "QuantizedFullyConnectedForwardCPU only supports flatten=true " + << "when dshape.ndim() != 2 for now."; Tensor weight = in_data[fullc::kWeight].get(s); Tensor data = in_data[fullc::kData].get_with_shape( @@ -276,11 +286,6 @@ void QuantizedFullyConnectedForwardExCPU(const nnvm::NodeAttrs &attrs, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { - if (in_data[fullc::kData].dtype() == mshadow::kInt8) { - FallBackCompute(QuantizedFullyConnectedForwardCPU, attrs, ctx, in_data, req, out_data); - return; - } - MKLDNNQuantizedFullyConnectedForward(attrs, ctx, in_data, req, out_data); } #endif diff --git a/src/operator/quantization/quantized_fully_connected.cu b/src/operator/quantization/quantized_fully_connected.cu index e4c891973ae5..04680c8c2b78 100644 --- a/src/operator/quantization/quantized_fully_connected.cu +++ b/src/operator/quantization/quantized_fully_connected.cu @@ -75,6 +75,11 @@ void QuantizedFullyConnectedForwardGPU(const nnvm::NodeAttrs& attrs, mxnet::TShape oshape = out.shape_; // (m, n) * (k, n).T = (m, k) // A * B.T = C + if (dshape.ndim() != 2) { + CHECK(param.flatten) + << "Currently, QuantizedFullyConnected Op only supports flatten=true " + << "when ishape.ndim()!=2 for GPU."; + } // row_C = col_C(T) = cublas(col_B * col_A(T)) = cublas(row_B(T), row_A) // row_C = col_C(T) = cublas(col_B(T) * col_A(T)) = cublas(row_B, row_A) diff --git a/src/operator/subgraph/mkldnn/mkldnn_fc.cc b/src/operator/subgraph/mkldnn/mkldnn_fc.cc index a5107c387a16..dba27f6dc443 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_fc.cc +++ b/src/operator/subgraph/mkldnn/mkldnn_fc.cc @@ -116,11 +116,6 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx, NDArray data = in_data[fullc::kData]; NDArray weight = in_data[fullc::kWeight]; NDArray output = out_data[fullc::kOut]; - const mxnet::TShape &ishape = data.shape(); - if (mkldnn_param.quantized && ishape.ndim() != 2) { - CHECK(default_param.flatten) - << "QuantizedFullyConnected only supports flatten=true when ishape.ndim() != 2 for now."; - } mkldnn::memory::desc out_md = GetMemDesc(output); MKLDNNFCFlattenData(default_param, out_data[fullc::kOut], &data, &out_md); @@ -307,9 +302,10 @@ static bool SgMKLDNNFCInferType(const nnvm::NodeAttrs &attrs, if (full_param.mkldnn_param.quantized) { size_t base_num_inputs = full_param.default_param.no_bias ? 
-    // TODO(ciyong): currently, only uint8 fully_connected is supported,
-    // int8 fully_connected will be supported after mkldnn v0.18
-    // TYPE_ASSIGN_CHECK(*in_types, 0, mshadow::kUint8);
+    CHECK(in_types->at(0) == mshadow::kInt8 ||
+          in_types->at(0) == mshadow::kUint8)
+      << "QuantizedFullyConnected only supports int8/uint8 input, while "
+      << in_types->at(0) << " is given.";
 
     for (size_t i = 1; i < in_types->size(); ++i) {
       if (i < base_num_inputs) {
         TYPE_ASSIGN_CHECK(*in_types, i, mshadow::kInt8);
diff --git a/tests/python/mkl/test_subgraph.py b/tests/python/mkl/test_subgraph.py
index e6fe0011af19..871c1e3d566a 100644
--- a/tests/python/mkl/test_subgraph.py
+++ b/tests/python/mkl/test_subgraph.py
@@ -124,7 +124,6 @@ def check_quantize(sym, data_shape, out_type, name='conv',
   mod.bind(for_training=False,
            data_shapes=[('data', data_shape)],
            label_shapes=[('softmax_label', label_shape)])
-
   mod.init_params(mx.init.Normal(0.5))
   arg_params, aux_params = mod.get_params()
@@ -136,12 +135,10 @@ def check_quantize(sym, data_shape, out_type, name='conv',
     output.wait_to_read()
   ref_out = mod.get_outputs()
 
-  # TODO(ciyong), exclude the second fc due to int8 fully_connected is not
-  # supported before mkldnn 0.18
   excluded_sym_names = []
-  if mx.current_context() == mx.cpu():
+  if mx.current_context() == mx.cpu() and gluon_forward == True:
+    excluded_sym_names += ['sg_mkldnn_fully_connected_0']
     excluded_sym_names += ['fc_softmax']
-    excluded_sym_names += ['sg_mkldnn_fully_connected_1']
 
   calib_data = mx.nd.random.uniform(shape=data_shape)
   calib_data = NDArrayIter(data=calib_data)
@@ -193,11 +190,7 @@ def check_fusion(sym, data_shape, attrs_op, name='conv', check_quantization=True
     assert_almost_equal(exe.outputs[i].asnumpy(), exe_sg.outputs[i].asnumpy(), rtol=1e-3, atol=1e-3)
 
   # fp32 to int8
-  # TODO(ciyong), int8 fully_connected will be supported after mkldnn 0.18
-  if name == 'fc':
-    out_type_list = ['uint8', 'auto']
-  else:
-    out_type_list = ['uint8', 'int8', 'auto']
+  out_type_list = ['uint8', 'int8', 'auto']
 
   if check_quantization:
     for out_type in out_type_list:
diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index e2457c7a4d50..eedc867ce8d3 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -288,9 +288,6 @@ def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
         if hasMKL == False:
            print('skipped testing quantized_fc on cpu since s8u8s32 is only supported by MKL BLAS library')
            return
-    elif qdtype == 'int8' and is_test_for_mkldnn():
-        print('skipped testing test_quantized_fc for mkldnn cpu int8 since it is not supported yet')
-        return
     elif qdtype == 'uint8' and is_test_for_gpu():
         print('skipped testing quantized_fc for gpu uint8 since it is not supported yet')
         return
@@ -377,6 +374,11 @@ def maxabs(a, b):
     assert cond == 0
 
     for qdtype in ['int8', 'uint8']:
+        if is_test_for_mkldnn():
+            check_quantized_fc((32, 512, 2), 100, True, qdtype, flatten=False)
+            check_quantized_fc((32, 512, 2), 100, False, qdtype, flatten=False)
+            check_quantized_fc((32, 512, 2, 2), 100, True, qdtype, flatten=False)
+            check_quantized_fc((32, 512, 2, 2), 100, False, qdtype, flatten=False)
        check_quantized_fc((32, 512, 2, 2), 100, True, qdtype)
        check_quantized_fc((32, 111, 2, 2), 100, True, qdtype)
        check_quantized_fc((32, 512, 2, 2), 100, False, qdtype)
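
For reference, a minimal sketch (not part of the patch; assumes the standard mx.sym.FullyConnected API, names are illustrative) of the flatten=false shape semantics the new quantized path follows and the added tests exercise: the last input axis is treated as the feature dimension and all leading axes are preserved.

import mxnet as mx

# Sketch only: FullyConnected with flatten=False on a 3-D input.
# With input (32, 512, 2) and num_hidden=100, the trailing axis (2) is the
# feature dimension, so the weight is (100, 2) and the output keeps the
# leading axes: (32, 512, 100).
data = mx.sym.Variable('data')
fc = mx.sym.FullyConnected(data=data, num_hidden=100, flatten=False, name='fc')
arg_shapes, out_shapes, _ = fc.infer_shape(data=(32, 512, 2))
print(arg_shapes)  # [(32, 512, 2), (100, 2), (100,)] -> data, fc_weight, fc_bias
print(out_shapes)  # [(32, 512, 100)]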