From e1bfae358912d0de18c158c9ec51de4ff3910436 Mon Sep 17 00:00:00 2001
From: Yixin Bao
Date: Mon, 26 Aug 2019 09:11:31 +0800
Subject: [PATCH 1/7] add uint8 bn mkldnn implementation

---
 .../quantization/imagenet_gen_qsym_mkldnn.py  |  2 +-
 .../nn/mkldnn/mkldnn_batch_norm-inl.h         | 11 ++++-----
 .../mkldnn/mkldnn_quantized_batch_norm.cc     | 23 ++++++++++++++++++-
 .../quantization/quantized_batch_norm.cc      |  6 +++++
 4 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/example/quantization/imagenet_gen_qsym_mkldnn.py b/example/quantization/imagenet_gen_qsym_mkldnn.py
index 302a04449885..67cdda2e0751 100644
--- a/example/quantization/imagenet_gen_qsym_mkldnn.py
+++ b/example/quantization/imagenet_gen_qsym_mkldnn.py
@@ -216,7 +216,7 @@ def save_params(fname, arg_params, aux_params, logger=None):
         if exclude_first_conv:
             excluded_sym_names += ['resnetv10_conv0_fwd']
     elif args.model.find('resnet') != -1 and args.model.find('v2') != -1:
-        excluded_sym_names += ['resnetv20_flatten0_flatten0']
+        excluded_sym_names += ['resnetv20_flatten0_flatten0', 'resnetv20_stage1_batchnorm0_fwd']
         if exclude_first_conv:
             excluded_sym_names += ['resnetv20_conv0_fwd']
     elif args.model.find('vgg') != -1:
diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
index 2d2bf2c64596..510ca29d7f91 100644
--- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
@@ -132,14 +132,13 @@ class MKLDNNBNForward {
     return *var_m;
   }
 
-  void SetDataHandle(const NDArray &data, const mkldnn::memory *mean,
+  void SetDataHandle(const mkldnn::memory *data, const mkldnn::memory *mean,
                      const mkldnn::memory *var, const mkldnn::memory *out) {
-    auto _data = data.GetMKLDNNData();
     if (data_m) {
-      data_m->set_data_handle(_data->get_data_handle());
+      data_m->set_data_handle(data->get_data_handle());
     } else {
-      data_m.reset(new mkldnn::memory(_data->get_primitive_desc(),
-                                      _data->get_data_handle()));
+      data_m.reset(new mkldnn::memory(data->get_primitive_desc(),
+                                      data->get_data_handle()));
     }
     if (out_m) {
       out_m->set_data_handle(out->get_data_handle());
@@ -175,7 +174,7 @@ class MKLDNNBNForward {
 
   void SetDataHandle(const NDArray &data, const NDArray &mean,
                      const NDArray &var, const mkldnn::memory &out) {
-    SetDataHandle(data, mean.GetMKLDNNData(), var.GetMKLDNNData(), &out);
+    SetDataHandle(data.GetMKLDNNData(), mean.GetMKLDNNData(), var.GetMKLDNNData(), &out);
   }
 
   const mkldnn::batch_normalization_forward &GetFwd() const {
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_batch_norm.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_batch_norm.cc
index df5e48744f2d..617a1229b979 100644
--- a/src/operator/quantization/mkldnn/mkldnn_quantized_batch_norm.cc
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_batch_norm.cc
@@ -40,6 +40,27 @@ static void MKLDNNQuantizedBatchNormForward(const nnvm::NodeAttrs &attrs, const
   TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
   const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
   const NDArray &data = in_data[quantized_batchnorm::kData];
+  auto data_mem = data.GetMKLDNNData();
+
+  // reorder if data type = uint8
+  if (in_data[quantized_batchnorm::kData].dtype() == mshadow::kUint8) {
+    auto u8_pd = data_mem->get_primitive_desc();
+    auto u8_md = u8_pd.desc();
+    mkldnn::memory::desc s8_md(
+        mkldnn::memory::dims(u8_md.data.dims, u8_md.data.dims + u8_md.data.ndims),
+        mkldnn::memory::data_type::s8, static_cast<mkldnn::memory::format>(u8_md.data.format));
+    auto s8_pd = mkldnn::memory::primitive_desc(s8_md, CpuEngine::Get()->get_engine());
+    auto data_reorder_mem = TmpMemMgr::Get()->Alloc(s8_pd);
+
+    std::vector<float> reorder_scale;
+    reorder_scale = {float(kInt8Range) / kUint8Range};
+    primitive_attr reorder_attr;
+    reorder_attr.set_int_output_round_mode(round_mode::round_nearest);
+    reorder_attr.set_output_scales(0, reorder_scale);
+    const auto reorder_pd = mkldnn::reorder::primitive_desc(u8_pd, s8_pd, reorder_attr);
+    MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *data_mem, *data_reorder_mem));
+    data_mem = data_reorder_mem;
+  }
   const size_t channelAxis = static_cast<size_t>(
       param.axis < 0 ? static_cast<int>(data.shape().ndim()) + param.axis : param.axis);
   const int channel_count = data.shape()[channelAxis];
@@ -92,7 +113,7 @@ static void MKLDNNQuantizedBatchNormForward(const nnvm::NodeAttrs &attrs, const
 
   auto out_mem = CreateMKLDNNMem(outputs[batchnorm::kOut],
                                  fwd.GetPd().dst_primitive_desc(), req[batchnorm::kOut], &data);
-  fwd.SetDataHandle(data, rescaled_mean_mem, rescaled_var_mem, out_mem.second);
+  fwd.SetDataHandle(data_mem, rescaled_mean_mem, rescaled_var_mem, out_mem.second);
   MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd());
   MKLDNNStream::Get()->Submit();
diff --git a/src/operator/quantization/quantized_batch_norm.cc b/src/operator/quantization/quantized_batch_norm.cc
index 3187826fe996..3c46e1b8bd5c 100644
--- a/src/operator/quantization/quantized_batch_norm.cc
+++ b/src/operator/quantization/quantized_batch_norm.cc
@@ -67,7 +67,13 @@ bool QuantizedBatchNormType(const nnvm::NodeAttrs& attrs, std::vector<int>* in_t
   CHECK_EQ(in_type->size(), 7U);
   CHECK_EQ(out_type->size(), 3U);
 
+#if MXNET_USE_MKLDNN == 1
+  CHECK(in_type->at(0) == mshadow::kInt8 || in_type->at(0) == mshadow::kUint8)
+      << "QuantizedBatchNorm with MKLDNN backend only supports int8/uint8 input, while "
+      << in_type->at(0) << " is given.";
+#else
   TYPE_ASSIGN_CHECK(*in_type, 0, mshadow::kInt8);
+#endif
   for (size_t i = 1; i < 7; ++i) {
     TYPE_ASSIGN_CHECK(*in_type, i, mshadow::kFloat32);
   }
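
Note on the u8 -> s8 reorder in PATCH 1/7 above: the quantized BN kernel runs on s8 input, so uint8 activations are first reordered into the int8 range by scaling with kInt8Range / kUint8Range and rounding to nearest, as set via reorder_attr. Below is a minimal NumPy sketch of that arithmetic -- illustrative only, not part of the patch; it assumes the constants kInt8Range = 127 and kUint8Range = 255:

    # Sketch of the scaling performed by the registered u8 -> s8 reorder.
    # K_INT8_RANGE / K_UINT8_RANGE are assumed values of the patch's constants;
    # np.rint matches round_mode::round_nearest above.
    import numpy as np

    K_UINT8_RANGE = 255.0  # assumed value of kUint8Range
    K_INT8_RANGE = 127.0   # assumed value of kInt8Range

    def u8_to_s8(data_u8):
        # same effect as reorder_attr.set_output_scales(0, {127/255})
        scale = K_INT8_RANGE / K_UINT8_RANGE
        return np.rint(data_u8.astype(np.float32) * scale).astype(np.int8)

    print(u8_to_s8(np.array([0, 1, 128, 255], dtype=np.uint8)))  # [  0   0  64 127]
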
From df4c02a349b2aadf72c1f1d4a37d0101fc886334 Mon Sep 17 00:00:00 2001
From: Yixin Bao
Date: Mon, 26 Aug 2019 09:44:57 +0800
Subject: [PATCH 2/7] update test case for uint8 bn

---
 .../python/quantization/test_quantization.py | 24 ++++++++------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index a1c23fb23208..5b17c73023ef 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -607,10 +607,7 @@ def get_mean_var(data):
         return mean, var
 
     def check_quantized_bn(data_shape, qdtype):
-        if qdtype == 'uint8':
-            print('skipped testing quantize_bn for uint8 since it is not supported yet')
-            return
-        elif is_test_for_native_cpu():
+        if is_test_for_native_cpu():
            print('skipped testing quantize_bn for native cpu since it is not supported yet')
            return
        elif is_test_for_gpu():
@@ -672,9 +669,10 @@ def check_quantized_bn(data_shape, qdtype):
 
         assert_almost_equal(output.asnumpy(), output_int8_to_fp32.asnumpy(), rtol=1e-1, atol=3)
 
-    check_quantized_bn((32, 512, 4, 4), 'int8')
-    check_quantized_bn((32, 1024, 8, 8), 'int8')
-    check_quantized_bn((32, 3, 224, 224), 'int8')
+    for qdtype in ['int8', 'uint8']:
+      check_quantized_bn((32, 512, 4, 4), qdtype)
+      check_quantized_bn((32, 1024, 8, 8), qdtype)
+      check_quantized_bn((32, 3, 224, 224), qdtype)
 
 @with_seed()
 def test_quantize_params():
@@ -918,15 +916,9 @@ def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape=N
         lshape_list.append(None)
 
     for s, dshape, lshape, name in zip(sym_list, dshape_list, lshape_list, name_list):
-        if qdtype == 'int8' and is_test_for_mkldnn() and name in ['sym1', 'sym2', 'sym3']:
-            print('skipped testing test_quantize_model_with_forward for mkldnn cpu int8 since it is not supported yet')
-            continue
-        elif qdtype == 'uint8' and is_test_for_mkldnn() and name in ['sym1']:
-            print('skipping test_quantize_model_with_forward for mkldnn cpu uint8 since it is not supported yet')
-            continue
-        elif qdtype == 'int8' and is_test_for_gpu() and name in ['sym1']:
-            print('skipped testing test_quantize_model_with_forward for gpu int8 since it is not supported yet')
-            continue
+        if is_test_for_gpu() and name in ['sym1']:
+            print('skipped testing test_quantize_model_with_forward for gpu int8 since it is not supported yet')
+            continue
 
         if lshape is None:
             mod = Module(symbol=s, label_names=None)
From 7d00792970f0df54d09a24102863284ab79cef00 Mon Sep 17 00:00:00 2001
From: Yixin Bao
Date: Mon, 26 Aug 2019 10:09:07 +0800
Subject: [PATCH 3/7] fix lint

---
 src/operator/quantization/mkldnn/mkldnn_quantized_batch_norm.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_batch_norm.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_batch_norm.cc
index 617a1229b979..429a80e6b186 100644
--- a/src/operator/quantization/mkldnn/mkldnn_quantized_batch_norm.cc
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_batch_norm.cc
@@ -53,7 +53,7 @@ static void MKLDNNQuantizedBatchNormForward(const nnvm::NodeAttrs &attrs, const
     auto data_reorder_mem = TmpMemMgr::Get()->Alloc(s8_pd);
 
     std::vector<float> reorder_scale;
-    reorder_scale = {float(kInt8Range) / kUint8Range};
+    reorder_scale = {static_cast<float>(kInt8Range) / kUint8Range};
     primitive_attr reorder_attr;
     reorder_attr.set_int_output_round_mode(round_mode::round_nearest);
     reorder_attr.set_output_scales(0, reorder_scale);

From f736c04455ad8b9ce5dadd381d5bdef155417192 Mon Sep 17 00:00:00 2001
From: Yixin Bao
Date: Mon, 26 Aug 2019 10:29:40 +0800
Subject: [PATCH 4/7] update test with gpu

---
 tests/python/quantization/test_quantization.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 5b17c73023ef..805256817c22 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -916,10 +916,6 @@ def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape=N
         lshape_list.append(None)
 
     for s, dshape, lshape, name in zip(sym_list, dshape_list, lshape_list, name_list):
-        if is_test_for_gpu() and name in ['sym1']:
-            print('skipped testing test_quantize_model_with_forward for gpu int8 since it is not supported yet')
-            continue
-
         if lshape is None:
             mod = Module(symbol=s, label_names=None)
             mod.bind(for_training=False,
From 3d0a45742f1d5233799cd944d8c1201ce105861a Mon Sep 17 00:00:00 2001
From: Yixin Bao
Date: Mon, 26 Aug 2019 10:30:14 +0800
Subject: [PATCH 5/7] add comment for quantization

---
 example/quantization/imagenet_gen_qsym_mkldnn.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/example/quantization/imagenet_gen_qsym_mkldnn.py b/example/quantization/imagenet_gen_qsym_mkldnn.py
index 67cdda2e0751..6c87f58b63e2 100644
--- a/example/quantization/imagenet_gen_qsym_mkldnn.py
+++ b/example/quantization/imagenet_gen_qsym_mkldnn.py
@@ -216,6 +216,7 @@ def save_params(fname, arg_params, aux_params, logger=None):
         if exclude_first_conv:
             excluded_sym_names += ['resnetv10_conv0_fwd']
     elif args.model.find('resnet') != -1 and args.model.find('v2') != -1:
+        # resnetv20_stage1_batchnorm0_fwd is excluded for the sake of accuracy
         excluded_sym_names += ['resnetv20_flatten0_flatten0', 'resnetv20_stage1_batchnorm0_fwd']
         if exclude_first_conv:
             excluded_sym_names += ['resnetv20_conv0_fwd']

From dd5362214c47e8cfe09021f1d51aa60a003f7205 Mon Sep 17 00:00:00 2001
From: Yixin Bao
Date: Mon, 26 Aug 2019 15:36:17 +0800
Subject: [PATCH 6/7] fix quantized_bn test

---
 .../python/quantization/test_quantization.py | 27 +++++++++++++--------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 805256817c22..989a07b94512 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -614,9 +614,14 @@ def check_quantized_bn(data_shape, qdtype):
             print('skipped testing quantize_bn for gpu since it is not supported yet')
             return
 
-        # qdtype = int8
-        data_low = -127.0
-        data_high = 127.0
+        # qdtype = uint8
+        if qdtype == 'uint8':
+            data_low = 0.0
+            data_high = 127.0
+        else:
+            data_low = -127.0
+            data_high = 127.0
+        # output type = int8
         quantized_range = 127.0
         # run fp32 bn
         data_sym = mx.sym.Variable(name='data', shape=data_shape, dtype='float32')
@@ -636,9 +641,6 @@ def check_quantized_bn(data_shape, qdtype):
         bn_fp32_exe.arg_dict[arg_names[2]][:] = beta
         bn_fp32_exe.aux_dict[aux_names[0]][:] = moving_mean
         bn_fp32_exe.aux_dict[aux_names[1]][:] = moving_var
-        min_data = mx.nd.min(data)
-        max_data = mx.nd.max(data)
-        data_range = mx.nd.maximum(mx.nd.abs(min_data), mx.nd.abs(max_data))
 
         output= bn_fp32_exe.forward()[0]
 
@@ -651,11 +653,12 @@ def check_quantized_bn(data_shape, qdtype):
         calib_data = NDArrayIter(data=data, batch_size=data_shape[0])
         calib_data = DummyIter(calib_data)
 
+        # quantize bn with quantized_type = int8: MKLDNN BN only support int8 output
         qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=bn_fp32,
                                                                          arg_params=arg_params,
                                                                          aux_params=bn_fp32_exe.aux_dict,
                                                                          ctx=mx.current_context(),
-                                                                         quantized_dtype=qdtype,
+                                                                         quantized_dtype='int8',
                                                                          calib_mode='naive',
                                                                          calib_data=calib_data,
                                                                          num_calib_examples=20)
@@ -665,14 +668,14 @@ def check_quantized_bn(data_shape, qdtype):
         mod.set_params(qarg_params, qaux_params)
         batch = mx.io.DataBatch([data], [])
         mod.forward(batch, is_train=False)
-        output_int8_to_fp32= mod.get_outputs()[0]
+        output_int8_to_fp32 = mod.get_outputs()[0]
 
-        assert_almost_equal(output.asnumpy(), output_int8_to_fp32.asnumpy(), rtol=1e-1, atol=3)
+        assert_almost_equal(output.asnumpy(), output_int8_to_fp32.asnumpy(), rtol=1e-1, atol=4)
 
     for qdtype in ['int8', 'uint8']:
-      check_quantized_bn((32, 512, 4, 4), qdtype)
-      check_quantized_bn((32, 1024, 8, 8), qdtype)
-      check_quantized_bn((32, 3, 224, 224), qdtype)
+        check_quantized_bn((32, 512, 4, 4), qdtype)
+        check_quantized_bn((32, 1024, 8, 8), qdtype)
+        check_quantized_bn((32, 3, 224, 224), qdtype)
 
 @with_seed()
 def test_quantize_params():
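
Note on PATCH 6/7 above: the test now always calibrates with quantized_dtype='int8' (per the added comment, the MKLDNN quantized BN emits int8 output even for uint8 input), and the fp32-vs-dequantized comparison uses rtol=1e-1 with a loose atol, presumably to absorb quantization error. A self-contained sketch of the int8 round-trip error bound -- illustrative only, not from the patch:

    # int8 quantize/dequantize round trip: absent clipping, the worst-case
    # elementwise error is half a quantization step, i.e. 0.5 * data_range / 127.
    import numpy as np

    def int8_roundtrip(x, data_range):
        scale = 127.0 / data_range
        q = np.clip(np.rint(x * scale), -127, 127).astype(np.int8)
        return q.astype(np.float32) / scale

    x = np.random.uniform(-3.0, 3.0, size=4096).astype(np.float32)
    max_err = np.abs(int8_roundtrip(x, 3.0) - x).max()
    assert max_err <= 0.5 * (3.0 / 127.0) + 1e-6  # half a quantization step
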
From eeb60f091006ece6adcd62500805f8bfc32fb81e Mon Sep 17 00:00:00 2001
From: Yixin Bao
Date: Mon, 26 Aug 2019 16:02:36 +0800
Subject: [PATCH 7/7] fix quantize_model_with_forward test

---
 tests/python/quantization/test_quantization.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 989a07b94512..31bc1638b010 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -919,6 +919,12 @@ def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape=N
         lshape_list.append(None)
 
     for s, dshape, lshape, name in zip(sym_list, dshape_list, lshape_list, name_list):
+        if qdtype == 'int8' and name in ['sym1','sym2','sym3']:
+            print('mkldnn_quantized_conv op only supports uint8 as input type, skip test with int8.')
+            continue
+        if qdtype == 'uint8' and name in ['sym1']:
+            print('mkldnn_quantized_bn doesn\'t support calib_mode=None')
+            continue
         if lshape is None:
             mod = Module(symbol=s, label_names=None)
             mod.bind(for_training=False,
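
Note on the sym1 skip in PATCH 7/7: as the skip message states, mkldnn_quantized_bn does not work with calib_mode=None, so a model containing a quantized BN must be quantized with a real calibration mode, as the BN test in PATCH 6/7 does. A minimal calibrated quantize_model call mirroring that test -- sym, arg_params, aux_params and calib_data are placeholders to be supplied by the caller:

    # Calibrated int8 quantization, as required once a BN layer is quantized.
    import mxnet as mx

    qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(
        sym=sym, arg_params=arg_params, aux_params=aux_params,
        ctx=mx.cpu(), quantized_dtype='int8',
        calib_mode='naive',  # calib_mode=None is not supported with quantized BN
        calib_data=calib_data, num_calib_examples=20)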