diff --git a/example/quantization/README.md b/example/quantization/README.md
index 032ca97176df..8cdc1bb7e06f 100644
--- a/example/quantization/README.md
+++ b/example/quantization/README.md
@@ -80,6 +80,29 @@ optional arguments:
if calibration mode is enabled
```
+A new benchmark script `launch_inference_mkldnn.sh` has been designed to launch performance benchmarks for float32 or int8 image-classification models with Intel® MKL-DNN.
+```
+usage: bash ./launch_inference_mkldnn.sh [[[-s symbol_file ] [-b batch_size] [-iter iteration] [-ins instance] [-c cores/instance]] | [-h]]
+
+optional arguments:
+ -h, --help show this help message and exit
+ -s, --symbol_file symbol file for benchmark
+ -b, --batch_size inference batch size
+ default: 64
+ -iter, --iteration inference iteration
+ default: 500
+ -ins, --instance launch multi-instance inference
+ default: one instance per socket
+ -c, --core number of cores per instance
+ default: divide full physical cores
+
+example: resnet int8 performance benchmark on c5.24xlarge (two sockets, 24 physical cores per socket).
+
+ bash ./launch_inference_mkldnn.sh -s ./model/resnet50_v1-quantized-5batches-naive-symbol.json
+
+will launch two instances for throughput benchmark and each instance will use 24 physical cores.
+```
+
Use the following command to install [Gluon-CV](https://gluon-cv.mxnet.io/):
```
@@ -120,8 +143,8 @@ python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --par
python imagenet_inference.py --symbol-file=./model/resnet50_v1-quantized-5batches-naive-symbol.json --param-file=./model/resnet50_v1-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
# Launch dummy data Inference
-python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True
-python imagenet_inference.py --symbol-file=./model/resnet50_v1-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True
+bash ./launch_inference_mkldnn.sh -s ./model/resnet50_v1-symbol.json
+bash ./launch_inference_mkldnn.sh -s ./model/resnet50_v1-quantized-5batches-naive-symbol.json
```
SqueezeNet 1.0
@@ -142,8 +165,8 @@ python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --p
python imagenet_inference.py --symbol-file=./model/squeezenet1.0-quantized-5batches-naive-symbol.json --param-file=./model/squeezenet1.0-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
# Launch dummy data Inference
-python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True
-python imagenet_inference.py --symbol-file=./model/squeezenet1.0-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True
+bash ./launch_inference_mkldnn.sh -s ./model/squeezenet1.0-symbol.json
+bash ./launch_inference_mkldnn.sh -s ./model/squeezenet1.0-quantized-5batches-naive-symbol.json
```
MobileNet 1.0
@@ -164,8 +187,8 @@ python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --pa
python imagenet_inference.py --symbol-file=./model/mobilenet1.0-quantized-5batches-naive-symbol.json --param-file=./model/mobilenet1.0-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
# Launch dummy data Inference
-python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True
-python imagenet_inference.py --symbol-file=./model/mobilenet1.0-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True
+bash ./launch_inference_mkldnn.sh -s ./model/mobilenet1.0-symbol.json
+bash ./launch_inference_mkldnn.sh -s ./model/mobilenet1.0-quantized-5batches-naive-symbol.json
```
MobileNetV2 1.0
@@ -186,8 +209,8 @@ python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json -
python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-quantized-5batches-naive-symbol.json --param-file=./model/mobilenetv2_1.0-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
# Launch dummy data Inference
-python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True
-python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True
+bash ./launch_inference_mkldnn.sh -s ./model/mobilenetv2_1.0-symbol.json
+bash ./launch_inference_mkldnn.sh -s ./model/mobilenetv2_1.0-quantized-5batches-naive-symbol.json
```
Inception-V3
@@ -208,8 +231,8 @@ python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --par
python imagenet_inference.py --symbol-file=./model/inceptionv3-quantized-5batches-naive-symbol.json --param-file=./model/inceptionv3-quantized-0000.params --image-shape=3,299,299 --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
# Launch dummy data Inference
-python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --image-shape=3,299,299 --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True
-python imagenet_inference.py --symbol-file=./model/inceptionv3-quantized-5batches-naive-symbol.json --image-shape=3,299,299 --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True
+bash ./launch_inference_mkldnn.sh -s ./model/inceptionv3-symbol.json
+bash ./launch_inference_mkldnn.sh -s ./model/inceptionv3-quantized-5batches-naive-symbol.json
```
ResNet152-V2
@@ -231,8 +254,8 @@ python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.
python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-5batches-naive-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
# Launch dummy data Inference
-python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True
-python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True
+bash ./launch_inference_mkldnn.sh -s ./model/imagenet1k-resnet-152-symbol.json
+bash ./launch_inference_mkldnn.sh -s ./model/imagenet1k-resnet-152-quantized-5batches-naive-symbol.json
```
Inception-BN
@@ -254,8 +277,8 @@ python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-symbo
python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-5batches-naive-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
# Launch dummy data Inference
-python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True
-python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-5batches-naive-symbol.json --batch-size=64 --num-inference-batches=500 --ctx=cpu --benchmark=True
+bash ./launch_inference_mkldnn.sh -s ./model/imagenet1k-inception-bn-symbol.json
+bash ./launch_inference_mkldnn.sh -s ./model/imagenet1k-inception-bn-quantized-5batches-naive-symbol.json
```
SSD-VGG16
@@ -307,7 +330,7 @@ python imagenet_gen_qsym_mkldnn.py --model=custom --num-calib-batches=5 --calib-
python imagenet_inference.py --symbol-file=./model/*.json --param-file=./model/*.params --rgb-mean=* --rgb-std=* --num-skipped-batches=* --batch-size=* --num-inference-batches=*--dataset=./data/* --ctx=cpu
# Launch dummy data Inference
-python imagenet_inference.py --symbol-file=./model/*.json --batch-size=* --num-inference-batches=500 --ctx=cpu --benchmark=True
+bash ./launch_inference_mkldnn.sh -s ./model/*.json
```
Model Quantization with CUDNN
diff --git a/example/quantization/launch_inference_mkldnn.sh b/example/quantization/launch_inference_mkldnn.sh
new file mode 100644
index 000000000000..f67787b41b03
--- /dev/null
+++ b/example/quantization/launch_inference_mkldnn.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Benchmark a float32 or int8 image-classification model with MKL-DNN.
+# By default launches one inference instance per socket, each pinned to
+# its own physical cores and NUMA node, then reports aggregate throughput.
+
+usage()
+{
+  echo "usage: bash ./launch_inference_mkldnn.sh [[[-s symbol_file ] [-b batch_size] [-iter iteration] [-ins instance] [-c cores/instance]] | [-h]]"
+}
+
+# Parse command-line options.
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --symbol | -s)
+      shift
+      SYMBOL=$1
+      ;;
+    --batch-size | -b)
+      shift
+      BS=$1
+      ;;
+    --iteration | -iter)
+      shift
+      ITERATIONS=$1
+      ;;
+    --instance | -ins)
+      shift
+      INS=$1
+      ;;
+    --core | -c)
+      shift
+      CORES=$1
+      ;;
+    --help | -h)
+      usage
+      exit 1
+      ;;
+    *)
+      usage
+      exit 1
+  esac
+  shift
+done
+
+# Discover the machine topology to compute per-instance core bindings.
+NUM_SOCKET=`lscpu | grep 'Socket(s)' | awk '{print $NF}'`
+NUM_NUMA_NODE=`lscpu | grep 'NUMA node(s)' | awk '{print $NF}'`
+CORES_PER_SOCKET=`lscpu | grep 'Core(s) per socket' | awk '{print $NF}'`
+NUM_CORES=$((CORES_PER_SOCKET * NUM_SOCKET))
+CORES_PER_NUMA=$((NUM_CORES / NUM_NUMA_NODE))
+echo "target machine has $NUM_CORES physical core(s) on $NUM_NUMA_NODE numa nodes of $NUM_SOCKET socket(s)."
+
+# A symbol file is mandatory; abort instead of failing later in numactl/python.
+if [ -z "$SYMBOL" ]; then
+  echo "Error: Need a symbol file as input."
+  usage
+  exit 1
+fi
+if [ -z "$INS" ]; then
+  echo "Default: launch one instance per socket."
+  INS=$NUM_SOCKET
+fi
+if [ -z "$CORES" ]; then
+  echo "Default: divide full physical cores."
+  CORES=$((NUM_CORES / INS))
+fi
+if [ -z "$BS" ]; then
+  echo "Default: set batch size to 64."
+  BS=64
+fi
+if [ -z "$ITERATIONS" ]; then
+  echo "Default: set iterations to 500."
+  ITERATIONS=500
+fi
+
+echo " benchmark configs"
+echo " cores per instance: $CORES"
+echo " total instances: $INS"
+echo " batch size: $BS"
+echo " iterations: $ITERATIONS"
+echo ""
+
+rm BENCHMARK_*.log || echo "benchmarking..."
+
+# Launch each instance pinned to its own core range and NUMA node.
+for((i=0;i<$INS;i++));
+do
+  ((a=$i*$CORES))
+  ((b=$a+$CORES-1))
+  memid=$((b/CORES_PER_NUMA))
+  LOG=BENCHMARK_$i.log
+  echo " $i instance use $a-$b cores and $memid mem with $LOG"
+  KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 \
+  OMP_NUM_THREADS=$CORES \
+  nohup numactl --physcpubind=$a-$b --membind=$memid python imagenet_inference.py --symbol-file=$SYMBOL --batch-size=$BS --num-inference-batches=$ITERATIONS --ctx=cpu --benchmark=True > $LOG 2>&1 &
+done
+wait
+
+# Sum per-instance throughput; latency is derived per image in milliseconds.
+fps=`grep image/sec BENCHMARK_*.log | awk '{ sum += $(NF) }; END { print sum }'`
+latency=$(echo "scale=2; 1000*$INS/$fps" | bc)
+echo "overall throughput: $fps"
+echo "latency per instance: $latency"
+echo "benchmark finish:)"