diff --git a/example/quantization/README.md b/example/quantization/README.md
index b77537d4fba7..fc9a26755b4e 100644
--- a/example/quantization/README.md
+++ b/example/quantization/README.md
@@ -35,6 +35,7 @@ The following models have been tested on Linux systems.
|[ResNet152-V2](#8)|[MXNet ModelZoo](http://data.mxnet.io/models/imagenet/resnet/152-layers/)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|76.76%/93.03%|76.48%/92.96%|
|[Inception-BN](#9)|[MXNet ModelZoo](http://data.mxnet.io/models/imagenet/inception-bn/)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|72.09%/90.60%|72.00%/90.53%|
| [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd) | VOC2007/2012 | 0.8366 mAP | 0.8364 mAP |
+| [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd) | COCO2014 | 0.2552 mAP | 0.253 mAP |
ResNet50-V1
diff --git a/example/ssd/README.md b/example/ssd/README.md
index 6d4caa481bd7..92a125f1892d 100644
--- a/example/ssd/README.md
+++ b/example/ssd/README.md
@@ -42,6 +42,7 @@ remarkable traits of MXNet.
Due to permission issues, this example is maintained separately in this [repository](https://github.com/zhreshold/mxnet-ssd). Please use that repository's [issues](https://github.com/zhreshold/mxnet-ssd/issues) page for example-specific problems.
### What's new
+* Support training and inference on the COCO dataset. INT8 inference achieves 0.253 mAP on CPU with the MKL-DNN backend, comparable to FP32 accuracy (0.2552 mAP).
* Support uint8 inference on CPU with the MKL-DNN backend. Uint8 inference achieves 0.8364 mAP, comparable to FP32 accuracy (0.8366 mAP).
* Added live camera capture and detection display (run with --camera flag). Example:
`./demo.py --camera --cpu --frame-resize 0.5`
@@ -119,9 +120,9 @@ You can use `./demo.py --camera` to use a video capture device with opencv such
will open a window that will display the camera output together with the detections. You can play
with the detection threshold to get more or less detections.
-### Train the model
+### Train the model on VOC
* Note that we recommend using gluon-cv to train the model; please refer to [gluon-cv ssd](https://gluon-cv.mxnet.io/build/examples_detection/train_ssd_voc.html).
-This example only covers training on Pascal VOC dataset. Other datasets should
+This example only covers training on the Pascal VOC and MS COCO datasets. Other datasets should
be easily supported by adding subclass derived from class `Imdb` in `dataset/imdb.py`.
See example of `dataset/pascal_voc.py` for details.
* Download the converted pretrained `vgg16_reduced` model [here](https://github.com/zhreshold/mxnet-ssd/releases/download/v0.2-beta/vgg16_reduced.zip), unzip `.param` and `.json` files
@@ -166,16 +167,53 @@ Check `python train.py --help` for more training options. For example, if you ha
python train.py --gpus 0,1,2,3 --batch-size 32
```
+### Train the model on COCO
+* Download the COCO2014 dataset; skip this step if you already have it.
+```
+cd /path/to/where_you_store_datasets/
+wget http://images.cocodataset.org/zips/train2014.zip
+wget http://images.cocodataset.org/zips/val2014.zip
+wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
+# Extract the data.
+unzip train2014.zip
+unzip val2014.zip
+unzip annotations_trainval2014.zip
+```
+* Following common practice, we use the `train2014,valminusminival2014` splits of COCO2014 for training and `minival2014` for evaluation.
+* Then link the `COCO2014` folder to `data/coco` (the default location):
+```
+ln -s /path/to/COCO2014 /path/to/incubator-mxnet/example/ssd/data/coco
+```
+Using a symbolic link instead of a copy saves some disk space.
+* Create packed binary files for faster training (a sketch for sanity-checking these files follows at the end of this section):
+```
+# cd /path/to/incubator-mxnet/example/ssd
+bash tools/prepare_coco.sh
+# or, if you are using Windows:
+python tools/prepare_dataset.py --dataset coco --set train2014,valminusminival2014 --target ./data/train.lst --root ./data/coco
+python tools/prepare_dataset.py --dataset coco --set minival2014 --target ./data/val.lst --root ./data/coco --no-shuffle
+```
+* Start training:
+```
+# cd /path/to/incubator-mxnet/example/ssd
+python train.py --label-width=560 --num-class=80 --class-names=./dataset/names/coco_label --pretrained="" --num-example=117265 --batch-size=64
+```
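+
+If you want to sanity-check the packed `.rec` files before a long training run, a minimal sketch like the following can load one batch (this is not part of the example scripts; the paths, batch size, and `label_width` are assumptions matching the commands above):
+```python
+import mxnet as mx
+
+# Load the record file produced by prepare_dataset.py and inspect one batch.
+train_iter = mx.io.ImageDetRecordIter(
+    path_imgrec='./data/train.rec',
+    batch_size=32,
+    data_shape=(3, 300, 300),  # SSD-VGG16 300x300 input
+    label_width=560)           # matches --label-width passed to train.py
+batch = train_iter.next()
+print(batch.data[0].shape, batch.label[0].shape)
+```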
+
### Evaluate the trained model
Make sure you have `val.rec` as the validation dataset; it's the same one used during training. Use:
```
# cd /path/to/incubator-mxnet/example/ssd
python evaluate.py --gpus 0,1 --batch-size 128 --epoch 0
+
+# Evaluate on COCO dataset
+python evaluate.py --gpus 0,1 --batch-size 128 --epoch 0 --num-class=80 --class-names=./dataset/names/mscoco.names
```
### Quantize model
-Follow the [Train instructions](https://github.com/apache/incubator-mxnet/tree/master/example/ssd#train-the-model) to train a FP32 `SSD-VGG16_reduced_300x300` model based on Pascal VOC dataset. You can also download our [SSD-VGG16 pre-trained model](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_vgg16_reduced_300-dd479559.zip) and [packed binary data](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/ssd-val-fc19a535.zip). Create `model` and `data` directories if they're not exist, extract the zip files, then rename the uncompressed files as follows (eg, rename `ssd-val-fc19a535.idx` to `val.idx`, `ssd-val-fc19a535.lst` to `val.lst`, `ssd-val-fc19a535.rec` to `val.rec`, `ssd_vgg16_reduced_300-dd479559.params` to `ssd_vgg16_reduced_300-0000.params`, `ssd_vgg16_reduced_300-symbol-dd479559.json` to `ssd_vgg16_reduced_300-symbol.json`.)
+To quantize a model trained on the VOC dataset, follow the [Train instructions](https://github.com/apache/incubator-mxnet/tree/master/example/ssd#train-the-model-on-voc) to train an FP32 `SSD-VGG16_reduced_300x300` model on the Pascal VOC dataset. You can also download our [SSD-VGG16 pre-trained model](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_vgg16_reduced_300-dd479559.zip) and [packed binary data](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/ssd-val-fc19a535.zip). Create the `model` and `data` directories if they don't exist, extract the zip files, then rename the uncompressed files as follows (e.g., rename `ssd-val-fc19a535.idx` to `val.idx`, `ssd-val-fc19a535.lst` to `val.lst`, `ssd-val-fc19a535.rec` to `val.rec`, `ssd_vgg16_reduced_300-dd479559.params` to `ssd_vgg16_reduced_300-0000.params`, and `ssd_vgg16_reduced_300-symbol-dd479559.json` to `ssd_vgg16_reduced_300-symbol.json`).
+
+To quantize a model trained on the COCO dataset, follow the [Train instructions](https://github.com/apache/incubator-mxnet/tree/master/example/ssd#train-the-model-on-coco) to train an FP32 `SSD-VGG16_reduced_300x300` model on the COCO dataset. You can also download our [SSD-VGG16 pre-trained model](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_vgg16_reduced_300-7fedd4ad.zip) and [packed binary data](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/ssd_coco-val-e91096e8.zip). Create the `model` and `data` directories if they don't exist, extract the zip files, then rename the uncompressed files as follows (e.g., rename `ssd_coco-val-e91096e8.idx` to `val.idx`, `ssd_coco-val-e91096e8.lst` to `val.lst`, `ssd_coco-val-e91096e8.rec` to `val.rec`, `ssd_vgg16_reduced_300-7fedd4ad.params` to `ssd_vgg16_reduced_300-0000.params`, and `ssd_vgg16_reduced_300-symbol-7fedd4ad.json` to `ssd_vgg16_reduced_300-symbol.json`).
```
data/
@@ -199,12 +237,20 @@ After quantization, INT8 models will be saved in `model/` directory. Use the f
# USE MKLDNN AS SUBGRAPH BACKEND
export MXNET_SUBGRAPH_BACKEND=MKLDNN
-# Launch FP32 Inference
+# Launch FP32 Inference on VOC dataset
python evaluate.py --cpu --num-batch 10 --batch-size 224 --deploy --prefix=./model/ssd_
-# Launch INT8 Inference
+# Launch INT8 Inference on VOC dataset
python evaluate.py --cpu --num-batch 10 --batch-size 224 --deploy --prefix=./model/cqssd_
+
+# Launch FP32 Inference on COCO dataset
+python evaluate.py --cpu --num-batch 10 --batch-size 224 --deploy --prefix=./model/ssd_ --num-class=80 --class-names=./dataset/names/mscoco.names
+
+# Launch INT8 Inference on COCO dataset
+python evaluate.py --cpu --num-batch 10 --batch-size 224 --deploy --prefix=./model/cqssd_ --num-class=80 --class-names=./dataset/names/mscoco.names
+
# Launch dummy data Inference
python benchmark_score.py --deploy --prefix=./model/ssd_
python benchmark_score.py --deploy --prefix=./model/cqssd_
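+
+If you drive evaluation from a Python script instead of the shell, the backend can be selected the same way. Note that `MXNET_SUBGRAPH_BACKEND` is read when the graph is bound, so it must be set before the model is loaded (a sketch equivalent to the `export` above):
+```python
+import os
+# Must be set before the symbol is bound so the MKLDNN subgraph pass runs.
+os.environ['MXNET_SUBGRAPH_BACKEND'] = 'MKLDNN'
+
+import mxnet as mx  # safe to import after the variable is set
+```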
diff --git a/example/ssd/dataset/mscoco.py b/example/ssd/dataset/mscoco.py
index 469a15ae2720..dbe6e6909f4d 100644
--- a/example/ssd/dataset/mscoco.py
+++ b/example/ssd/dataset/mscoco.py
@@ -97,6 +97,12 @@ def _load_all(self, anno_file, shuffle):
labels = []
coco = COCO(anno_file)
img_ids = coco.getImgIds()
+        # map COCO category ids to the contiguous class indices used by the example
+ cats = [cat['name'] for cat in coco.loadCats(coco.getCatIds())]
+ class_to_coco_ind = dict(zip(cats, coco.getCatIds()))
+ class_to_ind = dict(zip(self.classes, range(len(self.classes))))
+ coco_ind_to_class_ind = dict([(class_to_coco_ind[cls], class_to_ind[cls])
+                                      for cls in self.classes])
for img_id in img_ids:
# filename
image_info = coco.loadImgs(img_id)[0]
@@ -109,7 +115,7 @@ def _load_all(self, anno_file, shuffle):
annos = coco.loadAnns(anno_ids)
label = []
for anno in annos:
- cat_id = int(anno["category_id"])
+ cat_id = coco_ind_to_class_ind[anno['category_id']]
bbox = anno["bbox"]
assert len(bbox) == 4
xmin = float(bbox[0]) / width
@@ -123,7 +129,7 @@ def _load_all(self, anno_file, shuffle):
if shuffle:
import random
- indices = range(len(image_set_index))
+ indices = list(range(len(image_set_index)))
random.shuffle(indices)
image_set_index = [image_set_index[i] for i in indices]
labels = [labels[i] for i in indices]
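
The remapping added to `_load_all` above is needed because COCO category ids are not contiguous: the 80 classes use ids drawn from 1 to 90, while the training pipeline expects class indices in `[0, num_classes)`. A small illustration using `pycocotools` (the annotation path is an assumption, and unlike the code above, this simplified version maps ids by position rather than by class name):

```python
from pycocotools.coco import COCO

# COCO category ids are sparse; build a dense id -> index lookup.
coco = COCO('annotations/instances_val2014.json')
cat_ids = coco.getCatIds()                       # 80 ids drawn from 1..90
coco_id_to_index = {cid: i for i, cid in enumerate(cat_ids)}
print(len(cat_ids), min(cat_ids), max(cat_ids))  # 80 1 90
```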
diff --git a/example/ssd/dataset/names/mscoco.names b/example/ssd/dataset/names/mscoco.names
index ca76c80b5b2c..941cb4e13922 100644
--- a/example/ssd/dataset/names/mscoco.names
+++ b/example/ssd/dataset/names/mscoco.names
@@ -1,8 +1,8 @@
person
bicycle
car
-motorbike
-aeroplane
+motorcycle
+airplane
bus
train
truck
@@ -55,12 +55,12 @@ pizza
donut
cake
chair
-sofa
-pottedplant
+couch
+potted plant
bed
-diningtable
+dining table
toilet
-tvmonitor
+tv
laptop
mouse
remote
diff --git a/example/ssd/symbol/legacy_vgg16_ssd_300.py b/example/ssd/symbol/legacy_vgg16_ssd_300.py
index 29fc30be65d4..0acac6e4294b 100644
--- a/example/ssd/symbol/legacy_vgg16_ssd_300.py
+++ b/example/ssd/symbol/legacy_vgg16_ssd_300.py
@@ -200,8 +200,7 @@ def get_symbol(num_classes=20, nms_thresh=0.5, force_suppress=False,
loc_preds = net.get_internals()["multibox_loc_pred_output"]
anchor_boxes = net.get_internals()["multibox_anchors_output"]
- cls_prob = mx.symbol.SoftmaxActivation(data=cls_preds, mode='channel', \
- name='cls_prob')
+ cls_prob = mx.symbol.softmax(data=cls_preds, axis=1, name='cls_prob')
out = mx.symbol.contrib.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \
name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress,
variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk)
diff --git a/example/ssd/symbol/legacy_vgg16_ssd_512.py b/example/ssd/symbol/legacy_vgg16_ssd_512.py
index c5c3095dfd77..74d6b37fc11e 100644
--- a/example/ssd/symbol/legacy_vgg16_ssd_512.py
+++ b/example/ssd/symbol/legacy_vgg16_ssd_512.py
@@ -203,8 +203,7 @@ def get_symbol(num_classes=20, nms_thresh=0.5, force_suppress=False, nms_topk=40
loc_preds = net.get_internals()["multibox_loc_pred_output"]
anchor_boxes = net.get_internals()["multibox_anchors_output"]
- cls_prob = mx.symbol.SoftmaxActivation(data=cls_preds, mode='channel', \
- name='cls_prob')
+ cls_prob = mx.symbol.softmax(data=cls_preds, axis=1, name='cls_prob')
out = mx.symbol.contrib.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \
name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress,
variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk)
diff --git a/example/ssd/symbol/symbol_builder.py b/example/ssd/symbol/symbol_builder.py
index 041f83eb44da..135c42e8be15 100644
--- a/example/ssd/symbol/symbol_builder.py
+++ b/example/ssd/symbol/symbol_builder.py
@@ -175,8 +175,7 @@ def get_symbol(network, num_classes, from_layers, num_filters, sizes, ratios,
num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \
num_channels=num_filters, clip=False, interm_layer=0, steps=steps)
- cls_prob = mx.symbol.SoftmaxActivation(data=cls_preds, mode='channel', \
- name='cls_prob')
+ cls_prob = mx.symbol.softmax(data=cls_preds, axis=1, name='cls_prob')
out = mx.symbol.contrib.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \
name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress,
variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk)
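
The same one-line replacement appears in `legacy_vgg16_ssd_300.py`, `legacy_vgg16_ssd_512.py`, and `symbol_builder.py`: `SoftmaxActivation` with `mode='channel'` is deprecated, and `softmax` with `axis=1` computes the same per-anchor class probabilities along the channel axis. A quick numerical check (the shape is an assumption; `cls_preds` is `(batch, num_classes + 1, num_anchors)`):

```python
import mxnet as mx

# Confirm the deprecated operator and its replacement agree on axis 1.
x = mx.nd.random.uniform(shape=(2, 21, 8732))
a = mx.nd.SoftmaxActivation(x, mode='channel')
b = mx.nd.softmax(x, axis=1)
print(mx.nd.abs(a - b).max())  # expect ~0
```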
diff --git a/example/ssd/train.py b/example/ssd/train.py
index 09c618a96429..5965aeec6c7a 100755
--- a/example/ssd/train.py
+++ b/example/ssd/train.py
@@ -103,6 +103,8 @@ def parse_args():
help='use difficult ground-truths in evaluation')
parser.add_argument('--no-voc07', dest='use_voc07_metric', action='store_false',
help='dont use PASCAL VOC 07 11-point metric')
+ parser.add_argument('--kv-store', type=str, default='local',
+ help='key-value store type')
args = parser.parse_args()
return args
@@ -150,4 +152,5 @@ def parse_class_names(args):
force_nms=args.force_nms,
ovp_thresh=args.overlap_thresh,
use_difficult=args.use_difficult,
- voc07_metric=args.use_voc07_metric)
+ voc07_metric=args.use_voc07_metric,
+ kv_store=args.kv_store)
diff --git a/example/ssd/train/metric.py b/example/ssd/train/metric.py
index 731f8fcc19f4..eeb9796bf4a8 100644
--- a/example/ssd/train/metric.py
+++ b/example/ssd/train/metric.py
@@ -39,6 +39,17 @@ def reset(self):
self.num_inst = [0] * self.num
self.sum_metric = [0.0] * self.num
+ def reset_local(self):
+ """
+        override reset_local behavior
+ """
+ if getattr(self, 'num', None) is None:
+ self.num_inst = 0
+ self.sum_metric = 0.0
+ else:
+ self.num_inst = [0] * self.num
+ self.sum_metric = [0.0] * self.num
+
def update(self, labels, preds):
"""
Implementation of updating metrics
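
Newer versions of `mx.metric.EvalMetric` define `reset_local` (used, for example, by `Speedometer` with `auto_reset` to clear per-interval statistics while keeping global ones). The base implementation resets the scalar `num_inst`/`sum_metric`, which would clobber the list-valued state `MultiBoxMetric` keeps, hence the override above. Any custom metric that redefines its state in `reset` needs the same treatment; a hypothetical minimal example:

```python
import mxnet as mx

class CountMetric(mx.metric.EvalMetric):
    """Hypothetical metric counting updates; keeps reset/reset_local in sync."""
    def __init__(self):
        super(CountMetric, self).__init__('count')

    def update(self, labels, preds):
        self.sum_metric += 1.0
        self.num_inst += 1

    def reset(self):
        self.num_inst = 0
        self.sum_metric = 0.0

    # Mirror reset so per-interval logging sees consistent state.
    reset_local = reset
```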
diff --git a/example/ssd/train/train_net.py b/example/ssd/train/train_net.py
index 304a43b3d949..b37e3d5abcec 100644
--- a/example/ssd/train/train_net.py
+++ b/example/ssd/train/train_net.py
@@ -97,7 +97,7 @@ def train_net(net, train_path, num_classes, batch_size,
use_difficult=False, class_names=None,
voc07_metric=False, nms_topk=400, force_suppress=False,
train_list="", val_path="", val_list="", iter_monitor=0,
- monitor_pattern=".*", log_file=None):
+ monitor_pattern=".*", log_file=None, kv_store=None):
"""
Wrapper for training phase.
@@ -258,6 +258,9 @@ def train_net(net, train_path, num_classes, batch_size,
else:
valid_metric = MApMetric(ovp_thresh, use_difficult, class_names, pred_idx=3)
+    # create the kvstore if a type was specified on the command line
+ kv = mx.kvstore.create(kv_store) if kv_store else None
+
mod.fit(train_iter,
val_iter,
eval_metric=MultiBoxMetric(),
@@ -272,4 +275,5 @@ def train_net(net, train_path, num_classes, batch_size,
arg_params=args,
aux_params=auxs,
allow_missing=True,
- monitor=monitor)
+ monitor=monitor,
+ kvstore=kv)
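
The new `--kv-store` value is passed straight to `mx.kvstore.create`. These are the stock single-machine MXNet kvstore types (not specific to this example); `device` is usually preferable when training on multiple GPUs:

```python
import mxnet as mx

# 'local' aggregates gradients on the CPU (the default used by train.py);
# 'device' aggregates on GPU, which is usually faster for multi-GPU runs.
kv = mx.kvstore.create('local')
print(kv.type)  # local
# Distributed training would use 'dist_sync' or 'dist_async' with the launcher.
```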