diff --git a/contrib/clojure-package/examples/infer/objectdetector/project.clj b/contrib/clojure-package/examples/infer/objectdetector/project.clj index cdd9a8991dc8..da01797f5a21 100644 --- a/contrib/clojure-package/examples/infer/objectdetector/project.clj +++ b/contrib/clojure-package/examples/infer/objectdetector/project.clj @@ -22,7 +22,6 @@ :aliases {"run-detector" ["run" "--" "-m" "models/resnet50_ssd/resnet50_ssd_model" "-i" "images/dog.jpg" "-d" "images/"]} :dependencies [[org.clojure/clojure "1.9.0"] [org.clojure/tools.cli "0.4.1"] - [origami "4.0.0-3"] [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]] :main ^:skip-aot infer.objectdetector-example :profiles {:uberjar {:aot :all}}) diff --git a/contrib/clojure-package/examples/infer/objectdetector/src/infer/draw.clj b/contrib/clojure-package/examples/infer/objectdetector/src/infer/draw.clj deleted file mode 100644 index d29b34b5c22a..000000000000 --- a/contrib/clojure-package/examples/infer/objectdetector/src/infer/draw.clj +++ /dev/null @@ -1,44 +0,0 @@ -;; Licensed to the Apache Software Foundation (ASF) under one or more -;; contributor license agreements. See the NOTICE file distributed with -;; this work for additional information regarding copyright ownership. -;; The ASF licenses this file to You under the Apache License, Version 2.0 -;; (the "License"); you may not use this file except in compliance with -;; the License. You may obtain a copy of the License at -;; -;; http://www.apache.org/licenses/LICENSE-2.0 -;; -;; Unless required by applicable law or agreed to in writing, software -;; distributed under the License is distributed on an "AS IS" BASIS, -;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -;; See the License for the specific language governing permissions and -;; limitations under the License. -;; - -(ns infer.draw - (:require - [opencv4.colors.rgb :as rgb] - [opencv4.core :refer [FONT_HERSHEY_PLAIN imread imwrite new-point put-text! rectangle]])) - -(defn black-boxes! [img results] - (doseq [{confidence :confidence label :label top-left :top-left bottom-right :bottom-right} results] - (let [w (.width img) - h (.height img) - top-left-p (new-point (int (* w (first top-left))) (int (* h (second top-left)))) - bottom-right-p (new-point (int (* w (first bottom-right))) (int (* h (second bottom-right))))] - (if (< 15 confidence) - (do - (rectangle img top-left-p bottom-right-p rgb/white 1) - (put-text! img - (str label "[" confidence "% ]") - top-left-p - FONT_HERSHEY_PLAIN - 1.0 - rgb/white 1))))) - img) - -(defn draw-bounds [image results output-dir] - (let [out-file (str output-dir "/" (.getName (clojure.java.io/as-file image)))] - (-> image - (imread) - (black-boxes! 
results) - (imwrite out-file)))) \ No newline at end of file diff --git a/contrib/clojure-package/examples/infer/objectdetector/src/infer/objectdetector_example.clj b/contrib/clojure-package/examples/infer/objectdetector/src/infer/objectdetector_example.clj index 9331798b038c..65d822ff36aa 100644 --- a/contrib/clojure-package/examples/infer/objectdetector/src/infer/objectdetector_example.clj +++ b/contrib/clojure-package/examples/infer/objectdetector/src/infer/objectdetector_example.clj @@ -17,13 +17,15 @@ (ns infer.objectdetector-example (:require [org.apache.clojure-mxnet.context :as context] [org.apache.clojure-mxnet.dtype :as dtype] + [org.apache.clojure-mxnet.image :as image] [org.apache.clojure-mxnet.infer :as infer] [org.apache.clojure-mxnet.layout :as layout] [clojure.java.io :as io] - [infer.draw :as draw] - [clojure.string :refer [join]] + [clojure.string :as string] [clojure.tools.cli :refer [parse-opts]]) - (:gen-class)) + (:gen-class) + (:import (javax.imageio ImageIO) + (java.io File))) (defn check-valid-dir "Check that the input directory exists" @@ -54,27 +56,36 @@ :validate [check-valid-dir "Input directory not found"]] ["-h" "--help"]]) -(defn result->map [{:keys [class prob x-min y-min x-max y-max]}] - (hash-map - :label class - :confidence (int (* 100 prob)) - :top-left [x-min y-min] - :bottom-right [x-max y-max])) -(defn print-results [results] - (doseq [_r results] - (println (format "Class: %s Confidence=%s Coords=(%s, %s)" - (_r :label) - (_r :confidence) - (_r :top-left) - (_r :bottom-right))))) +(defn process-result! [output-dir image-path predictions] + (println "looking at image" image-path) + (println "predictions: " predictions) + (let [buf (ImageIO/read (new File image-path)) + width (.getWidth buf) + height (.getHeight buf) + names (mapv :class predictions) + coords (mapv (fn [prediction] + (-> prediction + (update :x-min #(* width %)) + (update :x-max #(* width %)) + (update :y-min #(* height %)) + (update :y-max #(* height %)))) + predictions) + new-img (-> (ImageIO/read (new File image-path)) + (image/draw-bounding-box! coords + {:stroke 2 + :names (mapv #(str (:class %) "-" (:prob %)) + predictions) + :transparency 0.5 + + :font-size-mult 1.0}))] + (->> (string/split image-path #"\/") + last + (io/file output-dir) + (ImageIO/write new-img "jpg")))) (defn process-results [images results output-dir] - (dotimes [i (count images)] - (let [image (nth images i) _results (map result->map (nth results i))] - (println "processing: " image) - (print-results _results) - (draw/draw-bounds image _results output-dir)))) + (doall (map (partial process-result! output-dir) images results))) (defn detect-single-image "Detect objects in a single image and print top-5 predictions" @@ -82,7 +93,7 @@ ([detector input-image output-dir] (.mkdir (io/file output-dir)) (let [image (infer/load-image-from-file input-image) - topk 5 + topk 3 res (infer/detect-objects detector image topk) ] (process-results @@ -109,7 +120,7 @@ (apply concat (for [image-files image-file-batches] (let [image-batch (infer/load-image-paths image-files) - topk 5 + topk 3 res (infer/detect-objects-batch detector image-batch topk) ] (process-results image-files @@ -143,5 +154,5 @@ (parse-opts args cli-options)] (cond (:help options) (println summary) - (some? errors) (println (join "\n" errors)) + (some? 
errors) (println (string/join "\n" errors)) :else (run-detector options)))) diff --git a/contrib/clojure-package/examples/infer/objectdetector/test/infer/objectdetector_example_test.clj b/contrib/clojure-package/examples/infer/objectdetector/test/infer/objectdetector_example_test.clj index 696d96b3ae3a..3d20c614918f 100644 --- a/contrib/clojure-package/examples/infer/objectdetector/test/infer/objectdetector_example_test.clj +++ b/contrib/clojure-package/examples/infer/objectdetector/test/infer/objectdetector_example_test.clj @@ -47,11 +47,11 @@ {:keys [class prob x-min x-max y-min y-max] :as pred} (first predictions)] (clojure.pprint/pprint predictions) (is (some? predictions)) - (is (= 5 (count predictions))) + (is (= 3 (count predictions))) (is (string? class)) (is (< 0.8 prob)) (is (every? #(< 0 % 1) [x-min x-max y-min y-max])) - (is (= #{"dog" "person" "bicycle" "car"} (set (mapv :class predictions)))))) + (is (= #{"dog" "bicycle" "car"} (set (mapv :class predictions)))))) (deftest test-batch-detection (let [detector (create-detector) @@ -60,7 +60,7 @@ predictions (first batch-predictions) {:keys [class prob x-min x-max y-min y-max] :as pred} (first predictions)] (is (some? batch-predictions)) - (is (= 5 (count predictions))) + (is (= 3 (count predictions))) (is (string? class)) (is (< 0.8 prob)) (println [x-min x-max y-min y-max]) diff --git a/contrib/clojure-package/integration-tests.sh b/contrib/clojure-package/integration-tests.sh index 5ae26e8dfcd4..3df9ba9787b5 100755 --- a/contrib/clojure-package/integration-tests.sh +++ b/contrib/clojure-package/integration-tests.sh @@ -26,7 +26,7 @@ lein install # then run through the examples EXAMPLES_HOME=${MXNET_HOME}/contrib/clojure-package/examples # use AWK pattern for blacklisting -TEST_CASES=`find ${EXAMPLES_HOME} -name test | awk '!/dontselect1|cnn-text-classification|gan|neural-style|infer|pre-trained-models/'` +TEST_CASES=`find ${EXAMPLES_HOME} -name test | awk '!/dontselect1|cnn-text-classification|gan|neural-style|pre-trained-models/'` for i in $TEST_CASES ; do cd ${i} && lein test done diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj index e2e98c4e8f01..f81a35803171 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj @@ -202,11 +202,11 @@ (Image/toImage input)) (s/def ::buffered-image #(instance? BufferedImage %)) -(s/def ::xmin integer?) -(s/def ::xmax integer?) -(s/def ::ymin integer?) -(s/def ::ymax integer?) -(s/def ::coordinate (s/keys :req-un [::xmin ::xmax ::ymin ::ymax])) +(s/def ::x-min number?) +(s/def ::x-max number?) +(s/def ::y-min number?) +(s/def ::y-max number?) +(s/def ::coordinate (s/keys :req-un [::x-min ::x-max ::y-min ::y-max])) (s/def ::coordinates (s/coll-of ::coordinate)) (s/def ::names (s/nilable (s/coll-of string?))) (s/def ::stroke (s/and integer? pos?)) @@ -217,11 +217,11 @@ (defn- convert-coordinate "Convert bounding box coordinate to Scala correct types." - [{:keys [xmin xmax ymin ymax]}] - {:xmin (int xmin) - :xmax (int xmax) - :ymin (int ymin) - :ymax (int ymax)}) + [{:keys [x-min x-max y-min y-max]}] + {:xmin (int x-min) + :xmax (int x-max) + :ymin (int y-min) + :ymax (int y-max)}) (defn draw-bounding-box! "Draw bounding boxes on `buffered-image` and Mutate the input image. 
@@ -233,9 +233,9 @@ `transparency`: float in (0.0, 1.0) - Transparency of the bounding box returns: Modified `buffered-image` Ex: - (draw-bounding-box! img [{:xmin 0 :xmax 100 :ymin 0 :ymax 100}]) - (draw-bounding-box! [{:xmin 190 :xmax 850 :ymin 50 :ymax 450} - {:xmin 200 :xmax 350 :ymin 440 :ymax 530}] + (draw-bounding-box! img [{:x-min 0 :x-max 100 :y-min 0 :y-max 100}]) + (draw-bounding-box! img [{:x-min 190 :x-max 850 :y-min 50 :y-max 450} + {:x-min 200 :x-max 350 :y-min 440 :y-max 530}] {:stroke 2 :names [\"pug\" \"cookie\"] :transparency 0.8 diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj index 38ab11c86012..23b88d07e896 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/image_test.clj @@ -20,7 +20,8 @@ [org.apache.clojure-mxnet.ndarray :as ndarray] [clojure.java.io :as io] [clojure.test :refer :all]) - (:import (javax.imageio ImageIO))) + (:import (javax.imageio ImageIO) + (java.io File))) (def tmp-dir (System/getProperty "java.io.tmpdir")) (def image-path (.getAbsolutePath (io/file tmp-dir "Pug-Cookie.jpg"))) @@ -76,4 +77,15 @@ (let [img-arr (image/read-image image-path) resized-arr (image/resize-image img-arr 224 224) new-img (image/to-image resized-arr)] - (is (= true (ImageIO/write new-img "png" (io/file tmp-dir "out.png")))))) + (is (ImageIO/write new-img "png" (io/file tmp-dir "out.png"))))) + +(deftest test-draw-bounding-box! + (let [orig-img (ImageIO/read (new File image-path)) + new-img (-> orig-img + (image/draw-bounding-box! [{:x-min 190 :x-max 850 :y-min 50 :y-max 450} + {:x-min 200 :x-max 350 :y-min 440 :y-max 530}] + {:stroke 2 + :names ["pug" "cookie"] + :transparency 0.8 + :font-size-mult 2.0}))] + (is (ImageIO/write new-img "png" (io/file tmp-dir "out.png"))))) diff --git a/example/quantization/README.md b/example/quantization/README.md index b77537d4fba7..fc9a26755b4e 100644 --- a/example/quantization/README.md +++ b/example/quantization/README.md @@ -35,6 +35,7 @@ The following models have been tested on Linux systems. |[ResNet152-V2](#8)|[MXNet ModelZoo](http://data.mxnet.io/models/imagenet/resnet/152-layers/)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|76.76%/93.03%|76.48%/92.96%| |[Inception-BN](#9)|[MXNet ModelZoo](http://data.mxnet.io/models/imagenet/inception-bn/)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|72.09%/90.60%|72.00%/90.53%| | [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd) | VOC2007/2012 | 0.8366 mAP | 0.8364 mAP | +| [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd) | COCO2014 | 0.2552 mAP | 0.253 mAP |


diff --git a/example/ssd/README.md b/example/ssd/README.md index 6d4caa481bd7..92a125f1892d 100644 --- a/example/ssd/README.md +++ b/example/ssd/README.md @@ -42,6 +42,7 @@ remarkable traits of MXNet. Due to the permission issue, this example is maintained in this [repository](https://github.com/zhreshold/mxnet-ssd) separately. You can use the link regarding specific per example [issues](https://github.com/zhreshold/mxnet-ssd/issues). ### What's new +* Support training and inference on the COCO dataset. Int8 inference achieves 0.253 mAP on CPU with the MKL-DNN backend, an accuracy comparable to FP32 (0.2552 mAP). * Support uint8 inference on CPU with MKL-DNN backend. Uint8 inference achieves 0.8364 mAP, which is a comparable accuracy to FP32 (0.8366 mAP). * Added live camera capture and detection display (run with --camera flag). Example: `./demo.py --camera --cpu --frame-resize 0.5` @@ -119,9 +120,9 @@ You can use `./demo.py --camera` to use a video capture device with opencv such will open a window that will display the camera output together with the detections. You can play with the detection threshold to get more or less detections. -### Train the model +### Train the model on VOC * Note that we recommend to use gluon-cv to train the model, please refer to [gluon-cv ssd](https://gluon-cv.mxnet.io/build/examples_detection/train_ssd_voc.html). -This example only covers training on Pascal VOC dataset. Other datasets should +This example only covers training on the Pascal VOC or MS COCO datasets. Other datasets should be easily supported by adding subclass derived from class `Imdb` in `dataset/imdb.py`. See example of `dataset/pascal_voc.py` for details. * Download the converted pretrained `vgg16_reduced` model [here](https://github.com/zhreshold/mxnet-ssd/releases/download/v0.2-beta/vgg16_reduced.zip), unzip `.param` and `.json` files @@ -166,16 +167,53 @@ Check `python train.py --help` for more training options. For example, if you ha python train.py --gpus 0,1,2,3 --batch-size 32 ``` +### Train the model on COCO +* Download the COCO2014 dataset; skip this step if you already have it. +``` +cd /path/to/where_you_store_datasets/ +wget http://images.cocodataset.org/zips/train2014.zip +wget http://images.cocodataset.org/zips/val2014.zip +wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip +# Extract the data. +unzip train2014.zip +unzip val2014.zip +unzip annotations_trainval2014.zip ``` +* We use the `train2014,valminusminival2014` sets of COCO2014 for training and `minival2014` for evaluation, which is a common strategy. +* Then link the `COCO2014` folder to `data/coco` (the default location): +``` +ln -s /path/to/COCO2014 /path/to/incubator-mxnet/example/ssd/data/coco +``` +Using a symbolic link instead of a copy saves a bit of disk space. +* Create the packed binary files for faster training: +``` +# cd /path/to/incubator-mxnet/example/ssd +bash tools/prepare_coco.sh +# or, if you are using Windows: +python tools/prepare_dataset.py --dataset coco --set train2014,valminusminival2014 --target ./data/train.lst --root ./data/coco +python tools/prepare_dataset.py --dataset coco --set minival2014 --target ./data/val.lst --root ./data/coco --no-shuffle +``` +* Start training: +``` +# cd /path/to/incubator-mxnet/example/ssd +python train.py --label-width=560 --num-class=80 --class-names=./dataset/names/coco_label --pretrained="" --num-example=117265 --batch-size=64 +``` + ### Evalute trained model Make sure you have val.rec as validation dataset. It's the same one as used in training.
Use: ``` # cd /path/to/incubator-mxnet/example/ssd python evaluate.py --gpus 0,1 --batch-size 128 --epoch 0 + +# Evaluate on COCO dataset +python evaluate.py --gpus 0,1 --batch-size 128 --epoch 0 --num-class=80 --class-names=./dataset/names/mscoco.names ``` ### Quantize model -Follow the [Train instructions](https://github.com/apache/incubator-mxnet/tree/master/example/ssd#train-the-model) to train a FP32 `SSD-VGG16_reduced_300x300` model based on Pascal VOC dataset. You can also download our [SSD-VGG16 pre-trained model](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_vgg16_reduced_300-dd479559.zip) and [packed binary data](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/ssd-val-fc19a535.zip). Create `model` and `data` directories if they're not exist, extract the zip files, then rename the uncompressed files as follows (eg, rename `ssd-val-fc19a535.idx` to `val.idx`, `ssd-val-fc19a535.lst` to `val.lst`, `ssd-val-fc19a535.rec` to `val.rec`, `ssd_vgg16_reduced_300-dd479559.params` to `ssd_vgg16_reduced_300-0000.params`, `ssd_vgg16_reduced_300-symbol-dd479559.json` to `ssd_vgg16_reduced_300-symbol.json`.) +To quantize a model on the VOC dataset, follow the [Train instructions](https://github.com/apache/incubator-mxnet/tree/master/example/ssd#train-the-model-on-voc) to train an FP32 `SSD-VGG16_reduced_300x300` model based on the Pascal VOC dataset. You can also download our [SSD-VGG16 pre-trained model](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_vgg16_reduced_300-dd479559.zip) and [packed binary data](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/ssd-val-fc19a535.zip). Create `model` and `data` directories if they don't exist, extract the zip files, then rename the uncompressed files as follows (e.g., rename `ssd-val-fc19a535.idx` to `val.idx`, `ssd-val-fc19a535.lst` to `val.lst`, `ssd-val-fc19a535.rec` to `val.rec`, `ssd_vgg16_reduced_300-dd479559.params` to `ssd_vgg16_reduced_300-0000.params`, `ssd_vgg16_reduced_300-symbol-dd479559.json` to `ssd_vgg16_reduced_300-symbol.json`.) + +To quantize a model on the COCO dataset, follow the [Train instructions](https://github.com/apache/incubator-mxnet/tree/master/example/ssd#train-the-model-on-coco) to train an FP32 `SSD-VGG16_reduced_300x300` model based on the COCO dataset. You can also download our [SSD-VGG16 pre-trained model](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_vgg16_reduced_300-7fedd4ad.zip) and [packed binary data](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/ssd_coco-val-e91096e8.zip). Create `model` and `data` directories if they don't exist, extract the zip files, then rename the uncompressed files as follows (e.g., rename `ssd_coco-val-e91096e8.idx` to `val.idx`, `ssd_coco-val-e91096e8.lst` to `val.lst`, `ssd_coco-val-e91096e8.rec` to `val.rec`, `ssd_vgg16_reduced_300-7fedd4ad.params` to `ssd_vgg16_reduced_300-0000.params`, `ssd_vgg16_reduced_300-symbol-7fedd4ad.json` to `ssd_vgg16_reduced_300-symbol.json`.) ``` data/ @@ -199,12 +237,20 @@ After quantization, INT8 models will be saved in `model/` dictionary.
Use the f # USE MKLDNN AS SUBGRAPH BACKEND export MXNET_SUBGRAPH_BACKEND=MKLDNN -# Launch FP32 Inference +# Launch FP32 Inference on VOC dataset python evaluate.py --cpu --num-batch 10 --batch-size 224 --deploy --prefix=./model/ssd_ -# Launch INT8 Inference +# Launch INT8 Inference on VOC dataset python evaluate.py --cpu --num-batch 10 --batch-size 224 --deploy --prefix=./model/cqssd_ +# Launch FP32 Inference on COCO dataset + +python evaluate.py --cpu --num-batch 10 --batch-size 224 --deploy --prefix=./model/ssd_ --num-class=80 --class-names=./dataset/names/mscoco.names + +# Launch INT8 Inference on COCO dataset + +python evaluate.py --cpu --num-batch 10 --batch-size 224 --deploy --prefix=./model/cqssd_ --num-class=80 --class-names=./dataset/names/mscoco.names + # Launch dummy data Inference python benchmark_score.py --deploy --prefix=./model/ssd_ python benchmark_score.py --deploy --prefix=./model/cqssd_ diff --git a/example/ssd/dataset/mscoco.py b/example/ssd/dataset/mscoco.py index 469a15ae2720..dbe6e6909f4d 100644 --- a/example/ssd/dataset/mscoco.py +++ b/example/ssd/dataset/mscoco.py @@ -97,6 +97,12 @@ def _load_all(self, anno_file, shuffle): labels = [] coco = COCO(anno_file) img_ids = coco.getImgIds() + # deal with class names + cats = [cat['name'] for cat in coco.loadCats(coco.getCatIds())] + class_to_coco_ind = dict(zip(cats, coco.getCatIds())) + class_to_ind = dict(zip(self.classes, range(len(self.classes)))) + coco_ind_to_class_ind = dict([(class_to_coco_ind[cls], class_to_ind[cls]) + for cls in self.classes[0:]]) for img_id in img_ids: # filename image_info = coco.loadImgs(img_id)[0] @@ -109,7 +115,7 @@ def _load_all(self, anno_file, shuffle): annos = coco.loadAnns(anno_ids) label = [] for anno in annos: - cat_id = int(anno["category_id"]) + cat_id = coco_ind_to_class_ind[anno['category_id']] bbox = anno["bbox"] assert len(bbox) == 4 xmin = float(bbox[0]) / width @@ -123,7 +129,7 @@ def _load_all(self, anno_file, shuffle): if shuffle: import random - indices = range(len(image_set_index)) + indices = list(range(len(image_set_index))) random.shuffle(indices) image_set_index = [image_set_index[i] for i in indices] labels = [labels[i] for i in indices] diff --git a/example/ssd/dataset/names/mscoco.names b/example/ssd/dataset/names/mscoco.names index ca76c80b5b2c..941cb4e13922 100644 --- a/example/ssd/dataset/names/mscoco.names +++ b/example/ssd/dataset/names/mscoco.names @@ -1,8 +1,8 @@ person bicycle car -motorbike -aeroplane +motorcycle +airplane bus train truck @@ -55,12 +55,12 @@ pizza donut cake chair -sofa -pottedplant +couch +potted plant bed -diningtable +dining table toilet -tvmonitor +tv laptop mouse remote diff --git a/example/ssd/symbol/legacy_vgg16_ssd_300.py b/example/ssd/symbol/legacy_vgg16_ssd_300.py index 29fc30be65d4..0acac6e4294b 100644 --- a/example/ssd/symbol/legacy_vgg16_ssd_300.py +++ b/example/ssd/symbol/legacy_vgg16_ssd_300.py @@ -200,8 +200,7 @@ def get_symbol(num_classes=20, nms_thresh=0.5, force_suppress=False, loc_preds = net.get_internals()["multibox_loc_pred_output"] anchor_boxes = net.get_internals()["multibox_anchors_output"] - cls_prob = mx.symbol.SoftmaxActivation(data=cls_preds, mode='channel', \ - name='cls_prob') + cls_prob = mx.symbol.softmax(data=cls_preds, axis=1, name='cls_prob') out = mx.symbol.contrib.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \ name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk) diff --git a/example/ssd/symbol/legacy_vgg16_ssd_512.py 
b/example/ssd/symbol/legacy_vgg16_ssd_512.py index c5c3095dfd77..74d6b37fc11e 100644 --- a/example/ssd/symbol/legacy_vgg16_ssd_512.py +++ b/example/ssd/symbol/legacy_vgg16_ssd_512.py @@ -203,8 +203,7 @@ def get_symbol(num_classes=20, nms_thresh=0.5, force_suppress=False, nms_topk=40 loc_preds = net.get_internals()["multibox_loc_pred_output"] anchor_boxes = net.get_internals()["multibox_anchors_output"] - cls_prob = mx.symbol.SoftmaxActivation(data=cls_preds, mode='channel', \ - name='cls_prob') + cls_prob = mx.symbol.softmax(data=cls_preds, axis=1, name='cls_prob') out = mx.symbol.contrib.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \ name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk) diff --git a/example/ssd/symbol/symbol_builder.py b/example/ssd/symbol/symbol_builder.py index 041f83eb44da..135c42e8be15 100644 --- a/example/ssd/symbol/symbol_builder.py +++ b/example/ssd/symbol/symbol_builder.py @@ -175,8 +175,7 @@ def get_symbol(network, num_classes, from_layers, num_filters, sizes, ratios, num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \ num_channels=num_filters, clip=False, interm_layer=0, steps=steps) - cls_prob = mx.symbol.SoftmaxActivation(data=cls_preds, mode='channel', \ - name='cls_prob') + cls_prob = mx.symbol.softmax(data=cls_preds, axis=1, name='cls_prob') out = mx.symbol.contrib.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \ name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk) diff --git a/example/ssd/train.py b/example/ssd/train.py index 09c618a96429..5965aeec6c7a 100755 --- a/example/ssd/train.py +++ b/example/ssd/train.py @@ -103,6 +103,8 @@ def parse_args(): help='use difficult ground-truths in evaluation') parser.add_argument('--no-voc07', dest='use_voc07_metric', action='store_false', help='dont use PASCAL VOC 07 11-point metric') + parser.add_argument('--kv-store', type=str, default='local', + help='key-value store type') args = parser.parse_args() return args @@ -150,4 +152,5 @@ def parse_class_names(args): force_nms=args.force_nms, ovp_thresh=args.overlap_thresh, use_difficult=args.use_difficult, - voc07_metric=args.use_voc07_metric) + voc07_metric=args.use_voc07_metric, + kv_store=args.kv_store) diff --git a/example/ssd/train/metric.py b/example/ssd/train/metric.py index 731f8fcc19f4..eeb9796bf4a8 100644 --- a/example/ssd/train/metric.py +++ b/example/ssd/train/metric.py @@ -39,6 +39,17 @@ def reset(self): self.num_inst = [0] * self.num self.sum_metric = [0.0] * self.num + def reset_local(self): + """ + override reset behavior + """ + if getattr(self, 'num', None) is None: + self.num_inst = 0 + self.sum_metric = 0.0 + else: + self.num_inst = [0] * self.num + self.sum_metric = [0.0] * self.num + def update(self, labels, preds): """ Implementation of updating metrics diff --git a/example/ssd/train/train_net.py b/example/ssd/train/train_net.py index 304a43b3d949..b37e3d5abcec 100644 --- a/example/ssd/train/train_net.py +++ b/example/ssd/train/train_net.py @@ -97,7 +97,7 @@ def train_net(net, train_path, num_classes, batch_size, use_difficult=False, class_names=None, voc07_metric=False, nms_topk=400, force_suppress=False, train_list="", val_path="", val_list="", iter_monitor=0, - monitor_pattern=".*", log_file=None): + monitor_pattern=".*", log_file=None, kv_store=None): """ Wrapper for training phase. 
@@ -258,6 +258,9 @@ def train_net(net, train_path, num_classes, batch_size, else: valid_metric = MApMetric(ovp_thresh, use_difficult, class_names, pred_idx=3) + # create the kvstore if a kv-store type is specified + kv = mx.kvstore.create(kv_store) if kv_store else None + mod.fit(train_iter, val_iter, eval_metric=MultiBoxMetric(), @@ -272,4 +275,5 @@ arg_params=args, aux_params=auxs, allow_missing=True, - monitor=monitor) + monitor=monitor, + kvstore=kv) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 9a24b7516128..2f9d74dc5ba0 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -95,10 +95,22 @@ typedef void *CudaKernelHandle; typedef void *ProfileHandle; /*! \brief handle to DLManagedTensor*/ typedef void *DLManagedTensorHandle; - +/*! \brief handle to Context */ +typedef const void *ContextHandle; +/*! \brief handle to Engine FnProperty */ +typedef const void *EngineFnPropertyHandle; +/*! \brief handle to Engine VarHandle */ +typedef void *EngineVarHandle; + +/*! \brief Engine asynchronous operation */ +typedef void (*EngineAsyncFunc)(void*, void*, void*); +/*! \brief Engine synchronous operation */ +typedef void (*EngineSyncFunc)(void*, void*); +/*! \brief Callback to free the param for EngineAsyncFunc/EngineSyncFunc */ +typedef void (*EngineFuncParamDeleter)(void*); typedef void (*ExecutorMonitorCallback)(const char*, NDArrayHandle, - void *); + void*); struct NativeOpInfo { void (*forward)(int, float**, int*, unsigned**, int*, void*); @@ -2541,6 +2553,51 @@ MXNET_DLL int MXNDArrayGetSharedMemHandle(NDArrayHandle handle, int* shared_pid, MXNET_DLL int MXNDArrayCreateFromSharedMem(int shared_pid, int shared_id, const mx_uint *shape, mx_uint ndim, int dtype, NDArrayHandle *out); +/*! + * \brief Push an asynchronous operation to the engine. + * \param async_func Execution function which takes a parameter on_complete + * that must be called when the execution completes. + * \param func_param The parameter set on calling async_func, can be NULL. + * \param deleter The callback to free func_param, can be NULL. + * \param ctx_handle Execution context. + * \param const_vars_handle The variables that the current operation will use + * but not mutate. + * \param num_const_vars The number of const_vars. + * \param mutable_vars_handle The variables that the current operation will mutate. + * \param num_mutable_vars The number of mutable_vars. + * \param prop_handle Property of the function. + * \param priority Priority of the action, as a hint to the engine. + * \param opr_name The operation name. + * \param wait Whether this is a WaitForVar operation. + */ +MXNET_DLL int MXEnginePushAsync(EngineAsyncFunc async_func, void* func_param, + EngineFuncParamDeleter deleter, ContextHandle ctx_handle, + EngineVarHandle const_vars_handle, int num_const_vars, + EngineVarHandle mutable_vars_handle, int num_mutable_vars, + EngineFnPropertyHandle prop_handle = NULL, int priority = 0, + const char* opr_name = NULL, bool wait = false); + +/*! + * \brief Push a synchronous operation to the engine. + * \param sync_func Execution function that executes the operation. + * \param func_param The parameter set on calling sync_func, can be NULL. + * \param deleter The callback to free func_param, can be NULL. + * \param ctx_handle Execution context. + * \param const_vars_handle The variables that the current operation will use + * but not mutate. + * \param num_const_vars The number of const_vars.
+ * \param mutable_vars_handle The variables that the current operation will mutate. + * \param num_mutable_vars The number of mutable_vars. + * \param prop_handle Property of the function. + * \param priority Priority of the action, as a hint to the engine. + * \param opr_name The operation name. + */ +MXNET_DLL int MXEnginePushSync(EngineSyncFunc sync_func, void* func_param, + EngineFuncParamDeleter deleter, ContextHandle ctx_handle, + EngineVarHandle const_vars_handle, int num_const_vars, + EngineVarHandle mutable_vars_handle, int num_mutable_vars, + EngineFnPropertyHandle prop_handle = NULL, int priority = 0, + const char* opr_name = NULL); #ifdef __cplusplus } diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index ecb8e1c3bc22..4bfd1b36bf89 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -860,7 +860,7 @@ class MCC(EvalMetric): .. note:: - This version of MCC only supports binary classification. + This version of MCC only supports binary classification. See PCC. Parameters ---------- @@ -1476,6 +1476,136 @@ def update(self, labels, preds): self.global_num_inst += 1 +@register +class PCC(EvalMetric): + """PCC is a multiclass equivalent for the Matthews correlation coefficient derived + from a discrete solution to the Pearson correlation coefficient. + + .. math:: + \\text{PCC} = \\frac {\\sum _{k}\\sum _{l}\\sum _{m}C_{kk}C_{lm}-C_{kl}C_{mk}} + {{\\sqrt {\\sum _{k}(\\sum _{l}C_{kl})(\\sum _{k'|k'\\neq k}\\sum _{l'}C_{k'l'})}} + {\\sqrt {\\sum _{k}(\\sum _{l}C_{lk})(\\sum _{k'|k'\\neq k}\\sum _{l'}C_{l'k'})}}} + + defined in terms of a K x K confusion matrix C. + + When there are more than two labels, the PCC will no longer range between -1 and +1. + Instead the minimum value will be between -1 and 0 depending on the true distribution. + The maximum value is always +1. + + Parameters + ---------- + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels.
+ + Examples + -------- + >>> # In this example the network almost always predicts positive + >>> false_positives = 1000 + >>> false_negatives = 1 + >>> true_positives = 10000 + >>> true_negatives = 1 + >>> predicts = [mx.nd.array( + [[.3, .7]]*false_positives + + [[.7, .3]]*true_negatives + + [[.7, .3]]*false_negatives + + [[.3, .7]]*true_positives + )] + >>> labels = [mx.nd.array( + [0]*(false_positives + true_negatives) + + [1]*(false_negatives + true_positives) + )] + >>> f1 = mx.metric.F1() + >>> f1.update(preds = predicts, labels = labels) + >>> pcc = mx.metric.PCC() + >>> pcc.update(preds = predicts, labels = labels) + >>> print f1.get() + ('f1', 0.95233560306652054) + >>> print pcc.get() + ('pcc', 0.01917751877733392) + """ + def __init__(self, name='pcc', + output_names=None, label_names=None, + has_global_stats=True): + self.k = 2 + super(PCC, self).__init__( + name=name, output_names=output_names, label_names=label_names, + has_global_stats=has_global_stats) + + def _grow(self, inc): + self.lcm = numpy.pad( + self.lcm, ((0, inc), (0, inc)), 'constant', constant_values=(0)) + self.gcm = numpy.pad( + self.gcm, ((0, inc), (0, inc)), 'constant', constant_values=(0)) + self.k += inc + + def _calc_mcc(self, cmat): + n = cmat.sum() + x = cmat.sum(axis=1) + y = cmat.sum(axis=0) + cov_xx = numpy.sum(x * (n - x)) + cov_yy = numpy.sum(y * (n - y)) + if cov_xx == 0 or cov_yy == 0: + return float('nan') + i = cmat.diagonal() + cov_xy = numpy.sum(i * n - x * y) + return cov_xy / (cov_xx * cov_yy) ** 0.5 + + def update(self, labels, preds): + """Updates the internal evaluation result. + + Parameters + ---------- + labels : list of `NDArray` + The labels of the data. + + preds : list of `NDArray` + Predicted values. + """ + labels, preds = check_label_shapes(labels, preds, True) + + # update the confusion matrix + for label, pred in zip(labels, preds): + label = label.astype('int32', copy=False).asnumpy() + pred = pred.asnumpy().argmax(axis=1) + n = max(pred.max(), label.max()) + if n >= self.k: + self._grow(n + 1 - self.k) + bcm = numpy.zeros((self.k, self.k)) + for i, j in zip(pred, label): + bcm[i, j] += 1 + self.lcm += bcm + self.gcm += bcm + + self.num_inst += 1 + self.global_num_inst += 1 + + @property + def sum_metric(self): + return self._calc_mcc(self.lcm) * self.num_inst + + @property + def global_sum_metric(self): + return self._calc_mcc(self.gcm) * self.global_num_inst + + def reset(self): + """Resets the internal evaluation result to initial state.""" + self.global_num_inst = 0. + self.gcm = numpy.zeros((self.k, self.k)) + self.reset_local() + + def reset_local(self): + """Resets the local portion of the internal evaluation results + to initial state.""" + self.num_inst = 0. + self.lcm = numpy.zeros((self.k, self.k)) + + @register class Loss(EvalMetric): """Dummy metric for directly printing loss. diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index 0c0a0a1e3c88..91d4ca16df07 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -47,7 +47,7 @@ from ._internal import SymbolBase, _set_symbol_class __all__ = ["Symbol", "var", "Variable", "Group", "load", "load_json", - "pow", "maximum", "minimum", "hypot", "eye", "zeros", "ones", "full", "arange", + "pow", "power", "maximum", "minimum", "hypot", "eye", "zeros", "ones", "full", "arange", "histogram", "split_v2"] @@ -2740,6 +2740,8 @@ def pow(base, exp): Both inputs can be Symbol or scalar number. Broadcasting is not supported. Use `broadcast_pow` instead. 
+ `sym.pow` is being deprecated, please use `sym.power` instead. + Parameters --------- base : Symbol or scalar @@ -2780,6 +2782,43 @@ def pow(base, exp): raise TypeError('types (%s, %s) not supported' % (str(type(base)), str(type(exp)))) +def power(base, exp): + """Returns element-wise result of base element raised to powers from exp element. + + Both inputs can be Symbol or scalar number. + Broadcasting is not supported. Use `broadcast_pow` instead. + + Parameters + --------- + base : Symbol or scalar + The base symbol + exp : Symbol or scalar + The exponent symbol + + Returns + ------- + Symbol or scalar + The bases in x raised to the exponents in y. + + Examples + -------- + >>> mx.sym.power(2, 3) + 8 + >>> x = mx.sym.Variable('x') + >>> y = mx.sym.Variable('y') + >>> z = mx.sym.power(x, 2) + >>> z.eval(x=mx.nd.array([1,2]))[0].asnumpy() + array([ 1., 4.], dtype=float32) + >>> z = mx.sym.power(3, y) + >>> z.eval(y=mx.nd.array([2,3]))[0].asnumpy() + array([ 9., 27.], dtype=float32) + >>> z = mx.sym.power(x, y) + >>> z.eval(x=mx.nd.array([3,4]), y=mx.nd.array([2,3]))[0].asnumpy() + array([ 9., 64.], dtype=float32) + """ + return pow(base, exp) + + # pylint: disable=no-member # pylint: disable=redefined-builtin def maximum(left, right): diff --git a/scala-package/assembly/src/main/assembly/assembly.xml b/scala-package/assembly/src/main/assembly/assembly.xml index 3a8e11b436d1..5a931360645c 100644 --- a/scala-package/assembly/src/main/assembly/assembly.xml +++ b/scala-package/assembly/src/main/assembly/assembly.xml @@ -33,8 +33,8 @@ org.slf4j:slf4j-api args4j:args4j - / - true + . + false true runtime @@ -71,7 +71,7 @@ cub/LICENSE.TXT mkldnn/external/mklml_mac_2019.0.1.20180928/license.txt - / + . diff --git a/scala-package/assembly/src/main/assembly/javadoc.xml b/scala-package/assembly/src/main/assembly/javadoc.xml index 176fa432190c..c6df96a3f5a5 100644 --- a/scala-package/assembly/src/main/assembly/javadoc.xml +++ b/scala-package/assembly/src/main/assembly/javadoc.xml @@ -25,7 +25,7 @@ ${rootdir}/core/target/site/scaladocs - / + . diff --git a/scala-package/assembly/src/main/assembly/source.xml b/scala-package/assembly/src/main/assembly/source.xml index c06786130192..1f004e811cfc 100644 --- a/scala-package/assembly/src/main/assembly/source.xml +++ b/scala-package/assembly/src/main/assembly/source.xml @@ -29,7 +29,7 @@ **\/*.scala - / + . diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml index 5a28a11a0208..b888b5549e7a 100644 --- a/scala-package/core/pom.xml +++ b/scala-package/core/pom.xml @@ -100,10 +100,6 @@ - - org.apache.maven.plugins - maven-compiler-plugin - org.apache.maven.plugins maven-clean-plugin @@ -141,6 +137,7 @@ + org.apache.mxnet diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala b/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala index 2ed9d8cfbb84..2b1765531824 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala @@ -180,6 +180,7 @@ class FeedForward private( // Initialize the predictor module for running prediction. private def initPredictor(inputShapes: Map[String, Shape]): Unit = { + var shouldInit = true if (this.predExec != null) { val (argShapes, _, _) = symbol.inferShape(inputShapes) require(argShapes != null, "Shape inference failed." 
+ @@ -187,14 +188,16 @@ class FeedForward private( s"and aux states ${symbol.listAuxiliaryStates()}") val predShapes = this.predExec.argArrays.map(_.shape) if (argShapes.sameElements(predShapes)) { - return + shouldInit = false } } - // for now only use the first device - val predExec = symbol.simpleBind(ctx(0), gradReq = "null", shapeDict = inputShapes) - predExec.copyParamsFrom(_argParams, _auxParams) - ExecutorManager.checkArguments(symbol) - this.predExec = predExec + if(shouldInit) { + // for now only use the first device + val predExec = symbol.simpleBind(ctx(0), gradReq = "null", shapeDict = inputShapes) + predExec.copyParamsFrom(_argParams, _auxParams) + ExecutorManager.checkArguments(symbol) + this.predExec = predExec + } } // Initialize the iterator given input. diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/module/BucketingModule.scala b/scala-package/core/src/main/scala/org/apache/mxnet/module/BucketingModule.scala index 1ac798e1b617..41a6f69394d2 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/module/BucketingModule.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/module/BucketingModule.scala @@ -173,14 +173,13 @@ class BucketingModule(symGen: AnyRef => (Symbol, IndexedSeq[String], IndexedSeq[ allowMissing: Boolean = false, forceInit: Boolean = false, allowExtra: Boolean = false): Unit = { - if (paramsInitialized && !forceInit) { - return + if (!paramsInitialized || forceInit) { + require(binded, "call bind before initializing the parameters") + this._currModule.initParams(initializer, argParams, auxParams, + allowMissing, forceInit, allowExtra) + this.paramsDirty = false + this.paramsInitialized = true } - require(binded, "call bind before initializing the parameters") - this._currModule.initParams(initializer, argParams, auxParams, - allowMissing, forceInit, allowExtra) - this.paramsDirty = false - this.paramsInitialized = true } /** @@ -218,28 +217,27 @@ class BucketingModule(symGen: AnyRef => (Symbol, IndexedSeq[String], IndexedSeq[ if (this.binded) { logger.warn("Already bound, ignoring bind()") - return - } + } else { + require(sharedModule.isEmpty, + "sharedModule for BucketingModule is not supported") - require(sharedModule.isEmpty, - "sharedModule for BucketingModule is not supported") - - this.forTraining = forTraining - this.inputsNeedGrad = inputsNeedGrad - this.binded = true - - val (sym, dNames, lNames) = this.symGen(this.defaultBucketKey) - val module = new Module(sym, dNames, lNames, this.contexts, - this.workLoadList, this.fixedParamNames) - module.bind(dataShapes, labelShapes, forTraining, inputsNeedGrad, - forceRebind = false, sharedModule = None, gradReq) - this._currModule = module - this._currBucketKey = this.defaultBucketKey - this._buckets(this.defaultBucketKey) = module - - // copy back saved params, if already initialized - if (this.paramsInitialized) { - this.setParams(argParams, auxParams) + this.forTraining = forTraining + this.inputsNeedGrad = inputsNeedGrad + this.binded = true + + val (sym, dNames, lNames) = this.symGen(this.defaultBucketKey) + val module = new Module(sym, dNames, lNames, this.contexts, + this.workLoadList, this.fixedParamNames) + module.bind(dataShapes, labelShapes, forTraining, inputsNeedGrad, + forceRebind = false, sharedModule = None, gradReq) + this._currModule = module + this._currBucketKey = this.defaultBucketKey + this._buckets(this.defaultBucketKey) = module + + // copy back saved params, if already initialized + if (this.paramsInitialized) { + this.setParams(argParams, 
auxParams) + } } } diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/module/Module.scala b/scala-package/core/src/main/scala/org/apache/mxnet/module/Module.scala index 97df3dcb307d..3255d9346b80 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/module/Module.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/module/Module.scala @@ -121,36 +121,35 @@ class Module(symbolVar: Symbol, allowMissing: Boolean = false, forceInit: Boolean = false, allowExtra: Boolean = false): Unit = { - if (paramsInitialized && !forceInit) { - return - } - require(binded, "call bind before initializing the parameters") + if (!paramsInitialized || forceInit) { + require(binded, "call bind before initializing the parameters") - if (this.argParams == null) { - val paramArrays = - execGroup.paramArrays.map(nds => NDArray.zeros(nds(0).shape, dtype = nds(0).dtype)) - this.argParams = this.paramNames.zip(paramArrays).toMap - } + if (this.argParams == null) { + val paramArrays = + execGroup.paramArrays.map(nds => NDArray.zeros(nds(0).shape, dtype = nds(0).dtype)) + this.argParams = this.paramNames.zip(paramArrays).toMap + } - if (this.auxParams == null) { - val auxArrays = - execGroup.auxArrays.map(nds => NDArray.zeros(nds(0).shape, dtype = nds(0).dtype)) - this.auxParams = this.auxNames.zip(auxArrays).toMap - } + if (this.auxParams == null) { + val auxArrays = + execGroup.auxArrays.map(nds => NDArray.zeros(nds(0).shape, dtype = nds(0).dtype)) + this.auxParams = this.auxNames.zip(auxArrays).toMap + } - this.argParams.foreach { case (name, arr) => - impl(name, arr, allowMissing, Option(initializer), argParams) - } + this.argParams.foreach { case (name, arr) => + impl(name, arr, allowMissing, Option(initializer), argParams) + } - this.auxParams.foreach { case (name, arr) => - impl(name, arr, allowMissing, Option(initializer), auxParams) - } + this.auxParams.foreach { case (name, arr) => + impl(name, arr, allowMissing, Option(initializer), auxParams) + } - this.paramsInitialized = true - this.paramsDirty = false + this.paramsInitialized = true + this.paramsDirty = false - // copy the initialized parameters to devices - this.execGroup.setParams(this.argParams, this.auxParams, allowExtra = allowExtra) + // copy the initialized parameters to devices + this.execGroup.setParams(this.argParams, this.auxParams, allowExtra = allowExtra) + } } // Internal helper for parameter initialization @@ -246,64 +245,64 @@ class Module(symbolVar: Symbol, if (binded) { logger.warn("Already binded, ignoring bind()") - return - } + } else { + this.forTraining = forTraining + this.inputsNeedGrad = inputsNeedGrad + this.binded = true - this.forTraining = forTraining - this.inputsNeedGrad = inputsNeedGrad - this.binded = true + if (!forTraining) { + require(!inputsNeedGrad, "Invalid inputsNeedGrad (cannot be true if not forTraining)") + } else { + // this is not True, as some module might not contains a loss function + // that consumes the labels + // require(labelShapes != None) + } - if (!forTraining) { - require(!inputsNeedGrad, "Invalid inputsNeedGrad (cannot be true if not forTraining)") - } else { - // this is not True, as some module might not contains a loss function - // that consumes the labels - // require(labelShapes != None) - } + this.dataShapesVar = dataShapes + this.labelShapesVar = labelShapes - this.dataShapesVar = dataShapes - this.labelShapesVar = labelShapes - - val sharedGroup = - sharedModule.map(sharedModuleInst => { - require(sharedModuleInst.binded && 
sharedModuleInst.paramsInitialized, - s"bind() and initParams() must be called first on shared module.") - sharedModuleInst.execGroup - }) - - val inputTypes = this.dataShapesVar.map(dataDesc => (dataDesc.name, dataDesc.dtype)).toMap ++ - labelShapes.map(shapes => shapes.map(dataDesc => (dataDesc.name, dataDesc.dtype)).toMap) - .getOrElse(Map.empty[String, DType]) - - execGroup = new Builder(symbol, contexts, paramNames) - .setWorkLoadList(workLoads) - .setDataShapes(dataShapes) - .setLabelShapes(labelShapes.orNull) - .setForTraining(forTraining) - .setInputsNeedGrad(inputsNeedGrad) - .setSharedGroup(sharedGroup.orNull) - .setFixedParamNames(fixedParamNames.orNull) - .setGradReq(gradReq) - .setInputTypes(inputTypes) - .build() - - if (sharedModule.isDefined) { - paramsInitialized = true - argParams = sharedModule.get.argParams - auxParams = sharedModule.get.auxParams - } else if (paramsInitialized) { - // if the parameters are already initialized, we are re-binding - // so automatically copy the already initialized params - execGroup.setParams(argParams, auxParams) - } + val sharedGroup = + sharedModule.map(sharedModuleInst => { + require(sharedModuleInst.binded && sharedModuleInst.paramsInitialized, + s"bind() and initParams() must be called first on shared module.") + sharedModuleInst.execGroup + }) - sharedModule.foreach { - case sharedModuleInst: Module => - if (sharedModuleInst.optimizerInitialized) { - borrowOptimizer(sharedModuleInst) - } - case _ => + val inputTypes = this.dataShapesVar.map(dataDesc => (dataDesc.name, dataDesc.dtype)).toMap ++ + labelShapes.map(shapes => shapes.map(dataDesc => (dataDesc.name, dataDesc.dtype)).toMap) + .getOrElse(Map.empty[String, DType]) + + execGroup = new Builder(symbol, contexts, paramNames) + .setWorkLoadList(workLoads) + .setDataShapes(dataShapes) + .setLabelShapes(labelShapes.orNull) + .setForTraining(forTraining) + .setInputsNeedGrad(inputsNeedGrad) + .setSharedGroup(sharedGroup.orNull) + .setFixedParamNames(fixedParamNames.orNull) + .setGradReq(gradReq) + .setInputTypes(inputTypes) + .build() + + if (sharedModule.isDefined) { + paramsInitialized = true + argParams = sharedModule.get.argParams + auxParams = sharedModule.get.auxParams + } else if (paramsInitialized) { + // if the parameters are already initialized, we are re-binding + // so automatically copy the already initialized params + execGroup.setParams(argParams, auxParams) + } + + sharedModule.foreach { + case sharedModuleInst: Module => + if (sharedModuleInst.optimizerInitialized) { + borrowOptimizer(sharedModuleInst) + } + case _ => + } } + } /** diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/module/SequentialModule.scala b/scala-package/core/src/main/scala/org/apache/mxnet/module/SequentialModule.scala index 2e506c08e548..3c3eeb97f201 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/module/SequentialModule.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/module/SequentialModule.scala @@ -154,38 +154,37 @@ class SequentialModule extends BaseModule { allowMissing: Boolean = false, forceInit: Boolean = false, allowExtra: Boolean = false): Unit = { - if (this.paramsInitialized && !forceInit) { - return - } - require(this.binded, "call bind before initializing the parameters") + if (!this.paramsInitialized || forceInit) { + require(this.binded, "call bind before initializing the parameters") - for (module <- this.modules) { - module.initParams(initializer = initializer, argParams = argParams, - auxParams = auxParams, allowMissing = 
allowMissing, - forceInit = forceInit, allowExtra = allowExtra) - } + for (module <- this.modules) { + module.initParams(initializer = initializer, argParams = argParams, + auxParams = auxParams, allowMissing = allowMissing, + forceInit = forceInit, allowExtra = allowExtra) + } - // Internal function to help checking duplicated names, - // make sure we do not have duplicated parameter names. - def checkName(knownNames: scala.collection.mutable.Map[String, Int], - newNames: Array[String], modules: ArrayBuffer[BaseModule], i: Int): Unit = { - for (name <- newNames) { - require(!knownNames.contains(name), s"Duplicated parameter names: " + - s"name $name in layer $i (${modules(i).getClass.getName}) is already " + - s"used in layer ${knownNames("name")}" + - s"(${modules(knownNames("name")).getClass.getName})") - knownNames(name) = i + // Internal function to help checking duplicated names, + // make sure we do not have duplicated parameter names. + def checkName(knownNames: scala.collection.mutable.Map[String, Int], + newNames: Array[String], modules: ArrayBuffer[BaseModule], i: Int): Unit = { + for (name <- newNames) { + require(!knownNames.contains(name), s"Duplicated parameter names: " + + s"name $name in layer $i (${modules(i).getClass.getName}) is already " + + s"used in layer ${knownNames(name)}" + + s"(${modules(knownNames(name)).getClass.getName})") + knownNames(name) = i + } } - } - val argNames = scala.collection.mutable.Map[String, Int]() - val auxNames = scala.collection.mutable.Map[String, Int]() - for ((module, iLayer) <- this.modules.zipWithIndex) { - val (argParams, auxParams) = module.getParams - checkName(argNames, argParams.keys.toArray, this.modules, iLayer) - checkName(auxNames, auxParams.keys.toArray, this.modules, iLayer) + val argNames = scala.collection.mutable.Map[String, Int]() + val auxNames = scala.collection.mutable.Map[String, Int]() + for ((module, iLayer) <- this.modules.zipWithIndex) { + val (argParams, auxParams) = module.getParams + checkName(argNames, argParams.keys.toArray, this.modules, iLayer) + checkName(auxNames, auxParams.keys.toArray, this.modules, iLayer) + } + this.paramsInitialized = true } - this.paramsInitialized = true } /** @@ -216,54 +215,54 @@ class SequentialModule extends BaseModule { gradReq: String = "write"): Unit = { if (this.binded && !forceRebind) { logger.warn(s"Already binded, ignoring bind()") - return - } - - if (inputsNeedGrad) { - require(forTraining, "inputsNeedGrad can be set only for training") - } - - require(sharedModule == None, "Shared module is not supported") - require(this.modules.length > 0, "Attempting to bind an empty SequentialModule") - - this.forTraining = forTraining - this.inputsNeedGrad = inputsNeedGrad - this.binded = true - - // the same label shapes are used for all chained modules - this.labelShapesVar = labelShapes + } else { + if (inputsNeedGrad) { + require(forTraining, "inputsNeedGrad can be set only for training") + } - var myDataShapes = dataShapes - var myLabelShapes = labelShapes - var anybodyEverNeedsLabel = false - for ((module, iLayer) <- this.modules.zipWithIndex) { - val meta = this.metas(iLayer) - if (meta.contains(META_TAKE_LABELS) && meta(META_TAKE_LABELS)) { - myLabelShapes = labelShapes - anybodyEverNeedsLabel = true - } else myLabelShapes = None - - val myInputsNeedGrad = if (inputsNeedGrad || (forTraining && iLayer > 0)) true else false - if (meta.contains(META_AUTO_WIRING) && meta(META_AUTO_WIRING)) { - val dataNames = module.dataNames - require(dataNames.length ==
myDataShapes.length, - s"dataNmes $dataNames and dataShapes $myDataShapes do not match") - myDataShapes = dataNames.zip(myDataShapes).map { case (newName, dataDes) => - DataDesc(newName, dataDes.shape) + require(sharedModule == None, "Shared module is not supported") + require(this.modules.length > 0, "Attempting to bind an empty SequentialModule") + + this.forTraining = forTraining + this.inputsNeedGrad = inputsNeedGrad + this.binded = true + + // the same label shapes are used for all chained modules + this.labelShapesVar = labelShapes + + var myDataShapes = dataShapes + var myLabelShapes = labelShapes + var anybodyEverNeedsLabel = false + for ((module, iLayer) <- this.modules.zipWithIndex) { + val meta = this.metas(iLayer) + if (meta.contains(META_TAKE_LABELS) && meta(META_TAKE_LABELS)) { + myLabelShapes = labelShapes + anybodyEverNeedsLabel = true + } else myLabelShapes = None + + val myInputsNeedGrad = if (inputsNeedGrad || (forTraining && iLayer > 0)) true else false + if (meta.contains(META_AUTO_WIRING) && meta(META_AUTO_WIRING)) { + val dataNames = module.dataNames + require(dataNames.length == myDataShapes.length, + s"dataNames $dataNames and dataShapes $myDataShapes do not match") + myDataShapes = dataNames.zip(myDataShapes).map { case (newName, dataDes) => + DataDesc(newName, dataDes.shape) + } } - } - module.bind(myDataShapes, myLabelShapes, forTraining, myInputsNeedGrad, + module.bind(myDataShapes, myLabelShapes, forTraining, myInputsNeedGrad, forceRebind, sharedModule = None, gradReq) - // the output of the previous module is the data of the next module - myDataShapes = module.outputShapes.map{case (name, shape) => DataDesc(name, shape)} - } + // the output of the previous module is the data of the next module + myDataShapes = module.outputShapes.map{case (name, shape) => DataDesc(name, shape)} + } - if (!anybodyEverNeedsLabel) { - // then I do not need label either - this.labelShapesVar = None + if (!anybodyEverNeedsLabel) { + // then I do not need label either + this.labelShapesVar = None + } } + } /** diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/ImageSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/ImageSuite.scala index d4cf35af186f..dca4ce02ef89 100644 --- a/scala-package/core/src/test/scala/org/apache/mxnet/ImageSuite.scala +++ b/scala-package/core/src/test/scala/org/apache/mxnet/ImageSuite.scala @@ -57,7 +57,7 @@ class ImageSuite extends FunSuite with BeforeAndAfterAll { test("Test load image") { val nd = Image.imRead(imLocation) - logger.info(s"OpenCV load image with shape: ${nd.shape}") + logger.debug(s"OpenCV load image with shape: ${nd.shape}") require(nd.shape == Shape(576, 1024, 3), "image shape not Match!") } @@ -65,14 +65,14 @@ val url = new URL("https://s3.amazonaws.com/model-server/inputs/Pug-Cookie.jpg") val inputStream = url.openStream val nd = Image.imDecode(inputStream) - logger.info(s"OpenCV load image with shape: ${nd.shape}") + logger.debug(s"OpenCV load image with shape: ${nd.shape}") require(nd.shape == Shape(576, 1024, 3), "image shape not Match!") } test("Test resize image") { val nd = Image.imRead(imLocation) val resizeIm = Image.imResize(nd, 224, 224) - logger.info(s"OpenCV resize image with shape: ${resizeIm.shape}") + logger.debug(s"OpenCV resize image with shape: ${resizeIm.shape}") require(resizeIm.shape == Shape(224, 224, 3), "image shape not Match!") } @@ -94,7 +94,7 @@ val tempDirPath =
System.getProperty("java.io.tmpdir") val img = Image.toImage(resizeIm) ImageIO.write(img, "png", new File(tempDirPath + "/inputImages/out.png")) - logger.info(s"converted image stored in ${tempDirPath + "/inputImages/out.png"}") + logger.debug(s"converted image stored in ${tempDirPath + "/inputImages/out.png"}") } test("Test draw Bounding box") { @@ -107,7 +107,7 @@ class ImageSuite extends FunSuite with BeforeAndAfterAll { Image.drawBoundingBox(buf, box, Some(names), fontSizeMult = Some(1.4f)) val tempDirPath = System.getProperty("java.io.tmpdir") ImageIO.write(buf, "png", new File(tempDirPath + "/inputImages/out2.png")) - logger.info(s"converted image stored in ${tempDirPath + "/inputImages/out2.png"}") + logger.debug(s"converted image stored in ${tempDirPath + "/inputImages/out2.png"}") for (coord <- box) { val topLeft = buf.getRGB(coord("xmin"), coord("ymin")) val downLeft = buf.getRGB(coord("xmin"), coord("ymax")) diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/KVStoreSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/KVStoreSuite.scala index 2e1b36c5b162..3eb61414bac1 100644 --- a/scala-package/core/src/test/scala/org/apache/mxnet/KVStoreSuite.scala +++ b/scala-package/core/src/test/scala/org/apache/mxnet/KVStoreSuite.scala @@ -67,9 +67,6 @@ class KVStoreSuite extends FunSuite with BeforeAndAfterAll { val kv = KVStore.create() val updater = new MXKVStoreUpdater { override def update(key: Int, input: NDArray, stored: NDArray): Unit = { - // scalastyle:off println - println(s"update on key $key") - // scalastyle:on println stored += input * 2 } override def dispose(): Unit = {} diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/ModelParallelSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/ModelParallelSuite.scala index 8ed1dc4c2332..2962e3b4781c 100644 --- a/scala-package/core/src/test/scala/org/apache/mxnet/ModelParallelSuite.scala +++ b/scala-package/core/src/test/scala/org/apache/mxnet/ModelParallelSuite.scala @@ -58,11 +58,6 @@ class ModelParallelSuite extends FunSuite with BeforeAndAfterAll { val arrGrad2 = arrGrad.map(_.copyTo(ctx1)) val exec2 = net.bind(ctx1, args = arr2, argsGrad = arrGrad2) - // Show the execution plan that involves copynode - // scalastyle:off println - print(exec1.debugStr) - // scalastyle:on println - exec1.forward() exec2.forward() assert(reldiff(exec1.outputs(0).copyTo(ctx1), diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/OperatorSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/OperatorSuite.scala index dd5f96f980a6..deb149985ce8 100644 --- a/scala-package/core/src/test/scala/org/apache/mxnet/OperatorSuite.scala +++ b/scala-package/core/src/test/scala/org/apache/mxnet/OperatorSuite.scala @@ -614,9 +614,6 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll val embed = Symbol.Embedding(name = "embed")()( Map("data" -> data, "input_dim" -> inDim, "output_dim" -> outDim)) // TODO - // scalastyle:off println - println(s"Embeded symbol: ${embed.toJson}") - // scalastyle:on println } // check ops handle duplicate input correctly. 
@@ -983,9 +980,6 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll test("batch norm") { val data = Symbol.Variable("data") val test = Symbol.BatchNorm(name = "bn")()(Map("data" -> data, "fix_gamma" -> "False")) - // scalastyle:off println - println(s"BatchNorm: ${test.toJson}") - // scalastyle:on println // TODO: check numeric gradient } diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala index d134c83ff7e7..415be5122c95 100644 --- a/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala +++ b/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala @@ -32,14 +32,8 @@ class SymbolSuite extends FunSuite with BeforeAndAfterAll { var net2 = Symbol.FullyConnected(name = "fc3")()(Map("num_hidden" -> 10)) net2 = Symbol.Activation()()(Map("data" -> net2, "act_type" -> "relu")) net2 = Symbol.FullyConnected(name = "fc4")()(Map("data" -> net2, "num_hidden" -> 20)) - // scalastyle:off println - println(s"net2 debug info:\n${net2.debugStr}") - // scalastyle:on println val composed = net2(name = "composed", Map("fc3_data" -> net1)) - // scalastyle:off println - println(s"composed debug info:\n${composed.debugStr}") - // scalastyle:on println val multiOut = Symbol.Group(composed, net1) assert(multiOut.listOutputs().length === 2) } @@ -77,10 +71,6 @@ class SymbolSuite extends FunSuite with BeforeAndAfterAll { val lam = Symbol.Variable("lam") val rnd = Symbol.random.poisson(lam = Some(lam), shape = Some(Shape(2, 2))) val rnd2 = Symbol.random.poisson(lam = Some(1f), shape = Some(Shape(2, 2))) - // scalastyle:off println - println(s"Symbol.random.poisson debug info: ${rnd.debugStr}") - println(s"Symbol.random.poisson debug info: ${rnd2.debugStr}") - // scalastyle:on println } test("Symbol random module is generated properly - special case of 'normal'") { @@ -88,9 +78,5 @@ class SymbolSuite extends FunSuite with BeforeAndAfterAll { val scale = Symbol.Variable("scale") val rnd = Symbol.random.normal(mu = Some(loc), sigma = Some(scale), shape = Some(Shape(2, 2))) val rnd2 = Symbol.random.normal(mu = Some(1f), sigma = Some(2f), shape = Some(Shape(2, 2))) - // scalastyle:off println - println(s"Symbol.random.sample_normal debug info: ${rnd.debugStr}") - println(s"Symbol.random.random_normal debug info: ${rnd2.debugStr}") - // scalastyle:on println } } diff --git a/scala-package/deploy/pom.xml b/scala-package/deploy/pom.xml index 74b57077773c..b4dd273719bb 100644 --- a/scala-package/deploy/pom.xml +++ b/scala-package/deploy/pom.xml @@ -150,6 +150,9 @@ + + true + diff --git a/scala-package/deploy/src/main/deploy/deploy.xml b/scala-package/deploy/src/main/deploy/deploy.xml index 7265e7de8809..f5b624885cbb 100644 --- a/scala-package/deploy/src/main/deploy/deploy.xml +++ b/scala-package/deploy/src/main/deploy/deploy.xml @@ -38,7 +38,7 @@ org.scala-lang.modules scala-parser-combinators_2.11 - 1.0.4 + 1.0.5 org.scala-lang diff --git a/scala-package/examples/pom.xml b/scala-package/examples/pom.xml index d60782ffd06b..257529199176 100644 --- a/scala-package/examples/pom.xml +++ b/scala-package/examples/pom.xml @@ -82,10 +82,6 @@ org.apache.maven.plugins maven-jar-plugin - - org.apache.maven.plugins - maven-compiler-plugin - net.alchim31.maven scala-maven-plugin diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/BucketIo.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/BucketIo.scala index 6d414bb0328a..350e28cf8634 100644 --- 
a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/BucketIo.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/BucketIo.scala @@ -202,10 +202,10 @@ object BucketIo { labelBuf.set(labels.flatten) iBucket += 1 - val batchProvideData = { val tmp = ListMap("data" -> dataBuf.shape) - tmp ++ initStates.map(x => x._1 -> Shape(x._2._1, x._2._2)) - } - val batchProvideLabel = ListMap("softmax_label" -> labelBuf.shape) + val batchProvideData = IndexedSeq(DataDesc("data", dataBuf.shape, dataBuf.dtype)) ++ + initStates.map { + case (name, shape) => DataDesc(name, Shape(shape._1, shape._2), DType.Float32)} + val batchProvideLabel = IndexedSeq(DataDesc("softmax_label", labelBuf.shape, labelBuf.dtype)) val initStateArrays = initStates.map(x => NDArray.zeros(x._2._1, x._2._2)) new DataBatch(IndexedSeq(dataBuf.copy()) ++ initStateArrays, IndexedSeq(labelBuf.copy()), diff --git a/scala-package/infer/pom.xml b/scala-package/infer/pom.xml index ed90d8073675..81e93932e83f 100644 --- a/scala-package/infer/pom.xml +++ b/scala-package/infer/pom.xml @@ -46,10 +46,6 @@ - - org.apache.maven.plugins - maven-compiler-plugin - org.scalatest scalatest-maven-plugin diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Classifier.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Classifier.scala index bf6581588114..38fdc0028a7a 100644 --- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Classifier.scala +++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Classifier.scala @@ -31,6 +31,7 @@ trait ClassifierBase { /** * Takes an array of floats and returns corresponding (Label, Score) tuples + * @tparam T The Scala equivalent of the DType used for the input array and return value * @param input Indexed sequence one-dimensional array of floats/doubles * @param topK (Optional) How many result (sorting based on the last axis) * elements to return. Default returns unsorted output. @@ -167,6 +168,12 @@ class Classifier(modelPathPrefix: String, result.toIndexedSeq } + /** + * Gives the path to the standard location of the synset.txt file + * @throws IllegalArgumentException Thrown when the file does not exist + * @param modelPathPrefix The path to the model directory + * @return The path to the synset.txt file + */ private[infer] def getSynsetFilePath(modelPathPrefix: String): String = { val dirPath = modelPathPrefix.substring(0, 1 + modelPathPrefix.lastIndexOf(File.separator)) val d = new File(dirPath) @@ -179,6 +186,11 @@ class Classifier(modelPathPrefix: String, s.getCanonicalPath } + /** + * Parses the labels from a synset file + * @param synsetFilePath The path to the synset file. 
Can be obtained from getSynsetFilePath + * @return An IndexedSeq of each line in the file + */ private[infer] def readSynsetFile(synsetFilePath: String): IndexedSeq[String] = { val f = io.Source.fromFile(synsetFilePath) try { @@ -188,6 +200,11 @@ } } + /** + * Creates a predictor with the same modelPath, inputDescriptors, contexts, + * and epoch as the classifier + * @return The new Predictor + */ private[infer] def getPredictor(): PredictBase = { new Predictor(modelPathPrefix, inputDescriptors, contexts, epoch) } diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ImageClassifier.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ImageClassifier.scala index 99c0432d79f2..fb5f39fb2096 100644 --- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ImageClassifier.scala +++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ImageClassifier.scala @@ -66,6 +66,10 @@ class ImageClassifier(modelPathPrefix: String, protected[infer] val height = inputShape(inputLayout.indexOf('H')) protected[infer] val width = inputShape(inputLayout.indexOf('W')) + /** + * Get the names and shapes that would be returned by a classify call + * @return a list of (name, shape) tuples + */ def outputShapes: IndexedSeq[(String, Shape)] = predictor.outputShapes /** @@ -127,6 +131,19 @@ result } + /** + * Creates a Classifier + * + * @param modelPathPrefix Path prefix from where to load the model artifacts. + * These include the symbol, parameters, and synset.txt. + * Example: file://model-dir/resnet-152 (containing + * resnet-152-symbol.json, resnet-152-0000.params, and synset.txt). + * @param inputDescriptors Descriptors defining the input node names, shape, + * layout and type parameters + * @param contexts Device contexts on which you want to run inference; defaults to CPU + * @param epoch Model epoch to load; defaults to 0 + * @return A Classifier to perform inference with + */ private[infer] def getClassifier(modelPathPrefix: String, inputDescriptors: IndexedSeq[DataDesc], contexts: Array[Context] = Context.cpu(), @@ -156,19 +173,16 @@ object ImageClassifier { /** * Convert input BufferedImage to NDArray of input shape - * - *

* Note: Caller is responsible to dispose the NDArray * returned by this method after the use. - *

- @param resizedImage BufferedImage to get pixels from * - * @param inputImageShape Input shape; for example for resnet it is (3,224,224). - Should be same as inputDescriptor shape. - * @param dType The DataType of the NDArray created from the image - * that should be returned. - * Currently it defaults to Dtype.Float32 - * @return NDArray pixels array with shape (3, 224, 224) in CHW format + * @param resizedImage BufferedImage to get pixels from + * @param inputImageShape Input shape; for example for resnet it is (3,224,224). + * Should be same as inputDescriptor shape. + * @param dType The DataType of the NDArray created from the image + * that should be returned. + * Currently it defaults to DType.Float32 + * @return NDArray pixels array with shape (3, 224, 224) in CHW format */ def bufferedImageToPixels(resizedImage: BufferedImage, inputImageShape: Shape, dType : DType = DType.Float32): NDArray = { @@ -235,4 +249,4 @@ def loadInputBatch(inputImagePaths: List[String]): Traversable[BufferedImage] = { inputImagePaths.map(path => ImageIO.read(new File(path))) } -} \ No newline at end of file +} diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/MXNetHandler.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/MXNetHandler.scala index d2bed3aa9d80..593bab66bf12 100644 --- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/MXNetHandler.scala +++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/MXNetHandler.scala @@ -23,6 +23,12 @@ import org.slf4j.LoggerFactory private[infer] trait MXNetHandler { + /** + * Executes a function within a thread-safe executor + * @param f The function to execute + * @tparam T The return type of the function + * @return Returns the result of the function f + */ def execute[T](f: => T): T val executor: ExecutorService @@ -31,7 +37,11 @@ private[infer] object MXNetHandlerType extends Enumeration { + /** + * The internal type of the MXNetHandlerType enumeration + */ type MXNetHandlerType = Value + val SingleThreadHandler = Value("MXNetSingleThreadHandler") val OneThreadPerModelHandler = Value("MXNetOneThreadPerModelHandler") } @@ -93,6 +103,10 @@ private[infer] object MXNetSingleThreadHandler extends MXNetThreadPoolHandler(1) private[infer] object MXNetHandler { + /** + * Creates a handler based on the handlerType + * @return A ThreadPool or Thread Handler + */ def apply(): MXNetHandler = { if (handlerType == MXNetHandlerType.OneThreadPerModelHandler) { new MXNetThreadPoolHandler(1) diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ObjectDetector.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ObjectDetector.scala index 28a578cae79f..b78cfbccd987 100644 --- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ObjectDetector.scala +++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ObjectDetector.scala @@ -111,6 +111,13 @@ class ObjectDetector(modelPathPrefix: String, batchResult.toIndexedSeq } + /** + * Formats detection results by sorting in descending order of accuracy (topK only) + * and combining with synset labels + * @param predictResultND The results from the objectDetect call + * @param topK The number of top results to return or None for all + * @return The top predicted results as (className, [Accuracy, Xmin, Ymin, Xmax, Ymax]) + */ private[infer] def sortAndReformat(predictResultND: NDArray, topK: Option[Int]) : IndexedSeq[(String, Array[Float])] = { // iterating over the all the 
predictions @@ -170,6 +177,18 @@ class ObjectDetector(modelPathPrefix: String, result } + /** + * Creates an image classifier from the object detector model + * @param modelPathPrefix Path prefix from where to load the model artifacts. + * These include the symbol, parameters, and synset.txt. + * Example: file://model-dir/resnet-152 (containing + * resnet-152-symbol.json, resnet-152-0000.params, and synset.txt). + * @param inputDescriptors Descriptors defining the input node names, shape, + * layout and type parameters + * @param contexts Device contexts on which you want to run inference; defaults to CPU + * @param epoch Model epoch to load; defaults to 0 + * @return The corresponding image classifier + */ private[infer] def getImageClassifier(modelPathPrefix: String, inputDescriptors: IndexedSeq[DataDesc], contexts: Array[Context] = Context.cpu(), diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Predictor.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Predictor.scala index 66284c81bd2e..cb27c930903d 100644 --- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Predictor.scala +++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Predictor.scala @@ -33,27 +33,27 @@ import org.slf4j.LoggerFactory private[infer] trait PredictBase { /** - * Converts indexed sequences of 1-D array to NDArrays. - *

- This method will take input as IndexedSeq one dimensional arrays and creates the - * NDArray needed for inference. The array will be reshaped based on the input descriptors. - * @param input: An Indexed Sequence of a one-dimensional array of datatype - * Float or Double - An IndexedSequence is needed when the model has more than one input. - * @return Indexed sequence array of outputs - */ + * Converts indexed sequences of 1-D array to NDArrays. + * This method takes the input as an IndexedSeq of one-dimensional arrays and creates the + * NDArray needed for inference. The array will be reshaped based on the input descriptors. + * @tparam T The Scala equivalent of the DType used for the input array and return value + * @param input An Indexed Sequence of a one-dimensional array of datatype + * Float or Double + * An IndexedSequence is needed when the model has more than one input. + * @return Indexed sequence array of outputs + */ def predict[@specialized (Base.MX_PRIMITIVES) T](input: IndexedSeq[Array[T]]) : IndexedSeq[Array[T]] /** - * Predict using NDArray as input. - *

- * This method is useful when the input is a batch of data - * or when multiple operations on the input have to performed. - * Note: User is responsible for managing allocation/deallocation of NDArrays. - * @param input IndexedSequence NDArrays. - * @return Output of predictions as NDArrays. - */ + * Predict using NDArray as input. + *

+ * This method is useful when the input is a batch of data + * or when multiple operations on the input have to be performed. + * Note: User is responsible for managing allocation/deallocation of NDArrays. + * @param input IndexedSequence NDArrays. + * @return Output of predictions as NDArrays. + */ def predictWithNDArray(input: IndexedSeq[NDArray]): IndexedSeq[NDArray] /** @@ -248,6 +248,10 @@ class Predictor(modelPathPrefix: String, resultND } + /** + * Creates the module backing the Predictor with the same path, epoch, contexts, and inputs + * @return The Module + */ private[infer] def loadModule(): Module = { val mod = mxNetHandler.execute(Module.loadCheckpoint(modelPathPrefix, epoch.get, contexts = contexts, dataNames = inputDescriptors.map(desc => desc.name))) diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/ObjectDetector.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/ObjectDetector.scala index 05334e49a356..8131273eca94 100--- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/ObjectDetector.scala +++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/ObjectDetector.scala @@ -31,19 +31,23 @@ import scala.language.implicitConversions * The ObjectDetector class helps to run ObjectDetection tasks where the goal * is to find bounding boxes and corresponding labels for objects in a image. * - * @param modelPathPrefix Path prefix from where to load the model artifacts. - * These include the symbol, parameters, and synset.txt. - * Example: file://model-dir/ssd_resnet50_512 (containing - * ssd_resnet50_512-symbol.json, ssd_resnet50_512-0000.params, - * and synset.txt) - * @param inputDescriptors Descriptors defining the input node names, shape, - * layout and type parameters - * @param contexts Device contexts on which you want to run inference. - * Defaults to CPU. - * @param epoch Model epoch to load; defaults to 0 + * @param objDetector The source Scala ObjectDetector */ class ObjectDetector private[mxnet] (val objDetector: org.apache.mxnet.infer.ObjectDetector){ + /** + * + * @param modelPathPrefix Path prefix from where to load the model artifacts. + * These include the symbol, parameters, and synset.txt. + * Example: file://model-dir/ssd_resnet50_512 (containing + * ssd_resnet50_512-symbol.json, ssd_resnet50_512-0000.params, + * and synset.txt) + * @param inputDescriptors Descriptors defining the input node names, shape, + * layout and type parameters + * @param contexts Device contexts on which you want to run inference. + * Defaults to CPU. 
+ * @param epoch Model epoch to load; defaults to 0 + */ def this(modelPathPrefix: String, inputDescriptors: java.lang.Iterable[DataDesc], contexts: java.lang.Iterable[Context], epoch: Int) = this { @@ -98,32 +102,78 @@ class ObjectDetector private[mxnet] (val objDetector: org.apache.mxnet.infer.Obj (ret map {a => (a map {e => new ObjectDetectorOutput(e._1, e._2)}).asJava}).asJava } + /** + * Helper to map an implicit conversion over an IndexedSeq + * @param l The sequence to convert + * @tparam B The desired type + * @tparam A The input type + * @return The converted result + */ def convert[B, A <% B](l: IndexedSeq[A]): IndexedSeq[B] = l map { a => a: B } } object ObjectDetector { - implicit def fromObjectDetector(OD: org.apache.mxnet.infer.ObjectDetector): - ObjectDetector = new ObjectDetector(OD) - - implicit def toObjectDetector(jOD: ObjectDetector): - org.apache.mxnet.infer.ObjectDetector = jOD.objDetector + /** + * Loads an input image from a file + * @param inputImagePath Path of single input image + * @return BufferedImage Buffered image + */ def loadImageFromFile(inputImagePath: String): BufferedImage = { org.apache.mxnet.infer.ImageClassifier.loadImageFromFile(inputImagePath) } + /** + * Reshape the input image to a new shape + * + * @param img Input image + * @param newWidth New width for rescaling + * @param newHeight New height for rescaling + * @return Rescaled BufferedImage + */ def reshapeImage(img : BufferedImage, newWidth: Int, newHeight: Int): BufferedImage = { org.apache.mxnet.infer.ImageClassifier.reshapeImage(img, newWidth, newHeight) } + /** + * Convert input BufferedImage to NDArray of input shape + * Note: Caller is responsible to dispose the NDArray + * returned by this method after use. + * + * @param resizedImage BufferedImage to get pixels from + * @param inputImageShape Input shape; for example for resnet it is (3,224,224). + * Should be same as inputDescriptor shape. 
+ * @return NDArray pixels array with shape (3, 224, 224) in CHW format + */ def bufferedImageToPixels(resizedImage: BufferedImage, inputImageShape: Shape): NDArray = { org.apache.mxnet.infer.ImageClassifier.bufferedImageToPixels(resizedImage, inputImageShape) } + /** + * Loads a batch of images from a folder + * @param inputImagePaths Path to a folder of images + * @return List of buffered images + */ def loadInputBatch(inputImagePaths: java.lang.Iterable[String]): java.util.List[BufferedImage] = { org.apache.mxnet.infer.ImageClassifier .loadInputBatch(inputImagePaths.asScala.toList).toList.asJava } + + /** + * Implicitly convert a Scala ObjectDetector to a Java ObjectDetector + * @param OD The Scala ObjectDetector + * @return The Java ObjectDetector + */ + implicit def fromObjectDetector(OD: org.apache.mxnet.infer.ObjectDetector): + ObjectDetector = new ObjectDetector(OD) + + /** + * Implicitly converts a Java ObjectDetector to a Scala ObjectDetector + * @param jOD The Java ObjectDetector + * @return The Scala ObjectDetector + */ + implicit def toObjectDetector(jOD: ObjectDetector): + org.apache.mxnet.infer.ObjectDetector = jOD.objDetector } diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/Predictor.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/Predictor.scala index 6c0871fae51b..e1505a4da821 100644 --- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/Predictor.scala +++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/Predictor.scala @@ -25,21 +25,25 @@ import scala.collection.JavaConverters._ /** * Implementation of prediction routines. * - * @param modelPathPrefix Path prefix from where to load the model artifacts. - * These include the symbol, parameters, and synset.txt - * Example: file://model-dir/resnet-152 (containing - * resnet-152-symbol.json, resnet-152-0000.params, and synset.txt). - * @param inputDescriptors Descriptors defining the input node names, shape, - * layout and type parameters - *

Note: If the input Descriptors is missing batchSize - * ('N' in layout), a batchSize of 1 is assumed for the model. - * @param contexts Device contexts on which you want to run inference; defaults to CPU - * @param epoch Model epoch to load; defaults to 0 - + * @param predictor The underlying Scala predictor */ // JavaDoc description of class to be updated in https://issues.apache.org/jira/browse/MXNET-1178 class Predictor private[mxnet] (val predictor: org.apache.mxnet.infer.Predictor){ + + /** + * + * @param modelPathPrefix Path prefix from where to load the model artifacts. + * These include the symbol, parameters, and synset.txt + * Example: file://model-dir/resnet-152 (containing + * resnet-152-symbol.json, resnet-152-0000.params, and synset.txt). + * @param inputDescriptors Descriptors defining the input node names, shape, + * layout and type parameters + *

Note: If the input Descriptors is missing batchSize + * ('N' in layout), a batchSize of 1 is assumed for the model. + * @param contexts Device contexts on which you want to run inference; defaults to CPU + * @param epoch Model epoch to load; defaults to 0 + */ def this(modelPathPrefix: String, inputDescriptors: java.lang.Iterable[DataDesc], contexts: java.lang.Iterable[Context], epoch: Int) = this { @@ -53,19 +57,17 @@ class Predictor private[mxnet] (val predictor: org.apache.mxnet.infer.Predictor) * Takes input as Array of one dimensional arrays and creates the NDArray needed for inference * The array will be reshaped based on the input descriptors. Example of calling in Java: * - *

-    * {@code
+    * {{{
     * float tmp[][] = new float[1][224];
     * for (int x = 0; x < 1; x++)
     *   for (int y = 0; y < 224; y++)
     *     tmp[x][y] = (int)(Math.random()*10);
     * predictor.predict(tmp);
-    * }
-    * 
+ * }}} * - * @param input: An Array of a one-dimensional array. - An extra Array is needed for when the model has more than one input. - @return Indexed sequence array of outputs + * @param input An Array of one-dimensional arrays. + * An extra Array is needed for when the model has more than one input. + * @return Indexed sequence array of outputs */ def predict(input: Array[Array[Float]]): Array[Array[Float]] = { @@ -76,18 +78,16 @@ class Predictor private[mxnet] (val predictor: org.apache.mxnet.infer.Predictor) * Takes input as Array of one dimensional arrays and creates the NDArray needed for inference * The array will be reshaped based on the input descriptors. Example of calling in Java: * -
-    * {@code
+    * {{{
     * double tmp[][] = new double[1][224];
     * for (int x = 0; x < 1; x++)
     *   for (int y = 0; y < 224; y++)
     *     tmp[x][y] = (int)(Math.random()*10);
     * predictor.predict(tmp);
-    * }
-    * 
+ * }}} * - * @param input: An Array of a one-dimensional array. - An extra Array is needed for when the model has more than one input. + * @param input An Array of one-dimensional arrays. + * An extra Array is needed for when the model has more than one input. * @return Indexed sequence array of outputs */ @@ -100,8 +100,8 @@ class Predictor private[mxnet] (val predictor: org.apache.mxnet.infer.Predictor) * Takes input as List of one dimensional iterables and creates the NDArray needed for inference * The array will be reshaped based on the input descriptors. * - * @param input: A List of a one-dimensional iterables of DType Float. - An extra List is needed for when the model has more than one input. + * @param input A List of one-dimensional iterables of DType Float. + * An extra List is needed for when the model has more than one input. * @return Indexed sequence array of outputs */ def predict(input: java.util.List[java.util.List[java.lang.Float]]): diff --git a/scala-package/macros/pom.xml b/scala-package/macros/pom.xml index 91a13d0fc521..cb84824bb4e2 100644 --- a/scala-package/macros/pom.xml +++ b/scala-package/macros/pom.xml @@ -51,10 +51,6 @@
- - org.apache.maven.plugins - maven-compiler-plugin - org.scalatest scalatest-maven-plugin diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala index 498c4e943669..b2033f529c65 100644 --- a/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala +++ b/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala @@ -116,7 +116,7 @@ private[mxnet] abstract class GeneratorBase { */ protected def structGeneration(c: blackbox.Context) (funcDef: List[c.universe.DefDef], annottees: c.Expr[Any]*) - : c.Expr[Any] = { + : c.Expr[Nothing] = { import c.universe._ val inputs = annottees.map(_.tree).toList // pattern match on the inputs diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala index c18694b59bf6..f5b8bce11cf5 100644 --- a/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala +++ b/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala @@ -22,15 +22,16 @@ import scala.language.experimental.macros import scala.reflect.macros.blackbox private[mxnet] class AddNDArrayFunctions(isContrib: Boolean) extends StaticAnnotation { - private[mxnet] def macroTransform(annottees: Any*) = macro NDArrayMacro.addDefs + private[mxnet] def macroTransform(annottees: Any*): Any = macro NDArrayMacro.addDefs } private[mxnet] class AddNDArrayAPIs(isContrib: Boolean) extends StaticAnnotation { - private[mxnet] def macroTransform(annottees: Any*) = macro TypedNDArrayAPIMacro.typeSafeAPIDefs + private[mxnet] def macroTransform(annottees: Any*): Any = + macro TypedNDArrayAPIMacro.typeSafeAPIDefs } private[mxnet] class AddNDArrayRandomAPIs(isContrib: Boolean) extends StaticAnnotation { - private[mxnet] def macroTransform(annottees: Any*) = + private[mxnet] def macroTransform(annottees: Any*): Any = macro TypedNDArrayRandomAPIMacro.typeSafeAPIDefs } @@ -39,7 +40,7 @@ private[mxnet] class AddNDArrayRandomAPIs(isContrib: Boolean) extends StaticAnno */ private[mxnet] object NDArrayMacro extends GeneratorBase { - def addDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = { + def addDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Nothing] = { import c.universe._ val isContrib: Boolean = c.prefix.tree match { case q"new AddNDArrayFunctions($b)" => c.eval[Boolean](c.Expr(b)) @@ -49,7 +50,7 @@ private[mxnet] object NDArrayMacro extends GeneratorBase { } private def impl(c: blackbox.Context) - (isContrib: Boolean, annottees: c.Expr[Any]*): c.Expr[Any] = { + (isContrib: Boolean, annottees: c.Expr[Any]*): c.Expr[Nothing] = { import c.universe._ val functions = functionsToGenerate(isSymbol = false, isContrib) @@ -82,7 +83,7 @@ private[mxnet] object NDArrayMacro extends GeneratorBase { */ private[mxnet] object TypedNDArrayAPIMacro extends GeneratorBase { - def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = { + def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Nothing] = { import c.universe._ val isContrib: Boolean = c.prefix.tree match { case q"new AddNDArrayAPIs($b)" => c.eval[Boolean](c.Expr(b)) @@ -148,7 +149,7 @@ private[mxnet] object TypedNDArrayAPIMacro extends GeneratorBase { private[mxnet] object TypedNDArrayRandomAPIMacro extends GeneratorBase with RandomHelpers { - def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = { + def typeSafeAPIDefs(c: 
blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Nothing] = { // Note: no contrib managed in this module val functionDefs = typeSafeRandomFunctionsToGenerate(isSymbol = false) diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala index 7ec80b9c066c..06b567c3d2d4 100644 --- a/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala +++ b/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala @@ -23,15 +23,16 @@ import scala.language.experimental.macros import scala.reflect.macros.blackbox private[mxnet] class AddSymbolFunctions(isContrib: Boolean) extends StaticAnnotation { - private[mxnet] def macroTransform(annottees: Any*) = macro SymbolMacro.addDefs + private[mxnet] def macroTransform(annottees: Any*): Any = macro SymbolMacro.addDefs } private[mxnet] class AddSymbolAPIs(isContrib: Boolean) extends StaticAnnotation { - private[mxnet] def macroTransform(annottees: Any*) = macro TypedSymbolAPIMacro.typeSafeAPIDefs + private[mxnet] def macroTransform(annottees: Any*): Any = + macro TypedSymbolAPIMacro.typeSafeAPIDefs } private[mxnet] class AddSymbolRandomAPIs(isContrib: Boolean) extends StaticAnnotation { - private[mxnet] def macroTransform(annottees: Any*) = + private[mxnet] def macroTransform(annottees: Any*): Any = macro TypedSymbolRandomAPIMacro.typeSafeAPIDefs } @@ -40,7 +41,7 @@ private[mxnet] class AddSymbolRandomAPIs(isContrib: Boolean) extends StaticAnnot */ private[mxnet] object SymbolMacro extends GeneratorBase { - def addDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = { + def addDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Nothing] = { import c.universe._ val isContrib: Boolean = c.prefix.tree match { case q"new AddSymbolFunctions($b)" => c.eval[Boolean](c.Expr(b)) @@ -50,7 +51,7 @@ private[mxnet] object SymbolMacro extends GeneratorBase { } private def impl(c: blackbox.Context) - (isContrib: Boolean, annottees: c.Expr[Any]*): c.Expr[Any] = { + (isContrib: Boolean, annottees: c.Expr[Any]*): c.Expr[Nothing] = { import c.universe._ val functions = functionsToGenerate(isSymbol = false, isContrib) @@ -76,7 +77,7 @@ private[mxnet] object SymbolMacro extends GeneratorBase { */ private[mxnet] object TypedSymbolAPIMacro extends GeneratorBase { - def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = { + def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Nothing] = { import c.universe._ val isContrib: Boolean = c.prefix.tree match { case q"new AddSymbolAPIs($b)" => c.eval[Boolean](c.Expr(b)) @@ -140,7 +141,7 @@ private[mxnet] object TypedSymbolAPIMacro extends GeneratorBase { private[mxnet] object TypedSymbolRandomAPIMacro extends GeneratorBase with RandomHelpers { - def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = { + def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Nothing] = { val functionDefs = typeSafeRandomFunctionsToGenerate(isSymbol = true) .map(f => buildTypedFunction(c)(f)) diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/javaapi/JavaNDArrayMacro.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/javaapi/JavaNDArrayMacro.scala index fa3565b4fb8e..9bf0818c14a4 100644 --- a/scala-package/macros/src/main/scala/org/apache/mxnet/javaapi/JavaNDArrayMacro.scala +++ b/scala-package/macros/src/main/scala/org/apache/mxnet/javaapi/JavaNDArrayMacro.scala @@ -25,18 +25,16 @@ import 
scala.language.experimental.macros import scala.reflect.macros.blackbox private[mxnet] class AddJNDArrayAPIs(isContrib: Boolean) extends StaticAnnotation { - private[mxnet] def macroTransform(annottees: Any*) = macro JavaNDArrayMacro.typeSafeAPIDefs + private[mxnet] def macroTransform(annottees: Any*): Any = macro JavaNDArrayMacro.typeSafeAPIDefs } private[mxnet] object JavaNDArrayMacro extends GeneratorBase { - // scalastyle:off havetype - def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*) = { + def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Nothing] = { typeSafeAPIImpl(c)(annottees: _*) } - // scalastyle:off havetype - private def typeSafeAPIImpl(c: blackbox.Context)(annottees: c.Expr[Any]*) : c.Expr[Any] = { + private def typeSafeAPIImpl(c: blackbox.Context)(annottees: c.Expr[Any]*) : c.Expr[Nothing] = { import c.universe._ val isContrib: Boolean = c.prefix.tree match { diff --git a/scala-package/pom.xml b/scala-package/pom.xml index 62def16627b9..44b784a56f8e 100644 --- a/scala-package/pom.xml +++ b/scala-package/pom.xml @@ -53,8 +53,11 @@ + 1.7 2.11.8 + 2.11 + g++ $ ${project.basedir}/.. @@ -252,9 +255,11 @@ maven-compiler-plugin 3.3 - 1.7 - 1.7 + ${java.version} + ${java.version} UTF-8 + true + true @@ -339,11 +344,12 @@ - org.scala-tools - maven-scala-plugin - 2.15.2 + net.alchim31.maven + scala-maven-plugin + 3.4.4 - incremental + ${java.version} + ${java.version} org.scalamacros @@ -354,25 +360,19 @@ + compile + add-source compile testCompile + doc-jar - - - - net.alchim31.maven - scala-maven-plugin - 3.3.2 - - - - package - attach-javadocs + presite + pre-site - doc-jar + add-source @@ -388,6 +388,7 @@ + commons-codec @@ -419,7 +420,7 @@ org.scalatest scalatest_2.11 - 3.0.4 + 3.0.2 test @@ -454,7 +455,12 @@ org.scala-lang.modules scala-parser-combinators_2.11 - 1.0.4 + 1.0.5 + + + org.scala-lang.modules + scala-xml_2.11 + 1.0.6 org.scala-lang diff --git a/scala-package/spark/pom.xml b/scala-package/spark/pom.xml index 2acb70b43303..b61a427f83ef 100644 --- a/scala-package/spark/pom.xml +++ b/scala-package/spark/pom.xml @@ -50,5 +50,10 @@ args4j 2.33 + + org.json4s + json4s-core_2.11 + 3.5.1 + diff --git a/scala-package/spark/src/main/scala/org/apache/mxnet/spark/MXNDArray.scala b/scala-package/spark/src/main/scala/org/apache/mxnet/spark/MXNDArray.scala index a18c47d78741..0d4c18c2b8e9 100644 --- a/scala-package/spark/src/main/scala/org/apache/mxnet/spark/MXNDArray.scala +++ b/scala-package/spark/src/main/scala/org/apache/mxnet/spark/MXNDArray.scala @@ -20,7 +20,7 @@ package org.apache.mxnet.spark import org.apache.mxnet.NDArray /** - * A wrapper for serialize & deserialize
[[org.apache.mxnet.NDArray]]
in spark job + * A wrapper for serializing & deserializing ``org.apache.mxnet.NDArray`` in Spark jobs * @author Yizhi Liu */ class MXNDArray(@transient private var ndArray: NDArray) extends Serializable { diff --git a/scala-package/spark/src/main/scala/org/apache/mxnet/spark/MXNetModel.scala b/scala-package/spark/src/main/scala/org/apache/mxnet/spark/MXNetModel.scala index 2c4c8fe42780..234e9a597cf5 100644 --- a/scala-package/spark/src/main/scala/org/apache/mxnet/spark/MXNetModel.scala +++ b/scala-package/spark/src/main/scala/org/apache/mxnet/spark/MXNetModel.scala @@ -23,7 +23,7 @@ import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vector /** - * Wrapper for
[[org.apache.mxnet.Model]]
which used in Spark application + * Wrapper for ``org.apache.mxnet.Model`` which is used in Spark applications * @author Yizhi Liu */ class MXNetModel private[mxnet]( diff --git a/scala-package/spark/src/main/scala/org/apache/mxnet/spark/utils/Network.scala b/scala-package/spark/src/main/scala/org/apache/mxnet/spark/utils/Network.scala index c61229af0035..836901f69f8f 100644 --- a/scala-package/spark/src/main/scala/org/apache/mxnet/spark/utils/Network.scala +++ b/scala-package/spark/src/main/scala/org/apache/mxnet/spark/utils/Network.scala @@ -20,6 +20,7 @@ package org.apache.mxnet.spark.utils import java.io.IOException import java.net.{ServerSocket, NetworkInterface} import java.util.regex.Pattern +import scala.collection.JavaConverters._ /** * Helper functions to decide ip address / port @@ -33,19 +34,16 @@ object Network { "([01]?\\d\\d?|2[0-4]\\d|25[0-5])$") def ipAddress: String = { - val interfaces = NetworkInterface.getNetworkInterfaces - while (interfaces.hasMoreElements) { - val interface = interfaces.nextElement - val addresses = interface.getInetAddresses - while (addresses.hasMoreElements) { - val address = addresses.nextElement - val ip = address.getHostAddress - if (!ip.startsWith("127.") && IPADDRESS_PATTERN.matcher(ip).matches()) { - return ip + val interfaces = NetworkInterface.getNetworkInterfaces.asScala + val interface = interfaces.toStream.flatMap( + _.getInetAddresses.asScala.toStream.flatMap( + address => { + val ip = address.getHostAddress + Option(ip).filter(ip => !ip.startsWith("127.") && IPADDRESS_PATTERN.matcher(ip).matches()) } - } - } - "127.0.0.1" + ) + ).headOption + interface.getOrElse("127.0.0.1") } def availablePort: Int = { diff --git a/scala-package/spark/src/test/scala/org/apache/mxnet/spark/SharedSparkContext.scala b/scala-package/spark/src/test/scala/org/apache/mxnet/spark/SharedSparkContext.scala index 6d36ca51db90..293cfa13cfce 100644 --- a/scala-package/spark/src/test/scala/org/apache/mxnet/spark/SharedSparkContext.scala +++ b/scala-package/spark/src/test/scala/org/apache/mxnet/spark/SharedSparkContext.scala @@ -92,20 +92,12 @@ trait SharedSparkContext extends FunSuite with BeforeAndAfterEach with BeforeAnd private def getJarFilePath(root: String): String = { val jarFiles = findJars(s"$root/target/") - if (jarFiles != null && jarFiles.nonEmpty) { - jarFiles.head.getAbsolutePath - } else { - null - } + Option(jarFiles).flatMap(_.headOption).map(_.getAbsolutePath).orNull } private def getSparkJar: String = { val jarFiles = findJars(s"$composeWorkingDirPath/target/") - if (jarFiles != null && jarFiles.nonEmpty) { - jarFiles.head.getAbsolutePath - } else { - null - } + Option(jarFiles).flatMap(_.headOption).map(_.getAbsolutePath).orNull } private def getNativeJars(root: String): String = diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 70ba84b5f94b..45197aafe019 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1401,3 +1401,91 @@ int MXNDArrayCreateFromSharedMem(int shared_pid, int shared_id, const mx_uint *s *out = new NDArray(shared_pid, shared_id, mxnet::TShape(shape, shape + ndim), dtype); API_END(); } + +typedef Engine::VarHandle VarHandle; +typedef Engine::CallbackOnComplete CallbackOnComplete; + +void AssertValidNumberVars(int num_const_vars, int num_mutable_vars) { + CHECK_GE(num_const_vars, 0) << "Non-negative number of const vars expected."; + CHECK_GE(num_mutable_vars, 0) << "Non-negative number of mutable vars expected."; +} + +int MXEnginePushAsync(EngineAsyncFunc async_func, void* func_param, + EngineFuncParamDeleter
deleter, ContextHandle ctx_handle, + EngineVarHandle const_vars_handle, int num_const_vars, + EngineVarHandle mutable_vars_handle, int num_mutable_vars, + EngineFnPropertyHandle prop_handle, int priority, + const char* opr_name, bool wait) { + API_BEGIN(); + + auto exec_ctx = *static_cast<Context*>(ctx_handle); + auto const_vars = static_cast<VarHandle*>(const_vars_handle); + auto mutable_vars = static_cast<VarHandle*>(mutable_vars_handle); + auto prop = FnProperty::kNormal; + if (prop_handle) { + prop = *static_cast<FnProperty*>(prop_handle); + } + + Engine::AsyncFn exec_fn; + if (deleter == nullptr) { + exec_fn = [async_func, func_param](RunContext rctx, + CallbackOnComplete on_complete) { + async_func(&rctx, &on_complete, func_param); + }; + } else { + // Wrap func_param in a shared_ptr with deleter such that deleter + // will be called when the lambda goes out of scope. + std::shared_ptr<void> shared_func_param(func_param, deleter); + exec_fn = [async_func, shared_func_param](RunContext rctx, + CallbackOnComplete on_complete) { + async_func(&rctx, &on_complete, shared_func_param.get()); + }; + } + + AssertValidNumberVars(num_const_vars, num_mutable_vars); + std::vector<VarHandle> const_var_vec(const_vars, const_vars + num_const_vars); + std::vector<VarHandle> mutable_var_vec(mutable_vars, mutable_vars + num_mutable_vars); + Engine::Get()->PushAsync(exec_fn, exec_ctx, const_var_vec, mutable_var_vec, + prop, priority, opr_name, wait); + + API_END(); +} + +int MXEnginePushSync(EngineSyncFunc sync_func, void* func_param, + EngineFuncParamDeleter deleter, ContextHandle ctx_handle, + EngineVarHandle const_vars_handle, int num_const_vars, + EngineVarHandle mutable_vars_handle, int num_mutable_vars, + EngineFnPropertyHandle prop_handle, int priority, + const char* opr_name) { + API_BEGIN(); + + auto exec_ctx = *static_cast<Context*>(ctx_handle); + auto const_vars = static_cast<VarHandle*>(const_vars_handle); + auto mutable_vars = static_cast<VarHandle*>(mutable_vars_handle); + auto prop = FnProperty::kNormal; + if (prop_handle) { + prop = *static_cast<FnProperty*>(prop_handle); + } + + Engine::SyncFn exec_fn; + if (deleter == nullptr) { + exec_fn = [sync_func, func_param](RunContext rctx) { + sync_func(&rctx, func_param); + }; + } else { + // Wrap func_param in a shared_ptr with deleter such that deleter + // will be called when the lambda goes out of scope. + std::shared_ptr<void> shared_func_param(func_param, deleter); + exec_fn = [sync_func, shared_func_param](RunContext rctx) { + sync_func(&rctx, shared_func_param.get()); + }; + } + + AssertValidNumberVars(num_const_vars, num_mutable_vars); + std::vector<VarHandle> const_var_vec(const_vars, const_vars + num_const_vars); + std::vector<VarHandle> mutable_var_vec(mutable_vars, mutable_vars + num_mutable_vars); + Engine::Get()->PushSync(exec_fn, exec_ctx, const_var_vec, mutable_var_vec, + prop, priority, opr_name); + + API_END(); +} diff --git a/src/operator/nn/cudnn/cudnn_algoreg-inl.h b/src/operator/nn/cudnn/cudnn_algoreg-inl.h index cef9d6f86940..3f2d24c5bf7e 100644 --- a/src/operator/nn/cudnn/cudnn_algoreg-inl.h +++ b/src/operator/nn/cudnn/cudnn_algoreg-inl.h @@ -96,7 +96,7 @@ class CuDNNAlgoReg { if (param.cudnn_tune.value() && reg_.size() % 50 == 0) { LOG(INFO) << "Running performance tests to find the best convolution " "algorithm, " - "this can take a while... (setting env variable " + "this can take a while... (set the environment variable " "MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)"; if (reg_.size() >= 1000) { // Many people are very concerned about this warning, so change the warning once. 
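The reworded autotune warning above points users at MXNET_CUDNN_AUTOTUNE_DEFAULT. For reference, a minimal sketch of turning the tuning pass off from Python; the variable is read when a convolution is first set up, so it has to be set before any convolution runs:

import os

# Disable the cuDNN convolution autotune search referenced in the warning;
# setting the variable before importing mxnet is the safest option.
os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '0'

import mxnet as mx  # convolutions created from here on skip the search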
diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h index 82bdda69dcd0..01611dfce191 100644 --- a/src/operator/nn/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -39,7 +39,10 @@ #include "../random/sampler.h" #include "../tensor/elemwise_binary_broadcast_op.h" -#define MXNET_USE_MKL_DROPOUT defined(USE_MKL) && defined(_OPENMP) && !defined(__CUDACC__) +#if defined(USE_MKL) && defined(_OPENMP) && !defined(__CUDACC__) +#define MXNET_USE_MKL_DROPOUT 1 +#endif + #if MXNET_USE_MKL_DROPOUT #include diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 0a89c0f31981..a460e33fa548 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -175,12 +175,14 @@ struct ConvolutionParam; struct DeconvolutionParam; struct SoftmaxParam; struct SoftmaxOutputParam; +struct TransposeParam; bool SupportMKLDNNAct(const ActivationParam& param); bool SupportMKLDNNAct(const ActivationParam& param, const NDArray &input); bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray &input); bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray &input); bool SupportMKLDNNSoftmax(const SoftmaxParam& param); bool SupportMKLDNNSoftmaxOutput(const SoftmaxOutputParam ¶m); +bool SupportMKLDNNTranspose(const TransposeParam& param, const NDArray &data); } // namespace op static int GetTypeSize(int dtype) { diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h index 39f26325b2a5..f3f61b457507 100644 --- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h @@ -113,6 +113,12 @@ void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx void MKLDNNSum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, const mkldnn::memory &out); +void MKLDNNTransposeForward(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const NDArray &data, + const OpReqType &req, + const NDArray &output); + } // namespace op } // namespace mxnet #endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_transpose.cc b/src/operator/nn/mkldnn/mkldnn_transpose.cc new file mode 100644 index 000000000000..0986d0616f75 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_transpose.cc @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file mkldnn_transpose.cc + * \brief Implement transpose operator via MKL-DNN reorder primitive + * \author Tao Lv +*/ + +#if MXNET_USE_MKLDNN == 1 + +#include <mkldnn.hpp> +#include "../../tensor/matrix_op-inl.h" + +namespace mxnet { +namespace op { + +bool SupportMKLDNNTranspose(const TransposeParam& param, + const NDArray &data) { + auto data_ndim = data.shape().ndim(); + + if (data_ndim > 4 || data.dtype() != mshadow::kFloat32) + return false; + + return true; +} + +typedef ParamOpSign<TransposeParam> MKLDNNTransposeSignature; + +class MKLDNNTransposeForward { + std::shared_ptr<mkldnn::memory> data_; + std::shared_ptr<mkldnn::memory> out_; + std::shared_ptr<mkldnn::memory::primitive_desc> dst_pd_; + std::shared_ptr<mkldnn::reorder> transpose_; + + public: + MKLDNNTransposeForward(const TransposeParam& param, + const NDArray &data) { + auto shape = data.shape(); + auto data_ndim = shape.ndim(); + auto axes_ndim = param.axes.ndim(); + auto axes = mxnet::TShape(data_ndim); + if (axes_ndim == 0) { + for (size_t i = 0; i < data_ndim; i++) { + axes[i] = data_ndim - i - 1; + } + } else { + axes = param.axes; + } + + auto engine = CpuEngine::Get()->get_engine(); + auto in_mem = data.GetMKLDNNData(); + auto src_pd = in_mem->get_primitive_desc(); + data_ = std::make_shared<mkldnn::memory>(src_pd, nullptr); + + // destination + // Not all formats are well defined with a certain name in MKL-DNN. + // For example, transpose(NCHW, (0, 2, 1, 3)) -> NHCW, which is not explicitly defined in + // MKL-DNN. To support general transposing, we need to create the destination format from scratch. + mkldnn_memory_desc_t dst_fmt; + dst_fmt.primitive_kind = mkldnn_memory; + dst_fmt.ndims = data_ndim; + dst_fmt.data_type = mkldnn_f32; + dst_fmt.format = mkldnn_blocked; + + for (size_t i = 0; i < data_ndim; i++) + dst_fmt.dims[i] = shape[i]; + + unsigned int total_stride = 1; + for (int i = data_ndim - 1; i >= 0; i--) { + dst_fmt.layout_desc.blocking.padding_dims[i] = shape[i]; + dst_fmt.layout_desc.blocking.block_dims[i] = 1; + dst_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; + // strides[0]: stride between the first elements of adjacent blocks. + dst_fmt.layout_desc.blocking.strides[0][axes[i]] = total_stride; + // strides[1]: strides between elements in the same block. + dst_fmt.layout_desc.blocking.strides[1][axes[i]] = 1; + + total_stride *= shape[axes[i]]; + } + + dst_fmt.layout_desc.blocking.offset_padding = 0; + dst_pd_ = std::make_shared<mkldnn::memory::primitive_desc>(dst_fmt, engine); + out_ = std::make_shared<mkldnn::memory>(*dst_pd_, nullptr); + + transpose_ = std::make_shared<mkldnn::reorder>(*data_, *out_); + } + + void SetNewMem(const NDArray &data, const NDArray &output) { + if (data.IsMKLDNNData()) { + this->data_->set_data_handle(data.GetMKLDNNData()->get_data_handle()); + } else { + MSHADOW_TYPE_SWITCH(data.dtype(), DTYPE, { + this->data_->set_data_handle(data.data().dptr<DTYPE>()); + }); + } + + CHECK(!output.IsMKLDNNData()); + MSHADOW_TYPE_SWITCH(output.dtype(), DTYPE, { + this->out_->set_data_handle(output.data().dptr<DTYPE>()); + }); + } + + const mkldnn::reorder &GetFwd() const { + return *transpose_; + } +}; + +static MKLDNNTransposeForward &GetTransposeForward(const TransposeParam& param, + const NDArray &data) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local std::unordered_map<MKLDNNTransposeSignature, MKLDNNTransposeForward, OpHash> fwds; +#else + static MX_THREAD_LOCAL std::unordered_map<MKLDNNTransposeSignature, MKLDNNTransposeForward, OpHash> fwds; +#endif + MKLDNNTransposeSignature key(param); + key.AddSign(data); + + auto it = fwds.find(key); + if (it == fwds.end()) { + MKLDNNTransposeForward fwd(param, data); + it = AddToCache(&fwds, key, fwd); + } + return it->second; +} + +void MKLDNNTransposeForward(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const NDArray &data, + const OpReqType &req, + const NDArray &output) { + const TransposeParam& param = nnvm::get<TransposeParam>(attrs.parsed); + + auto stream = MKLDNNStream::Get(); + auto fwd = GetTransposeForward(param, data); + + fwd.SetNewMem(data, output); + stream->RegisterPrim(fwd.GetFwd()); + stream->Submit(); +} +} // namespace op +} // namespace mxnet +#endif diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 5eecda622729..fa108158b5c9 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -238,6 +238,10 @@ struct TransposeParam : public dmlc::Parameter<TransposeParam> { DMLC_DECLARE_FIELD(axes).set_default(mxnet::TShape()) .describe("Target axis order. 
By default the axes will be inverted."); } + + bool operator==(const TransposeParam &other) const { + return this->axes == other.axes; + } }; template @@ -2841,4 +2845,15 @@ inline uint32_t SplitNumOutputs(const NodeAttrs& attrs) { } // namespace op } // namespace mxnet +namespace std { +template<> +struct hash<mxnet::op::TransposeParam> { + size_t operator()(const mxnet::op::TransposeParam& val) { + size_t ret = 0; + ret = dmlc::HashCombine(ret, val.axes); + return ret; + } +}; +} // namespace std + #endif // MXNET_OPERATOR_TENSOR_MATRIX_OP_INL_H_ diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index 3bca330f98b0..1431fef13594 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -339,6 +339,35 @@ Example:: }) .add_argument("data", "NDArray-or-Symbol", "Input array."); +#if MXNET_USE_MKLDNN == 1 +static void TransposeComputeExCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector<NDArray>& inputs, + const std::vector<OpReqType>& req, + const std::vector<NDArray>& outputs) { + const TransposeParam& param = nnvm::get<TransposeParam>(attrs.parsed); + CHECK_EQ(req[0], kWriteTo) << "Transpose does not support inplace"; + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + + if (SupportMKLDNNTranspose(param, inputs[0])) { + MKLDNNTransposeForward(attrs, ctx, inputs[0], req[0], outputs[0]); + return; + } + FallBackCompute(Transpose<cpu>, attrs, ctx, inputs, req, outputs); +} + +inline static bool TransposeStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector<int>* in_attrs, + std::vector<int>* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs); +} +#endif + NNVM_REGISTER_OP(transpose) .describe(R"code(Permutes the dimensions of an array. @@ -393,6 +422,11 @@ Examples:: } }) .set_attr<FCompute>("FCompute", Transpose<cpu>) +#if MXNET_USE_MKLDNN == 1 +.set_attr<bool>("TIsMKLDNN", true) +.set_attr<FComputeEx>("FComputeEx", TransposeComputeExCPU) +.set_attr<FInferStorageType>("FInferStorageType", TransposeStorageType) +#endif .add_argument("data", "NDArray-or-Symbol", "Source input") .add_arguments(TransposeParam::__FIELDS__()); diff --git a/src/profiler/profiler.h b/src/profiler/profiler.h index adea941bda13..f1fac9ae8ddd 100644 --- a/src/profiler/profiler.h +++ b/src/profiler/profiler.h @@ -608,6 +608,12 @@ struct ProfileCounter : public ProfileObject { return IncrementValue(static_cast<uint64_t>(v)); } } + + inline bool operator >=(int64_t v) { + CHECK_GE(v, 0); + return value_ >= static_cast<uint64_t>(v); + } + /*!
\brief operator: object = v */ inline ProfileCounter& operator = (uint64_t v) { SetValue(v); diff --git a/src/profiler/storage_profiler.h b/src/profiler/storage_profiler.h index bcbe7e7e3ffd..5ab5983267eb 100644 --- a/src/profiler/storage_profiler.h +++ b/src/profiler/storage_profiler.h @@ -66,7 +66,11 @@ class DeviceStorageProfiler { Init(); // In case of bug which tries to free first const size_t idx = prof->DeviceIndex(handle.ctx.dev_type, handle.ctx.dev_id); CHECK_LT(idx, mem_counters_.size()) << "Invalid device index: " << idx; - *mem_counters_[idx] -= handle.size; + if (*mem_counters_[idx] >= handle.size) { + *mem_counters_[idx] -= handle.size; + } else { + *mem_counters_[idx] = 0; + } } } } diff --git a/tests/cpp/engine/threaded_engine_test.cc b/tests/cpp/engine/threaded_engine_test.cc index 6d669c19bcaa..405f3b30a176 100644 --- a/tests/cpp/engine/threaded_engine_test.cc +++ b/tests/cpp/engine/threaded_engine_test.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -176,6 +177,83 @@ TEST(Engine, RandSumExpr) { void Foo(mxnet::RunContext, int i) { printf("The fox says %d\n", i); } +void FooAsyncFunc(void*, void* cb_ptr, void* param) { + if (param == nullptr) { + LOG(INFO) << "The fox asynchronously says receiving nothing."; + } else { + auto num = static_cast(param); + EXPECT_EQ(*num, 100); + LOG(INFO) << "The fox asynchronously says receiving " << *num; + } + auto cb = *static_cast(cb_ptr); + cb(); +} + +void FooSyncFunc(void*, void* param) { + if (param == nullptr) { + LOG(INFO) << "The fox synchronously says receiving nothing."; + } else { + auto num = static_cast(param); + EXPECT_EQ(*num, 101); + LOG(INFO) << "The fox synchronously says receiving " << *num; + } +} + +void FooFuncDeleter(void* param) { + if (param != nullptr) { + auto num = static_cast(param); + LOG(INFO) << "The fox says deleting " << *num; + delete num; + } +} + +TEST(Engine, PushFunc) { + auto var = mxnet::Engine::Get()->NewVariable(); + auto ctx = mxnet::Context{}; + + // Test #1 + LOG(INFO) << "===== Test #1: PushAsync param and deleter ====="; + int* a = new int(100); + int res = MXEnginePushAsync(FooAsyncFunc, a, FooFuncDeleter, &ctx, &var, 1, nullptr, 0); + EXPECT_EQ(res, 0); + + // Test #2 + LOG(INFO) << "===== Test #2: PushAsync NULL param and NULL deleter ====="; + res = MXEnginePushAsync(FooAsyncFunc, nullptr, nullptr, &ctx, nullptr, 0, &var, 0); + EXPECT_EQ(res, 0); + + // Test #3 + LOG(INFO) << "===== Test #3: PushAsync invalid number of const vars ====="; + res = MXEnginePushAsync(FooAsyncFunc, nullptr, nullptr, &ctx, &var, -1, nullptr, 0); + EXPECT_EQ(res, -1); + + // Test #4 + LOG(INFO) << "===== Test #4: PushAsync invalid number of mutable vars ====="; + res = MXEnginePushAsync(FooAsyncFunc, nullptr, nullptr, &ctx, nullptr, 0, &var, -1); + EXPECT_EQ(res, -1); + + // Test #5 + LOG(INFO) << "===== Test #5: PushSync param and deleter ====="; + int* b = new int(101); + res = MXEnginePushSync(FooSyncFunc, b, FooFuncDeleter, &ctx, &var, 1, nullptr, 0); + EXPECT_EQ(res, 0); + + // Test #6 + LOG(INFO) << "===== Test #6: PushSync NULL param and NULL deleter ====="; + res = MXEnginePushSync(FooSyncFunc, nullptr, nullptr, &ctx, nullptr, 0, &var, 1); + EXPECT_EQ(res, 0); + + // Test #7 + LOG(INFO) << "===== Test #7: PushSync invalid number of const vars ====="; + res = MXEnginePushSync(FooSyncFunc, nullptr, nullptr, &ctx, &var, -1, nullptr, 0); + EXPECT_EQ(res, -1); + + // Test #8 + LOG(INFO) << "===== Test #8: PushSync invalid number of mutable vars ====="; + res = 
MXEnginePushSync(FooSyncFunc, nullptr, nullptr, &ctx, nullptr, 0, &var, -1); + EXPECT_EQ(res, -1); +} + TEST(Engine, basics) { auto&& engine = mxnet::Engine::Get(); auto&& var = engine->NewVariable(); diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py index 01ba03cab7cd..0610b606c201 100644 --- a/tests/python/mkl/test_mkldnn.py +++ b/tests/python/mkl/test_mkldnn.py @@ -473,6 +473,21 @@ def backward(self, req, out_grad, in_data, out_data, in_grad, aux): exec1 = custom.bind(mx.cpu(), args={'data': mx.nd.ones([10,3,96,96]), 'conv_weight': mx.nd.ones([8,3,5,5])}) exec1.forward()[0].wait_to_read() +@with_seed() +def test_conv_transpose(): + axes = [(0,2,1,3), (0,2,3,1), (1,2,3,0), (3,2,1,0)] + a = np.random.rand(10, 16, 50, 50) + b = np.random.rand(32, 16, 3, 3) + x = mx.nd.array(a) + w = mx.nd.array(b) + y = mx.nd.Convolution(data=x, weight=w, kernel=(3, 3), num_group=1, num_filter=32, no_bias=True) + for axis in axes: + t = mx.nd.transpose(y, axis) + t.wait_to_read() + s = y.asnumpy() + n = np.transpose(s, axis) + assert np.allclose(t.asnumpy(), n) + if __name__ == '__main__': install.test_mkldnn_install() diff --git a/tests/python/unittest/test_exc_handling.py b/tests/python/unittest/test_exc_handling.py index 11a506d263c3..98afcd86571a 100644 --- a/tests/python/unittest/test_exc_handling.py +++ b/tests/python/unittest/test_exc_handling.py @@ -173,6 +173,23 @@ def check_resize(): img.asnumpy() assert_raises(MXNetError, check_resize) +@with_seed() +def test_exc_profiler(): + def run_training_iteration(data): + output = net(data) + + net = gluon.nn.HybridSequential() + with net.name_scope(): + net.add(gluon.nn.Dense(10)) + + ctx = default_context() + net.collect_params().initialize(mx.init.Xavier(), ctx=ctx) + data = mx.nd.ones((3, 4)) + mx.profiler.set_state("run") + run_training_iteration(data) + mx.nd.waitall() + mx.profiler.set_state("stop") + if __name__ == '__main__': import nose diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index b410362c8fd1..9d7892010839 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -22,8 +22,7 @@ from numpy.testing import assert_allclose import unittest from mxnet.test_utils import almost_equal, assert_almost_equal -from common import assert_raises_cudnn_not_satisfied - +from common import assert_raises_cudnn_not_satisfied, with_seed def test_rnn(): cell = gluon.rnn.RNNCell(100, prefix='rnn_') @@ -244,6 +243,7 @@ def test_bidirectional(): @assert_raises_cudnn_not_satisfied(min_version='5.1.10') +@with_seed() def test_layer_bidirectional(): class RefBiLSTM(gluon.Block): def __init__(self, size, **kwargs): @@ -279,7 +279,7 @@ def forward(self, inpt): ref_net_params[k.replace('l0', 'l0l0').replace('r0', 'r0l0')].set_data(weights[k]) data = mx.random.uniform(shape=(11, 10, in_size)) - assert_allclose(net(data).asnumpy(), ref_net(data).asnumpy()) + assert_allclose(net(data).asnumpy(), ref_net(data).asnumpy(), rtol=1e-04, atol=1e-02) diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index 2821c4bbae3c..d8dca753bda4 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -34,6 +34,7 @@ def test_metrics(): check_metric('mcc') check_metric('perplexity', -1) check_metric('pearsonr') + check_metric('pcc') check_metric('nll_loss') check_metric('loss') composite = mx.metric.create(['acc', 'f1']) @@ -89,6 +90,7 @@ def test_global_metric(): _check_global_metric('mcc', 
     _check_global_metric('perplexity', -1)
     _check_global_metric('pearsonr', use_same_shape=True)
+    _check_global_metric('pcc', shape=(10,2))
     _check_global_metric('nll_loss')
     _check_global_metric('loss')
     _check_global_metric('ce')
@@ -253,6 +255,86 @@ def test_pearsonr():
     _, pearsonr = metric.get()
     assert pearsonr == pearsonr_expected
 
+def cm_batch(cm):
+    # generate a batch yielding a given confusion matrix
+    n = len(cm)
+    ident = np.identity(n)
+    labels = []
+    preds = []
+    for i in range(n):
+        for j in range(n):
+            labels += [ i ] * cm[i][j]
+            preds += [ ident[j] ] * cm[i][j]
+    return ([ mx.nd.array(labels, dtype='int32') ], [ mx.nd.array(preds) ])
+
+def test_pcc():
+    labels, preds = cm_batch([
+        [ 7, 3 ],
+        [ 2, 5 ],
+    ])
+    met_pcc = mx.metric.create('pcc')
+    met_pcc.update(labels, preds)
+    _, pcc = met_pcc.get()
+
+    # pcc should agree with mcc for binary classification
+    met_mcc = mx.metric.create('mcc')
+    met_mcc.update(labels, preds)
+    _, mcc = met_mcc.get()
+    np.testing.assert_almost_equal(pcc, mcc)
+
+    # pcc should agree with Pearson for binary classification
+    met_pear = mx.metric.create('pearsonr')
+    met_pear.update(labels, [p.argmax(axis=1) for p in preds])
+    _, pear = met_pear.get()
+    np.testing.assert_almost_equal(pcc, pear)
+
+    # check multiclass case against reference implementation
+    CM = [
+        [ 23, 13,  3 ],
+        [  7, 19, 11 ],
+        [  2,  5, 17 ],
+    ]
+    K = 3
+    ref = sum(
+        CM[k][k] * CM[l][m] - CM[k][l] * CM[m][k]
+        for k in range(K)
+        for l in range(K)
+        for m in range(K)
+    ) / (sum(
+        sum(CM[k][l] for l in range(K)) * sum(
+            sum(CM[f][g] for g in range(K))
+            for f in range(K)
+            if f != k
+        )
+        for k in range(K)
+    ) * sum(
+        sum(CM[l][k] for l in range(K)) * sum(
+            sum(CM[f][g] for f in range(K))
+            for g in range(K)
+            if g != k
+        )
+        for k in range(K)
+    )) ** 0.5
+    labels, preds = cm_batch(CM)
+    met_pcc.reset()
+    met_pcc.update(labels, preds)
+    _, pcc = met_pcc.get()
+    np.testing.assert_almost_equal(pcc, ref)
+
+    # things that should not change metric score:
+    # * order
+    # * batch size
+    # * update frequency
+    labels = [ [ i ] for i in labels[0] ]
+    labels.reverse()
+    preds = [ [ i.reshape((1, -1)) ] for i in preds[0] ]
+    preds.reverse()
+
+    met_pcc.reset()
+    for l, p in zip(labels, preds):
+        met_pcc.update(l, p)
+    assert pcc == met_pcc.get()[1]
+
 def test_single_array_input():
     pred = mx.nd.array([[1,2,3,4]])
     label = pred + 0.1
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 17618e414343..ccb351f434da 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -698,11 +698,11 @@ def test_symbol_pow():
 def test_pow_fn():
     shape = (3, 4)
     exp = mx.symbol.Variable("exp")
-    y = mx.sym.pow(2, exp)
     x = np.ones(shape)*3
-    check_numeric_gradient(y, [x], numeric_eps=1E-3)
-    check_symbolic_forward(y, [x], [2**x])
-    check_symbolic_backward(y, [x], [np.ones(shape)], [np.log(2) * 2**x])
+    for y in [mx.sym.pow(2, exp), mx.sym.power(2, exp)]:
+        check_numeric_gradient(y, [x], numeric_eps=1E-3)
+        check_symbolic_forward(y, [x], [2**x])
+        check_symbolic_backward(y, [x], [np.ones(shape)], [np.log(2) * 2**x])
 
 
 @with_seed()
@@ -6675,7 +6675,12 @@ def test_binary_math_operators():
                 lambda x, y: np.power(x, y),
                 lambda x, y: np.power(x, y - 1.) * y,
                 lambda x, y: np.power(x, y) * np.log(x),
-                0.2, 5.0, -4.0, 4.0]
+                0.2, 5.0, -4.0, 4.0],
+        'power': [lambda x, y: mx.sym.power(x, y),
+                  lambda x, y: np.power(x, y),
+                  lambda x, y: np.power(x, y - 1.) * y,
+                  lambda x, y: np.power(x, y) * np.log(x),
+                  0.2, 5.0, -4.0, 4.0]
     }
     # Loop over operators
     for name, op in binary_ops.items():
diff --git a/tools/dependencies/libtiff.sh b/tools/dependencies/libtiff.sh
index a1140d52a94c..f57099bde845 100755
--- a/tools/dependencies/libtiff.sh
+++ b/tools/dependencies/libtiff.sh
@@ -19,16 +19,16 @@
 # This script builds the static library of libtiff that can be used as dependency of mxnet/opencv.
 set -ex
 
-TIFF_VERSION="4-0-9"
+TIFF_VERSION="4.0.10"
 if [[ ! -f $DEPS_PATH/lib/libtiff.a ]]; then
     # download and build libtiff
     >&2 echo "Building libtiff..."
     download \
-        https://gitlab.com/libtiff/libtiff/-/archive/Release-v${TIFF_VERSION}/libtiff-Release-v${TIFF_VERSION}.zip \
+        https://download.osgeo.org/libtiff/tiff-${TIFF_VERSION}.zip \
         ${DEPS_PATH}/libtiff.zip
     unzip -q $DEPS_PATH/libtiff.zip -d $DEPS_PATH
     pushd .
-    cd $DEPS_PATH/libtiff-Release-v$TIFF_VERSION
+    cd $DEPS_PATH/tiff-$TIFF_VERSION
    ./configure --quiet --disable-shared --disable-jpeg --disable-zlib --disable-jbig --disable-lzma --prefix=$DEPS_PATH
     $MAKE
     $MAKE install
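
Reviewer note on the storage_profiler.h hunk above: `mem_counters_` are unsigned 64-bit profile counters (see the `ProfileCounter& operator = (uint64_t v)` context at the top of this patch), so the old unguarded `-=` would wrap around to a huge positive value whenever a free is recorded before its matching alloc (the "free first" case already flagged next to `Init()`). The patch saturates at zero instead. A minimal sketch of the difference using numpy's fixed-width unsigned integers (illustrative only, not MXNet code):

    import numpy as np

    counter = np.array([10], dtype=np.uint64)  # bytes currently attributed to a device
    size = np.array([25], dtype=np.uint64)     # a free recorded before its matching alloc

    # Unguarded subtraction wraps around for unsigned types:
    print(counter - size)   # [18446744073709551601], not -15

    # The patched logic saturates at zero instead of wrapping:
    counter = np.where(counter >= size, counter - size, 0)
    print(counter)          # [0]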
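
On the reference value computed in test_pcc above: the triple sum is Gorodkin's $R_K$ statistic, the multiclass generalization of the Matthews correlation coefficient, evaluated on a $K \times K$ confusion matrix $C$ in which $C_{kl}$ counts samples of true class $k$ predicted as class $l$:

$$
R_K = \frac{\sum_{k,l,m} \bigl( C_{kk} C_{lm} - C_{kl} C_{mk} \bigr)}{\sqrt{\sum_k t_k (s - t_k)} \, \sqrt{\sum_k p_k (s - p_k)}},
\qquad t_k = \sum_l C_{kl}, \quad p_k = \sum_l C_{lk}, \quad s = \sum_{k,l} C_{kl}.
$$

Since the numerator collapses to $c\,s - \sum_k t_k p_k$ with $c = \operatorname{tr}(C)$, and $\sum_k t_k (s - t_k) = s^2 - \sum_k t_k^2$, the test's `ref` expression can be cross-checked with a compact numpy version (a sketch with a hypothetical helper name, not part of the test suite):

    import numpy as np

    def pcc_from_confusion(cm):
        """Gorodkin's R_K (multiclass MCC) from a confusion matrix,
        where cm[i][j] counts samples of true class i predicted as class j."""
        C = np.asarray(cm, dtype=np.float64)
        t = C.sum(axis=1)   # occurrences per true class
        p = C.sum(axis=0)   # occurrences per predicted class
        s = C.sum()         # total samples
        c = np.trace(C)     # correctly classified samples
        return (c * s - t @ p) / (np.sqrt(s * s - t @ t) * np.sqrt(s * s - p @ p))

    # Same 3x3 confusion matrix as in test_pcc:
    print(pcc_from_confusion([[23, 13, 3], [7, 19, 11], [2, 5, 17]]))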