From 40beeb2a7453f262e2820ff8faa6621abf81a88d Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Thu, 6 Jun 2019 23:13:49 +0000 Subject: [PATCH 01/20] Upgrade archive utility and add back FC improvement This reverts commit 65434886f6caa7210ed3ff39cd4e950c023d8328. --- ci/docker/Dockerfile.build.ubuntu_build_cuda | 2 ++ ci/docker/install/ubuntu_ar.sh | 35 ++++++++++++++++++++ src/operator/nn/fully_connected-inl.h | 14 +++++++- src/operator/nn/fully_connected.cc | 2 -- tests/python/unittest/test_operator.py | 21 ++++++++++++ 5 files changed, 71 insertions(+), 3 deletions(-) create mode 100644 ci/docker/install/ubuntu_ar.sh diff --git a/ci/docker/Dockerfile.build.ubuntu_build_cuda b/ci/docker/Dockerfile.build.ubuntu_build_cuda index ad1a1c4558b5..5aec340f1731 100644 --- a/ci/docker/Dockerfile.build.ubuntu_build_cuda +++ b/ci/docker/Dockerfile.build.ubuntu_build_cuda @@ -43,6 +43,8 @@ COPY install/ubuntu_clang.sh /work/ RUN /work/ubuntu_clang.sh COPY install/ubuntu_mklml.sh /work/ RUN /work/ubuntu_mklml.sh +COPY install/ubuntu_ar.sh /work/ +RUN /work/ubuntu_ar.sh ENV CUDNN_VERSION=7.5.1.10 COPY install/ubuntu_cudnn.sh /work/ diff --git a/ci/docker/install/ubuntu_ar.sh b/ci/docker/install/ubuntu_ar.sh new file mode 100644 index 000000000000..e4677e675f02 --- /dev/null +++ b/ci/docker/install/ubuntu_ar.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
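+
+# This script builds binutils 2.27 from source and symlinks the resulting `ar`
+# into /usr/local/bin, so that archives larger than 4 GB can be created without
+# hitting the "File truncated" bug in older binutils
+# (see https://github.com/apache/incubator-mxnet/issues/15084).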
+ +# build and install are separated so changes to build don't invalidate +# the whole docker cache for the image + +wget https://mirror.clarkson.edu/gnu/binutils/binutils-2.27.tar.gz + +export DEBIAN_FRONTEND=noninteractive +apt-get update || true +apt-get install -y \ + wget + +mkdir /opt/binutils_install && mkdir /opt/binutils_install && mkdir /opt/binutils && cd /opt/binutils +wget -nv https://mirror.clarkson.edu/gnu/binutils/binutils-2.27.tar.gz +./configure --prefix=/opt/binutils_install --exec-prefix=/opt/binutils_other +make -j$(nproc) +make install +ln -s /opt/binutils_install/bin/ar /usr/local/bin/ar diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h index e4bb11f6bc56..44af375486fb 100644 --- a/src/operator/nn/fully_connected-inl.h +++ b/src/operator/nn/fully_connected-inl.h @@ -36,6 +36,7 @@ #include "../elemwise_op_common.h" #include "../linalg.h" #include "../../common/utils.h" +#include "../tensor/broadcast_reduce_op.h" namespace mxnet { namespace op { @@ -169,7 +170,18 @@ void FCBackward(const OpContext &ctx, const FullyConnectedParam ¶m, // gradient of bias if (!param.no_bias) { Tensor gbias = in_grad[fullc::kBias].get(s); - Assign(gbias, req[fullc::kBias], sum_rows(grad)); + TBlob grad_blob = TBlob(grad); + TBlob gbias_blob = TBlob(gbias); + mxnet::TShape x(1, 0); + mxnet::TShape small; + if (shape_assign(&gbias_blob.shape_, Shape2(param.num_hidden, 1))) { + small = gbias_blob.shape_; + } else { + small = ReduceAxesShapeImpl(grad_blob.shape_, dmlc::optional(x), true, false); + } + ReduceAxesComputeImpl(ctx, {grad_blob}, {req[fullc::kBias]}, + {in_grad[fullc::kBias]}, small); } // gradient of data // Legacy approach shown here for comparison: diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index a097357ef5a3..27f6595aee9e 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -316,11 +316,9 @@ NNVM_REGISTER_OP(_backward_FullyConnected) const FullyConnectedParam& params = nnvm::get(attrs.parsed); return params.no_bias ? 
2 : 3; }) -#if MXNET_USE_MKLDNN == 1 .set_attr("FResourceRequest", [](const NodeAttrs& n) { return std::vector{ResourceRequest::kTempSpace}; }) -#endif .set_attr("TIsBackward", true) .set_attr("FInplaceOption", [](const NodeAttrs& attrs){ return std::vector >{{1, 0}}; diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index ab33d2667fbe..e600cef3d04d 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -696,6 +696,27 @@ def test_symbol_pow(): check_symbolic_backward(test, [data_tmp, exp_tmp], [np.ones(shape)], [data_dir, exp_dir]) +@with_seed() +def test_fully_connected(): + data = mx.sym.var("data") + fc_weight = mx.sym.var("weight") + fc_bias = mx.sym.var("bias") + fc = mx.sym.FullyConnected(data=data, weight=fc_weight, bias=fc_bias, num_hidden=10, no_bias=False, name='fc') + data = mx.nd.random.uniform(shape=(5, 5, 5, 13), dtype=np.float32) + fc_weight = mx.nd.random.uniform(shape=(10, 325), dtype=np.float32) + fc_bias = mx.nd.random.uniform(shape=(10), dtype=np.float32) + fc_bias2 = mx.nd.random.uniform(shape=(10, 1), dtype=np.float32) + data_np = data.asnumpy().reshape(5, 325) + fc_weight_np = np.transpose(fc_weight.asnumpy()) + fc_bias_np = fc_bias.asnumpy() + res = np.dot(data_np, fc_weight_np) + fc_bias.asnumpy() + check_symbolic_forward(fc, {'data': data_np, 'weight': fc_weight.asnumpy(), 'bias': fc_bias_np}, {'fc_output': res}) + check_numeric_gradient(fc, {'data': data_np, 'weight': fc_weight.asnumpy(), 'bias': fc_bias_np}, + numeric_eps=1e-2, rtol=1e-4, atol=1e-2) + # TODO: Fix Bug #15032 when bias has ndim > 1 + #check_symbolic_forward(fc, {'data': data_np, 'weight': fc_weight.asnumpy(), 'bias': fc_bias2.asnumpy()}, {'fc_output': res}) + + @with_seed() def test_pow_fn(): shape = (3, 4) From 928055f04a06244969ed2a834d931d5d329ad648 Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Fri, 7 Jun 2019 00:55:06 +0000 Subject: [PATCH 02/20] Change permissions for Ubuntu AR --- ci/docker/install/ubuntu_ar.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 ci/docker/install/ubuntu_ar.sh diff --git a/ci/docker/install/ubuntu_ar.sh b/ci/docker/install/ubuntu_ar.sh old mode 100644 new mode 100755 From 2b418d3ea05add64131efb73c9402ae3497573c0 Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Fri, 7 Jun 2019 01:11:37 +0000 Subject: [PATCH 03/20] Extract and cd into binutils dir --- ci/docker/install/ubuntu_ar.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/docker/install/ubuntu_ar.sh b/ci/docker/install/ubuntu_ar.sh index e4677e675f02..5fdda02bce8e 100755 --- a/ci/docker/install/ubuntu_ar.sh +++ b/ci/docker/install/ubuntu_ar.sh @@ -29,6 +29,7 @@ apt-get install -y \ mkdir /opt/binutils_install && mkdir /opt/binutils_install && mkdir /opt/binutils && cd /opt/binutils wget -nv https://mirror.clarkson.edu/gnu/binutils/binutils-2.27.tar.gz +tar -xvf binutils-2.27.tar.gz && cd binutils-2.27 ./configure --prefix=/opt/binutils_install --exec-prefix=/opt/binutils_other make -j$(nproc) make install From b8236238fe5fbfb1941b42ba837d6aef8fa16baa Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Fri, 7 Jun 2019 04:29:56 +0000 Subject: [PATCH 04/20] Allow AR path to be chosen by user --- Makefile | 2 +- ci/docker/runtime_functions.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6fc3c3aca5f6..11803e212d64 100644 --- a/Makefile +++ b/Makefile @@ -543,7 +543,7 @@ endif # --Wl,--whole-archive -lmxnet 
--Wl,--no-whole-archive lib/libmxnet.a: $(ALLX_DEP) @mkdir -p $(@D) - ar crv $@ $(filter %.o, $?) + $(AR) crv $@ $(filter %.o, $?) lib/libmxnet.so: $(ALLX_DEP) @mkdir -p $(@D) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 1ad67280617d..7148770b342a 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -667,6 +667,7 @@ build_ubuntu_gpu_mkldnn() { USE_CUDNN=1 \ CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \ USE_SIGNAL_HANDLER=1 \ + AR=/usr/local/bin/ar \ -j$(nproc) } From d89ce85583f426c0ed0b184dd6e280d16d96e8bc Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Fri, 7 Jun 2019 05:25:15 +0000 Subject: [PATCH 05/20] Add AR path to build --- ci/docker/runtime_functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 7148770b342a..5c2b29930497 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -667,7 +667,7 @@ build_ubuntu_gpu_mkldnn() { USE_CUDNN=1 \ CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \ USE_SIGNAL_HANDLER=1 \ - AR=/usr/local/bin/ar \ + AR=/opt/binutils_install/bin/ar \ -j$(nproc) } From f28bcfa2359a8526bf8c72f4823c6e65c8c51753 Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Fri, 7 Jun 2019 19:39:16 +0000 Subject: [PATCH 06/20] Fix AR paths --- ci/docker/install/ubuntu_ar.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/docker/install/ubuntu_ar.sh b/ci/docker/install/ubuntu_ar.sh index 5fdda02bce8e..58836d243048 100755 --- a/ci/docker/install/ubuntu_ar.sh +++ b/ci/docker/install/ubuntu_ar.sh @@ -30,7 +30,7 @@ apt-get install -y \ mkdir /opt/binutils_install && mkdir /opt/binutils_install && mkdir /opt/binutils && cd /opt/binutils wget -nv https://mirror.clarkson.edu/gnu/binutils/binutils-2.27.tar.gz tar -xvf binutils-2.27.tar.gz && cd binutils-2.27 -./configure --prefix=/opt/binutils_install --exec-prefix=/opt/binutils_other +./configure --prefix=/opt/binutils_other --exec-prefix=/opt/binutils_install make -j$(nproc) make install ln -s /opt/binutils_install/bin/ar /usr/local/bin/ar From 7c0e25b7a4eafa3b66f7e0e77f33db910a4e2de5 Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Fri, 7 Jun 2019 19:44:44 +0000 Subject: [PATCH 07/20] Revert AR flag in makefile --- Makefile | 2 +- ci/docker/runtime_functions.sh | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 11803e212d64..6fc3c3aca5f6 100644 --- a/Makefile +++ b/Makefile @@ -543,7 +543,7 @@ endif # --Wl,--whole-archive -lmxnet --Wl,--no-whole-archive lib/libmxnet.a: $(ALLX_DEP) @mkdir -p $(@D) - $(AR) crv $@ $(filter %.o, $?) + ar crv $@ $(filter %.o, $?) 
 lib/libmxnet.so: $(ALLX_DEP)
 	@mkdir -p $(@D)

diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 5c2b29930497..1ad67280617d 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -667,7 +667,6 @@ build_ubuntu_gpu_mkldnn() {
     USE_CUDNN=1 \
     CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
     USE_SIGNAL_HANDLER=1 \
-    AR=/opt/binutils_install/bin/ar \
    -j$(nproc)
 }

From 849c51307197f19606bc198637a43c7d7948d879 Mon Sep 17 00:00:00 2001
From: Anirudh Subramanian
Date: Fri, 7 Jun 2019 21:43:55 +0000
Subject: [PATCH 08/20] Build from source doc updated

---
 docs/install/build_from_source.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/install/build_from_source.md b/docs/install/build_from_source.md
index 7b00b03abefe..8e18f3d9635c 100644
--- a/docs/install/build_from_source.md
+++ b/docs/install/build_from_source.md
@@ -180,6 +180,8 @@ More information on turning these features on or off are found in the following
 There is a configuration file for make, [`make/config.mk`](https://github.com/apache/incubator-mxnet/blob/master/make/config.mk), that contains all the compilation options. You can edit it and then run `make` or `cmake`. `cmake` is recommended for building MXNet (and is required to build with MKLDNN), however you may use `make` instead. For building with Java/Scala/Clojure, only `make` is supported.

+**NOTE:** With certain combinations of build flags, the MXNet static archive grows beyond 4 GB. The `ar` archive utility that the MXNet build uses internally hits a known bug ("File truncated": [bug report](https://sourceware.org/bugzilla/show_bug.cgi?id=14625)) for archives larger than 4 GB. Please use `ar` version 2.27 or newer to avoid this bug. See https://github.com/apache/incubator-mxnet/issues/15084 for more details.
+
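+You can check which `ar` the build will pick up with `ar --version`. The MXNet CI images, for example, build binutils 2.27 from source (`ci/docker/install/ubuntu_ar.sh`) and symlink the resulting `ar` into `/usr/local/bin` so that it takes precedence over the distribution's default.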
## Build MXNet From 9850ea4ed1d9c6a84161e3be809f7538f61868a3 Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Wed, 12 Jun 2019 19:04:09 +0000 Subject: [PATCH 09/20] Commit for C Predict API --- src/c_api/c_predict_api.cc | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc index 7de23ef935ef..5f29415b1b61 100644 --- a/src/c_api/c_predict_api.cc +++ b/src/c_api/c_predict_api.cc @@ -135,6 +135,7 @@ int _CreatePartialOut(const char* symbol_json_str, // load the parameters std::unordered_map arg_params, aux_params; + std::unordered_map arg_types, aux_types; { std::unordered_set arg_names, aux_names; std::vector arg_names_vec = sym.ListInputNames(Symbol::kReadOnlyArgs); @@ -156,15 +157,19 @@ int _CreatePartialOut(const char* symbol_json_str, std::string name(names[i].c_str() + 4); if (aux_names.count(name) != 0) { aux_params[name] = data[i]; + aux_types[name] = data[i].dtype(); } } if (!strncmp(names[i].c_str(), "arg:", 4)) { std::string name(names[i].c_str() + 4); if (arg_names.count(name) != 0) { arg_params[name] = data[i]; + arg_types[name] = data[i].dtype(); } } } + + } // shape inference and bind @@ -179,6 +184,7 @@ int _CreatePartialOut(const char* symbol_json_str, mxnet::ShapeVector out_shapes(sym.ListOutputNames().size()); mxnet::ShapeVector aux_shapes(aux_names.size()); mxnet::ShapeVector arg_shapes; + nnvm::DTypeVector result_arg_types, result_out_types, result_aux_types; std::unordered_map key2arg; for (size_t i = 0; i < arg_names.size(); ++i) { std::string key = arg_names[i]; @@ -187,6 +193,7 @@ int _CreatePartialOut(const char* symbol_json_str, try { mxnet::ShapeVector in_shapes; + nnvm::DTypeVector in_types; for (std::string key : sym.ListInputNames(Symbol::kAll)) { if (known_shape.count(key) != 0) { in_shapes.push_back(known_shape[key]); @@ -194,14 +201,29 @@ int _CreatePartialOut(const char* symbol_json_str, in_shapes.emplace_back(); } } + + for (std::string key : sym.ListInputNames(Symbol::kAll)) { + if (arg_types.count(key) != 0) { + in_types.push_back(arg_types[key]); + } else if (aux_types.count(key) != 0) { + in_types.push_back(aux_types[key]); + } + } nnvm::Graph g; g.outputs = sym.outputs; g = mxnet::exec::InferShape(std::move(g), std::move(in_shapes), "__shape__"); + g = mxnet::exec::InferType(std::move(g), std::move(in_types, "__dtype__"); bool infer_complete = (g.GetAttr("shape_num_unknown_nodes") == 0); + bool infer_type_complete = (g.GetAttr("dtype_num_unknown_nodes") == 0); CHECK(infer_complete) << "The shape information of is not enough to get the shapes"; + CHECK(infer_type_complete) + << "The infer type information is not enough to get the types"; CopyAttr(g.indexed_graph(), g.GetAttr("shape"), &arg_shapes, &out_shapes, &aux_shapes); + CopyAttr(g.indexed_graph(), + g.GetAttr("dtype"), + &result_arg_types, &result_out_types, &result_aux_types); } catch (const mxnet::op::InferShapeError &err) { throw dmlc::Error(err.msg); } @@ -210,14 +232,14 @@ int _CreatePartialOut(const char* symbol_json_str, std::vector arg_arrays, aux_arrays; for (size_t i = 0; i < arg_shapes.size(); ++i) { - NDArray nd = NDArray(arg_shapes[i], ctx); + NDArray nd = NDArray(arg_shapes[i], ctx, false, result_arg_types[i]); if (arg_params.count(arg_names[i]) != 0) { CopyFromTo(arg_params[arg_names[i]], &nd); } arg_arrays.push_back(nd); } for (size_t i = 0; i < aux_shapes.size(); ++i) { - NDArray nd = NDArray(aux_shapes[i], ctx); + NDArray nd = NDArray(aux_shapes[i], ctx, false, 
result_aux_types[i]); if (aux_params.count(aux_names[i]) != 0) { CopyFromTo(aux_params[aux_names[i]], &nd); } From 41f5c866e0867149e672ebff8846d216532aa877 Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Fri, 14 Jun 2019 02:00:09 +0000 Subject: [PATCH 10/20] Add FP16 predict support --- amalgamation/python/mxnet_predict.py | 88 ++++++++++++++++++++++- include/mxnet/c_predict_api.h | 50 +++++++++++++ src/c_api/c_predict_api.cc | 96 ++++++++++++++++++++++--- tests/python/unittest/test_predictor.py | 30 ++++++++ 4 files changed, 252 insertions(+), 12 deletions(-) diff --git a/amalgamation/python/mxnet_predict.py b/amalgamation/python/mxnet_predict.py index a91d3849b0d2..4940cf83a9eb 100644 --- a/amalgamation/python/mxnet_predict.py +++ b/amalgamation/python/mxnet_predict.py @@ -25,17 +25,63 @@ import os import sys +from array import array import ctypes import logging import numpy as np +# pylint: disable= no-member +_DTYPE_NP_TO_MX = { + None: -1, + np.float32: 0, + np.float64: 1, + np.float16: 2, + np.uint8: 3, + np.int32: 4, + np.int8: 5, + np.int64: 6, +} + __all__ = ["Predictor", "load_ndarray_file"] if sys.version_info[0] == 3: py_str = lambda x: x.decode('utf-8') + + def c_str_array(strings): + """Create ctypes const char ** from a list of Python strings. + + Parameters + ---------- + strings : list of strings + Python strings. + + Returns + ------- + (ctypes.c_char_p * len(strings)) + A const char ** pointer that can be passed to C API. + """ + arr = (ctypes.c_char_p * len(strings))() + arr[:] = strings + return arr else: py_str = lambda x: x + def c_str_array(strings): + """Create ctypes const char ** from a list of Python strings. + + Parameters + ---------- + strings : list of string + Python strings. + + Returns + ------- + (ctypes.c_char_p * len(strings)) + A const char ** pointer that can be passed to C API. + """ + arr = (ctypes.c_char_p * len(strings))() + arr[:] = [s.encode('utf-8') for s in strings] + return arr def c_str(string): """"Convert a python string to C string.""" @@ -48,6 +94,11 @@ def c_array(ctype, values): """Create ctypes array from a python array.""" return (ctype * len(values))(*values) +def c_array_buf(ctype, buf): + """Create ctypes array from a Python buffer.""" + return (ctype * len(buf)).from_buffer(buf) + + def _find_lib_path(): """Find mxnet library.""" @@ -76,6 +127,7 @@ def _find_lib_path(): def _load_lib(): """Load libary by searching possible path.""" lib_path = _find_lib_path() + print(lib_path) lib = ctypes.cdll.LoadLibrary(lib_path[0]) # DMatrix functions lib.MXGetLastError.restype = ctypes.c_char_p @@ -90,6 +142,7 @@ def _check_call(ret): _LIB = _load_lib() # type definitions mx_uint = ctypes.c_uint +mx_int = ctypes.c_int mx_float = ctypes.c_float mx_float_p = ctypes.POINTER(mx_float) PredictorHandle = ctypes.c_void_p @@ -116,10 +169,13 @@ class Predictor(object): dev_id : int, optional The device id of the predictor. 
+ + type_dict : Dict of str->numpy.dtype + Input type dictionary, name->dtype """ def __init__(self, symbol_file, param_raw_bytes, input_shapes, - dev_type="cpu", dev_id=0): + dev_type="cpu", dev_id=0, type_dict=None): dev_type = devstr2type[dev_type] indptr = [0] sdata = [] @@ -133,7 +189,26 @@ def __init__(self, symbol_file, handle = PredictorHandle() param_raw_bytes = bytearray(param_raw_bytes) ptr = (ctypes.c_char * len(param_raw_bytes)).from_buffer(param_raw_bytes) - _check_call(_LIB.MXPredCreate( + + # data types + num_provided_arg_types = 0 + # provided type argument names + provided_arg_type_names = ctypes.POINTER(ctypes.c_char_p)() + # provided types + provided_arg_type_data = ctypes.POINTER(mx_uint)() + if type_dict is not None: + provided_arg_type_names = [] + provided_arg_type_data = [] + for k, v in type_dict.items(): + v = np.dtype(v).type + if v in _DTYPE_NP_TO_MX: + provided_arg_type_names.append(k) + provided_arg_type_data.append(_DTYPE_NP_TO_MX[v]) + num_provided_arg_types = mx_uint(len(provided_arg_type_names)) + provided_arg_type_names = c_str_array(provided_arg_type_names) + provided_arg_type_data = c_array_buf(ctypes.c_int, array('i', provided_arg_type_data)) + + _check_call(_LIB.MXPredCreateEx( c_str(symbol_file), ptr, len(param_raw_bytes), ctypes.c_int(dev_type), ctypes.c_int(dev_id), @@ -141,6 +216,9 @@ def __init__(self, symbol_file, c_array(ctypes.c_char_p, keys), c_array(mx_uint, indptr), c_array(mx_uint, sdata), + num_provided_arg_types, + provided_arg_type_names, + provided_arg_type_data, ctypes.byref(handle))) self.handle = handle @@ -218,12 +296,16 @@ def get_output(self, index): """ pdata = ctypes.POINTER(mx_uint)() ndim = mx_uint() + out_type = mx_int() _check_call(_LIB.MXPredGetOutputShape( self.handle, index, ctypes.byref(pdata), ctypes.byref(ndim))) + _check_call(_LIB.MXPredGetOutputType( + self.handle, index, + ctypes.byref(out_type))) shape = tuple(pdata[:ndim.value]) - data = np.empty(shape, dtype=np.float32) + data = np.empty(shape, dtype=out_type.value) _check_call(_LIB.MXPredGetOutput( self.handle, mx_uint(index), data.ctypes.data_as(mx_float_p), diff --git a/include/mxnet/c_predict_api.h b/include/mxnet/c_predict_api.h index ecbbf8dfc819..663e820fb535 100644 --- a/include/mxnet/c_predict_api.h +++ b/include/mxnet/c_predict_api.h @@ -85,6 +85,44 @@ MXNET_DLL int MXPredCreate(const char* symbol_json_str, const mx_uint* input_shape_data, PredictorHandle* out); +/*! + * \brief create a predictor + * \param symbol_json_str The JSON string of the symbol. + * \param param_bytes The in-memory raw bytes of parameter ndarray file. + * \param param_size The size of parameter ndarray file. + * \param dev_type The device type, 1: cpu, 2: gpu + * \param dev_id The device id of the predictor. + * \param num_input_nodes Number of input nodes to the net. + * For feedforward net, this is 1. + * \param input_keys The name of the input argument. + * For feedforward net, this is {"data"} + * \param input_shape_indptr Index pointer of shapes of each input node. + * The length of this array = num_input_nodes + 1. + * For feedforward net that takes 4 dimensional input, this is {0, 4}. + * \param input_shape_data A flattened data of shapes of each input node. + * For feedforward net that takes 4 dimensional input, this is the shape data. + * \param num_provided_arg_dtypes + * The length of provided_arg_dtypes. + * \param provided_arg_dtype_names + * The provided_arg_dtype_names the names of args for which dtypes are provided. 
+ * \param provided_arg_dtypes + * The provided_arg_dtypes the dtype provided + * \param out The created predictor handle. + * \return 0 when success, -1 when failure. + */ +MXNET_DLL int MXPredCreateEx(const char* symbol_json_str, + const void* param_bytes, + int param_size, + int dev_type, int dev_id, + mx_uint num_input_nodes, + const char** input_keys, + const mx_uint* input_shape_indptr, + const mx_uint* input_shape_data, + const mx_uint num_provided_arg_dtypes, + const char** provided_arg_dtype_names, + const int* provided_arg_dtypes, + PredictorHandle* out); + /*! * \brief create a predictor wich customized outputs * \param symbol_json_str The JSON string of the symbol. @@ -186,6 +224,18 @@ MXNET_DLL int MXPredGetOutputShape(PredictorHandle handle, mx_uint index, mx_uint** shape_data, mx_uint* shape_ndim); + +/*! + * \brief Get the dtype of output node. + * The returned data type is only valid before next call to MXPred function. + * \param handle The handle of the predictor. + * \param out_index The index of the output node, set to 0 if there is only one output. + * \param out_dtype The dtype of the output node + */ +MXNET_DLL int MXPredGetOutputType(PredictorHandle handle, + mx_uint out_index, + const int* out_dtype); + /*! * \brief Set the input data of predictor. * \param handle The predictor handle. diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc index 5f29415b1b61..3c9165c1e94d 100644 --- a/src/c_api/c_predict_api.cc +++ b/src/c_api/c_predict_api.cc @@ -47,6 +47,9 @@ struct MXAPIPredictor { std::vector aux_arrays; // output shapes mxnet::ShapeVector out_shapes; + // output types + nnvm::DTypeVector out_dtypes; + // uint32_t buffer for output shapes std::vector out_shapes_buffer; // key to arguments @@ -97,6 +100,9 @@ int _CreatePartialOut(const char* symbol_json_str, // This is used for parallel inference. 
int num_threads, bool lazy, + const mx_uint num_provided_arg_dtypes, + const char** provided_arg_dtype_names, + const int* provided_arg_dtypes, PredictorHandle* out) { using nnvm::Symbol; @@ -169,7 +175,14 @@ int _CreatePartialOut(const char* symbol_json_str, } } - + if (num_provided_arg_dtypes > 0) { + for (mx_uint i = 0; i < num_provided_arg_dtypes; ++i) { + if (aux_types.count(provided_arg_dtype_names[i]) == 0 && + arg_types.count(provided_arg_dtype_names[i]) == 0) { + arg_types[provided_arg_dtype_names[i]] = provided_arg_dtypes[i]; + } + } + } } // shape inference and bind @@ -211,13 +224,10 @@ int _CreatePartialOut(const char* symbol_json_str, } nnvm::Graph g; g.outputs = sym.outputs; g = mxnet::exec::InferShape(std::move(g), std::move(in_shapes), "__shape__"); - g = mxnet::exec::InferType(std::move(g), std::move(in_types, "__dtype__"); + g = mxnet::exec::InferType(std::move(g), std::move(in_types), "__dtype__"); bool infer_complete = (g.GetAttr("shape_num_unknown_nodes") == 0); - bool infer_type_complete = (g.GetAttr("dtype_num_unknown_nodes") == 0); CHECK(infer_complete) << "The shape information of is not enough to get the shapes"; - CHECK(infer_type_complete) - << "The infer type information is not enough to get the types"; CopyAttr(g.indexed_graph(), g.GetAttr("shape"), &arg_shapes, &out_shapes, &aux_shapes); @@ -232,19 +242,31 @@ int _CreatePartialOut(const char* symbol_json_str, std::vector arg_arrays, aux_arrays; for (size_t i = 0; i < arg_shapes.size(); ++i) { - NDArray nd = NDArray(arg_shapes[i], ctx, false, result_arg_types[i]); + NDArray nd; + if (result_arg_types[i] != -1) { + nd = NDArray(arg_shapes[i], ctx, false, result_arg_types[i]); + } else { + nd = NDArray(arg_shapes[i], ctx); + } if (arg_params.count(arg_names[i]) != 0) { CopyFromTo(arg_params[arg_names[i]], &nd); } arg_arrays.push_back(nd); } + for (size_t i = 0; i < aux_shapes.size(); ++i) { - NDArray nd = NDArray(aux_shapes[i], ctx, false, result_aux_types[i]); + NDArray nd; + if (result_aux_types[i] != -1) { + nd = NDArray(aux_shapes[i], ctx, false, result_aux_types[i]); + } else { + nd = NDArray(aux_shapes[i], ctx); + } if (aux_params.count(aux_names[i]) != 0) { CopyFromTo(aux_params[aux_names[i]], &nd); } aux_arrays.push_back(nd); } + // bind for (int i = 0; i < num_threads; i++) { std::unique_ptr ret(new MXAPIPredictor()); @@ -254,6 +276,7 @@ int _CreatePartialOut(const char* symbol_json_str, ret->arg_arrays = arg_arrays; ret->aux_arrays = aux_arrays; ret->out_shapes = out_shapes; + ret->out_dtypes = result_out_types; if (!lazy) { std::map ctx_map; @@ -294,6 +317,9 @@ int MXPredCreatePartialOut(const char* symbol_json_str, output_keys, 1, false, + 0, + nullptr, + nullptr, out); } @@ -317,9 +343,44 @@ int MXPredCreate(const char* symbol_json_str, input_shape_indptr, input_shape_data, 0, - NULL, + nullptr, 1, false, + 0, + nullptr, + nullptr, + out); +} + +int MXPredCreateEx(const char* symbol_json_str, + const void* param_bytes, + int param_size, + int dev_type, int dev_id, + mx_uint num_input_nodes, + const char** input_keys, + const mx_uint* input_shape_indptr, + const mx_uint* input_shape_data, + const mx_uint num_provided_arg_dtypes, + const char** provided_arg_dtype_names, + const int* provided_arg_dtypes, + PredictorHandle* out) { + return _CreatePartialOut( + symbol_json_str, + param_bytes, + param_size, + dev_type, + dev_id, + num_input_nodes, + input_keys, + input_shape_indptr, + input_shape_data, + 0, + nullptr, + 1, + false, + num_provided_arg_dtypes, + provided_arg_dtype_names, + 
provided_arg_dtypes, out); } @@ -352,9 +413,12 @@ int MXPredCreateMultiThread(const char* symbol_json_str, input_shape_indptr, input_shape_data, 0, - NULL, + nullptr, num_threads, true, + 0, + nullptr, + nullptr, out); } @@ -466,6 +530,20 @@ int MXPredGetOutputShape(PredictorHandle handle, API_END(); } +int MXPredGetOutputType(PredictorHandle handle, + mx_uint out_index, + int* out_dtype) { + MXAPIPredictor* p = static_cast(handle); + API_BEGIN(); + CHECK_LT(out_index, p->out_arrays.size()) + << "Index exceed number of outputs"; + + const int s = p->out_dtypes[out_index]; + CHECK_GE(s, 0); + out_dtype[out_index] = s; + API_END(); +} + int MXPredSetInput(PredictorHandle handle, const char* key, const mx_float* data, diff --git a/tests/python/unittest/test_predictor.py b/tests/python/unittest/test_predictor.py index fc2fbf600cbc..a351f4f66520 100644 --- a/tests/python/unittest/test_predictor.py +++ b/tests/python/unittest/test_predictor.py @@ -81,6 +81,36 @@ def test_load_ndarray(): for k in nd_data.keys(): assert_almost_equal(nd_data[k].asnumpy(), nd_load[k], rtol=1e-5, atol=1e-6) +@with_seed() +def test_predictor(): + prefix = 'test_predictor_simple_dense' + symbol_file = "%s-symbol.json" % prefix + param_file = "%s-0000.params" % prefix + + input1 = np.random.uniform(size=(1, 3)) + input1 = input1.astype(np.float16) + + block = mx.gluon.nn.HybridSequential() + block.add(mx.gluon.nn.Dense(7)) + block.add(mx.gluon.nn.Dense(3)) + block.cast(np.float16) + block.hybridize() + block.initialize(ctx=mx.gpu(0)) + tmp = mx.nd.array(input1, dtype=np.float16, ctx=mx.gpu(0)) + out1 = block.forward(tmp) + block.export(prefix) + + predictor = Predictor(open(symbol_file, "r").read(), + open(param_file, "rb").read(), + {"data": input1.shape}, + dev_type="gpu", + dev_id=0, + type_dict={"data": input1.dtype}) + predictor.forward(data=input1) + predictor_out1 = predictor.get_output(0) + + assert_almost_equal(out1.asnumpy(), predictor_out1, rtol=1e-5, atol=1e-6) + if __name__ == '__main__': import nose From 14ff1f75ac55162f62ebb31ba292e3566b2adf00 Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Thu, 20 Jun 2019 09:08:40 +0000 Subject: [PATCH 11/20] Add Test Predictor fixes --- amalgamation/python/mxnet_predict.py | 31 +++++++++++++++++++++---- include/mxnet/c_predict_api.h | 2 +- tests/python/unittest/test_predictor.py | 2 +- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/amalgamation/python/mxnet_predict.py b/amalgamation/python/mxnet_predict.py index 4940cf83a9eb..6a605c519bc8 100644 --- a/amalgamation/python/mxnet_predict.py +++ b/amalgamation/python/mxnet_predict.py @@ -42,6 +42,17 @@ np.int64: 6, } +_DTYPE_MX_TO_NP = { + -1: None, + 0: np.float32, + 1: np.float64, + 2: np.float16, + 3: np.uint8, + 4: np.int32, + 5: np.int8, + 6: np.int64, +} + __all__ = ["Predictor", "load_ndarray_file"] if sys.version_info[0] == 3: @@ -220,6 +231,7 @@ def __init__(self, symbol_file, provided_arg_type_names, provided_arg_type_data, ctypes.byref(handle))) + self.type_dict = type_dict self.handle = handle def __del__(self): @@ -238,13 +250,21 @@ def forward(self, **kwargs): >>> predictor.forward(data=mydata) >>> out = predictor.get_output(0) """ + if self.type_dict and len(self.type_dict) != len(kwargs.items()): + raise ValueError("number of kwargs should be same as len of type_dict" \ + "Please check your forward pass inputs" \ + "or type_dict passed to Predictor instantiation") + for k, v in kwargs.items(): if not isinstance(v, np.ndarray): raise ValueError("Expect numpy ndarray as input") - v = 
np.asarray(v, dtype=np.float32, order='C') + if k in self.type_dict: + v = np.asarray(v, dtype=self.type_dict[k], order='C') + else: + v = np.asarray(v, dtype=np.float32, order='C') _check_call(_LIB.MXPredSetInput( self.handle, c_str(k), - v.ctypes.data_as(mx_float_p), + v.ctypes.data_as(ctypes.c_void_p), mx_uint(v.size))) _check_call(_LIB.MXPredForward(self.handle)) @@ -305,10 +325,10 @@ def get_output(self, index): self.handle, index, ctypes.byref(out_type))) shape = tuple(pdata[:ndim.value]) - data = np.empty(shape, dtype=out_type.value) + data = np.empty(shape, dtype=_DTYPE_MX_TO_NP[out_type.value]) _check_call(_LIB.MXPredGetOutput( self.handle, mx_uint(index), - data.ctypes.data_as(mx_float_p), + data.ctypes.data_as(ctypes.c_void_p), mx_uint(data.size))) return data @@ -355,4 +375,5 @@ def load_ndarray_file(nd_bytes): if len(keys) == 0 or len(keys[0]) == 0: return arrs else: - return {keys[i] : arrs[i] for i in range(len(keys))} + return {keys[i] : arrs[i] for i in range(len(keys)) + } diff --git a/include/mxnet/c_predict_api.h b/include/mxnet/c_predict_api.h index 663e820fb535..7051d4edac82 100644 --- a/include/mxnet/c_predict_api.h +++ b/include/mxnet/c_predict_api.h @@ -234,7 +234,7 @@ MXNET_DLL int MXPredGetOutputShape(PredictorHandle handle, */ MXNET_DLL int MXPredGetOutputType(PredictorHandle handle, mx_uint out_index, - const int* out_dtype); + int* out_dtype); /*! * \brief Set the input data of predictor. diff --git a/tests/python/unittest/test_predictor.py b/tests/python/unittest/test_predictor.py index a351f4f66520..b0529d805f4c 100644 --- a/tests/python/unittest/test_predictor.py +++ b/tests/python/unittest/test_predictor.py @@ -82,7 +82,7 @@ def test_load_ndarray(): assert_almost_equal(nd_data[k].asnumpy(), nd_load[k], rtol=1e-5, atol=1e-6) @with_seed() -def test_predictor(): +def test_predictor_fp16(): prefix = 'test_predictor_simple_dense' symbol_file = "%s-symbol.json" % prefix param_file = "%s-0000.params" % prefix From 9985decfccc13aa898a9d895262d68fefc7b055b Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Thu, 20 Jun 2019 16:40:51 +0000 Subject: [PATCH 12/20] Add test for predictor --- tests/python/unittest/test_predictor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/python/unittest/test_predictor.py b/tests/python/unittest/test_predictor.py index b0529d805f4c..ec12df310097 100644 --- a/tests/python/unittest/test_predictor.py +++ b/tests/python/unittest/test_predictor.py @@ -108,6 +108,8 @@ def test_predictor_fp16(): type_dict={"data": input1.dtype}) predictor.forward(data=input1) predictor_out1 = predictor.get_output(0) + assert out1.asnumpy().dtype == predictor_out1.dtype, \ + "Dtypes of output from C predict API doesnt match with gluon" assert_almost_equal(out1.asnumpy(), predictor_out1, rtol=1e-5, atol=1e-6) From ac9c81c7fd5682a77f0781caccf05ac78d1ca491 Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Thu, 20 Jun 2019 20:12:25 +0000 Subject: [PATCH 13/20] Cleanup fixes --- amalgamation/python/mxnet_predict.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/amalgamation/python/mxnet_predict.py b/amalgamation/python/mxnet_predict.py index 6a605c519bc8..9ba46e79df41 100644 --- a/amalgamation/python/mxnet_predict.py +++ b/amalgamation/python/mxnet_predict.py @@ -138,7 +138,6 @@ def _find_lib_path(): def _load_lib(): """Load libary by searching possible path.""" lib_path = _find_lib_path() - print(lib_path) lib = ctypes.cdll.LoadLibrary(lib_path[0]) # DMatrix functions lib.MXGetLastError.restype = ctypes.c_char_p 
@@ -264,7 +263,7 @@ def forward(self, **kwargs): v = np.asarray(v, dtype=np.float32, order='C') _check_call(_LIB.MXPredSetInput( self.handle, c_str(k), - v.ctypes.data_as(ctypes.c_void_p), + v.ctypes.data_as(mx_float_p), mx_uint(v.size))) _check_call(_LIB.MXPredForward(self.handle)) @@ -328,7 +327,7 @@ def get_output(self, index): data = np.empty(shape, dtype=_DTYPE_MX_TO_NP[out_type.value]) _check_call(_LIB.MXPredGetOutput( self.handle, mx_uint(index), - data.ctypes.data_as(ctypes.c_void_p), + data.ctypes.data_as(mx_float_p), mx_uint(data.size))) return data From 15964becd8329bab860344c0a17b3bdaffd20a42 Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Fri, 28 Jun 2019 18:33:22 +0000 Subject: [PATCH 14/20] Fixes --- amalgamation/python/mxnet_predict.py | 2 +- src/c_api/c_predict_api.cc | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/amalgamation/python/mxnet_predict.py b/amalgamation/python/mxnet_predict.py index 9ba46e79df41..3a15a4a8b2a8 100644 --- a/amalgamation/python/mxnet_predict.py +++ b/amalgamation/python/mxnet_predict.py @@ -257,7 +257,7 @@ def forward(self, **kwargs): for k, v in kwargs.items(): if not isinstance(v, np.ndarray): raise ValueError("Expect numpy ndarray as input") - if k in self.type_dict: + if self.type_dict and k in self.type_dict: v = np.asarray(v, dtype=self.type_dict[k], order='C') else: v = np.asarray(v, dtype=np.float32, order='C') diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc index 3c9165c1e94d..eb085f4bcc1e 100644 --- a/src/c_api/c_predict_api.cc +++ b/src/c_api/c_predict_api.cc @@ -226,8 +226,15 @@ int _CreatePartialOut(const char* symbol_json_str, g = mxnet::exec::InferShape(std::move(g), std::move(in_shapes), "__shape__"); g = mxnet::exec::InferType(std::move(g), std::move(in_types), "__dtype__"); bool infer_complete = (g.GetAttr("shape_num_unknown_nodes") == 0); + // This is tricky for AMP Use case, for example, with only weights input types + // cannot be inferred in AMP. 
Thus for AMP converted model type_dict will be + // required + bool infer_type_complete = (g.GetAttr("dtype_num_unknown_nodes") == 0); CHECK(infer_complete) << "The shape information of is not enough to get the shapes"; + CHECK(infer_type_complete) + << "The type information is not enough, please provide input arg_types " + "with provided_arg_dtype_names and provided_arg_dtypes"; CopyAttr(g.indexed_graph(), g.GetAttr("shape"), &arg_shapes, &out_shapes, &aux_shapes); From f7a9058e53bbfde1d2d2f220bd6e78656114f737 Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Fri, 28 Jun 2019 22:41:43 +0000 Subject: [PATCH 15/20] Add support for forward pass only for gpu --- tests/python/unittest/test_predictor.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/python/unittest/test_predictor.py b/tests/python/unittest/test_predictor.py index ec12df310097..cd6a976096c2 100644 --- a/tests/python/unittest/test_predictor.py +++ b/tests/python/unittest/test_predictor.py @@ -95,8 +95,8 @@ def test_predictor_fp16(): block.add(mx.gluon.nn.Dense(3)) block.cast(np.float16) block.hybridize() - block.initialize(ctx=mx.gpu(0)) - tmp = mx.nd.array(input1, dtype=np.float16, ctx=mx.gpu(0)) + block.initialize(ctx=mx.current_context()) + tmp = mx.nd.array(input1, dtype=np.float16, ctx=mx.current_context()) out1 = block.forward(tmp) block.export(prefix) @@ -106,12 +106,13 @@ def test_predictor_fp16(): dev_type="gpu", dev_id=0, type_dict={"data": input1.dtype}) - predictor.forward(data=input1) - predictor_out1 = predictor.get_output(0) - assert out1.asnumpy().dtype == predictor_out1.dtype, \ - "Dtypes of output from C predict API doesnt match with gluon" + if ctx.current_context().dev_type == "gpu": + predictor.forward(data=input1) + predictor_out1 = predictor.get_output(0) + assert out1.asnumpy().dtype == predictor_out1.dtype, \ + "Dtypes of output from C predict API doesnt match with gluon" - assert_almost_equal(out1.asnumpy(), predictor_out1, rtol=1e-5, atol=1e-6) + assert_almost_equal(out1.asnumpy(), predictor_out1, rtol=1e-5, atol=1e-6) if __name__ == '__main__': From d6f0ceae991e9c62fa1af723ec28928001fc6f1d Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Sat, 29 Jun 2019 01:52:59 +0000 Subject: [PATCH 16/20] Fix Reshape, move test to gpu --- src/c_api/c_predict_api.cc | 1 + tests/python/unittest/test_predictor.py | 33 ------------------------- 2 files changed, 1 insertion(+), 33 deletions(-) diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc index eb085f4bcc1e..7a7e2580e559 100644 --- a/src/c_api/c_predict_api.cc +++ b/src/c_api/c_predict_api.cc @@ -514,6 +514,7 @@ int MXPredReshape(mx_uint num_input_nodes, p->exec.get())); ret->out_shapes = out_shapes; ret->out_arrays = ret->exec->outputs(); + ret->out_dtypes = p->out_dtypes; } *out = ret.release(); API_END(); diff --git a/tests/python/unittest/test_predictor.py b/tests/python/unittest/test_predictor.py index cd6a976096c2..fc2fbf600cbc 100644 --- a/tests/python/unittest/test_predictor.py +++ b/tests/python/unittest/test_predictor.py @@ -81,39 +81,6 @@ def test_load_ndarray(): for k in nd_data.keys(): assert_almost_equal(nd_data[k].asnumpy(), nd_load[k], rtol=1e-5, atol=1e-6) -@with_seed() -def test_predictor_fp16(): - prefix = 'test_predictor_simple_dense' - symbol_file = "%s-symbol.json" % prefix - param_file = "%s-0000.params" % prefix - - input1 = np.random.uniform(size=(1, 3)) - input1 = input1.astype(np.float16) - - block = mx.gluon.nn.HybridSequential() - block.add(mx.gluon.nn.Dense(7)) - 
block.add(mx.gluon.nn.Dense(3)) - block.cast(np.float16) - block.hybridize() - block.initialize(ctx=mx.current_context()) - tmp = mx.nd.array(input1, dtype=np.float16, ctx=mx.current_context()) - out1 = block.forward(tmp) - block.export(prefix) - - predictor = Predictor(open(symbol_file, "r").read(), - open(param_file, "rb").read(), - {"data": input1.shape}, - dev_type="gpu", - dev_id=0, - type_dict={"data": input1.dtype}) - if ctx.current_context().dev_type == "gpu": - predictor.forward(data=input1) - predictor_out1 = predictor.get_output(0) - assert out1.asnumpy().dtype == predictor_out1.dtype, \ - "Dtypes of output from C predict API doesnt match with gluon" - - assert_almost_equal(out1.asnumpy(), predictor_out1, rtol=1e-5, atol=1e-6) - if __name__ == '__main__': import nose From a14d2c71f45507ac92259f93a3876bd6f8b1eabe Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Fri, 5 Jul 2019 23:01:31 +0000 Subject: [PATCH 17/20] Add monitor callback for C Predict API --- amalgamation/python/mxnet_predict.py | 16 ++++++++++++++++ include/mxnet/c_predict_api.h | 15 +++++++++++++++ src/c_api/c_predict_api.cc | 22 +++++++++++++++++++++- 3 files changed, 52 insertions(+), 1 deletion(-) diff --git a/amalgamation/python/mxnet_predict.py b/amalgamation/python/mxnet_predict.py index 3a15a4a8b2a8..b64c69042fce 100644 --- a/amalgamation/python/mxnet_predict.py +++ b/amalgamation/python/mxnet_predict.py @@ -149,6 +149,14 @@ def _check_call(ret): if ret != 0: raise RuntimeError(py_str(_LIB.MXGetLastError())) + +def _monitor_callback_wrapper(callback): + """A wrapper for the user-defined handle.""" + def callback_handle(name, array, _): + """ ctypes function """ + callback(name, array) + return callback_handle + _LIB = _load_lib() # type definitions mx_uint = ctypes.c_uint @@ -331,6 +339,14 @@ def get_output(self, index): mx_uint(data.size))) return data + def set_monitor_callback(self, callback, monitor_all=False): + cb_type = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_void_p, ctypes.c_void_p) + self._monitor_callback = cb_type(_monitor_callback_wrapper(callback)) + _check_call(_LIB.MXPredSetMonitorCallback(self.handle, + self._monitor_callback, + None, + ctypes.c_int(monitor_all))) + def load_ndarray_file(nd_bytes): """Load ndarray file and return as list of numpy array. diff --git a/include/mxnet/c_predict_api.h b/include/mxnet/c_predict_api.h index 7051d4edac82..0872642d7ddc 100644 --- a/include/mxnet/c_predict_api.h +++ b/include/mxnet/c_predict_api.h @@ -49,6 +49,12 @@ typedef float mx_float; typedef void *PredictorHandle; /*! \brief handle to NDArray list */ typedef void *NDListHandle; +/*! \brief handle to NDArray */ +typedef void *NDArrayHandle; +/*! \brief callback used for add monitoring to nodes in the graph */ +typedef void (*PredMonitorCallback)(const char*, + NDArrayHandle, + void*); /*! * \brief Get the last error happeneed. @@ -319,6 +325,15 @@ MXNET_DLL int MXNDListGet(NDListHandle handle, const mx_float** out_data, const mx_uint** out_shape, mx_uint* out_ndim); + +/*! + * \brief set a call back to notify the completion of operation and allow for + * additional monitoring + */ +MXNET_DLL int MXPredSetMonitorCallback(PredictorHandle handle, + PredMonitorCallback callback, + void* callback_handle, + bool monitor_all); /*! * \brief Free a MXAPINDList * \param handle The handle of the MXAPINDList. 
diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc index 7a7e2580e559..b2a685f135ae 100644 --- a/src/c_api/c_predict_api.cc +++ b/src/c_api/c_predict_api.cc @@ -220,6 +220,9 @@ int _CreatePartialOut(const char* symbol_json_str, in_types.push_back(arg_types[key]); } else if (aux_types.count(key) != 0) { in_types.push_back(aux_types[key]); + } else { + // if key not in arg_types or aux_types set to FP32 + in_types.push_back(0); } } nnvm::Graph g; g.outputs = sym.outputs; @@ -544,7 +547,8 @@ int MXPredGetOutputType(PredictorHandle handle, MXAPIPredictor* p = static_cast(handle); API_BEGIN(); CHECK_LT(out_index, p->out_arrays.size()) - << "Index exceed number of outputs"; + << "Index exceed number of outputs, provided out_index should be less than " + << p->out_arrays.size(); const int s = p->out_dtypes[out_index]; CHECK_GE(s, 0); @@ -651,6 +655,22 @@ int MXNDListGet(NDListHandle handle, API_END(); } +int MXPredSetMonitorCallback(PredictorHandle handle, + PredMonitorCallback callback, + void* callback_handle, + bool monitor_all) { + MXAPIPredictor* p = static_cast(handle); + API_BEGIN(); + PredMonitorCallback callback_temp = callback; + void* callback_handle_temp = callback_handle; + std::function clbk + = [callback_temp, callback_handle_temp](const char* name, void* handle) { + callback_temp(name, handle, callback_handle_temp); + }; + p->exec->SetMonitorCallback(clbk, monitor_all); + API_END(); +} + int MXNDListFree(NDListHandle handle) { API_BEGIN(); delete static_cast(handle); From 84d8652e8d9632953ec6642c27847d984ad20bbc Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Fri, 5 Jul 2019 23:15:28 +0000 Subject: [PATCH 18/20] Add tests, default dtype and set_monitor_callback --- include/mxnet/c_predict_api.h | 2 +- src/c_api/c_predict_api.cc | 2 +- tests/python/gpu/test_predictor.py | 128 +++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 tests/python/gpu/test_predictor.py diff --git a/include/mxnet/c_predict_api.h b/include/mxnet/c_predict_api.h index 0872642d7ddc..18bec625f05f 100644 --- a/include/mxnet/c_predict_api.h +++ b/include/mxnet/c_predict_api.h @@ -120,7 +120,7 @@ MXNET_DLL int MXPredCreateEx(const char* symbol_json_str, const void* param_bytes, int param_size, int dev_type, int dev_id, - mx_uint num_input_nodes, + const mx_uint num_input_nodes, const char** input_keys, const mx_uint* input_shape_indptr, const mx_uint* input_shape_data, diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc index b2a685f135ae..d614b09960c9 100644 --- a/src/c_api/c_predict_api.cc +++ b/src/c_api/c_predict_api.cc @@ -91,7 +91,7 @@ int _CreatePartialOut(const char* symbol_json_str, const void* param_bytes, int param_size, int dev_type, int dev_id, - mx_uint num_input_nodes, + const mx_uint num_input_nodes, const char** input_keys, const mx_uint* input_shape_indptr, const mx_uint* input_shape_data, diff --git a/tests/python/gpu/test_predictor.py b/tests/python/gpu/test_predictor.py new file mode 100644 index 000000000000..4838a76c7cb1 --- /dev/null +++ b/tests/python/gpu/test_predictor.py @@ -0,0 +1,128 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import print_function +import sys, os +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.append(os.path.join(curr_path, "../../../amalgamation/python/")) +from mxnet_predict import Predictor, load_ndarray_file + +import ctypes +import numpy as np +import mxnet as mx +import mxnet.ndarray as nd +from mxnet.ndarray import NDArray +from mxnet import gluon +from mxnet.test_utils import assert_almost_equal, download_model +from mxnet.contrib.amp import amp +from mxnet.base import NDArrayHandle, py_str +sys.path.insert(0, os.path.join(curr_path, '../unittest')) +from common import setup_module, with_seed, teardown + +@with_seed() +def test_predictor_with_dtype(): + prefix = 'test_predictor_simple_dense' + symbol_file = "%s-symbol.json" % prefix + param_file = "%s-0000.params" % prefix + + input1 = np.random.uniform(size=(1, 3)) + input1 = input1.astype(np.float16) + + block = mx.gluon.nn.HybridSequential() + block.add(mx.gluon.nn.Dense(7)) + block.add(mx.gluon.nn.Dense(3)) + block.cast(np.float16) + block.hybridize() + block.initialize(ctx=mx.gpu(0)) + tmp = mx.nd.array(input1, dtype=np.float16, ctx=mx.gpu(0)) + out1 = block.forward(tmp) + block.export(prefix) + + predictor = Predictor(open(symbol_file, "r").read(), + open(param_file, "rb").read(), + {"data": input1.shape}, + dev_type="gpu", + dev_id=0, + type_dict={"data": input1.dtype}) + predictor.forward(data=input1) + predictor_out1 = predictor.get_output(0) + + assert_almost_equal(out1.asnumpy(), predictor_out1, rtol=1e-5, atol=1e-6) + +def compare_module_cpredict(result_sym, result_arg_params, result_aux_params, monitor_callback=False): + # Dummmy inputs + input1 = np.ones((1, 3, 224, 224)) + input1 = input1.astype(np.float32) + nd_dict = {} + def pred_mon_callback(name, arr): + nd_dict[name] = arr + mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.gpu()) + mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1,)]], for_training=False) + mod.set_params(result_arg_params, result_aux_params) + mod.forward(mx.io.DataBatch(data=[mx.nd.array(input1, ctx=mx.gpu())], + label=[mx.nd.ones((1,), ctx=mx.gpu())])) + prefix = "test_predictor_amp" + mod.save_checkpoint(prefix, 0, remove_amp_cast=False) + sym_file = "{}-symbol.json".format(prefix) + params_file = "{}-0000.params".format(prefix) + predictor = Predictor(open(sym_file, "r").read(), + open(params_file, "rb").read(), + {'data': (1, 3, 224, 224), + 'softmax_label': (1,)}, + dev_type="gpu", + dev_id=0) + if monitor_callback: + predictor.set_monitor_callback(pred_mon_callback, monitor_all=True) + predictor.forward(data=input1, softmax_label=mx.nd.ones((1,)).asnumpy()) + predictor_out1 = predictor.get_output(0) + if monitor_callback: + assert len(nd_dict) > 0, "Callback not called" + assert_almost_equal(mod.get_outputs()[0].asnumpy(), predictor_out1, atol=1e-1, rtol=1e-1) + + +@with_seed() +def test_predictor_amp(): + dir_path = os.path.dirname(os.path.realpath(__file__)) + model_path = os.path.join(dir_path, 'model') + if not os.path.isdir(model_path): + 
os.mkdir(model_path) + prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path) + + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + + + # Convert model to mixed precision model, params in FP32 + result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, + arg_params, + aux_params, + target_dtype="float16", + target_dtype_ops=["Convolution"]) + compare_module_cpredict(result_sym, result_arg_params, result_aux_params) + + # Convert model to mixed precision model, params in FP16 + result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, + arg_params, + aux_params, + target_dtype="float16", + target_dtype_ops=["Convolution"], + cast_optional_params=True) + compare_module_cpredict(result_sym, result_arg_params, result_aux_params, monitor_callback=True) + + +if __name__ == '__main__': + import nose + nose.runmodule() From 577deb9ed54e01415e6b8b7b5e771753879df976 Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Fri, 5 Jul 2019 23:33:10 +0000 Subject: [PATCH 19/20] Improve error --- src/c_api/c_predict_api.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc index d614b09960c9..b371fd044dc5 100644 --- a/src/c_api/c_predict_api.cc +++ b/src/c_api/c_predict_api.cc @@ -237,7 +237,9 @@ int _CreatePartialOut(const char* symbol_json_str, << "The shape information of is not enough to get the shapes"; CHECK(infer_type_complete) << "The type information is not enough, please provide input arg_types " - "with provided_arg_dtype_names and provided_arg_dtypes"; + "with provided_arg_dtype_names and provided_arg_dtypes." + "If using amalgamation python frontend you can use type_dict in Predictor API" + "to provide this information"; CopyAttr(g.indexed_graph(), g.GetAttr("shape"), &arg_shapes, &out_shapes, &aux_shapes); From d911bfa5ab6311602ee2a1f4b0fd183c36113969 Mon Sep 17 00:00:00 2001 From: Anirudh Subramanian Date: Mon, 8 Jul 2019 17:04:39 +0000 Subject: [PATCH 20/20] Fix c_str_array --- amalgamation/python/mxnet_predict.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/amalgamation/python/mxnet_predict.py b/amalgamation/python/mxnet_predict.py index b64c69042fce..48e3cd4a5145 100644 --- a/amalgamation/python/mxnet_predict.py +++ b/amalgamation/python/mxnet_predict.py @@ -63,7 +63,7 @@ def c_str_array(strings): Parameters ---------- - strings : list of strings + strings : list of string Python strings. Returns @@ -72,8 +72,10 @@ def c_str_array(strings): A const char ** pointer that can be passed to C API. """ arr = (ctypes.c_char_p * len(strings))() - arr[:] = strings + arr[:] = [s.encode('utf-8') for s in strings] return arr + + else: py_str = lambda x: x @@ -82,7 +84,7 @@ def c_str_array(strings): Parameters ---------- - strings : list of string + strings : list of strings Python strings. Returns @@ -91,9 +93,10 @@ def c_str_array(strings): A const char ** pointer that can be passed to C API. """ arr = (ctypes.c_char_p * len(strings))() - arr[:] = [s.encode('utf-8') for s in strings] + arr[:] = strings return arr + def c_str(string): """"Convert a python string to C string.""" if not isinstance(string, str):
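Taken together, the predictor changes above add dtype support (via `type_dict` and `MXPredCreateEx`) and a monitor callback to the C predict API and its amalgamation Python wrapper. A minimal usage sketch follows; it is illustrative only: the model file names, input shape, and GPU context are assumptions, and the amalgamation directory is assumed to be on `sys.path` as in `tests/python/gpu/test_predictor.py`.

import numpy as np
from mxnet_predict import Predictor  # amalgamation/python/mxnet_predict.py

def monitor(name, array_handle):
    # Invoked for every monitored array when monitor_all=True.
    print("monitored:", name)

# Hypothetical exported FP16 model with a single "data" input.
input1 = np.random.uniform(size=(1, 3)).astype(np.float16)

predictor = Predictor(open("model-symbol.json", "r").read(),
                      open("model-0000.params", "rb").read(),
                      {"data": input1.shape},
                      dev_type="gpu", dev_id=0,
                      type_dict={"data": input1.dtype})    # per-input dtypes (new)
predictor.set_monitor_callback(monitor, monitor_all=True)  # optional (new)
predictor.forward(data=input1)
out = predictor.get_output(0)  # dtype now follows the inferred output type
print(out.dtype, out.shape)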