From f8cee2e05cab243d677364455651b8d31de275b5 Mon Sep 17 00:00:00 2001
From: haohuw
Date: Sun, 23 Jun 2019 10:56:24 -0700
Subject: [PATCH] enable TensorRT integration with cpp api

---
 cpp-package/example/inference/README.md       |  14 +++
 .../example/inference/imagenet_inference.cpp  | 104 ++++++++++++----
 cpp-package/include/mxnet-cpp/MxNetCpp.h      |   1 +
 cpp-package/include/mxnet-cpp/contrib.h       | 115 ++++++++++++++++++
 cpp-package/include/mxnet-cpp/symbol.h        |  17 +++
 cpp-package/include/mxnet-cpp/symbol.hpp      |  35 ++++++
 6 files changed, 260 insertions(+), 26 deletions(-)
 create mode 100644 cpp-package/include/mxnet-cpp/contrib.h

diff --git a/cpp-package/example/inference/README.md b/cpp-package/example/inference/README.md
index 272586da5da9..81cf9d856c23 100644
--- a/cpp-package/example/inference/README.md
+++ b/cpp-package/example/inference/README.md
@@ -75,6 +75,7 @@ imagenet_inference
 --symbol_file <model symbol file in json format>
 --num_inference_batches <number of batches used for inference>
 --data_layer_type <data type for data layer>
 --gpu <whether to run inference on GPU>
+--enableTRT <whether to run inference with TensorRT>
 --benchmark <whether to use dummy data to run inference>
 ```
@@ -134,6 +135,19 @@ imagenet_inference.cpp:372: Running the forward pass on model to evaluate the pe
 imagenet_inference.cpp:387: benchmark completed!
 imagenet_inference.cpp:388: batch size: 1 num batch: 500 throughput: xxxx imgs/s latency:xxxx ms
 ```
+To run this example with TensorRT, you can quickly try the following command to benchmark Inception-BN:
+```
+./imagenet_inference --symbol_file "./model/Inception-BN-symbol.json" --params_file "./model/Inception-BN-0126.params" --batch_size 16 --num_inference_batches 500 --benchmark --enableTRT
+```
+Sample output will look like this (the example is running on an AWS P3.2xl machine):
+```
+imagenet_inference.cpp:302: Loading the model from ./model/Inception-BN-symbol.json
+build_subgraph.cc:686: start to execute partition graph.
+imagenet_inference.cpp:317: Loading the model parameters from ./model/Inception-BN-0126.params
+imagenet_inference.cpp:424: Running the forward pass on model to evaluate the performance..
+imagenet_inference.cpp:439: benchmark completed!
+imagenet_inference.cpp:440: batch size: 16 num batch: 500 throughput: 6284.78 imgs/s latency:0.159115 ms
+```
 ## [sentiment_analysis_rnn.cpp]()
 This example demonstrates how you can load a pre-trained RNN model and use it to predict the sentiment expressed in the given movie review with the MXNet C++ API. The example is capable of processing variable length inputs.
 It performs the following tasks
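At the API level, the `--enableTRT` flag documented above maps to a single call on the loaded symbol, using the `GetBackendSymbol` method this patch introduces further down. A minimal sketch of that call (the model path is a placeholder and error handling is omitted):

```
#include "mxnet-cpp/MxNetCpp.h"

using namespace mxnet::cpp;

int main() {
  // Load the original symbol graph from its JSON file.
  Symbol net = Symbol::Load("./model/Inception-BN-symbol.json");

  // Re-partition the graph with the TensorRT subgraph backend:
  // every TensorRT-compatible subgraph is folded into a single TRT node.
  Symbol trt_net = net.GetBackendSymbol("TensorRT");
  return 0;
}
```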
diff --git a/cpp-package/example/inference/imagenet_inference.cpp b/cpp-package/example/inference/imagenet_inference.cpp
index 7eaf991ada4e..4f5a3bb8bbe6 100644
--- a/cpp-package/example/inference/imagenet_inference.cpp
+++ b/cpp-package/example/inference/imagenet_inference.cpp
@@ -82,6 +82,7 @@ class Predictor {
             const std::string& model_params_file,
             const Shape& input_shape,
             bool use_gpu,
+            bool enable_tensorrt,
             const std::string& dataset,
             const int data_nthreads,
             const std::string& data_layer_type,
@@ -98,6 +99,13 @@ class Predictor {
   bool AdvanceDataIter(int skipped_batches);
   void LoadModel(const std::string& model_json_file);
   void LoadParameters(const std::string& model_parameters_file);
+  void SplitParamMap(const std::map<std::string, NDArray> &paramMap,
+                     std::map<std::string, NDArray> *argParamInTargetContext,
+                     std::map<std::string, NDArray> *auxParamInTargetContext,
+                     Context targetContext);
+  void ConvertParamMapToTargetContext(const std::map<std::string, NDArray> &paramMap,
+                                      std::map<std::string, NDArray> *paramMapInTargetContext,
+                                      Context targetContext);
   void InitParameters();

   inline bool FileExists(const std::string &name) {
@@ -115,6 +123,7 @@ class Predictor {
   MXDataIter *val_iter_;
   bool use_gpu_;
+  bool enable_tensorrt_;
   std::string dataset_;
   int data_nthreads_;
   std::string data_layer_type_;
@@ -134,14 +143,15 @@
  *    the input shape is required to be in format Shape(1, number_of_channels, height, width)
  *    The input image will be resized to (height x width) size before running the inference.
  * 4. use_gpu: determine if run inference on GPU
- * 5. dataset: data file (.rec) to be used for inference
- * 6. data_nthreads: number of threads for data loading
- * 7. data_layer_type: data type for data layer
- * 8. rgb_mean: mean value to be subtracted on R/G/B channel
- * 9. rgb_std: standard deviation on R/G/B channel
- * 10. shuffle_chunk_seed: shuffling chunk seed
- * 11. seed: shuffling seed
- * 12. benchmark: use dummy data for inference
+ * 5. enable_tensorrt: determine if TensorRT should be enabled
+ * 6. dataset: data file (.rec) to be used for inference
+ * 7. data_nthreads: number of threads for data loading
+ * 8. data_layer_type: data type for data layer
+ * 9. rgb_mean: mean value to be subtracted on R/G/B channel
+ * 10. rgb_std: standard deviation on R/G/B channel
+ * 11. shuffle_chunk_seed: shuffling chunk seed
+ * 12. seed: shuffling seed
+ * 13. benchmark: use dummy data for inference
  *
  * The constructor will:
  * 1. Create ImageRecordIter based on the given dataset file.
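For reference, a constructor call that exercises the new fifth parameter could look like the following. This is an illustrative sketch only: `Predictor` is the example's internal class, and all values shown are hypothetical.

```
// Benchmark an ImageNet model with TensorRT enabled (dummy input data).
Predictor predict("./model/Inception-BN-symbol.json",  // 1. model JSON file
                  "./model/Inception-BN-0126.params",  // 2. model params file
                  Shape(16, 3, 224, 224),              // 3. input shape (NCHW)
                  true,                                // 4. use_gpu
                  true,                                // 5. enable_tensorrt (new)
                  "",                                  // 6. dataset (.rec), unused when benchmarking
                  1,                                   // 7. data_nthreads
                  "float32",                           // 8. data_layer_type
                  {0, 0, 0},                           // 9. rgb_mean
                  {1, 1, 1},                           // 10. rgb_std
                  0,                                   // 11. shuffle_chunk_seed
                  0,                                   // 12. seed
                  true);                               // 13. benchmark
```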
@@ -152,6 +162,7 @@ Predictor::Predictor(const std::string& model_json_file,
                      const std::string& model_params_file,
                      const Shape& input_shape,
                      bool use_gpu,
+                     bool enable_tensorrt,
                      const std::string& dataset,
                      const int data_nthreads,
                      const std::string& data_layer_type,
@@ -161,6 +172,7 @@ Predictor::Predictor(const std::string& model_json_file,
                      int seed,
                      bool benchmark)
     : input_shape_(input_shape),
       use_gpu_(use_gpu),
+      enable_tensorrt_(enable_tensorrt),
       dataset_(dataset),
       data_nthreads_(data_nthreads),
       data_layer_type_(data_layer_type),
@@ -182,12 +194,12 @@ Predictor::Predictor(const std::string& model_json_file,
   // Load the model
   LoadModel(model_json_file);
   // Initialize the parameters
-  // benchmark=false, load parameters from file
-  // benchmark=true, randomly initialize parameters
-  if (!benchmark_) {
-    LoadParameters(model_params_file);
-  } else {
+  // benchmark=true && model_params_file.empty(), randomly initialize parameters
+  // else, load parameters
+  if (benchmark_ && model_params_file.empty()) {
     InitParameters();
+  } else {
+    LoadParameters(model_params_file);
   }

   int dtype = GetDataLayerType();
@@ -289,9 +301,11 @@ void Predictor::LoadModel(const std::string& model_json_file) {
   }
   LG << "Loading the model from " << model_json_file << std::endl;
   net_ = Symbol::Load(model_json_file);
+  if (enable_tensorrt_) {
+    net_ = net_.GetBackendSymbol("TensorRT");
+  }
 }

-
 /*
  * The following function loads the model parameters.
  */
@@ -303,20 +317,50 @@ void Predictor::LoadParameters(const std::string& model_parameters_file) {
   LG << "Loading the model parameters from " << model_parameters_file << std::endl;
   std::map<std::string, NDArray> parameters;
   NDArray::Load(model_parameters_file, 0, &parameters);
-  for (const auto &k : parameters) {
-    if (k.first.substr(0, 4) == "aux:") {
-      auto name = k.first.substr(4, k.first.size() - 4);
-      aux_map_[name] = k.second.Copy(global_ctx_);
-    }
-    if (k.first.substr(0, 4) == "arg:") {
-      auto name = k.first.substr(4, k.first.size() - 4);
-      args_map_[name] = k.second.Copy(global_ctx_);
-    }
+  if (enable_tensorrt_) {
+    std::map<std::string, NDArray> intermediate_args_map;
+    std::map<std::string, NDArray> intermediate_aux_map;
+    SplitParamMap(parameters, &intermediate_args_map, &intermediate_aux_map, Context::cpu());
+    contrib::InitTensorRTParams(net_, &intermediate_args_map, &intermediate_aux_map);
+    ConvertParamMapToTargetContext(intermediate_args_map, &args_map_, global_ctx_);
+    ConvertParamMapToTargetContext(intermediate_aux_map, &aux_map_, global_ctx_);
+  } else {
+    SplitParamMap(parameters, &args_map_, &aux_map_, global_ctx_);
   }
   /*WaitAll is needed when we copy data between GPU and the main memory*/
   NDArray::WaitAll();
 }
+
+/*
+ * The following function splits the loaded param map into arg and aux param
+ * maps with the target context.
+ */
+void Predictor::SplitParamMap(const std::map<std::string, NDArray> &paramMap,
+    std::map<std::string, NDArray> *argParamInTargetContext,
+    std::map<std::string, NDArray> *auxParamInTargetContext,
+    Context targetContext) {
+  for (const auto& pair : paramMap) {
+    std::string type = pair.first.substr(0, 4);
+    std::string name = pair.first.substr(4);
+    if (type == "arg:") {
+      (*argParamInTargetContext)[name] = pair.second.Copy(targetContext);
+    } else if (type == "aux:") {
+      (*auxParamInTargetContext)[name] = pair.second.Copy(targetContext);
+    }
+  }
+}
+
+/*
+ * The following function copies the param map into the target context.
+ */
+void Predictor::ConvertParamMapToTargetContext(const std::map<std::string, NDArray> &paramMap,
+    std::map<std::string, NDArray> *paramMapInTargetContext,
+    Context targetContext) {
+  for (const auto& pair : paramMap) {
+    (*paramMapInTargetContext)[pair.first] = pair.second.Copy(targetContext);
+  }
+}
+
 /*
  * The following function randomly initializes the parameters when benchmark_ is true.
  */
@@ -517,6 +561,8 @@ void printUsage() {
     << "--data_layer_type <data type for data layer, default: float32>" << std::endl
     << "--gpu <whether to run inference on GPU, default: false>" << std::endl
+    << "--enableTRT <whether to run inference with TensorRT "
+       "(implies --gpu), default: false>" << std::endl
     << "--benchmark <whether to use dummy data to run inference, default: false>" << std::endl;
 }
@@ -528,6 +574,7 @@ int main(int argc, char** argv) {
   std::string input_rgb_mean("0 0 0");
   std::string input_rgb_std("1 1 1");
   bool use_gpu = false;
+  bool enable_tensorrt = false;
   bool benchmark = false;
   int batch_size = 64;
   int num_skipped_batches = 0;
@@ -575,6 +622,9 @@ int main(int argc, char** argv) {
       data_layer_type = (index < argc ? argv[index]:data_layer_type);
     } else if (strcmp("--gpu", argv[index]) == 0) {
       use_gpu = true;
+    } else if (strcmp("--enableTRT", argv[index]) == 0) {
+      use_gpu = true;
+      enable_tensorrt = true;
     } else if (strcmp("--benchmark", argv[index]) == 0) {
       benchmark = true;
     } else if (strcmp("--help", argv[index]) == 0) {
@@ -584,7 +634,9 @@ int main(int argc, char** argv) {
     index++;
   }

-  if (model_file_json.empty() || (!benchmark && model_file_params.empty())) {
+  if (model_file_json.empty()
+      || (!benchmark && model_file_params.empty())
+      || (enable_tensorrt && model_file_params.empty())) {
     LG << "ERROR: Model details such as symbol, param files are not specified";
     printUsage();
     return 1;
@@ -597,8 +649,8 @@ int main(int argc, char** argv) {
   std::vector<float> rgb_std = createVectorFromString(input_rgb_std);

   // Initialize the predictor object
-  Predictor predict(model_file_json, model_file_params, input_data_shape, use_gpu, dataset,
-                    data_nthreads, data_layer_type, rgb_mean, rgb_std, shuffle_chunk_seed,
+  Predictor predict(model_file_json, model_file_params, input_data_shape, use_gpu, enable_tensorrt,
+                    dataset, data_nthreads, data_layer_type, rgb_mean, rgb_std, shuffle_chunk_seed,
                     seed, benchmark);

   if (benchmark) {
diff --git a/cpp-package/include/mxnet-cpp/MxNetCpp.h b/cpp-package/include/mxnet-cpp/MxNetCpp.h
index 7ac039dd8816..a513565377fd 100644
--- a/cpp-package/include/mxnet-cpp/MxNetCpp.h
+++ b/cpp-package/include/mxnet-cpp/MxNetCpp.h
@@ -39,5 +39,6 @@
 #include "mxnet-cpp/io.hpp"
 #include "mxnet-cpp/metric.h"
 #include "mxnet-cpp/initializer.h"
+#include "mxnet-cpp/contrib.h"

 #endif  // MXNET_CPP_MXNETCPP_H_
diff --git a/cpp-package/include/mxnet-cpp/contrib.h b/cpp-package/include/mxnet-cpp/contrib.h
new file mode 100644
index 000000000000..890ab2bf0062
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/contrib.h
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+* Copyright (c) 2019 by Contributors
+* \file contrib.h
+* \brief utility functions to enable some contrib features
+* \author Haohuan Wang
+*/
+#ifndef MXNET_CPP_CONTRIB_H_
+#define MXNET_CPP_CONTRIB_H_
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include <map>
+#include "mxnet-cpp/symbol.h"
+
+namespace mxnet {
+namespace cpp {
+namespace details {
+
+  /*!
+   * split a string with the given delimiter
+   * @param str string to be parsed
+   * @param delimiter delimiter
+   * @return delimited list of strings
+   */
+  inline std::vector<std::string> split(const std::string& str, const std::string& delimiter) {
+    std::vector<std::string> splitted;
+    size_t last = 0;
+    size_t next = 0;
+    while ((next = str.find(delimiter, last)) != std::string::npos) {
+      splitted.push_back(str.substr(last, next - last));
+      last = next + delimiter.length();
+    }
+    splitted.push_back(str.substr(last));
+    return splitted;
+  }
+
+}  // namespace details
+
+namespace contrib {
+
+  // needs to be kept in sync with
+  // https://github.com/apache/incubator-mxnet/blob/1c874cfc807cee755c38f6486e8e0f4d94416cd8/src/operator/subgraph/tensorrt/tensorrt-inl.h#L190
+  static const std::string TENSORRT_SUBGRAPH_PARAM_IDENTIFIER = "subgraph_params_names";
+  // needs to be kept in sync with
+  // https://github.com/apache/incubator-mxnet/blob/master/src/operator/subgraph/tensorrt/tensorrt.cc#L244
+  static const std::string TENSORRT_SUBGRAPH_PARAM_PREFIX = "subgraph_param_";
+
+  /*!
+   * this mimics https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/contrib/tensorrt.py#L37
+   * @param symbol symbol that has already been passed through the subgraph api
+   * @param argParams original arg params; params needed by tensorrt will be removed after calling this function
+   * @param auxParams original aux params; params needed by tensorrt will be removed after calling this function
+   */
+  inline void InitTensorRTParams(const mxnet::cpp::Symbol& symbol,
+                                 std::map<std::string, NDArray> *argParams,
+                                 std::map<std::string, NDArray> *auxParams) {
+    mxnet::cpp::Symbol internals = symbol.GetInternals();
+    mx_uint numSymbol = internals.GetNumOutputs();
+    for (mx_uint i = 0; i < numSymbol; ++i) {
+      std::map<std::string, std::string> attrs = internals[i].ListAttributes();
+      if (attrs.find(TENSORRT_SUBGRAPH_PARAM_IDENTIFIER) != attrs.end()) {
+        std::string new_params_names;
+        std::map<std::string, NDArray> tensorrtParams;
+        std::vector<std::string> keys = details::split(
+            attrs[TENSORRT_SUBGRAPH_PARAM_IDENTIFIER], ";");
+        for (const auto& key : keys) {
+          if (argParams->find(key) != argParams->end()) {
+            new_params_names += key + ";";
+            tensorrtParams[TENSORRT_SUBGRAPH_PARAM_PREFIX + key] = (*argParams)[key];
+            argParams->erase(key);
+          } else if (auxParams->find(key) != auxParams->end()) {
+            new_params_names += key + ";";
+            tensorrtParams[TENSORRT_SUBGRAPH_PARAM_PREFIX + key] = (*auxParams)[key];
+            auxParams->erase(key);
+          }
+        }
+        std::map<std::string, std::string> new_attrs = {};
+        for (const auto& kv : tensorrtParams) {
+          // passing the ndarray address into the TRT node attributes to get the weight
+          uint64_t address = reinterpret_cast<uint64_t>(kv.second.GetHandle());
+          new_attrs[kv.first] = std::to_string(address);
+        }
+        if (!new_attrs.empty()) {
+          internals[i].SetAttributes(new_attrs);
+          internals[i].SetAttribute(TENSORRT_SUBGRAPH_PARAM_IDENTIFIER,
+                                    new_params_names.substr(0, new_params_names.length() - 1));
+        }
+      }
+    }
+  }
+
+}  // namespace contrib
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // MXNET_CPP_CONTRIB_H_
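`details::split` above is what unpacks the semicolon-separated `subgraph_params_names` attribute into individual parameter names. A quick standalone check of its behavior (the helper is copied into a plain translation unit so it compiles without MXNet):

```
#include <cassert>
#include <string>
#include <vector>

// Same logic as details::split in contrib.h above.
std::vector<std::string> split(const std::string& str, const std::string& delimiter) {
  std::vector<std::string> splitted;
  size_t last = 0;
  size_t next = 0;
  while ((next = str.find(delimiter, last)) != std::string::npos) {
    splitted.push_back(str.substr(last, next - last));
    last = next + delimiter.length();
  }
  splitted.push_back(str.substr(last));
  return splitted;
}

int main() {
  // "conv0_weight;fc1_bias" -> {"conv0_weight", "fc1_bias"}
  std::vector<std::string> keys = split("conv0_weight;fc1_bias", ";");
  assert(keys.size() == 2);
  assert(keys[0] == "conv0_weight" && keys[1] == "fc1_bias");
  return 0;
}
```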
diff --git a/cpp-package/include/mxnet-cpp/symbol.h b/cpp-package/include/mxnet-cpp/symbol.h
index a25824cad602..d72eeaad1a5a 100644
--- a/cpp-package/include/mxnet-cpp/symbol.h
+++ b/cpp-package/include/mxnet-cpp/symbol.h
@@ -178,6 +178,23 @@ class Symbol {
   std::vector<std::string> ListOutputs() const;
   /*! \return get the descriptions of auxiliary data for this symbol */
   std::vector<std::string> ListAuxiliaryStates() const;
+  /*! \return get all attributes for this symbol */
+  std::map<std::string, std::string> ListAttributes() const;
+  /*!
+   * \brief set a key-value attribute on the symbol
+   * @param key string representing the key of the attribute
+   * @param value string representing the value of the attribute
+   */
+  void SetAttribute(const std::string& key, const std::string& value);
+  /*!
+   * \brief set a series of key-value attributes on the symbol
+   * @param attrs string:string map representing the key-value attributes
+   */
+  void SetAttributes(const std::map<std::string, std::string>& attrs);
+  /*! \return get the number of outputs of this symbol */
+  mx_uint GetNumOutputs() const;
+  /*! \return get the new symbol created through the subgraph API for this symbol */
+  mxnet::cpp::Symbol GetBackendSymbol(const std::string& backendName) const;
   /*! \return get the name of the symbol */
   std::string GetName() const;
   /*!
diff --git a/cpp-package/include/mxnet-cpp/symbol.hpp b/cpp-package/include/mxnet-cpp/symbol.hpp
index 2e3fb7a2d5de..811d894e0ffa 100644
--- a/cpp-package/include/mxnet-cpp/symbol.hpp
+++ b/cpp-package/include/mxnet-cpp/symbol.hpp
@@ -172,6 +172,41 @@ inline std::vector<std::string> Symbol::ListAuxiliaryStates() const {
   return ret;
 }

+inline std::map<std::string, std::string> Symbol::ListAttributes() const {
+  mx_uint size;
+  const char** pairs;
+  CHECK_EQ(MXSymbolListAttrShallow(GetHandle(), &size, &pairs), 0);
+  std::map<std::string, std::string> attributes;
+  for (mx_uint i = 0; i < size; ++i) {
+    // pairs is 2 * size with key, value pairs according to
+    // https://github.com/apache/incubator-mxnet/blob/master/include/mxnet/c_api.h#L1428
+    attributes[pairs[2 * i]] = pairs[2 * i + 1];
+  }
+  return attributes;
+}
+
+inline void Symbol::SetAttribute(const std::string &key, const std::string &value) {
+  CHECK_EQ(MXSymbolSetAttr(GetHandle(), key.c_str(), value.c_str()), 0);
+}
+
+inline void Symbol::SetAttributes(const std::map<std::string, std::string> &attrs) {
+  for (const auto& kv : attrs) {
+    SetAttribute(kv.first, kv.second);
+  }
+}
+
+inline mx_uint Symbol::GetNumOutputs() const {
+  mx_uint numOutputs;
+  CHECK_EQ(MXSymbolGetNumOutputs(GetHandle(), &numOutputs), 0);
+  return numOutputs;
+}
+
+inline mxnet::cpp::Symbol Symbol::GetBackendSymbol(const std::string &backendName) const {
+  SymbolHandle symbolHandle;
+  CHECK_EQ(MXGenBackendSubgraph(GetHandle(), backendName.c_str(), &symbolHandle), 0);
+  return mxnet::cpp::Symbol(symbolHandle);
+}
+
 inline std::string Symbol::GetName() const {
   int success;
   const char* out_name;
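Taken together, the new `Symbol` methods give `contrib::InitTensorRTParams` everything it needs: enumerate the graph's internal nodes, read their attributes, and write values back. A hedged round-trip sketch on a toy graph (the graph and the attribute name are illustrative, and this assumes the patch is applied):

```
#include <iostream>
#include "mxnet-cpp/MxNetCpp.h"

using namespace mxnet::cpp;

int main() {
  // Build a tiny graph just to have something to inspect.
  Symbol data = Symbol::Variable("data");
  Symbol fc = Operator("FullyConnected")
                  .SetParam("num_hidden", 10)
                  .SetInput("data", data)
                  .CreateSymbol("fc1");

  // Walk every internal node, exactly as InitTensorRTParams does.
  Symbol internals = fc.GetInternals();
  for (mx_uint i = 0; i < internals.GetNumOutputs(); ++i) {
    for (const auto& kv : internals[i].ListAttributes()) {
      std::cout << kv.first << " = " << kv.second << std::endl;
    }
  }

  // Attributes can be written back the same way.
  fc.SetAttribute("my_tag", "42");
  std::cout << "my_tag = " << fc.ListAttributes()["my_tag"] << std::endl;
  return 0;
}
```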