Commit 9b3fbe8

Michalis Papapdimitriou (mikepapadim) authored and committed
[BYOC][TENSORRT] Add support for FP16 on TensorRT BYOC flow (apache#10388)
* FP16 support for TRT
* Cleanups on tests
* Fix for typing on output tensor
* Fix ICHECK
* Add TRT inference builder auto-convert precision flags as attrs in the config
* Address PR comments
* Fix bug on passing the new config attrs to codegen for tensorrt partition

Co-authored-by: Michalis Papapdimitriou <[email protected]>
1 parent 601fa38 · commit 9b3fbe8

File tree

8 files changed: +416 additions, −296 deletions

python/tvm/relay/op/contrib/tensorrt.py

Lines changed: 65 additions & 75 deletions
Large diffs are not rendered by default.

src/relay/backend/contrib/tensorrt/codegen.cc

Lines changed: 12 additions & 1 deletion
@@ -46,6 +46,8 @@ struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfig
   bool use_implicit_batch;
   size_t max_workspace_size;
   bool remove_no_mac_subgraphs;
+  bool use_fp16;
+  bool use_uint8;
 
   TVM_DECLARE_ATTRS(TensorRTCompilerConfigNode, "ext.attrs.TensorRTCompilerConfigNode") {
     TVM_ATTR_FIELD(tensorrt_version)
@@ -54,6 +56,8 @@ struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfig
     TVM_ATTR_FIELD(use_implicit_batch).set_default(true);
     TVM_ATTR_FIELD(max_workspace_size).set_default(size_t(1) << 30);
     TVM_ATTR_FIELD(remove_no_mac_subgraphs).set_default(false);
+    TVM_ATTR_FIELD(use_fp16).set_default(false);
+    TVM_ATTR_FIELD(use_uint8).set_default(false);
   }
 };
@@ -215,13 +219,20 @@ class TensorRTJSONSerializer : public backend::contrib::JSONSerializer {
                                                 std::to_string(cfg.value()->tensorrt_version[2])};
     std::vector<std::string> use_implicit_batch = {std::to_string(cfg.value()->use_implicit_batch)};
     std::vector<std::string> max_workspace_size = {std::to_string(cfg.value()->max_workspace_size)};
-    std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr;
+    std::vector<std::string> use_fp16 = {std::to_string(cfg.value()->use_fp16)};
+    std::vector<std::string> use_uint8 = {std::to_string(cfg.value()->use_uint8)};
+    std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr,
+        use_fp16_attr, use_uint8_attr;
     tensorrt_version_attr.emplace_back(tensorrt_version);
     use_implicit_batch_attr.emplace_back(use_implicit_batch);
     max_workspace_size_attr.emplace_back(max_workspace_size);
+    use_fp16_attr.emplace_back(use_fp16);
+    use_uint8_attr.emplace_back(use_uint8);
     node->SetAttr("tensorrt_version", tensorrt_version_attr);
     node->SetAttr("use_implicit_batch", use_implicit_batch_attr);
     node->SetAttr("max_workspace_size", max_workspace_size_attr);
+    node->SetAttr("use_fp16", use_fp16_attr);
+    node->SetAttr("use_uint8", use_uint8_attr);
   }
 };
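The serializer stores each flag as a stringified integer attribute on the JSON subgraph node. The runtime half of the round-trip is outside this diff, but reading one of these attributes back would look roughly like the following sketch (attribute names match the SetAttr calls above; `use_fp16_` is the runtime member introduced in tensorrt_runtime.cc below, and the surrounding JSONGraphNode helpers are TVM's json-runtime API):

    // Minimal sketch, assuming a tvm::runtime::json::JSONGraphNode `node`.
    // SetAttr above stored {"0"} or {"1"}, so parse the first string element.
    if (node.HasAttr("use_fp16")) {
      use_fp16_ = std::stoi(node.GetAttr<std::vector<std::string>>("use_fp16")[0]) != 0;
    }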

src/runtime/contrib/tensorrt/tensorrt_builder.cc

Lines changed: 18 additions & 11 deletions
@@ -85,8 +85,13 @@ void TensorRTBuilder::AddInput(int nid, uint32_t entry_id, const JSONGraphNode&
       shape.erase(shape.begin());
     }
     nvinfer1::Dims dims = VectorToTrtDims(shape);
-    ICHECK(TypeMatch(dtypes[i], kDLFloat, 32)) << "Only FP32 inputs are supported.";
-    auto input_tensor = network_->addInput(name.c_str(), nvinfer1::DataType::kFLOAT, dims);
+    ICHECK((dtypes[i].bits != 16 || dtypes[i].bits != 32))
+        << "Invalid input Tensor type. Float16 and Float32 are supported";
+
+    auto tensor_dtype =
+        (dtypes[i].bits == 16) ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT;
+
+    auto input_tensor = network_->addInput(name.c_str(), tensor_dtype, dims);
     node_output_map_[nid].push_back(TensorRTOpInput(input_tensor));
     network_input_names_.push_back(name);
     entry_id_map_[name] = entry_id + i;
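Note: as written, `dtypes[i].bits != 16 || dtypes[i].bits != 32` is true for every value (any bit-width differs from at least one of 16 and 32), so this ICHECK can never fire; the same pattern recurs in GetDLTensorAsWeights below. A guard that actually enforces the stated fp16/fp32 invariant would be (a suggested sketch, not part of this commit):

    // Suggested form: fail unless the dtype really is float16 or float32.
    ICHECK(dtypes[i].code == kDLFloat && (dtypes[i].bits == 16 || dtypes[i].bits == 32))
        << "Invalid input Tensor type. Float16 and Float32 are supported";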
@@ -141,15 +146,18 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) {
     }
     params.inputs.push_back(input);
   }
-  ICHECK(converter->variable_input_count || converter->input_types.size() == params.inputs.size())
-      << "Op expected a different number of inputs.";
 
   // Convert op to TRT.
   converter->Convert(&params);
 
   // Get outputs.
   node_output_map_[nid] = {};
   for (auto out : params.outputs) {
+    auto out_type = params.inputs.at(1).weight.type == params.inputs.at(0).tensor->getType()
+                        ? params.inputs.at(0).tensor->getType()
+                        : params.inputs.at(1).weight.type;
+    out->setType(out_type);
+
    node_output_map_[nid].push_back(TensorRTOpInput(out));
   }
 }
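Two things are worth noting about the new output-type selection: both branches of the ternary evaluate to `params.inputs.at(1).weight.type` (when the two types are equal, either operand will do), and the `at(0)`/`at(1)` lookups assume every op has a tensor at input 0 and a weight at input 1. A more defensive variant might read (a sketch, not part of this commit):

    // Sketch: default to the first tensor input's dtype; prefer the weight's
    // dtype when a weight input exists, so fp16 weights propagate to outputs.
    nvinfer1::DataType out_type = params.inputs.at(0).tensor->getType();
    if (params.inputs.size() > 1 && params.inputs.at(1).type == kWeight) {
      out_type = params.inputs.at(1).weight.type;
    }
    out->setType(out_type);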
@@ -205,18 +213,17 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() {
 nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr,
                                                         DLDeviceType src_device) {
   ICHECK_EQ(dptr->device.device_type, src_device);
-  ICHECK(static_cast<int>(dptr->dtype.code) == kDLFloat ||
-         static_cast<int>(dptr->dtype.code) == kDLInt);
-  const auto trt_dtype = static_cast<int>(dptr->dtype.code) == kDLFloat
-                             ? nvinfer1::DataType::kFLOAT
-                             : nvinfer1::DataType::kINT32;
+  ICHECK((dptr->dtype.bits != 16 || dptr->dtype.bits != 32))
+      << "Invalid input Tensor type. Float16 and Float32 are supported";
+  const auto trt_dtype = (static_cast<int>(dptr->dtype.bits) == 16) ? nvinfer1::DataType::kHALF
+                                                                    : nvinfer1::DataType::kFLOAT;
+
   const size_t weight_bytes = GetDataSize(*dptr);
   nvinfer1::Weights weight{trt_dtype, nullptr, 0};
   size_t count = 1;
   for (tvm_index_t i = 0; i < dptr->ndim; ++i) {
     count *= dptr->shape[i];
   }
-  ICHECK_EQ(count * 4, weight_bytes);
   weight.count = count;
   weight.values = new float[count];
   ICHECK_EQ(TVMArrayCopyToBytes(const_cast<DLTensor*>(dptr), const_cast<void*>(weight.values),
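The deleted `ICHECK_EQ(count * 4, weight_bytes)` hard-coded 4-byte elements, which no longer holds once weights can be 2-byte fp16. Rather than dropping the sanity check entirely, a dtype-aware version could keep it (a sketch, not part of this commit):

    // Sketch: element size derived from the dtype instead of a literal 4.
    ICHECK_EQ(count * (dptr->dtype.bits / 8), weight_bytes);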
@@ -250,7 +257,7 @@ void TensorRTBuilder::CleanUp() {
 #endif
   builder_->destroy();
   for (auto weight : trt_weights_) {
-    if (weight.type == nvinfer1::DataType::kFLOAT) {
+    if (weight.type == nvinfer1::DataType::kFLOAT || weight.type == nvinfer1::DataType::kHALF) {
       delete[] static_cast<const float*>(weight.values);
     } else {
       delete[] static_cast<const uint16_t*>(weight.values);

src/runtime/contrib/tensorrt/tensorrt_builder.h

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ class TensorRTBuilder {
    * \param logger TensorRT logger to use for errors and warnings.
    * \param max_workspace_size Workspace size parameter for TensorRT engine build phase.
    * \param use_implicit_batch Whether to use implicit batch mode (default)
-   * \param use_fp16 Whether to use implicit batch mode (default)
+   * \param use_fp16 Whether to automatically convert a model to fp16
    * \param batch_size If use_implicit_batch,
    */
   TensorRTBuilder(TensorRTLogger* logger, const std::vector<const DLTensor*>& data_entry,

src/runtime/contrib/tensorrt/tensorrt_ops.cc

Lines changed: 26 additions & 12 deletions
@@ -49,6 +49,7 @@ nvinfer1::ITensor* TensorRTOpConverter::Reshape(TensorRTOpConverterParams* param
   auto layer = params->network->addShuffle(*input);
   ICHECK(layer != nullptr);
   layer->setReshapeDimensions(VectorToTrtDims(new_shape));
+  layer->setOutputType(0, input->getType());
   return layer->getOutput(0);
 }
 
@@ -99,7 +100,8 @@ nvinfer1::ITensor* TensorRTOpConverter::CreateScalar(
   std::fill_n(dims.d, dims.nbDims, 1);
   float* values = new float[1];
   values[0] = value;
-  nvinfer1::Weights weights{nvinfer1::DataType::kFLOAT, static_cast<void*>(values), 1};
+  const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+  nvinfer1::Weights weights{weight_type, static_cast<void*>(values), 1};
   params->trt_weights->push_back(weights);
   return params->network->addConstant(dims, weights)->getOutput(0);
 }
@@ -252,7 +254,9 @@ class Conv1DOpConverter : public TensorRTOpConverter {
     input_tensor = shuffle_layer->getOutput(0);
 
     const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], 1);
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+
+    nvinfer1::Weights bias{weight_type, nullptr, 0};
 
     auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size,
                                                       params->inputs.at(1).weight, bias);
@@ -313,7 +317,8 @@ class Conv2DOpConverter : public TensorRTOpConverter {
 #endif
 
     const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]);
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+    nvinfer1::Weights bias{weight_type, nullptr, 0};
     auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size,
                                                       params->inputs.at(1).weight, bias);
     ICHECK(conv_layer != nullptr);
@@ -361,7 +366,8 @@ class Conv3DOpConverter : public TensorRTOpConverter {
     const int num_outputs =
         std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
     const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]);
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+    nvinfer1::Weights bias{weight_type, nullptr, 0};
     auto conv_layer = params->network->addConvolutionNd(*input_tensor, num_outputs, kernel_size,
                                                         params->inputs.at(1).weight, bias);
     ICHECK(conv_layer != nullptr);
@@ -404,7 +410,8 @@ class DenseOpConverter : public TensorRTOpConverter {
     // Weights are in KC format.
     ICHECK_EQ(params->inputs.at(1).weight_shape.size(), 2);
     const int num_units = params->inputs.at(1).weight_shape[0];
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+    nvinfer1::Weights bias{weight_type, nullptr, 0};
     nvinfer1::IFullyConnectedLayer* fc_layer = params->network->addFullyConnected(
         *input_tensor, num_units, params->inputs.at(1).weight, bias);
     ICHECK(fc_layer != nullptr);
@@ -466,12 +473,15 @@ class BatchNormOpConverter : public TensorRTOpConverter {
     }
 
     void* weight_scale_ptr = new float[gamma.count];
-    nvinfer1::Weights weight_scale{nvinfer1::DataType::kFLOAT, weight_scale_ptr, gamma.count};
+    const nvinfer1::DataType weight_type_scale = params->inputs.at(1).weight.type;
+    nvinfer1::Weights weight_scale{weight_type_scale, weight_scale_ptr, gamma.count};
     params->trt_weights->push_back(weight_scale);
     void* weight_shift_ptr = new float[gamma.count];
-    nvinfer1::Weights weight_shift{nvinfer1::DataType::kFLOAT, weight_shift_ptr, gamma.count};
+    const nvinfer1::DataType weight_type_shift = params->inputs.at(2).weight.type;
+    nvinfer1::Weights weight_shift{weight_type_shift, weight_shift_ptr, gamma.count};
     params->trt_weights->push_back(weight_shift);
-    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type_power = params->inputs.at(3).weight.type;
+    nvinfer1::Weights power{weight_type_power, nullptr, 0};
 
     // fill in the content of weights for the Scale layer
     const float* gamma_ptr = reinterpret_cast<const float*>(gamma.values);
@@ -911,8 +921,10 @@ class BiasAddOpConverter : public TensorRTOpConverter {
       input_tensor = Reshape(params, input_tensor, new_shape);
     }
 
-    nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, nullptr, 0};
-    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+
+    nvinfer1::Weights shift{weight_type, nullptr, 0};
+    nvinfer1::Weights power{weight_type, nullptr, 0};
     nvinfer1::IScaleLayer* scale_layer = params->network->addScale(
         *input_tensor, nvinfer1::ScaleMode::kCHANNEL, params->inputs.at(1).weight, shift, power);
     ICHECK(scale_layer != nullptr);
@@ -962,7 +974,8 @@ class Conv2DTransposeOpConverter : public TensorRTOpConverter {
     const int num_outputs =
         std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
     const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]);
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+    nvinfer1::Weights bias{weight_type, nullptr, 0};
     auto deconv_layer = params->network->addDeconvolution(*input_tensor, num_outputs, kernel_size,
                                                           params->inputs.at(1).weight, bias);
     ICHECK(deconv_layer != nullptr);
@@ -1020,7 +1033,8 @@ class Conv3DTransposeOpConverter : public TensorRTOpConverter {
     const int num_outputs =
         std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
     const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]);
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+    nvinfer1::Weights bias{weight_type, nullptr, 0};
     auto deconv_layer = params->network->addDeconvolutionNd(*input_tensor, num_outputs, kernel_size,
                                                             params->inputs.at(1).weight, bias);
     ICHECK(deconv_layer != nullptr);
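Every converter above repeats the same move: the dtype for auxiliary weights (bias, shift, power) is taken from the op's weight input instead of being hard-coded to kFLOAT, so it tracks fp16 automatically. A small helper would capture the repetition (a refactoring sketch, not part of this commit; it assumes, as the converters do, that the weight sits at input index 1):

    // Hypothetical helper: dtype for zero-filled auxiliary weights,
    // taken from the op's weight input so it follows fp16/fp32.
    inline nvinfer1::DataType AuxWeightType(const TensorRTOpConverterParams* params) {
      return params->inputs.at(1).weight.type;
    }
    // Usage inside a converter:
    //   nvinfer1::Weights bias{AuxWeightType(params), nullptr, 0};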

src/runtime/contrib/tensorrt/tensorrt_ops.h

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ struct TensorRTOpInput {
   std::vector<int> weight_shape;
 
   explicit TensorRTOpInput(nvinfer1::ITensor* tensor)
-      : tensor(tensor), weight({nvinfer1::DataType::kFLOAT, nullptr, 0}), type(kTensor) {}
+      : tensor(tensor), weight({tensor->getType(), nullptr, 0}), type(kTensor) {}
   TensorRTOpInput(nvinfer1::Weights weight, const std::vector<int>& shape)
       : tensor(nullptr), weight(weight), type(kWeight), weight_shape(shape) {}
 };
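This one-line change is what makes the converter pattern above safe for pure-tensor inputs: a TensorRTOpInput wrapping an ITensor used to carry a dummy kFLOAT weight, so `weight.type` misreported fp16 tensors; now the dummy weight mirrors the tensor's own dtype. For example (a sketch; `fp16_tensor` is a hypothetical nvinfer1::ITensor* whose type is kHALF):

    TensorRTOpInput in(fp16_tensor);
    // Before this commit: in.weight.type == nvinfer1::DataType::kFLOAT (always).
    // After this commit:  in.weight.type == nvinfer1::DataType::kHALF.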

src/runtime/contrib/tensorrt/tensorrt_runtime.cc

Lines changed: 6 additions & 2 deletions
@@ -72,7 +72,8 @@ class TensorRTRuntime : public JSONRuntimeBase {
         use_implicit_batch_(true),
         max_workspace_size_(size_t(1) << 30),
         max_batch_size_(-1),
-        multi_engine_mode_(false) {
+        multi_engine_mode_(false),
+        use_fp16_(false) {
     const bool use_int8 = dmlc::GetEnv("TVM_TENSORRT_USE_INT8", false);
     multi_engine_mode_ = dmlc::GetEnv("TVM_TENSORRT_MULTI_ENGINE", false);
     num_calibration_batches_remaining_ = dmlc::GetEnv("TENSORRT_NUM_CALI_INT8", 0);
@@ -304,7 +305,7 @@ class TensorRTRuntime : public JSONRuntimeBase {
   }
 
   void BuildEngineFromJson(int batch_size) {
-    const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false);
+    const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false) || use_fp16_;
     TensorRTBuilder builder(&logger_, data_entry_, max_workspace_size_, use_implicit_batch_,
                             use_fp16, batch_size, calibrator_.get());
    for (size_t i = 0; i < input_nodes_.size(); ++i) {
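With this change, fp16 engine building is enabled when either the TVM_TENSORRT_USE_FP16 environment variable or the compile-time `use_fp16` config attribute is set. Inside TensorRTBuilder the flag ultimately toggles TensorRT's fp16 builder mode, roughly like this (a sketch of the TensorRT API usage; the exact call site is outside this diff):

    // TensorRT 6+ style builder configuration (sketch):
    if (use_fp16) {
      config_->setFlag(nvinfer1::BuilderFlag::kFP16);  // permit fp16 kernels / auto-conversion
    }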
@@ -492,6 +493,9 @@ class TensorRTRuntime : public JSONRuntimeBase {
    * encountered. Multi-engine mode should give better performance, at a cost of higher memory usage
    * and more time spent building engines. */
   bool multi_engine_mode_;
+
+  /*! \brief Use auto-conversion to fp16 */
+  bool use_fp16_;
 };
 
 runtime::Module TensorRTRuntimeCreate(const String& symbol_name, const String& graph_json,
