Commit 9b3fbe8

Michalis Papapdimitriou (mikepapadim) authored and committed
[BYOC][TENSORRT] Add support for FP16 on TensorRT BYOC flow (apache#10388)
* FP16 support for TRT
* Cleanups on tests
* Fix for typing on output tensor
* Fix ICHECK
* Add TRT inference builder auto-convert precision flags as attrs in the config
* Address PR comments
* Fix bug on passing the new config attrs to codegen for tensorrt partition

Co-authored-by: Michalis Papapdimitriou <[email protected]>
1 parent 601fa38 · commit 9b3fbe8

File tree

8 files changed: +416 additions, −296 deletions

python/tvm/relay/op/contrib/tensorrt.py

Lines changed: 65 additions & 75 deletions
Large diffs are not rendered by default.

src/relay/backend/contrib/tensorrt/codegen.cc

Lines changed: 12 additions & 1 deletion
@@ -46,6 +46,8 @@ struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfig
   bool use_implicit_batch;
   size_t max_workspace_size;
   bool remove_no_mac_subgraphs;
+  bool use_fp16;
+  bool use_uint8;
 
   TVM_DECLARE_ATTRS(TensorRTCompilerConfigNode, "ext.attrs.TensorRTCompilerConfigNode") {
     TVM_ATTR_FIELD(tensorrt_version)
@@ -54,6 +56,8 @@ struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfig
     TVM_ATTR_FIELD(use_implicit_batch).set_default(true);
     TVM_ATTR_FIELD(max_workspace_size).set_default(size_t(1) << 30);
     TVM_ATTR_FIELD(remove_no_mac_subgraphs).set_default(false);
+    TVM_ATTR_FIELD(use_fp16).set_default(false);
+    TVM_ATTR_FIELD(use_uint8).set_default(false);
   }
 };
@@ -215,13 +219,20 @@ class TensorRTJSONSerializer : public backend::contrib::JSONSerializer {
                                                 std::to_string(cfg.value()->tensorrt_version[2])};
     std::vector<std::string> use_implicit_batch = {std::to_string(cfg.value()->use_implicit_batch)};
     std::vector<std::string> max_workspace_size = {std::to_string(cfg.value()->max_workspace_size)};
-    std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr;
+    std::vector<std::string> use_fp16 = {std::to_string(cfg.value()->use_fp16)};
+    std::vector<std::string> use_uint8 = {std::to_string(cfg.value()->use_uint8)};
+    std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr,
+        use_fp16_attr, use_uint8_attr;
     tensorrt_version_attr.emplace_back(tensorrt_version);
     use_implicit_batch_attr.emplace_back(use_implicit_batch);
     max_workspace_size_attr.emplace_back(max_workspace_size);
+    use_fp16_attr.emplace_back(use_fp16);
+    use_uint8_attr.emplace_back(use_uint8);
     node->SetAttr("tensorrt_version", tensorrt_version_attr);
     node->SetAttr("use_implicit_batch", use_implicit_batch_attr);
     node->SetAttr("max_workspace_size", max_workspace_size_attr);
+    node->SetAttr("use_fp16", use_fp16_attr);
+    node->SetAttr("use_uint8", use_uint8_attr);
   }
 };
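The serializer stores each flag as a stringified integer attribute on the JSON subgraph node. The runtime half of the round-trip is outside this diff, but reading one of these attributes back would look roughly like the following sketch (attribute names match the SetAttr calls above; `use_fp16_` is the runtime member introduced in tensorrt_runtime.cc below, and the surrounding JSONGraphNode helpers are TVM's json-runtime API):

    // Minimal sketch, assuming a tvm::runtime::json::JSONGraphNode `node`.
    // SetAttr above stored {"0"} or {"1"}, so parse the first string element.
    if (node.HasAttr("use_fp16")) {
      use_fp16_ = std::stoi(node.GetAttr<std::vector<std::string>>("use_fp16")[0]) != 0;
    }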

src/runtime/contrib/tensorrt/tensorrt_builder.cc

Lines changed: 18 additions & 11 deletions
@@ -85,8 +85,13 @@ void TensorRTBuilder::AddInput(int nid, uint32_t entry_id, const JSONGraphNode&
       shape.erase(shape.begin());
     }
     nvinfer1::Dims dims = VectorToTrtDims(shape);
-    ICHECK(TypeMatch(dtypes[i], kDLFloat, 32)) << "Only FP32 inputs are supported.";
-    auto input_tensor = network_->addInput(name.c_str(), nvinfer1::DataType::kFLOAT, dims);
+    ICHECK((dtypes[i].bits != 16 || dtypes[i].bits != 32))
+        << "Invalid input Tensor type. Float16 and Float32 are supported";
+
+    auto tensor_dtype =
+        (dtypes[i].bits == 16) ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT;
+
+    auto input_tensor = network_->addInput(name.c_str(), tensor_dtype, dims);
     node_output_map_[nid].push_back(TensorRTOpInput(input_tensor));
     network_input_names_.push_back(name);
     entry_id_map_[name] = entry_id + i;
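Note: as written, `dtypes[i].bits != 16 || dtypes[i].bits != 32` is true for every value (any bit-width differs from at least one of 16 and 32), so this ICHECK can never fire; the same pattern recurs in GetDLTensorAsWeights below. A guard that actually enforces the stated fp16/fp32 invariant would be (a suggested sketch, not part of this commit):

    // Suggested form: fail unless the dtype really is float16 or float32.
    ICHECK(dtypes[i].code == kDLFloat && (dtypes[i].bits == 16 || dtypes[i].bits == 32))
        << "Invalid input Tensor type. Float16 and Float32 are supported";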
@@ -141,15 +146,18 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) {
     }
     params.inputs.push_back(input);
   }
-  ICHECK(converter->variable_input_count || converter->input_types.size() == params.inputs.size())
-      << "Op expected a different number of inputs.";
 
   // Convert op to TRT.
   converter->Convert(&params);
 
   // Get outputs.
   node_output_map_[nid] = {};
   for (auto out : params.outputs) {
+    auto out_type = params.inputs.at(1).weight.type == params.inputs.at(0).tensor->getType()
+                        ? params.inputs.at(0).tensor->getType()
+                        : params.inputs.at(1).weight.type;
+    out->setType(out_type);
+
    node_output_map_[nid].push_back(TensorRTOpInput(out));
   }
 }
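Two things are worth noting about the new output-type selection: both branches of the ternary evaluate to `params.inputs.at(1).weight.type` (when the two types are equal, either operand will do), and the `at(0)`/`at(1)` lookups assume every op has a tensor at input 0 and a weight at input 1. A more defensive variant might read (a sketch, not part of this commit):

    // Sketch: default to the first tensor input's dtype; prefer the weight's
    // dtype when a weight input exists, so fp16 weights propagate to outputs.
    nvinfer1::DataType out_type = params.inputs.at(0).tensor->getType();
    if (params.inputs.size() > 1 && params.inputs.at(1).type == kWeight) {
      out_type = params.inputs.at(1).weight.type;
    }
    out->setType(out_type);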
@@ -205,18 +213,17 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() {
 nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr,
                                                         DLDeviceType src_device) {
   ICHECK_EQ(dptr->device.device_type, src_device);
-  ICHECK(static_cast<int>(dptr->dtype.code) == kDLFloat ||
-         static_cast<int>(dptr->dtype.code) == kDLInt);
-  const auto trt_dtype = static_cast<int>(dptr->dtype.code) == kDLFloat
-                             ? nvinfer1::DataType::kFLOAT
-                             : nvinfer1::DataType::kINT32;
+  ICHECK((dptr->dtype.bits != 16 || dptr->dtype.bits != 32))
+      << "Invalid input Tensor type. Float16 and Float32 are supported";
+  const auto trt_dtype = (static_cast<int>(dptr->dtype.bits) == 16) ? nvinfer1::DataType::kHALF
+                                                                    : nvinfer1::DataType::kFLOAT;
+
   const size_t weight_bytes = GetDataSize(*dptr);
   nvinfer1::Weights weight{trt_dtype, nullptr, 0};
   size_t count = 1;
   for (tvm_index_t i = 0; i < dptr->ndim; ++i) {
     count *= dptr->shape[i];
   }
-  ICHECK_EQ(count * 4, weight_bytes);
   weight.count = count;
   weight.values = new float[count];
   ICHECK_EQ(TVMArrayCopyToBytes(const_cast<DLTensor*>(dptr), const_cast<void*>(weight.values),
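The deleted `ICHECK_EQ(count * 4, weight_bytes)` hard-coded 4-byte elements, which no longer holds once weights can be 2-byte fp16. Rather than dropping the sanity check entirely, a dtype-aware version could keep it (a sketch, not part of this commit):

    // Sketch: element size derived from the dtype instead of a literal 4.
    ICHECK_EQ(count * (dptr->dtype.bits / 8), weight_bytes);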
@@ -250,7 +257,7 @@ void TensorRTBuilder::CleanUp() {
 #endif
   builder_->destroy();
   for (auto weight : trt_weights_) {
-    if (weight.type == nvinfer1::DataType::kFLOAT) {
+    if (weight.type == nvinfer1::DataType::kFLOAT || weight.type == nvinfer1::DataType::kHALF) {
       delete[] static_cast<const float*>(weight.values);
     } else {
       delete[] static_cast<const uint16_t*>(weight.values);

src/runtime/contrib/tensorrt/tensorrt_builder.h

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ class TensorRTBuilder {
    * \param logger TensorRT logger to use for errors and warnings.
    * \param max_workspace_size Workspace size parameter for TensorRT engine build phase.
    * \param use_implicit_batch Whether to use implicit batch mode (default)
-   * \param use_fp16 Whether to use implicit batch mode (default)
+   * \param use_fp16 Whether to automatically convert a model to fp16
    * \param batch_size If use_implicit_batch,
    */
   TensorRTBuilder(TensorRTLogger* logger, const std::vector<const DLTensor*>& data_entry,

src/runtime/contrib/tensorrt/tensorrt_ops.cc

Lines changed: 26 additions & 12 deletions
@@ -49,6 +49,7 @@ nvinfer1::ITensor* TensorRTOpConverter::Reshape(TensorRTOpConverterParams* param
   auto layer = params->network->addShuffle(*input);
   ICHECK(layer != nullptr);
   layer->setReshapeDimensions(VectorToTrtDims(new_shape));
+  layer->setOutputType(0, input->getType());
   return layer->getOutput(0);
 }
 
@@ -99,7 +100,8 @@ nvinfer1::ITensor* TensorRTOpConverter::CreateScalar(
   std::fill_n(dims.d, dims.nbDims, 1);
   float* values = new float[1];
   values[0] = value;
-  nvinfer1::Weights weights{nvinfer1::DataType::kFLOAT, static_cast<void*>(values), 1};
+  const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+  nvinfer1::Weights weights{weight_type, static_cast<void*>(values), 1};
   params->trt_weights->push_back(weights);
   return params->network->addConstant(dims, weights)->getOutput(0);
 }
@@ -252,7 +254,9 @@ class Conv1DOpConverter : public TensorRTOpConverter {
     input_tensor = shuffle_layer->getOutput(0);
 
     const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], 1);
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+
+    nvinfer1::Weights bias{weight_type, nullptr, 0};
 
     auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size,
                                                       params->inputs.at(1).weight, bias);
@@ -313,7 +317,8 @@ class Conv2DOpConverter : public TensorRTOpConverter {
 #endif
 
     const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]);
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+    nvinfer1::Weights bias{weight_type, nullptr, 0};
     auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size,
                                                       params->inputs.at(1).weight, bias);
     ICHECK(conv_layer != nullptr);
@@ -361,7 +366,8 @@ class Conv3DOpConverter : public TensorRTOpConverter {
     const int num_outputs =
         std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
     const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]);
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+    nvinfer1::Weights bias{weight_type, nullptr, 0};
     auto conv_layer = params->network->addConvolutionNd(*input_tensor, num_outputs, kernel_size,
                                                         params->inputs.at(1).weight, bias);
     ICHECK(conv_layer != nullptr);
@@ -404,7 +410,8 @@ class DenseOpConverter : public TensorRTOpConverter {
     // Weights are in KC format.
     ICHECK_EQ(params->inputs.at(1).weight_shape.size(), 2);
     const int num_units = params->inputs.at(1).weight_shape[0];
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+    nvinfer1::Weights bias{weight_type, nullptr, 0};
     nvinfer1::IFullyConnectedLayer* fc_layer = params->network->addFullyConnected(
         *input_tensor, num_units, params->inputs.at(1).weight, bias);
     ICHECK(fc_layer != nullptr);
@@ -466,12 +473,15 @@ class BatchNormOpConverter : public TensorRTOpConverter {
     }
 
     void* weight_scale_ptr = new float[gamma.count];
-    nvinfer1::Weights weight_scale{nvinfer1::DataType::kFLOAT, weight_scale_ptr, gamma.count};
+    const nvinfer1::DataType weight_type_scale = params->inputs.at(1).weight.type;
+    nvinfer1::Weights weight_scale{weight_type_scale, weight_scale_ptr, gamma.count};
     params->trt_weights->push_back(weight_scale);
     void* weight_shift_ptr = new float[gamma.count];
-    nvinfer1::Weights weight_shift{nvinfer1::DataType::kFLOAT, weight_shift_ptr, gamma.count};
+    const nvinfer1::DataType weight_type_shift = params->inputs.at(2).weight.type;
+    nvinfer1::Weights weight_shift{weight_type_shift, weight_shift_ptr, gamma.count};
     params->trt_weights->push_back(weight_shift);
-    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type_power = params->inputs.at(3).weight.type;
+    nvinfer1::Weights power{weight_type_power, nullptr, 0};
 
     // fill in the content of weights for the Scale layer
     const float* gamma_ptr = reinterpret_cast<const float*>(gamma.values);
@@ -911,8 +921,10 @@ class BiasAddOpConverter : public TensorRTOpConverter {
       input_tensor = Reshape(params, input_tensor, new_shape);
     }
 
-    nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, nullptr, 0};
-    nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+
+    nvinfer1::Weights shift{weight_type, nullptr, 0};
+    nvinfer1::Weights power{weight_type, nullptr, 0};
     nvinfer1::IScaleLayer* scale_layer = params->network->addScale(
         *input_tensor, nvinfer1::ScaleMode::kCHANNEL, params->inputs.at(1).weight, shift, power);
     ICHECK(scale_layer != nullptr);
@@ -962,7 +974,8 @@ class Conv2DTransposeOpConverter : public TensorRTOpConverter {
     const int num_outputs =
         std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
     const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]);
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+    nvinfer1::Weights bias{weight_type, nullptr, 0};
     auto deconv_layer = params->network->addDeconvolution(*input_tensor, num_outputs, kernel_size,
                                                           params->inputs.at(1).weight, bias);
     ICHECK(deconv_layer != nullptr);
@@ -1020,7 +1033,8 @@ class Conv3DTransposeOpConverter : public TensorRTOpConverter {
     const int num_outputs =
         std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
     const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]);
-    nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
+    nvinfer1::Weights bias{weight_type, nullptr, 0};
     auto deconv_layer = params->network->addDeconvolutionNd(*input_tensor, num_outputs, kernel_size,
                                                             params->inputs.at(1).weight, bias);
     ICHECK(deconv_layer != nullptr);
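Every converter above repeats the same move: the dtype for auxiliary weights (bias, shift, power) is taken from the op's weight input instead of being hard-coded to kFLOAT, so it tracks fp16 automatically. A small helper would capture the repetition (a refactoring sketch, not part of this commit; it assumes, as the converters do, that the weight sits at input index 1):

    // Hypothetical helper: dtype for zero-filled auxiliary weights,
    // taken from the op's weight input so it follows fp16/fp32.
    inline nvinfer1::DataType AuxWeightType(const TensorRTOpConverterParams* params) {
      return params->inputs.at(1).weight.type;
    }
    // Usage inside a converter:
    //   nvinfer1::Weights bias{AuxWeightType(params), nullptr, 0};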

src/runtime/contrib/tensorrt/tensorrt_ops.h

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ struct TensorRTOpInput {
   std::vector<int> weight_shape;
 
   explicit TensorRTOpInput(nvinfer1::ITensor* tensor)
-      : tensor(tensor), weight({nvinfer1::DataType::kFLOAT, nullptr, 0}), type(kTensor) {}
+      : tensor(tensor), weight({tensor->getType(), nullptr, 0}), type(kTensor) {}
   TensorRTOpInput(nvinfer1::Weights weight, const std::vector<int>& shape)
       : tensor(nullptr), weight(weight), type(kWeight), weight_shape(shape) {}
 };
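This one-line change is what makes the converter pattern above safe for pure-tensor inputs: a TensorRTOpInput wrapping an ITensor used to carry a dummy kFLOAT weight, so `weight.type` misreported fp16 tensors; now the dummy weight mirrors the tensor's own dtype. For example (a sketch; `fp16_tensor` is a hypothetical nvinfer1::ITensor* whose type is kHALF):

    TensorRTOpInput in(fp16_tensor);
    // Before this commit: in.weight.type == nvinfer1::DataType::kFLOAT (always).
    // After this commit:  in.weight.type == nvinfer1::DataType::kHALF.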

src/runtime/contrib/tensorrt/tensorrt_runtime.cc

Lines changed: 6 additions & 2 deletions
@@ -72,7 +72,8 @@ class TensorRTRuntime : public JSONRuntimeBase {
         use_implicit_batch_(true),
         max_workspace_size_(size_t(1) << 30),
         max_batch_size_(-1),
-        multi_engine_mode_(false) {
+        multi_engine_mode_(false),
+        use_fp16_(false) {
     const bool use_int8 = dmlc::GetEnv("TVM_TENSORRT_USE_INT8", false);
     multi_engine_mode_ = dmlc::GetEnv("TVM_TENSORRT_MULTI_ENGINE", false);
     num_calibration_batches_remaining_ = dmlc::GetEnv("TENSORRT_NUM_CALI_INT8", 0);
@@ -304,7 +305,7 @@ class TensorRTRuntime : public JSONRuntimeBase {
   }
 
   void BuildEngineFromJson(int batch_size) {
-    const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false);
+    const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false) || use_fp16_;
     TensorRTBuilder builder(&logger_, data_entry_, max_workspace_size_, use_implicit_batch_,
                             use_fp16, batch_size, calibrator_.get());
    for (size_t i = 0; i < input_nodes_.size(); ++i) {
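With this change, fp16 engine building is enabled when either the TVM_TENSORRT_USE_FP16 environment variable or the compile-time `use_fp16` config attribute is set. Inside TensorRTBuilder the flag ultimately toggles TensorRT's fp16 builder mode, roughly like this (a sketch of the TensorRT API usage; the exact call site is outside this diff):

    // TensorRT 6+ style builder configuration (sketch):
    if (use_fp16) {
      config_->setFlag(nvinfer1::BuilderFlag::kFP16);  // permit fp16 kernels / auto-conversion
    }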
@@ -492,6 +493,9 @@ class TensorRTRuntime : public JSONRuntimeBase {
    * encountered. Multi-engine mode should give better performance, at a cost of higher memory usage
    * and more time spent building engines. */
   bool multi_engine_mode_;
+
+  /*! \brief Use auto-conversion to fp16 */
+  bool use_fp16_;
 };
 
 runtime::Module TensorRTRuntimeCreate(const String& symbol_name, const String& graph_json,
