
Commit 6a6640e

Author: Michalis Papadimitriou (committed)
Add TRT inference builder auto-convert precision flags as attrs in the config
1 parent d357c32 commit 6a6640e

File tree

4 files changed: +25 -4 lines changed

  python/tvm/relay/op/contrib/tensorrt.py
  src/relay/backend/contrib/tensorrt/codegen.cc
  src/runtime/contrib/tensorrt/tensorrt_builder.cc
  src/runtime/contrib/tensorrt/tensorrt_runtime.cc


python/tvm/relay/op/contrib/tensorrt.py

Lines changed: 9 additions & 0 deletions
@@ -87,6 +87,8 @@ def partition_for_tensorrt(
     use_implicit_batch=True,
     remove_no_mac_subgraphs=False,
     max_workspace_size=1 << 30,
+    use_fp16=False,
+    use_uint8=False,
 ):
     """Partition the graph greedily offloading supported operators to TensorRT.
@@ -110,6 +112,11 @@ def partition_for_tensorrt(
     max_workspace_size : Optional[int]
         How many bytes of workspace size to allow each subgraph to use for TensorRT engine creation.
         See TensorRT documentation for more info.
+    use_fp16 : Optional[bool]
+        Allows TRT to automatically convert FP32 inputs to FP16. Must also be enabled when FP16 input tensors and weights are used.
+        Note that TensorRT will still choose a higher-precision kernel if it results in overall lower runtime, or if no low-precision implementation exists.
+    use_uint8 : Optional[bool]
+        Allows TRT to automatically convert FP32 inputs to UINT8.
     Returns
     -------
     mod_and_config : Tuple[Module, Dict[str, Any]]
@@ -120,6 +127,8 @@ def partition_for_tensorrt(
         "use_implicit_batch": use_implicit_batch,
         "max_workspace_size": max_workspace_size,
         "remove_no_mac_subgraphs": remove_no_mac_subgraphs,
+        "use_fp16": use_fp16,
+        "use_uint8": use_uint8,
     }
     if version:
         assert isinstance(version, tuple) and len(version) == 3
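
For reference, a minimal sketch of how the new flags surface in this Python API. The ResNet-18 workload is only an illustrative choice, not part of this commit:

import tvm
from tvm import relay
from tvm.relay import testing
from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt

# Any Relay module works here; ResNet-18 is just an illustrative workload.
mod, params = testing.resnet.get_workload(num_layers=18, batch_size=1)

# The new flags default to False, so existing callers are unchanged.
mod, config = partition_for_tensorrt(mod, params, use_fp16=True)

# The returned config dict now carries the precision flags alongside
# the existing options.
assert config["use_fp16"] is True
assert config["use_uint8"] is False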

src/relay/backend/contrib/tensorrt/codegen.cc

Lines changed: 10 additions & 1 deletion
@@ -46,6 +46,8 @@ struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfigNode> {
   bool use_implicit_batch;
   size_t max_workspace_size;
   bool remove_no_mac_subgraphs;
+  bool use_fp16;
+  bool use_uint8;

   TVM_DECLARE_ATTRS(TensorRTCompilerConfigNode, "ext.attrs.TensorRTCompilerConfigNode") {
     TVM_ATTR_FIELD(tensorrt_version)
@@ -54,6 +56,8 @@ struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfigNode> {
     TVM_ATTR_FIELD(use_implicit_batch).set_default(true);
     TVM_ATTR_FIELD(max_workspace_size).set_default(size_t(1) << 30);
     TVM_ATTR_FIELD(remove_no_mac_subgraphs).set_default(false);
+    TVM_ATTR_FIELD(use_fp16).set_default(false);
+    TVM_ATTR_FIELD(use_uint8).set_default(false);
   }
 };

@@ -215,13 +219,18 @@ class TensorRTJSONSerializer : public backend::contrib::JSONSerializer {
                                                 std::to_string(cfg.value()->tensorrt_version[2])};
     std::vector<std::string> use_implicit_batch = {std::to_string(cfg.value()->use_implicit_batch)};
     std::vector<std::string> max_workspace_size = {std::to_string(cfg.value()->max_workspace_size)};
-    std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr;
+    std::vector<std::string> use_fp16 = {std::to_string(cfg.value()->use_fp16)};
+    std::vector<std::string> use_uint8 = {std::to_string(cfg.value()->use_uint8)};
+    std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr,
+        use_fp16_attr, use_uint8_attr;
     tensorrt_version_attr.emplace_back(tensorrt_version);
     use_implicit_batch_attr.emplace_back(use_implicit_batch);
     max_workspace_size_attr.emplace_back(max_workspace_size);
     node->SetAttr("tensorrt_version", tensorrt_version_attr);
     node->SetAttr("use_implicit_batch", use_implicit_batch_attr);
     node->SetAttr("max_workspace_size", max_workspace_size_attr);
+    node->SetAttr("use_fp16", use_fp16_attr);
+    node->SetAttr("use_uint8", use_uint8_attr);
   }
 };
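These attrs are read from the PassContext config at compile time. A hedged sketch of passing the config dict returned by partition_for_tensorrt through to the build, so the serializer above embeds the flags in the JSON subgraph; the "relay.ext.tensorrt.options" key is TVM's config option for this backend, and the "cuda" target is an assumption for illustration:

import tvm
from tvm import relay

# `mod`, `params`, and `config` are the values from the
# partition_for_tensorrt example above.
with tvm.transform.PassContext(
    opt_level=3, config={"relay.ext.tensorrt.options": config}
):
    lib = relay.build(mod, target="cuda", params=params)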

src/runtime/contrib/tensorrt/tensorrt_builder.cc

Lines changed: 0 additions & 1 deletion
@@ -141,7 +141,6 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) {
                                         << " requires weights but got a tensor.";
       }
     }
-    VLOG(1) << "INT " << input.type;
     params.inputs.push_back(input);
   }

src/runtime/contrib/tensorrt/tensorrt_runtime.cc

Lines changed: 6 additions & 2 deletions
@@ -72,7 +72,8 @@ class TensorRTRuntime : public JSONRuntimeBase {
         use_implicit_batch_(true),
         max_workspace_size_(size_t(1) << 30),
         max_batch_size_(-1),
-        multi_engine_mode_(false) {
+        multi_engine_mode_(false),
+        use_fp16_(false) {
     const bool use_int8 = dmlc::GetEnv("TVM_TENSORRT_USE_INT8", false);
     multi_engine_mode_ = dmlc::GetEnv("TVM_TENSORRT_MULTI_ENGINE", false);
     num_calibration_batches_remaining_ = dmlc::GetEnv("TENSORRT_NUM_CALI_INT8", 0);
@@ -304,7 +305,7 @@ class TensorRTRuntime : public JSONRuntimeBase {
   }

   void BuildEngineFromJson(int batch_size) {
-    const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false);
+    const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false) || use_fp16_;
     TensorRTBuilder builder(&logger_, data_entry_, max_workspace_size_, use_implicit_batch_,
                             use_fp16, batch_size, calibrator_.get());
     for (size_t i = 0; i < input_nodes_.size(); ++i) {
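Because the config attr is OR'ed with the existing environment variable, FP16 engines can still be forced at runtime without repartitioning. A small sketch; the variable name comes from the diff above:

import os

# Either mechanism enables FP16 engine building: the TVM_TENSORRT_USE_FP16
# environment variable checked in BuildEngineFromJson, or the use_fp16
# attr serialized with the subgraph.
os.environ["TVM_TENSORRT_USE_FP16"] = "1"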
@@ -492,6 +493,9 @@ class TensorRTRuntime : public JSONRuntimeBase {
    * encountered. Multi-engine mode should give better performance, at a cost of higher memory usage
    * and more time spent building engines. */
   bool multi_engine_mode_;
+
+  /*! \brief Use auto-conversion to FP16. */
+  bool use_fp16_;
 };

 runtime::Module TensorRTRuntimeCreate(const String& symbol_name, const String& graph_json,
