Commit
[TensorRT EP] avoid excessive library load/unload overhead when running unit tests. (#15639)

TensorRT loads and unloads libraries as builder objects are created and torn down. This happens for
every single unit test, and the resulting overhead leads to excessive test execution time. The overhead
has grown steadily over the past few TensorRT versions as the library objects have gotten bigger, to the
point where running all the unit tests takes 8 hours. Nvidia suggests keeping a placeholder builder
object around to avoid this.
jywu-msft authored Apr 24, 2023
1 parent c2acf69 commit 8dd32fe
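To see why this matters, here is a minimal standalone sketch (not part of the commit) that times two consecutive builder creations. It assumes a TensorRT 8.x installation with NvInfer.h on the include path and the binary linked against nvinfer; the NullLogger name is illustrative:

#include <chrono>
#include <iostream>
#include <memory>

#include "NvInfer.h"

// TensorRT requires an ILogger to create a builder; this one discards all messages.
class NullLogger : public nvinfer1::ILogger {
 public:
  void log(Severity /*severity*/, const char* /*msg*/) noexcept override {}
};

int main() {
  NullLogger logger;
  using Clock = std::chrono::steady_clock;

  // The first builder pays the full cost of loading the TensorRT kernel libraries.
  const auto t0 = Clock::now();
  auto placeholder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger));
  const auto t1 = Clock::now();

  // While `placeholder` stays alive, those libraries remain resident, so creating
  // and destroying further builders is comparatively cheap.
  auto second = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger));
  const auto t2 = Clock::now();

  const auto ms = [](auto d) { return std::chrono::duration_cast<std::chrono::milliseconds>(d).count(); };
  std::cout << "first builder:  " << ms(t1 - t0) << " ms\n";
  std::cout << "second builder: " << ms(t2 - t1) << " ms\n";
  return 0;
}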
Showing 8 changed files with 36 additions and 37 deletions.
1 change: 0 additions & 1 deletion cmake/CMakeLists.txt
@@ -89,7 +89,6 @@ option(onnxruntime_ENABLE_MICROSOFT_INTERNAL "Use this option to enable/disable
option(onnxruntime_USE_VITISAI "Build with Vitis-AI" OFF)
option(onnxruntime_USE_TENSORRT "Build with TensorRT support" OFF)
option(onnxruntime_USE_TENSORRT_BUILTIN_PARSER "Use TensorRT builtin parser" OFF)
- option(onnxruntime_TENSORRT_PLACEHOLDER_BUILDER "Instantiate Placeholder TensorRT Builder" OFF)
option(onnxruntime_ENABLE_LTO "Enable link time optimization" OFF)
option(onnxruntime_CROSS_COMPILING "Cross compiling onnx runtime" OFF)
option(onnxruntime_GCOV_COVERAGE "Compile with options necessary to run code coverage" OFF)
33 changes: 18 additions & 15 deletions cmake/onnxruntime_unittests.cmake
@@ -69,6 +69,10 @@ function(AddTest)
target_include_directories(${_UT_TARGET} PRIVATE ${NCCL_INCLUDE_DIRS})
endif()
endif()
+ if (onnxruntime_USE_TENSORRT)
+ # used for instantiating placeholder TRT builder to mitigate TRT library load/unload overhead
+ target_include_directories(${_UT_TARGET} PRIVATE ${TENSORRT_INCLUDE_DIR})
+ endif()

if(MSVC)
target_compile_options(${_UT_TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /utf-8>"
@@ -583,6 +587,7 @@ if(onnxruntime_USE_TENSORRT)
list(APPEND onnxruntime_test_framework_src_patterns "${ONNXRUNTIME_ROOT}/core/providers/tensorrt/tensorrt_execution_provider_utils.h")
list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_tensorrt)
list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_tensorrt onnxruntime_providers_shared)
+ list(APPEND onnxruntime_test_providers_libs ${TENSORRT_LIBRARY_INFER})
endif()

if(onnxruntime_USE_MIGRAPHX)
@@ -770,21 +775,13 @@ endif()

set(test_all_args)
if (onnxruntime_USE_TENSORRT)
- if (onnxruntime_SKIP_AND_PERFORM_FILTERED_TENSORRT_TESTS)
- # TRT EP package pipelines takes much longer time to run tests with TRT 8.5. We can't use placeholder to reduce testing time due to application test deadlock.
- # Therefore we only run filtered TRT EP tests.
- list(APPEND test_all_args "--gtest_filter=*tensorrt_*:*TensorrtExecutionProviderTest*" )
- #list(APPEND test_all_args "--gtest_filter=-*cpu_*:*cuda_*:*ContribOpTest*:*QuantGemmTest*:*QLinearConvTest*:*MurmurHash3OpTest*:*PadOpTest*:*QLinearConvTest*" )
- else()
- # TRT EP CI takes much longer time when updating to TRT 8.2
- # So, we only run trt ep and exclude other eps to reduce CI test time.
- #
- # The test names of model tests were using sequential number in the past.
- # This PR https://github.com/microsoft/onnxruntime/pull/10220 (Please see ExpandModelName function in model_tests.cc for more details)
- # made test name contain the "ep" and "model path" information, so we can easily filter the tests using cuda ep or other ep with *cpu_* or *xxx_*.
- list(APPEND test_all_args "--gtest_filter=-*cpu_*:*cuda_*" )
- endif()
-
+ # TRT EP CI takes much longer time when updating to TRT 8.2
+ # So, we only run trt ep and exclude other eps to reduce CI test time.
+ #
+ # The test names of model tests were using sequential number in the past.
+ # This PR https://github.com/microsoft/onnxruntime/pull/10220 (Please see ExpandModelName function in model_tests.cc for more details)
+ # made test name contain the "ep" and "model path" information, so we can easily filter the tests using cuda ep or other ep with *cpu_* or *xxx_*.
+ list(APPEND test_all_args "--gtest_filter=-*cpu_*:*cuda_*" )
endif ()

AddTest(
@@ -1202,6 +1199,9 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
if (onnxruntime_USE_CUDA)
list(APPEND onnxruntime_shared_lib_test_LIBS onnxruntime_test_cuda_ops_lib cudart)
endif()
+ if (onnxruntime_USE_TENSORRT)
+ list(APPEND onnxruntime_shared_lib_test_LIBS ${TENSORRT_LIBRARY_INFER})
+ endif()
if (CMAKE_SYSTEM_NAME STREQUAL "Android")
list(APPEND onnxruntime_shared_lib_test_LIBS ${android_shared_libs})
endif()
@@ -1465,6 +1465,9 @@ if (NOT onnxruntime_BUILD_WEBASSEMBLY)
${ONNXRUNTIME_CUSTOM_OP_REGISTRATION_TEST_SRC_DIR}/test_registercustomops.cc)

set(onnxruntime_customopregistration_test_LIBS custom_op_library onnxruntime_common onnxruntime_test_utils)
+ if (onnxruntime_USE_TENSORRT)
+ list(APPEND onnxruntime_customopregistration_test_LIBS ${TENSORRT_LIBRARY_INFER})
+ endif()
AddTest(DYN
TARGET onnxruntime_customopregistration_test
SOURCES ${onnxruntime_customopregistration_test_SRC} ${onnxruntime_unittest_main_src}
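Two notes on the changes above. First, the test binaries now link ${TENSORRT_LIBRARY_INFER} and see ${TENSORRT_INCLUDE_DIR} because the unit-test entry point itself (test_main.cc, below) now calls nvinfer1::createInferBuilder directly, rather than relying on the provider to do so. Second, on the filter syntax: passing --gtest_filter=-*cpu_*:*cuda_* to onnxruntime_test_all excludes every test whose name matches *cpu_* or *cuda_*, since a leading '-' turns the colon-separated pattern list into an exclusion list.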
6 changes: 0 additions & 6 deletions onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -363,12 +363,6 @@ std::unique_lock<OrtMutex> TensorrtExecutionProvider::GetApiLock() const {
return std::unique_lock<OrtMutex>(singleton);
}

- #ifdef ORT_TENSORRT_PLACEHOLDER_BUILDER
- // instantiate global unused builder object which keeps the TRT kernel library in memory
- // so that subsequent builders avoid the expensive load / unload process.
- auto const placeholder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(GetTensorrtLogger()));
- #endif
-
TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProviderInfo& info)
: IExecutionProvider{onnxruntime::kTensorrtExecutionProvider, true}, info_(info), device_id_(info.device_id) {
InitProviderOrtApi();
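Note the relocation: the placeholder builder previously lived in the execution provider itself, behind the opt-in ORT_TENSORRT_PLACEHOLDER_BUILDER define. The commit deletes it here and instead instantiates one unconditionally in the unit-test entry point (next file), so the mitigation always applies to the test binaries and the provider library no longer carries it.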
14 changes: 14 additions & 0 deletions onnxruntime/test/unittest_main/test_main.cc
@@ -25,6 +25,20 @@ void ortenv_setup() {
ort_env.reset(new Ort::Env(&tpo, ORT_LOGGING_LEVEL_WARNING, "Default"));
}

+ #ifdef USE_TENSORRT
+ // TensorRT will load/unload libraries as builder objects are created and torn down. This will happen for
+ // every single unit test, which leads to excessive test execution time due to that overhead.
+ // Nvidia suggests to keep a placeholder builder object around to avoid this.
+ #include "NvInfer.h"
+ class DummyLogger : public nvinfer1::ILogger {
+ public:
+ DummyLogger(Severity verbosity) {}
+ void log(Severity severity, const char* msg) noexcept override {}
+ };
+ DummyLogger trt_logger(nvinfer1::ILogger::Severity::kWARNING);
+ auto const placeholder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(trt_logger));
+ #endif
+
#define TEST_MAIN main

#if defined(__APPLE__)
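Since the placeholder above is a namespace-scope object with static storage duration, it is constructed during static initialization, before any test runs, and destroyed only at process exit, so every test in the binary executes with the TensorRT libraries already resident.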
11 changes: 0 additions & 11 deletions tools/ci_build/build.py
@@ -502,9 +502,6 @@ def convert_arg_line_to_args(self, arg_line):
"--use_tensorrt_builtin_parser", action="store_true", default=True, help="Use TensorRT builtin parser"
)
parser.add_argument("--use_tensorrt_oss_parser", action="store_true", help="Use TensorRT OSS parser")
- parser.add_argument(
- "--tensorrt_placeholder_builder", action="store_true", help="Instantiate Placeholder TensorRT Builder"
- )
parser.add_argument("--tensorrt_home", help="Path to TensorRT installation dir")
parser.add_argument("--test_all_timeout", default="10800", help="Set timeout for onnxruntime_test_all")
parser.add_argument("--use_migraphx", action="store_true", help="Build with MIGraphX")
@@ -911,11 +908,8 @@ def generate_build_tree(
"-Donnxruntime_ENABLE_MICROSOFT_INTERNAL=" + ("ON" if args.enable_msinternal else "OFF"),
"-Donnxruntime_USE_VITISAI=" + ("ON" if args.use_vitisai else "OFF"),
"-Donnxruntime_USE_TENSORRT=" + ("ON" if args.use_tensorrt else "OFF"),
"-Donnxruntime_SKIP_AND_PERFORM_FILTERED_TENSORRT_TESTS="
+ ("ON" if not args.tensorrt_placeholder_builder else "OFF"),
"-Donnxruntime_USE_TENSORRT_BUILTIN_PARSER="
+ ("ON" if args.use_tensorrt_builtin_parser and not args.use_tensorrt_oss_parser else "OFF"),
"-Donnxruntime_TENSORRT_PLACEHOLDER_BUILDER=" + ("ON" if args.tensorrt_placeholder_builder else "OFF"),
# set vars for TVM
"-Donnxruntime_USE_TVM=" + ("ON" if args.use_tvm else "OFF"),
"-Donnxruntime_TVM_CUDA_RUNTIME=" + ("ON" if args.use_tvm and args.tvm_cuda_runtime else "OFF"),
@@ -1749,11 +1743,6 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs):
run_subprocess(ctest_cmd, cwd=cwd, dll_path=dll_path)

if args.enable_pybind:
- # Disable python tests for TensorRT on Windows due to need to enable placeholder builder
- # to reduce test times.
- if args.use_tensorrt and is_windows():
- return
-
python_path = None
if args.use_tvm:
python_path = str((Path(build_dir) / config / "_deps" / "tvm-src" / "python").resolve())
1 change: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ jobs:
--build_wheel \
--enable_onnx_tests --use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \
--enable_pybind --build_java \
- --use_tensorrt --use_tensorrt_builtin_parser --tensorrt_placeholder_builder --tensorrt_home /usr \
+ --use_tensorrt --tensorrt_home /usr \
--cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc CMAKE_CUDA_ARCHITECTURES=75
workingDirectory: $(Build.SourcesDirectory)

1 change: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ stages:
docker run --gpus all -e CC=/opt/rh/devtoolset-11/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-11/root/usr/bin/c++ -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build \
--volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda118xtrt86build \
/opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release \
- --skip_submodule_sync --parallel --build_shared_lib ${{ parameters.buildJavaOption }} --use_tensorrt --use_tensorrt_builtin_parser --tensorrt_placeholder_builder --cuda_version=$(CUDA_VERSION) --cuda_home=/usr/local/cuda-$(CUDA_VERSION) --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80'
+ --skip_submodule_sync --parallel --build_shared_lib ${{ parameters.buildJavaOption }} --use_tensorrt --cuda_version=$(CUDA_VERSION) --cuda_home=/usr/local/cuda-$(CUDA_VERSION) --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-11/root/usr/bin/cc 'CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80'
workingDirectory: $(Build.SourcesDirectory)

- ${{ if eq(parameters.buildJava, true) }}:
2 changes: 2 additions & 2 deletions
@@ -54,7 +54,7 @@ jobs:
displayName: 'Generate cmake config'
inputs:
scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
- arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 16 2019" --build_wheel --enable_onnx_tests --use_tensorrt --use_tensorrt_builtin_parser --tensorrt_placeholder_builder --tensorrt_home="C:\local\TensorRT-8.6.0.12.Windows10.x86_64.cuda-11.8" --cuda_version=11.6 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75'
+ arguments: '--config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 16 2019" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="C:\local\TensorRT-8.6.0.12.Windows10.x86_64.cuda-11.8" --cuda_version=11.6 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75'
workingDirectory: '$(Build.BinariesDirectory)'

- task: VSBuild@1
@@ -84,7 +84,7 @@ jobs:
del wheel_filename_file
python.exe -m pip install -q --upgrade %WHEEL_FILENAME%
set PATH=$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig);%PATH%
- python $(Build.SourcesDirectory)\tools\ci_build\build.py --config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 16 2019" --build_wheel --enable_onnx_tests --use_tensorrt --use_tensorrt_builtin_parser --tensorrt_placeholder_builder --tensorrt_home="C:\local\TensorRT-8.6.0.12.Windows10.x86_64.cuda-11.8" --cuda_version=11.6 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75
+ python $(Build.SourcesDirectory)\tools\ci_build\build.py --config $(BuildConfig) --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 16 2019" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="C:\local\TensorRT-8.6.0.12.Windows10.x86_64.cuda-11.8" --cuda_version=11.6 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.6" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75
workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)'
displayName: 'Run tests'
