Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,15 @@ if(onnxruntime_USE_TENSORRT)
list(APPEND onnxruntime_test_providers_libs ${TENSORRT_LIBRARY_INFER})
endif()

# Wire the NV TensorRT RTX execution provider into the unit-test build.
if(onnxruntime_USE_NV)
  # Pick up all NV TensorRT RTX provider tests plus the provider's utility header.
  list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/nv_tensorrt_rtx/*)
  list(APPEND onnxruntime_test_framework_src_patterns "${ONNXRUNTIME_ROOT}/core/providers/nv_tensorrt_rtx/nv_execution_provider_utils.h")
  # Link the provider library into the test framework.
  list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_nv_tensorrt_rtx)
  # Provider tests depend on the provider target and the shared-provider bridge.
  list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_nv_tensorrt_rtx onnxruntime_providers_shared)
  # NOTE(review): assumes TENSORRT_LIBRARY_INFER was located by earlier TensorRT
  # find logic -- confirm it is set when only onnxruntime_USE_NV is enabled.
  list(APPEND onnxruntime_test_providers_libs ${TENSORRT_LIBRARY_INFER})
endif()


if(onnxruntime_USE_MIGRAPHX)
list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/migraphx/*)
list(APPEND onnxruntime_test_framework_src_patterns "${ONNXRUNTIME_ROOT}/core/providers/migraphx/migraphx_execution_provider_utils.h")
Expand Down
6 changes: 3 additions & 3 deletions include/onnxruntime/core/session/onnxruntime_cxx_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -970,9 +970,9 @@ struct SessionOptionsImpl : ConstSessionOptionsImpl<T> {
SessionOptionsImpl& AppendExecutionProvider_OpenVINO(const OrtOpenVINOProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_OpenVINO
///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_OpenVINO_V2
SessionOptionsImpl& AppendExecutionProvider_OpenVINO_V2(const std::unordered_map<std::string, std::string>& provider_options = {});
SessionOptionsImpl& AppendExecutionProvider_TensorRT(const OrtTensorRTProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_TensorRT
SessionOptionsImpl& AppendExecutionProvider_TensorRT_V2(const OrtTensorRTProviderOptionsV2& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_TensorRT
SessionOptionsImpl& AppendExecutionProvider_MIGraphX(const OrtMIGraphXProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_MIGraphX
SessionOptionsImpl& AppendExecutionProvider_TensorRT(const OrtTensorRTProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_TensorRT
SessionOptionsImpl& AppendExecutionProvider_TensorRT_V2(const OrtTensorRTProviderOptionsV2& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_TensorRT
SessionOptionsImpl& AppendExecutionProvider_MIGraphX(const OrtMIGraphXProviderOptions& provider_options); ///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_MIGraphX
///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_CANN
SessionOptionsImpl& AppendExecutionProvider_CANN(const OrtCANNProviderOptions& provider_options);
///< Wraps OrtApi::SessionOptionsAppendExecutionProvider_Dnnl
Expand Down
1,556 changes: 194 additions & 1,362 deletions onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc

Large diffs are not rendered by default.

77 changes: 3 additions & 74 deletions onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,50 +18,6 @@ typedef void* cudnnStatus_t;

namespace onnxruntime {

// Environment-variable names that override NV TensorRT RTX execution-provider
// options at runtime. This namespace only centralizes the key strings; the
// values are read by the provider implementation.
// C++17 `inline` variables: one definition shared by every translation unit
// that includes this header, instead of the per-TU copy (and per-TU dynamic
// initialization) that a header-level `static` would create. The type stays
// std::string so existing callers are unaffected.
namespace nv_env_vars {
inline const std::string kMaxPartitionIterations = "ORT_NV_MAX_PARTITION_ITERATIONS";
inline const std::string kMinSubgraphSize = "ORT_NV_MIN_SUBGRAPH_SIZE";
inline const std::string kMaxWorkspaceSize = "ORT_NV_MAX_WORKSPACE_SIZE";
inline const std::string kFP16Enable = "ORT_NV_FP16_ENABLE";
inline const std::string kINT8Enable = "ORT_NV_INT8_ENABLE";
inline const std::string kINT8CalibrationTableName = "ORT_NV_INT8_CALIBRATION_TABLE_NAME";
inline const std::string kINT8UseNativeTensorrtCalibrationTable = "ORT_NV_INT8_USE_NATIVE_CALIBRATION_TABLE";
inline const std::string kDLAEnable = "ORT_NV_DLA_ENABLE";
inline const std::string kDLACore = "ORT_NV_DLA_CORE";
inline const std::string kDumpSubgraphs = "ORT_NV_DUMP_SUBGRAPHS";
inline const std::string kEngineCacheEnable = "ORT_NV_ENGINE_CACHE_ENABLE";
inline const std::string kCachePath = "ORT_NV_CACHE_PATH";
inline const std::string kWeightStrippedEngineEnable = "ORT_NV_WEIGHT_STRIPPED_ENGINE_ENABLE";
inline const std::string kOnnxModelFolderPath = "ORT_NV_ONNX_MODEL_FOLDER_PATH";
// As a timing cache can be used across multiple ONNX files it makes sense to have a separate cache path
inline const std::string kTimingCachePath = "ORT_NV_GLOBAL_CACHE_PATH";
inline const std::string kDecryptionEnable = "ORT_NV_ENGINE_DECRYPTION_ENABLE";
inline const std::string kDecryptionLibPath = "ORT_NV_ENGINE_DECRYPTION_LIB_PATH";
inline const std::string kForceSequentialEngineBuild = "ORT_NV_FORCE_SEQUENTIAL_ENGINE_BUILD";
inline const std::string kContextMemorySharingEnable = "ORT_NV_CONTEXT_MEMORY_SHARING_ENABLE";
inline const std::string kLayerNormFP32Fallback = "ORT_NV_LAYER_NORM_FP32_FALLBACK";
inline const std::string kTimingCacheEnable = "ORT_NV_TIMING_CACHE_ENABLE";
inline const std::string kForceTimingCache = "ORT_NV_FORCE_TIMING_CACHE_ENABLE";
inline const std::string kDetailedBuildLog = "ORT_NV_DETAILED_BUILD_LOG_ENABLE";
inline const std::string kBuildHeuristics = "ORT_NV_BUILD_HEURISTICS_ENABLE";
inline const std::string kSparsityEnable = "ORT_NV_SPARSITY_ENABLE";
inline const std::string kBuilderOptimizationLevel = "ORT_NV_BUILDER_OPTIMIZATION_LEVEL";
inline const std::string kAuxiliaryStreams = "ORT_NV_AUXILIARY_STREAMS";
inline const std::string kTacticSources = "ORT_NV_TACTIC_SOURCES";
inline const std::string kExtraPluginLibPaths = "ORT_NV_EXTRA_PLUGIN_LIB_PATHS";
inline const std::string kProfilesMinShapes = "ORT_NV_PROFILE_MIN_SHAPES";
inline const std::string kProfilesMaxShapes = "ORT_NV_PROFILE_MAX_SHAPES";
inline const std::string kProfilesOptShapes = "ORT_NV_PROFILE_OPT_SHAPES";
inline const std::string kCudaGraphEnable = "ORT_NV_CUDA_GRAPH_ENABLE";
inline const std::string kDumpEpContextModel = "ORT_DUMP_EP_CONTEXT_MODEL";
inline const std::string kEpContextEmbedMode = "ORT_EP_CONTEXT_EMBED_MODE";
inline const std::string kEpContextComputeCapabilityEnable = "ORT_EP_CONTEXT_COMPUTE_CAPABILITY_ENABLE";
inline const std::string kEngineCachePrefix = "ORT_NV_CACHE_PREFIX";
inline const std::string kOpTypesToExclude = "ORT_NV_OP_TYPES_TO_EXCLUDE";
// Old env variable for backward compatibility
inline const std::string kEngineCachePath = "ORT_NV_ENGINE_CACHE_PATH";
}  // namespace nv_env_vars

class TensorrtLogger : public nvinfer1::ILogger {
nvinfer1::ILogger::Severity verbosity_;

Expand Down Expand Up @@ -171,36 +127,22 @@ struct TensorrtFuncState {
std::vector<std::unordered_map<std::string, size_t>> output_info;
std::unordered_map<std::string, std::unordered_map<size_t, std::vector<std::vector<int64_t>>>> input_shape_ranges;
std::mutex* tensorrt_mu_ptr = nullptr;
bool fp16_enable = false;
bool int8_enable = false;
bool int8_calibration_cache_available = false;
bool dla_enable = false;
int dla_core = 0;
std::string trt_node_name_with_precision;
bool engine_cache_enable = false;
std::string engine_cache_path;
nvinfer1::IRuntime* runtime = nullptr;
std::vector<nvinfer1::IOptimizationProfile*> profiles;
bool context_memory_sharing_enable = false;
size_t* max_context_mem_size_ptr = nullptr;
std::unordered_map<std::string, float> dynamic_range_map;
bool engine_decryption_enable = false;
int (*engine_decryption)(const char*, char*, size_t*) = nullptr;
int (*engine_encryption)(const char*, char*, size_t) = nullptr;
bool timing_cache_enable = true;
std::string timing_cache_path;
bool force_timing_cache = false;
bool detailed_build_log = false;
bool build_heuristics_enable = false;
bool sparsity_enable = false;
int builder_optimization_level = 3;
int auxiliary_streams = -1;
bool filter_tactic_sources = false;
nvinfer1::TacticSources tactic_sources;
bool cuda_graph_enable = 0;
std::string cache_prefix;
std::string cache_suffix;
bool engine_hw_compatible = false;
};

// Minimum information to construct kernel function state for direct engine load code path
Expand Down Expand Up @@ -233,6 +175,8 @@ std::string GetWeightRefittedEnginePath(std::string engine_cache_path);
class NvExecutionProvider : public IExecutionProvider {
public:
explicit NvExecutionProvider(const NvExecutionProviderInfo& info);
// TODO: we might want to transition to this, it allows for an easier option specification:
// explicit NvExecutionProvider(const ProviderOptions& provider_options_map, const ConfigOptions* config_options);
virtual ~NvExecutionProvider();

cublasHandle_t PerThreadDefaultCublasHandle() {
Expand Down Expand Up @@ -293,45 +237,31 @@ class NvExecutionProvider : public IExecutionProvider {
int max_partition_iterations_ = 1000;
size_t min_subgraph_size_ = 1;
size_t max_workspace_size_ = 0;
bool fp16_enable_ = false;
bool int8_enable_ = false;
bool dla_enable_ = false;
int dla_core_ = 0;
bool force_sequential_engine_build_ = false;
std::string int8_calibration_cache_name_;
bool int8_calibration_cache_available_ = false;
bool int8_use_native_tensorrt_calibration_table_ = false;
bool dump_subgraphs_ = false;
bool engine_cache_enable_ = false;
bool weight_stripped_engine_enable_ = false;
bool weight_stripped_engine_refit_ = false;
std::string onnx_model_folder_path_;
const void* onnx_model_bytestream_;
size_t onnx_model_bytestream_size_;
bool build_heuristics_enable_ = false;
bool sparsity_enable_ = false;
int builder_optimization_level_ = 3;
int auxiliary_streams_ = -1;
std::string tactic_sources_;
std::string global_cache_path_, cache_path_, engine_decryption_lib_path_;
std::string cache_path_, engine_decryption_lib_path_;
std::unique_ptr<nvinfer1::IRuntime> runtime_ = nullptr;
std::mutex tensorrt_mu_;
int device_id_;
std::string compute_capability_;
bool context_memory_sharing_enable_ = false;
bool layer_norm_fp32_fallback_ = false;
size_t max_ctx_mem_size_ = 0;
IAllocatorUniquePtr<void> context_memory_ = nullptr;
mutable char model_path_[4096] = {}; // Reserved for max path length
bool engine_decryption_enable_ = false;
int (*engine_decryption_)(const char*, char*, size_t*) = nullptr;
int (*engine_encryption_)(const char*, char*, size_t) = nullptr;
bool timing_cache_enable_ = false;
bool force_timing_cache_match_ = false;
bool detailed_build_log_ = false;
bool cuda_graph_enable_ = false;
std::string cache_prefix_;
bool engine_hw_compatible_ = false;
std::string op_types_to_exclude_;

// The format is as for TENSORRT_VERSION: (MAJOR * 100 + MINOR) * 100 + PATCH
Expand All @@ -349,7 +279,6 @@ class NvExecutionProvider : public IExecutionProvider {
std::string ctx_model_path_;
std::string ep_cache_context_attr_;
std::string engine_cache_relative_path_to_context_model_dir;
std::unique_ptr<ONNX_NAMESPACE::ModelProto> model_proto_ = ONNX_NAMESPACE::ModelProto::Create();

std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"};
mutable std::unordered_map<std::string, std::unique_ptr<SubGraphContext>> subgraph_context_map_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include <cstdint>
#include <cstring>
#include <iostream>
#include <filesystem>
#include "flatbuffers/idl.h"
#include "ort_trt_int8_cal_table.fbs.h"
#include <NvInferVersion.h>
#include "core/providers/cuda/cuda_pch.h"
#include "core/common/path_string.h"
Expand All @@ -19,84 +18,6 @@ namespace fs = std::filesystem;

namespace onnxruntime {

/*
 * Reinterpret the low 32 bits of `input` as an IEEE754 single-precision value.
 *
 * `input` carries the raw bit pattern (as parsed from the hex field of a
 * native TensorRT calibration table); the return value is the float those
 * bits encode.
 *
 * The bits are copied with std::memcpy, the well-defined C++ way to type-pun.
 * Unlike the previous manual sign/exponent/mantissa decode (a 23-iteration
 * pow() loop), this also handles zero, denormals, infinities and NaN
 * correctly -- the manual decode returned ~5.9e-39 for an input of 0.
 */
float ConvertSinglePrecisionIEEE754ToFloat(unsigned long input) {
  static_assert(sizeof(float) == sizeof(uint32_t), "float must be 32 bits");
  // `unsigned long` may be 64-bit; only the low 32 bits hold the encoding.
  const uint32_t bits = static_cast<uint32_t>(input);
  float result;
  std::memcpy(&result, &bits, sizeof(result));
  return result;
}

/*
* Read calibration table for INT8 quantization
* Two kind of calibration tables are supported,
* 1. ORT generated calibration table
* The table is pre-serialized by flatbuffers.
* Each entry in the table is a key-value pair,
* key: tensor name, value: maximum absolute value in floating point
* For example,
* data_0 2.008338
* ...
* 2. Native TensorRT generated calibration table
* Data format is defined by TensorRT as,
* tensor name : scale in 32-bit single precision IEEE754 format
* For example,
* TRT-7103-EntropyCalibration2
* data_0: 4000889d
* ...
*/
// Load per-tensor dynamic ranges for INT8 quantization from a calibration
// table file (the two supported formats are described in the comment above).
// file_name: path to the table file.
// is_trt_calibration_table: true = native TensorRT text table,
//                           false = ORT flatbuffers-serialized table.
// dynamic_range_map: out-param, tensor name -> dynamic range; entries with
//                    the same name are overwritten.
// Returns false only when the file cannot be opened; throws
// std::runtime_error when a native table's header lacks the "TRT-" tag.
bool ReadDynamicRange(const std::string file_name, const bool is_trt_calibration_table, std::unordered_map<std::string, float>& dynamic_range_map) {
  std::ifstream infile(file_name, std::ios::binary | std::ios::in);
  if (!infile) {
    return false;
  }

  if (is_trt_calibration_table) {
    // Native TensorRT generated calibration table
    std::string line;
    char delim = ':';
    // First line is a version header, e.g. "TRT-7103-EntropyCalibration2".
    if (std::getline(infile, line)) {
      std::istringstream first_line(line);
      std::string version;
      std::getline(first_line, version, delim);
      std::size_t found = version.find("TRT-");
      if (found != std::string::npos) {
        // Remaining lines: "<tensor name>: <scale as hex digits>".
        while (std::getline(infile, line)) {
          std::istringstream in_line(line);
          std::string str;
          std::getline(in_line, str, delim);
          std::string tensor_name = str;
          std::getline(in_line, str, delim);
          // The scale field is the raw IEEE754 bit pattern in hex.
          unsigned long scale_int = std::strtoul(str.c_str(), nullptr, 16);
          float scale_float = ConvertSinglePrecisionIEEE754ToFloat(scale_int);
          // dynamic range = scale * 127 (INT8 symmetric quantization).
          float dynamic_range = scale_float * 127.0f;
          dynamic_range_map[tensor_name] = dynamic_range;
        }
      } else {
        throw std::runtime_error("This is not a TensorRT generated calibration table " + file_name);
      }
    }
    // NOTE(review): an empty file falls through here and returns true with no
    // entries added -- confirm that is intended.
  } else {
    // ORT generated calibration table
    // Slurp the whole file into memory, then walk the flatbuffers dict.
    infile.seekg(0, std::ios::end);
    size_t length = infile.tellg();
    infile.seekg(0, std::ios::beg);
    std::unique_ptr<char[]> data{new char[length]};
    infile.read((char*)data.get(), length);
    infile.close();
    auto flat_table = flatbuffers::GetRoot<CalTableFlatBuffers::TrtTable>((const uint8_t*)data.get());
    auto flat_dict = flat_table->dict();
    for (size_t i = 0, end = flat_dict->size(); i < end; ++i) {
      flatbuffers::uoffset_t idx = static_cast<flatbuffers::uoffset_t>(i);
      // Each entry: key = tensor name, value = max-abs value as decimal text.
      dynamic_range_map[flat_dict->Get(idx)->key()->str()] = std::stof(flat_dict->Get(idx)->value()->str());
    }
  }
  return true;
}

/*
* Get number of profile setting.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ struct Nv_Provider : Provider {
}

// Build an NV execution-provider factory from an opaque options pointer.
// `options` is expected to point at a ProviderOptions map (string -> string)
// handed through the shared-provider C boundary as const void*.
// A static_cast is the correct named cast for void* -> T*; reinterpret_cast
// is unnecessarily strong here and obscures intent.
std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory(const void* options) {
  const ProviderOptions* provider_options = static_cast<const ProviderOptions*>(options);
  NvExecutionProviderInfo info = onnxruntime::NvExecutionProviderInfo::FromProviderOptions(*provider_options);
  return std::make_shared<NvProviderFactory>(info);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer,
engine_data_str.assign(engine_data, size);
}
attr_1->set_s(engine_data_str);
// TODO(maximilianm) we might want to disable this warning as we only support weightless engines that are really small
// the reason we had this was that the field will be hashed and storing a large bytestream has significant overhead
LOGS_DEFAULT(WARNING) << EPCONTEXT_WARNING;
} else {
attr_1->set_s(engine_cache_path);
Expand Down
Loading