Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions examples/c/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ FetchContent_MakeAvailable(CLI11)
# Build switches for the examples. Of the switches shown here, only USE_CXX
# defaults to ON; each example target (MODEL_*, WHISPER) must be enabled
# explicitly, e.g. -DMODEL_COMPILE=ON.
option(USE_CXX "Invoke the C++ example" ON)
option(MODEL_CHAT "Build the Model Chat example" OFF)
option(MODEL_QA "Build the Model Q&A example" OFF)
option(MODEL_COMPILE "Build the Model Compile example" OFF)
option(MODEL_MM "Build the Model Multimodal example" OFF)
option(WHISPER "Build the Whisper example" OFF)

Expand Down Expand Up @@ -113,6 +114,13 @@ if(MODEL_QA)
target_link_libraries(model_qa PRIVATE CLI11::CLI11)
endif()

# Model Compile example: only built when MODEL_COMPILE is ON. The executable is
# made from model_compile.cpp plus the shared common.cpp, and links the same
# nlohmann_json and CLI11 targets as the other example executables in this file.
if(MODEL_COMPILE)
add_executable(model_compile ${EXAMPLES_SOURCE_DIR}/model_compile.cpp ${EXAMPLES_SOURCE_DIR}/common.cpp)
prepare_executable(model_compile)
target_link_libraries(model_compile PRIVATE nlohmann_json::nlohmann_json)
target_link_libraries(model_compile PRIVATE CLI11::CLI11)
endif()

if(MODEL_MM)
add_executable(model_mm ${EXAMPLES_SOURCE_DIR}/model_mm.cpp ${EXAMPLES_SOURCE_DIR}/common.cpp)
prepare_executable(model_mm)
Expand Down
9 changes: 4 additions & 5 deletions examples/c/src/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,18 +236,17 @@ void RegisterEP(const std::string& ep, const std::string& ep_path) {
return; // No library path specified, skip registration
}

std::cout << "Registering execution provider: " << ep_path << std::endl;
auto env = Ort::Env();
// Must register on GenAI's OrtEnv (via OgaRegisterExecutionProviderLibrary) so
// GetEpDevices() in ValidateCompiledModel sees the plugin; Ort::Env() is a different env.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to be an ongoing issue as there are some cases where registering on the Ort::Env() is preferred and other cases where registering on GenAI's OrtEnv is preferred. We should find a way to consolidate.

if (ep.compare("cuda") == 0) {
env.RegisterExecutionProviderLibrary("CUDAExecutionProvider", std::filesystem::path(ep_path).c_str());
OgaRegisterExecutionProviderLibrary("CUDAExecutionProvider", ep_path.c_str());
} else if (ep.compare("NvTensorRtRtx") == 0) {
env.RegisterExecutionProviderLibrary("NvTensorRTRTXExecutionProvider", std::filesystem::path(ep_path).c_str());
OgaRegisterExecutionProviderLibrary("NvTensorRTRTXExecutionProvider", ep_path.c_str());
} else {
std::cout << "Warning: EP registration not supported for " << ep << std::endl;
std::cout << "Only 'cuda' and 'NvTensorRtRtx' support plug-in libraries." << std::endl;
return;
}

std::cout << "Registered " << ep << " successfully!" << std::endl;
}

Expand Down
248 changes: 248 additions & 0 deletions examples/c/src/model_compile.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
//
// Model Compile example: runs the same model under different EP and compile configurations
// (CPU, CPU + overlay, NvTensorRtRtx with no compile / minimum compile options / all compile options). Use -v for verbose,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Besides appending some GenAI config options and using the overlay API, it seems to be the same logic as the existing examples.

Instead of creating a new standalone example that needs to be continually maintained, let's integrate the necessary logic into the common files. We should update the model-qa and/or model-chat examples in C/C++, C#, and Python to show this capability. In this way, these changes will be continually tested since the model-qa examples are built and tested in the CIs now.

We want to keep the examples consistent across language bindings and reduce maintenance by sharing and re-using logic as much as possible between examples.

// -d for ORT verbose logging (ORTGENAI_ORT_VERBOSE_LOGGING=1).

#include <chrono>
#include <cstdlib>
#include <csignal>
#include <filesystem>
#include <iomanip>
#include <iostream>
#include <string>

#include "common.h"

namespace fs = std::filesystem;

// Turns on ONNX Runtime verbose logging by setting the environment variable
// ORTGENAI_ORT_VERBOSE_LOGGING to "1" for the current process (overwriting any
// existing value). Call this before the first Oga/ORT API use; exporting
// ORTGENAI_ORT_VERBOSE_LOGGING=1 before launching has the same effect.
static void SetOrtVerboseLogging() {
#if !defined(_WIN32)
  // POSIX: the final argument (1) requests overwrite of an existing value.
  setenv("ORTGENAI_ORT_VERBOSE_LOGGING", "1", 1);
#else
  // Windows CRT: takes a single "NAME=value" string.
  _putenv("ORTGENAI_ORT_VERBOSE_LOGGING=1");
#endif
}

// Execution-provider names used by this example. They are passed to GetConfig,
// and kNvTensorRtRtxEp is additionally compared against the parsed --ep value
// and passed to RegisterEP. Declared constexpr so the pointers themselves are
// immutable (a plain `static const char*` can be reseated at runtime).
static constexpr const char* kCpuEp = "cpu";
static constexpr const char* kNvTensorRtRtxEp = "NvTensorRtRtx";

// Fixed prompt encoded for every timed generation in RunOneGeneration.
static constexpr const char* kDefaultPrompt = "Tell me about AI and ML";

// Generates a completion for kDefaultPrompt with the given model/tokenizer,
// streaming each decoded token to stdout as it is produced.
//
// Search options are fixed: max_length = 128, batch_size = 1.
//
// @param model      Loaded model used to create generator params and generator.
// @param tokenizer  Tokenizer used to encode the prompt and to create the
//                   streaming decoder.
// @param verbose    When true, echoes the prompt before the output.
// @return Wall-clock seconds spent in the generate/decode/print loop (prompt
//         encoding and generator creation are outside the timed region).
//         NOTE(review): `Clock` is not declared in this file; presumably it
//         comes from common.h -- confirm.
static double RunOneGeneration(OgaModel& model, OgaTokenizer& tokenizer, bool verbose) {
  auto token_stream = OgaTokenizerStream::Create(tokenizer);
  auto prompt_tokens = OgaSequences::Create();
  tokenizer.Encode(kDefaultPrompt, *prompt_tokens);

  auto search_params = OgaGeneratorParams::Create(model);
  search_params->SetSearchOption("max_length", 128);
  search_params->SetSearchOption("batch_size", 1);

  auto gen = OgaGenerator::Create(model, *search_params);
  gen->AppendTokenSequences(*prompt_tokens);

  if (verbose) {
    std::cout << "Prompt: " << kDefaultPrompt << std::endl;
  }
  std::cout << "Output: " << std::flush;

  const auto started_at = Clock::now();
  for (; !gen->IsDone();) {
    gen->GenerateNextToken();
    // batch_size is 1, so only the first entry of the next-token list is decoded.
    const auto newest_token = gen->GetNextTokens()[0];
    std::cout << token_stream->Decode(newest_token) << std::flush;
  }
  std::cout << std::endl;

  const std::chrono::duration<double> elapsed_sec = Clock::now() - started_at;
  return elapsed_sec.count();
}

// Prints one timing summary line to stdout in the form
//   " <label>: model load <X.XXX>s, inference <Y.YYY>s"
// with both durations in fixed notation at three decimal places.
//
// The stream's format flags and precision are saved first and restored
// afterwards. std::fixed is a sticky flag, so restoring only the precision
// would leave std::cout in fixed notation for every later floating-point
// write in the process.
//
// @param label               Scenario name shown at the start of the line.
// @param load_time_sec       Model load time, in seconds.
// @param inference_time_sec  Generation time, in seconds.
static void PrintTimings(const char* label, double load_time_sec, double inference_time_sec) {
  const std::ios_base::fmtflags saved_flags = std::cout.flags();
  const std::streamsize saved_precision = std::cout.precision();

  std::cout << " " << label << ": "
            << std::fixed << std::setprecision(3)
            << "model load " << load_time_sec << "s, "
            << "inference " << inference_time_sec << "s"
            << std::endl;

  std::cout.flags(saved_flags);
  std::cout.precision(saved_precision);
}

// 1) Run model with CPU execution provider only (no compile overlay).
void RunWithCpu(const std::string& model_path, const std::string& ep_path, bool verbose) {
(void)ep_path;
if (verbose) std::cout << "[RunWithCpu] Creating config (CPU, no compile overlay)..." << std::endl;
std::unordered_map<std::string, std::string> ep_options;
GeneratorParamsArgs search_options;
auto config = GetConfig(model_path, kCpuEp, ep_options, search_options);
if (verbose) std::cout << "[RunWithCpu] Creating model..." << std::endl;
auto load_t0 = Clock::now();
auto model = OgaModel::Create(*config);
double load_time = std::chrono::duration<double>(Clock::now() - load_t0).count();
if (verbose) std::cout << "[RunWithCpu] Creating tokenizer..." << std::endl;
auto tokenizer = OgaTokenizer::Create(*model);
double inference_time = RunOneGeneration(*model, *tokenizer, verbose);
PrintTimings("RunWithCpu (CPU, no overlay)", load_time, inference_time);
}

// 2) Run model with CPU execution provider and compile config passed via config_overlay.
void RunWithCpuAndCompileOverlay(const std::string& model_path, const std::string& ep_path, bool verbose) {
(void)ep_path;
if (verbose) std::cout << "[RunWithCpuAndCompileOverlay] Creating config (CPU + compile overlay)..." << std::endl;
std::unordered_map<std::string, std::string> ep_options;
GeneratorParamsArgs search_options;
auto config = GetConfig(model_path, kCpuEp, ep_options, search_options);
config->Overlay(R"({
"model": {
"decoder": {
"compile_options": {
"enable_ep_context": true,
"ep_context_embed_mode": false,
"force_compile_if_needed": true,
"graph_optimization_level": 99
}
}
}
})");
if (verbose) std::cout << "[RunWithCpuAndCompileOverlay] Creating model..." << std::endl;
auto load_t0 = Clock::now();
auto model = OgaModel::Create(*config);
double load_time = std::chrono::duration<double>(Clock::now() - load_t0).count();
if (verbose) std::cout << "[RunWithCpuAndCompileOverlay] Creating tokenizer..." << std::endl;
auto tokenizer = OgaTokenizer::Create(*model);
double inference_time = RunOneGeneration(*model, *tokenizer, verbose);
PrintTimings("RunWithCpuAndCompileOverlay (CPU + overlay)", load_time, inference_time);
}

// 3) Run model with NvTensorRtRtx EP without compile options.
void RunWithNvTensorRtRtxNoCompile(const std::string& model_path, const std::string& ep_path, bool verbose) {
if (ep_path.empty() && verbose) {
std::cout << "Warning: --ep_path not set; NvTensorRTRTX may not be available (only CPU)." << std::endl;
}
if (verbose) std::cout << "[RunWithNvTensorRtRtxNoCompile] Creating config (NvTensorRtRtx, no compile)..." << std::endl;
std::unordered_map<std::string, std::string> ep_options;
GeneratorParamsArgs search_options;
auto config = GetConfig(model_path, kNvTensorRtRtxEp, ep_options, search_options);
if (verbose) std::cout << "[RunWithNvTensorRtRtxNoCompile] Creating model..." << std::endl;
auto load_t0 = Clock::now();
auto model = OgaModel::Create(*config);
double load_time = std::chrono::duration<double>(Clock::now() - load_t0).count();
if (verbose) std::cout << "[RunWithNvTensorRtRtxNoCompile] Creating tokenizer..." << std::endl;
auto tokenizer = OgaTokenizer::Create(*model);
double inference_time = RunOneGeneration(*model, *tokenizer, verbose);
PrintTimings("RunWithNvTensorRtRtxNoCompile (NvTensorRtRtx, no compile)", load_time, inference_time);
}

// 4) Run model with NvTensorRtRtx EP and minimum compile options.
void RunWithNvTensorRtRtxMinimumCompileOptions(const std::string& model_path, const std::string& ep_path, bool verbose) {
if (ep_path.empty() && verbose) {
std::cout << "Warning: --ep_path not set; NvTensorRTRTX may not be available (only CPU)." << std::endl;
}
if (verbose) std::cout << "[RunWithNvTensorRtRtxMinimumCompileOptions] Creating config (NvTensorRtRtx + minimum compile options)..." << std::endl;
std::unordered_map<std::string, std::string> ep_options;
GeneratorParamsArgs search_options;
auto config = GetConfig(model_path, kNvTensorRtRtxEp, ep_options, search_options);
// ep_context_embed_mode must be false for larger models(>2GB) or compilation will error
config->Overlay(R"({
"model": {
"decoder": {
"compile_options": {
"enable_ep_context": true,
"ep_context_embed_mode": false
}
}
}
})");
if (verbose) std::cout << "[RunWithNvTensorRtRtxMinimumCompileOptions] Creating model..." << std::endl;
auto load_t0 = Clock::now();
auto model = OgaModel::Create(*config);
double load_time = std::chrono::duration<double>(Clock::now() - load_t0).count();
if (verbose) std::cout << "[RunWithNvTensorRtRtxMinimumCompileOptions] Creating tokenizer..." << std::endl;
auto tokenizer = OgaTokenizer::Create(*model);
double inference_time = RunOneGeneration(*model, *tokenizer, verbose);
PrintTimings("RunWithNvTensorRtRtxMinimumCompileOptions (minimum options)", load_time, inference_time);
}

// 5) Run model with NvTensorRtRtx EP and all compile options.
void RunWithNvTensorRtRtxCompileAllOptions(const std::string& model_path, const std::string& ep_path, bool verbose) {
if (ep_path.empty() && verbose) {
std::cout << "Warning: --ep_path not set; NvTensorRTRTX may not be available (only CPU)." << std::endl;
}
if (verbose) std::cout << "[RunWithNvTensorRtRtxCompileAllOptions] Creating config (NvTensorRtRtx + all compile options)..." << std::endl;
std::unordered_map<std::string, std::string> ep_options;
GeneratorParamsArgs search_options;
auto config = GetConfig(model_path, kNvTensorRtRtxEp, ep_options, search_options);
// Single config: ep_context_file_path is full path (relative to model dir) including filename, e.g. "contexts/model_ctx.onnx"
config->Overlay(R"({
"model": {
"decoder": {
"compile_options": {
"enable_ep_context": true,
"graph_optimization_level": 99,
"ep_context_file_path": "contexts/ep_context_output/model_ctx.onnx",
"ep_context_embed_mode": false,
"force_compile_if_needed": true
}
}
}
})");
if (verbose) std::cout << "[RunWithNvTensorRtRtxCompileAllOptions] Creating model..." << std::endl;
auto load_t0 = Clock::now();
auto model = OgaModel::Create(*config);
double load_time = std::chrono::duration<double>(Clock::now() - load_t0).count();
if (verbose) std::cout << "[RunWithNvTensorRtRtxCompileAllOptions] Creating tokenizer..." << std::endl;
auto tokenizer = OgaTokenizer::Create(*model);
double inference_time = RunOneGeneration(*model, *tokenizer, verbose);
PrintTimings("RunWithNvTensorRtRtxCompileAllOptions (all options)", load_time, inference_time);
}

// Entry point of the Model Compile example.
//
// Parses the shared example command line, optionally enables verbose ORT
// logging (debug flag), registers the NvTensorRtRtx execution-provider library
// when a path is available, and then runs the NvTensorRtRtx scenarios in
// sequence, printing load/inference timings for each.
//
// Returns 0 on success, -1 if argument parsing fails or a scenario throws.
int main(int argc, char** argv) {
  // Outputs of ParseArgs. Several of them are not used by this example but are
  // required by the shared parser's signature.
  GeneratorParamsArgs generator_params_args;
  GuidanceArgs guidance_args;
  std::string model_path;
  std::string ep = "follow_config";
  std::string ep_path;
  std::string system_prompt;
  std::string user_prompt;
  bool verbose = false;
  bool debug = false;
  bool interactive = false;
  bool rewind = true;
  std::vector<std::string> image_paths;
  std::vector<std::string> audio_paths;

  if (!ParseArgs(argc, argv, generator_params_args, guidance_args, model_path, ep, ep_path, system_prompt, user_prompt, verbose, debug, interactive, rewind, image_paths, audio_paths)) {
    return -1;
  }

  // When NvTensorRtRtx was requested without an explicit library path, fall
  // back to the provider library in the current working directory.
  if (ep_path.empty() && ep == kNvTensorRtRtxEp) {
#ifdef _WIN32
    const char* const default_library = "onnxruntime_providers_nv_tensorrt_rtx.dll";
#else
    const char* const default_library = "libonnxruntime_providers_nv_tensorrt_rtx.so";
#endif
    ep_path = (fs::current_path() / default_library).string();
  }

  // The logging environment variable is set here, ahead of the registration
  // and model calls below.
  if (debug) {
    SetOrtVerboseLogging();
    SetLogger();
  }

  if (!ep_path.empty()) {
    RegisterEP(kNvTensorRtRtxEp, ep_path);
  }

  OgaHandle handle;

  if (verbose) {
    const std::string shown_ep_path = ep_path.empty() ? std::string("(none)") : ep_path;
    std::cout << "Model path: " << model_path << std::endl;
    std::cout << "EP path: " << shown_ep_path << std::endl;
  }
  std::cout << "Timings (model load, inference):" << std::endl;

  try {
    // The CPU scenarios are currently disabled:
    // RunWithCpu(model_path, ep_path, verbose);
    // RunWithCpuAndCompileOverlay(model_path, ep_path, verbose);

    // Baseline: no compile options.
    RunWithNvTensorRtRtxNoCompile(model_path, ep_path, verbose);
    // First compiled run: load time is expected to be the no-compile load time plus the compile time.
    RunWithNvTensorRtRtxMinimumCompileOptions(model_path, ep_path, verbose);
    // Second compiled run: load time should be much lower because the model is already compiled.
    RunWithNvTensorRtRtxMinimumCompileOptions(model_path, ep_path, verbose);
    // All compile options: different ep_context_file_path, ep_context_embed_mode,
    // force_compile_if_needed and graph_optimization_level.
    RunWithNvTensorRtRtxCompileAllOptions(model_path, ep_path, verbose);
  } catch (const std::exception& e) {
    std::cerr << "Error: " << e.what() << std::endl;
    return -1;
  }

  return 0;
}
Loading
Loading