microsoft · lnigam · Dec 2, 2025 · Dec 2, 2025 · Feb 26, 2026 · Feb 26, 2026
@@ -24,6 +24,7 @@ FetchContent_MakeAvailable(CLI11)
 option(USE_CXX "Invoke the C++ example" ON)
 option(MODEL_CHAT "Build the Model Chat example" OFF)
 option(MODEL_QA "Build the Model Q&A example" OFF)
+option(MODEL_COMPILE "Build the Model Compile example" OFF)
 option(MODEL_MM "Build the Model Multimodal example" OFF)
 option(WHISPER "Build the Whisper example" OFF)
 
@@ -113,6 +114,13 @@ if(MODEL_QA)
   target_link_libraries(model_qa PRIVATE CLI11::CLI11)
 endif()
 
+if(MODEL_COMPILE)  # opt-in example target; the MODEL_COMPILE option defaults to OFF
+  add_executable(model_compile ${EXAMPLES_SOURCE_DIR}/model_compile.cpp ${EXAMPLES_SOURCE_DIR}/common.cpp)
+  prepare_executable(model_compile)  # helper defined outside this hunk; presumably the shared per-example setup (verify)
+  target_link_libraries(model_compile PRIVATE nlohmann_json::nlohmann_json)
+  target_link_libraries(model_compile PRIVATE CLI11::CLI11)
+endif()
+
 if(MODEL_MM)
   add_executable(model_mm ${EXAMPLES_SOURCE_DIR}/model_mm.cpp ${EXAMPLES_SOURCE_DIR}/common.cpp)
   prepare_executable(model_mm)

@@ -236,18 +236,17 @@ void RegisterEP(const std::string& ep, const std::string& ep_path) {
     return;  // No library path specified, skip registration
   }
 
-  std::cout << "Registering execution provider: " << ep_path << std::endl;
-  auto env = Ort::Env();
+  // Must register on GenAI's OrtEnv (via OgaRegisterExecutionProviderLibrary) so
+  // GetEpDevices() in ValidateCompiledModel sees the plugin; Ort::Env() is a different env.
   if (ep.compare("cuda") == 0) {
-    env.RegisterExecutionProviderLibrary("CUDAExecutionProvider", std::filesystem::path(ep_path).c_str());
+    OgaRegisterExecutionProviderLibrary("CUDAExecutionProvider", ep_path.c_str());
   } else if (ep.compare("NvTensorRtRtx") == 0) {
-    env.RegisterExecutionProviderLibrary("NvTensorRTRTXExecutionProvider", std::filesystem::path(ep_path).c_str());
+    OgaRegisterExecutionProviderLibrary("NvTensorRTRTXExecutionProvider", ep_path.c_str());
   } else {
     std::cout << "Warning: EP registration not supported for " << ep << std::endl;
     std::cout << "Only 'cuda' and 'NvTensorRtRtx' support plug-in libraries." << std::endl;
     return;
   }
-
   std::cout << "Registered " << ep << " successfully!" << std::endl;
 }
 

@@ -0,0 +1,248 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+//
+// Model Compile example: runs the same model under different EP and compile configurations
+// (CPU, CPU+overlay, NvTensorRtRtx no-compile / minimum options / all options). Use -v for verbose,
+// -d for ORT verbose logging (ORTGENAI_ORT_VERBOSE_LOGGING=1).
+
+#include <chrono>
+#include <cstdlib>
+#include <csignal>
+#include <filesystem>
+#include <iomanip>
+#include <iostream>
+#include <string>
+
+#include "common.h"
+
+namespace fs = std::filesystem;
+
+// Turns on ONNX Runtime verbose logging by exporting ORTGENAI_ORT_VERBOSE_LOGGING=1
+// into this process's environment. Call it before the first Oga/ORT API call;
+// launching the program with the variable already set is an alternative.
+static void SetOrtVerboseLogging() {
+#if !defined(_WIN32)
+  // POSIX: the final argument (1) overwrites a value that is already present.
+  setenv("ORTGENAI_ORT_VERBOSE_LOGGING", "1", /*overwrite=*/1);
+#else
+  // Windows CRT: takes a single "NAME=VALUE" string.
+  _putenv("ORTGENAI_ORT_VERBOSE_LOGGING=1");
+#endif
+}
+
+// Execution-provider names used by this example when building configs and
+// registering the plug-in library. constexpr makes the pointers themselves
+// immutable, so these file-local constants cannot be reseated by accident.
+static constexpr const char* kCpuEp = "cpu";
+static constexpr const char* kNvTensorRtRtxEp = "NvTensorRtRtx";
+
+// Prompt encoded and fed to the model on every timed run.
+static constexpr const char* kDefaultPrompt = "Tell me about AI and ML";
+
+// Generates one completion for kDefaultPrompt, echoing each decoded token to
+// stdout as soon as it is produced. When `verbose` is set the prompt is printed first.
+// Returns the wall-clock seconds spent in the token loop (console output included).
+static double RunOneGeneration(OgaModel& model, OgaTokenizer& tokenizer, bool verbose) {
+  auto token_stream = OgaTokenizerStream::Create(tokenizer);
+  auto prompt_tokens = OgaSequences::Create();
+  tokenizer.Encode(kDefaultPrompt, *prompt_tokens);
+
+  auto generator_params = OgaGeneratorParams::Create(model);
+  generator_params->SetSearchOption("max_length", 128);
+  generator_params->SetSearchOption("batch_size", 1);
+
+  auto generator = OgaGenerator::Create(model, *generator_params);
+  generator->AppendTokenSequences(*prompt_tokens);
+
+  if (verbose) {
+    std::cout << "Prompt: " << kDefaultPrompt << std::endl;
+  }
+  std::cout << "Output: " << std::flush;
+
+  const auto started_at = Clock::now();
+  for (;;) {
+    if (generator->IsDone()) break;
+    generator->GenerateNextToken();
+    // batch_size is 1, so index 0 is the only sequence being generated.
+    const auto newest_token = generator->GetNextTokens()[0];
+    std::cout << token_stream->Decode(newest_token) << std::flush;
+  }
+  std::cout << std::endl;
+
+  const std::chrono::duration<double> elapsed = Clock::now() - started_at;
+  return elapsed.count();
+}
+
+// Prints one indented summary line of the form
+//   "  <label>: model load <x>s, inference <y>s"
+// with both durations shown in fixed notation at three decimals.
+//
+// label              - text identifying the run being reported.
+// load_time_sec      - seconds spent creating the model.
+// inference_time_sec - seconds spent generating tokens.
+//
+// std::fixed and std::setprecision are sticky on the stream, so both the format
+// flags and the precision of std::cout are saved up front and restored before
+// returning; later floating-point output keeps whatever format it had before.
+static void PrintTimings(const char* label, double load_time_sec, double inference_time_sec) {
+  const std::ios_base::fmtflags saved_flags = std::cout.flags();
+  const std::streamsize saved_precision = std::cout.precision();
+
+  std::cout << "  " << label << ": "
+            << std::fixed << std::setprecision(3)
+            << "model load " << load_time_sec << "s, "
+            << "inference " << inference_time_sec << "s"
+            << std::endl;
+
+  std::cout.flags(saved_flags);
+  std::cout.precision(saved_precision);
+}
+
+// 1) CPU baseline: loads the model with the CPU execution provider and no
+// compile overlay, runs one generation, then prints load/inference timings.
+// `ep_path` is not used by this variant.
+void RunWithCpu(const std::string& model_path, const std::string& ep_path, bool verbose) {
+  static_cast<void>(ep_path);
+  // Emits a progress line only when verbose output was requested.
+  const auto trace = [verbose](const char* message) {
+    if (verbose) std::cout << message << std::endl;
+  };
+
+  trace("[RunWithCpu] Creating config (CPU, no compile overlay)...");
+  std::unordered_map<std::string, std::string> no_ep_options;
+  GeneratorParamsArgs default_search_options;
+  auto config = GetConfig(model_path, kCpuEp, no_ep_options, default_search_options);
+
+  trace("[RunWithCpu] Creating model...");
+  const auto load_started = Clock::now();
+  auto model = OgaModel::Create(*config);
+  const double load_seconds = std::chrono::duration<double>(Clock::now() - load_started).count();
+
+  trace("[RunWithCpu] Creating tokenizer...");
+  auto tokenizer = OgaTokenizer::Create(*model);
+  const double inference_seconds = RunOneGeneration(*model, *tokenizer, verbose);
+  PrintTimings("RunWithCpu (CPU, no overlay)", load_seconds, inference_seconds);
+}
+
+// 2) CPU with compile options: same flow as the CPU baseline, except a
+// "compile_options" JSON overlay is applied to the decoder config before the
+// model is created. `ep_path` is not used by this variant.
+void RunWithCpuAndCompileOverlay(const std::string& model_path, const std::string& ep_path, bool verbose) {
+  static_cast<void>(ep_path);
+  // Emits a progress line only when verbose output was requested.
+  const auto trace = [verbose](const char* message) {
+    if (verbose) std::cout << message << std::endl;
+  };
+
+  // Compile settings merged into the decoder section of the model config.
+  const char* const compile_overlay = R"({
+    "model": {
+      "decoder": {
+        "compile_options": {
+          "enable_ep_context": true,
+          "ep_context_embed_mode": false,
+          "force_compile_if_needed": true,
+          "graph_optimization_level": 99
+        }
+      }
+    }
+  })";
+
+  trace("[RunWithCpuAndCompileOverlay] Creating config (CPU + compile overlay)...");
+  std::unordered_map<std::string, std::string> no_ep_options;
+  GeneratorParamsArgs default_search_options;
+  auto config = GetConfig(model_path, kCpuEp, no_ep_options, default_search_options);
+  config->Overlay(compile_overlay);
+
+  trace("[RunWithCpuAndCompileOverlay] Creating model...");
+  const auto load_started = Clock::now();
+  auto model = OgaModel::Create(*config);
+  const double load_seconds = std::chrono::duration<double>(Clock::now() - load_started).count();
+
+  trace("[RunWithCpuAndCompileOverlay] Creating tokenizer...");
+  auto tokenizer = OgaTokenizer::Create(*model);
+  const double inference_seconds = RunOneGeneration(*model, *tokenizer, verbose);
+  PrintTimings("RunWithCpuAndCompileOverlay (CPU + overlay)", load_seconds, inference_seconds);
+}
+
+// 3) NvTensorRtRtx without compilation: loads the model with the NvTensorRtRtx
+// execution provider and no compile overlay, runs one generation, then prints
+// load/inference timings. In verbose mode an empty `ep_path` triggers a warning.
+void RunWithNvTensorRtRtxNoCompile(const std::string& model_path, const std::string& ep_path, bool verbose) {
+  // Emits a progress line only when verbose output was requested.
+  const auto trace = [verbose](const char* message) {
+    if (verbose) std::cout << message << std::endl;
+  };
+
+  if (ep_path.empty()) {
+    trace("Warning: --ep_path not set; NvTensorRTRTX may not be available (only CPU).");
+  }
+
+  trace("[RunWithNvTensorRtRtxNoCompile] Creating config (NvTensorRtRtx, no compile)...");
+  std::unordered_map<std::string, std::string> no_ep_options;
+  GeneratorParamsArgs default_search_options;
+  auto config = GetConfig(model_path, kNvTensorRtRtxEp, no_ep_options, default_search_options);
+
+  trace("[RunWithNvTensorRtRtxNoCompile] Creating model...");
+  const auto load_started = Clock::now();
+  auto model = OgaModel::Create(*config);
+  const double load_seconds = std::chrono::duration<double>(Clock::now() - load_started).count();
+
+  trace("[RunWithNvTensorRtRtxNoCompile] Creating tokenizer...");
+  auto tokenizer = OgaTokenizer::Create(*model);
+  const double inference_seconds = RunOneGeneration(*model, *tokenizer, verbose);
+  PrintTimings("RunWithNvTensorRtRtxNoCompile (NvTensorRtRtx, no compile)", load_seconds, inference_seconds);
+}
+
+// 4) NvTensorRtRtx with the minimum compile options: applies an overlay that
+// only sets enable_ep_context and ep_context_embed_mode, then loads the model,
+// runs one generation and prints load/inference timings.
+// In verbose mode an empty `ep_path` triggers a warning.
+void RunWithNvTensorRtRtxMinimumCompileOptions(const std::string& model_path, const std::string& ep_path, bool verbose) {
+  // Emits a progress line only when verbose output was requested.
+  const auto trace = [verbose](const char* message) {
+    if (verbose) std::cout << message << std::endl;
+  };
+
+  // ep_context_embed_mode has to stay false for larger models (>2GB);
+  // otherwise compilation fails with an error.
+  const char* const compile_overlay = R"({
+    "model": {
+      "decoder": {
+        "compile_options": {
+          "enable_ep_context": true,
+          "ep_context_embed_mode": false
+        }
+      }
+    }
+  })";
+
+  if (ep_path.empty()) {
+    trace("Warning: --ep_path not set; NvTensorRTRTX may not be available (only CPU).");
+  }
+
+  trace("[RunWithNvTensorRtRtxMinimumCompileOptions] Creating config (NvTensorRtRtx + minimum compile options)...");
+  std::unordered_map<std::string, std::string> no_ep_options;
+  GeneratorParamsArgs default_search_options;
+  auto config = GetConfig(model_path, kNvTensorRtRtxEp, no_ep_options, default_search_options);
+  config->Overlay(compile_overlay);
+
+  trace("[RunWithNvTensorRtRtxMinimumCompileOptions] Creating model...");
+  const auto load_started = Clock::now();
+  auto model = OgaModel::Create(*config);
+  const double load_seconds = std::chrono::duration<double>(Clock::now() - load_started).count();
+
+  trace("[RunWithNvTensorRtRtxMinimumCompileOptions] Creating tokenizer...");
+  auto tokenizer = OgaTokenizer::Create(*model);
+  const double inference_seconds = RunOneGeneration(*model, *tokenizer, verbose);
+  PrintTimings("RunWithNvTensorRtRtxMinimumCompileOptions (minimum options)", load_seconds, inference_seconds);
+}
+
+// 5) NvTensorRtRtx with every compile option set: applies an overlay that sets
+// enable_ep_context, graph_optimization_level, ep_context_file_path,
+// ep_context_embed_mode and force_compile_if_needed, then loads the model,
+// runs one generation and prints load/inference timings.
+// In verbose mode an empty `ep_path` triggers a warning.
+void RunWithNvTensorRtRtxCompileAllOptions(const std::string& model_path, const std::string& ep_path, bool verbose) {
+  // Emits a progress line only when verbose output was requested.
+  const auto trace = [verbose](const char* message) {
+    if (verbose) std::cout << message << std::endl;
+  };
+
+  // Single config: ep_context_file_path is the full path (relative to the model
+  // directory) including the file name, e.g. "contexts/model_ctx.onnx".
+  const char* const compile_overlay = R"({
+    "model": {
+      "decoder": {
+        "compile_options": {
+          "enable_ep_context": true,
+          "graph_optimization_level": 99,
+          "ep_context_file_path": "contexts/ep_context_output/model_ctx.onnx",
+          "ep_context_embed_mode": false,
+          "force_compile_if_needed": true
+        }
+      }
+    }
+  })";
+
+  if (ep_path.empty()) {
+    trace("Warning: --ep_path not set; NvTensorRTRTX may not be available (only CPU).");
+  }
+
+  trace("[RunWithNvTensorRtRtxCompileAllOptions] Creating config (NvTensorRtRtx + all compile options)...");
+  std::unordered_map<std::string, std::string> no_ep_options;
+  GeneratorParamsArgs default_search_options;
+  auto config = GetConfig(model_path, kNvTensorRtRtxEp, no_ep_options, default_search_options);
+  config->Overlay(compile_overlay);
+
+  trace("[RunWithNvTensorRtRtxCompileAllOptions] Creating model...");
+  const auto load_started = Clock::now();
+  auto model = OgaModel::Create(*config);
+  const double load_seconds = std::chrono::duration<double>(Clock::now() - load_started).count();
+
+  trace("[RunWithNvTensorRtRtxCompileAllOptions] Creating tokenizer...");
+  auto tokenizer = OgaTokenizer::Create(*model);
+  const double inference_seconds = RunOneGeneration(*model, *tokenizer, verbose);
+  PrintTimings("RunWithNvTensorRtRtxCompileAllOptions (all options)", load_seconds, inference_seconds);
+}
+
+// Entry point of the Model Compile example.
+//
+// Parses the shared example command line, optionally enables verbose logging,
+// registers the NvTensorRtRtx plug-in library when a path is available, and then
+// times the model under several compile configurations.
+//
+// Returns 0 on success, or -1 when argument parsing fails or any step reports
+// an exception.
+int main(int argc, char** argv) {
+  GeneratorParamsArgs generator_params_args;
+  GuidanceArgs guidance_args;
+  std::string model_path, ep = "follow_config", ep_path, system_prompt, user_prompt;
+  bool verbose = false, debug = false, interactive = false, rewind = true;
+  std::vector<std::string> image_paths, audio_paths;
+
+  if (!ParseArgs(argc, argv, generator_params_args, guidance_args, model_path, ep, ep_path, system_prompt, user_prompt, verbose, debug, interactive, rewind, image_paths, audio_paths)) {
+    return -1;
+  }
+
+  // NvTensorRtRtx requested without an explicit library path: fall back to the
+  // provider library expected in the current working directory.
+  if (ep.compare(kNvTensorRtRtxEp) == 0 && ep_path.empty()) {
+#if defined(_WIN32)
+    ep_path = (fs::current_path() / "onnxruntime_providers_nv_tensorrt_rtx.dll").string();
+#else
+    ep_path = (fs::current_path() / "libonnxruntime_providers_nv_tensorrt_rtx.so").string();
+#endif
+  }
+
+  // The environment variable has to be in place before any Oga/ORT API is used.
+  if (debug) {
+    SetOrtVerboseLogging();
+  }
+
+  // Declared ahead of every Oga call and outside the try block so it outlives
+  // all Oga objects created below.
+  // NOTE(review): presumably its destructor shuts the library down - confirm.
+  OgaHandle handle;
+
+  try {
+    // Logger setup and plug-in registration live inside the try block so that a
+    // failure here (for example a missing provider library) is reported through
+    // the handler below instead of terminating the process.
+    if (debug) {
+      SetLogger();
+    }
+
+    if (!ep_path.empty()) {
+      RegisterEP(kNvTensorRtRtxEp, ep_path);
+    }
+
+    if (verbose) {
+      std::cout << "Model path: " << model_path << std::endl;
+      std::cout << "EP path: " << (ep_path.empty() ? "(none)" : ep_path) << std::endl;
+    }
+    std::cout << "Timings (model load, inference):" << std::endl;
+
+    // The CPU variants are disabled by default; uncomment to include them.
+    // RunWithCpu(model_path, ep_path, verbose);
+    // RunWithCpuAndCompileOverlay(model_path, ep_path, verbose);
+
+    // Run the no-compile case first.
+    RunWithNvTensorRtRtxNoCompile(model_path, ep_path, verbose);
+    // First run with compilation: model load time is the no-compile load time plus the compile time.
+    RunWithNvTensorRtRtxMinimumCompileOptions(model_path, ep_path, verbose);
+    // Second run with the same options: model load time should be much shorter because the model is already compiled.
+    RunWithNvTensorRtRtxMinimumCompileOptions(model_path, ep_path, verbose);
+    // Finally run with all compile options: a different ep_context_file_path, plus
+    // ep_context_embed_mode, force_compile_if_needed and graph_optimization_level.
+    RunWithNvTensorRtRtxCompileAllOptions(model_path, ep_path, verbose);
+  } catch (const std::exception& e) {
+    std::cerr << "Error: " << e.what() << std::endl;
+    return -1;
+  }
+
+  return 0;
+}