283 changes: 283 additions & 0 deletions .github/workflows/windows_tensorrt_rtx.yml
@@ -0,0 +1,283 @@
name: Windows GPU TensorRT RTX CI Pipeline

on:
push:
branches:
- main
- rel-*
pull_request:
branches:
- main
- rel-*
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.ref || github.sha }}
cancel-in-progress: true

# TODO: enable --build_nodejs
jobs:
build:
name: Windows GPU TensorRT RTX CI Pipeline
runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"]
steps:
- uses: actions/checkout@v5
with:
fetch-depth: 0
submodules: 'none'

- uses: actions/setup-python@v6
with:
python-version: '3.12'
architecture: x64

- name: Locate vcvarsall and Setup Env
uses: ./.github/actions/locate-vcvarsall-and-setup-env
with:
architecture: x64

- name: Install python modules
run: python -m pip install -r .\tools\ci_build\github\windows\python\requirements.txt
working-directory: ${{ github.workspace }}
shell: cmd

- name: Download CUDA SDK v12.8
working-directory: ${{ runner.temp }}
run: |
azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" .
dir
shell: pwsh

- name: Download TensorRT-RTX-1.1.1.26.Windows.win10.cuda-12.9
run: 'azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-RTX-1.1.1.26.Windows.win10.cuda-12.9" ${{ runner.temp }}'
shell: pwsh

- name: Add CUDA to PATH
shell: powershell
run: |
Write-Host "Adding CUDA to PATH"
Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin"
Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin"
Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64"
Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-RTX-1.1.1.26.Windows.win10.cuda-12.9\lib"

- uses: actions/setup-node@v5
with:
node-version: '20.x'

- uses: actions/setup-java@v5
with:
distribution: 'temurin'
java-version: '17'
architecture: x64

- uses: actions/cache@v4
id: onnx-node-tests-cache
with:
path: ${{ github.workspace }}/js/test/
key: onnxnodetests-${{ hashFiles('js/scripts/prepare-onnx-node-tests.ts') }}

- name: API Documentation Check and Generate
run: |
set ORT_DOXY_SRC=${{ github.workspace }}
set ORT_DOXY_OUT=${{ runner.temp }}\build\RelWithDebInfo\RelWithDebInfo
mkdir %ORT_DOXY_SRC%
mkdir %ORT_DOXY_OUT%
"C:\Program Files\doxygen\bin\doxygen.exe" ${{ github.workspace }}\tools\ci_build\github\Doxyfile_csharp.cfg
working-directory: ${{ github.workspace }}
shell: cmd

- uses: actions/setup-dotnet@v5
env:
PROCESSOR_ARCHITECTURE: x64
with:
dotnet-version: '8.x'

- name: Use NuGet 6.x
uses: nuget/setup-nuget@v2
with:
nuget-version: '6.x'

- name: NuGet restore
run: nuget restore ${{ github.workspace }}\packages.config -ConfigFile ${{ github.workspace }}\NuGet.config -PackagesDirectory ${{ runner.temp }}\build\RelWithDebInfo
shell: cmd

- name: Set OnnxRuntimeBuildDirectory
shell: pwsh
run: |
$buildDir = Join-Path ${{ runner.temp }} "build"
echo "OnnxRuntimeBuildDirectory=$buildDir" >> $env:GITHUB_ENV

- name: Build and Clean Binaries
working-directory: ${{ runner.temp }}
run: |
npm install -g typescript
if ($lastExitCode -ne 0) {
exit $lastExitCode
}
# Execute the build process
python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_cuda --use_nv_tensorrt_rtx --tensorrt_rtx_home="${{ runner.temp }}\TensorRT-RTX-1.1.1.26.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
if ($lastExitCode -ne 0) {
exit $lastExitCode
}

# Clean up the output directory before uploading artifacts
$outputDir = "${{ runner.temp }}\build\RelWithDebInfo"
Write-Host "Cleaning up files from $outputDir..."

Remove-Item -Path "$outputDir\onnxruntime" -Recurse -Force -ErrorAction SilentlyContinue
Remove-Item -Path "$outputDir\pybind11" -Recurse -Force -ErrorAction SilentlyContinue
Remove-Item -Path "$outputDir\models" -Recurse -Force -ErrorAction SilentlyContinue
Remove-Item -Path "$outputDir\vcpkg_installed" -Recurse -Force -ErrorAction SilentlyContinue
Remove-Item -Path "$outputDir\_deps" -Recurse -Force -ErrorAction SilentlyContinue
Remove-Item -Path "$outputDir\CMakeCache.txt" -Force -ErrorAction SilentlyContinue
Remove-Item -Path "$outputDir\CMakeFiles" -Recurse -Force -ErrorAction SilentlyContinue
# Remove intermediate object files as in the original script
Remove-Item -Path "$outputDir\*" -Include "*.obj" -Recurse
shell: pwsh

- name: Upload build artifacts
uses: actions/upload-artifact@v4
with:
name: build-artifacts
path: ${{ runner.temp }}\build
env:
OrtPackageId: Microsoft.ML.OnnxRuntime.Gpu
DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true
setVcvars: true
ALLOW_RELEASED_ONNX_OPSET_ONLY: '0'
DocUpdateNeeded: false
ONNXRUNTIME_TEST_GPU_DEVICE_ID: '0'
AZCOPY_AUTO_LOGIN_TYPE: MSI
AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4
ORT_UNIT_TEST_MAIN_LOG_LEVEL: 0

test:
name: Windows GPU TensorRT RTX CI Pipeline Test Job
needs: build
timeout-minutes: 300
runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Win2022-GPU-A10"]
steps:
- uses: actions/checkout@v5
with:
fetch-depth: 0
submodules: 'none'

- name: Download build artifacts
uses: actions/download-artifact@v5
with:
name: build-artifacts
path: ${{ runner.temp }}\build

- uses: actions/setup-python@v6
with:
python-version: '3.12'
architecture: x64

- uses: actions/setup-node@v5
with:
node-version: '20.x'

- uses: actions/setup-java@v5
with:
distribution: 'temurin'
java-version: '17'
architecture: x64

- name: Locate vcvarsall and Setup Env
uses: ./.github/actions/locate-vcvarsall-and-setup-env
with:
architecture: x64

- name: Install python modules
run: python -m pip install -r .\tools\ci_build\github\windows\python\requirements.txt
working-directory: ${{ github.workspace }}
shell: cmd

- name: Download CUDA SDK v12.8
working-directory: ${{ runner.temp }}
run: |
azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" .
dir
shell: pwsh

- name: Download TensorRT-RTX-1.1.1.26.Windows.win10.cuda-12.9
run: 'azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-RTX-1.1.1.26.Windows.win10.cuda-12.9" ${{ runner.temp }}'
shell: pwsh

- name: Add CUDA to PATH
shell: powershell
run: |
Write-Host "Adding CUDA to PATH"
Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin"
Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin"
Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64"
Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-RTX-1.1.1.26.Windows.win10.cuda-12.9\lib"

- name: Set OnnxRuntimeBuildDirectory
shell: pwsh
run: |
$buildDir = Join-Path ${{ runner.temp }} "build"
echo "OnnxRuntimeBuildDirectory=$buildDir" >> $env:GITHUB_ENV

- name: Install ONNX Runtime Wheel
uses: ./.github/actions/install-onnxruntime-wheel
with:
whl-directory: ${{ runner.temp }}\build\RelWithDebInfo\RelWithDebInfo\dist

- name: Show total physical memory
run: |
$ram = (Get-CimInstance Win32_ComputerSystem).TotalPhysicalMemory
$ramGB = [math]::Round($ram / 1GB, 2)
Write-Host "Total RAM: $ramGB GB"
shell: pwsh

- name: nvidia-smi
shell: powershell
run: |
nvidia-smi.exe
nvidia-smi.exe --query-gpu=memory.total,memory.used,memory.free --format=csv

- name: Run Tests
working-directory: ${{ runner.temp }}
run: |
npm install -g typescript
if ($lastExitCode -ne 0) {
exit $lastExitCode
}

python.exe ${{ github.workspace }}\tools\python\update_ctest_path.py "${{ runner.temp }}\build\RelWithDebInfo\CTestTestfile.cmake" "${{ runner.temp }}\build\RelWithDebInfo"
if ($lastExitCode -ne 0) {
exit $lastExitCode
}

python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_nv_tensorrt_rtx --tensorrt_rtx_home="${{ runner.temp }}\TensorRT-RTX-1.1.1.26.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
if ($lastExitCode -ne 0) {
exit $lastExitCode
}
shell: pwsh

- name: Check GPU memory usage and fail if >90%
shell: powershell
run: |
$csv = nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader
$parts = $csv -split ','
$used = [int]($parts[0] -replace ' MiB','')
$total = [int]($parts[1] -replace ' MiB','')
$percent = ($used / $total) * 100
Write-Host "GPU memory used: $used MiB / $total MiB ($percent %)"
if ($percent -gt 90) { throw "GPU memory usage exceeded threshold ($percent%)" }

- name: Validate C# native delegates
run: python tools\ValidateNativeDelegateAttributes.py
working-directory: ${{ github.workspace }}\csharp
shell: cmd
env:
OrtPackageId: Microsoft.ML.OnnxRuntime.Gpu
DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true
setVcvars: true
ALLOW_RELEASED_ONNX_OPSET_ONLY: '0'
DocUpdateNeeded: false
ONNXRUNTIME_TEST_GPU_DEVICE_ID: '0'
AZCOPY_AUTO_LOGIN_TYPE: MSI
AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4
3 changes: 2 additions & 1 deletion cmake/onnxruntime_providers_nv.cmake
@@ -2,7 +2,8 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the MIT License.
find_package(CUDAToolkit REQUIRED 12.8)
enable_language(CUDA)
# No need to call this here when using the onnxruntime build script: enable_language(CUDA) is already called in CMakeLists.txt when the CUDA EP is enabled.
# enable_language(CUDA)
if(onnxruntime_DISABLE_CONTRIB_OPS)
message( FATAL_ERROR "To compile TensorRT execution provider contrib ops have to be enabled to dump an engine using com.microsoft:EPContext node." )
endif()
@@ -1736,7 +1736,9 @@ SubGraphCollection_t NvExecutionProvider::GetSupportedList(SubGraphCollection_t
TensorrtLogger& trt_logger = GetTensorrtLogger(detailed_build_log_);
auto trt_builder = GetBuilder(trt_logger);
auto network_flags = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kSTRONGLY_TYPED);
LOGS_DEFAULT(INFO) << "[NvTensorRTRTX EP] Before createNetworkV2()";
auto trt_network = std::unique_ptr<nvinfer1::INetworkDefinition>(trt_builder->createNetworkV2(network_flags));
LOGS_DEFAULT(INFO) << "[NvTensorRTRTX EP] After createNetworkV2()";

bool is_model_supported = false;

@@ -1755,8 +1757,10 @@ SubGraphCollection_t NvExecutionProvider::GetSupportedList(SubGraphCollection_t
ORT_THROW("'nv_use_external_data_initializer' is only supported on TensorRT RTX 1.1.x.x and above.");
#endif
} else {
LOGS_DEFAULT(INFO) << "[NvTensorRTRTX EP] Before supportsModelV2()";
is_model_supported = trt_parser->supportsModelV2(string_buf.data(), string_buf.size(), model_path_);
}
LOGS_DEFAULT(INFO) << "[NvTensorRTRTX EP] After supportsModelV2()";

// Note: Calling getNbSubgraphs or getSubgraphNodes before calling supportsModelV2 results in undefined behavior.
auto num_subgraphs = trt_parser->getNbSubgraphs();
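The note above warns that getNbSubgraphs() and getSubgraphNodes() are undefined before supportsModelV2() has run. A minimal sketch of the required ordering, assuming the nvonnxparser::IParser interface used in this hunk (the helper function itself is hypothetical):

```cpp
// Hypothetical helper illustrating the required call order:
// supportsModelV2() must complete before any subgraph query, since
// getNbSubgraphs()/getSubgraphNodes() are undefined until then.
#include <cstddef>
#include <cstdint>
#include "NvOnnxParser.h"

int64_t CountSubgraphsAfterSupportCheck(nvonnxparser::IParser& parser,
                                        const void* model_bytes,
                                        size_t num_bytes,
                                        const char* model_path) {
  // A false result means the model is only partially supported; the
  // subgraph queries below are still valid after this call returns.
  parser.supportsModelV2(model_bytes, num_bytes, model_path);
  return parser.getNbSubgraphs();  // Safe only after supportsModelV2().
}
```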
@@ -276,15 +276,16 @@ onnxruntime::NodeArg& AddGroupQueryAttention(
void CreateLargeLLMModel(const PathString& model_path, const PathString& external_data_path) {
// Model parameters (example: 1 layer, 128 hidden dim, 8 attention heads, 1 KV head => GQA)
int batch_size = 1;
int num_layers = 32;
int hidden_dim = 2048;
int num_layers = 1;
int hidden_dim = 128;
int q_num_heads = 8;
int kv_num_heads = 1; // GQA: q_num_heads > kv_num_heads and divisible by it.
int seq_length = 128; // Short, for demonstration.
int vocab_size = 32000;
auto dtype = ONNX_NAMESPACE::TensorProto_DataType_FLOAT16;

// Set up model/graph
DefaultLoggingManager().SetDefaultLoggerSeverity(onnxruntime::logging::Severity::kVERBOSE);
onnxruntime::Model model("LLM_With_GQA", false, DefaultLoggingManager().DefaultLogger());
auto& graph = model.MainGraph();

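The kv_num_heads comment above encodes the GQA constraint: query heads must outnumber KV heads and split evenly into per-KV-head groups. A small self-contained check, with the helper name being hypothetical:

```cpp
// Hypothetical compile-time check of the GQA configuration built above:
// each KV head serves an equal-sized group of query heads, so q_num_heads
// must be a strict, positive multiple of kv_num_heads.
constexpr bool IsValidGqaConfig(int q_num_heads, int kv_num_heads) {
  return kv_num_heads > 0 && q_num_heads > kv_num_heads &&
         q_num_heads % kv_num_heads == 0;
}
static_assert(IsValidGqaConfig(8, 1), "8 query heads grouped onto 1 KV head");
```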
1 change: 1 addition & 0 deletions onnxruntime/test/unittest_main/test_main.cc
@@ -71,6 +71,7 @@ extern "C" void ortenv_setup() {
log_level = static_cast<OrtLoggingLevel>(*log_level_override);
}

// Force verbose logging, overriding any log level set above.
log_level = ORT_LOGGING_LEVEL_VERBOSE;
ort_env.reset(new Ort::Env(&tpo, log_level, "Default"));

#if defined(TEST_MAIN_ENABLE_DYNAMIC_PLUGIN_EP_USAGE)