diff --git a/.github/actions/linux-web-init-and-check/action.yml b/.github/actions/linux-web-init-and-check/action.yml
index c250f368a953e..694f026d07d0d 100644
--- a/.github/actions/linux-web-init-and-check/action.yml
+++ b/.github/actions/linux-web-init-and-check/action.yml
@@ -4,7 +4,7 @@ runs:
   using: "composite"
   steps:
     - name: Setup Node.js
-      uses: actions/setup-node@v3
+      uses: actions/setup-node@v5  # NOTE(review): the workflows in this change pin setup-node@v6 — confirm v5 is intended here
       with:
         node-version: "22.x"
 
diff --git a/.github/actions/locate-vcvarsall-and-setup-env/action.yml b/.github/actions/locate-vcvarsall-and-setup-env/action.yml
index c4fdc48a7bd63..fba855f14b487 100644
--- a/.github/actions/locate-vcvarsall-and-setup-env/action.yml
+++ b/.github/actions/locate-vcvarsall-and-setup-env/action.yml
@@ -16,8 +16,8 @@ runs:
     - name: Setup VCPKG
       uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9
       with:
-        vcpkg-version: '2025.06.13'
-        vcpkg-hash: '735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc'
+        vcpkg-version: '2025.08.27'
+        vcpkg-hash: '9a4b32849792e13bee1d24726f073b3881acae4165206ddf1a6378e44a4ddd05b3ee93f55ff46d8e8873b3cbcd06606212989e248f0bd615a5bf365070074079'
         cmake-version: '3.31.6'
         cmake-hash: '0f1584e8666cf4a65ec514bd02afe281caabf1d45d2c963f3151c41484f457386aa03273ab25776a670be02725354ce0b46f3a5121857416da37366342a833a0'
         add-cmake-to-path: 'true'
diff --git a/.github/actions/macos-ci-setup/action.yml b/.github/actions/macos-ci-setup/action.yml
index b3b95b855526f..5c6eb6193c393 100644
--- a/.github/actions/macos-ci-setup/action.yml
+++ b/.github/actions/macos-ci-setup/action.yml
@@ -31,7 +31,7 @@ runs:
   using: "composite"
   steps:
     - name: Use Python
-      uses: actions/setup-python@v5
+      uses: actions/setup-python@v6
       with:
         python-version: ${{ inputs.python_version }}
 
@@ -43,7 +43,7 @@ runs:
         assert platform.machine().lower() == "${{ inputs.platform_machine}}", "This job expects to be run on an ${{ inputs.platform_machine}} machine."
 
     - name: Use Node.js
-      uses: actions/setup-node@v4
+      uses: actions/setup-node@v5  # NOTE(review): the workflows in this change pin setup-node@v6 — confirm v5 is intended here
       with:
         node-version: ${{ inputs.node_version }}
 
@@ -52,7 +52,7 @@ runs:
       run: brew install coreutils ninja
 
     - name: Install Java
-      uses: actions/setup-java@v4
+      uses: actions/setup-java@v5
       with:
         distribution: "temurin"
         java-version: ${{ inputs.java_version }}
diff --git a/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml b/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml
index c30a8cb023f50..e36ecd505fc21 100644
--- a/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml
+++ b/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml
@@ -4,6 +4,9 @@ description: "This is a reusable workflow for Linux WASM CI pipelines to build a
 on:
   workflow_call:
     inputs:
+      job_name:  # Caller-supplied job label; interpolated into this workflow's ccache and vcpkg cache keys.
+        required: true
+        type: string
       build_config:
         required: true
         type: string
@@ -37,16 +40,16 @@ jobs:
     runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
     env:
       buildArch: x64
-      common_build_args: --parallel ${{ inputs.use_vcpkg == true && '--use_vcpkg --use_vcpkg_ms_internal_asset_cache' || '' }} --config ${{ inputs.build_config }} --skip_submodule_sync --build_wasm --enable_wasm_simd ${{ inputs.enable_wasm_threads == true && '--enable_wasm_threads' || '' }} ${{ inputs.extra_build_args }}
+      common_build_args: --parallel --use_cache ${{ inputs.use_vcpkg == true && '--use_vcpkg --use_vcpkg_ms_internal_asset_cache' || '' }} --config ${{ inputs.build_config }} --skip_submodule_sync --build_wasm --enable_wasm_simd ${{ inputs.enable_wasm_threads == true && '--enable_wasm_threads' || '' }} ${{ inputs.extra_build_args }}
 
     steps:
       - name: Checkout code
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           submodules: recursive
 
       - name: Set up Node.js
-        uses: actions/setup-node@v4
+        uses: actions/setup-node@v6
         with:
           node-version: "22"
 
@@ -56,8 +59,25 @@ jobs:
           python-version: "3.12"
           architecture: ${{ env.buildArch }}
 
-      - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9
+      - name: Install python dependencies
+        run: python -m pip install flatbuffers
+
+      - name: Setup CCache
+        uses: actions/cache@v4
+        with:
+          key: ccache | web.yml | ${{ inputs.job_name }}  # NOTE(review): fixed key; an exact hit skips the post-job save — confirm refresh strategy
+          path: ~/.cache/ccache
+
+      - name: Setup VCPKG Cache
+        uses: actions/cache@v4
+        with:
+          key: vcpkg-cache | web.yml | ${{ inputs.job_name }}
+          path: ~/.cache/vcpkg
+
+      - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
+          ccache-version: '4.13.1'
+          ccache-hash: '626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee'
           vcpkg-version: '2025.06.13'
           vcpkg-hash: '735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc'
           cmake-version: '3.31.6'
@@ -114,7 +134,7 @@ jobs:
 
       - name: Upload WASM artifacts
         if: ${{ inputs.skip_publish != true }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: ${{ inputs.build_config }}_wasm
           path: ${{ github.workspace }}/artifacts/wasm
@@ -143,7 +163,7 @@ jobs:
 
       - name: Publish test results
         if: ${{ always() && inputs.build_config == 'Debug' }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: test-results
           path: ${{ github.workspace }}/build/**/*.results.xml
diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml
index 6f517f2656e94..9aa8418c55a40 100644
--- a/.github/workflows/linux_ci.yml
+++ b/.github/workflows/linux_ci.yml
@@ -48,6 +48,7 @@ jobs:
       dockerfile_path: tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
       docker_image_repo: onnxruntimecpubuildcix64
       extra_build_flags: '--enable_address_sanitizer'
+      job_identifier: build-linux-x64-debug
       # python_path_prefix: '' # Default empty string is fine, no prefix needed
     secrets:
       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -63,6 +64,7 @@ jobs:
       docker_image_repo: onnxruntimecpubuildpythonx64
       extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --build_nuget --enable_transformers_tool_test --cmake_extra_defines onnxruntime_BUILD_BENCHMARKS=ON'
       python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' # $ needs escaping in single quotes
+      job_identifier: build-linux-x64-release
     secrets:
       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
@@ -77,6 +79,7 @@ jobs:
       docker_image_repo: onnxruntimecpubuildpythonx64 # Shares image with standard x64 release
       extra_build_flags: '--enable_training --use_binskim_compliant_compile_flags --build_wheel --build_nuget --enable_transformers_tool_test --cmake_extra_defines onnxruntime_BUILD_BENCHMARKS=ON'
       python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' # $ needs escaping in single quotes
+      job_identifier: orttraining-linux-ci-pipeline
     secrets:
       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
@@ -92,6 +95,7 @@ jobs:
       docker_image_repo: onnxruntimecpubuildciaarch64
       # ASan disabled due to excessive runtime (>4hr). Includes wheel build for basic checks.
       extra_build_flags: '--use_binskim_compliant_compile_flags --build_shared_lib'
+      job_identifier: build-linux-arm64-debug
     secrets:
       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
@@ -106,5 +110,6 @@ jobs:
       docker_image_repo: onnxruntimecpubuildpythonaarch64
       extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cmake_extra_defines onnxruntime_BUILD_BENCHMARKS=ON'
       python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' # $ needs escaping in single quotes
+      job_identifier: build-linux-arm64-release
     secrets:
       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/linux_minimal_build.yml b/.github/workflows/linux_minimal_build.yml
index 92cdbb70e9858..655921342ae00 100644
--- a/.github/workflows/linux_minimal_build.yml
+++ b/.github/workflows/linux_minimal_build.yml
@@ -29,16 +29,30 @@ jobs:
       packages: write
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           submodules: false
 
-      - uses: actions/setup-node@v4
+      - uses: actions/setup-node@v6
         with:
           node-version: 20
 
-      - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9
+      - name: Setup CCache
+        uses: actions/cache@v4
         with:
+          key: ccache | linux_minimal_build.yml | build_full_ort  # NOTE(review): fixed key; an exact hit skips the post-job save — confirm refresh strategy
+          path: ~/.cache/ccache
+
+      - name: Setup VCPKG Cache
+        uses: actions/cache@v4
+        with:
+          key: vcpkg-cache | linux_minimal_build.yml | build_full_ort
+          path: ~/.cache/vcpkg
+
+      - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
+        with:
+          ccache-version: '4.13.1'
+          ccache-hash: '626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee'
           vcpkg-version: '2025.06.13'
           vcpkg-hash: '735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc'
           cmake-version: '3.31.6'
@@ -47,10 +61,10 @@ jobs:
           disable-terrapin: 'true'
 
       - name: Build Full ORT and Prepare Test Files
-        uses: microsoft/onnxruntime-github-actions/build-and-prep-ort-files@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/build-and-prep-ort-files@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
 
       - name: Upload Test Data Artifact
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: test_data
           path: ${{ runner.temp }}/minimal_build_test_data/
@@ -66,15 +80,27 @@ jobs:
       id-token: write # If using OIDC for ACR login
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           submodules: false
-      - uses: actions/setup-node@v4
+      - uses: actions/setup-node@v6
         with:
           node-version: 20
 
+      - name: Setup CCache
+        uses: actions/cache@v4
+        with:
+          key: ccache | linux_minimal_build.yml | build_minimal_exceptions_disabled
+          path: ~/.cache/ccache
+
+      - name: Setup VCPKG Cache
+        uses: actions/cache@v4
+        with:
+          key: vcpkg-cache | linux_minimal_build.yml | build_minimal_exceptions_disabled
+          path: ~/.cache/vcpkg
+
       - name: Get Docker Image using Action
-        uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         id: build_docker_image_step
         with:
           dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
@@ -85,10 +111,9 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Run Build 2 (Update)
-        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
-          docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name
-            }}
+          docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }}
           build_config: Debug # From original --config Debug
           mode: 'update' # CMake configure step
           extra_build_flags: >-
@@ -100,10 +125,9 @@ jobs:
             --enable_training_ops
 
       - name: Run Build 2 (Build)
-        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
-          docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name
-            }}
+          docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }}
           build_config: Debug # From original --config Debug
           mode: 'build' # Actual build step
           extra_build_flags: >-
@@ -125,15 +149,29 @@ jobs:
       id-token: write # If using OIDC for ACR login
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           submodules: false
-      - uses: actions/setup-node@v4
+      - uses: actions/setup-node@v6
         with:
           node-version: 20
 
-      - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9
+      - name: Setup CCache
+        uses: actions/cache@v4
         with:
+          key: ccache | linux_minimal_build.yml | build_minimal_custom_ops  # NOTE(review): fixed key; an exact hit skips the post-job save — confirm refresh strategy
+          path: ~/.cache/ccache
+
+      - name: Setup VCPKG Cache
+        uses: actions/cache@v4
+        with:
+          key: vcpkg-cache | linux_minimal_build.yml | build_minimal_custom_ops
+          path: ~/.cache/vcpkg
+
+      - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
+        with:
+          ccache-version: '4.13.1'
+          ccache-hash: '626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee'
           vcpkg-version: '2025.06.13'
           vcpkg-hash: '735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc'
           cmake-version: '3.31.6'
@@ -142,7 +180,7 @@ jobs:
           disable-terrapin: 'true'
 
       - name: Build Full ORT and Prepare Test Files
-        uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
           reduced-ops-config-file: required_ops.ort_models.config
           enable-custom-ops: 'true'
@@ -159,23 +197,38 @@ jobs:
       id-token: write # If using OIDC for ACR login
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           submodules: false
-      - uses: actions/setup-node@v4
+      - uses: actions/setup-node@v6
         with:
           node-version: 20
 
-      - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9
+      - name: Setup CCache
+        uses: actions/cache@v4
+        with:
+          key: ccache | linux_minimal_build.yml | build_minimal_type_reduction  # NOTE(review): fixed key; an exact hit skips the post-job save — confirm refresh strategy
+          path: ~/.cache/ccache
+
+      - name: Setup VCPKG Cache
+        uses: actions/cache@v4
         with:
+          key: vcpkg-cache | linux_minimal_build.yml | build_minimal_type_reduction
+          path: ~/.cache/vcpkg
+
+      - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
+        with:
+          ccache-version: '4.13.1'
+          ccache-hash: '626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee'
           vcpkg-version: '2025.06.13'
           vcpkg-hash: '735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc'
           cmake-version: '3.31.6'
           cmake-hash: '42395e20b10a8e9ef3e33014f9a4eed08d46ab952e02d2c1bbc8f6133eca0d7719fb75680f9bbff6552f20fcd1b73d86860f7f39388d631f98fb6f622b37cf04'
           add-cmake-to-path: 'true'
           disable-terrapin: 'true'
+
       - name: Build Full ORT and Prepare Test Files
-        uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
           reduced-ops-config-file: required_ops_and_types.ort_models.config
           enable-type-reduction: 'true'
@@ -191,15 +244,29 @@ jobs:
       id-token: write # If using OIDC for ACR login
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           submodules: false
-      - uses: actions/setup-node@v4
+      - uses: actions/setup-node@v6
         with:
           node-version: 20
 
-      - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9
+      - name: Setup CCache
+        uses: actions/cache@v4
+        with:
+          key: ccache | linux_minimal_build.yml | build_minimal_globally_allowed_types  # NOTE(review): fixed key; an exact hit skips the post-job save — confirm refresh strategy
+          path: ~/.cache/ccache
+
+      - name: Setup VCPKG Cache
+        uses: actions/cache@v4
         with:
+          key: vcpkg-cache | linux_minimal_build.yml | build_minimal_globally_allowed_types
+          path: ~/.cache/vcpkg
+
+      - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
+        with:
+          ccache-version: '4.13.1'
+          ccache-hash: '626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee'
           vcpkg-version: '2025.06.13'
           vcpkg-hash: '735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc'
           cmake-version: '3.31.6'
@@ -208,7 +275,7 @@ jobs:
           disable-terrapin: 'true'
 
       - name: Build Full ORT and Prepare Test Files
-        uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
           globally_allowed_types: 'bool,float,int8_t,uint8_t'
           enable-type-reduction: 'true'
@@ -225,15 +292,27 @@ jobs:
       id-token: write # If using OIDC for ACR login
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           submodules: false
-      - uses: actions/setup-node@v4
+      - uses: actions/setup-node@v6
         with:
           node-version: 20
 
+      - name: Setup CCache
+        uses: actions/cache@v4
+        with:
+          key: ccache | linux_minimal_build.yml | build_extended_minimal
+          path: ~/.cache/ccache
+
+      - name: Setup VCPKG Cache
+        uses: actions/cache@v4
+        with:
+          key: vcpkg-cache | linux_minimal_build.yml | build_extended_minimal
+          path: ~/.cache/vcpkg
+
       - name: Get Docker Image using Action
-        uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         id: build_docker_image_step
         with:
           dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
@@ -243,12 +322,10 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-
       - name: Run Build 5 (Update)
-        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
-          docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name
-            }}
+          docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }}
           build_config: Debug
           mode: 'update'
           extra_build_flags: >-
@@ -258,7 +335,7 @@ jobs:
             --minimal_build extended
 
       - name: Run Build 5 (Build)
-        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
           docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name
             }}
@@ -270,7 +347,7 @@ jobs:
             --use_binskim_compliant_compile_flags
             --minimal_build extended
       - name: Run Build 5 (Test)
-        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
           docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name
             }}
@@ -292,12 +369,12 @@ jobs:
       id-token: write # If using OIDC for ACR login
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           submodules: false
 
       - name: Get Docker Image using Action
-        uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         id: build_docker_image_step
         with:
           dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
@@ -313,8 +390,20 @@ jobs:
           mkdir -p ${{ runner.temp }}/.test_data
           touch ${{ runner.temp }}/.test_data/include_no_operators.config
 
+      - name: Setup CCache
+        uses: actions/cache@v4
+        with:
+          key: ccache | linux_minimal_build.yml | build_regular_no_optional
+          path: ~/.cache/ccache
+
+      - name: Setup VCPKG Cache
+        uses: actions/cache@v4
+        with:
+          key: vcpkg-cache | linux_minimal_build.yml | build_regular_no_optional
+          path: ~/.cache/vcpkg
+
       - name: Run Build 6a (Update)
-        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
           docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name
             }}
@@ -330,7 +419,7 @@ jobs:
             --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF
 
       - name: Run Build 6a (Build)
-        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
           docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name
             }}
@@ -347,7 +436,7 @@ jobs:
 
 
       - name: Run Build 6a (Test)
-        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
           docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name
             }}
@@ -372,7 +461,7 @@ jobs:
       id-token: write # If using OIDC for ACR login
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           submodules: false
 
@@ -383,7 +472,7 @@ jobs:
           touch ${{ runner.temp }}/.test_data/include_no_operators.config
 
       - name: Get Docker Image using Action
-        uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         id: build_docker_image_step
         with:
           dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
@@ -393,11 +482,22 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
+      - name: Setup CCache
+        uses: actions/cache@v4
+        with:
+          key: ccache | linux_minimal_build.yml | build_minimal_no_optional
+          path: ~/.cache/ccache
+
+      - name: Setup VCPKG Cache
+        uses: actions/cache@v4
+        with:
+          key: vcpkg-cache | linux_minimal_build.yml | build_minimal_no_optional
+          path: ~/.cache/vcpkg
+
       - name: Run Build 6b (Update)
-        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
-          docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name
-            }}
+          docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }}
           build_config: MinSizeRel # From original --config MinSizeRel
           mode: 'update'
           extra_build_flags: >-
@@ -413,7 +513,7 @@ jobs:
             --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF
 
       - name: Run Build 6b (Build)
-        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
           docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name
             }}
@@ -441,7 +541,7 @@ jobs:
       id-token: write # If using OIDC for ACR login
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           submodules: false
 
@@ -452,7 +552,7 @@ jobs:
           touch ${{ runner.temp }}/.test_data/include_no_operators.config
 
       - name: Get Docker Image using Action
-        uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         id: build_docker_image_step
         with:
           dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
@@ -468,8 +568,20 @@ jobs:
           mkdir -p ${{ runner.temp }}/.test_data
           touch ${{ runner.temp }}/.test_data/include_no_operators.config
 
+      - name: Setup CCache
+        uses: actions/cache@v4
+        with:
+          key: ccache | linux_minimal_build.yml | build_extended_minimal_no_optional
+          path: ~/.cache/ccache
+
+      - name: Setup VCPKG Cache
+        uses: actions/cache@v4
+        with:
+          key: vcpkg-cache | linux_minimal_build.yml | build_extended_minimal_no_optional
+          path: ~/.cache/vcpkg
+
       - name: Run Build 6c (Update)
-        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
           docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name
             }}
@@ -488,7 +600,7 @@ jobs:
             --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF
 
       - name: Run Build 6c (Build)
-        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         with:
           docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name
             }}
@@ -518,10 +630,10 @@ jobs:
       id-token: write # If using OIDC for ACR login
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           submodules: false
-      - uses: actions/setup-node@v4
+      - uses: actions/setup-node@v6
         with:
           node-version: 20
       - name: Download Test Data Artifact
@@ -531,7 +643,7 @@ jobs:
           path: ${{ runner.temp }}/.test_data/
 
       - name: Get Docker Image using Action
-        uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9
+        uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12
         id: build_docker_image_step
         with:
           dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index af2b36c870201..4775d92367930 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -60,15 +60,15 @@ jobs:
       matrix:
         target_arch: [x86_64, arm64]
 
-    timeout-minutes: 90
+    timeout-minutes: 120
 
     steps:
       - name: Checkout code
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
       - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9
         with:
-          vcpkg-version: '2025.06.13'
-          vcpkg-hash: 735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc
+          vcpkg-version: '2025.08.27'
+          vcpkg-hash: 9a4b32849792e13bee1d24726f073b3881acae4165206ddf1a6378e44a4ddd05b3ee93f55ff46d8e8873b3cbcd06606212989e248f0bd615a5bf365070074079
           cmake-version: '3.31.8'
           cmake-hash: 99cc9c63ae49f21253efb5921de2ba84ce136018abf08632c92c060ba91d552e0f6acc214e9ba8123dee0cf6d1cf089ca389e321879fd9d719a60d975bcffcc8
           add-cmake-to-path: 'true'
@@ -112,11 +112,11 @@ jobs:
 
     steps:
       - name: Checkout code
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
       - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9
         with:
-          vcpkg-version: '2025.06.13'
-          vcpkg-hash: 735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc
+          vcpkg-version: '2025.08.27'
+          vcpkg-hash: 9a4b32849792e13bee1d24726f073b3881acae4165206ddf1a6378e44a4ddd05b3ee93f55ff46d8e8873b3cbcd06606212989e248f0bd615a5bf365070074079
           cmake-version: '3.31.8'
           cmake-hash: 99cc9c63ae49f21253efb5921de2ba84ce136018abf08632c92c060ba91d552e0f6acc214e9ba8123dee0cf6d1cf089ca389e321879fd9d719a60d975bcffcc8
           add-cmake-to-path: 'true'
diff --git a/.github/workflows/macos-ci-build-and-test-workflow.yml b/.github/workflows/macos-ci-build-and-test-workflow.yml
index 281538336b0c1..1583dd127886c 100644
--- a/.github/workflows/macos-ci-build-and-test-workflow.yml
+++ b/.github/workflows/macos-ci-build-and-test-workflow.yml
@@ -61,11 +61,11 @@ jobs:
 
     steps:
       - name: Checkout code
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
       - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9
         with:
-          vcpkg-version: '2025.06.13'
-          vcpkg-hash: 735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc
+          vcpkg-version: '2025.08.27'
+          vcpkg-hash: 9a4b32849792e13bee1d24726f073b3881acae4165206ddf1a6378e44a4ddd05b3ee93f55ff46d8e8873b3cbcd06606212989e248f0bd615a5bf365070074079
           cmake-version: '3.31.8'
           cmake-hash: 99cc9c63ae49f21253efb5921de2ba84ce136018abf08632c92c060ba91d552e0f6acc214e9ba8123dee0cf6d1cf089ca389e321879fd9d719a60d975bcffcc8
           add-cmake-to-path: 'true'
diff --git a/.github/workflows/reusable_linux_build.yml b/.github/workflows/reusable_linux_build.yml
index 1a9c0e0a72031..8f4cf9a26bf46 100644
--- a/.github/workflows/reusable_linux_build.yml
+++ b/.github/workflows/reusable_linux_build.yml
@@ -58,6 +58,11 @@ on:
         required: false
         type: boolean
         default: false
+      job_identifier:
+        description: 'A unique identifier for the job, used for hosted pool tracking'
+        required: false
+        type: string
+        default: 'linux-build'
     secrets:
       GH_TOKEN:
         description: 'GitHub token for accessing actions/packages'
@@ -68,6 +73,7 @@ jobs:
     runs-on:
       - self-hosted
       - "1ES.Pool=${{ inputs.pool_name }}"
+      - "JobId=${{ inputs.job_identifier }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
     permissions:
       contents: read
       packages: write
@@ -75,9 +81,10 @@ jobs:
       id-token: write
     steps:
       - name: Checkout code
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
 
       - name: Set up Python ${{ inputs.python_version }}
+        if: inputs.architecture != 'arm64'
         uses: actions/setup-python@v6
         with:
           python-version: ${{ inputs.python_version }}
@@ -163,7 +170,7 @@ jobs:
       # ------------- Upload Build Output Step -------------
       - name: Upload Build Output Artifact
         if: inputs.upload_build_output == true
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: build-output-${{ inputs.architecture }}-${{ inputs.build_config }}
           path: ${{ runner.temp }}/${{ inputs.build_config }}
@@ -172,7 +179,7 @@ jobs:
       # ------------- Upload Log on Build Failure Step -------------
       - name: Upload VCPKG Manifest Install Log on Update or Build Failure
         if: steps.update_step.outcome == 'failure' || steps.build_step.outcome == 'failure'
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: vcpkg-manifest-install-log-${{ inputs.architecture }}-${{ inputs.build_config }}
           path: ${{ runner.temp }}/${{ inputs.build_config }}/${{ inputs.build_config }}/vcpkg-manifest-install.log
diff --git a/.github/workflows/web.yml b/.github/workflows/web.yml
index 616c2c6db8a8d..49fd7202cb86d 100644
--- a/.github/workflows/web.yml
+++ b/.github/workflows/web.yml
@@ -22,7 +22,7 @@ jobs:
       commit_sha: ${{ steps.extract_commit.outputs.commit_sha }}
     steps:
       - name: Checkout code
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           submodules: true
 
@@ -38,6 +38,7 @@ jobs:
     needs: precheck
     uses: ./.github/workflows/linux-wasm-ci-build-and-test-workflow.yml
     with:
+      job_name: wasm_Debug
       build_config: Debug
       extra_build_args: "--enable_wasm_profiling"
       build_jsep: true
@@ -47,6 +48,7 @@ jobs:
     needs: precheck
     uses: ./.github/workflows/linux-wasm-ci-build-and-test-workflow.yml
     with:
+      job_name: wasm_Release
       build_config: Release
       extra_build_args: "--target onnxruntime_webassembly --skip_tests --enable_wasm_api_exception_catching --disable_rtti"
       build_jsep: true
@@ -56,6 +58,7 @@ jobs:
     needs: precheck
     uses: ./.github/workflows/linux-wasm-ci-build-and-test-workflow.yml
     with:
+      job_name: wasm_Release_static_library
       build_config: Release
       extra_build_args: "--skip_tests --enable_wasm_api_exception_catching --disable_rtti --build_wasm_static_lib"
       use_vcpkg: false
@@ -68,6 +71,7 @@ jobs:
       - wasm_Debug
     uses: ./.github/workflows/windows-web-ci-workflow.yml
     with:
+      job_name: web_Debug
       commit_override: ${{ needs.precheck.outputs.commit_sha }}
       build_config: Debug
 
@@ -77,5 +81,6 @@ jobs:
       - wasm_Release
     uses: ./.github/workflows/windows-web-ci-workflow.yml
     with:
+      job_name: web_Release
       commit_override: ${{ needs.precheck.outputs.commit_sha }}
       build_config: Release
diff --git a/.github/workflows/windows-web-ci-workflow.yml b/.github/workflows/windows-web-ci-workflow.yml
index 0ea8b3ee33644..abc46681e8220 100644
--- a/.github/workflows/windows-web-ci-workflow.yml
+++ b/.github/workflows/windows-web-ci-workflow.yml
@@ -4,6 +4,9 @@ description: "Windows Web CI pipeline for building and testing ONNX Runtime Web"
 on:
   workflow_call:
     inputs:
+      job_name:
+        required: true
+        type: string
       commit_override:
         type: string
         default: ""
@@ -19,7 +22,11 @@ on:
 
 jobs:
   build_onnxruntime_web:
-    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Win2022-WEBGPU-A10"]
+    runs-on: [  # JobId embeds inputs.job_name so each caller of this reusable workflow gets a distinct label
+      "self-hosted",
+      "1ES.Pool=onnxruntime-github-Win2022-WEBGPU-A10",
+      "JobId=build_onnxruntime_web-${{ inputs.job_name }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
+      ]
 
     env:
       webgpu_commandline_extra_flags: "--chromium-flags=--ignore-gpu-blocklist --chromium-flags=--gpu-vendor-id=0x10de"
@@ -29,7 +36,7 @@ jobs:
 
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           submodules: false
 
@@ -62,12 +69,12 @@ jobs:
           git checkout -- .gitattributes
 
       - name: Setup Node.js
-        uses: actions/setup-node@v4
+        uses: actions/setup-node@v6
         with:
           node-version: "20.x"
 
       - name: Download WebAssembly artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
         with:
           name: ${{ inputs.build_config }}_wasm
           path: ${{ github.workspace }}/artifacts_wasm
@@ -95,7 +102,7 @@ jobs:
         run: npm ci
         working-directory: ${{ github.workspace }}/js/web
 
-      - uses: actions/cache@v4
+      - uses: actions/cache@v5
         id: onnx-node-tests-cache
         with:
           path: ${{ github.workspace }}/js/test/
@@ -173,7 +180,7 @@ jobs:
       # this step is added to help investigate the shader validation failure which is hard to reproduce
       - name: Upload WebGPU shader validation log on failure
         if: ${{ failure() && inputs.build_config == 'Debug' }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: webgpu-shader-validation-logs
           path: ${{ runner.temp }}\web\test\07\chrome_debug.log
@@ -203,7 +210,7 @@ jobs:
 
       - name: Upload NPM packages
         if: ${{ inputs.build_config == 'Release' }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: ${{ inputs.package_name }}
           path: ${{ github.workspace }}\artifacts_npm
diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml
index 0b1bf59733349..8daacd79e9040 100644
--- a/.github/workflows/windows_cuda.yml
+++ b/.github/workflows/windows_cuda.yml
@@ -19,9 +19,13 @@ concurrency:
 jobs:
   build:
     name: Windows GPU CUDA CI Pipeline
-    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"]
+    runs-on: [
+        "self-hosted",
+        "1ES.Pool=onnxruntime-github-vs2022-latest",
+        "JobId=windows-cuda-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
+        ]
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           fetch-depth: 0
           submodules: 'none'
@@ -41,10 +45,10 @@ jobs:
         working-directory: ${{ github.workspace }}
         shell: cmd
 
-      - name: Download CUDA SDK v12.2
+      - name: Download CUDA SDK v12.8
         working-directory: ${{ runner.temp }}
         run: |
-          azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" .
+          azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" .
           dir
         shell: pwsh
 
@@ -52,21 +56,21 @@ jobs:
         shell: powershell
         run: |
           Write-Host "Adding CUDA to PATH"
-          Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin"
-          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin"
-          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64"
+          Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin"
+          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin"
+          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64"
 
-      - uses: actions/setup-node@v4
+      - uses: actions/setup-node@v6
         with:
           node-version: '20.x'
 
-      - uses: actions/setup-java@v4
+      - uses: actions/setup-java@v5
         with:
           distribution: 'temurin'
           java-version: '17'
           architecture: x64
 
-      - uses: actions/cache@v4
+      - uses: actions/cache@v5
         id: onnx-node-tests-cache
         with:
           path: ${{ github.workspace }}/js/test/
@@ -82,7 +86,7 @@ jobs:
         working-directory: ${{ github.workspace }}
         shell: cmd
 
-      - uses: actions/setup-dotnet@v4
+      - uses: actions/setup-dotnet@v5
         env:
           PROCESSOR_ARCHITECTURE: x64
         with:
@@ -111,7 +115,7 @@ jobs:
             exit $lastExitCode
           }
           # Execute the build process
-          python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+          python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
@@ -132,7 +136,7 @@ jobs:
         shell: pwsh
 
       - name: Upload build artifacts
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: build-artifacts
           path: ${{ runner.temp }}\build
@@ -150,15 +154,19 @@ jobs:
     name: Windows GPU CUDA CI Pipeline Test Job
     needs: build
     timeout-minutes: 300
-    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Win2022-GPU-A10"]
+    runs-on: [
+      "self-hosted",
+      "1ES.Pool=onnxruntime-github-Win2022-GPU-A10",
+      "JobId=windows-cuda-test-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
+    ]
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           fetch-depth: 0
           submodules: 'none'
 
       - name: Download build artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
         with:
           name: build-artifacts
           path: ${{ runner.temp }}\build
@@ -168,11 +176,11 @@ jobs:
           python-version: '3.12'
           architecture: x64
 
-      - uses: actions/setup-node@v4
+      - uses: actions/setup-node@v6
         with:
           node-version: '20.x'
 
-      - uses: actions/setup-java@v4
+      - uses: actions/setup-java@v5
         with:
           distribution: 'temurin'
           java-version: '17'
@@ -188,10 +196,10 @@ jobs:
         working-directory: ${{ github.workspace }}
         shell: cmd
 
-      - name: Download CUDA SDK v12.2
+      - name: Download CUDA SDK v12.8
         working-directory: ${{ runner.temp }}
         run: |
-          azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" .
+          azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" .
           dir
         shell: pwsh
 
@@ -199,9 +207,9 @@ jobs:
         shell: powershell
         run: |
           Write-Host "Adding CUDA to PATH"
-          Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin"
-          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin"
-          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64"
+          Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin"
+          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin"
+          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64"
 
       - name: Set OnnxRuntimeBuildDirectory
         shell: pwsh
@@ -227,7 +235,7 @@ jobs:
             exit $lastExitCode
           }
           
-          python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
+          python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86  --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
diff --git a/.github/workflows/windows_qnn_x64.yml b/.github/workflows/windows_qnn_x64.yml
index 4c08d543cefd9..fa07fd47d87e9 100644
--- a/.github/workflows/windows_qnn_x64.yml
+++ b/.github/workflows/windows_qnn_x64.yml
@@ -18,7 +18,7 @@ concurrency:
 jobs:
   build_test_qnn_ep:
     name: Windows x64 QNN CI Pipeline (${{ matrix.QnnLibKind }})
-    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"]
+    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"]
     timeout-minutes: 120
     strategy:
       matrix: 
diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml
index de6fa1529bcb1..5eb08a369cb87 100644
--- a/.github/workflows/windows_tensorrt.yml
+++ b/.github/workflows/windows_tensorrt.yml
@@ -19,9 +19,13 @@ concurrency:
 jobs:
   build:
     name: Windows GPU TensorRT CI Pipeline
-    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"]
+    runs-on: [
+        "self-hosted",
+        "1ES.Pool=onnxruntime-github-vs2022-latest",
+        "JobId=windows-tensorrt-build-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
+      ]
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           fetch-depth: 0
           submodules: 'none'
@@ -41,37 +45,38 @@ jobs:
         working-directory: ${{ github.workspace }}
         shell: cmd
 
-      - name: Download CUDA SDK v12.2
+      - name: Download CUDA SDK v12.8
         working-directory: ${{ runner.temp }}
         run: |
-          azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" .
+          azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" .
           dir
         shell: pwsh
 
-      - name: Download TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8
-        run: 'azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" ${{ runner.temp }}'
+      - name: Download TensorRT-10.14.1.48.Windows.win10.cuda-12.9
+        run: 'azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.14.1.48.Windows.win10.cuda-12.9" ${{ runner.temp }}'
         shell: pwsh
 
       - name: Add CUDA to PATH
         shell: powershell
         run: |
           Write-Host "Adding CUDA to PATH"
-          Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin"
-          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin"
-          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64"
-          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib"
+          Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin"
+          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin"
+          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64"
+          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.14.1.48.Windows.win10.cuda-12.9\lib"
+          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.14.1.48.Windows.win10.cuda-12.9\bin"
 
-      - uses: actions/setup-node@v4
+      - uses: actions/setup-node@v6
         with:
           node-version: '20.x'
 
-      - uses: actions/setup-java@v4
+      - uses: actions/setup-java@v5
         with:
           distribution: 'temurin'
           java-version: '17'
           architecture: x64
 
-      - uses: actions/cache@v4
+      - uses: actions/cache@v5
         id: onnx-node-tests-cache
         with:
           path: ${{ github.workspace }}/js/test/
@@ -87,7 +92,7 @@ jobs:
         working-directory: ${{ github.workspace }}
         shell: cmd
 
-      - uses: actions/setup-dotnet@v4
+      - uses: actions/setup-dotnet@v5
         env:
           PROCESSOR_ARCHITECTURE: x64
         with:
@@ -116,7 +121,7 @@ jobs:
             exit $lastExitCode
           }
           # Execute the build process
-          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags             --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests             --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8"             --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags             --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests             --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9"             --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
@@ -137,7 +142,7 @@ jobs:
         shell: pwsh
 
       - name: Upload build artifacts
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: build-artifacts
           path: ${{ runner.temp }}\build
@@ -155,15 +160,19 @@ jobs:
     name: Windows GPU TensorRT CI Pipeline Test Job
     needs: build
     timeout-minutes: 300
-    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Win2022-GPU-A10"]
+    runs-on: [
+      "self-hosted",
+      "1ES.Pool=onnxruntime-github-Win2022-GPU-A10",
+      "JobId=windows-tensorrt-test-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}"
+    ]
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           fetch-depth: 0
           submodules: 'none'
 
       - name: Download build artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v7
         with:
           name: build-artifacts
           path: ${{ runner.temp }}\build
@@ -173,11 +182,11 @@ jobs:
           python-version: '3.12'
           architecture: x64
 
-      - uses: actions/setup-node@v4
+      - uses: actions/setup-node@v6
         with:
           node-version: '20.x'
 
-      - uses: actions/setup-java@v4
+      - uses: actions/setup-java@v5
         with:
           distribution: 'temurin'
           java-version: '17'
@@ -193,25 +202,26 @@ jobs:
         working-directory: ${{ github.workspace }}
         shell: cmd
 
-      - name: Download CUDA SDK v12.2
+      - name: Download CUDA SDK v12.8
         working-directory: ${{ runner.temp }}
         run: |
-          azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" .
+          azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" .
           dir
         shell: pwsh
 
-      - name: Download TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8
-        run: 'azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" ${{ runner.temp }}'
+      - name: Download TensorRT-10.14.1.48.Windows.win10.cuda-12.9
+        run: 'azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.14.1.48.Windows.win10.cuda-12.9" ${{ runner.temp }}'
         shell: pwsh
 
       - name: Add CUDA to PATH
         shell: powershell
         run: |
           Write-Host "Adding CUDA to PATH"
-          Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin"
-          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin"
-          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64"
-          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib"
+          Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin"
+          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin"
+          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64"
+          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.14.1.48.Windows.win10.cuda-12.9\lib"
+          Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.14.1.48.Windows.win10.cuda-12.9\bin"
 
       - name: Set OnnxRuntimeBuildDirectory
         shell: pwsh
@@ -237,7 +247,7 @@ jobs:
             exit $lastExitCode
           }
           
-          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags             --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests             --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8"             --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
+          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags             --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests             --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9"             --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
           if ($lastExitCode -ne 0) {
             exit $lastExitCode
           }
diff --git a/.github/workflows/windows_webgpu.yml b/.github/workflows/windows_webgpu.yml
index e1a8c28f5a1ad..5049e000495bf 100644
--- a/.github/workflows/windows_webgpu.yml
+++ b/.github/workflows/windows_webgpu.yml
@@ -34,7 +34,7 @@ jobs:
       ONNXRUNTIME_TEST_GPU_DEVICE_ID: "0"
     steps:
       - name: Checkout
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           fetch-depth: 0
           submodules: none
@@ -56,12 +56,12 @@ jobs:
         working-directory: ${{ github.workspace }}
 
       - name: Setup Node.js
-        uses: actions/setup-node@v4
+        uses: actions/setup-node@v6
         with:
           node-version: "20.x"
 
       - name: Setup Java
-        uses: actions/setup-java@v4
+        uses: actions/setup-java@v5
         with:
           distribution: "temurin"
           java-version: "17"
@@ -78,7 +78,7 @@ jobs:
         working-directory: ${{ github.workspace }}
 
       - name: Setup .NET
-        uses: actions/setup-dotnet@v4
+        uses: actions/setup-dotnet@v5
         env:
           PROCESSOR_ARCHITECTURE: x64
         with:
@@ -95,7 +95,7 @@ jobs:
         shell: cmd
         working-directory: ${{ github.workspace }}
 
-      - uses: actions/cache@v4
+      - uses: actions/cache@v5
         id: onnx-node-tests-cache
         with:
           path: ${{ github.workspace }}/js/test/
@@ -155,7 +155,7 @@ jobs:
     timeout-minutes: 300
     steps:
       - name: Checkout
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           fetch-depth: 0
           submodules: none
@@ -208,7 +208,7 @@ jobs:
       ONNXRUNTIME_TEST_GPU_DEVICE_ID: "0"
     steps:
       - name: Checkout
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
         with:
           fetch-depth: 0
           submodules: none
@@ -230,12 +230,12 @@ jobs:
         working-directory: ${{ github.workspace }}
 
       - name: Setup Node.js
-        uses: actions/setup-node@v4
+        uses: actions/setup-node@v6
         with:
           node-version: "20.x"
 
       - name: Setup Java
-        uses: actions/setup-java@v4
+        uses: actions/setup-java@v5
         with:
           distribution: "temurin"
           java-version: "17"
@@ -252,7 +252,7 @@ jobs:
         working-directory: ${{ github.workspace }}
 
       - name: Setup .NET
-        uses: actions/setup-dotnet@v4
+        uses: actions/setup-dotnet@v5
         env:
           PROCESSOR_ARCHITECTURE: x64
         with:
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
index 91707c485d3c5..f36d3f1bd1315 100644
--- a/cmake/onnxruntime_providers_cuda.cmake
+++ b/cmake/onnxruntime_providers_cuda.cmake
@@ -182,8 +182,7 @@
 
     # Since CUDA 12.8, compiling diagnostics become stricter
     if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
-      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--relocatable-device-code=true>")
-      set_target_properties(${target} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--static-global-template-stub=false>")
       if (MSVC)
         target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4505>")
       endif()
diff --git a/cmake/onnxruntime_providers_vitisai.cmake b/cmake/onnxruntime_providers_vitisai.cmake
index d40ae17e40545..d59c944c8926f 100644
--- a/cmake/onnxruntime_providers_vitisai.cmake
+++ b/cmake/onnxruntime_providers_vitisai.cmake
@@ -19,7 +19,16 @@
     "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc"
   )
   source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_vitisai_cc_srcs})
-  onnxruntime_add_shared_library(onnxruntime_providers_vitisai ${onnxruntime_providers_vitisai_cc_srcs})
+  set(onnxruntime_providers_vitisai_all_srcs ${onnxruntime_providers_vitisai_cc_srcs})
+  if(WIN32)
+    # Sets the DLL version info on Windows: https://learn.microsoft.com/en-us/windows/win32/menurc/versioninfo-resource
+    list(APPEND onnxruntime_providers_vitisai_all_srcs "${ONNXRUNTIME_ROOT}/core/providers/vitisai/onnxruntime_providers_vitisai.rc")
+  endif()
+  onnxruntime_add_shared_library(onnxruntime_providers_vitisai ${onnxruntime_providers_vitisai_all_srcs})
+  if(WIN32)
+    # FILE_NAME preprocessor definition is used in onnxruntime_providers_vitisai.rc
+    target_compile_definitions(onnxruntime_providers_vitisai PRIVATE FILE_NAME=\"onnxruntime_providers_vitisai.dll\")
+  endif()
   onnxruntime_add_include_to_target(onnxruntime_providers_vitisai ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} safeint_interface flatbuffers::flatbuffers  Boost::mp11)
   target_link_libraries(onnxruntime_providers_vitisai PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED} ${ABSEIL_LIBS})
   if(MSVC)
diff --git a/onnxruntime/core/platform/telemetry.cc b/onnxruntime/core/platform/telemetry.cc
index 6cbbdd4e0a7ef..59087ee725a18 100644
--- a/onnxruntime/core/platform/telemetry.cc
+++ b/onnxruntime/core/platform/telemetry.cc
@@ -91,7 +91,7 @@ void Telemetry::LogRuntimeError(uint32_t session_id, const common::Status& statu
 }
 
 void Telemetry::LogRuntimePerf(uint32_t session_id, uint32_t total_runs_since_last, int64_t total_run_duration_since_last,
-                               std::unordered_map<int64_t, long long> duration_per_batch_size) const {
+                               const std::unordered_map<int64_t, long long>& duration_per_batch_size) const {
   ORT_UNUSED_PARAMETER(session_id);
   ORT_UNUSED_PARAMETER(total_runs_since_last);
   ORT_UNUSED_PARAMETER(total_run_duration_since_last);
@@ -127,4 +127,35 @@ void Telemetry::LogProviderOptions(const std::string& provider_id,
   ORT_UNUSED_PARAMETER(captureState);
 }
 
+void Telemetry::LogModelLoadStart(uint32_t session_id) const {
+  ORT_UNUSED_PARAMETER(session_id);
+}
+
+void Telemetry::LogModelLoadEnd(uint32_t session_id, const common::Status& status) const {
+  ORT_UNUSED_PARAMETER(session_id);
+  ORT_UNUSED_PARAMETER(status);
+}
+
+void Telemetry::LogSessionCreationEnd(uint32_t session_id,
+                                      const common::Status& status) const {
+  ORT_UNUSED_PARAMETER(session_id);
+  ORT_UNUSED_PARAMETER(status);
+}
+
+void Telemetry::LogRegisterEpLibraryWithLibPath(const std::string& registration_name,
+                                                const std::string& lib_path) const {
+  ORT_UNUSED_PARAMETER(registration_name);
+  ORT_UNUSED_PARAMETER(lib_path);
+}
+
+void Telemetry::LogRegisterEpLibraryStart(const std::string& registration_name) const {
+  ORT_UNUSED_PARAMETER(registration_name);
+}
+
+void Telemetry::LogRegisterEpLibraryEnd(const std::string& registration_name,
+                                        const common::Status& status) const {
+  ORT_UNUSED_PARAMETER(registration_name);
+  ORT_UNUSED_PARAMETER(status);
+}
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/platform/telemetry.h b/onnxruntime/core/platform/telemetry.h
index b60345e1b8a80..8bc92b0490ece 100644
--- a/onnxruntime/core/platform/telemetry.h
+++ b/onnxruntime/core/platform/telemetry.h
@@ -70,7 +70,7 @@ class Telemetry {
                                const char* function, uint32_t line) const;
 
   virtual void LogRuntimePerf(uint32_t session_id, uint32_t total_runs_since_last, int64_t total_run_duration_since_last,
-                              std::unordered_map<int64_t, long long> duration_per_batch_size) const;
+                              const std::unordered_map<int64_t, long long>& duration_per_batch_size) const;
 
   virtual void LogExecutionProviderEvent(LUID* adapterLuid) const;
 
@@ -86,6 +86,21 @@ class Telemetry {
                                   const std::string& provider_options_string,
                                   bool captureState) const;
 
+  virtual void LogModelLoadStart(uint32_t session_id) const;
+
+  virtual void LogModelLoadEnd(uint32_t session_id, const common::Status& status) const;
+
+  virtual void LogSessionCreationEnd(uint32_t session_id,
+                                     const common::Status& status) const;
+
+  virtual void LogRegisterEpLibraryWithLibPath(const std::string& registration_name,
+                                               const std::string& lib_path) const;
+
+  virtual void LogRegisterEpLibraryStart(const std::string& registration_name) const;
+
+  virtual void LogRegisterEpLibraryEnd(const std::string& registration_name,
+                                       const common::Status& status) const;
+
  private:
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Telemetry);
 };
diff --git a/onnxruntime/core/platform/windows/telemetry.cc b/onnxruntime/core/platform/windows/telemetry.cc
index 029b17eb3502e..3ea94ac3a8492 100644
--- a/onnxruntime/core/platform/windows/telemetry.cc
+++ b/onnxruntime/core/platform/windows/telemetry.cc
@@ -465,7 +465,7 @@ void WindowsTelemetry::LogRuntimeError(uint32_t session_id, const common::Status
 }
 
 void WindowsTelemetry::LogRuntimePerf(uint32_t session_id, uint32_t total_runs_since_last, int64_t total_run_duration_since_last,
-                                      std::unordered_map<int64_t, long long> duration_per_batch_size) const {
+                                      const std::unordered_map<int64_t, long long>& duration_per_batch_size) const {
   if (global_register_count_ == 0 || enabled_ == false)
     return;
 
@@ -605,4 +605,116 @@ void WindowsTelemetry::LogProviderOptions(const std::string& provider_id, const
   }
 }
 
+void WindowsTelemetry::LogModelLoadStart(uint32_t session_id) const {
+  if (global_register_count_ == 0 || enabled_ == false)
+    return;  // Emit nothing when global_register_count_ is zero or enabled_ is false.
+
+  TraceLoggingWrite(telemetry_provider_handle,  // Writes one event; its name is the next argument.
+                    "ModelLoadStart",
+                    TraceLoggingBool(true, "UTCReplace_AppSessionGuid"),
+                    TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage),
+                    TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES),
+                    TraceLoggingLevel(WINEVENT_LEVEL_INFO),
+                    // Telemetry info
+                    TraceLoggingUInt8(0, "schemaVersion"),
+                    TraceLoggingUInt32(session_id, "sessionId"),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
+}
+
+void WindowsTelemetry::LogModelLoadEnd(uint32_t session_id, const common::Status& status) const {
+  if (global_register_count_ == 0 || enabled_ == false)
+    return;  // Emit nothing when global_register_count_ is zero or enabled_ is false.
+
+  TraceLoggingWrite(telemetry_provider_handle,  // Writes one event; its name is the next argument.
+                    "ModelLoadEnd",
+                    TraceLoggingBool(true, "UTCReplace_AppSessionGuid"),
+                    TelemetryPrivacyDataTag(PDT_ProductAndServicePerformance),
+                    TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES),
+                    TraceLoggingLevel(WINEVENT_LEVEL_INFO),
+                    // Telemetry info
+                    TraceLoggingUInt8(0, "schemaVersion"),
+                    TraceLoggingUInt32(session_id, "sessionId"),
+                    TraceLoggingBool(status.IsOK(), "isSuccess"),  // Error fields follow; errorMessage is "" when OK.
+                    TraceLoggingUInt32(status.Code(), "errorCode"),
+                    TraceLoggingUInt32(status.Category(), "errorCategory"),
+                    TraceLoggingString(status.IsOK() ? "" : status.ErrorMessage().c_str(), "errorMessage"),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
+}
+
+void WindowsTelemetry::LogSessionCreationEnd(uint32_t session_id,
+                                             const common::Status& status) const {
+  if (global_register_count_ == 0 || enabled_ == false)
+    return;  // Emit nothing when global_register_count_ is zero or enabled_ is false.
+
+  TraceLoggingWrite(telemetry_provider_handle,  // Writes one event; its name is the next argument.
+                    "SessionCreationEnd",
+                    TraceLoggingBool(true, "UTCReplace_AppSessionGuid"),
+                    TelemetryPrivacyDataTag(PDT_ProductAndServicePerformance),
+                    TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES),
+                    TraceLoggingLevel(WINEVENT_LEVEL_INFO),
+                    // Telemetry info
+                    TraceLoggingUInt8(0, "schemaVersion"),
+                    TraceLoggingUInt32(session_id, "sessionId"),
+                    TraceLoggingBool(status.IsOK(), "isSuccess"),  // Error fields follow; errorMessage is "" when OK.
+                    TraceLoggingUInt32(status.Code(), "errorCode"),
+                    TraceLoggingUInt32(status.Category(), "errorCategory"),
+                    TraceLoggingString(status.IsOK() ? "" : status.ErrorMessage().c_str(), "errorMessage"),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
+}
+
+void WindowsTelemetry::LogRegisterEpLibraryWithLibPath(const std::string& registration_name,
+                                                       const std::string& lib_path) const {
+  if (global_register_count_ == 0 || enabled_ == false)
+    return;  // Emit nothing when global_register_count_ is zero or enabled_ is false.
+
+  TraceLoggingWrite(telemetry_provider_handle,  // Writes one event; its name is the next argument.
+                    "RegisterEpLibraryWithLibPath",
+                    TraceLoggingBool(true, "UTCReplace_AppSessionGuid"),
+                    TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage),
+                    TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES),
+                    TraceLoggingLevel(WINEVENT_LEVEL_INFO),
+                    // Telemetry info. NOTE(review): libPath is logged verbatim; confirm user paths are acceptable.
+                    TraceLoggingUInt8(0, "schemaVersion"),
+                    TraceLoggingString(registration_name.c_str(), "registrationName"),
+                    TraceLoggingString(lib_path.c_str(), "libPath"),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
+}
+
+void WindowsTelemetry::LogRegisterEpLibraryStart(const std::string& registration_name) const {
+  if (global_register_count_ == 0 || enabled_ == false)
+    return;  // Emit nothing when global_register_count_ is zero or enabled_ is false.
+
+  TraceLoggingWrite(telemetry_provider_handle,  // Writes one event; its name is the next argument.
+                    "RegisterEpLibraryStart",
+                    TraceLoggingBool(true, "UTCReplace_AppSessionGuid"),
+                    TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage),
+                    TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES),
+                    TraceLoggingLevel(WINEVENT_LEVEL_INFO),
+                    // Telemetry info
+                    TraceLoggingUInt8(0, "schemaVersion"),
+                    TraceLoggingString(registration_name.c_str(), "registrationName"),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
+}
+
+void WindowsTelemetry::LogRegisterEpLibraryEnd(const std::string& registration_name,
+                                               const common::Status& status) const {
+  if (global_register_count_ == 0 || enabled_ == false)
+    return;  // Emit nothing when global_register_count_ is zero or enabled_ is false.
+
+  TraceLoggingWrite(telemetry_provider_handle,  // Writes one event; its name is the next argument.
+                    "RegisterEpLibraryEnd",
+                    TraceLoggingBool(true, "UTCReplace_AppSessionGuid"),
+                    TelemetryPrivacyDataTag(PDT_ProductAndServicePerformance),
+                    TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES),
+                    TraceLoggingLevel(WINEVENT_LEVEL_INFO),
+                    // Telemetry info
+                    TraceLoggingUInt8(0, "schemaVersion"),
+                    TraceLoggingString(registration_name.c_str(), "registrationName"),
+                    TraceLoggingBool(status.IsOK(), "isSuccess"),  // Error fields follow; errorMessage is "" when OK.
+                    TraceLoggingUInt32(status.Code(), "errorCode"),
+                    TraceLoggingUInt32(status.Category(), "errorCategory"),
+                    TraceLoggingString(status.IsOK() ? "" : status.ErrorMessage().c_str(), "errorMessage"),
+                    TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName"));
+}
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/platform/windows/telemetry.h b/onnxruntime/core/platform/windows/telemetry.h
index 261d14a7fed8c..30621060ae91a 100644
--- a/onnxruntime/core/platform/windows/telemetry.h
+++ b/onnxruntime/core/platform/windows/telemetry.h
@@ -63,7 +63,7 @@ class WindowsTelemetry : public Telemetry {
                        const char* function, uint32_t line) const override;
 
   void LogRuntimePerf(uint32_t session_id, uint32_t total_runs_since_last, int64_t total_run_duration_since_last,
-                      std::unordered_map<int64_t, long long> duration_per_batch_size) const override;
+                      const std::unordered_map<int64_t, long long>& duration_per_batch_size) const override;
 
   void LogExecutionProviderEvent(LUID* adapterLuid) const override;
 
@@ -79,6 +79,21 @@ class WindowsTelemetry : public Telemetry {
                           const std::string& provider_options_string,
                           bool captureState) const override;
 
+  void LogModelLoadStart(uint32_t session_id) const override;
+
+  void LogModelLoadEnd(uint32_t session_id, const common::Status& status) const override;
+
+  void LogSessionCreationEnd(uint32_t session_id,
+                             const common::Status& status) const override;
+
+  void LogRegisterEpLibraryWithLibPath(const std::string& registration_name,
+                                       const std::string& lib_path) const override;
+
+  void LogRegisterEpLibraryStart(const std::string& registration_name) const override;
+
+  void LogRegisterEpLibraryEnd(const std::string& registration_name,
+                               const common::Status& status) const override;
+
   using EtwInternalCallback = std::function<void(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level,
                                                  ULONGLONG MatchAnyKeyword, ULONGLONG MatchAllKeyword,
                                                  PEVENT_FILTER_DESCRIPTOR FilterData, PVOID CallbackContext)>;
diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
index 4d8a2bc1106ad..fc3ecf5465e6d 100644
--- a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
+++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
@@ -626,6 +626,20 @@ Status Mul<T>::Compute(OpKernelContext* context) const {
 
 template <typename T>
 Status Div<T>::Compute(OpKernelContext* context) const {
+  // Integer division by zero is undefined behavior in C++ and causes a hardware exception.
+  // Check for zeros in the divisor before performing the division.
+  // Skip the check if the divisor was already validated as a constant initializer during kernel creation.
+  if constexpr (std::is_integral<T>::value) {
+    if (!divisor_is_validated_constant_) {
+      const Tensor& B = *context->Input<Tensor>(1);
+      const T* b_data = B.Data<T>();
+      const int64_t b_size = B.Shape().Size();
+      for (int64_t i = 0; i < b_size; ++i) {
+        ORT_RETURN_IF(b_data[i] == T{0}, "Integer division by zero");
+      }
+    }
+  }
+
   ProcessBroadcastSpanFuncs funcs{
       [](BroadcastHelper& per_iter_bh) {
         per_iter_bh.OutputEigen<T>() = per_iter_bh.ScalarInput0<T>() / per_iter_bh.EigenInput1<T>().array();
diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.h b/onnxruntime/core/providers/cpu/math/element_wise_ops.h
index 66060344c9874..77ef3033a0975 100644
--- a/onnxruntime/core/providers/cpu/math/element_wise_ops.h
+++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.h
@@ -243,9 +243,25 @@ template <typename T>
 class Div final : public OpKernel {
  public:
   Div(const OpKernelInfo& info) : OpKernel(info) {
+    // If the divisor is a constant initializer, validate for integer division by zero once
+    // during kernel creation instead of on every Compute call.
+    if constexpr (std::is_integral<T>::value) {
+      const Tensor* constant_divisor = nullptr;
+      if (info.TryGetConstantInput(1, &constant_divisor)) {  // Input index 1 is the divisor.
+        const T* b_data = constant_divisor->Data<T>();
+        const int64_t b_size = constant_divisor->Shape().Size();
+        for (int64_t i = 0; i < b_size; ++i) {
+          ORT_ENFORCE(b_data[i] != T{0}, "Integer division by zero");
+        }
+        divisor_is_validated_constant_ = true;  // Reached only when no zero element was found above.
+      }
+    }
   }
 
   Status Compute(OpKernelContext* context) const override;
+
+ private:
+  bool divisor_is_validated_constant_{false};  // Set by the constructor after a constant divisor passed the zero scan.
 };
 
 class Pow final : public OpKernel {
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp
index 353f698bb6f2c..076027dd3672f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp
@@ -504,7 +504,7 @@ HRESULT STDMETHODCALLTYPE AbiCustomRegistry::RegisterOperatorKernel(
                     InferAndVerifyOutputSizes(node, &defaultAttributesCapture, shapeInferrerCapture.Get(), constantCpuInputCapture, constantInputGetter, inputShapesOverrides, *outputShapes);
 
                     // Create the kernel while allowing input shape and output shape queries according to options
-                    ComPtr<DmlGraphOpKernelInfoWrapper> kernelInfoWrapper = wil::MakeOrThrow<DmlGraphOpKernelInfoWrapper>(
+                    ComPtr<DmlGraphOpKernelInfoWrapper> kernelInfoWrapper = Dml::SafeMakeOrThrow<DmlGraphOpKernelInfoWrapper>(
                             &protoHelper,
                             executionHandle,
                             true,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
index 18b4b4593f537..ed99ac0fc7fc2 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp
@@ -132,7 +132,7 @@ namespace Dml
         assert(resourceWrapper->GetD3D12Resource()->GetDesc().Width == bucketSize);
         assert(resourceWrapper != nullptr);
 
-        ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(
+        ComPtr<AllocationInfo> allocInfo = Dml::SafeMakeOrThrow<AllocationInfo>(
             this,
             ++m_currentAllocationId,
             resourceId,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp
index 54393e9bf1539..2934fd0c11516 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp
@@ -22,7 +22,7 @@ namespace Dml
         ));
 
         ComPtr<DmlResourceWrapper> resourceWrapper;
-        wil::MakeOrThrow<DmlCommittedResourceWrapper>(std::move(resource)).As(&resourceWrapper);
+        Dml::SafeMakeOrThrow<DmlCommittedResourceWrapper>(std::move(resource)).As(&resourceWrapper);
         return resourceWrapper;
     }
 }
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalBufferAllocator.h
index c99d686349e94..158c102d69ee7 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalBufferAllocator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalBufferAllocator.h
@@ -48,9 +48,9 @@ namespace Dml
             constexpr uint64_t pooledResourceId = 0; // Not a pooled resource
 
             Microsoft::WRL::ComPtr<DmlResourceWrapper> resourceWrapper;
-            wil::MakeOrThrow<DmlCommittedResourceWrapper>(std::move(resource)).As(&resourceWrapper);
+            Dml::SafeMakeOrThrow<DmlCommittedResourceWrapper>(std::move(resource)).As(&resourceWrapper);
 
-            Microsoft::WRL::ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(
+            Microsoft::WRL::ComPtr<AllocationInfo> allocInfo = Dml::SafeMakeOrThrow<AllocationInfo>(
                 nullptr,
                 0,
                 pooledResourceId,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
index 6bd7de0fba5cb..4ddf8b8640376 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
@@ -232,8 +232,6 @@ namespace DmlGraphFusionHelper
                     }
                 }
 
-                // Tensor sizes in DML must be a multiple of 4 bytes large.
-                tensorByteSize = AlignToPow2<size_t>(tensorByteSize, 4);
                 if(graphSerializationEnabled)
                 {
                     WriteToFile(modelName, ConvertToWString(iter->first) + L".bin", reinterpret_cast<uint8_t*>(tensorPtr), tensorByteSize);
@@ -264,9 +262,10 @@ namespace DmlGraphFusionHelper
                         initializeInputBuffer = CreateCpuResource(providerImpl, tensorPtr, tensorByteSize);
                     }
 
-                    // Set the binding for operator initialization to the buffer
+                    // Set the binding for operator initialization to the buffer.
+                    // DML requires buffer binding sizes to be a multiple of 4 bytes.
                     initInputBindings[i].Buffer = initializeInputBuffer.Get();
-                    initInputBindings[i].SizeInBytes = tensorByteSize;
+                    initInputBindings[i].SizeInBytes = AlignToPow2<size_t>(tensorByteSize, 4);
                     initializeResourceRefs.push_back(std::move(initializeInputBuffer));
                 }
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index 6d8d5453b9fc0..cd7dfd46485af 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -55,7 +55,7 @@ namespace Dml
         _Out_ std::shared_ptr<onnxruntime::KernelRegistry>* registry,
         _Out_ std::shared_ptr<const InternalRegistrationInfoMap>* internalRegInfoMap)
     {
-        ComPtr<AbiCustomRegistry> abiRegistry = wil::MakeOrThrow<AbiCustomRegistry>();
+        ComPtr<AbiCustomRegistry> abiRegistry = Dml::SafeMakeOrThrow<AbiCustomRegistry>();
         Dml::RegisterDmlOperators(abiRegistry.Get());
 
         assert(abiRegistry->GetRegistries().size() == 1);
@@ -88,7 +88,7 @@ namespace Dml
         ComPtr<ID3D12Device> device;
         GRAPHICS_THROW_IF_FAILED(dmlDevice->GetParentDevice(IID_GRAPHICS_PPV_ARGS(device.GetAddressOf())));
 
-        m_impl = wil::MakeOrThrow<ExecutionProviderImpl>(dmlDevice, device.Get(), executionContext, enableMetacommands,
+        m_impl = Dml::SafeMakeOrThrow<ExecutionProviderImpl>(dmlDevice, device.Get(), executionContext, enableMetacommands,
                                                          enableGraphCapture, enableSyncSpinning, disableMemoryArena);
     }
 
@@ -1298,9 +1298,9 @@ namespace Dml
         uint64_t pooledResourceId = 0; // Not a pooled resource
 
         ComPtr<DmlResourceWrapper> resourceWrapper;
-        wil::MakeOrThrow<DmlCommittedResourceWrapper>(pResource).As(&resourceWrapper);
+        Dml::SafeMakeOrThrow<DmlCommittedResourceWrapper>(pResource).As(&resourceWrapper);
 
-        ComPtr<AllocationInfo> allocInfo = wil::MakeOrThrow<AllocationInfo>(nullptr, 0, pooledResourceId, resourceWrapper.Get(), (size_t)pResource->GetDesc().Width);
+        ComPtr<AllocationInfo> allocInfo = Dml::SafeMakeOrThrow<AllocationInfo>(nullptr, 0, pooledResourceId, resourceWrapper.Get(), (size_t)pResource->GetDesc().Width);
         return allocInfo.Detach();
     }
     void FreeGPUAllocation(void* ptr)
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp
index 22de743f6e718..51c25d6d40c5b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp
@@ -291,7 +291,7 @@ namespace Dml::GraphDescBuilder
             if (iter != isInitializerTransferable.end())
             {
                 // Using const_cast here is simpler than making surrounding code const correct.
-                tensorWrapper = wil::MakeOrThrow<OnnxTensorWrapper>(const_cast<ONNX_NAMESPACE::TensorProto*>(iter->second.first), modelPath);
+                tensorWrapper = Dml::SafeMakeOrThrow<OnnxTensorWrapper>(const_cast<ONNX_NAMESPACE::TensorProto*>(iter->second.first), modelPath);
             }
             return tensorWrapper;
         };
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index fe52f27b35bb8..13ce9afa99b1e 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -868,7 +868,7 @@ namespace Windows::AI::MachineLearning::Adapter
               const onnx::TensorProto* tensorProto = &attributeProto->t();
 
               // An empty path is used as external weights are not currently supported in this case
-              Microsoft::WRL::ComPtr<IMLOperatorTensor> tensorWrapper = wil::MakeOrThrow<OnnxTensorWrapper>(const_cast<onnx::TensorProto*>(tensorProto), std::filesystem::path());
+              Microsoft::WRL::ComPtr<IMLOperatorTensor> tensorWrapper = Dml::SafeMakeOrThrow<OnnxTensorWrapper>(const_cast<onnx::TensorProto*>(tensorProto), std::filesystem::path());
               *tensor = tensorWrapper.Detach();
               return S_OK;
             }
@@ -1977,7 +1977,7 @@ namespace Windows::AI::MachineLearning::Adapter
                 auto inputTensor = m_impl->Input<onnxruntime::Tensor>(gsl::narrow_cast<int>(inputIndex));
                 if (inputTensor != nullptr)
                 {
-                    ComPtr<TensorWrapper> tensorWrapper = wil::MakeOrThrow<TensorWrapper>(
+                    ComPtr<TensorWrapper> tensorWrapper = Dml::SafeMakeOrThrow<TensorWrapper>(
                         const_cast<onnxruntime::Tensor*>(inputTensor),
                         IsAllocationInterface(inputTensor->Location()),
                         m_winmlProvider.Get(),
@@ -2019,7 +2019,7 @@ namespace Windows::AI::MachineLearning::Adapter
                 auto elemTensor = const_cast<onnxruntime::Tensor*>(&inputTensorSeq->Get(sequenceIndex));
                 if (elemTensor != nullptr)
                 {
-                    ComPtr<TensorWrapper> tensorWrapper = wil::MakeOrThrow<TensorWrapper>(
+                    ComPtr<TensorWrapper> tensorWrapper = Dml::SafeMakeOrThrow<TensorWrapper>(
                         elemTensor,
                         IsAllocationInterface(elemTensor->Location()),
                         m_winmlProvider.Get(),
@@ -2119,7 +2119,7 @@ namespace Windows::AI::MachineLearning::Adapter
                 auto elemTensor = const_cast<onnxruntime::Tensor*>(&outputTensorSeq->Get(sequenceIndex));
                 if (elemTensor != nullptr)
                 {
-                    ComPtr<TensorWrapper> tensorWrapper = wil::MakeOrThrow<TensorWrapper>(
+                    ComPtr<TensorWrapper> tensorWrapper = Dml::SafeMakeOrThrow<TensorWrapper>(
                         elemTensor,
                         IsAllocationInterface(elemTensor->Location()),
                         m_winmlProvider.Get(),
@@ -2212,7 +2212,7 @@ namespace Windows::AI::MachineLearning::Adapter
                 auto outputTensor = m_impl->Output(outputIndex, shape);
                 if (outputTensor)
                 {
-                    ComPtr<TensorWrapper> tensorWrapper = wil::MakeOrThrow<TensorWrapper>(
+                    ComPtr<TensorWrapper> tensorWrapper = Dml::SafeMakeOrThrow<TensorWrapper>(
                         const_cast<onnxruntime::Tensor*>(outputTensor),
                         IsAllocationInterface(outputTensor->Location()),
                         m_winmlProvider.Get(),
@@ -2377,7 +2377,7 @@ namespace Windows::AI::MachineLearning::Adapter
                 const onnxruntime::Tensor* tensor = nullptr;
                 if (kerneInfo.TryGetConstantInput(index, &tensor))
                 {
-                    tensorWrapper = wil::MakeOrThrow<TensorWrapper>(
+                    tensorWrapper = Dml::SafeMakeOrThrow<TensorWrapper>(
                         const_cast<onnxruntime::Tensor*>(tensor),
                         IsAllocationInterface(tensor->Location()),
                         winmlProviderCapture.Get(),
@@ -2396,7 +2396,7 @@ namespace Windows::AI::MachineLearning::Adapter
             }
 
             // Create the kernel while allowing input shape and output shape queries according to options
-            ComPtr<OpKernelInfoWrapper> kernelInfoWrapper = wil::MakeOrThrow<OpKernelInfoWrapper>(
+            ComPtr<OpKernelInfoWrapper> kernelInfoWrapper = Dml::SafeMakeOrThrow<OpKernelInfoWrapper>(
                 &kerneInfo,
                 m_abiExecutionObject.Get(),
                 nullptr,
@@ -2443,7 +2443,7 @@ namespace Windows::AI::MachineLearning::Adapter
                     const auto* tensor = context->Input<onnxruntime::Tensor>(gsl::narrow_cast<int>(index));
                     if (tensor != nullptr)
                     {
-                        tensorWrapper = wil::MakeOrThrow<TensorWrapper>(
+                        tensorWrapper = Dml::SafeMakeOrThrow<TensorWrapper>(
                             const_cast<onnxruntime::Tensor*>(tensor),
                             IsAllocationInterface(tensor->Location()),
                             winmlProviderCapture.Get(),
@@ -2464,7 +2464,7 @@ namespace Windows::AI::MachineLearning::Adapter
                         for (uint32_t sequenceIndex = 0; sequenceIndex < tensorSequence->Size(); ++sequenceIndex)
                         {
                             auto& tensor = tensorSequence->Get(sequenceIndex);
-                            auto tensorWrapper = wil::MakeOrThrow<TensorWrapper>(
+                            auto tensorWrapper = Dml::SafeMakeOrThrow<TensorWrapper>(
                                 const_cast<onnxruntime::Tensor*>(&tensor),
                                 IsAllocationInterface(tensor.Location()),
                                 winmlProviderCapture.Get(),
@@ -2491,7 +2491,7 @@ namespace Windows::AI::MachineLearning::Adapter
             }
 
             // Create the kernel while allowing input shape and output shape queries according to options
-            ComPtr<OpKernelInfoWrapper> kernelInfoWrapper = wil::MakeOrThrow<OpKernelInfoWrapper>(
+            ComPtr<OpKernelInfoWrapper> kernelInfoWrapper = Dml::SafeMakeOrThrow<OpKernelInfoWrapper>(
                 &Info(),
                 m_abiExecutionObject.Get(),
                 &inputShapes,
@@ -2569,7 +2569,7 @@ namespace Windows::AI::MachineLearning::Adapter
                 EdgeShapes localInferredOutputShapes;
                 ComPtr<IMLOperatorKernel> localKernel = inferShapesAndCreateKernel(local_input_shapes, localInferredOutputShapes);
 
-                ComPtr<OpKernelContextWrapper> kernelContextWrapper = wil::MakeOrThrow<OpKernelContextWrapper>(
+                ComPtr<OpKernelContextWrapper> kernelContextWrapper = Dml::SafeMakeOrThrow<OpKernelContextWrapper>(
                     context,
                     Info().GetExecutionProvider(),
                     m_internalOperator,
@@ -2588,7 +2588,7 @@ namespace Windows::AI::MachineLearning::Adapter
             }
         }
 
-        ComPtr<OpKernelContextWrapper> kernelContextWrapper = wil::MakeOrThrow<OpKernelContextWrapper>(
+        ComPtr<OpKernelContextWrapper> kernelContextWrapper = Dml::SafeMakeOrThrow<OpKernelContextWrapper>(
             context,
             Info().GetExecutionProvider(),
             m_internalOperator,
@@ -2811,7 +2811,7 @@ namespace Windows::AI::MachineLearning::Adapter
         onnxruntime::ProtoHelperNodeContext protoContext(node);
         onnxruntime::OpNodeProtoHelper<onnxruntime::ProtoHelperNodeContext> info(&protoContext);
 
-        ComPtr<MLKernelInferenceContext> inferenceContext = wil::MakeOrThrow<MLKernelInferenceContext>(&info, inputShapes, outputShapes, defaultAttributes, requiredConstantCpuInputs, constantInputGetter);
+        ComPtr<MLKernelInferenceContext> inferenceContext = Dml::SafeMakeOrThrow<MLKernelInferenceContext>(&info, inputShapes, outputShapes, defaultAttributes, requiredConstantCpuInputs, constantInputGetter);
 
         outputShapes.Reset(info.GetOutputCount());
 
@@ -2865,13 +2865,13 @@ namespace Windows::AI::MachineLearning::Adapter
             [ctx](uint32_t index)
             {
                 // An empty path is used as external weights are not currently supported in this case
-                Microsoft::WRL::ComPtr<IMLOperatorTensor> tensorWrapper = wil::MakeOrThrow<OnnxTensorWrapper>(
+                Microsoft::WRL::ComPtr<IMLOperatorTensor> tensorWrapper = Dml::SafeMakeOrThrow<OnnxTensorWrapper>(
                     const_cast<onnx::TensorProto*>(ctx->getInputData(index)), std::filesystem::path());
                 return tensorWrapper;
             }
         );
 
-        return wil::MakeOrThrow<MLSchemaInferenceContext>(info, ctx, requiredConstantCpuInputs, mlOperatorTensorGetter);
+        return Dml::SafeMakeOrThrow<MLSchemaInferenceContext>(info, ctx, requiredConstantCpuInputs, mlOperatorTensorGetter);
     }
 
     MLSchemaInferenceContext::MLSchemaInferenceContext(
@@ -2952,7 +2952,7 @@ namespace Windows::AI::MachineLearning::Adapter
         const AttributeMap* defaultAttributes)
     {
         MLOperatorTensorGetter mLOperatorTensorGetter = MLOperatorTensorGetter();
-        return wil::MakeOrThrow<MLSupportQueryContext>(info, defaultAttributes, mLOperatorTensorGetter);
+        return Dml::SafeMakeOrThrow<MLSupportQueryContext>(info, defaultAttributes, mLOperatorTensorGetter);
     }
 
     MLSupportQueryContext::MLSupportQueryContext(
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
index 1de88a61a0d77..25210c146a6b6 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h
@@ -1097,7 +1097,7 @@ class GpuDFTOperatorFactory : public WRL::Base<IMLOperatorKernelFactory>
                 version = 20;
             }
 
-            auto dftOperator = wil::MakeOrThrow<GpuDFTOperator>(context, version);
+            auto dftOperator = Dml::SafeMakeOrThrow<GpuDFTOperator>(context, version);
             dftOperator.CopyTo(kernel);
             return S_OK;
         }
@@ -1177,8 +1177,8 @@ class GpuDFTOperatorFactory : public WRL::Base<IMLOperatorKernelFactory>
         kernelDescription.options = MLOperatorKernelOptions::None;
         kernelDescription.executionOptions = 0;
 
-        auto shareInferrer = wil::MakeOrThrow<DFTShapeInferrer>();
-        auto factory = wil::MakeOrThrow<GpuDFTOperatorFactory>();
+        auto shareInferrer = Dml::SafeMakeOrThrow<DFTShapeInferrer>();
+        auto factory = Dml::SafeMakeOrThrow<GpuDFTOperatorFactory>();
 
         std::array<uint32_t, 2> requiredConstantCpuInputs = { 1, 2 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
index 5ba936ddf3976..6d7a089103c9b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h
@@ -747,7 +747,7 @@ class DmlGridSampleOperatorFactory : public WRL::Base<IMLOperatorKernelFactory>
     {
         try
         {
-            auto dftOperator = wil::MakeOrThrow<DmlGridSampleOperator>(context);
+            auto dftOperator = Dml::SafeMakeOrThrow<DmlGridSampleOperator>(context);
             dftOperator.CopyTo(kernel);
             return S_OK;
         }
@@ -832,8 +832,8 @@ class DmlGridSampleOperatorFactory : public WRL::Base<IMLOperatorKernelFactory>
         kernelDescription.options = MLOperatorKernelOptions::None;
         kernelDescription.executionOptions = 0;
 
-        auto shareInferrer = wil::MakeOrThrow<GridSampleShapeInferrer>();
-        auto factory = wil::MakeOrThrow<DmlGridSampleOperatorFactory>();
+        auto shareInferrer = Dml::SafeMakeOrThrow<GridSampleShapeInferrer>();
+        auto factory = Dml::SafeMakeOrThrow<DmlGridSampleOperatorFactory>();
 
         ComPtr<IMLOperatorRegistryPrivate> registryPrivate;
         ORT_THROW_IF_FAILED(registry->QueryInterface(IID_PPV_ARGS(&registryPrivate)));
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp
index 287f1e5b6dfe7..2ee85b01a9a2e 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp
@@ -907,4 +907,71 @@ namespace Dml
         bufferTensorDesc->TotalTensorSizeInBytes = (elementSize + 3) & ~3;
     }
 
+    // Rebuilds the tensor description of every valid input after input 0 (the scale tensor and
+    // the optional zero point tensor) so that it broadcasts to outputShape. A 1D input that differs
+    // from outputShape is projected along the axis; other shapes use standard elementwise alignment.
+    void DmlOperator::BroadcastQuantizationParameters(
+        const MLOperatorKernelCreationContext& kernelInfo,
+        gsl::span<const uint32_t> outputShape
+        )
+    {
+        const uint32_t outputShapeDimCount = gsl::narrow_cast<uint32_t>(outputShape.size());
+
+        // The axis is resolved without validation: it only matters when a 1D parameter has to be
+        // projected onto the output shape, and ONNX opset 13 reports a default axis of 1 even when
+        // the model omits the attribute, so validating eagerly would fail for such models.
+        // The range check happens below, right before the axis is actually used.
+        uint32_t axis = 0;
+        if (kernelInfo.HasAttribute(AttrName::Axis, MLOperatorAttributeType::Int))
+        {
+            const int64_t axisAttribute = kernelInfo.GetAttribute<int64_t>(AttrName::Axis);
+            axis = Dml::HandleNegativeAxis(gsl::narrow_cast<int32_t>(axisAttribute), outputShapeDimCount, /*validateAxis*/ false);
+        }
+
+        // Input 0 is skipped; only the inputs after it are re-described.
+        const uint32_t tensorDescCount = gsl::narrow_cast<uint32_t>(m_inputTensorDescs.size());
+        for (uint32_t inputIndex = 1; inputIndex < tensorDescCount; ++inputIndex)
+        {
+            // Absent optional inputs keep whatever description they already have.
+            if (kernelInfo.IsInputValid(inputIndex))
+            {
+                auto edgeDesc = kernelInfo.GetInputEdgeDescription(inputIndex);
+                assert(edgeDesc.edgeType == MLOperatorEdgeType::Tensor);
+
+                std::vector<uint32_t> inputTensorShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(inputIndex);
+
+                // A 1D parameter that is not already identical to the output shape is treated as a
+                // per-axis vector: it is padded with ones on both sides so that its single dimension
+                // lines up with the axis, e.g. input[2,3] with axis=0 and scale[2] becomes scale[2,1].
+                const bool alreadyOutputShape =
+                    outputShape.size() == 1 && inputTensorShape.size() == 1 && inputTensorShape[0] == outputShape[0];
+                if (inputTensorShape.size() == 1 && !alreadyOutputShape)
+                {
+                    ML_CHECK_VALID_ARGUMENT(axis < outputShapeDimCount);
+                    uint32_t broadcastAxisLength = outputShape[axis];
+                    ML_CHECK_VALID_ARGUMENT(
+                        (inputTensorShape[0] == broadcastAxisLength) ||
+                        // Treat as broadcast dimension to match CPU behavior.
+                        (inputTensorShape[0] == 1)
+                    );
+                    inputTensorShape.insert(inputTensorShape.begin(), axis, 1);
+                    inputTensorShape.insert(inputTensorShape.end(), outputShapeDimCount - 1 - axis, 1);
+                }
+                // Scalars and N-D shapes are passed through unchanged; the TensorDesc constructor
+                // applies broadcasting to them with standard elementwise alignment.
+
+                m_inputTensorDescs[inputIndex] = TensorDesc(
+                    edgeDesc.tensorDataType,
+                    outputShape,
+                    gsl::make_span(inputTensorShape),
+                    TensorAxis::DoNotCoerce,
+                    TensorAxis::W,
+                    TensorAxis::RightAligned,
+                    NchwDimensionCount, // minDimensionCount
+                    0 // guaranteedBaseOffsetAlignment
+                );
+            }
+        }
+    }
+
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h
index fa54d4b041b5f..002541e23c47c 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h
@@ -149,6 +149,15 @@ namespace Dml
             uint32_t minDimensionCount = NchwDimensionCount
             ) const;
 
+        // Replaces the tensor descs of all valid inputs after index 0 (scale and optional zero point)
+        // with descs that broadcast to outputShape. A 1D input that differs from outputShape is
+        // projected along the "axis" attribute (e.g. scale[6] with axis=0 on a 5D output becomes
+        // [6,1,1,1,1]); its length must equal the output's size on that axis, or be 1.
+        void BroadcastQuantizationParameters(
+            const MLOperatorKernelCreationContext& kernelInfo,
+            gsl::span<const uint32_t> outputShape
+            );
+
         static void TryConvertTensorToBroadcastScalar(
             const MLOperatorKernelCreationContext& kernelInfo,
             const DML_TENSOR_DESC* tensor,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorElementWise.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorElementWise.cpp
index d4d7ee1311874..b64a5265f56e3 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorElementWise.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorElementWise.cpp
@@ -542,64 +542,7 @@ class DmlOperatorElementwiseQLinear : public DmlOperator
         const DML_TENSOR_DATA_TYPE outputDataType = m_outputTensorDescs[0].GetDmlDataType();
         bool hasZeroPointTensor = kernelInfo.IsInputValid(2);
 
-        uint32_t axis = 0;
-
-        // If an axis was given explicitly passed (or the default value 1 is set from the schema),
-        // then other inputs are broadcasting to the shape of the input data tensor.
-        if (kernelInfo.HasAttribute(AttrName::Axis, MLOperatorAttributeType::Int))
-        {
-            // Avoid validating the axis until later because the axis parameter is ignorable unless
-            // broadcasting is actually needed. ONNX opset 13 returns a default value of 1 for the
-            // "axis" attribute even when the attribute doesn't actually exist in the model, which
-            // would cause a validation failure here.
-            const int32_t signedAxis = gsl::narrow_cast<int32_t>(kernelInfo.GetAttribute<int64_t>(AttrName::Axis));
-            axis = Dml::HandleNegativeAxis(signedAxis, outputShapeDimCount, /*validateAxis*/ false);
-        }
-
-        // Explicitly reshape each of the inputs after the first input (scale tensor and optional zero point tensor).
-        for (uint32_t index = 1, inputCount = gsl::narrow_cast<uint32_t>(m_inputTensorDescs.size()); index < inputCount; ++index)
-        {
-            if (!kernelInfo.IsInputValid(index))
-            {
-                continue;
-            }
-
-            auto edgeDesc = kernelInfo.GetInputEdgeDescription(index);
-            assert(edgeDesc.edgeType == MLOperatorEdgeType::Tensor);
-
-            // Fix up the the tensor shape by filling with trailing ones. So input[2,3] with axis=0 and scale[2]
-            // becomes scale[2,1], so that broadcasting works correctly.
-            std::vector<uint32_t> inputTensorShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(index);
-
-            // If the input tensor is a 1D vector, then extra massaging is needed to project their
-            // 1D vectors back to the full shape for broadcasting along the given axis.
-            // The 1D vector should have a length equal to the output tensor's dimension on that axis.
-            if (inputTensorShape.size() == 1 && inputTensorShape != outputShape)
-            {
-                ML_CHECK_VALID_ARGUMENT(axis < outputShapeDimCount);
-                uint32_t broadcastAxisLength = outputShape[axis];
-                ML_CHECK_VALID_ARGUMENT(
-                    (inputTensorShape[0] == broadcastAxisLength) ||
-                    // Treat as broadcast dimension to match CPU behavior.
-                    (inputTensorShape[0] == 1)
-                );
-                inputTensorShape.insert(inputTensorShape.begin(), axis, 1);
-                inputTensorShape.insert(inputTensorShape.end(), outputShapeDimCount - 1 - axis, 1);
-            }
-            // For any other shape (scalar/ND), leave it alone, and the TensorDesc constructor
-            // will apply broadcasting with standard elementwise alignment.
-
-            m_inputTensorDescs[index] = TensorDesc(
-                edgeDesc.tensorDataType,
-                gsl::make_span(outputShape),
-                gsl::make_span(inputTensorShape),
-                TensorAxis::DoNotCoerce,
-                TensorAxis::W,
-                TensorAxis::RightAligned,
-                NchwDimensionCount, // minDimensionCount
-                0 // guaranteedBaseOffsetAlignment
-            );
-        }
+        BroadcastQuantizationParameters(kernelInfo, gsl::make_span(outputShape));
 
         std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
         std::vector<DML_TENSOR_DESC> outputDescs = GetDmlOutputDescs();
@@ -630,6 +573,8 @@ class DmlOperatorQuantization21 : public DmlOperator
         const DML_TENSOR_DATA_TYPE outputDataType = m_outputTensorDescs[0].GetDmlDataType();
         bool hasZeroPointTensor = kernelInfo.IsInputValid(2);
 
+        BroadcastQuantizationParameters(kernelInfo, gsl::make_span(outputShape));
+
         std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
         std::vector<DML_TENSOR_DESC> outputDescs = GetDmlOutputDescs();
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp
index bc29256dd2e28..83e35ae89282d 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp
@@ -76,7 +76,7 @@ class DmlOperatorNonZero: public DmlOperator
 
         // Create the DML output tensor for the number of nonzero elements
         onnxruntime::Tensor outputCountDml(onnxruntime::DataTypeImpl::GetType<uint32_t>(), m_outputCountShape, executionProvider->GetGpuAllocator());
-        Microsoft::WRL::ComPtr<IMLOperatorTensor> outputCountDmlWrapper = wil::MakeOrThrow<Windows::AI::MachineLearning::Adapter::TensorWrapper>(
+        Microsoft::WRL::ComPtr<IMLOperatorTensor> outputCountDmlWrapper = Dml::SafeMakeOrThrow<Windows::AI::MachineLearning::Adapter::TensorWrapper>(
             &outputCountDml,
             true,
             executionProvider,
@@ -84,7 +84,7 @@ class DmlOperatorNonZero: public DmlOperator
 
         // Create the DML output tensor for the coordinates (not cropped)
         onnxruntime::Tensor intermediateCoordinatesDml(onnxruntime::DataTypeImpl::GetType<int64_t>(), m_outputCoordinatesShape, executionProvider->GetGpuAllocator());
-        Microsoft::WRL::ComPtr<IMLOperatorTensor> intermediateCoordinatesDmlWrapper = wil::MakeOrThrow<Windows::AI::MachineLearning::Adapter::TensorWrapper>(
+        Microsoft::WRL::ComPtr<IMLOperatorTensor> intermediateCoordinatesDmlWrapper = Dml::SafeMakeOrThrow<Windows::AI::MachineLearning::Adapter::TensorWrapper>(
             &intermediateCoordinatesDml,
             true,
             executionProvider,
@@ -105,7 +105,7 @@ class DmlOperatorNonZero: public DmlOperator
 
             // Copy the number of nonzero elements back to the CPU
             onnxruntime::Tensor outputCountCpu(onnxruntime::DataTypeImpl::GetType<uint32_t>(), {1}, executionProvider->GetCpuInputAllocator());
-            Microsoft::WRL::ComPtr<IMLOperatorTensor> outputCountCpuWrapper = wil::MakeOrThrow<Windows::AI::MachineLearning::Adapter::TensorWrapper>(
+            Microsoft::WRL::ComPtr<IMLOperatorTensor> outputCountCpuWrapper = Dml::SafeMakeOrThrow<Windows::AI::MachineLearning::Adapter::TensorWrapper>(
                 &outputCountCpu,
                 false,
                 executionProvider,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
index e2f38231f7295..091a82daefbdc 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h
@@ -238,7 +238,7 @@ class DmlSTFTOperator : public WRL::Base<IMLOperatorKernel>
 
         constexpr uint32_t dftAxis = 1;
         constexpr bool dftIsInverse = false;
-        m_dftOperator.op = wil::MakeOrThrow<GpuDFTOperator>(
+        m_dftOperator.op = Dml::SafeMakeOrThrow<GpuDFTOperator>(
             m_d3dDevice.Get(),
             dftAxis,
             params.isOnesided,
@@ -516,7 +516,7 @@ class DmlSTFTOperatorFactory : public WRL::Base<IMLOperatorKernelFactory>
     {
         try
         {
-            auto dftOperator = wil::MakeOrThrow<DmlSTFTOperator>(context);
+            auto dftOperator = Dml::SafeMakeOrThrow<DmlSTFTOperator>(context);
             dftOperator.CopyTo(kernel);
             return S_OK;
         }
@@ -574,8 +574,8 @@ class DmlSTFTOperatorFactory : public WRL::Base<IMLOperatorKernelFactory>
         kernelDescription.options = MLOperatorKernelOptions::None;
         kernelDescription.executionOptions = 0;
 
-        auto shareInferrer = wil::MakeOrThrow<STFTShapeInferrer>();
-        auto factory = wil::MakeOrThrow<DmlSTFTOperatorFactory>();
+        auto shareInferrer = Dml::SafeMakeOrThrow<STFTShapeInferrer>();
+        auto factory = Dml::SafeMakeOrThrow<DmlSTFTOperatorFactory>();
 
         std::array<uint32_t, 2> requiredConstantCpuInputs = { /*frame_step*/1, /*frame_length*/3 };
 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp
index b0b37d01370bc..26f998c7521a2 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp
@@ -1314,18 +1314,18 @@ void RegisterDmlOperators(IMLOperatorRegistry* registry)
             totalTypeCount += typeConstraints[i].allowedTypeCount;
         }
 
-        ComPtr<MLOperatorKernelFactory> factory =  wil::MakeOrThrow<MLOperatorKernelFactory>(information.creationFunction);
+        ComPtr<MLOperatorKernelFactory> factory =  Dml::SafeMakeOrThrow<MLOperatorKernelFactory>(information.creationFunction);
         ComPtr<MLOperatorShapeInferrer> shapeInferrer;
 
         if (information.shapeInferenceFunction)
         {
-            shapeInferrer = wil::MakeOrThrow<MLOperatorShapeInferrer>(information.shapeInferenceFunction);
+            shapeInferrer = Dml::SafeMakeOrThrow<MLOperatorShapeInferrer>(information.shapeInferenceFunction);
         }
 
         ComPtr<IMLOperatorSupportQueryPrivate> supportQuery;
         if (information.supportQueryFunction)
         {
-            supportQuery = wil::MakeOrThrow<MLOperatorSupportQuery>(information.supportQueryFunction);
+            supportQuery = Dml::SafeMakeOrThrow<MLOperatorSupportQuery>(information.supportQueryFunction);
         }
 
         ORT_THROW_IF_FAILED(registryPrivate->RegisterOperatorKernel(
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h
new file mode 100644
index 0000000000000..c2740470cbc0a
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h
@@ -0,0 +1,37 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <wrl/client.h>
+#include <new>
+#include <utility>
+
+// Dml::SafeMakeOrThrow is a drop-in replacement for wil::MakeOrThrow that keeps ASan quiet.
+// WRL's MakeAllocator holds its buffer as char*, so when the constructor throws,
+// ~MakeAllocator deletes a char* and sized operator delete receives sizeof(char)=1 rather
+// than sizeof(T). The default MSVC allocator ignores that size, so the mismatch is benign,
+// but ASan reports it as new-delete-type-mismatch. The helper below allocates raw storage
+// itself and, on a throwing constructor, frees it with the size it was allocated with.
+namespace Dml
+{
+    // Constructs a T from args in freshly allocated storage and attaches it to the returned
+    // ComPtr (Attach, so no extra AddRef). Exceptions from allocation or T's constructor propagate.
+    // NOTE(review): plain ::operator new is used, so T is presumably never over-aligned - confirm.
+    template <typename T, typename... TArgs>
+    Microsoft::WRL::ComPtr<T> SafeMakeOrThrow(TArgs&&... args)
+    {
+        void* const storage = ::operator new(sizeof(T));
+        Microsoft::WRL::ComPtr<T> made;
+        try
+        {
+            made.Attach(new (storage) T(std::forward<TArgs>(args)...));
+        }
+        catch (...)
+        {
+            ::operator delete(storage, sizeof(T));
+            throw;
+        }
+        return made;
+    }
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h
index e9df3fd20aff9..b9febb8171e0d 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h
@@ -25,6 +25,7 @@
 
 #include <wil/wrl.h>
 #include <wil/result.h>
+#include "SafeMakeOrThrow.h"
 
 #include <gsl/gsl>
 
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h
index ac77616cb96f0..dec84d9945569 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h
@@ -5,6 +5,7 @@
 
 #include "core/providers/dml/DmlExecutionProvider/inc/MLOperatorAuthor.h"
 #include "MLOperatorAuthorPrivate.h"
+#include "core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h"
 #include "core/framework/int4.h"
 #include <gsl/gsl>
 #include <optional>
@@ -972,7 +973,7 @@ class MLOperatorKernel : public Microsoft::WRL::RuntimeClass<
     {
         ORT_TRY
         {
-            Microsoft::WRL::ComPtr<MLOperatorKernel> kernel = wil::MakeOrThrow<MLOperatorKernel>(MLOperatorKernelCreationContext(&info));
+            Microsoft::WRL::ComPtr<MLOperatorKernel> kernel = Dml::SafeMakeOrThrow<MLOperatorKernel>(MLOperatorKernelCreationContext(&info));
 
             *opKernel = kernel.Detach();
             return S_OK;
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/SchemaInferenceOverrider.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/SchemaInferenceOverrider.h
index fa04bcf6edf41..597780a9f448b 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/SchemaInferenceOverrider.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/SchemaInferenceOverrider.h
@@ -5,6 +5,7 @@
 
 #include "OperatorHelper.h"
 #include "OperatorVersions.h"
+#include "core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h"
 
 namespace SchemaInferenceOverrider
 {
@@ -21,7 +22,7 @@ namespace SchemaInferenceOverrider
     )
     {
         Microsoft::WRL::ComPtr<MLOperatorShapeInferrer> shapeInferrer =
-            wil::MakeOrThrow<MLOperatorShapeInferrer>(OperatorHelper::ShapeInferenceFunction<T>);
+            Dml::SafeMakeOrThrow<MLOperatorShapeInferrer>(OperatorHelper::ShapeInferenceFunction<T>);
 
         auto schema = const_cast<onnx::OpSchema*>(onnx::OpSchemaRegistry::Schema(name, version));
 
diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc
index c0d8a4f02bbc3..0884908525dce 100644
--- a/onnxruntime/core/providers/dml/dml_provider_factory.cc
+++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc
@@ -21,6 +21,8 @@ using Microsoft::WRL::ComPtr;
 #include <wil/wrl.h>
 #include <wil/result.h>
 
+#include "core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h"
+
 #include "core/providers/dml/dml_provider_factory.h"
 #include "core/providers/dml/dml_provider_factory_creator.h"
 #include "core/session/abi_session_options_impl.h"
@@ -86,11 +88,11 @@ std::unique_ptr<IExecutionProvider> DMLProviderFactory::CreateProvider() {
 
     // First, check if an I/O binding API that was used before this session or another session has already created a queue
     if (FAILED(d3d12_device->GetPrivateData(dml_execution_context_guid, &execution_context_ptr_size, execution_context.GetAddressOf()))) {
-      execution_context = wil::MakeOrThrow<Dml::ExecutionContext>(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), true, true);
+      execution_context = Dml::SafeMakeOrThrow<Dml::ExecutionContext>(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), true, true);
       ORT_THROW_IF_FAILED(d3d12_device->SetPrivateDataInterface(dml_execution_context_guid, execution_context.Get()));
     }
   } else {
-    execution_context = wil::MakeOrThrow<Dml::ExecutionContext>(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), cpu_sync_spinning_enabled_, false);
+    execution_context = Dml::SafeMakeOrThrow<Dml::ExecutionContext>(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), cpu_sync_spinning_enabled_, false);
   }
 
   auto provider = Dml::CreateExecutionProvider(dml_device_.Get(), execution_context.Get(), metacommands_enabled_, graph_capture_enabled_, cpu_sync_spinning_enabled_, disable_memory_arena_);
diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc
index 5fc0b8900730b..5a4d68693730b 100644
--- a/onnxruntime/core/providers/vitisai/imp/global_api.cc
+++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc
@@ -79,7 +79,10 @@ struct OrtVitisAIEpAPI {
   std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>* (*compile_onnx_model_vitisai_ep_with_error_handling)(
       const std::string& model_path, const onnxruntime::Graph& graph, const onnxruntime::ProviderOptions& options, void* status, vaip_core::error_report_func func);
   std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>* (*compile_onnx_model_vitisai_ep_v3)(
-      const std::filesystem::path& model_path, const onnxruntime::Graph& graph, const onnxruntime::ProviderOptions& options, void* status, vaip_core::error_report_func func);
+      const std::string& model_path, const onnxruntime::Graph& graph, const onnxruntime::ProviderOptions& options, void* status, vaip_core::error_report_func func);
+  std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>* (*compile_onnx_model_vitisai_ep_v4)(
+      const std::string& model_path, const onnxruntime::Graph& graph, const onnxruntime::ProviderOptions& options, void* status, vaip_core::error_report_func func, const onnxruntime::logging::Logger& logger);
+  void (*vaip_execution_provider_deletor)(std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>*) noexcept = [](std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>* p) noexcept { delete p; };
   uint32_t (*vaip_get_version)();
   void (*create_ep_context_nodes)(
       const std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>& eps,
@@ -94,6 +97,15 @@ struct OrtVitisAIEpAPI {
   void (*profiler_collect)(
       std::vector<EventInfo>& api_events,
       std::vector<EventInfo>& kernel_events);
+  const char* (*get_compiled_model_compatibility_info)(
+      const std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>* eps,
+      const void* graph_viewer) = nullptr;
+  int (*validate_compiled_model_compatibility_info)(
+      const std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>* eps,
+      const char* compatibility_info,
+      const void* const* devices,
+      size_t num_devices,
+      int* model_compatibility) = nullptr;
   void (*deinitialize_onnxruntime_vitisai_ep)();
   void Ensure() {
     if (handle_)
@@ -126,17 +138,29 @@ struct OrtVitisAIEpAPI {
     auto status1 = env.GetSymbolFromLibrary(handle_, "compile_onnx_model_vitisai_ep_with_error_handling", (void**)&compile_onnx_model_vitisai_ep_with_error_handling);
     auto status2 = env.GetSymbolFromLibrary(handle_, "compile_onnx_model_vitisai_ep_with_options", (void**)&compile_onnx_model_with_options);
     auto status3 = env.GetSymbolFromLibrary(handle_, "compile_onnx_model_vitisai_ep_v3", (void**)&compile_onnx_model_vitisai_ep_v3);
-    if ((!status1.IsOK()) && (!status2.IsOK()) && (!status3.IsOK())) {
+    auto status4 = env.GetSymbolFromLibrary(handle_, "compile_onnx_model_vitisai_ep_v4", (void**)&compile_onnx_model_vitisai_ep_v4);
+    if ((!status1.IsOK()) && (!status2.IsOK()) && (!status3.IsOK()) && (!status4.IsOK())) {
       ::onnxruntime::LogRuntimeError(0, status2, __FILE__, static_cast<const char*>(__FUNCTION__), __LINE__);
       ORT_THROW(status2);
     }
     std::ignore = env.GetSymbolFromLibrary(handle_, "vaip_get_version",
                                            (void**)&vaip_get_version);
     std::ignore = env.GetSymbolFromLibrary(handle_, "profiler_collect", (void**)&profiler_collect);
+    std::ignore = env.GetSymbolFromLibrary(handle_, "get_compiled_model_compatibility_info", (void**)&get_compiled_model_compatibility_info);
+    std::ignore = env.GetSymbolFromLibrary(handle_, "validate_compiled_model_compatibility_info", (void**)&validate_compiled_model_compatibility_info);
     ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "create_ep_context_nodes", (void**)&create_ep_context_nodes));
     ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_on_run_start", (void**)&vitisai_ep_on_run_start));
     ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_set_ep_dynamic_options", (void**)&vitisai_ep_set_ep_dynamic_options));
     std::ignore = env.GetSymbolFromLibrary(handle_, "deinitialize_onnxruntime_vitisai_ep", (void**)&deinitialize_onnxruntime_vitisai_ep);
+    {
+      typedef void* (*vaip_get_execution_provider_deletor_func_t)();
+      vaip_get_execution_provider_deletor_func_t vaip_get_execution_provider_deletor = nullptr;
+      auto status = env.GetSymbolFromLibrary(handle_, "vaip_get_execution_provider_deletor",
+                                             (void**)&vaip_get_execution_provider_deletor);
+      if (status.IsOK()) {
+        vaip_execution_provider_deletor = reinterpret_cast<decltype(vaip_execution_provider_deletor)>(vaip_get_execution_provider_deletor());
+      };
+    }
   }
   void Clear() {
     if (handle_) {
@@ -166,6 +190,42 @@ void profiler_collect(
   }
 }
 
+// Asks the loaded VitisAI library for the compatibility string of a compiled model.
+// Yields an empty string if the optional entry point is missing or returns null/empty.
+std::string get_compiled_model_compatibility_info(
+    const std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>& eps,
+    const onnxruntime::GraphViewer& graph_viewer) {
+  const auto query = s_library_vitisaiep.get_compiled_model_compatibility_info;
+  if (query == nullptr) {
+    return std::string();
+  }
+  const char* info = query(&eps, &graph_viewer);
+  return info != nullptr ? std::string(info) : std::string();
+}
+
+// Validates a stored compatibility string via the optional VitisAI library entry point. Always returns
+// Status::OK(); model_compatibility is EP_NOT_APPLICABLE if the entry point is absent or reports non-zero.
+Status validate_compiled_model_compatibility_info(
+    const std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>& eps,
+    const std::string& compatibility_info,
+    OrtCompiledModelCompatibility& model_compatibility) {
+  const auto validate = s_library_vitisaiep.validate_compiled_model_compatibility_info;
+  int reported = 0;
+  bool accepted = false;
+  if (validate != nullptr) {
+    // No device list is supplied: the ORT provider has no device information here.
+    const int rc = validate(&eps,
+                            compatibility_info.c_str(),
+                            /*devices*/ nullptr,
+                            /*num_devices*/ 0,
+                            &reported);
+    accepted = (rc == 0);
+  }
+  model_compatibility = accepted ? static_cast<OrtCompiledModelCompatibility>(reported)
+                                 : OrtCompiledModelCompatibility_EP_NOT_APPLICABLE;
+  return Status::OK();
+}
+
 void change_status_with_error(void* status_ptr, int error_code, const char* error_msg) {
   auto status = reinterpret_cast<Status*>(status_ptr);
   *status = Status(onnxruntime::common::ONNXRUNTIME, error_code, error_msg);
@@ -174,10 +234,19 @@ void change_status_with_error(void* status_ptr, int error_code, const char* erro
 vaip_core::DllSafe<std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>> compile_onnx_model(
     const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::logging::Logger& logger, const onnxruntime::ProviderOptions& options) {
   auto model_path = graph_viewer.ModelPath();
-  if (s_library_vitisaiep.compile_onnx_model_vitisai_ep_v3) {
+  auto vaip_execution_provider_deletor = s_library_vitisaiep.vaip_execution_provider_deletor;
+  if (s_library_vitisaiep.compile_onnx_model_vitisai_ep_v4) {
+    Status status = Status::OK();
+    auto status_ptr = reinterpret_cast<void*>(&status);
+    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_v4(model_path.u8string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error, logger), vaip_execution_provider_deletor);
+    if (!status.IsOK()) {
+      ORT_THROW(status);
+    }
+    return ret;
+  } else if (s_library_vitisaiep.compile_onnx_model_vitisai_ep_v3) {
     Status status = Status::OK();
     auto status_ptr = reinterpret_cast<void*>(&status);
-    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_v3(model_path, graph_viewer.GetGraph(), options, status_ptr, change_status_with_error));
+    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_v3(model_path.u8string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error), vaip_execution_provider_deletor);
     if (!status.IsOK()) {
       ORT_THROW(status);
     }
@@ -185,13 +254,13 @@ vaip_core::DllSafe<std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>> c
   } else if (s_library_vitisaiep.compile_onnx_model_vitisai_ep_with_error_handling) {
     Status status = Status::OK();
     auto status_ptr = reinterpret_cast<void*>(&status);
-    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_with_error_handling(model_path.u8string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error));
+    auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_with_error_handling(model_path.u8string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error), vaip_execution_provider_deletor);
     if (!status.IsOK()) {
       ORT_THROW(status);
     }
     return ret;
   } else {
-    return vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_with_options(model_path.u8string(), graph_viewer.GetGraph(), options));
+    return vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_with_options(model_path.u8string(), graph_viewer.GetGraph(), options), vaip_execution_provider_deletor);
   }
 }
 
@@ -317,7 +386,6 @@ void deinitialize_vitisai_ep() {
   s_domains_vitisaiep.clear();
 
   s_library_vitisaiep.Clear();
-  s_kernel_registry_vitisaiep.reset();
 }
 
 static void set_version_info(vaip_core::OrtApiForVaip& api) {
@@ -498,6 +566,7 @@ vaip_core::OrtApiForVaip* create_org_api_hook() {
   the_global_api.tensor_proto_get_shape_unsafe = vaip::tensor_proto_get_shape;
   the_global_api.tensor_proto_data_type = [](const ONNX_NAMESPACE::TensorProto& t) -> int { return t.data_type(); };
   the_global_api.tensor_proto_delete = [](ONNX_NAMESPACE::TensorProto* tp) { delete tp; };
+  the_global_api.tensor_proto_new_bool = vaip::tensor_proto_new_bool;
   the_global_api.tensor_proto_new_i4 = vaip::tensor_proto_new_i4;
   the_global_api.tensor_proto_new_i8 = vaip::tensor_proto_new_i8;
   the_global_api.tensor_proto_new_i16 = vaip::tensor_proto_new_i16;
@@ -588,3 +657,82 @@ vaip_core::OrtApiForVaip* create_org_api_hook() {
     return &the_global_api;
   }
 }
+
+// Handle to another execution provider's shared library, loaded on demand.
+//
+// The file name is LIBRARY_PREFIX + <name> + LIBRARY_EXTENSION, appended to the
+// runtime path reported by the host Env. Only the "GetProvider" entry point is
+// resolved here; the other function pointers are never looked up and stay null.
+struct ExternalEpLibaray {
+  ExternalEpLibaray(const std::string& libray_name) : libray_name_{libray_name} {
+    Ensure();
+  }
+  // Null by default so an unresolved entry point is never an indeterminate pointer.
+  onnxruntime::Provider* (*get_provider_api)() = nullptr;
+  void (*create_ep_factories)(void*, const OrtApiBase*, void*, OrtEpFactory**, size_t, size_t*) = nullptr;
+  void (*set_session_option)(OrtSessionOptions*) = nullptr;
+
+  // Loads the library and resolves "GetProvider"; a no-op while a handle is held.
+  // Throws if loading or symbol lookup fails, leaving no library loaded behind.
+  void Ensure() {
+    if (handle_)
+      return;
+    auto& env = Provider_GetHost()->Env__Default();
+    auto library_filename = PathString(LIBRARY_PREFIX) + PathString(libray_name_.begin(), libray_name_.end()) + LIBRARY_EXTENSION;
+    auto full_path = env.GetRuntimePath() + library_filename;
+    ORT_THROW_IF_ERROR(env.LoadDynamicLibrary(full_path, true, &handle_));
+    auto lookup_status = env.GetSymbolFromLibrary(handle_, "GetProvider", (void**)&get_provider_api);
+    if (!lookup_status.IsOK()) {
+      // Do not keep a half-initialised library loaded: a later Ensure() would
+      // otherwise see a handle and return with get_provider_api still null.
+      std::ignore = env.UnloadDynamicLibrary(handle_);
+      handle_ = nullptr;
+      get_provider_api = nullptr;
+      ORT_THROW(lookup_status);
+    }
+  }
+
+  // Unloads the library (if loaded) and drops the entry point that referred into it.
+  void Clear() {
+    if (handle_) {
+      auto& env = Provider_GetHost()->Env__Default();
+      auto status = env.UnloadDynamicLibrary(handle_);
+      vai_assert(status.IsOK(), status.ErrorMessage());
+      handle_ = nullptr;
+      get_provider_api = nullptr;
+    }
+  }
+
+ private:
+  std::string libray_name_;
+  void* handle_{};
+};
+// Loaded external EP libraries, keyed by the name given to CreateExecutionProviderFromAnotherEp.
+static std::unordered_map<std::string, std::unique_ptr<ExternalEpLibaray>> g_external_ep_libaries;
+
+// Creates an execution provider by delegating to the Provider exported by
+// another EP library, loading that library the first time `lib` is seen.
+// Throws if the library cannot be loaded or the provider fails to create an EP.
+std::unique_ptr<onnxruntime::IExecutionProvider>
+CreateExecutionProviderFromAnotherEp(const std::string& lib, const OrtSessionOptions& session_options,
+                                     std::unordered_map<std::string, std::string>& provider_options) {
+  auto it = g_external_ep_libaries.find(lib);
+  if (it == g_external_ep_libaries.end()) {
+    it = g_external_ep_libaries.emplace(lib, std::make_unique<ExternalEpLibaray>(lib)).first;
+  }
+  auto ep_lib = it->second.get();
+  ep_lib->Ensure();
+  auto provider = ep_lib->get_provider_api();
+  if (provider == nullptr) {
+    ORT_THROW("GetProvider returned null for external EP library: ", lib);
+  }
+  provider->Initialize();
+  std::unique_ptr<onnxruntime::IExecutionProvider> ret;
+  // NOTE(review): the logger argument is a reference formed from a null pointer,
+  // which is undefined behaviour; a real OrtLogger should be threaded through
+  // from the caller once the signature can change.
+  // Surface creation failures instead of silently returning a null provider.
+  ORT_THROW_IF_ERROR(provider->CreateIExecutionProvider(nullptr, nullptr, 0, const_cast<onnxruntime::ProviderOptions&>(provider_options), session_options, *((OrtLogger*)nullptr), ret));
+
+  return ret;
+}
diff --git a/onnxruntime/core/providers/vitisai/imp/graph.cc b/onnxruntime/core/providers/vitisai/imp/graph.cc
index 028ee7fa8c5ce..9e2efac73a20d 100644
--- a/onnxruntime/core/providers/vitisai/imp/graph.cc
+++ b/onnxruntime/core/providers/vitisai/imp/graph.cc
@@ -248,12 +248,6 @@ Node& graph_fuse(Graph& graph, const std::string& name,
   indexed_subgraph->SetMetaDef(std::move(meta_def));
 
   auto& fused_node = graph.FuseSubGraph(*indexed_subgraph, name);
-  auto function_body = fused_node.GetFunctionBody();
-  if (function_body) {
-    auto proto = function_body->Body().ToGraphProto();
-    *proto->mutable_name() = name;
-    fused_node.AddAttribute("body", *proto);
-  }
   for (auto&& o : fused_node.OutputDefs()) {
     graph.UpdateProducerNode(o->Name(), fused_node.Index());
   }
@@ -285,7 +279,7 @@ Model* model_clone(const Model& original_model, int64_t external_data_threshold)
   }
   for (auto& node : original_graph.Nodes()) {
     auto* node_proto = graph_proto->add_node();
-    node->ToProto(*node_proto, false);
+    node->ToProto(*node_proto, true);
     for (auto output : node->OutputDefs()) {
       if (output->Exists()) {
         auto* value_info = graph_proto->mutable_value_info()->Add();
diff --git a/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc b/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc
index 2f1478bf1326b..719ca8dd412bf 100644
--- a/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc
+++ b/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc
@@ -87,6 +87,16 @@ static ONNX_NAMESPACE::TensorProto* tensor_proto_new(const std::string& name, co
   return tensor_proto.release();
 }
 
+// Builds a BOOL TensorProto named `name` with the given `shape`, handing `data`
+// to tensor_proto_new as raw bytes (one byte per element). The returned pointer
+// is released from its owner, so the caller is responsible for deleting it.
+ONNX_NAMESPACE::TensorProto* tensor_proto_new_bool(const std::string& name, const std::vector<int64_t>& shape,
+                                                   const std::vector<uint8_t>& data) {
+  // data() rather than &data[0]: indexing an empty vector is undefined behaviour.
+  return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_BOOL,
+                          reinterpret_cast<const char*>(data.data()), data.size() * sizeof(data[0]));
+}
+
 ONNX_NAMESPACE::TensorProto* tensor_proto_new_i4(const std::string& name, const std::vector<int64_t>& shape,
                                                  const std::vector<int8_t>& data) {
   return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT4,
diff --git a/onnxruntime/core/providers/vitisai/imp/tensor_proto.h b/onnxruntime/core/providers/vitisai/imp/tensor_proto.h
index a7c90ac18b44e..9c35044c43824 100644
--- a/onnxruntime/core/providers/vitisai/imp/tensor_proto.h
+++ b/onnxruntime/core/providers/vitisai/imp/tensor_proto.h
@@ -37,5 +37,7 @@ ONNX_NAMESPACE::TensorProto* tensor_proto_new_fp16(const std::string& name, cons
                                                    const std::vector<int16_t>& data);
 ONNX_NAMESPACE::TensorProto* tensor_proto_new_doubles(const std::string& name, const std::vector<int64_t>& shape,
                                                       const std::vector<double>& data);
+ONNX_NAMESPACE::TensorProto* tensor_proto_new_bool(const std::string& name, const std::vector<int64_t>& shape,
+                                                   const std::vector<uint8_t>& data);
 gsl::span<const char> process_ext_address(const ONNX_NAMESPACE::TensorProto& tensor);
 }  // namespace vaip
diff --git a/onnxruntime/core/providers/vitisai/include/vaip/dll_safe.h b/onnxruntime/core/providers/vitisai/include/vaip/dll_safe.h
index 27bc3ab63187c..a18902c5404be 100644
--- a/onnxruntime/core/providers/vitisai/include/vaip/dll_safe.h
+++ b/onnxruntime/core/providers/vitisai/include/vaip/dll_safe.h
@@ -17,7 +17,9 @@ class DllSafe {
       : value_{value}, deleter_{[](T* value) noexcept {
           std::default_delete<T>()(value);
         }} {}
-
+  explicit DllSafe(T* value, void (*deleter)(T*) noexcept)
+      : value_{value}, deleter_{deleter} {
+  }
   explicit DllSafe(T&& value) : DllSafe(new T(std::move(value))) {}
   explicit DllSafe(const T& value) : DllSafe(new T(value)) {}
 
diff --git a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h
index 7791ea430054a..6ebec16a4e0dd 100644
--- a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h
+++ b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h
@@ -6,10 +6,12 @@
 #define ORT_API_MANUAL_INIT
 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/framework/provider_options.h"
+#include "core/framework/execution_provider.h"
 #include "vaip/my_ort.h"
 #include "vaip/dll_safe.h"
 #include "vaip/custom_op.h"
 #include <optional>
+#include <memory>
 void initialize_vitisai_ep();
 void deinitialize_vitisai_ep();
 vaip_core::DllSafe<std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>> compile_onnx_model(const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::logging::Logger& logger, const onnxruntime::ProviderOptions& options);
@@ -40,3 +42,23 @@ using EventInfo = std::tuple<
 void profiler_collect(
     std::vector<EventInfo>& api_events,
     std::vector<EventInfo>& kernel_events);
+std::unique_ptr<onnxruntime::IExecutionProvider>
+CreateExecutionProviderFromAnotherEp(const std::string& lib, const OrtSessionOptions& session_options,
+                                     std::unordered_map<std::string, std::string>& provider_options);
+
+/**
+ * Get compiled model compatibility information from execution providers.
+ * Returns a JSON string containing compatibility metadata, or an empty string if unavailable.
+ */
+std::string get_compiled_model_compatibility_info(
+    const std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>& eps,
+    const onnxruntime::GraphViewer& graph_viewer);
+
+/**
+ * Validate compiled model compatibility information against current runtime environment.
+ * The model_compatibility is output parameter for the compatibility result.
+ */
+Status validate_compiled_model_compatibility_info(
+    const std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>& eps,
+    const std::string& compatibility_info,
+    OrtCompiledModelCompatibility& model_compatibility);
diff --git a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h
index acb258894e11c..6285ff64019cd 100644
--- a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h
+++ b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h
@@ -13,7 +13,7 @@ struct OrtApi;
 
 namespace vaip_core {
 
-#define VAIP_ORT_API_MAJOR (18u)
+#define VAIP_ORT_API_MAJOR (19u)
 #define VAIP_ORT_API_MINOR (0u)
 #define VAIP_ORT_API_PATCH (0u)
 struct OrtApiForVaip {
@@ -257,6 +257,9 @@ struct OrtApiForVaip {
   void (*graph_proto_delete)(GraphProto* p);                      // [107]
   void (*graph_infer_shapes)(ModelProto& m);                      // [108]
   DllSafe<std::string> (*graph_save_string)(const Graph& graph);  // [109]
+  TensorProto* (*tensor_proto_new_bool)(const std::string& name,
+                                        const std::vector<int64_t>& shape,
+                                        const std::vector<uint8_t>& data);  // [110]
 };
 
 #ifndef USE_VITISAI
diff --git a/onnxruntime/core/providers/vitisai/onnxruntime_providers_vitisai.rc b/onnxruntime/core/providers/vitisai/onnxruntime_providers_vitisai.rc
new file mode 100644
index 0000000000000..968086ebd2613
--- /dev/null
+++ b/onnxruntime/core/providers/vitisai/onnxruntime_providers_vitisai.rc
@@ -0,0 +1,46 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// This file REQUIRES the following external definitions:
+// FILE_NAME, VER_MAJOR, VER_MINOR, VER_BUILD, VER_PRIVATE, and VER_STRING
+
+#include <Winver.h>
+
+#if defined(DEBUG) || defined(_DEBUG)
+#define VER_DEBUG VS_FF_DEBUG
+#else
+#define VER_DEBUG 0
+#endif
+
+// -----------------------------------------------------------------------------
+
+VS_VERSION_INFO VERSIONINFO
+FILEVERSION     VER_MAJOR, VER_MINOR, VER_BUILD, VER_PRIVATE
+PRODUCTVERSION  VER_MAJOR, VER_MINOR, VER_BUILD, VER_PRIVATE
+FILEFLAGSMASK   VS_FFI_FILEFLAGSMASK
+FILEFLAGS       VER_DEBUG
+FILEOS          VOS__WINDOWS32
+FILETYPE        VFT_DLL
+FILESUBTYPE     VFT2_UNKNOWN
+
+BEGIN
+    BLOCK "StringFileInfo"
+    BEGIN
+        BLOCK "040904E4"
+        BEGIN
+            VALUE "CompanyName",      "Microsoft Corporation"
+            VALUE "FileDescription",  "ONNX Runtime VitisAI Provider"
+            VALUE "FileVersion",      VER_STRING
+            VALUE "InternalName",     "ONNX Runtime VitisAI Provider"
+            VALUE "LegalCopyright",   "\251 Microsoft Corporation. All rights reserved."
+            VALUE "OriginalFilename", FILE_NAME
+            VALUE "ProductName",      "Microsoft\256 Windows\256 Operating System"
+            VALUE "ProductVersion",   VER_STRING
+        END
+    END
+
+    BLOCK "VarFileInfo"
+    BEGIN
+        VALUE "Translation", 0x409, 1252
+    END
+END
diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc
index 6cafc0495aa6b..7ea25ea115567 100644
--- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc
+++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc
@@ -145,6 +145,28 @@ std::unique_ptr<profiling::EpProfiler> VitisAIExecutionProvider::GetProfiler() {
   return std::make_unique<profiling::VitisaiProfiler>();
 }
 
+// Returns the compatibility info gathered from the held vaip_core providers,
+// or an empty string when execution_providers_ is unset.
+std::string VitisAIExecutionProvider::GetCompiledModelCompatibilityInfo(
+    const onnxruntime::GraphViewer& graph_viewer) const {
+  return execution_providers_
+             ? get_compiled_model_compatibility_info(**execution_providers_, graph_viewer)
+             : std::string{};
+}
+
+// Validates `compatibility_info` against the held vaip_core providers and
+// writes the verdict to `model_compatibility`.
+common::Status VitisAIExecutionProvider::ValidateCompiledModelCompatibilityInfo(
+    const std::string& compatibility_info,
+    OrtCompiledModelCompatibility& model_compatibility) const {
+  if (execution_providers_) {
+    return validate_compiled_model_compatibility_info(**execution_providers_, compatibility_info, model_compatibility);
+  }
+  // Nothing to validate against: report NOT_APPLICABLE and succeed.
+  model_compatibility = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE;
+  return Status::OK();
+}
+
 std::vector<AllocatorPtr> VitisAIExecutionProvider::CreatePreferredAllocators() {
   std::vector<AllocatorPtr> result;
   // We do not want arena for 4k alignment, as it would not respect alignment.
diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h
index 8db4f36dd497a..1a20944692d6e 100644
--- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h
+++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h
@@ -48,6 +48,20 @@ class VitisAIExecutionProvider : public IExecutionProvider {
 
   std::vector<AllocatorPtr> CreatePreferredAllocators() override;
 
+  /**
+   * Get compiled model compatibility information.
+   * This method collects compatibility info from all vaip_core execution providers
+   * and returns it as a JSON string.
+   */
+  std::string GetCompiledModelCompatibilityInfo(const onnxruntime::GraphViewer& graph_viewer) const override;
+
+  /**
+   * Validate compiled model compatibility information.
+   * This method validates the compatibility info against the current runtime environment.
+   */
+  common::Status ValidateCompiledModelCompatibilityInfo(const std::string& compatibility_info,
+                                                        OrtCompiledModelCompatibility& model_compatibility) const override;
+
  private:
   using my_ep_t = vaip_core::DllSafe<std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>>;
   using my_ep_uptr_t = std::shared_ptr<my_ep_t>;
diff --git a/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc b/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc
index 50f924e468ed0..e1a3ca43e162e 100644
--- a/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc
+++ b/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc
@@ -7,7 +7,6 @@
 #include <cctype>
 #include <unordered_map>
 #include <string>
-
 #include "vaip/global_api.h"
 #include "./vitisai_execution_provider.h"
 #include "core/framework/execution_provider.h"
@@ -57,6 +56,10 @@ std::unique_ptr<IExecutionProvider> VitisAIProviderFactory::CreateProvider(const
     }
   }
 
+  auto it = provider_options.find("external_ep_libray");
+  if (it != provider_options.end()) {
+    return CreateExecutionProviderFromAnotherEp(it->second, session_options, provider_options);
+  }
   auto ep_instance = std::make_unique<VitisAIExecutionProvider>(provider_options);
   ep_instance->SetLogger(reinterpret_cast<const logging::Logger*>(&session_logger));
   return ep_instance;
diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc
index 9c40eb75780ee..0f9a1e299506e 100644
--- a/onnxruntime/core/session/environment.cc
+++ b/onnxruntime/core/session/environment.cc
@@ -499,8 +499,13 @@ Status CreateDataTransferForFactory(OrtEpFactory& ep_factory,
 Status Environment::RegisterExecutionProviderLibrary(const std::string& registration_name,
                                                      std::unique_ptr<EpLibrary> ep_library,
                                                      const std::vector<EpFactoryInternal*>& internal_factories) {
+  const Env& env = Env::Default();
+  env.GetTelemetryProvider().LogRegisterEpLibraryStart(registration_name);
+
   if (ep_libraries_.count(registration_name) > 0) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "library is already registered under ", registration_name);
+    auto status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "library is already registered under ", registration_name);
+    env.GetTelemetryProvider().LogRegisterEpLibraryEnd(registration_name, status);
+    return status;
   }
 
   auto status = Status::OK();
@@ -552,6 +557,7 @@ Status Environment::RegisterExecutionProviderLibrary(const std::string& registra
     });
   }
 
+  env.GetTelemetryProvider().LogRegisterEpLibraryEnd(registration_name, status);
   return status;
 }
 
@@ -571,6 +577,9 @@ Status Environment::CreateAndRegisterInternalEps() {
 Status Environment::RegisterExecutionProviderLibrary(const std::string& registration_name, const ORTCHAR_T* lib_path) {
   std::lock_guard<std::mutex> lock{mutex_};
 
+  std::string lib_file_name = std::filesystem::path(lib_path).filename().string();
+  Env::Default().GetTelemetryProvider().LogRegisterEpLibraryWithLibPath(registration_name, lib_file_name);
+
   std::vector<EpFactoryInternal*> internal_factories = {};
   std::unique_ptr<EpLibrary> ep_library;
 
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index e3291cdce62c5..6323c818bc56a 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -730,6 +730,25 @@ InferenceSession::InferenceSession(const SessionOptions& session_options, const
 #endif  // !defined(ORT_MINIMAL_BUILD)
 
 InferenceSession::~InferenceSession() {
+  // Flush any remaining RuntimePerf counters
+  ORT_TRY {
+    std::lock_guard<std::mutex> telemetry_lock(telemetry_mutex_);
+    if (telemetry_.total_runs_since_last_ > 0) {
+      Env::Default().GetTelemetryProvider().LogRuntimePerf(session_id_,
+                                                           telemetry_.total_runs_since_last_,
+                                                           telemetry_.total_run_duration_since_last_,
+                                                           telemetry_.duration_per_batch_size_);
+    }
+  }
+  ORT_CATCH(const std::exception& e) {
+    ORT_HANDLE_EXCEPTION([&]() {
+      LOGS(*session_logger_, ERROR) << "Error during telemetry flush: " << e.what();
+    });
+  }
+  ORT_CATCH(...) {
+    LOGS(*session_logger_, ERROR) << "Unknown error during telemetry flush";
+  }
+
   if (session_options_.enable_profiling) {
     ORT_TRY {
       EndProfiling();
@@ -969,7 +988,10 @@ common::Status InferenceSession::LoadWithLoader(std::function<common::Status(std
   if (session_profiler_.IsEnabled()) {
     tp = session_profiler_.Start();
   }
+  const Env& env = Env::Default();
   ORT_TRY {
+    env.GetTelemetryProvider().LogModelLoadStart(session_id_);
+
     std::lock_guard<std::mutex> l(session_mutex_);
     if (is_model_loaded_) {  // already loaded
       LOGS(*session_logger_, ERROR) << "This session already contains a loaded model.";
@@ -1005,6 +1027,8 @@ common::Status InferenceSession::LoadWithLoader(std::function<common::Status(std
     session_profiler_.EndTimeAndRecordEvent(profiling::SESSION_EVENT, event_name, tp);
   }
 
+  env.GetTelemetryProvider().LogModelLoadEnd(session_id_, status);
+
   return status;
 }
 
@@ -1598,6 +1622,9 @@ Status InferenceSession::LoadOrtModel(const void* model_data, int model_data_len
 }
 
 Status InferenceSession::LoadOrtModelWithLoader(std::function<Status()> load_ort_format_model_bytes) {
+  const Env& env = Env::Default();
+  env.GetTelemetryProvider().LogModelLoadStart(session_id_);
+
   std::lock_guard<std::mutex> l(session_mutex_);
 
   if (is_model_loaded_) {  // already loaded
@@ -1718,6 +1745,8 @@ Status InferenceSession::LoadOrtModelWithLoader(std::function<Status()> load_ort
 
   is_model_loaded_ = true;
 
+  env.GetTelemetryProvider().LogModelLoadEnd(session_id_, Status::OK());
+
   return Status::OK();
 }
 
@@ -2562,6 +2591,12 @@ common::Status InferenceSession::Initialize() {
     }
   }
 
+  // Log session creation end telemetry
+  {
+    const Env& init_env = Env::Default();
+    init_env.GetTelemetryProvider().LogSessionCreationEnd(session_id_, status);
+  }
+
   return status;
 }
 #if defined(_MSC_VER) && !defined(__clang__)
@@ -3111,24 +3146,31 @@ Status InferenceSession::Run(const RunOptions& run_options,
     break;
   }
 
-  // time to send telemetry?
-  {
-    // Adding lock_guard here to ensure that telemetry updates are thread-safe.
-    std::lock_guard<std::mutex> telemetry_lock(telemetry_mutex_);
-    ++telemetry_.total_runs_since_last_;
-    telemetry_.total_run_duration_since_last_ += TimeDiffMicroSeconds(tp);
-    telemetry_.duration_per_batch_size_[batch_size] += TimeDiffMicroSeconds(tp);
-
-    if (TimeDiffMicroSeconds(telemetry_.time_sent_last_) > Telemetry::kDurationBetweenSending) {
-      // send the telemetry
-      env.GetTelemetryProvider().LogRuntimePerf(session_id_, telemetry_.total_runs_since_last_,
-                                                telemetry_.total_run_duration_since_last_,
-                                                telemetry_.duration_per_batch_size_);
-      // reset counters
-      telemetry_.time_sent_last_ = std::chrono::high_resolution_clock::now();
-      telemetry_.total_runs_since_last_ = 0;
-      telemetry_.total_run_duration_since_last_ = 0;
-      telemetry_.duration_per_batch_size_.clear();
+  // Only include successful inferences in batch since failed inferences can skew the metric
+  if (retval.IsOK()) {
+    // time to send telemetry?
+    {
+      // Adding lock_guard here to ensure that telemetry updates are thread-safe.
+      std::lock_guard<std::mutex> telemetry_lock(telemetry_mutex_);
+      ++telemetry_.total_runs_since_last_;
+      telemetry_.total_run_duration_since_last_ += TimeDiffMicroSeconds(tp);
+      telemetry_.duration_per_batch_size_[batch_size] += TimeDiffMicroSeconds(tp);
+
+      // Emit RuntimePerf on scheduled interval
+      if ((TimeDiffMicroSeconds(telemetry_.time_sent_last_) > telemetry_.runtime_perf_interval_)) {
+        env.GetTelemetryProvider().LogRuntimePerf(session_id_, telemetry_.total_runs_since_last_,
+                                                  telemetry_.total_run_duration_since_last_,
+                                                  telemetry_.duration_per_batch_size_);
+        // reset counters
+        telemetry_.time_sent_last_ = std::chrono::high_resolution_clock::now();
+        telemetry_.total_runs_since_last_ = 0;
+        telemetry_.total_run_duration_since_last_ = 0;
+        telemetry_.duration_per_batch_size_.clear();
+
+        // Double the interval, capping at kRuntimePerfMaxInterval
+        telemetry_.runtime_perf_interval_ = std::min(telemetry_.runtime_perf_interval_ * 2,
+                                                     Telemetry::kRuntimePerfMaxInterval);
+      }
     }
   }
 
diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h
index 8bea15c169ed4..fe36040f313b6 100644
--- a/onnxruntime/core/session/inference_session.h
+++ b/onnxruntime/core/session/inference_session.h
@@ -947,8 +947,10 @@ class InferenceSession {
     std::unordered_map<int64_t, long long> duration_per_batch_size_;  // the duration (us) of Run() calls per batch size since the last report
 
     TimePoint time_sent_last_;  // the TimePoint of the last report
-    // Event Rate per provider < 20 peak events per second
-    constexpr static long long kDurationBetweenSending = 1000 * 1000 * 60 * 10;  // duration in (us).  send a report every 10 mins
+    // RuntimePerf backoff interval: starts at 2s between emissions, doubles each emission, caps at 10 min
+    constexpr static int64_t kRuntimePerfInitialInterval = 2 * 1000 * 1000;    // 2 seconds in (us)
+    constexpr static int64_t kRuntimePerfMaxInterval = 1000 * 1000 * 60 * 10;  // 10 minutes in (us)
+    int64_t runtime_perf_interval_ = kRuntimePerfInitialInterval;
   } telemetry_;
 
   mutable std::mutex telemetry_mutex_;  // to ensure thread-safe access to telemetry data
diff --git a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
index 4b586e24c9bd3..5a2dc18bb2630 100644
--- a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
+++ b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc
@@ -82,7 +82,7 @@ void CheckDataAndShape(const std::vector<T>& data, const std::vector<int64_t>& s
 
   ORT_ENFORCE(static_cast<int64_t>(data.size()) == total_elements, "Data size does not match the shape",
               "Data size: ", data.size(), ", Expected size: ", total_elements,
-              ", Shape: ", VectorToString(shape), " Name:", name, " Type:", typeid(T).name());
+              ", Shape: ", VectorToString(shape), " Name:", name);
 }
 
 // Combinations: types, gather_axis, quantize_axis, block_size, indices, scale shape vs data shape
diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc
index 4b37b6c9438aa..3b33526a36ec4 100644
--- a/onnxruntime/test/providers/base_tester.cc
+++ b/onnxruntime/test/providers/base_tester.cc
@@ -72,7 +72,8 @@ void BaseTester::AddInitializers(onnxruntime::Graph& graph) {
         tensor_proto.add_string_data(string_data[i]);
       }
     } else {
-      auto buffer_size = tensor.DataType()->Size() * shape.Size();
+      // Use CalculateTensorStorageSize to properly handle sub-byte types (e.g., Int4)
+      auto buffer_size = Tensor::CalculateTensorStorageSize(tensor.DataType(), shape);
       utils::SetRawDataInTensorProto(tensor_proto, tensor.DataRaw(), buffer_size);
     }
 
diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
index 0008b68d14f41..5d09bade3b10b 100644
--- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
@@ -937,6 +937,50 @@ TEST(MathOpTest, Div_uint64) {
   test.Run();
 }
 
+// Runs `test` on the CPU execution provider only and expects the run to fail
+// with the "Integer division by zero" error.
+static void ExpectIntegerDivByZeroOnCpu(OpTester& test) {
+  std::vector<std::unique_ptr<IExecutionProvider>> cpu_only;
+  cpu_only.push_back(DefaultCpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectFailure, "Integer division by zero", {}, nullptr, &cpu_only);
+}
+
+// int8 divisor tensor containing a zero element (opset 14).
+TEST(MathOpTest, Div_int8_by_zero) {
+  OpTester div_test("Div", 14);
+  div_test.AddInput<int8_t>("A", {3}, {4, 8, 8});
+  div_test.AddInput<int8_t>("B", {3}, {1, 0, 2});
+  div_test.AddOutput<int8_t>("C", {3}, {0, 0, 0});
+  ExpectIntegerDivByZeroOnCpu(div_test);
+}
+
+// int32 divisor tensor containing a zero element.
+TEST(MathOpTest, Div_int32_by_zero) {
+  OpTester div_test("Div");
+  div_test.AddInput<int32_t>("A", {3}, {4, 8, 8});
+  div_test.AddInput<int32_t>("B", {3}, {1, 0, 2});
+  div_test.AddOutput<int32_t>("C", {3}, {0, 0, 0});
+  ExpectIntegerDivByZeroOnCpu(div_test);
+}
+
+// Scalar divisor of 0 (the exact scenario from the bug report).
+TEST(MathOpTest, Div_int64_by_zero_scalar) {
+  OpTester div_test("Div");
+  div_test.AddInput<int64_t>("A", {3}, {4, 8, 8});
+  div_test.AddInput<int64_t>("B", {}, {0});
+  div_test.AddOutput<int64_t>("C", {3}, {0, 0, 0});
+  ExpectIntegerDivByZeroOnCpu(div_test);
+}
+
+// Divisor is a constant initializer — validated once at kernel creation time.
+TEST(MathOpTest, Div_int32_by_zero_constant_initializer) {
+  OpTester div_test("Div");
+  div_test.AddInput<int32_t>("A", {3}, {4, 8, 8});
+  div_test.AddInput<int32_t>("B", {3}, {1, 0, 2}, true);  // is_initializer = true
+  div_test.AddOutput<int32_t>("C", {3}, {0, 0, 0});
+  ExpectIntegerDivByZeroOnCpu(div_test);
+}
+
 TEST(MathOpTest, Div_float) {
   OpTester test("Div");
   std::vector<int64_t> dims{2, 3};
diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc
index cf49601e6c671..0136e5e0f8e04 100644
--- a/onnxruntime/test/providers/cpu/model_tests.cc
+++ b/onnxruntime/test/providers/cpu/model_tests.cc
@@ -678,7 +678,13 @@ ::std::vector<::std::basic_string<ORTCHAR_T>> GetParameterStrings() {
                                                      ORT_TSTR("fp16_coreml_FNS-Candy"),
                                                      ORT_TSTR("fp16_test_tiny_yolov2"),
                                                      ORT_TSTR("fp16_test_shufflenet"),
-                                                     ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")};
+                                                     ORT_TSTR("keras2coreml_SimpleRNN_ImageNet"),
+                                                     // models from model zoo. #26274: cuDNN frontend no valid engine
+                                                     ORT_TSTR("YOLOv3"),
+                                                     ORT_TSTR("YOLOv3-12"),
+                                                     ORT_TSTR("YOLOv4"),
+                                                     ORT_TSTR("SSD-MobilenetV1"),
+                                                     ORT_TSTR("SSD-MobilenetV1-12")};
   // For ROCm EP, also disable the following tests due to flakiness,
   // mainly with precision issue and random memory access fault.
   static const ORTCHAR_T* rocm_disabled_tests[] = {ORT_TSTR("bvlc_alexnet"),
diff --git a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc
index 8fdbf0060eaa0..5183cdb352717 100644
--- a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc
@@ -59,6 +59,47 @@ TEST(DequantizeLinearOpTest, Int8_Large) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kWebGpuExecutionProvider});
 }
 
+TEST(DequantizeLinearOpTest, Int4_LargeInitializerInput) {
+  // 1024 zero-valued int4 elements, packed two per Int4x2, fed as an initializer.
+  constexpr size_t kNumElements = 1024;
+  std::vector<int64_t> shape{static_cast<int64_t>(kNumElements)};
+  std::vector<Int4x2> packed_zeros(Int4x2::CalcNumInt4Pairs(kNumElements), Int4x2{});
+  std::vector<float> dequantized_zeros(kNumElements, 0.f);
+  OpTester test("DequantizeLinear", 21);
+  test.AddInput<Int4x2>("x", shape, packed_zeros, /*is_initializer=*/true);
+  test.AddInput<float>("x_scale", {}, {1.0f});
+  test.AddInput<Int4x2>("x_zero_point", {}, {Int4x2(0, 0)});
+  test.AddOutput<float>("y", shape, dequantized_zeros);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+}
+
+// Regression test: int8 tensor whose byte size is not a multiple of 4.
+// DML graph fusion rounds tensor sizes to a multiple of 4 via AlignToPow2.
+// If the original buffer is not padded, the subsequent memcpy reads past the
+// allocation boundary (heap-buffer-overflow detectable with ASan).
+// Mirrors the WebNN PoC: dequantizeLinear with int8[135] (135 % 4 != 0).
+TEST(DequantizeLinearOpTest, Int8_NonAlignedSize_Initializer) {
+  OpTester test("DequantizeLinear", 10);
+  constexpr int64_t kNumElements = 135;  // 135 bytes, AlignToPow2(135,4)=136
+
+  std::vector<int8_t> x_data(static_cast<size_t>(kNumElements));
+  std::vector<float> y_expected(x_data.size());
+  const float scale = 0.5f;
+  const int8_t zero_point = 0;
+  for (size_t i = 0; i < x_data.size(); ++i) {  // size_t index: no int64_t -> size_t narrowing
+    x_data[i] = static_cast<int8_t>(i % 127);
+    y_expected[i] = static_cast<float>(x_data[i] - zero_point) * scale;
+  }
+
+  // Mark all inputs as initializers so they go through DML's ProcessInputData
+  // -> UnpackInitializer -> AlignToPow2 code path during graph fusion.
+  test.AddInput<int8_t>("x", {kNumElements}, x_data, /*is_initializer=*/true);
+  test.AddInput<float>("x_scale", {1}, {scale}, /*is_initializer=*/true);
+  test.AddInput<int8_t>("x_zero_point", {1}, {zero_point}, /*is_initializer=*/true);
+  test.AddOutput<float>("y", {kNumElements}, y_expected);
+
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+}
 // scalar zero & scale with int4
 TEST(DequantizeLinearOpTest, Int4) {
   OpTester test("DequantizeLinear", 21);
@@ -417,6 +458,90 @@ TEST(QuantizeLinearOpTest, Int8) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
+// Regression repro for a new-delete-type-mismatch in the DML EP's graph fusion.
+// A float32->int8 QuantizeLinear over a 5D tensor made ASan report a size
+// mismatch between allocation (192 bytes) and deallocation (1 byte).
+TEST(QuantizeLinearOpTest, Int8_5D_DML_TypeMismatch) {
+  auto dml_provider = DefaultDmlExecutionProvider();
+  if (dml_provider == nullptr) {
+    GTEST_SKIP() << "Skipping because DML EP is not available.";
+  }
+
+  std::vector<int64_t> shape{6, 1, 1, 1, 1};
+  OpTester test("QuantizeLinear", 13);
+  test.AddInput<float>("x", shape, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  test.AddInput<float>("y_scale", {}, {1.0f});
+  test.AddInput<int8_t>("y_zero_point", {}, {0});
+  test.AddOutput<int8_t>("y", shape, {1, 2, 3, 4, 5, 6});
+
+  // Hand the tester a provider list that contains only the DML EP.
+  std::vector<std::unique_ptr<IExecutionProvider>> eps;
+  eps.emplace_back(std::move(dml_provider));
+  test.ConfigEps(std::move(eps)).RunWithConfig();
+}
+
+// Per-axis variant of the 5D float32->int8 repro: scale and zero point are
+// given per channel along axis 0 to exercise the DML graph fusion path.
+TEST(QuantizeLinearOpTest, Int8_5D_PerAxis_DML_TypeMismatch) {
+  auto dml_provider = DefaultDmlExecutionProvider();
+  if (dml_provider == nullptr) {
+    GTEST_SKIP() << "Skipping because DML EP is not available.";
+  }
+
+  std::vector<int64_t> shape{6, 1, 1, 1, 1};
+  OpTester test("QuantizeLinear", 13);
+  test.AddAttribute<int64_t>("axis", 0);
+  test.AddInput<float>("x", shape, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  test.AddInput<float>("y_scale", {6}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
+  test.AddInput<int8_t>("y_zero_point", {6}, {0, 0, 0, 0, 0, 0});
+  test.AddOutput<int8_t>("y", shape, {1, 2, 3, 4, 5, 6});
+
+  // Hand the tester a provider list that contains only the DML EP.
+  std::vector<std::unique_ptr<IExecutionProvider>> eps;
+  eps.emplace_back(std::move(dml_provider));
+  test.ConfigEps(std::move(eps)).RunWithConfig();
+}
+
+// Opset 21 QuantizeLinear (float32 -> uint8) with the optional zero_point
+// input omitted; in that case the output type defaults to uint8.
+TEST(QuantizeLinearOpTest, Uint8_5D_NoZeroPoint_Opset21_DML) {
+  auto dml_provider = DefaultDmlExecutionProvider();
+  if (dml_provider == nullptr) {
+    GTEST_SKIP() << "Skipping because DML EP is not available.";
+  }
+
+  std::vector<int64_t> shape{6, 1, 1, 1, 1};
+  OpTester test("QuantizeLinear", 21);
+  test.AddInput<float>("x", shape, {0.0f, 51.0f, 102.0f, 153.0f, 204.0f, 255.0f});
+  test.AddInput<float>("y_scale", {}, {1.0f});
+  test.AddOutput<uint8_t>("y", shape, {0, 51, 102, 153, 204, 255});
+
+  // Hand the tester a provider list that contains only the DML EP.
+  std::vector<std::unique_ptr<IExecutionProvider>> eps;
+  eps.emplace_back(std::move(dml_provider));
+  test.ConfigEps(std::move(eps)).RunWithConfig();
+}
+
+// Opset 21 QuantizeLinear (float32 -> int8) with an explicit zero_point: the customer's exact scenario.
+TEST(QuantizeLinearOpTest, Int8_5D_WithZeroPoint_Opset21_DML) {
+  auto dml_provider = DefaultDmlExecutionProvider();
+  if (dml_provider == nullptr) {
+    GTEST_SKIP() << "Skipping because DML EP is not available.";
+  }
+
+  std::vector<int64_t> shape{6, 1, 1, 1, 1};
+  OpTester test("QuantizeLinear", 21);
+  test.AddInput<float>("x", shape, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  test.AddInput<float>("y_scale", {}, {1.0f});
+  test.AddInput<int8_t>("y_zero_point", {}, {0});
+  test.AddOutput<int8_t>("y", shape, {1, 2, 3, 4, 5, 6});
+
+  // Hand the tester a provider list that contains only the DML EP.
+  std::vector<std::unique_ptr<IExecutionProvider>> eps;
+  eps.emplace_back(std::move(dml_provider));
+  test.ConfigEps(std::move(eps)).RunWithConfig();
+}
+
 // Test uint16 QuantizeLinear (per tensor)
 TEST(QuantizeLinearOpTest, Uint16) {
   OpTester test("QuantizeLinear", 21);
diff --git a/onnxruntime/test/providers/dml_safe_make_or_throw_test.cc b/onnxruntime/test/providers/dml_safe_make_or_throw_test.cc
new file mode 100644
index 0000000000000..8041f0dae8c28
--- /dev/null
+++ b/onnxruntime/test/providers/dml_safe_make_or_throw_test.cc
@@ -0,0 +1,139 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#ifdef USE_DML
+
+#include "gtest/gtest.h"
+
+#include <wrl/implements.h>
+#include <wrl/client.h>
+#include "core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h"
+
+#include <stdexcept>
+
+namespace onnxruntime {
+namespace test {
+
+// Minimal COM interface with a test-only IID; the helper classes in this file implement it.
+MIDL_INTERFACE("A1B2C3D4-E5F6-7890-ABCD-EF1234567890")
+ITestInterface : public IUnknown {
+  virtual int STDMETHODCALLTYPE GetValue() = 0;  // Value is chosen by the implementing class.
+};
+
+// A RuntimeClass whose constructor only stores its argument; GetValue() returns it.
+class SucceedingClass : public Microsoft::WRL::RuntimeClass<
+                            Microsoft::WRL::RuntimeClassFlags<Microsoft::WRL::ClassicCom>, ITestInterface> {
+ public:
+  int value;  // Set once by the constructor and returned by GetValue().
+
+  SucceedingClass(int v) : value(v) {}
+
+  int STDMETHODCALLTYPE GetValue() override { return value; }
+};
+
+// A RuntimeClass that reports its destruction through a caller-owned flag.
+class TrackedClass : public Microsoft::WRL::RuntimeClass<
+                         Microsoft::WRL::RuntimeClassFlags<Microsoft::WRL::ClassicCom>, ITestInterface> {
+ public:
+  bool& destroyed;  // Refers to the caller's flag, so the flag must outlive this object.
+
+  TrackedClass(bool& flag) : destroyed(flag) { destroyed = false; }  // Clears the flag.
+  ~TrackedClass() { destroyed = true; }                              // Sets the flag.
+
+  int STDMETHODCALLTYPE GetValue() override { return 42; }
+};
+
+// A RuntimeClass whose constructor always throws std::runtime_error.
+// Before throwing it stores a ref-counted TrackedClass witness in a ComPtr
+// member, so the caller's flag records whether that member was cleaned up.
+class ThrowingClass : public Microsoft::WRL::RuntimeClass<
+                          Microsoft::WRL::RuntimeClassFlags<Microsoft::WRL::ClassicCom>, ITestInterface> {
+ public:
+  Microsoft::WRL::ComPtr<TrackedClass> witness;
+
+  ThrowingClass(bool& witness_destroyed) {
+    // The witness must be assigned before the throw: the test relies on this
+    // already-constructed member being destroyed during stack unwinding.
+    witness = Dml::SafeMakeOrThrow<TrackedClass>(witness_destroyed);
+    throw std::runtime_error("intentional throw");
+  }
+
+  int STDMETHODCALLTYPE GetValue() override { return -1; }  // Unreachable in practice: construction always throws.
+};
+
+// A freshly created object must carry exactly one reference, the one held by
+// the ComPtr it is assigned to.
+TEST(SafeMakeOrThrowTest, SuccessPath_RefCountIsOne) {
+  Microsoft::WRL::ComPtr<SucceedingClass> instance = Dml::SafeMakeOrThrow<SucceedingClass>(123);
+
+  ASSERT_NE(instance.Get(), nullptr);
+  EXPECT_EQ(instance->GetValue(), 123);
+
+  // AddRef() and Release() each return the updated count, so a 1 -> 2 -> 1
+  // round trip reveals the starting count without leaking a reference.
+  const unsigned long count_after_add_ref = instance->AddRef();
+  const unsigned long count_after_release = instance->Release();
+  EXPECT_EQ(count_after_add_ref, 2u);
+  EXPECT_EQ(count_after_release, 1u);
+}
+
+// The object must be destroyed as soon as its only ComPtr lets go of it.
+TEST(SafeMakeOrThrowTest, SuccessPath_DestructorRunsOnRelease) {
+  bool was_destroyed = false;
+  auto sole_owner = Dml::SafeMakeOrThrow<TrackedClass>(was_destroyed);
+  EXPECT_FALSE(was_destroyed);
+
+  // Dropping the only reference should run ~TrackedClass() right away.
+  sole_owner.Reset();
+  EXPECT_TRUE(was_destroyed);
+}
+
+// A copied ComPtr shares ownership: the object must outlive the original
+// pointer and be destroyed only when the final reference is dropped.
+TEST(SafeMakeOrThrowTest, SuccessPath_MultipleReferences) {
+  bool was_destroyed = false;
+  auto first_ref = Dml::SafeMakeOrThrow<TrackedClass>(was_destroyed);
+  Microsoft::WRL::ComPtr<TrackedClass> second_ref = first_ref;
+  EXPECT_FALSE(was_destroyed);
+
+  // Release the original reference; the copy still keeps the object alive.
+  first_ref.Reset();
+  EXPECT_FALSE(was_destroyed);
+
+  // Releasing the last reference destroys the object.
+  second_ref.Reset();
+  EXPECT_TRUE(was_destroyed);
+}
+
+// When T's constructor throws, the exception must reach the caller and the
+// members T had already constructed must be cleaned up (no leak).
+TEST(SafeMakeOrThrowTest, FailurePath_ConstructorThrows) {
+  bool witness_gone = false;
+  // ThrowingClass stores a TrackedClass witness in a ComPtr member and then
+  // throws std::runtime_error from its constructor.
+  EXPECT_THROW(Dml::SafeMakeOrThrow<ThrowingClass>(witness_gone), std::runtime_error);
+
+  // Correct unwinding destroys that ComPtr member, which releases the
+  // witness's only reference, so the flag must be set by now.
+  EXPECT_TRUE(witness_gone);
+}
+
+// Verify that QI works correctly on a SafeMakeOrThrow-created object.
+TEST(SafeMakeOrThrowTest, SuccessPath_QueryInterface) {
+  auto obj = Dml::SafeMakeOrThrow<SucceedingClass>(42);
+
+  Microsoft::WRL::ComPtr<IUnknown> unk;
+  HRESULT hr = obj.As(&unk);
+  ASSERT_EQ(hr, S_OK);  // Fatal on failure: `unk` is dereferenced by the next As() call.
+  ASSERT_NE(unk.Get(), nullptr);
+
+  Microsoft::WRL::ComPtr<ITestInterface> iface;
+  hr = unk.As(&iface);
+  ASSERT_EQ(hr, S_OK);  // Fatal on failure: `iface` is dereferenced on the next line.
+  EXPECT_EQ(iface->GetValue(), 42);
+}
+
+}  // namespace test
+}  // namespace onnxruntime
+
+#endif  // USE_DML