diff --git a/.github/actions/linux-web-init-and-check/action.yml b/.github/actions/linux-web-init-and-check/action.yml index c250f368a953e..694f026d07d0d 100644 --- a/.github/actions/linux-web-init-and-check/action.yml +++ b/.github/actions/linux-web-init-and-check/action.yml @@ -4,7 +4,7 @@ runs: using: "composite" steps: - name: Setup Node.js - uses: actions/setup-node@v3 + uses: actions/setup-node@v5 with: node-version: "22.x" diff --git a/.github/actions/locate-vcvarsall-and-setup-env/action.yml b/.github/actions/locate-vcvarsall-and-setup-env/action.yml index c4fdc48a7bd63..fba855f14b487 100644 --- a/.github/actions/locate-vcvarsall-and-setup-env/action.yml +++ b/.github/actions/locate-vcvarsall-and-setup-env/action.yml @@ -16,8 +16,8 @@ runs: - name: Setup VCPKG uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9 with: - vcpkg-version: '2025.06.13' - vcpkg-hash: '735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc' + vcpkg-version: '2025.08.27' + vcpkg-hash: '9a4b32849792e13bee1d24726f073b3881acae4165206ddf1a6378e44a4ddd05b3ee93f55ff46d8e8873b3cbcd06606212989e248f0bd615a5bf365070074079' cmake-version: '3.31.6' cmake-hash: '0f1584e8666cf4a65ec514bd02afe281caabf1d45d2c963f3151c41484f457386aa03273ab25776a670be02725354ce0b46f3a5121857416da37366342a833a0' add-cmake-to-path: 'true' diff --git a/.github/actions/macos-ci-setup/action.yml b/.github/actions/macos-ci-setup/action.yml index b3b95b855526f..5c6eb6193c393 100644 --- a/.github/actions/macos-ci-setup/action.yml +++ b/.github/actions/macos-ci-setup/action.yml @@ -31,7 +31,7 @@ runs: using: "composite" steps: - name: Use Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ inputs.python_version }} @@ -43,7 +43,7 @@ runs: assert platform.machine().lower() == "${{ inputs.platform_machine}}", "This job expects to be run on an ${{ inputs.platform_machine}} machine." - name: Use Node.js - uses: actions/setup-node@v4 + uses: actions/setup-node@v5 with: node-version: ${{ inputs.node_version }} @@ -52,7 +52,7 @@ runs: run: brew install coreutils ninja - name: Install Java - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: distribution: "temurin" java-version: ${{ inputs.java_version }} diff --git a/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml b/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml index c30a8cb023f50..e36ecd505fc21 100644 --- a/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml +++ b/.github/workflows/linux-wasm-ci-build-and-test-workflow.yml @@ -4,6 +4,9 @@ description: "This is a reusable workflow for Linux WASM CI pipelines to build a on: workflow_call: inputs: + job_name: + required: true + type: string build_config: required: true type: string @@ -37,16 +40,16 @@ jobs: runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"] env: buildArch: x64 - common_build_args: --parallel ${{ inputs.use_vcpkg == true && '--use_vcpkg --use_vcpkg_ms_internal_asset_cache' || '' }} --config ${{ inputs.build_config }} --skip_submodule_sync --build_wasm --enable_wasm_simd ${{ inputs.enable_wasm_threads == true && '--enable_wasm_threads' || '' }} ${{ inputs.extra_build_args }} + common_build_args: --parallel --use_cache ${{ inputs.use_vcpkg == true && '--use_vcpkg --use_vcpkg_ms_internal_asset_cache' || '' }} --config ${{ inputs.build_config }} --skip_submodule_sync --build_wasm --enable_wasm_simd ${{ inputs.enable_wasm_threads == true && '--enable_wasm_threads' || '' }} ${{ inputs.extra_build_args }} steps: - name: Checkout code - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: submodules: recursive - name: Set up Node.js - uses: actions/setup-node@v4 + uses: actions/setup-node@v6 with: node-version: "22" @@ -56,8 +59,25 @@ jobs: python-version: "3.12" architecture: ${{ env.buildArch }} - - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9 + - name: Install python dependencies + run: python -m pip install flatbuffers + + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | web.yml | ${{ inputs.job_name }} + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | web.yml | ${{ inputs.job_name }} + path: ~/.cache/vcpkg + + - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: + ccache-version: 4.13.1 + ccache-hash: 626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee vcpkg-version: '2025.06.13' vcpkg-hash: '735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc' cmake-version: '3.31.6' @@ -114,7 +134,7 @@ jobs: - name: Upload WASM artifacts if: ${{ inputs.skip_publish != true }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: ${{ inputs.build_config }}_wasm path: ${{ github.workspace }}/artifacts/wasm @@ -143,7 +163,7 @@ jobs: - name: Publish test results if: ${{ always() && inputs.build_config == 'Debug' }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: test-results path: ${{ github.workspace }}/build/**/*.results.xml diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index 6f517f2656e94..9aa8418c55a40 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -48,6 +48,7 @@ jobs: dockerfile_path: tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile docker_image_repo: onnxruntimecpubuildcix64 extra_build_flags: '--enable_address_sanitizer' + job_identifier: build-linux-x64-debug # python_path_prefix: '' # Default empty string is fine, no prefix needed secrets: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -63,6 +64,7 @@ jobs: docker_image_repo: onnxruntimecpubuildpythonx64 extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --build_nuget --enable_transformers_tool_test --cmake_extra_defines onnxruntime_BUILD_BENCHMARKS=ON' python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' # $ needs escaping in single quotes + job_identifier: build-linux-x64-release secrets: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -77,6 +79,7 @@ jobs: docker_image_repo: onnxruntimecpubuildpythonx64 # Shares image with standard x64 release extra_build_flags: '--enable_training --use_binskim_compliant_compile_flags --build_wheel --build_nuget --enable_transformers_tool_test --cmake_extra_defines onnxruntime_BUILD_BENCHMARKS=ON' python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' # $ needs escaping in single quotes + job_identifier: orttraining-linux-ci-pipeline secrets: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -92,6 +95,7 @@ jobs: docker_image_repo: onnxruntimecpubuildciaarch64 # ASan disabled due to excessive runtime (>4hr). Includes wheel build for basic checks. extra_build_flags: '--use_binskim_compliant_compile_flags --build_shared_lib' + job_identifier: build-linux-arm64-debug secrets: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -106,5 +110,6 @@ jobs: docker_image_repo: onnxruntimecpubuildpythonaarch64 extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cmake_extra_defines onnxruntime_BUILD_BENCHMARKS=ON' python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' # $ needs escaping in single quotes + job_identifier: build-linux-arm64-release secrets: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/linux_minimal_build.yml b/.github/workflows/linux_minimal_build.yml index 92cdbb70e9858..655921342ae00 100644 --- a/.github/workflows/linux_minimal_build.yml +++ b/.github/workflows/linux_minimal_build.yml @@ -29,16 +29,30 @@ jobs: packages: write steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: submodules: false - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: node-version: 20 - - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9 + - name: Setup CCache + uses: actions/cache@v4 with: + key: ccache | linux_minimal_build.yml | build_full_ort + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | linux_minimal_build.yml | build_full_ort + path: ~/.cache/vcpkg + + - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 + with: + ccache-version: 4.13.1 + ccache-hash: 626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee vcpkg-version: '2025.06.13' vcpkg-hash: '735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc' cmake-version: '3.31.6' @@ -47,10 +61,10 @@ jobs: disable-terrapin: 'true' - name: Build Full ORT and Prepare Test Files - uses: microsoft/onnxruntime-github-actions/build-and-prep-ort-files@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-and-prep-ort-files@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 - name: Upload Test Data Artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: test_data path: ${{ runner.temp }}/minimal_build_test_data/ @@ -66,15 +80,27 @@ jobs: id-token: write # If using OIDC for ACR login steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: submodules: false - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: node-version: 20 + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | linux_minimal_build.yml | build_minimal_exceptions_disabled + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | linux_minimal_build.yml | build_minimal_exceptions_disabled + path: ~/.cache/vcpkg + - name: Get Docker Image using Action - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -85,10 +111,9 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Run Build 2 (Update) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: - docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name - }} + docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: Debug # From original --config Debug mode: 'update' # CMake configure step extra_build_flags: >- @@ -100,10 +125,9 @@ jobs: --enable_training_ops - name: Run Build 2 (Build) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: - docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name - }} + docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: Debug # From original --config Debug mode: 'build' # Actual build step extra_build_flags: >- @@ -125,15 +149,29 @@ jobs: id-token: write # If using OIDC for ACR login steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: submodules: false - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: node-version: 20 - - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9 + - name: Setup CCache + uses: actions/cache@v4 with: + key: ccache | linux_minimal_build.yml | build_minimal_custom_ops + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | linux_minimal_build.yml | build_minimal_custom_ops + path: ~/.cache/vcpkg + + - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 + with: + ccache-version: 4.13.1 + ccache-hash: 626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee vcpkg-version: '2025.06.13' vcpkg-hash: '735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc' cmake-version: '3.31.6' @@ -142,7 +180,7 @@ jobs: disable-terrapin: 'true' - name: Build Full ORT and Prepare Test Files - uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: reduced-ops-config-file: required_ops.ort_models.config enable-custom-ops: 'true' @@ -159,23 +197,38 @@ jobs: id-token: write # If using OIDC for ACR login steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: submodules: false - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: node-version: 20 - - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9 + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | linux_minimal_build.yml | build_minimal_type_reduction + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 with: + key: vcpkg-cache | linux_minimal_build.yml | build_minimal_type_reduction + path: ~/.cache/vcpkg + + - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 + with: + ccache-version: 4.13.1 + ccache-hash: 626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee vcpkg-version: '2025.06.13' vcpkg-hash: '735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc' cmake-version: '3.31.6' cmake-hash: '42395e20b10a8e9ef3e33014f9a4eed08d46ab952e02d2c1bbc8f6133eca0d7719fb75680f9bbff6552f20fcd1b73d86860f7f39388d631f98fb6f622b37cf04' add-cmake-to-path: 'true' disable-terrapin: 'true' + - name: Build Full ORT and Prepare Test Files - uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: reduced-ops-config-file: required_ops_and_types.ort_models.config enable-type-reduction: 'true' @@ -191,15 +244,29 @@ jobs: id-token: write # If using OIDC for ACR login steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: submodules: false - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: node-version: 20 - - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9 + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | linux_minimal_build.yml | build_minimal_globally_allowed_types + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 with: + key: vcpkg-cache | linux_minimal_build.yml | build_minimal_globally_allowed_types + path: ~/.cache/vcpkg + + - uses: microsoft/onnxruntime-github-actions/setup-build-tools@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 + with: + ccache-version: 4.13.1 + ccache-hash: 626407a9b81dd86f8ec9867bff396b32dd1f00344f5b323526579a64f6d4104927f83e8d7a05ad9806fd78f4491e0adb4cff73388000a62050cb1b00766214ee vcpkg-version: '2025.06.13' vcpkg-hash: '735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc' cmake-version: '3.31.6' @@ -208,7 +275,7 @@ jobs: disable-terrapin: 'true' - name: Build Full ORT and Prepare Test Files - uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-minimal-ort-and-run-tests@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: globally_allowed_types: 'bool,float,int8_t,uint8_t' enable-type-reduction: 'true' @@ -225,15 +292,27 @@ jobs: id-token: write # If using OIDC for ACR login steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: submodules: false - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: node-version: 20 + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | linux_minimal_build.yml | build_extended_minimal + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | linux_minimal_build.yml | build_extended_minimal + path: ~/.cache/vcpkg + - name: Get Docker Image using Action - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -243,12 +322,10 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Run Build 5 (Update) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: - docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name - }} + docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: Debug mode: 'update' extra_build_flags: >- @@ -258,7 +335,7 @@ jobs: --minimal_build extended - name: Run Build 5 (Build) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} @@ -270,7 +347,7 @@ jobs: --use_binskim_compliant_compile_flags --minimal_build extended - name: Run Build 5 (Test) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} @@ -292,12 +369,12 @@ jobs: id-token: write # If using OIDC for ACR login steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: submodules: false - name: Get Docker Image using Action - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -313,8 +390,20 @@ jobs: mkdir -p ${{ runner.temp }}/.test_data touch ${{ runner.temp }}/.test_data/include_no_operators.config + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | linux_minimal_build.yml | build_regular_no_optional + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | linux_minimal_build.yml | build_regular_no_optional + path: ~/.cache/vcpkg + - name: Run Build 6a (Update) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} @@ -330,7 +419,7 @@ jobs: --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF - name: Run Build 6a (Build) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} @@ -347,7 +436,7 @@ jobs: - name: Run Build 6a (Test) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} @@ -372,7 +461,7 @@ jobs: id-token: write # If using OIDC for ACR login steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: submodules: false @@ -383,7 +472,7 @@ jobs: touch ${{ runner.temp }}/.test_data/include_no_operators.config - name: Get Docker Image using Action - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -393,11 +482,22 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | linux_minimal_build.yml | build_minimal_no_optional + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | linux_minimal_build.yml | build_minimal_no_optional + path: ~/.cache/vcpkg + - name: Run Build 6b (Update) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: - docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name - }} + docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} build_config: MinSizeRel # From original --config MinSizeRel mode: 'update' extra_build_flags: >- @@ -413,7 +513,7 @@ jobs: --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF - name: Run Build 6b (Build) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} @@ -441,7 +541,7 @@ jobs: id-token: write # If using OIDC for ACR login steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: submodules: false @@ -452,7 +552,7 @@ jobs: touch ${{ runner.temp }}/.test_data/include_no_operators.config - name: Get Docker Image using Action - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -468,8 +568,20 @@ jobs: mkdir -p ${{ runner.temp }}/.test_data touch ${{ runner.temp }}/.test_data/include_no_operators.config + - name: Setup CCache + uses: actions/cache@v4 + with: + key: ccache | linux_minimal_build.yml | build_extended_minimal_no_optional + path: ~/.cache/ccache + + - name: Setup VCPKG Cache + uses: actions/cache@v4 + with: + key: vcpkg-cache | linux_minimal_build.yml | build_extended_minimal_no_optional + path: ~/.cache/vcpkg + - name: Run Build 6c (Update) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} @@ -488,7 +600,7 @@ jobs: --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF - name: Run Build 6c (Build) - uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@v0.0.9 + uses: microsoft/onnxruntime-github-actions/run-build-script-in-docker@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 with: docker_image: ${{ steps.build_docker_image_step.outputs.full-image-name }} @@ -518,10 +630,10 @@ jobs: id-token: write # If using OIDC for ACR login steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: submodules: false - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: node-version: 20 - name: Download Test Data Artifact @@ -531,7 +643,7 @@ jobs: path: ${{ runner.temp }}/.test_data/ - name: Get Docker Image using Action - uses: microsoft/onnxruntime-github-actions/build-docker-image@v0.0.9 + uses: microsoft/onnxruntime-github-actions/build-docker-image@8bad63a3c05d448311dfa8e5f531171c97471aa1 # v0.0.12 id: build_docker_image_step with: dockerfile: ${{ github.workspace }}/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index af2b36c870201..4775d92367930 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -60,15 +60,15 @@ jobs: matrix: target_arch: [x86_64, arm64] - timeout-minutes: 90 + timeout-minutes: 120 steps: - name: Checkout code - uses: actions/checkout@v5 + uses: actions/checkout@v6 - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9 with: - vcpkg-version: '2025.06.13' - vcpkg-hash: 735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc + vcpkg-version: '2025.08.27' + vcpkg-hash: 9a4b32849792e13bee1d24726f073b3881acae4165206ddf1a6378e44a4ddd05b3ee93f55ff46d8e8873b3cbcd06606212989e248f0bd615a5bf365070074079 cmake-version: '3.31.8' cmake-hash: 99cc9c63ae49f21253efb5921de2ba84ce136018abf08632c92c060ba91d552e0f6acc214e9ba8123dee0cf6d1cf089ca389e321879fd9d719a60d975bcffcc8 add-cmake-to-path: 'true' @@ -112,11 +112,11 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v5 + uses: actions/checkout@v6 - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9 with: - vcpkg-version: '2025.06.13' - vcpkg-hash: 735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc + vcpkg-version: '2025.08.27' + vcpkg-hash: 9a4b32849792e13bee1d24726f073b3881acae4165206ddf1a6378e44a4ddd05b3ee93f55ff46d8e8873b3cbcd06606212989e248f0bd615a5bf365070074079 cmake-version: '3.31.8' cmake-hash: 99cc9c63ae49f21253efb5921de2ba84ce136018abf08632c92c060ba91d552e0f6acc214e9ba8123dee0cf6d1cf089ca389e321879fd9d719a60d975bcffcc8 add-cmake-to-path: 'true' diff --git a/.github/workflows/macos-ci-build-and-test-workflow.yml b/.github/workflows/macos-ci-build-and-test-workflow.yml index 281538336b0c1..1583dd127886c 100644 --- a/.github/workflows/macos-ci-build-and-test-workflow.yml +++ b/.github/workflows/macos-ci-build-and-test-workflow.yml @@ -61,11 +61,11 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v5 + uses: actions/checkout@v6 - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9 with: - vcpkg-version: '2025.06.13' - vcpkg-hash: 735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc + vcpkg-version: '2025.08.27' + vcpkg-hash: 9a4b32849792e13bee1d24726f073b3881acae4165206ddf1a6378e44a4ddd05b3ee93f55ff46d8e8873b3cbcd06606212989e248f0bd615a5bf365070074079 cmake-version: '3.31.8' cmake-hash: 99cc9c63ae49f21253efb5921de2ba84ce136018abf08632c92c060ba91d552e0f6acc214e9ba8123dee0cf6d1cf089ca389e321879fd9d719a60d975bcffcc8 add-cmake-to-path: 'true' diff --git a/.github/workflows/reusable_linux_build.yml b/.github/workflows/reusable_linux_build.yml index 1a9c0e0a72031..8f4cf9a26bf46 100644 --- a/.github/workflows/reusable_linux_build.yml +++ b/.github/workflows/reusable_linux_build.yml @@ -58,6 +58,11 @@ on: required: false type: boolean default: false + job_identifier: + description: 'A unique identifier for the job, used for hosted pool tracking' + required: false + type: string + default: 'linux-build' secrets: GH_TOKEN: description: 'GitHub token for accessing actions/packages' @@ -68,6 +73,7 @@ jobs: runs-on: - self-hosted - "1ES.Pool=${{ inputs.pool_name }}" + - "JobId=${{ inputs.job_identifier }}-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}" permissions: contents: read packages: write @@ -75,9 +81,10 @@ jobs: id-token: write steps: - name: Checkout code - uses: actions/checkout@v5 + uses: actions/checkout@v6 - name: Set up Python ${{ inputs.python_version }} + if: inputs.architecture != 'arm64' uses: actions/setup-python@v6 with: python-version: ${{ inputs.python_version }} @@ -163,7 +170,7 @@ jobs: # ------------- Upload Build Output Step ------------- - name: Upload Build Output Artifact if: inputs.upload_build_output == true - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: build-output-${{ inputs.architecture }}-${{ inputs.build_config }} path: ${{ runner.temp }}/${{ inputs.build_config }} @@ -172,7 +179,7 @@ jobs: # ------------- Upload Log on Build Failure Step ------------- - name: Upload VCPKG Manifest Install Log on Update or Build Failure if: steps.update_step.outcome == 'failure' || steps.build_step.outcome == 'failure' - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: vcpkg-manifest-install-log-${{ inputs.architecture }}-${{ inputs.build_config }} path: ${{ runner.temp }}/${{ inputs.build_config }}/${{ inputs.build_config }}/vcpkg-manifest-install.log diff --git a/.github/workflows/web.yml b/.github/workflows/web.yml index 616c2c6db8a8d..49fd7202cb86d 100644 --- a/.github/workflows/web.yml +++ b/.github/workflows/web.yml @@ -22,7 +22,7 @@ jobs: commit_sha: ${{ steps.extract_commit.outputs.commit_sha }} steps: - name: Checkout code - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: submodules: true @@ -38,6 +38,7 @@ jobs: needs: precheck uses: ./.github/workflows/linux-wasm-ci-build-and-test-workflow.yml with: + job_name: wasm_Debug build_config: Debug extra_build_args: "--enable_wasm_profiling" build_jsep: true @@ -47,6 +48,7 @@ jobs: needs: precheck uses: ./.github/workflows/linux-wasm-ci-build-and-test-workflow.yml with: + job_name: wasm_Release build_config: Release extra_build_args: "--target onnxruntime_webassembly --skip_tests --enable_wasm_api_exception_catching --disable_rtti" build_jsep: true @@ -56,6 +58,7 @@ jobs: needs: precheck uses: ./.github/workflows/linux-wasm-ci-build-and-test-workflow.yml with: + job_name: wasm_Release_static_library build_config: Release extra_build_args: "--skip_tests --enable_wasm_api_exception_catching --disable_rtti --build_wasm_static_lib" use_vcpkg: false @@ -68,6 +71,7 @@ jobs: - wasm_Debug uses: ./.github/workflows/windows-web-ci-workflow.yml with: + job_name: web_Debug commit_override: ${{ needs.precheck.outputs.commit_sha }} build_config: Debug @@ -77,5 +81,6 @@ jobs: - wasm_Release uses: ./.github/workflows/windows-web-ci-workflow.yml with: + job_name: web_Release commit_override: ${{ needs.precheck.outputs.commit_sha }} build_config: Release diff --git a/.github/workflows/windows-web-ci-workflow.yml b/.github/workflows/windows-web-ci-workflow.yml index 0ea8b3ee33644..abc46681e8220 100644 --- a/.github/workflows/windows-web-ci-workflow.yml +++ b/.github/workflows/windows-web-ci-workflow.yml @@ -4,6 +4,9 @@ description: "Windows Web CI pipeline for building and testing ONNX Runtime Web" on: workflow_call: inputs: + job_name: + required: true + type: string commit_override: type: string default: "" @@ -19,7 +22,11 @@ on: jobs: build_onnxruntime_web: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Win2022-WEBGPU-A10"] + runs-on: [ + "self-hosted", + "1ES.Pool=onnxruntime-github-Win2022-WEBGPU-A10", + "JobId=build_onnxruntime_web-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}" + ] env: webgpu_commandline_extra_flags: "--chromium-flags=--ignore-gpu-blocklist --chromium-flags=--gpu-vendor-id=0x10de" @@ -29,7 +36,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: submodules: false @@ -62,12 +69,12 @@ jobs: git checkout -- .gitattributes - name: Setup Node.js - uses: actions/setup-node@v4 + uses: actions/setup-node@v6 with: node-version: "20.x" - name: Download WebAssembly artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v7 with: name: ${{ inputs.build_config }}_wasm path: ${{ github.workspace }}/artifacts_wasm @@ -95,7 +102,7 @@ jobs: run: npm ci working-directory: ${{ github.workspace }}/js/web - - uses: actions/cache@v4 + - uses: actions/cache@v5 id: onnx-node-tests-cache with: path: ${{ github.workspace }}/js/test/ @@ -173,7 +180,7 @@ jobs: # this step is added to help investigate the shader validation failure which is hard to reproduce - name: Upload WebGPU shader validation log on failure if: ${{ failure() && inputs.build_config == 'Debug' }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: webgpu-shader-validation-logs path: ${{ runner.temp }}\web\test\07\chrome_debug.log @@ -203,7 +210,7 @@ jobs: - name: Upload NPM packages if: ${{ inputs.build_config == 'Release' }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: ${{ inputs.package_name }} path: ${{ github.workspace }}\artifacts_npm diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml index 0b1bf59733349..8daacd79e9040 100644 --- a/.github/workflows/windows_cuda.yml +++ b/.github/workflows/windows_cuda.yml @@ -19,9 +19,13 @@ concurrency: jobs: build: name: Windows GPU CUDA CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: [ + "self-hosted", + "1ES.Pool=onnxruntime-github-vs2022-latest", + "JobId=windows-cuda-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}" + ] steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: fetch-depth: 0 submodules: 'none' @@ -41,10 +45,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . dir shell: pwsh @@ -52,21 +56,21 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: node-version: '20.x' - - uses: actions/setup-java@v4 + - uses: actions/setup-java@v5 with: distribution: 'temurin' java-version: '17' architecture: x64 - - uses: actions/cache@v4 + - uses: actions/cache@v5 id: onnx-node-tests-cache with: path: ${{ github.workspace }}/js/test/ @@ -82,7 +86,7 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - uses: actions/setup-dotnet@v4 + - uses: actions/setup-dotnet@v5 env: PROCESSOR_ARCHITECTURE: x64 with: @@ -111,7 +115,7 @@ jobs: exit $lastExitCode } # Execute the build process - python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --update --build --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -132,7 +136,7 @@ jobs: shell: pwsh - name: Upload build artifacts - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: build-artifacts path: ${{ runner.temp }}\build @@ -150,15 +154,19 @@ jobs: name: Windows GPU CUDA CI Pipeline Test Job needs: build timeout-minutes: 300 - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Win2022-GPU-A10"] + runs-on: [ + "self-hosted", + "1ES.Pool=onnxruntime-github-Win2022-GPU-A10", + "JobId=windows-cuda-test-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}" + ] steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: fetch-depth: 0 submodules: 'none' - name: Download build artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v7 with: name: build-artifacts path: ${{ runner.temp }}\build @@ -168,11 +176,11 @@ jobs: python-version: '3.12' architecture: x64 - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: node-version: '20.x' - - uses: actions/setup-java@v4 + - uses: actions/setup-java@v5 with: distribution: 'temurin' java-version: '17' @@ -188,10 +196,10 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . dir shell: pwsh @@ -199,9 +207,9 @@ jobs: shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" - name: Set OnnxRuntimeBuildDirectory shell: pwsh @@ -227,7 +235,7 @@ jobs: exit $lastExitCode } - python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.2" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + python.exe ${{ github.workspace }}\tools\ci_build\build.py --test --config RelWithDebInfo --build_dir build --skip_submodule_sync --build_csharp --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --cmake_generator "Visual Studio 17 2022" --build_shared_lib --build_wheel --build_java --use_cuda --cuda_home="$env:RUNNER_TEMP\v12.8" --enable_cuda_profiling --use_vcpkg --use_vcpkg_ms_internal_asset_cache --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_qnn_x64.yml b/.github/workflows/windows_qnn_x64.yml index 4c08d543cefd9..fa07fd47d87e9 100644 --- a/.github/workflows/windows_qnn_x64.yml +++ b/.github/workflows/windows_qnn_x64.yml @@ -18,7 +18,7 @@ concurrency: jobs: build_test_qnn_ep: name: Windows x64 QNN CI Pipeline (${{ matrix.QnnLibKind }}) - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-latest"] timeout-minutes: 120 strategy: matrix: diff --git a/.github/workflows/windows_tensorrt.yml b/.github/workflows/windows_tensorrt.yml index de6fa1529bcb1..5eb08a369cb87 100644 --- a/.github/workflows/windows_tensorrt.yml +++ b/.github/workflows/windows_tensorrt.yml @@ -19,9 +19,13 @@ concurrency: jobs: build: name: Windows GPU TensorRT CI Pipeline - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] + runs-on: [ + "self-hosted", + "1ES.Pool=onnxruntime-github-vs2022-latest", + "JobId=windows-tensorrt-build-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}" + ] steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: fetch-depth: 0 submodules: 'none' @@ -41,37 +45,38 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . dir shell: pwsh - - name: Download TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8 - run: 'azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" ${{ runner.temp }}' + - name: Download TensorRT-10.14.1.48.Windows.win10.cuda-12.9 + run: 'azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.14.1.48.Windows.win10.cuda-12.9" ${{ runner.temp }}' shell: pwsh - name: Add CUDA to PATH shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.14.1.48.Windows.win10.cuda-12.9\lib" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.14.1.48.Windows.win10.cuda-12.9\bin" - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: node-version: '20.x' - - uses: actions/setup-java@v4 + - uses: actions/setup-java@v5 with: distribution: 'temurin' java-version: '17' architecture: x64 - - uses: actions/cache@v4 + - uses: actions/cache@v5 id: onnx-node-tests-cache with: path: ${{ github.workspace }}/js/test/ @@ -87,7 +92,7 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - uses: actions/setup-dotnet@v4 + - uses: actions/setup-dotnet@v5 env: PROCESSOR_ARCHITECTURE: x64 with: @@ -116,7 +121,7 @@ jobs: exit $lastExitCode } # Execute the build process - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --build --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } @@ -137,7 +142,7 @@ jobs: shell: pwsh - name: Upload build artifacts - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: build-artifacts path: ${{ runner.temp }}\build @@ -155,15 +160,19 @@ jobs: name: Windows GPU TensorRT CI Pipeline Test Job needs: build timeout-minutes: 300 - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Win2022-GPU-A10"] + runs-on: [ + "self-hosted", + "1ES.Pool=onnxruntime-github-Win2022-GPU-A10", + "JobId=windows-tensorrt-test-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}" + ] steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 with: fetch-depth: 0 submodules: 'none' - name: Download build artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v7 with: name: build-artifacts path: ${{ runner.temp }}\build @@ -173,11 +182,11 @@ jobs: python-version: '3.12' architecture: x64 - - uses: actions/setup-node@v4 + - uses: actions/setup-node@v6 with: node-version: '20.x' - - uses: actions/setup-java@v4 + - uses: actions/setup-java@v5 with: distribution: 'temurin' java-version: '17' @@ -193,25 +202,26 @@ jobs: working-directory: ${{ github.workspace }} shell: cmd - - name: Download CUDA SDK v12.2 + - name: Download CUDA SDK v12.8 working-directory: ${{ runner.temp }} run: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.2" . + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v12.8" . dir shell: pwsh - - name: Download TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8 - run: 'azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" ${{ runner.temp }}' + - name: Download TensorRT-10.14.1.48.Windows.win10.cuda-12.9 + run: 'azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.14.1.48.Windows.win10.cuda-12.9" ${{ runner.temp }}' shell: pwsh - name: Add CUDA to PATH shell: powershell run: | Write-Host "Adding CUDA to PATH" - Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\bin" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.2\extras\CUPTI\lib64" - Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8\lib" + Write-Host "CUDA Path: $env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\bin" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\v12.8\extras\CUPTI\lib64" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.14.1.48.Windows.win10.cuda-12.9\lib" + Add-Content -Path $env:GITHUB_PATH -Value "$env:RUNNER_TEMP\TensorRT-10.14.1.48.Windows.win10.cuda-12.9\bin" - name: Set OnnxRuntimeBuildDirectory shell: pwsh @@ -237,7 +247,7 @@ jobs: exit $lastExitCode } - python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.9.0.34.Windows10.x86_64.cuda-12.8" --cuda_home="${{ runner.temp }}\v12.2" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --parallel --nvcc_threads 1 --use_binskim_compliant_compile_flags --build_dir build --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="${{ runner.temp }}\TensorRT-10.14.1.48.Windows.win10.cuda-12.9" --cuda_home="${{ runner.temp }}\v12.8" --use_vcpkg --use_vcpkg_ms_internal_asset_cache --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 if ($lastExitCode -ne 0) { exit $lastExitCode } diff --git a/.github/workflows/windows_webgpu.yml b/.github/workflows/windows_webgpu.yml index e1a8c28f5a1ad..5049e000495bf 100644 --- a/.github/workflows/windows_webgpu.yml +++ b/.github/workflows/windows_webgpu.yml @@ -34,7 +34,7 @@ jobs: ONNXRUNTIME_TEST_GPU_DEVICE_ID: "0" steps: - name: Checkout - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 0 submodules: none @@ -56,12 +56,12 @@ jobs: working-directory: ${{ github.workspace }} - name: Setup Node.js - uses: actions/setup-node@v4 + uses: actions/setup-node@v6 with: node-version: "20.x" - name: Setup Java - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: distribution: "temurin" java-version: "17" @@ -78,7 +78,7 @@ jobs: working-directory: ${{ github.workspace }} - name: Setup .NET - uses: actions/setup-dotnet@v4 + uses: actions/setup-dotnet@v5 env: PROCESSOR_ARCHITECTURE: x64 with: @@ -95,7 +95,7 @@ jobs: shell: cmd working-directory: ${{ github.workspace }} - - uses: actions/cache@v4 + - uses: actions/cache@v5 id: onnx-node-tests-cache with: path: ${{ github.workspace }}/js/test/ @@ -155,7 +155,7 @@ jobs: timeout-minutes: 300 steps: - name: Checkout - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 0 submodules: none @@ -208,7 +208,7 @@ jobs: ONNXRUNTIME_TEST_GPU_DEVICE_ID: "0" steps: - name: Checkout - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 0 submodules: none @@ -230,12 +230,12 @@ jobs: working-directory: ${{ github.workspace }} - name: Setup Node.js - uses: actions/setup-node@v4 + uses: actions/setup-node@v6 with: node-version: "20.x" - name: Setup Java - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: distribution: "temurin" java-version: "17" @@ -252,7 +252,7 @@ jobs: working-directory: ${{ github.workspace }} - name: Setup .NET - uses: actions/setup-dotnet@v4 + uses: actions/setup-dotnet@v5 env: PROCESSOR_ARCHITECTURE: x64 with: diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 91707c485d3c5..f36d3f1bd1315 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -182,8 +182,7 @@ # Since CUDA 12.8, compiling diagnostics become stricter if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) - target_compile_options(${target} PRIVATE "$<$:--relocatable-device-code=true>") - set_target_properties(${target} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_options(${target} PRIVATE "$<$:--static-global-template-stub=false>") if (MSVC) target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /wd4505>") endif() diff --git a/cmake/onnxruntime_providers_vitisai.cmake b/cmake/onnxruntime_providers_vitisai.cmake index d40ae17e40545..d59c944c8926f 100644 --- a/cmake/onnxruntime_providers_vitisai.cmake +++ b/cmake/onnxruntime_providers_vitisai.cmake @@ -19,7 +19,16 @@ "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" ) source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_vitisai_cc_srcs}) - onnxruntime_add_shared_library(onnxruntime_providers_vitisai ${onnxruntime_providers_vitisai_cc_srcs}) + set(onnxruntime_providers_vitisai_all_srcs ${onnxruntime_providers_vitisai_cc_srcs}) + if(WIN32) + # Sets the DLL version info on Windows: https://learn.microsoft.com/en-us/windows/win32/menurc/versioninfo-resource + list(APPEND onnxruntime_providers_vitisai_all_srcs "${ONNXRUNTIME_ROOT}/core/providers/vitisai/onnxruntime_providers_vitisai.rc") + endif() + onnxruntime_add_shared_library(onnxruntime_providers_vitisai ${onnxruntime_providers_vitisai_all_srcs}) + if(WIN32) + # FILE_NAME preprocessor definition is used in onnxruntime_providers_vitisai.rc + target_compile_definitions(onnxruntime_providers_vitisai PRIVATE FILE_NAME=\"onnxruntime_providers_vitisai.dll\") + endif() onnxruntime_add_include_to_target(onnxruntime_providers_vitisai ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} safeint_interface flatbuffers::flatbuffers Boost::mp11) target_link_libraries(onnxruntime_providers_vitisai PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED} ${ABSEIL_LIBS}) if(MSVC) diff --git a/onnxruntime/core/platform/telemetry.cc b/onnxruntime/core/platform/telemetry.cc index 6cbbdd4e0a7ef..59087ee725a18 100644 --- a/onnxruntime/core/platform/telemetry.cc +++ b/onnxruntime/core/platform/telemetry.cc @@ -91,7 +91,7 @@ void Telemetry::LogRuntimeError(uint32_t session_id, const common::Status& statu } void Telemetry::LogRuntimePerf(uint32_t session_id, uint32_t total_runs_since_last, int64_t total_run_duration_since_last, - std::unordered_map duration_per_batch_size) const { + const std::unordered_map& duration_per_batch_size) const { ORT_UNUSED_PARAMETER(session_id); ORT_UNUSED_PARAMETER(total_runs_since_last); ORT_UNUSED_PARAMETER(total_run_duration_since_last); @@ -127,4 +127,35 @@ void Telemetry::LogProviderOptions(const std::string& provider_id, ORT_UNUSED_PARAMETER(captureState); } +void Telemetry::LogModelLoadStart(uint32_t session_id) const { + ORT_UNUSED_PARAMETER(session_id); +} + +void Telemetry::LogModelLoadEnd(uint32_t session_id, const common::Status& status) const { + ORT_UNUSED_PARAMETER(session_id); + ORT_UNUSED_PARAMETER(status); +} + +void Telemetry::LogSessionCreationEnd(uint32_t session_id, + const common::Status& status) const { + ORT_UNUSED_PARAMETER(session_id); + ORT_UNUSED_PARAMETER(status); +} + +void Telemetry::LogRegisterEpLibraryWithLibPath(const std::string& registration_name, + const std::string& lib_path) const { + ORT_UNUSED_PARAMETER(registration_name); + ORT_UNUSED_PARAMETER(lib_path); +} + +void Telemetry::LogRegisterEpLibraryStart(const std::string& registration_name) const { + ORT_UNUSED_PARAMETER(registration_name); +} + +void Telemetry::LogRegisterEpLibraryEnd(const std::string& registration_name, + const common::Status& status) const { + ORT_UNUSED_PARAMETER(registration_name); + ORT_UNUSED_PARAMETER(status); +} + } // namespace onnxruntime diff --git a/onnxruntime/core/platform/telemetry.h b/onnxruntime/core/platform/telemetry.h index b60345e1b8a80..8bc92b0490ece 100644 --- a/onnxruntime/core/platform/telemetry.h +++ b/onnxruntime/core/platform/telemetry.h @@ -70,7 +70,7 @@ class Telemetry { const char* function, uint32_t line) const; virtual void LogRuntimePerf(uint32_t session_id, uint32_t total_runs_since_last, int64_t total_run_duration_since_last, - std::unordered_map duration_per_batch_size) const; + const std::unordered_map& duration_per_batch_size) const; virtual void LogExecutionProviderEvent(LUID* adapterLuid) const; @@ -86,6 +86,21 @@ class Telemetry { const std::string& provider_options_string, bool captureState) const; + virtual void LogModelLoadStart(uint32_t session_id) const; + + virtual void LogModelLoadEnd(uint32_t session_id, const common::Status& status) const; + + virtual void LogSessionCreationEnd(uint32_t session_id, + const common::Status& status) const; + + virtual void LogRegisterEpLibraryWithLibPath(const std::string& registration_name, + const std::string& lib_path) const; + + virtual void LogRegisterEpLibraryStart(const std::string& registration_name) const; + + virtual void LogRegisterEpLibraryEnd(const std::string& registration_name, + const common::Status& status) const; + private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Telemetry); }; diff --git a/onnxruntime/core/platform/windows/telemetry.cc b/onnxruntime/core/platform/windows/telemetry.cc index 029b17eb3502e..3ea94ac3a8492 100644 --- a/onnxruntime/core/platform/windows/telemetry.cc +++ b/onnxruntime/core/platform/windows/telemetry.cc @@ -465,7 +465,7 @@ void WindowsTelemetry::LogRuntimeError(uint32_t session_id, const common::Status } void WindowsTelemetry::LogRuntimePerf(uint32_t session_id, uint32_t total_runs_since_last, int64_t total_run_duration_since_last, - std::unordered_map duration_per_batch_size) const { + const std::unordered_map& duration_per_batch_size) const { if (global_register_count_ == 0 || enabled_ == false) return; @@ -605,4 +605,116 @@ void WindowsTelemetry::LogProviderOptions(const std::string& provider_id, const } } +void WindowsTelemetry::LogModelLoadStart(uint32_t session_id) const { + if (global_register_count_ == 0 || enabled_ == false) + return; + + TraceLoggingWrite(telemetry_provider_handle, + "ModelLoadStart", + TraceLoggingBool(true, "UTCReplace_AppSessionGuid"), + TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage), + TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES), + TraceLoggingLevel(WINEVENT_LEVEL_INFO), + // Telemetry info + TraceLoggingUInt8(0, "schemaVersion"), + TraceLoggingUInt32(session_id, "sessionId"), + TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName")); +} + +void WindowsTelemetry::LogModelLoadEnd(uint32_t session_id, const common::Status& status) const { + if (global_register_count_ == 0 || enabled_ == false) + return; + + TraceLoggingWrite(telemetry_provider_handle, + "ModelLoadEnd", + TraceLoggingBool(true, "UTCReplace_AppSessionGuid"), + TelemetryPrivacyDataTag(PDT_ProductAndServicePerformance), + TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES), + TraceLoggingLevel(WINEVENT_LEVEL_INFO), + // Telemetry info + TraceLoggingUInt8(0, "schemaVersion"), + TraceLoggingUInt32(session_id, "sessionId"), + TraceLoggingBool(status.IsOK(), "isSuccess"), + TraceLoggingUInt32(status.Code(), "errorCode"), + TraceLoggingUInt32(status.Category(), "errorCategory"), + TraceLoggingString(status.IsOK() ? "" : status.ErrorMessage().c_str(), "errorMessage"), + TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName")); +} + +void WindowsTelemetry::LogSessionCreationEnd(uint32_t session_id, + const common::Status& status) const { + if (global_register_count_ == 0 || enabled_ == false) + return; + + TraceLoggingWrite(telemetry_provider_handle, + "SessionCreationEnd", + TraceLoggingBool(true, "UTCReplace_AppSessionGuid"), + TelemetryPrivacyDataTag(PDT_ProductAndServicePerformance), + TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES), + TraceLoggingLevel(WINEVENT_LEVEL_INFO), + // Telemetry info + TraceLoggingUInt8(0, "schemaVersion"), + TraceLoggingUInt32(session_id, "sessionId"), + TraceLoggingBool(status.IsOK(), "isSuccess"), + TraceLoggingUInt32(status.Code(), "errorCode"), + TraceLoggingUInt32(status.Category(), "errorCategory"), + TraceLoggingString(status.IsOK() ? "" : status.ErrorMessage().c_str(), "errorMessage"), + TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName")); +} + +void WindowsTelemetry::LogRegisterEpLibraryWithLibPath(const std::string& registration_name, + const std::string& lib_path) const { + if (global_register_count_ == 0 || enabled_ == false) + return; + + TraceLoggingWrite(telemetry_provider_handle, + "RegisterEpLibraryWithLibPath", + TraceLoggingBool(true, "UTCReplace_AppSessionGuid"), + TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage), + TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES), + TraceLoggingLevel(WINEVENT_LEVEL_INFO), + // Telemetry info + TraceLoggingUInt8(0, "schemaVersion"), + TraceLoggingString(registration_name.c_str(), "registrationName"), + TraceLoggingString(lib_path.c_str(), "libPath"), + TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName")); +} + +void WindowsTelemetry::LogRegisterEpLibraryStart(const std::string& registration_name) const { + if (global_register_count_ == 0 || enabled_ == false) + return; + + TraceLoggingWrite(telemetry_provider_handle, + "RegisterEpLibraryStart", + TraceLoggingBool(true, "UTCReplace_AppSessionGuid"), + TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage), + TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES), + TraceLoggingLevel(WINEVENT_LEVEL_INFO), + // Telemetry info + TraceLoggingUInt8(0, "schemaVersion"), + TraceLoggingString(registration_name.c_str(), "registrationName"), + TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName")); +} + +void WindowsTelemetry::LogRegisterEpLibraryEnd(const std::string& registration_name, + const common::Status& status) const { + if (global_register_count_ == 0 || enabled_ == false) + return; + + TraceLoggingWrite(telemetry_provider_handle, + "RegisterEpLibraryEnd", + TraceLoggingBool(true, "UTCReplace_AppSessionGuid"), + TelemetryPrivacyDataTag(PDT_ProductAndServicePerformance), + TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES), + TraceLoggingLevel(WINEVENT_LEVEL_INFO), + // Telemetry info + TraceLoggingUInt8(0, "schemaVersion"), + TraceLoggingString(registration_name.c_str(), "registrationName"), + TraceLoggingBool(status.IsOK(), "isSuccess"), + TraceLoggingUInt32(status.Code(), "errorCode"), + TraceLoggingUInt32(status.Category(), "errorCategory"), + TraceLoggingString(status.IsOK() ? "" : status.ErrorMessage().c_str(), "errorMessage"), + TraceLoggingString(ORT_CALLER_FRAMEWORK, "frameworkName")); +} + } // namespace onnxruntime diff --git a/onnxruntime/core/platform/windows/telemetry.h b/onnxruntime/core/platform/windows/telemetry.h index 261d14a7fed8c..30621060ae91a 100644 --- a/onnxruntime/core/platform/windows/telemetry.h +++ b/onnxruntime/core/platform/windows/telemetry.h @@ -63,7 +63,7 @@ class WindowsTelemetry : public Telemetry { const char* function, uint32_t line) const override; void LogRuntimePerf(uint32_t session_id, uint32_t total_runs_since_last, int64_t total_run_duration_since_last, - std::unordered_map duration_per_batch_size) const override; + const std::unordered_map& duration_per_batch_size) const override; void LogExecutionProviderEvent(LUID* adapterLuid) const override; @@ -79,6 +79,21 @@ class WindowsTelemetry : public Telemetry { const std::string& provider_options_string, bool captureState) const override; + void LogModelLoadStart(uint32_t session_id) const override; + + void LogModelLoadEnd(uint32_t session_id, const common::Status& status) const override; + + void LogSessionCreationEnd(uint32_t session_id, + const common::Status& status) const override; + + void LogRegisterEpLibraryWithLibPath(const std::string& registration_name, + const std::string& lib_path) const override; + + void LogRegisterEpLibraryStart(const std::string& registration_name) const override; + + void LogRegisterEpLibraryEnd(const std::string& registration_name, + const common::Status& status) const override; + using EtwInternalCallback = std::function; diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc index 4d8a2bc1106ad..fc3ecf5465e6d 100644 --- a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc +++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc @@ -626,6 +626,20 @@ Status Mul::Compute(OpKernelContext* context) const { template Status Div::Compute(OpKernelContext* context) const { + // Integer division by zero is undefined behavior in C++ and causes a hardware exception. + // Check for zeros in the divisor before performing the division. + // Skip the check if the divisor was already validated as a constant initializer during kernel creation. + if constexpr (std::is_integral::value) { + if (!divisor_is_validated_constant_) { + const Tensor& B = *context->Input(1); + const T* b_data = B.Data(); + const int64_t b_size = B.Shape().Size(); + for (int64_t i = 0; i < b_size; ++i) { + ORT_RETURN_IF(b_data[i] == T{0}, "Integer division by zero"); + } + } + } + ProcessBroadcastSpanFuncs funcs{ [](BroadcastHelper& per_iter_bh) { per_iter_bh.OutputEigen() = per_iter_bh.ScalarInput0() / per_iter_bh.EigenInput1().array(); diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.h b/onnxruntime/core/providers/cpu/math/element_wise_ops.h index 66060344c9874..77ef3033a0975 100644 --- a/onnxruntime/core/providers/cpu/math/element_wise_ops.h +++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.h @@ -243,9 +243,25 @@ template class Div final : public OpKernel { public: Div(const OpKernelInfo& info) : OpKernel(info) { + // If the divisor is a constant initializer, validate for integer division by zero once + // during kernel creation instead of on every Compute call. + if constexpr (std::is_integral::value) { + const Tensor* constant_divisor = nullptr; + if (info.TryGetConstantInput(1, &constant_divisor)) { + const T* b_data = constant_divisor->Data(); + const int64_t b_size = constant_divisor->Shape().Size(); + for (int64_t i = 0; i < b_size; ++i) { + ORT_ENFORCE(b_data[i] != T{0}, "Integer division by zero"); + } + divisor_is_validated_constant_ = true; + } + } } Status Compute(OpKernelContext* context) const override; + + private: + bool divisor_is_validated_constant_{false}; }; class Pow final : public OpKernel { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp index 353f698bb6f2c..076027dd3672f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/AbiCustomRegistry.cpp @@ -504,7 +504,7 @@ HRESULT STDMETHODCALLTYPE AbiCustomRegistry::RegisterOperatorKernel( InferAndVerifyOutputSizes(node, &defaultAttributesCapture, shapeInferrerCapture.Get(), constantCpuInputCapture, constantInputGetter, inputShapesOverrides, *outputShapes); // Create the kernel while allowing input shape and output shape queries according to options - ComPtr kernelInfoWrapper = wil::MakeOrThrow( + ComPtr kernelInfoWrapper = Dml::SafeMakeOrThrow( &protoHelper, executionHandle, true, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp index 18b4b4593f537..ed99ac0fc7fc2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/BucketizedBufferAllocator.cpp @@ -132,7 +132,7 @@ namespace Dml assert(resourceWrapper->GetD3D12Resource()->GetDesc().Width == bucketSize); assert(resourceWrapper != nullptr); - ComPtr allocInfo = wil::MakeOrThrow( + ComPtr allocInfo = Dml::SafeMakeOrThrow( this, ++m_currentAllocationId, resourceId, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp index 54393e9bf1539..2934fd0c11516 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlCommittedResourceAllocator.cpp @@ -22,7 +22,7 @@ namespace Dml )); ComPtr resourceWrapper; - wil::MakeOrThrow(std::move(resource)).As(&resourceWrapper); + Dml::SafeMakeOrThrow(std::move(resource)).As(&resourceWrapper); return resourceWrapper; } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalBufferAllocator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalBufferAllocator.h index c99d686349e94..158c102d69ee7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalBufferAllocator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlExternalBufferAllocator.h @@ -48,9 +48,9 @@ namespace Dml constexpr uint64_t pooledResourceId = 0; // Not a pooled resource Microsoft::WRL::ComPtr resourceWrapper; - wil::MakeOrThrow(std::move(resource)).As(&resourceWrapper); + Dml::SafeMakeOrThrow(std::move(resource)).As(&resourceWrapper); - Microsoft::WRL::ComPtr allocInfo = wil::MakeOrThrow( + Microsoft::WRL::ComPtr allocInfo = Dml::SafeMakeOrThrow( nullptr, 0, pooledResourceId, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp index 6bd7de0fba5cb..4ddf8b8640376 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -232,8 +232,6 @@ namespace DmlGraphFusionHelper } } - // Tensor sizes in DML must be a multiple of 4 bytes large. - tensorByteSize = AlignToPow2(tensorByteSize, 4); if(graphSerializationEnabled) { WriteToFile(modelName, ConvertToWString(iter->first) + L".bin", reinterpret_cast(tensorPtr), tensorByteSize); @@ -264,9 +262,10 @@ namespace DmlGraphFusionHelper initializeInputBuffer = CreateCpuResource(providerImpl, tensorPtr, tensorByteSize); } - // Set the binding for operator initialization to the buffer + // Set the binding for operator initialization to the buffer. + // DML requires buffer binding sizes to be a multiple of 4 bytes. initInputBindings[i].Buffer = initializeInputBuffer.Get(); - initInputBindings[i].SizeInBytes = tensorByteSize; + initInputBindings[i].SizeInBytes = AlignToPow2(tensorByteSize, 4); initializeResourceRefs.push_back(std::move(initializeInputBuffer)); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index 6d8d5453b9fc0..cd7dfd46485af 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -55,7 +55,7 @@ namespace Dml _Out_ std::shared_ptr* registry, _Out_ std::shared_ptr* internalRegInfoMap) { - ComPtr abiRegistry = wil::MakeOrThrow(); + ComPtr abiRegistry = Dml::SafeMakeOrThrow(); Dml::RegisterDmlOperators(abiRegistry.Get()); assert(abiRegistry->GetRegistries().size() == 1); @@ -88,7 +88,7 @@ namespace Dml ComPtr device; GRAPHICS_THROW_IF_FAILED(dmlDevice->GetParentDevice(IID_GRAPHICS_PPV_ARGS(device.GetAddressOf()))); - m_impl = wil::MakeOrThrow(dmlDevice, device.Get(), executionContext, enableMetacommands, + m_impl = Dml::SafeMakeOrThrow(dmlDevice, device.Get(), executionContext, enableMetacommands, enableGraphCapture, enableSyncSpinning, disableMemoryArena); } @@ -1298,9 +1298,9 @@ namespace Dml uint64_t pooledResourceId = 0; // Not a pooled resource ComPtr resourceWrapper; - wil::MakeOrThrow(pResource).As(&resourceWrapper); + Dml::SafeMakeOrThrow(pResource).As(&resourceWrapper); - ComPtr allocInfo = wil::MakeOrThrow(nullptr, 0, pooledResourceId, resourceWrapper.Get(), (size_t)pResource->GetDesc().Width); + ComPtr allocInfo = Dml::SafeMakeOrThrow(nullptr, 0, pooledResourceId, resourceWrapper.Get(), (size_t)pResource->GetDesc().Width); return allocInfo.Detach(); } void FreeGPUAllocation(void* ptr) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp index 22de743f6e718..51c25d6d40c5b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp @@ -291,7 +291,7 @@ namespace Dml::GraphDescBuilder if (iter != isInitializerTransferable.end()) { // Using const_cast here is simpler than making surrounding code const correct. - tensorWrapper = wil::MakeOrThrow(const_cast(iter->second.first), modelPath); + tensorWrapper = Dml::SafeMakeOrThrow(const_cast(iter->second.first), modelPath); } return tensorWrapper; }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index fe52f27b35bb8..13ce9afa99b1e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -868,7 +868,7 @@ namespace Windows::AI::MachineLearning::Adapter const onnx::TensorProto* tensorProto = &attributeProto->t(); // An empty path is used as external weights are not currently supported in this case - Microsoft::WRL::ComPtr tensorWrapper = wil::MakeOrThrow(const_cast(tensorProto), std::filesystem::path()); + Microsoft::WRL::ComPtr tensorWrapper = Dml::SafeMakeOrThrow(const_cast(tensorProto), std::filesystem::path()); *tensor = tensorWrapper.Detach(); return S_OK; } @@ -1977,7 +1977,7 @@ namespace Windows::AI::MachineLearning::Adapter auto inputTensor = m_impl->Input(gsl::narrow_cast(inputIndex)); if (inputTensor != nullptr) { - ComPtr tensorWrapper = wil::MakeOrThrow( + ComPtr tensorWrapper = Dml::SafeMakeOrThrow( const_cast(inputTensor), IsAllocationInterface(inputTensor->Location()), m_winmlProvider.Get(), @@ -2019,7 +2019,7 @@ namespace Windows::AI::MachineLearning::Adapter auto elemTensor = const_cast(&inputTensorSeq->Get(sequenceIndex)); if (elemTensor != nullptr) { - ComPtr tensorWrapper = wil::MakeOrThrow( + ComPtr tensorWrapper = Dml::SafeMakeOrThrow( elemTensor, IsAllocationInterface(elemTensor->Location()), m_winmlProvider.Get(), @@ -2119,7 +2119,7 @@ namespace Windows::AI::MachineLearning::Adapter auto elemTensor = const_cast(&outputTensorSeq->Get(sequenceIndex)); if (elemTensor != nullptr) { - ComPtr tensorWrapper = wil::MakeOrThrow( + ComPtr tensorWrapper = Dml::SafeMakeOrThrow( elemTensor, IsAllocationInterface(elemTensor->Location()), m_winmlProvider.Get(), @@ -2212,7 +2212,7 @@ namespace Windows::AI::MachineLearning::Adapter auto outputTensor = m_impl->Output(outputIndex, shape); if (outputTensor) { - ComPtr tensorWrapper = wil::MakeOrThrow( + ComPtr tensorWrapper = Dml::SafeMakeOrThrow( const_cast(outputTensor), IsAllocationInterface(outputTensor->Location()), m_winmlProvider.Get(), @@ -2377,7 +2377,7 @@ namespace Windows::AI::MachineLearning::Adapter const onnxruntime::Tensor* tensor = nullptr; if (kerneInfo.TryGetConstantInput(index, &tensor)) { - tensorWrapper = wil::MakeOrThrow( + tensorWrapper = Dml::SafeMakeOrThrow( const_cast(tensor), IsAllocationInterface(tensor->Location()), winmlProviderCapture.Get(), @@ -2396,7 +2396,7 @@ namespace Windows::AI::MachineLearning::Adapter } // Create the kernel while allowing input shape and output shape queries according to options - ComPtr kernelInfoWrapper = wil::MakeOrThrow( + ComPtr kernelInfoWrapper = Dml::SafeMakeOrThrow( &kerneInfo, m_abiExecutionObject.Get(), nullptr, @@ -2443,7 +2443,7 @@ namespace Windows::AI::MachineLearning::Adapter const auto* tensor = context->Input(gsl::narrow_cast(index)); if (tensor != nullptr) { - tensorWrapper = wil::MakeOrThrow( + tensorWrapper = Dml::SafeMakeOrThrow( const_cast(tensor), IsAllocationInterface(tensor->Location()), winmlProviderCapture.Get(), @@ -2464,7 +2464,7 @@ namespace Windows::AI::MachineLearning::Adapter for (uint32_t sequenceIndex = 0; sequenceIndex < tensorSequence->Size(); ++sequenceIndex) { auto& tensor = tensorSequence->Get(sequenceIndex); - auto tensorWrapper = wil::MakeOrThrow( + auto tensorWrapper = Dml::SafeMakeOrThrow( const_cast(&tensor), IsAllocationInterface(tensor.Location()), winmlProviderCapture.Get(), @@ -2491,7 +2491,7 @@ namespace Windows::AI::MachineLearning::Adapter } // Create the kernel while allowing input shape and output shape queries according to options - ComPtr kernelInfoWrapper = wil::MakeOrThrow( + ComPtr kernelInfoWrapper = Dml::SafeMakeOrThrow( &Info(), m_abiExecutionObject.Get(), &inputShapes, @@ -2569,7 +2569,7 @@ namespace Windows::AI::MachineLearning::Adapter EdgeShapes localInferredOutputShapes; ComPtr localKernel = inferShapesAndCreateKernel(local_input_shapes, localInferredOutputShapes); - ComPtr kernelContextWrapper = wil::MakeOrThrow( + ComPtr kernelContextWrapper = Dml::SafeMakeOrThrow( context, Info().GetExecutionProvider(), m_internalOperator, @@ -2588,7 +2588,7 @@ namespace Windows::AI::MachineLearning::Adapter } } - ComPtr kernelContextWrapper = wil::MakeOrThrow( + ComPtr kernelContextWrapper = Dml::SafeMakeOrThrow( context, Info().GetExecutionProvider(), m_internalOperator, @@ -2811,7 +2811,7 @@ namespace Windows::AI::MachineLearning::Adapter onnxruntime::ProtoHelperNodeContext protoContext(node); onnxruntime::OpNodeProtoHelper info(&protoContext); - ComPtr inferenceContext = wil::MakeOrThrow(&info, inputShapes, outputShapes, defaultAttributes, requiredConstantCpuInputs, constantInputGetter); + ComPtr inferenceContext = Dml::SafeMakeOrThrow(&info, inputShapes, outputShapes, defaultAttributes, requiredConstantCpuInputs, constantInputGetter); outputShapes.Reset(info.GetOutputCount()); @@ -2865,13 +2865,13 @@ namespace Windows::AI::MachineLearning::Adapter [ctx](uint32_t index) { // An empty path is used as external weights are not currently supported in this case - Microsoft::WRL::ComPtr tensorWrapper = wil::MakeOrThrow( + Microsoft::WRL::ComPtr tensorWrapper = Dml::SafeMakeOrThrow( const_cast(ctx->getInputData(index)), std::filesystem::path()); return tensorWrapper; } ); - return wil::MakeOrThrow(info, ctx, requiredConstantCpuInputs, mlOperatorTensorGetter); + return Dml::SafeMakeOrThrow(info, ctx, requiredConstantCpuInputs, mlOperatorTensorGetter); } MLSchemaInferenceContext::MLSchemaInferenceContext( @@ -2952,7 +2952,7 @@ namespace Windows::AI::MachineLearning::Adapter const AttributeMap* defaultAttributes) { MLOperatorTensorGetter mLOperatorTensorGetter = MLOperatorTensorGetter(); - return wil::MakeOrThrow(info, defaultAttributes, mLOperatorTensorGetter); + return Dml::SafeMakeOrThrow(info, defaultAttributes, mLOperatorTensorGetter); } MLSupportQueryContext::MLSupportQueryContext( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h index 1de88a61a0d77..25210c146a6b6 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlDFT.h @@ -1097,7 +1097,7 @@ class GpuDFTOperatorFactory : public WRL::Base version = 20; } - auto dftOperator = wil::MakeOrThrow(context, version); + auto dftOperator = Dml::SafeMakeOrThrow(context, version); dftOperator.CopyTo(kernel); return S_OK; } @@ -1177,8 +1177,8 @@ class GpuDFTOperatorFactory : public WRL::Base kernelDescription.options = MLOperatorKernelOptions::None; kernelDescription.executionOptions = 0; - auto shareInferrer = wil::MakeOrThrow(); - auto factory = wil::MakeOrThrow(); + auto shareInferrer = Dml::SafeMakeOrThrow(); + auto factory = Dml::SafeMakeOrThrow(); std::array requiredConstantCpuInputs = { 1, 2 }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h index 5ba936ddf3976..6d7a089103c9b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlGridSample.h @@ -747,7 +747,7 @@ class DmlGridSampleOperatorFactory : public WRL::Base { try { - auto dftOperator = wil::MakeOrThrow(context); + auto dftOperator = Dml::SafeMakeOrThrow(context); dftOperator.CopyTo(kernel); return S_OK; } @@ -832,8 +832,8 @@ class DmlGridSampleOperatorFactory : public WRL::Base kernelDescription.options = MLOperatorKernelOptions::None; kernelDescription.executionOptions = 0; - auto shareInferrer = wil::MakeOrThrow(); - auto factory = wil::MakeOrThrow(); + auto shareInferrer = Dml::SafeMakeOrThrow(); + auto factory = Dml::SafeMakeOrThrow(); ComPtr registryPrivate; ORT_THROW_IF_FAILED(registry->QueryInterface(IID_PPV_ARGS(®istryPrivate))); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp index 287f1e5b6dfe7..2ee85b01a9a2e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp @@ -907,4 +907,71 @@ namespace Dml bufferTensorDesc->TotalTensorSizeInBytes = (elementSize + 3) & ~3; } + void DmlOperator::BroadcastQuantizationParameters( + const MLOperatorKernelCreationContext& kernelInfo, + gsl::span outputShape + ) + { + const uint32_t outputShapeDimCount = gsl::narrow_cast(outputShape.size()); + + uint32_t axis = 0; + + // If an axis was explicitly passed (or the default value 1 is set from the schema), + // then other inputs are broadcasting to the shape of the input data tensor. + if (kernelInfo.HasAttribute(AttrName::Axis, MLOperatorAttributeType::Int)) + { + // Avoid validating the axis until later because the axis parameter is ignorable unless + // broadcasting is actually needed. ONNX opset 13 returns a default value of 1 for the + // "axis" attribute even when the attribute doesn't actually exist in the model, which + // would cause a validation failure here. + const int32_t signedAxis = gsl::narrow_cast(kernelInfo.GetAttribute(AttrName::Axis)); + axis = Dml::HandleNegativeAxis(signedAxis, outputShapeDimCount, /*validateAxis*/ false); + } + + // Explicitly reshape each of the inputs after the first input (scale tensor and optional zero point tensor). + for (uint32_t index = 1, inputCount = gsl::narrow_cast(m_inputTensorDescs.size()); index < inputCount; ++index) + { + if (!kernelInfo.IsInputValid(index)) + { + continue; + } + + auto edgeDesc = kernelInfo.GetInputEdgeDescription(index); + assert(edgeDesc.edgeType == MLOperatorEdgeType::Tensor); + + // Fix up the tensor shape by filling with trailing ones. So input[2,3] with axis=0 and scale[2] + // becomes scale[2,1], so that broadcasting works correctly. + std::vector inputTensorShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(index); + + // If the input tensor is a 1D vector, then extra massaging is needed to project their + // 1D vectors back to the full shape for broadcasting along the given axis. + // The 1D vector should have a length equal to the output tensor's dimension on that axis. + if (inputTensorShape.size() == 1 && inputTensorShape != std::vector(outputShape.begin(), outputShape.end())) + { + ML_CHECK_VALID_ARGUMENT(axis < outputShapeDimCount); + uint32_t broadcastAxisLength = outputShape[axis]; + ML_CHECK_VALID_ARGUMENT( + (inputTensorShape[0] == broadcastAxisLength) || + // Treat as broadcast dimension to match CPU behavior. + (inputTensorShape[0] == 1) + ); + inputTensorShape.insert(inputTensorShape.begin(), axis, 1); + inputTensorShape.insert(inputTensorShape.end(), outputShapeDimCount - 1 - axis, 1); + } + // For any other shape (scalar/ND), leave it alone, and the TensorDesc constructor + // will apply broadcasting with standard elementwise alignment. + + m_inputTensorDescs[index] = TensorDesc( + edgeDesc.tensorDataType, + outputShape, + gsl::make_span(inputTensorShape), + TensorAxis::DoNotCoerce, + TensorAxis::W, + TensorAxis::RightAligned, + NchwDimensionCount, // minDimensionCount + 0 // guaranteedBaseOffsetAlignment + ); + } + } + } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h index fa54d4b041b5f..002541e23c47c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h @@ -149,6 +149,15 @@ namespace Dml uint32_t minDimensionCount = NchwDimensionCount ) const; + // Reshapes scale and zero_point tensor descriptors (inputs after index 0) so that their + // dimension count matches the output shape, enabling correct broadcasting in DML. + // For 1D per-axis tensors, the shape is projected along the given axis (e.g. scale[6] + // with axis=0 on a 5D output becomes [6,1,1,1,1]). + void BroadcastQuantizationParameters( + const MLOperatorKernelCreationContext& kernelInfo, + gsl::span outputShape + ); + static void TryConvertTensorToBroadcastScalar( const MLOperatorKernelCreationContext& kernelInfo, const DML_TENSOR_DESC* tensor, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorElementWise.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorElementWise.cpp index d4d7ee1311874..b64a5265f56e3 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorElementWise.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorElementWise.cpp @@ -542,64 +542,7 @@ class DmlOperatorElementwiseQLinear : public DmlOperator const DML_TENSOR_DATA_TYPE outputDataType = m_outputTensorDescs[0].GetDmlDataType(); bool hasZeroPointTensor = kernelInfo.IsInputValid(2); - uint32_t axis = 0; - - // If an axis was given explicitly passed (or the default value 1 is set from the schema), - // then other inputs are broadcasting to the shape of the input data tensor. - if (kernelInfo.HasAttribute(AttrName::Axis, MLOperatorAttributeType::Int)) - { - // Avoid validating the axis until later because the axis parameter is ignorable unless - // broadcasting is actually needed. ONNX opset 13 returns a default value of 1 for the - // "axis" attribute even when the attribute doesn't actually exist in the model, which - // would cause a validation failure here. - const int32_t signedAxis = gsl::narrow_cast(kernelInfo.GetAttribute(AttrName::Axis)); - axis = Dml::HandleNegativeAxis(signedAxis, outputShapeDimCount, /*validateAxis*/ false); - } - - // Explicitly reshape each of the inputs after the first input (scale tensor and optional zero point tensor). - for (uint32_t index = 1, inputCount = gsl::narrow_cast(m_inputTensorDescs.size()); index < inputCount; ++index) - { - if (!kernelInfo.IsInputValid(index)) - { - continue; - } - - auto edgeDesc = kernelInfo.GetInputEdgeDescription(index); - assert(edgeDesc.edgeType == MLOperatorEdgeType::Tensor); - - // Fix up the the tensor shape by filling with trailing ones. So input[2,3] with axis=0 and scale[2] - // becomes scale[2,1], so that broadcasting works correctly. - std::vector inputTensorShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(index); - - // If the input tensor is a 1D vector, then extra massaging is needed to project their - // 1D vectors back to the full shape for broadcasting along the given axis. - // The 1D vector should have a length equal to the output tensor's dimension on that axis. - if (inputTensorShape.size() == 1 && inputTensorShape != outputShape) - { - ML_CHECK_VALID_ARGUMENT(axis < outputShapeDimCount); - uint32_t broadcastAxisLength = outputShape[axis]; - ML_CHECK_VALID_ARGUMENT( - (inputTensorShape[0] == broadcastAxisLength) || - // Treat as broadcast dimension to match CPU behavior. - (inputTensorShape[0] == 1) - ); - inputTensorShape.insert(inputTensorShape.begin(), axis, 1); - inputTensorShape.insert(inputTensorShape.end(), outputShapeDimCount - 1 - axis, 1); - } - // For any other shape (scalar/ND), leave it alone, and the TensorDesc constructor - // will apply broadcasting with standard elementwise alignment. - - m_inputTensorDescs[index] = TensorDesc( - edgeDesc.tensorDataType, - gsl::make_span(outputShape), - gsl::make_span(inputTensorShape), - TensorAxis::DoNotCoerce, - TensorAxis::W, - TensorAxis::RightAligned, - NchwDimensionCount, // minDimensionCount - 0 // guaranteedBaseOffsetAlignment - ); - } + BroadcastQuantizationParameters(kernelInfo, gsl::make_span(outputShape)); std::vector inputDescs = GetDmlInputDescs(); std::vector outputDescs = GetDmlOutputDescs(); @@ -630,6 +573,8 @@ class DmlOperatorQuantization21 : public DmlOperator const DML_TENSOR_DATA_TYPE outputDataType = m_outputTensorDescs[0].GetDmlDataType(); bool hasZeroPointTensor = kernelInfo.IsInputValid(2); + BroadcastQuantizationParameters(kernelInfo, gsl::make_span(outputShape)); + std::vector inputDescs = GetDmlInputDescs(); std::vector outputDescs = GetDmlOutputDescs(); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp index bc29256dd2e28..83e35ae89282d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorNonZero.cpp @@ -76,7 +76,7 @@ class DmlOperatorNonZero: public DmlOperator // Create the DML output tensor for the number of nonzero elements onnxruntime::Tensor outputCountDml(onnxruntime::DataTypeImpl::GetType(), m_outputCountShape, executionProvider->GetGpuAllocator()); - Microsoft::WRL::ComPtr outputCountDmlWrapper = wil::MakeOrThrow( + Microsoft::WRL::ComPtr outputCountDmlWrapper = Dml::SafeMakeOrThrow( &outputCountDml, true, executionProvider, @@ -84,7 +84,7 @@ class DmlOperatorNonZero: public DmlOperator // Create the DML output tensor for the coordinates (not cropped) onnxruntime::Tensor intermediateCoordinatesDml(onnxruntime::DataTypeImpl::GetType(), m_outputCoordinatesShape, executionProvider->GetGpuAllocator()); - Microsoft::WRL::ComPtr intermediateCoordinatesDmlWrapper = wil::MakeOrThrow( + Microsoft::WRL::ComPtr intermediateCoordinatesDmlWrapper = Dml::SafeMakeOrThrow( &intermediateCoordinatesDml, true, executionProvider, @@ -105,7 +105,7 @@ class DmlOperatorNonZero: public DmlOperator // Copy the number of nonzero elements back to the CPU onnxruntime::Tensor outputCountCpu(onnxruntime::DataTypeImpl::GetType(), {1}, executionProvider->GetCpuInputAllocator()); - Microsoft::WRL::ComPtr outputCountCpuWrapper = wil::MakeOrThrow( + Microsoft::WRL::ComPtr outputCountCpuWrapper = Dml::SafeMakeOrThrow( &outputCountCpu, false, executionProvider, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h index e2f38231f7295..091a82daefbdc 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlSTFT.h @@ -238,7 +238,7 @@ class DmlSTFTOperator : public WRL::Base constexpr uint32_t dftAxis = 1; constexpr bool dftIsInverse = false; - m_dftOperator.op = wil::MakeOrThrow( + m_dftOperator.op = Dml::SafeMakeOrThrow( m_d3dDevice.Get(), dftAxis, params.isOnesided, @@ -516,7 +516,7 @@ class DmlSTFTOperatorFactory : public WRL::Base { try { - auto dftOperator = wil::MakeOrThrow(context); + auto dftOperator = Dml::SafeMakeOrThrow(context); dftOperator.CopyTo(kernel); return S_OK; } @@ -574,8 +574,8 @@ class DmlSTFTOperatorFactory : public WRL::Base kernelDescription.options = MLOperatorKernelOptions::None; kernelDescription.executionOptions = 0; - auto shareInferrer = wil::MakeOrThrow(); - auto factory = wil::MakeOrThrow(); + auto shareInferrer = Dml::SafeMakeOrThrow(); + auto factory = Dml::SafeMakeOrThrow(); std::array requiredConstantCpuInputs = { /*frame_step*/1, /*frame_length*/3 }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp index b0b37d01370bc..26f998c7521a2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp @@ -1314,18 +1314,18 @@ void RegisterDmlOperators(IMLOperatorRegistry* registry) totalTypeCount += typeConstraints[i].allowedTypeCount; } - ComPtr factory = wil::MakeOrThrow(information.creationFunction); + ComPtr factory = Dml::SafeMakeOrThrow(information.creationFunction); ComPtr shapeInferrer; if (information.shapeInferenceFunction) { - shapeInferrer = wil::MakeOrThrow(information.shapeInferenceFunction); + shapeInferrer = Dml::SafeMakeOrThrow(information.shapeInferenceFunction); } ComPtr supportQuery; if (information.supportQueryFunction) { - supportQuery = wil::MakeOrThrow(information.supportQueryFunction); + supportQuery = Dml::SafeMakeOrThrow(information.supportQueryFunction); } ORT_THROW_IF_FAILED(registryPrivate->RegisterOperatorKernel( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h new file mode 100644 index 0000000000000..c2740470cbc0a --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h @@ -0,0 +1,37 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include + +// Drop-in replacement for wil::MakeOrThrow that avoids an ASan false positive. +// WRL's MakeAllocator stores its buffer as char*, so if the constructor throws, +// ~MakeAllocator calls delete on a char* — passing sizeof(char)=1 to sized +// operator delete instead of sizeof(T). With the default MSVC allocator, this is +// benign (sized delete ignores the size), but ASan flags it as +// new-delete-type-mismatch. This helper uses placement new with correctly-sized +// cleanup to avoid the issue. +namespace Dml +{ + template + Microsoft::WRL::ComPtr SafeMakeOrThrow(TArgs&&... args) + { + void* buffer = ::operator new(sizeof(T)); + T* raw = nullptr; + try + { + raw = new (buffer) T(std::forward(args)...); + } + catch (...) + { + ::operator delete(buffer, sizeof(T)); + throw; + } + Microsoft::WRL::ComPtr result; + result.Attach(raw); + return result; + } +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h index e9df3fd20aff9..b9febb8171e0d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h @@ -25,6 +25,7 @@ #include #include +#include "SafeMakeOrThrow.h" #include diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h index ac77616cb96f0..dec84d9945569 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h @@ -5,6 +5,7 @@ #include "core/providers/dml/DmlExecutionProvider/inc/MLOperatorAuthor.h" #include "MLOperatorAuthorPrivate.h" +#include "core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h" #include "core/framework/int4.h" #include #include @@ -972,7 +973,7 @@ class MLOperatorKernel : public Microsoft::WRL::RuntimeClass< { ORT_TRY { - Microsoft::WRL::ComPtr kernel = wil::MakeOrThrow(MLOperatorKernelCreationContext(&info)); + Microsoft::WRL::ComPtr kernel = Dml::SafeMakeOrThrow(MLOperatorKernelCreationContext(&info)); *opKernel = kernel.Detach(); return S_OK; diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/SchemaInferenceOverrider.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/SchemaInferenceOverrider.h index fa04bcf6edf41..597780a9f448b 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/SchemaInferenceOverrider.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/SchemaInferenceOverrider.h @@ -5,6 +5,7 @@ #include "OperatorHelper.h" #include "OperatorVersions.h" +#include "core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h" namespace SchemaInferenceOverrider { @@ -21,7 +22,7 @@ namespace SchemaInferenceOverrider ) { Microsoft::WRL::ComPtr shapeInferrer = - wil::MakeOrThrow(OperatorHelper::ShapeInferenceFunction); + Dml::SafeMakeOrThrow(OperatorHelper::ShapeInferenceFunction); auto schema = const_cast(onnx::OpSchemaRegistry::Schema(name, version)); diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index c0d8a4f02bbc3..0884908525dce 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -21,6 +21,8 @@ using Microsoft::WRL::ComPtr; #include #include +#include "core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h" + #include "core/providers/dml/dml_provider_factory.h" #include "core/providers/dml/dml_provider_factory_creator.h" #include "core/session/abi_session_options_impl.h" @@ -86,11 +88,11 @@ std::unique_ptr DMLProviderFactory::CreateProvider() { // First, check if an I/O binding API that was used before this session or another session has already created a queue if (FAILED(d3d12_device->GetPrivateData(dml_execution_context_guid, &execution_context_ptr_size, execution_context.GetAddressOf()))) { - execution_context = wil::MakeOrThrow(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), true, true); + execution_context = Dml::SafeMakeOrThrow(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), true, true); ORT_THROW_IF_FAILED(d3d12_device->SetPrivateDataInterface(dml_execution_context_guid, execution_context.Get())); } } else { - execution_context = wil::MakeOrThrow(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), cpu_sync_spinning_enabled_, false); + execution_context = Dml::SafeMakeOrThrow(d3d12_device.Get(), dml_device_.Get(), cmd_queue_.Get(), cpu_sync_spinning_enabled_, false); } auto provider = Dml::CreateExecutionProvider(dml_device_.Get(), execution_context.Get(), metacommands_enabled_, graph_capture_enabled_, cpu_sync_spinning_enabled_, disable_memory_arena_); diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index 5fc0b8900730b..5a4d68693730b 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -79,7 +79,10 @@ struct OrtVitisAIEpAPI { std::vector>* (*compile_onnx_model_vitisai_ep_with_error_handling)( const std::string& model_path, const onnxruntime::Graph& graph, const onnxruntime::ProviderOptions& options, void* status, vaip_core::error_report_func func); std::vector>* (*compile_onnx_model_vitisai_ep_v3)( - const std::filesystem::path& model_path, const onnxruntime::Graph& graph, const onnxruntime::ProviderOptions& options, void* status, vaip_core::error_report_func func); + const std::string& model_path, const onnxruntime::Graph& graph, const onnxruntime::ProviderOptions& options, void* status, vaip_core::error_report_func func); + std::vector>* (*compile_onnx_model_vitisai_ep_v4)( + const std::string& model_path, const onnxruntime::Graph& graph, const onnxruntime::ProviderOptions& options, void* status, vaip_core::error_report_func func, const onnxruntime::logging::Logger& logger); + void (*vaip_execution_provider_deletor)(std::vector>*) noexcept = [](std::vector>* p) noexcept { delete p; }; uint32_t (*vaip_get_version)(); void (*create_ep_context_nodes)( const std::vector>& eps, @@ -94,6 +97,15 @@ struct OrtVitisAIEpAPI { void (*profiler_collect)( std::vector& api_events, std::vector& kernel_events); + const char* (*get_compiled_model_compatibility_info)( + const std::vector>* eps, + const void* graph_viewer) = nullptr; + int (*validate_compiled_model_compatibility_info)( + const std::vector>* eps, + const char* compatibility_info, + const void* const* devices, + size_t num_devices, + int* model_compatibility) = nullptr; void (*deinitialize_onnxruntime_vitisai_ep)(); void Ensure() { if (handle_) @@ -126,17 +138,29 @@ struct OrtVitisAIEpAPI { auto status1 = env.GetSymbolFromLibrary(handle_, "compile_onnx_model_vitisai_ep_with_error_handling", (void**)&compile_onnx_model_vitisai_ep_with_error_handling); auto status2 = env.GetSymbolFromLibrary(handle_, "compile_onnx_model_vitisai_ep_with_options", (void**)&compile_onnx_model_with_options); auto status3 = env.GetSymbolFromLibrary(handle_, "compile_onnx_model_vitisai_ep_v3", (void**)&compile_onnx_model_vitisai_ep_v3); - if ((!status1.IsOK()) && (!status2.IsOK()) && (!status3.IsOK())) { + auto status4 = env.GetSymbolFromLibrary(handle_, "compile_onnx_model_vitisai_ep_v4", (void**)&compile_onnx_model_vitisai_ep_v4); + if ((!status1.IsOK()) && (!status2.IsOK()) && (!status3.IsOK()) && (!status4.IsOK())) { ::onnxruntime::LogRuntimeError(0, status2, __FILE__, static_cast(__FUNCTION__), __LINE__); ORT_THROW(status2); } std::ignore = env.GetSymbolFromLibrary(handle_, "vaip_get_version", (void**)&vaip_get_version); std::ignore = env.GetSymbolFromLibrary(handle_, "profiler_collect", (void**)&profiler_collect); + std::ignore = env.GetSymbolFromLibrary(handle_, "get_compiled_model_compatibility_info", (void**)&get_compiled_model_compatibility_info); + std::ignore = env.GetSymbolFromLibrary(handle_, "validate_compiled_model_compatibility_info", (void**)&validate_compiled_model_compatibility_info); ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "create_ep_context_nodes", (void**)&create_ep_context_nodes)); ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_on_run_start", (void**)&vitisai_ep_on_run_start)); ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_set_ep_dynamic_options", (void**)&vitisai_ep_set_ep_dynamic_options)); std::ignore = env.GetSymbolFromLibrary(handle_, "deinitialize_onnxruntime_vitisai_ep", (void**)&deinitialize_onnxruntime_vitisai_ep); + { + typedef void* (*vaip_get_execution_provider_deletor_func_t)(); + vaip_get_execution_provider_deletor_func_t vaip_get_execution_provider_deletor = nullptr; + auto status = env.GetSymbolFromLibrary(handle_, "vaip_get_execution_provider_deletor", + (void**)&vaip_get_execution_provider_deletor); + if (status.IsOK()) { + vaip_execution_provider_deletor = reinterpret_cast(vaip_get_execution_provider_deletor()); + }; + } } void Clear() { if (handle_) { @@ -166,6 +190,42 @@ void profiler_collect( } } +std::string get_compiled_model_compatibility_info( + const std::vector>& eps, + const onnxruntime::GraphViewer& graph_viewer) { + std::string result_str; + if (s_library_vitisaiep.get_compiled_model_compatibility_info) { + const char* result = s_library_vitisaiep.get_compiled_model_compatibility_info(&eps, &graph_viewer); + if (result && result[0] != '\0') { + result_str = result; + } + } + return result_str; +} + +Status validate_compiled_model_compatibility_info( + const std::vector>& eps, + const std::string& compatibility_info, + OrtCompiledModelCompatibility& model_compatibility) { + if (s_library_vitisaiep.validate_compiled_model_compatibility_info) { + // Call with nullptr devices since ORT provider doesn't have device information + int ret_model_compatibility = 0; + int status = s_library_vitisaiep.validate_compiled_model_compatibility_info( + &eps, + compatibility_info.c_str(), + nullptr, // devices - not available + 0, // num_devices + &ret_model_compatibility); + if (status == 0) { + model_compatibility = static_cast(ret_model_compatibility); + return Status::OK(); + } + } + // Default to NOT_APPLICABLE + model_compatibility = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE; + return Status::OK(); +} + void change_status_with_error(void* status_ptr, int error_code, const char* error_msg) { auto status = reinterpret_cast(status_ptr); *status = Status(onnxruntime::common::ONNXRUNTIME, error_code, error_msg); @@ -174,10 +234,19 @@ void change_status_with_error(void* status_ptr, int error_code, const char* erro vaip_core::DllSafe>> compile_onnx_model( const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::logging::Logger& logger, const onnxruntime::ProviderOptions& options) { auto model_path = graph_viewer.ModelPath(); - if (s_library_vitisaiep.compile_onnx_model_vitisai_ep_v3) { + auto vaip_execution_provider_deletor = s_library_vitisaiep.vaip_execution_provider_deletor; + if (s_library_vitisaiep.compile_onnx_model_vitisai_ep_v4) { + Status status = Status::OK(); + auto status_ptr = reinterpret_cast(&status); + auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_v4(model_path.u8string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error, logger), vaip_execution_provider_deletor); + if (!status.IsOK()) { + ORT_THROW(status); + } + return ret; + } else if (s_library_vitisaiep.compile_onnx_model_vitisai_ep_v3) { Status status = Status::OK(); auto status_ptr = reinterpret_cast(&status); - auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_v3(model_path, graph_viewer.GetGraph(), options, status_ptr, change_status_with_error)); + auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_v3(model_path.u8string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error), vaip_execution_provider_deletor); if (!status.IsOK()) { ORT_THROW(status); } @@ -185,13 +254,13 @@ vaip_core::DllSafe>> c } else if (s_library_vitisaiep.compile_onnx_model_vitisai_ep_with_error_handling) { Status status = Status::OK(); auto status_ptr = reinterpret_cast(&status); - auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_with_error_handling(model_path.u8string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error)); + auto ret = vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_vitisai_ep_with_error_handling(model_path.u8string(), graph_viewer.GetGraph(), options, status_ptr, change_status_with_error), vaip_execution_provider_deletor); if (!status.IsOK()) { ORT_THROW(status); } return ret; } else { - return vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_with_options(model_path.u8string(), graph_viewer.GetGraph(), options)); + return vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_with_options(model_path.u8string(), graph_viewer.GetGraph(), options), vaip_execution_provider_deletor); } } @@ -317,7 +386,6 @@ void deinitialize_vitisai_ep() { s_domains_vitisaiep.clear(); s_library_vitisaiep.Clear(); - s_kernel_registry_vitisaiep.reset(); } static void set_version_info(vaip_core::OrtApiForVaip& api) { @@ -498,6 +566,7 @@ vaip_core::OrtApiForVaip* create_org_api_hook() { the_global_api.tensor_proto_get_shape_unsafe = vaip::tensor_proto_get_shape; the_global_api.tensor_proto_data_type = [](const ONNX_NAMESPACE::TensorProto& t) -> int { return t.data_type(); }; the_global_api.tensor_proto_delete = [](ONNX_NAMESPACE::TensorProto* tp) { delete tp; }; + the_global_api.tensor_proto_new_bool = vaip::tensor_proto_new_bool; the_global_api.tensor_proto_new_i4 = vaip::tensor_proto_new_i4; the_global_api.tensor_proto_new_i8 = vaip::tensor_proto_new_i8; the_global_api.tensor_proto_new_i16 = vaip::tensor_proto_new_i16; @@ -588,3 +657,53 @@ vaip_core::OrtApiForVaip* create_org_api_hook() { return &the_global_api; } } + +struct ExternalEpLibaray { + ExternalEpLibaray(const std::string& libray_name) : libray_name_{libray_name} { + Ensure(); + } + onnxruntime::Provider* (*get_provider_api)(); + void (*create_ep_factories)(void*, const OrtApiBase*, void*, OrtEpFactory**, size_t, size_t*); + void (*set_session_option)(OrtSessionOptions*); + + void Ensure() { + if (handle_) + return; + auto& env = Provider_GetHost()->Env__Default(); + auto library_filename = PathString(LIBRARY_PREFIX) + PathString(libray_name_.begin(), libray_name_.end()) + LIBRARY_EXTENSION; + auto full_path = env.GetRuntimePath() + library_filename; + ORT_THROW_IF_ERROR(env.LoadDynamicLibrary(full_path, true, &handle_)); + ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "GetProvider", (void**)&get_provider_api)); + } + + void Clear() { + if (handle_) { + auto& env = Provider_GetHost()->Env__Default(); + auto status = env.UnloadDynamicLibrary(handle_); + vai_assert(status.IsOK(), status.ErrorMessage()); + handle_ = nullptr; + } + } + + private: + std::string libray_name_; + void* handle_{}; +}; +static std::unordered_map> g_external_ep_libaries; + +std::unique_ptr +CreateExecutionProviderFromAnotherEp(const std::string& lib, const OrtSessionOptions& session_options, + std::unordered_map& provider_options) { + auto it = g_external_ep_libaries.find(lib); + if (it == g_external_ep_libaries.end()) { + it = g_external_ep_libaries.emplace(lib, std::make_unique(lib)).first; + } + auto ep_lib = it->second.get(); + auto get_provider_func = ep_lib->get_provider_api; + auto provider = get_provider_func(); + std::unique_ptr ret; + provider->Initialize(); + std::ignore = provider->CreateIExecutionProvider(nullptr, nullptr, 0, const_cast(provider_options), session_options, *((OrtLogger*)nullptr), ret); + + return ret; +} \ No newline at end of file diff --git a/onnxruntime/core/providers/vitisai/imp/graph.cc b/onnxruntime/core/providers/vitisai/imp/graph.cc index 028ee7fa8c5ce..9e2efac73a20d 100644 --- a/onnxruntime/core/providers/vitisai/imp/graph.cc +++ b/onnxruntime/core/providers/vitisai/imp/graph.cc @@ -248,12 +248,6 @@ Node& graph_fuse(Graph& graph, const std::string& name, indexed_subgraph->SetMetaDef(std::move(meta_def)); auto& fused_node = graph.FuseSubGraph(*indexed_subgraph, name); - auto function_body = fused_node.GetFunctionBody(); - if (function_body) { - auto proto = function_body->Body().ToGraphProto(); - *proto->mutable_name() = name; - fused_node.AddAttribute("body", *proto); - } for (auto&& o : fused_node.OutputDefs()) { graph.UpdateProducerNode(o->Name(), fused_node.Index()); } @@ -285,7 +279,7 @@ Model* model_clone(const Model& original_model, int64_t external_data_threshold) } for (auto& node : original_graph.Nodes()) { auto* node_proto = graph_proto->add_node(); - node->ToProto(*node_proto, false); + node->ToProto(*node_proto, true); for (auto output : node->OutputDefs()) { if (output->Exists()) { auto* value_info = graph_proto->mutable_value_info()->Add(); diff --git a/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc b/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc index 2f1478bf1326b..719ca8dd412bf 100644 --- a/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc +++ b/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc @@ -87,6 +87,12 @@ static ONNX_NAMESPACE::TensorProto* tensor_proto_new(const std::string& name, co return tensor_proto.release(); } +ONNX_NAMESPACE::TensorProto* tensor_proto_new_bool(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_BOOL, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} + ONNX_NAMESPACE::TensorProto* tensor_proto_new_i4(const std::string& name, const std::vector& shape, const std::vector& data) { return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT4, diff --git a/onnxruntime/core/providers/vitisai/imp/tensor_proto.h b/onnxruntime/core/providers/vitisai/imp/tensor_proto.h index a7c90ac18b44e..9c35044c43824 100644 --- a/onnxruntime/core/providers/vitisai/imp/tensor_proto.h +++ b/onnxruntime/core/providers/vitisai/imp/tensor_proto.h @@ -37,5 +37,7 @@ ONNX_NAMESPACE::TensorProto* tensor_proto_new_fp16(const std::string& name, cons const std::vector& data); ONNX_NAMESPACE::TensorProto* tensor_proto_new_doubles(const std::string& name, const std::vector& shape, const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_bool(const std::string& name, const std::vector& shape, + const std::vector& data); gsl::span process_ext_address(const ONNX_NAMESPACE::TensorProto& tensor); } // namespace vaip diff --git a/onnxruntime/core/providers/vitisai/include/vaip/dll_safe.h b/onnxruntime/core/providers/vitisai/include/vaip/dll_safe.h index 27bc3ab63187c..a18902c5404be 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/dll_safe.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/dll_safe.h @@ -17,7 +17,9 @@ class DllSafe { : value_{value}, deleter_{[](T* value) noexcept { std::default_delete()(value); }} {} - + explicit DllSafe(T* value, void (*deleter)(T*) noexcept) + : value_{value}, deleter_{deleter} { + } explicit DllSafe(T&& value) : DllSafe(new T(std::move(value))) {} explicit DllSafe(const T& value) : DllSafe(new T(value)) {} diff --git a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h index 7791ea430054a..6ebec16a4e0dd 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h @@ -6,10 +6,12 @@ #define ORT_API_MANUAL_INIT #include "core/session/onnxruntime_cxx_api.h" #include "core/framework/provider_options.h" +#include "core/framework/execution_provider.h" #include "vaip/my_ort.h" #include "vaip/dll_safe.h" #include "vaip/custom_op.h" #include +#include void initialize_vitisai_ep(); void deinitialize_vitisai_ep(); vaip_core::DllSafe>> compile_onnx_model(const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::logging::Logger& logger, const onnxruntime::ProviderOptions& options); @@ -40,3 +42,23 @@ using EventInfo = std::tuple< void profiler_collect( std::vector& api_events, std::vector& kernel_events); +std::unique_ptr +CreateExecutionProviderFromAnotherEp(const std::string& lib, const OrtSessionOptions& session_options, + std::unordered_map& provider_options); + +/** + * Get compiled model compatibility information from execution providers. + * Returns a JSON string containing compatibility metadata, or an empty string if unavailable. + */ +std::string get_compiled_model_compatibility_info( + const std::vector>& eps, + const onnxruntime::GraphViewer& graph_viewer); + +/** + * Validate compiled model compatibility information against current runtime environment. + * The model_compatibility is output parameter for the compatibility result. + */ +Status validate_compiled_model_compatibility_info( + const std::vector>& eps, + const std::string& compatibility_info, + OrtCompiledModelCompatibility& model_compatibility); diff --git a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h index acb258894e11c..6285ff64019cd 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h @@ -13,7 +13,7 @@ struct OrtApi; namespace vaip_core { -#define VAIP_ORT_API_MAJOR (18u) +#define VAIP_ORT_API_MAJOR (19u) #define VAIP_ORT_API_MINOR (0u) #define VAIP_ORT_API_PATCH (0u) struct OrtApiForVaip { @@ -257,6 +257,9 @@ struct OrtApiForVaip { void (*graph_proto_delete)(GraphProto* p); // [107] void (*graph_infer_shapes)(ModelProto& m); // [108] DllSafe (*graph_save_string)(const Graph& graph); // [109] + TensorProto* (*tensor_proto_new_bool)(const std::string& name, + const std::vector& shape, + const std::vector& data); // [110] }; #ifndef USE_VITISAI diff --git a/onnxruntime/core/providers/vitisai/onnxruntime_providers_vitisai.rc b/onnxruntime/core/providers/vitisai/onnxruntime_providers_vitisai.rc new file mode 100644 index 0000000000000..968086ebd2613 --- /dev/null +++ b/onnxruntime/core/providers/vitisai/onnxruntime_providers_vitisai.rc @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// This file REQUIRES the following external definitions: +// FILE_NAME, VER_MAJOR, VER_MINOR, VER_BUILD, VER_PRIVATE, and VER_STRING + +#include + +#if defined(DEBUG) || defined(_DEBUG) +#define VER_DEBUG VS_FF_DEBUG +#else +#define VER_DEBUG 0 +#endif + +// ----------------------------------------------------------------------------- + +VS_VERSION_INFO VERSIONINFO +FILEVERSION VER_MAJOR, VER_MINOR, VER_BUILD, VER_PRIVATE +PRODUCTVERSION VER_MAJOR, VER_MINOR, VER_BUILD, VER_PRIVATE +FILEFLAGSMASK VS_FFI_FILEFLAGSMASK +FILEFLAGS VER_DEBUG +FILEOS VOS__WINDOWS32 +FILETYPE VFT_DLL +FILESUBTYPE VFT2_UNKNOWN + +BEGIN + BLOCK "StringFileInfo" + BEGIN + BLOCK "040904E4" + BEGIN + VALUE "CompanyName", "Microsoft Corporation" + VALUE "FileDescription", "ONNX Runtime VitisAI Provider" + VALUE "FileVersion", VER_STRING + VALUE "InternalName", "ONNX Runtime VitisAI Provider" + VALUE "LegalCopyright", "\251 Microsoft Corporation. All rights reserved." + VALUE "OriginalFilename", FILE_NAME + VALUE "ProductName", "Microsoft\256 Windows\256 Operating System" + VALUE "ProductVersion", VER_STRING + END + END + + BLOCK "VarFileInfo" + BEGIN + VALUE "Translation", 0x409, 1252 + END +END diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index 6cafc0495aa6b..7ea25ea115567 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -145,6 +145,24 @@ std::unique_ptr VitisAIExecutionProvider::GetProfiler() { return std::make_unique(); } +std::string VitisAIExecutionProvider::GetCompiledModelCompatibilityInfo( + const onnxruntime::GraphViewer& graph_viewer) const { + if (!execution_providers_) { + return {}; + } + return get_compiled_model_compatibility_info(**execution_providers_, graph_viewer); +} + +common::Status VitisAIExecutionProvider::ValidateCompiledModelCompatibilityInfo( + const std::string& compatibility_info, + OrtCompiledModelCompatibility& model_compatibility) const { + if (!execution_providers_) { + model_compatibility = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE; + return Status::OK(); + } + return validate_compiled_model_compatibility_info(**execution_providers_, compatibility_info, model_compatibility); +} + std::vector VitisAIExecutionProvider::CreatePreferredAllocators() { std::vector result; // We do not want arena for 4k alignment, as it would not respect alignment. diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h index 8db4f36dd497a..1a20944692d6e 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h @@ -48,6 +48,20 @@ class VitisAIExecutionProvider : public IExecutionProvider { std::vector CreatePreferredAllocators() override; + /** + * Get compiled model compatibility information. + * This method collects compatibility info from all vaip_core execution providers + * and returns it as a JSON string. + */ + std::string GetCompiledModelCompatibilityInfo(const onnxruntime::GraphViewer& graph_viewer) const override; + + /** + * Validate compiled model compatibility information. + * This method validates the compatibility info against the current runtime environment. + */ + common::Status ValidateCompiledModelCompatibilityInfo(const std::string& compatibility_info, + OrtCompiledModelCompatibility& model_compatibility) const override; + private: using my_ep_t = vaip_core::DllSafe>>; using my_ep_uptr_t = std::shared_ptr; diff --git a/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc b/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc index 50f924e468ed0..e1a3ca43e162e 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc @@ -7,7 +7,6 @@ #include #include #include - #include "vaip/global_api.h" #include "./vitisai_execution_provider.h" #include "core/framework/execution_provider.h" @@ -57,6 +56,10 @@ std::unique_ptr VitisAIProviderFactory::CreateProvider(const } } + auto it = provider_options.find("external_ep_libray"); + if (it != provider_options.end()) { + return CreateExecutionProviderFromAnotherEp(it->second, session_options, provider_options); + } auto ep_instance = std::make_unique(provider_options); ep_instance->SetLogger(reinterpret_cast(&session_logger)); return ep_instance; diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc index 9c40eb75780ee..0f9a1e299506e 100644 --- a/onnxruntime/core/session/environment.cc +++ b/onnxruntime/core/session/environment.cc @@ -499,8 +499,13 @@ Status CreateDataTransferForFactory(OrtEpFactory& ep_factory, Status Environment::RegisterExecutionProviderLibrary(const std::string& registration_name, std::unique_ptr ep_library, const std::vector& internal_factories) { + const Env& env = Env::Default(); + env.GetTelemetryProvider().LogRegisterEpLibraryStart(registration_name); + if (ep_libraries_.count(registration_name) > 0) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "library is already registered under ", registration_name); + auto status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "library is already registered under ", registration_name); + env.GetTelemetryProvider().LogRegisterEpLibraryEnd(registration_name, status); + return status; } auto status = Status::OK(); @@ -552,6 +557,7 @@ Status Environment::RegisterExecutionProviderLibrary(const std::string& registra }); } + env.GetTelemetryProvider().LogRegisterEpLibraryEnd(registration_name, status); return status; } @@ -571,6 +577,9 @@ Status Environment::CreateAndRegisterInternalEps() { Status Environment::RegisterExecutionProviderLibrary(const std::string& registration_name, const ORTCHAR_T* lib_path) { std::lock_guard lock{mutex_}; + std::string lib_file_name = std::filesystem::path(lib_path).filename().string(); + Env::Default().GetTelemetryProvider().LogRegisterEpLibraryWithLibPath(registration_name, lib_file_name); + std::vector internal_factories = {}; std::unique_ptr ep_library; diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index e3291cdce62c5..6323c818bc56a 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -730,6 +730,25 @@ InferenceSession::InferenceSession(const SessionOptions& session_options, const #endif // !defined(ORT_MINIMAL_BUILD) InferenceSession::~InferenceSession() { + // Flush any remaining RuntimePerf counters + ORT_TRY { + std::lock_guard telemetry_lock(telemetry_mutex_); + if (telemetry_.total_runs_since_last_ > 0) { + Env::Default().GetTelemetryProvider().LogRuntimePerf(session_id_, + telemetry_.total_runs_since_last_, + telemetry_.total_run_duration_since_last_, + telemetry_.duration_per_batch_size_); + } + } + ORT_CATCH(const std::exception& e) { + ORT_HANDLE_EXCEPTION([&]() { + LOGS(*session_logger_, ERROR) << "Error during telemetry flush: " << e.what(); + }); + } + ORT_CATCH(...) { + LOGS(*session_logger_, ERROR) << "Unknown error during telemetry flush"; + } + if (session_options_.enable_profiling) { ORT_TRY { EndProfiling(); @@ -969,7 +988,10 @@ common::Status InferenceSession::LoadWithLoader(std::function l(session_mutex_); if (is_model_loaded_) { // already loaded LOGS(*session_logger_, ERROR) << "This session already contains a loaded model."; @@ -1005,6 +1027,8 @@ common::Status InferenceSession::LoadWithLoader(std::function load_ort_format_model_bytes) { + const Env& env = Env::Default(); + env.GetTelemetryProvider().LogModelLoadStart(session_id_); + std::lock_guard l(session_mutex_); if (is_model_loaded_) { // already loaded @@ -1718,6 +1745,8 @@ Status InferenceSession::LoadOrtModelWithLoader(std::function load_ort is_model_loaded_ = true; + env.GetTelemetryProvider().LogModelLoadEnd(session_id_, Status::OK()); + return Status::OK(); } @@ -2562,6 +2591,12 @@ common::Status InferenceSession::Initialize() { } } + // Log session creation end telemetry + { + const Env& init_env = Env::Default(); + init_env.GetTelemetryProvider().LogSessionCreationEnd(session_id_, status); + } + return status; } #if defined(_MSC_VER) && !defined(__clang__) @@ -3111,24 +3146,31 @@ Status InferenceSession::Run(const RunOptions& run_options, break; } - // time to send telemetry? - { - // Adding lock_guard here to ensure that telemetry updates are thread-safe. - std::lock_guard telemetry_lock(telemetry_mutex_); - ++telemetry_.total_runs_since_last_; - telemetry_.total_run_duration_since_last_ += TimeDiffMicroSeconds(tp); - telemetry_.duration_per_batch_size_[batch_size] += TimeDiffMicroSeconds(tp); - - if (TimeDiffMicroSeconds(telemetry_.time_sent_last_) > Telemetry::kDurationBetweenSending) { - // send the telemetry - env.GetTelemetryProvider().LogRuntimePerf(session_id_, telemetry_.total_runs_since_last_, - telemetry_.total_run_duration_since_last_, - telemetry_.duration_per_batch_size_); - // reset counters - telemetry_.time_sent_last_ = std::chrono::high_resolution_clock::now(); - telemetry_.total_runs_since_last_ = 0; - telemetry_.total_run_duration_since_last_ = 0; - telemetry_.duration_per_batch_size_.clear(); + // Only include successful inferences in batch since failed inferences can skew the metric + if (retval.IsOK()) { + // time to send telemetry? + { + // Adding lock_guard here to ensure that telemetry updates are thread-safe. + std::lock_guard telemetry_lock(telemetry_mutex_); + ++telemetry_.total_runs_since_last_; + telemetry_.total_run_duration_since_last_ += TimeDiffMicroSeconds(tp); + telemetry_.duration_per_batch_size_[batch_size] += TimeDiffMicroSeconds(tp); + + // Emit RuntimePerf on scheduled interval + if ((TimeDiffMicroSeconds(telemetry_.time_sent_last_) > telemetry_.runtime_perf_interval_)) { + env.GetTelemetryProvider().LogRuntimePerf(session_id_, telemetry_.total_runs_since_last_, + telemetry_.total_run_duration_since_last_, + telemetry_.duration_per_batch_size_); + // reset counters + telemetry_.time_sent_last_ = std::chrono::high_resolution_clock::now(); + telemetry_.total_runs_since_last_ = 0; + telemetry_.total_run_duration_since_last_ = 0; + telemetry_.duration_per_batch_size_.clear(); + + // Double the interval, capping at kRuntimePerfMaxInterval + telemetry_.runtime_perf_interval_ = std::min(telemetry_.runtime_perf_interval_ * 2, + Telemetry::kRuntimePerfMaxInterval); + } } } diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index 8bea15c169ed4..fe36040f313b6 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -947,8 +947,10 @@ class InferenceSession { std::unordered_map duration_per_batch_size_; // the duration (us) of Run() calls per batch size since the last report TimePoint time_sent_last_; // the TimePoint of the last report - // Event Rate per provider < 20 peak events per second - constexpr static long long kDurationBetweenSending = 1000 * 1000 * 60 * 10; // duration in (us). send a report every 10 mins + // RuntimePerf backoff interval: starts at 2s between emissions, doubles each emission, caps at 10 min + constexpr static int64_t kRuntimePerfInitialInterval = 2 * 1000 * 1000; // 2 seconds in (us) + constexpr static int64_t kRuntimePerfMaxInterval = 1000 * 1000 * 60 * 10; // 10 minutes in (us) + int64_t runtime_perf_interval_ = kRuntimePerfInitialInterval; } telemetry_; mutable std::mutex telemetry_mutex_; // to ensure thread-safe access to telemetry data diff --git a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc index 4b586e24c9bd3..5a2dc18bb2630 100644 --- a/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc +++ b/onnxruntime/test/contrib_ops/gather_block_quantized_op_test.cc @@ -82,7 +82,7 @@ void CheckDataAndShape(const std::vector& data, const std::vector& s ORT_ENFORCE(static_cast(data.size()) == total_elements, "Data size does not match the shape", "Data size: ", data.size(), ", Expected size: ", total_elements, - ", Shape: ", VectorToString(shape), " Name:", name, " Type:", typeid(T).name()); + ", Shape: ", VectorToString(shape), " Name:", name); } // Combinations: types, gather_axis, quantize_axis, block_size, indices, scale shape vs data shape diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index 4b37b6c9438aa..3b33526a36ec4 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -72,7 +72,8 @@ void BaseTester::AddInitializers(onnxruntime::Graph& graph) { tensor_proto.add_string_data(string_data[i]); } } else { - auto buffer_size = tensor.DataType()->Size() * shape.Size(); + // Use CalculateTensorStorageSize to properly handle sub-byte types (e.g., Int4) + auto buffer_size = Tensor::CalculateTensorStorageSize(tensor.DataType(), shape); utils::SetRawDataInTensorProto(tensor_proto, tensor.DataRaw(), buffer_size); } diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index 0008b68d14f41..5d09bade3b10b 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -937,6 +937,56 @@ TEST(MathOpTest, Div_uint64) { test.Run(); } +TEST(MathOpTest, Div_int8_by_zero) { + OpTester test("Div", 14); + test.AddInput("A", {3}, {4, 8, 8}); + test.AddInput("B", {3}, {1, 0, 2}); + test.AddOutput("C", {3}, {0, 0, 0}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectFailure, + "Integer division by zero", + {}, nullptr, &execution_providers); +} + +TEST(MathOpTest, Div_int32_by_zero) { + OpTester test("Div"); + test.AddInput("A", {3}, {4, 8, 8}); + test.AddInput("B", {3}, {1, 0, 2}); + test.AddOutput("C", {3}, {0, 0, 0}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectFailure, + "Integer division by zero", + {}, nullptr, &execution_providers); +} + +TEST(MathOpTest, Div_int64_by_zero_scalar) { + // Scalar divisor of 0 (the exact scenario from the bug report) + OpTester test("Div"); + test.AddInput("A", {3}, {4, 8, 8}); + test.AddInput("B", {}, {0}); + test.AddOutput("C", {3}, {0, 0, 0}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectFailure, + "Integer division by zero", + {}, nullptr, &execution_providers); +} + +TEST(MathOpTest, Div_int32_by_zero_constant_initializer) { + // Divisor is a constant initializer — validated once at kernel creation time + OpTester test("Div"); + test.AddInput("A", {3}, {4, 8, 8}); + test.AddInput("B", {3}, {1, 0, 2}, true); // is_initializer = true + test.AddOutput("C", {3}, {0, 0, 0}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectFailure, + "Integer division by zero", + {}, nullptr, &execution_providers); +} + TEST(MathOpTest, Div_float) { OpTester test("Div"); std::vector dims{2, 3}; diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index cf49601e6c671..0136e5e0f8e04 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -678,7 +678,13 @@ ::std::vector<::std::basic_string> GetParameterStrings() { ORT_TSTR("fp16_coreml_FNS-Candy"), ORT_TSTR("fp16_test_tiny_yolov2"), ORT_TSTR("fp16_test_shufflenet"), - ORT_TSTR("keras2coreml_SimpleRNN_ImageNet")}; + ORT_TSTR("keras2coreml_SimpleRNN_ImageNet"), + // models from model zoo. #26274: cuDNN frontend no valid engine + ORT_TSTR("YOLOv3"), + ORT_TSTR("YOLOv3-12"), + ORT_TSTR("YOLOv4"), + ORT_TSTR("SSD-MobilenetV1"), + ORT_TSTR("SSD-MobilenetV1-12")}; // For ROCm EP, also disable the following tests due to flakiness, // mainly with precision issue and random memory access fault. static const ORTCHAR_T* rocm_disabled_tests[] = {ORT_TSTR("bvlc_alexnet"), diff --git a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc index 8fdbf0060eaa0..5183cdb352717 100644 --- a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc @@ -59,6 +59,47 @@ TEST(DequantizeLinearOpTest, Int8_Large) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kWebGpuExecutionProvider}); } +TEST(DequantizeLinearOpTest, Int4_LargeInitializerInput) { + OpTester test("DequantizeLinear", 21); + std::vector dims{1024}; + + std::vector x_vals(Int4x2::CalcNumInt4Pairs(static_cast(dims[0])), Int4x2{}); + std::vector expected_y_vals(static_cast(dims[0]), 0.f); + + test.AddInput("x", dims, x_vals, true); + test.AddInput("x_scale", {}, {1.0f}); + test.AddInput("x_zero_point", {}, {Int4x2(0, 0)}); + test.AddOutput("y", dims, expected_y_vals); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +// Regression test: int8 tensor whose byte size is not a multiple of 4. +// DML graph fusion rounds tensor sizes to a multiple of 4 via AlignToPow2. +// If the original buffer is not padded, the subsequent memcpy reads past the +// allocation boundary (heap-buffer-overflow detectable with ASan). +// Mirrors the WebNN PoC: dequantizeLinear with int8[135] (135 % 4 != 0). +TEST(DequantizeLinearOpTest, Int8_NonAlignedSize_Initializer) { + OpTester test("DequantizeLinear", 10); + constexpr int64_t kNumElements = 135; // 135 bytes, AlignToPow2(135,4)=136 + + std::vector x_data(kNumElements); + std::vector y_expected(kNumElements); + const float scale = 0.5f; + const int8_t zero_point = 0; + for (int64_t i = 0; i < kNumElements; ++i) { + x_data[i] = static_cast(i % 127); + y_expected[i] = (x_data[i] - zero_point) * scale; + } + + // Mark all inputs as initializers so they go through DML's ProcessInputData + // → UnpackInitializer → AlignToPow2 code path during graph fusion. + test.AddInput("x", {kNumElements}, x_data, /*is_initializer=*/true); + test.AddInput("x_scale", {1}, {scale}, /*is_initializer=*/true); + test.AddInput("x_zero_point", {1}, {zero_point}, /*is_initializer=*/true); + test.AddOutput("y", {kNumElements}, y_expected); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} // scalar zero & scale with int4 TEST(DequantizeLinearOpTest, Int4) { OpTester test("DequantizeLinear", 21); @@ -417,6 +458,90 @@ TEST(QuantizeLinearOpTest, Int8) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } +// Repro for new-delete-type-mismatch in DML EP during graph fusion. +// QuantizeLinear float32→int8 with 5D input triggers a type-size +// mismatch (192 bytes allocated, 1 byte deallocated) visible under ASan. +TEST(QuantizeLinearOpTest, Int8_5D_DML_TypeMismatch) { + auto dml_ep = DefaultDmlExecutionProvider(); + if (!dml_ep) { + GTEST_SKIP() << "Skipping because DML EP is not available."; + } + + OpTester test("QuantizeLinear", 13); + std::vector dims{6, 1, 1, 1, 1}; + test.AddInput("x", dims, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); + test.AddInput("y_scale", {}, {1.0f}); + test.AddInput("y_zero_point", {}, {0}); + test.AddOutput("y", dims, {1, 2, 3, 4, 5, 6}); + + std::vector> execution_providers; + execution_providers.emplace_back(std::move(dml_ep)); + test.ConfigEps(std::move(execution_providers)) + .RunWithConfig(); +} + +// Same as above but with per-axis quantization along axis 0 to exercise +// the DML graph fusion path with per-channel int8 quantization. +TEST(QuantizeLinearOpTest, Int8_5D_PerAxis_DML_TypeMismatch) { + auto dml_ep = DefaultDmlExecutionProvider(); + if (!dml_ep) { + GTEST_SKIP() << "Skipping because DML EP is not available."; + } + + OpTester test("QuantizeLinear", 13); + std::vector dims{6, 1, 1, 1, 1}; + test.AddAttribute("axis", 0); + test.AddInput("x", dims, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); + test.AddInput("y_scale", {6}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + test.AddInput("y_zero_point", {6}, {0, 0, 0, 0, 0, 0}); + test.AddOutput("y", dims, {1, 2, 3, 4, 5, 6}); + + std::vector> execution_providers; + execution_providers.emplace_back(std::move(dml_ep)); + test.ConfigEps(std::move(execution_providers)) + .RunWithConfig(); +} + +// Opset 21 QuantizeLinear float32→uint8 WITHOUT zero_point. +// Without zero_point, the output type defaults to uint8. +TEST(QuantizeLinearOpTest, Uint8_5D_NoZeroPoint_Opset21_DML) { + auto dml_ep = DefaultDmlExecutionProvider(); + if (!dml_ep) { + GTEST_SKIP() << "Skipping because DML EP is not available."; + } + + OpTester test("QuantizeLinear", 21); + std::vector dims{6, 1, 1, 1, 1}; + test.AddInput("x", dims, {0.0f, 51.0f, 102.0f, 153.0f, 204.0f, 255.0f}); + test.AddInput("y_scale", {}, {1.0f}); + test.AddOutput("y", dims, {0, 51, 102, 153, 204, 255}); + + std::vector> execution_providers; + execution_providers.emplace_back(std::move(dml_ep)); + test.ConfigEps(std::move(execution_providers)) + .RunWithConfig(); +} + +// Opset 21 QuantizeLinear float32→int8 with zero_point (the customer's exact scenario). +TEST(QuantizeLinearOpTest, Int8_5D_WithZeroPoint_Opset21_DML) { + auto dml_ep = DefaultDmlExecutionProvider(); + if (!dml_ep) { + GTEST_SKIP() << "Skipping because DML EP is not available."; + } + + OpTester test("QuantizeLinear", 21); + std::vector dims{6, 1, 1, 1, 1}; + test.AddInput("x", dims, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); + test.AddInput("y_scale", {}, {1.0f}); + test.AddInput("y_zero_point", {}, {0}); + test.AddOutput("y", dims, {1, 2, 3, 4, 5, 6}); + + std::vector> execution_providers; + execution_providers.emplace_back(std::move(dml_ep)); + test.ConfigEps(std::move(execution_providers)) + .RunWithConfig(); +} + // Test uint16 QuantizeLinear (per tensor) TEST(QuantizeLinearOpTest, Uint16) { OpTester test("QuantizeLinear", 21); diff --git a/onnxruntime/test/providers/dml_safe_make_or_throw_test.cc b/onnxruntime/test/providers/dml_safe_make_or_throw_test.cc new file mode 100644 index 0000000000000..8041f0dae8c28 --- /dev/null +++ b/onnxruntime/test/providers/dml_safe_make_or_throw_test.cc @@ -0,0 +1,139 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifdef USE_DML + +#include "gtest/gtest.h" + +#include +#include +#include "core/providers/dml/DmlExecutionProvider/src/SafeMakeOrThrow.h" + +#include + +namespace onnxruntime { +namespace test { + +// A trivial COM interface for testing. +MIDL_INTERFACE("A1B2C3D4-E5F6-7890-ABCD-EF1234567890") +ITestInterface : public IUnknown { + virtual int STDMETHODCALLTYPE GetValue() = 0; +}; + +// A RuntimeClass whose constructor succeeds and stores a value. +class SucceedingClass : public Microsoft::WRL::RuntimeClass< + Microsoft::WRL::RuntimeClassFlags, ITestInterface> { + public: + int value; + + SucceedingClass(int v) : value(v) {} + + int STDMETHODCALLTYPE GetValue() override { return value; } +}; + +// A RuntimeClass that tracks whether its destructor ran. +class TrackedClass : public Microsoft::WRL::RuntimeClass< + Microsoft::WRL::RuntimeClassFlags, ITestInterface> { + public: + bool& destroyed; + + TrackedClass(bool& flag) : destroyed(flag) { destroyed = false; } + ~TrackedClass() { destroyed = true; } + + int STDMETHODCALLTYPE GetValue() override { return 42; } +}; + +// A RuntimeClass whose constructor always throws. +// Uses a ref-counted witness to verify cleanup: the witness is destroyed +// (via Release) during stack unwinding if memory is freed correctly. +class ThrowingClass : public Microsoft::WRL::RuntimeClass< + Microsoft::WRL::RuntimeClassFlags, ITestInterface> { + public: + Microsoft::WRL::ComPtr witness; + + ThrowingClass(bool& witness_destroyed) { + // Create a witness that will be destroyed when this object's members + // are cleaned up during stack unwinding. + witness = Dml::SafeMakeOrThrow(witness_destroyed); + throw std::runtime_error("intentional throw"); + } + + int STDMETHODCALLTYPE GetValue() override { return -1; } +}; + +// Verify that SafeMakeOrThrow creates an object with ref count 1, +// and that the object is properly released when the ComPtr goes out of scope. +TEST(SafeMakeOrThrowTest, SuccessPath_RefCountIsOne) { + Microsoft::WRL::ComPtr obj = Dml::SafeMakeOrThrow(123); + + ASSERT_NE(obj.Get(), nullptr); + EXPECT_EQ(obj->GetValue(), 123); + + // AddRef/Release to observe ref count: AddRef returns new count. + unsigned long refAfterAdd = obj->AddRef(); + EXPECT_EQ(refAfterAdd, 2u); + + unsigned long refAfterRelease = obj->Release(); + EXPECT_EQ(refAfterRelease, 1u); +} + +// Verify that the object is destroyed when the last ComPtr releases it. +TEST(SafeMakeOrThrowTest, SuccessPath_DestructorRunsOnRelease) { + bool destroyed = false; + { + auto obj = Dml::SafeMakeOrThrow(destroyed); + EXPECT_FALSE(destroyed); + } + // ComPtr went out of scope — destructor should have run. + EXPECT_TRUE(destroyed); +} + +// Verify that copying the ComPtr increments the ref count and +// the object survives until the last reference is released. +TEST(SafeMakeOrThrowTest, SuccessPath_MultipleReferences) { + bool destroyed = false; + Microsoft::WRL::ComPtr copy; + { + auto obj = Dml::SafeMakeOrThrow(destroyed); + copy = obj; + EXPECT_FALSE(destroyed); + } + // Original ComPtr gone, but copy still holds a reference. + EXPECT_FALSE(destroyed); + + copy.Reset(); + EXPECT_TRUE(destroyed); +} + +// Verify that when the constructor throws, the exception propagates +// and sub-objects are properly cleaned up (no leak). +TEST(SafeMakeOrThrowTest, FailurePath_ConstructorThrows) { + bool witness_destroyed = false; + EXPECT_THROW( + Dml::SafeMakeOrThrow(witness_destroyed), + std::runtime_error); + // The witness ComPtr member was constructed before the throw. + // If cleanup worked correctly, the witness should have been destroyed + // when the ThrowingClass sub-objects were unwound. + EXPECT_TRUE(witness_destroyed); +} + +// Verify that QI works correctly on a SafeMakeOrThrow-created object. +TEST(SafeMakeOrThrowTest, SuccessPath_QueryInterface) { + auto obj = Dml::SafeMakeOrThrow(42); + + Microsoft::WRL::ComPtr unk; + HRESULT hr = obj.As(&unk); + EXPECT_EQ(hr, S_OK); + EXPECT_NE(unk.Get(), nullptr); + + Microsoft::WRL::ComPtr iface; + hr = unk.As(&iface); + EXPECT_EQ(hr, S_OK); + EXPECT_EQ(iface->GetValue(), 42); +} + +} // namespace test +} // namespace onnxruntime + +#endif // USE_DML