diff --git a/.github/actions/setup-android-ndk/action.yml b/.github/actions/setup-android-ndk/action.yml
new file mode 100644
index 0000000000000..fea9745396e81
--- /dev/null
+++ b/.github/actions/setup-android-ndk/action.yml
@@ -0,0 +1,98 @@
+# .github/actions/setup-android-ndk/action.yml
+name: 'Setup Android NDK'
+description: 'Installs and configures a specific version of the Android NDK'
+inputs:
+  ndk-version:
+    description: 'The version of the Android NDK to install (e.g., 27.2.12479018)'
+    required: true
+    default: '27.2.12479018'
+  android-sdk-root:
+    description: 'The root directory of the Android SDK'
+    required: true
+    default: '/usr/local/lib/android/sdk'
+
+runs:
+  using: "composite" # Use a composite action for multiple shell commands
+  steps:
+    - name: Install coreutils and ninja
+      shell: bash
+      run: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build
+
+    - name: Install Android NDK
+      shell: bash
+      run: |
+        set -e
+        "${{ inputs.android-sdk-root }}/cmdline-tools/latest/bin/sdkmanager" --install "ndk;${{ inputs.ndk-version }}"
+
+        NDK_PATH="${{ inputs.android-sdk-root }}/ndk/${{ inputs.ndk-version }}"
+        if [[ ! -d "${NDK_PATH}" ]]; then
+          echo "NDK directory is not in the expected location: ${NDK_PATH}"
+          exit 1
+        fi
+
+        # Use standard environment variable setting in bash and add to GITHUB_ENV
+        echo "ANDROID_NDK_HOME=${NDK_PATH}" >> $GITHUB_ENV
+        echo "ANDROID_NDK_ROOT=${NDK_PATH}" >> $GITHUB_ENV
+        echo "ANDROID_NDK_HOME: ${NDK_PATH}"
+        echo "ANDROID_NDK_ROOT: ${NDK_PATH}"
+
+    - name: Check if emulator is installed and add to PATH
+      shell: bash
+      run: |
+        if [[ ":$PATH:" == *":${ANDROID_SDK_ROOT}/emulator:"* ]]; then
+          echo "${ANDROID_SDK_ROOT}/emulator is in PATH"
+        else
+          ${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/sdkmanager --install "emulator"
+          echo "${ANDROID_SDK_ROOT}/emulator" >> $GITHUB_PATH
+        fi
+
+    - name: Check if platform tools are installed and add to PATH
+      shell: bash
+      run: |
+        if [[ ":$PATH:" == *":${ANDROID_SDK_ROOT}/platform-tools:"* ]]; then
+          echo "${ANDROID_SDK_ROOT}/platform-tools is in PATH"
+        else
+          ${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/sdkmanager --install "platform-tools"
+          echo "${ANDROID_SDK_ROOT}/platform-tools" >> $GITHUB_PATH
+        fi
+        ls -R "${ANDROID_SDK_ROOT}/platform-tools"
+
+    - name: Create Android Emulator
+      shell: bash
+      env:
+        ANDROID_AVD_HOME: ${{ runner.temp }}/android-avd
+      run: |
+        python3 tools/python/run_android_emulator.py \
+          --android-sdk-root "${ANDROID_SDK_ROOT}" \
+          --create-avd --system-image "system-images;android-31;default;x86_64"
+
+    - name: List Android AVDs
+      shell: bash
+      env:
+        ANDROID_AVD_HOME: ${{ runner.temp }}/android-avd
+      run: |
+        "${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/avdmanager" list avd
+
+    - name: Check emulator.pid does not exist
+      shell: bash
+      run: |
+        if test -f ./emulator.pid; then
+          echo "Emulator PID file was not expected to exist but does and has pid: `cat ./emulator.pid`"
+          exit 1
+        fi
+
+    - name: Start Android Emulator
+      shell: bash
+      env:
+        ANDROID_AVD_HOME: ${{ runner.temp }}/android-avd
+      run: |
+        set -e -x
+        python3 tools/python/run_android_emulator.py \
+          --android-sdk-root "${ANDROID_SDK_ROOT}" \
+          --start --emulator-extra-args="-partition-size 2047" \
+          --emulator-pid-file ./emulator.pid
+        echo "Emulator PID: `cat ./emulator.pid`"
+
+    - name: View Android ENVs
+      shell: bash
+      run: env | grep ANDROID
\ No newline at end of file
diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml
new file mode 100644
index 0000000000000..64c40946c49c5
--- /dev/null
+++ b/.github/workflows/android.yml
@@ -0,0 +1,147 @@
+name: Android CI
+# This workflow is used to build and test on Android Emulator on Linux
+
+on:
+  push:
+    branches:
+      - main
+      - rel-*
+  pull_request:
+    branches:
+      - main
+      - rel-*
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
+jobs:
+  android_nnapi_ep:
+    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Use jdk 17
+        uses: actions/setup-java@v4
+        with:
+          distribution: 'temurin'
+          java-version: '17'
+          architecture: x64
+
+      - name: Setup Android NDK
+        uses: ./.github/actions/setup-android-ndk
+        with:
+          ndk-version: 27.2.12479018
+
+      - name: Export GitHub Actions cache environment variables
+        uses: actions/github-script@v7
+        with:
+          script: |
+            core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
+            core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
+
+      - name: NNAPI EP, Build, Test on Android Emulator
+        run: >-
+          python3 tools/ci_build/build.py
+          --enable_lto
+          --android
+          --build_dir build_nnapi
+          --android_sdk_path "$ANDROID_HOME"
+          --android_ndk_path "$ANDROID_NDK_HOME"
+          --android_abi=x86_64
+          --android_api=29
+          --skip_submodule_sync
+          --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache
+          --use_nnapi
+          --build_shared_lib
+          --cmake_generator=Ninja
+          --build_java
+        shell: bash
+
+
+      - name: Build Minimal ORT with NNAPI and run tests
+        run: tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh "$(pwd)"
+        shell: bash
+
+      - name: Install psutil for emulator shutdown by run_android_emulator.py
+        if: always()
+        run: python3 -m pip install psutil
+        shell: bash
+
+      - name: Stop Android Emulator
+        if: always()
+        run: |
+          env | grep ANDROID
+          if test -f ${{ github.workspace }}/emulator.pid; then
+            echo "Emulator PID:"`cat ${{ github.workspace }}/emulator.pid`
+            python3 tools/python/run_android_emulator.py \
+              --android-sdk-root "${ANDROID_SDK_ROOT}" \
+              --stop \
+              --emulator-pid-file ${{ github.workspace }}/emulator.pid
+            rm ${{ github.workspace }}/emulator.pid
+          else
+            echo "Emulator PID file was expected to exist but does not."
+          fi
+        shell: bash
+
+  android_cpu_ep:
+    name: Android CI Pipeline
+    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Use jdk 17
+        uses: actions/setup-java@v4
+        with:
+          distribution: 'temurin'
+          java-version: '17'
+          architecture: x64
+
+      - name: Setup Android NDK
+        uses: ./.github/actions/setup-android-ndk
+        with:
+          ndk-version: 27.2.12479018
+
+      - name: Export GitHub Actions cache environment variables
+        uses: actions/github-script@v7
+        with:
+          script: |
+            core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
+            core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
+
+      - name: CPU EP, Build and Test
+        run: >-
+          python3 tools/ci_build/build.py
+          --enable_lto
+          --android
+          --build_dir build
+          --android_sdk_path $ANDROID_HOME
+          --android_ndk_path $ANDROID_NDK_HOME
+          --android_abi=x86_64
+          --android_api=30
+          --skip_submodule_sync
+          --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache
+          --cmake_generator=Ninja
+          --build_java
+        shell: bash
+
+      - name: Install psutil for emulator shutdown by run_android_emulator.py
+        if: always()
+        run: python3 -m pip install psutil
+        shell: bash
+
+      - name: Stop Android Emulator
+        if: always()
+        run: |
+          if test -f ${{ github.workspace }}/emulator.pid; then
+            echo "Emulator PID:"`cat ${{ github.workspace }}/emulator.pid`
+            python3 tools/python/run_android_emulator.py \
+              --android-sdk-root "${ANDROID_SDK_ROOT}" \
+              --stop \
+              --emulator-pid-file ${{ github.workspace }}/emulator.pid
+            rm ${{ github.workspace }}/emulator.pid
+          else
+            echo "Emulator PID file was expected to exist but does not."
+          fi
+        shell: bash
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 50dd25898ad35..86b1cd5ee90e7 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -154,6 +154,46 @@ jobs:
           --use_xnnpack \
           --use_binskim_compliant_compile_flags
 
+  ARM64-Xcode16-webgpu:
+    runs-on: macos-15
+
+    env:
+      xcode_version: 16
+
+    timeout-minutes: 60
+
+    steps:
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.python_version }}
+
+      - name: Verify ARM64 machine
+        shell: python
+        run: |
+          import platform
+          assert platform.machine() == "arm64", "This job expects to be run on an ARM64 machine."
+
+      - name: Use Xcode ${{ env.xcode_version }}
+        shell: bash
+        run: |
+          XCODE_DEVELOPER_DIR="/Applications/Xcode_${{ env.xcode_version }}.app/Contents/Developer"
+          sudo xcode-select --switch "${XCODE_DEVELOPER_DIR}"
+
+      - uses: actions/checkout@v4
+
+      - name: Build and test
+        shell: bash
+        run: |
+          python ./tools/ci_build/build.py \
+            --build_dir ./build \
+            --update \
+            --build --parallel \
+            --test \
+            --build_shared_lib \
+            --build_nodejs \
+            --use_webgpu \
+            --use_binskim_compliant_compile_flags
+
   ARM64-Xcode16-targeting-iphonesimulator:
     runs-on: macos-15
 
@@ -164,7 +204,7 @@
       matrix:
         target_arch: [x86_64, arm64]
 
-    timeout-minutes: 75
+    timeout-minutes: 90
 
     steps:
       - uses: actions/setup-python@v5
diff --git a/cmake/CMakePresets.json b/cmake/CMakePresets.json
index 4987edaf85513..8d63912f6eaee 100644
--- a/cmake/CMakePresets.json
+++ b/cmake/CMakePresets.json
@@ -109,6 +109,29 @@
         "rhs": "Darwin"
       }
     },
+    {
+      "name": "arm64-osx",
+      "inherits": [
+        "unit-test"
+      ],
+      "generator": "Xcode",
+      "binaryDir": "${sourceParentDir}/cmake_build/arm64-osx",
+      "installDir": "${sourceParentDir}/cmake_build/out",
+      "cacheVariables": {
+        "CMAKE_OSX_ARCHITECTURES": "arm64",
+        "onnxruntime_BUILD_SHARED_LIB": true,
+        "onnxruntime_USE_XNNPACK": false,
+        "onnxruntime_USE_COREML": true,
+        "onnxruntime_BUILD_OBJC": true,
+        "onnxruntime_BUILD_APPLE_FRAMEWORK": true,
+        "CMAKE_CONFIGURATION_TYPES": "Debug;Release"
+      },
+      "condition": {
+        "type": "equals",
+        "lhs": "${hostSystemName}",
+        "rhs": "Darwin"
+      }
+    },
     {
       "name": "x64-osx-vcpkg",
       "inherits": [
diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index b64641230f249..f582abca34706 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -1625,7 +1625,7 @@ This version of the operator has been available since version 1 of the 'com.micr
 #### Type Constraints
 
 <dl>
-<dt><tt>T</tt> : tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)</dt>
+<dt><tt>T</tt> : tensor(bool), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)</dt>
 <dd>Constrain input and output types.</dd>
 </dl>
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 91c68a1f5afb6..8d256a2088279 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -582,7 +582,7 @@ Do not modify directly.*
 | Op Name | Parameters | OpSet Version | Types Supported |
 |---------|------------|---------------|-----------------|
 |**Operator Domain:** *ai.onnx*||||
-|Abs|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Abs|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |Add|*in* A:**T**<br> *in* B:**T**<br> *out* C:**T**|14+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |||13|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
@@ -839,7 +839,7 @@ Do not modify directly.*
 |Shrink|*in* input:**T**<br> *out* output:**T**|9+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |Sigmoid|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
 |||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
-|Sign|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Sign|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |SimplifiedLayerNormalization|*in* X:**T**<br> *in* scale:**V**<br> *out* Y:**V**<br> *out* inv_std_var:**U**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)<br> **U** = tensor(double), tensor(float)<br> **V** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
 |Sin|*in* input:**T**<br> *out* output:**T**|7+|**T** = tensor(double), tensor(float), tensor(float16)|
 |Size|*in* data:**T**<br> *out* size:**T1**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br>
**T1** = tensor(int64)| diff --git a/js/react_native/android/src/main/AndroidManifest.xml b/js/react_native/android/src/main/AndroidManifest.xml index c6e3cb45e16e5..a2f47b6057db7 100644 --- a/js/react_native/android/src/main/AndroidManifest.xml +++ b/js/react_native/android/src/main/AndroidManifest.xml @@ -1,3 +1,2 @@ - + diff --git a/js/react_native/e2e/.detoxrc.js b/js/react_native/e2e/.detoxrc.js index e886a363d378b..1d49f06213e51 100644 --- a/js/react_native/e2e/.detoxrc.js +++ b/js/react_native/e2e/.detoxrc.js @@ -38,8 +38,8 @@ module.exports = { simulator: { type: 'ios.simulator', device: { - type: 'iPhone 14', - os: 'iOS 16.4', + type: 'iPhone 15', + os: 'iOS 17.4', }, }, attached: { diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index 3979af7fa1ec9..bb532e0fbae74 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -309,12 +309,12 @@ export const createSession = async ( if (context) { wasm.currentContext = context as MLContext; } else if (gpuDevice) { - wasm.currentContext = await wasm.jsepCreateMLContext!(gpuDevice); + wasm.currentContext = await wasm.webnnCreateMLContext!(gpuDevice); } else { - wasm.currentContext = await wasm.jsepCreateMLContext!({ deviceType, powerPreference }); + wasm.currentContext = await wasm.webnnCreateMLContext!({ deviceType, powerPreference }); } } else { - wasm.currentContext = await wasm.jsepCreateMLContext!(); + wasm.currentContext = await wasm.webnnCreateMLContext!(); } break; } @@ -330,7 +330,7 @@ export const createSession = async ( // clear current MLContext after session creation if (wasm.currentContext) { - wasm.jsepRegisterMLContext!(sessionHandle, wasm.currentContext); + wasm.webnnRegisterMLContext!(sessionHandle, wasm.currentContext); wasm.currentContext = undefined; wasm.shouldTransferToMLTensor = true; } @@ -454,6 +454,7 @@ export const releaseSession = (sessionId: number): void => { } wasm.jsepOnReleaseSession?.(sessionId); + wasm.webnnOnReleaseSession?.(sessionId); wasm.webgpuOnReleaseSession?.(sessionId); inputNamesUTF8Encoded.forEach((buf) => wasm._OrtFree(buf)); @@ -520,7 +521,7 @@ export const prepareInputOutputTensor = async ( const mlTensor = tensor[2].mlTensor as MLTensor; dataByteLength = calculateTensorSizeInBytes(tensorDataTypeStringToEnum(dataType), dims)!; - const registerMLTensor = wasm.jsepRegisterMLTensor; + const registerMLTensor = wasm.webnnRegisterMLTensor; if (!registerMLTensor) { throw new Error('Tensor location "ml-tensor" is not supported without using WebNN.'); } @@ -540,7 +541,7 @@ export const prepareInputOutputTensor = async ( wasm.setValue(rawData + i * ptrSize, allocWasmString(data[i], allocs), '*'); } } else { - const isGraphInput = wasm.jsepIsGraphInput; + const isGraphInput = wasm.webnnIsGraphInput; if (dataType !== 'string' && isGraphInput) { const tensorNameUTF8 = wasm._OrtGetInputName(sessionId, index); const tensorName = wasm.UTF8ToString(tensorNameUTF8); @@ -549,8 +550,8 @@ export const prepareInputOutputTensor = async ( const dataTypeEnum = tensorDataTypeStringToEnum(dataType); dataByteLength = calculateTensorSizeInBytes(dataTypeEnum, dims)!; actualLocation = 'ml-tensor'; - const createTemporaryTensor = wasm.jsepCreateTemporaryTensor; - const uploadTensor = wasm.jsepUploadTensor; + const createTemporaryTensor = wasm.webnnCreateTemporaryTensor; + const uploadTensor = wasm.webnnUploadTensor; if (!createTemporaryTensor || !uploadTensor) { throw new Error('Tensor location "ml-tensor" is not supported without using WebNN.'); } @@ 
-722,6 +723,7 @@ export const run = async ( } wasm.jsepOnRunStart?.(sessionHandle); + wasm.webnnOnRunStart?.(sessionHandle); let errorCode: number; if (!BUILD_DEFS.DISABLE_JSEP && ioBindingState) { @@ -862,8 +864,8 @@ export const run = async ( ]); } } else if (preferredLocation === 'ml-tensor' && size > 0) { - const ensureTensor = wasm.jsepEnsureTensor; - const isInt64Supported = wasm.jsepIsInt64Supported; + const ensureTensor = wasm.webnnEnsureTensor; + const isInt64Supported = wasm.webnnIsInt64Supported; if (!ensureTensor || !isInt64Supported) { throw new Error('preferredLocation "ml-tensor" is not supported without using WebNN.'); } @@ -890,9 +892,9 @@ export const run = async ( dims, { mlTensor, - download: wasm.jsepCreateMLTensorDownloader!(dataOffset, type), + download: wasm.webnnCreateMLTensorDownloader!(dataOffset, type), dispose: () => { - wasm.jsepReleaseTensorId!(dataOffset); + wasm.webnnReleaseTensorId!(dataOffset); wasm._OrtReleaseTensor(tensor); }, }, @@ -915,7 +917,7 @@ export const run = async ( if (!keepOutputTensor) { wasm._OrtReleaseTensor(tensor); } - wasm.jsepOnRunEnd?.(sessionHandle); + wasm.webnnOnRunEnd?.(sessionHandle); } } diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts index 6de54078af031..752bac28d7efb 100644 --- a/js/web/lib/wasm/wasm-types.ts +++ b/js/web/lib/wasm/wasm-types.ts @@ -156,12 +156,26 @@ export declare namespace JSEP { */ shouldTransferToMLTensor: boolean; + /** + * [exported from pre-jsep.js] Called when InferenceSession.run started. This function will be called before + * _OrtRun[WithBinding]() is called. + * @param sessionId - specify the session ID. + */ + webnnOnRunStart: (sessionId: number) => void; + /** + * [exported from pre-jsep.js] Release a session. This function will be called before _OrtReleaseSession() is + * called. + * @param sessionId - specify the session ID. + * @returns + */ + webnnOnReleaseSession: (sessionId: number) => void; + /** * [exported from pre-jsep.js] Called when InferenceSession.run finished. This function will be called after * _OrtRun[WithBinding]() is called. * @param sessionId - specify the session ID. */ - jsepOnRunEnd: (sessionId: number) => void; + webnnOnRunEnd: (sessionId: number) => void; /** * [exported from pre-jsep.js] Register MLContext for a session. @@ -169,18 +183,18 @@ export declare namespace JSEP { * @param context - specify the MLContext. * @returns */ - jsepRegisterMLContext: (sessionId: number, context: MLContext) => void; + webnnRegisterMLContext: (sessionId: number, context: MLContext) => void; /** * [exported from pre-jsep.js] Reserve a MLTensor ID attached to the current session. * @returns the MLTensor ID. */ - jsepReserveTensorId: () => number; + webnnReserveTensorId: () => number; /** * [exported from pre-jsep.js] Release an MLTensor ID from use and destroys underlying MLTensor if no longer in use. * @param tensorId - specify the MLTensor ID. * @returns */ - jsepReleaseTensorId: (tensorId: number) => void; + webnnReleaseTensorId: (tensorId: number) => void; /** * [exported from pre-jsep.js] Ensure that an MLTensor of a given type and shape exists for a MLTensor ID. * @param sessionId - specify the session ID or current active session ID if undefined. @@ -190,7 +204,7 @@ export declare namespace JSEP { * @param copyOld - specify whether to copy the old tensor if a new tensor was created. * @returns the MLTensor associated with the tensor ID. 
*/ - jsepEnsureTensor: ( + webnnEnsureTensor: ( sessionId: number | undefined, tensorId: number, dataType: DataType, @@ -203,20 +217,20 @@ export declare namespace JSEP { * @param data - specify the data to upload. It can be a TensorProto::data_type or a WebNN MLOperandDataType. * @returns */ - jsepUploadTensor: (tensorId: number, data: Uint8Array) => void; + webnnUploadTensor: (tensorId: number, data: Uint8Array) => void; /** * [exported from pre-jsep.js] Download data from an MLTensor. * @param tensorId - specify the MLTensor ID. * @returns the downloaded data. */ - jsepDownloadTensor: (tensorId: number, dstBuffer: ArrayBufferView | ArrayBuffer) => Promise; + webnnDownloadTensor: (tensorId: number, dstBuffer: ArrayBufferView | ArrayBuffer) => Promise; /** * [exported from pre-jsep.js] Creates a downloader function to download data from an MLTensor. * @param tensorId - specify the MLTensor ID. * @param type - specify the data type. * @returns the downloader function. */ - jsepCreateMLTensorDownloader: ( + webnnCreateMLTensorDownloader: ( tensorId: number, type: Tensor.MLTensorDataTypes, ) => () => Promise; @@ -228,7 +242,7 @@ export declare namespace JSEP { * @param dimensions - specify the dimensions. * @returns the MLTensor ID for the external MLTensor. */ - jsepRegisterMLTensor: ( + webnnRegisterMLTensor: ( sessionId: number, tensor: MLTensor, onnxDataType: DataType, @@ -240,7 +254,7 @@ export declare namespace JSEP { * @param optionsOrGpuDevice - specify the options or GPUDevice. * @returns */ - jsepCreateMLContext(optionsOrGpuDevice?: MLContextOptions | GPUDevice): Promise; + webnnCreateMLContext(optionsOrGpuDevice?: MLContextOptions | GPUDevice): Promise; /** * [exported from pre-jsep.js] Register a WebNN Constant operand from external data. @@ -252,7 +266,7 @@ export declare namespace JSEP { * @param shouldConvertInt64ToInt32 - specify whether to convert int64 to int32. * @returns the WebNN Constant operand for the specified external data. */ - jsepRegisterMLConstant( + webnnRegisterMLConstant( externalFilePath: string, dataOffset: number, dataLength: number, @@ -265,14 +279,14 @@ export declare namespace JSEP { * [exported from pre-jsep.js] Register a WebNN graph input. * @param inputName - specify the input name. */ - jsepRegisterGraphInput: (inputName: string) => void; + webnnRegisterGraphInput: (inputName: string) => void; /** * [exported from pre-jsep.js] Check if a graph input is a WebNN graph input. * @param sessionId - specify the session ID. * @param inputName - specify the input name. * @returns whether the input is a WebNN graph input. */ - jsepIsGraphInput: (sessionId: number, inputName: string) => boolean; + webnnIsGraphInput: (sessionId: number, inputName: string) => boolean; /** * [exported from pre-jsep.js] Create a temporary MLTensor for a session. * @param sessionId - specify the session ID. @@ -280,13 +294,13 @@ export declare namespace JSEP { * @param shape - specify the shape. * @returns the MLTensor ID for the temporary MLTensor. */ - jsepCreateTemporaryTensor: (sessionId: number, dataType: DataType, shape: readonly number[]) => Promise; + webnnCreateTemporaryTensor: (sessionId: number, dataType: DataType, shape: readonly number[]) => Promise; /** * [exported from pre-jsep.js] Check if a session's associated WebNN Context supports int64. * @param sessionId - specify the session ID. * @returns whether the WebNN Context supports int64. 
*/ - jsepIsInt64Supported: (sessionId: number) => boolean; + webnnIsInt64Supported: (sessionId: number) => boolean; } } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc index 58ddf60df79f0..52c705abb1003 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc @@ -224,12 +224,12 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const { // Shader is designed to be dispatched as Dispatch(num_heads, new_sequence_length / workgroup_size_x, 1) // Each lane/thread is responsible for a single q. shader.MainFunctionBody() << R"MAIN_FN( - let head_idx = workgroup_id.x; + let head_idx = u32(workgroup_idx / uniforms.num_seq_tile); let capped_sg_id = min(sg_id, max_k_step); let capped_sg_size = min(sg_size, max_k_step); // Load Q - let q_idx_global = workgroup_id.y * workgroup_size_x + local_idx; + let q_idx_global = (workgroup_idx % uniforms.num_seq_tile) * workgroup_size_x + local_idx; let valid_q = q_idx_global < uniforms.new_sequence_length; if (valid_q) { @@ -445,7 +445,8 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co std::string cache_hint = std::to_string(has_attention_bias) + std::to_string(parameters.head_size_) + std::to_string(parameters.num_heads_); - program.SetDispatchGroupSize(parameters.num_heads_, (parameters.sequence_length_ + tile_size - 1) / tile_size, 1) + const uint32_t num_seq_tile = (parameters.sequence_length_ + tile_size - 1) / tile_size; + program.SetDispatchGroupSize(parameters.num_heads_ * num_seq_tile) .SetWorkgroupSize(tile_size) .CacheHint(cache_hint) .AddUniformVariables({{static_cast(parameters.sequence_length_)}, @@ -454,7 +455,8 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co {static_cast(parameters.total_sequence_length_ - parameters.kv_sequence_length_)}, {static_cast(parameters.is_gqa_ ? 
1 : 0)}, {static_cast(parameters.n_reps)}, - {alpha}}); + {alpha}, + {num_seq_tile}}); return context.RunProgram(program); } diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h index 2c2b888538843..8931403641a81 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h +++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h @@ -52,7 +52,8 @@ class FlashAttentionProgram final : public Program { {"past_sequence_length", ProgramUniformVariableDataType::Uint32}, {"is_gqa", ProgramUniformVariableDataType::Uint32}, {"n_reps", ProgramUniformVariableDataType::Uint32}, - {"alpha", ProgramUniformVariableDataType::Float32}); + {"alpha", ProgramUniformVariableDataType::Float32}, + {"num_seq_tile", ProgramUniformVariableDataType::Uint32}); private: bool has_attention_bias_; diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc index 05cbfb1f99c48..a25d8e68f11cd 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc @@ -12,21 +12,12 @@ Status DP4AMatMulQuantizeProgram::GenerateShaderCode(ShaderHelper& shader) const shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); shader.AddOutput("output", ShaderUsage::UseUniform); shader.AddOutput("scales", ShaderUsage::UseUniform); - shader.AdditionalImplementation() << R"ADDNL_FN( - fn readInput(offset: u32) -> input_a_value_t - { - if (offset > uniforms.input_size) { - return input_a_value_t(0); - } - return input_a[offset]; - } - )ADDNL_FN"; shader.MainFunctionBody() << R"MAIN_FN( var local_a : array, 32>; var max_value:vec4 = vec4(0); for (var idx:u32=0;idx<32;idx+=1) { - local_a[idx] = readInput(workgroup_idx*32 + idx); + local_a[idx] = input_a[workgroup_idx*32 + idx]; max_value = max(max_value, abs(local_a[idx])); } var scale = max(max_value.x, max_value.y); @@ -147,8 +138,8 @@ Status DP4AMatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const { shader.MainFunctionBody() << R"MAIN_FN( // During the load phase we use all 256 threads to load 64 rows of A/B. // For each row we load tile_size_k_vec (2) vectorized elements, which are 32 elements of K. 
- let a_global_base = workgroup_id.x * tile_size; - let b_global_base = workgroup_id.y * tile_size; + let a_global_base = u32(workgroup_idx / uniforms.num_N_tile) * tile_size; + let b_global_base = (workgroup_idx % uniforms.num_N_tile) * tile_size; let load_AorB = u32(local_idx/128); let load_row = u32((local_idx%128)/2); let load_col = u32(local_idx%2); @@ -279,17 +270,16 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor Tensor a_scale = context.CreateGPUTensor(a->DataType(), a_scales_dims); quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kVec4Components)}}) .AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), 1}, - {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), 1}}) - .AddUniformVariable({static_cast(M * K / kVec4Components)}); + {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), 1}}); ORT_RETURN_IF_ERROR(context.RunProgram(quantize_program)); constexpr uint32_t kTileSize = 64; TensorShape reshaped_y_shape{1, M, N / kVec4Components}; + uint32_t num_M_tile = (M + kTileSize - 1) / kTileSize; + uint32_t num_N_tile = (N + kTileSize - 1) / kTileSize; DP4AMatMulNBitsProgram mul_program{block_size}; mul_program.SetWorkgroupSize(256); - mul_program.SetDispatchGroupSize( - (M + kTileSize - 1) / kTileSize, - (N + kTileSize - 1) / kTileSize, 1); + mul_program.SetDispatchGroupSize(num_M_tile * num_N_tile); mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kVec4Components)}, {&a_scale, ProgramTensorMetadataDependency::TypeAndRank, 1}, {b, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kVec2Components * kU32Components)}, @@ -298,7 +288,8 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor {static_cast(N)}, {static_cast(K)}, {static_cast(K / 8)}, - {static_cast(K / 16)}}) + {static_cast(K / 16)}, + {num_N_tile}}) .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, static_cast(kVec4Components)}) .CacheHint("Block" + std::to_string(block_size)); return context.RunProgram(mul_program); @@ -317,7 +308,7 @@ bool CanApplyDP4AMatrixMatMulNBits(onnxruntime::webgpu::ComputeContext& context, bool use_dp4a = context.Device().HasFeature(wgpu::FeatureName::Subgroups) && context.AdapterInfo().backendType != wgpu::BackendType::Metal; return (accuracy_level == 4 && block_size % 32 == 0 && - batch_count == 1 && components_k == 4 && K % 64 == 0 && N % 16 == 0 && + batch_count == 1 && components_k == 4 && K % 128 == 0 && N % 16 == 0 && !has_zero_points && use_dp4a); } diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h index 15b86d78301ad..f0157ca3e8c97 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h @@ -16,7 +16,6 @@ class DP4AMatMulQuantizeProgram final : public Program { @@ -28,7 +27,8 @@ class DP4AMatMulNBitsProgram final : public Program { {"N", ProgramUniformVariableDataType::Uint32}, {"K", ProgramUniformVariableDataType::Uint32}, {"K8", ProgramUniformVariableDataType::Uint32}, - {"K16", ProgramUniformVariableDataType::Uint32}); + {"K16", ProgramUniformVariableDataType::Uint32}, + {"num_N_tile", ProgramUniformVariableDataType::Uint32}); private: uint32_t block_size_; diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc 
index e45787299f3ad..7b4a45ce8aa0f 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -3361,7 +3361,8 @@ void RegisterContribSchemas() { OpSchema::NonDifferentiable) .TypeConstraint( "T", - {"tensor(int8)", + {"tensor(bool)", + "tensor(int8)", "tensor(int16)", "tensor(int32)", "tensor(int64)", diff --git a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc index 4481a5172966b..3fffc6d0a68c4 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc @@ -97,7 +97,6 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const logging::Logger& logger) const { const auto& op_type(node.OpType()); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.activation @@ -166,9 +165,7 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, model_builder.AddOperation(std::move(op)); - } else -#endif // (COREML_ENABLE_MLPROGRAM) - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); if (op_type == "Sigmoid") { diff --git a/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc index 6169090a36014..dfa01c8187741 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc @@ -32,7 +32,6 @@ Status ArgMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const int64_t keepdims = helper.Get("keepdims", 1); const bool removedim = keepdims != 1; -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.reduction @@ -46,9 +45,7 @@ Status ArgMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // the output of ArgMax must be int32 AddOperationOutput(*op, *node.OutputDefs()[0], output_datatype); model_builder.AddOperation(std::move(op)); - } else -#endif // (COREML_ENABLE_MLPROGRAM) - { + } else { auto* coreml_argmax = layer->mutable_argmax(); coreml_argmax->set_axis(axis); coreml_argmax->set_removedim(removedim); @@ -91,11 +88,9 @@ bool ArgMaxOpBuilder::IsOpSupportedImpl(const Node& node, return false; } -#if defined(COREML_ENABLE_MLPROGRAM) if (input_params.create_mlprogram) { return true; } -#endif // If there are multiple downstream nodes and cast (toint32) is one of them // not supported, exit here diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc index 2817f34bc64f2..9e7fcd788664c 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc @@ -6,6 +6,7 @@ #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" #include "core/providers/coreml/builders/model_builder.h" +#include 
"core/providers/coreml/model/host_utils.h" #include "core/providers/shared/utils/utils.h" using namespace CoreML::Specification; @@ -113,10 +114,12 @@ bool BaseOpBuilder::IsInputDtypeSupport(const Node& node, size_t idx, return true; } +#if CAN_BUILD_COREML6_OR_LATER // only MLProgram support FP16 if (input_params.create_mlprogram && input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) { return true; } +#endif LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not currently supported"; return false; diff --git a/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc index 442194cb31cbc..e547f2e42e527 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc @@ -57,7 +57,6 @@ Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_bu const auto eps = helper.Get("epsilon", 1e-5f); const auto channels = scale_tensor.dims()[0]; -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.normalization.batch_norm @@ -78,9 +77,7 @@ Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_bu AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); - } else -#endif // (COREML_ENABLE_MLPROGRAM) - { + } else { auto* coreml_batch_norm = layer->mutable_batchnorm(); coreml_batch_norm->set_channels(channels); coreml_batch_norm->set_epsilon(eps); diff --git a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc index 0482620b269a4..d7c78e05362ed 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc @@ -56,7 +56,6 @@ bool CheckIfBothInputShapesMatch(const Node& node, const logging::Logger& logger } } // namespace -#if defined(COREML_ENABLE_MLPROGRAM) static std::vector InferOutputShape(const std::vector& a, const std::vector& b) { std::vector output_shape; int64_t i_a = 0, j_b = 0; @@ -112,14 +111,12 @@ static void AddVariadicInputs(std::unique_ptr layer = model_builder.CreateNNLayer(node); if (op_type == "Add") { diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc index 6f9bb35c27d80..684653aa21273 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc @@ -150,7 +150,6 @@ void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span data); -#if defined(COREML_ENABLE_MLPROGRAM) // // MLProgram utils // @@ -174,6 +173,5 @@ void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& outp /// Number of spatial dims in input. Generally rank - 2 (ignore N and C dims). 
void AddPadTypeAndPads(COREML_SPEC::MILSpec::Operation& op, ModelBuilder& model_builder, std::string_view op_type, const NodeAttrHelper& helper, int num_spatial_dims); -#endif // defined(COREML_ENABLE_MLPROGRAM) } // namespace coreml } // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc index 7c7363d4c81ad..8abee92451338 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc @@ -27,9 +27,8 @@ class CastOpBuilder : public BaseOpBuilder { Status CastOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder, [[maybe_unused]] const Node& node, [[maybe_unused]] const logging::Logger& logger) const { -// This is a special handling case for ArgMax Op, where argmax is followed by a cast to int32 type. -// The ArgMax is fused with the Cast node and produces an int32 output. -#if defined(COREML_ENABLE_MLPROGRAM) + // This is a special handling case for ArgMax Op, where argmax is followed by a cast to int32 type. + // The ArgMax is fused with the Cast node and produces an int32 output. if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary.cast @@ -73,7 +72,6 @@ Status CastOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model AddOperationOutput(*op, *node.OutputDefs()[0], cast_to_type); model_builder.AddOperation(std::move(op)); } -#endif return Status::OK(); } @@ -134,7 +132,6 @@ bool CastOpBuilder::HasSupportedInputsImpl(const Node& node, [[maybe_unused]] co return false; } -#if defined(COREML_ENABLE_MLPROGRAM) if (input_params.create_mlprogram) { if ((input_type == ONNX_NAMESPACE::TensorProto_DataType_INT32 || input_type == ONNX_NAMESPACE::TensorProto_DataType_INT64 || @@ -152,7 +149,6 @@ bool CastOpBuilder::HasSupportedInputsImpl(const Node& node, [[maybe_unused]] co return false; } } -#endif // only support int64 coming from ArgMax (check for ArgMax is done in IsOpSupportedImpl()) if (input_type != ONNX_NAMESPACE::TensorProto_DataType_INT64) { diff --git a/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc index f7046c213a8cb..9e68070a0e693 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc @@ -64,7 +64,6 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, bool has_min = min != std::numeric_limits::lowest(); bool has_max = max != std::numeric_limits::max(); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -121,9 +120,7 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, AddOperationOutput(*op, output); model_builder.AddOperation(std::move(op)); - } else -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { // TODO: CoreML has a Clip layer for NeuralNetwork. Added in CoreML 4. We could potentially use that if available // to simplify. 
// https://apple.github.io/coremltools/mlmodel/Format/NeuralNetwork.html#cliplayerparams diff --git a/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc index 9ea0030290abd..34ce2438095ad 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc @@ -26,7 +26,6 @@ class ConcatOpBuilder : public BaseOpBuilder { Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // NOLINT @@ -45,7 +44,6 @@ Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); } else // NOLINT -#endif // defined(COREML_ENABLE_MLPROGRAM) { std::unique_ptr layer = model_builder.CreateNNLayer(node); diff --git a/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc index 38125957bf481..18823bcc78d19 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc @@ -52,7 +52,6 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N NodeAttrHelper helper(node); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -89,9 +88,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N AddOperationOutput(*conv_op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(conv_op)); - } else -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); auto strides = helper.Get("strides", std::vector{1, 1}); @@ -225,14 +222,11 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara const auto& weight_name = input_defs[1]->Name(); const auto* weight = input_params.graph_viewer.GetConstantInitializer(weight_name); -#if defined(COREML_ENABLE_MLPROGRAM) if (input_params.create_mlprogram) { // ML Program supports non-const weight, 1D, 2D and 3D. // keep to 1D and 2D for consistency with the NeuralNetwork implementation for now. // add 3D support as/when needed. - } else -#endif // defined (COREML_ENABLE_MLPROGRAM) - { + } else { if (!weight) { LOGS(logger, VERBOSE) << "The weight of Conv [" << name << "] must be a constant initializer"; return false; @@ -257,7 +251,6 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara NodeAttrHelper helper(node); -#if defined(COREML_ENABLE_MLPROGRAM) // spec says same_lower is supported in CoreML 5. it lies. CoreML 6 is required otherwise you get // `Unexpected value for parameter pad_type[0] "same_lower" not in ("custom", "same", "valid").` // We _could_ manually calculate the pads, but not implementing that until we have a real use case to justify @@ -269,7 +262,6 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara return false; } } -#endif // there's no equivalent to allow a manual kernel shape in CoreML. // it's OK if a specified kernel_shape matches kH and kW dims of the weight input. 
diff --git a/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc index 5b6d9d72ab3c9..2e2c898b0e10a 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc @@ -28,7 +28,6 @@ class ConvTransposeOpBuilder : public BaseOpBuilder { Status ConvTransposeOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder, [[maybe_unused]] const Node& node, const logging::Logger& /*logger*/) const { -#if defined(COREML_ENABLE_MLPROGRAM) using namespace CoreML::Specification::MILSpec; // NOLINT const auto input_defs = node.InputDefs(); const auto output_defs = node.OutputDefs(); @@ -80,7 +79,6 @@ Status ConvTransposeOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuild AddOperationOutput(*op, *output_defs[0]); model_builder.AddOperation(std::move(op)); -#endif // defined(COREML_ENABLE_MLPROGRAM) return Status::OK(); } diff --git a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc index fec14dfd093a0..1a74b1eea97fe 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc @@ -33,7 +33,6 @@ Status DepthToSpaceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); int64_t blocksize = *helper.GetInt64("blocksize"); // required attribute -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // NOLINT @@ -105,7 +104,6 @@ Status DepthToSpaceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, model_builder.AddOperation(std::move(reshape2)); } } else // NOLINT -#endif // if defined(COREML_ENABLE_MLPROGRAM) { const auto& output_name = output_defs[0]->Name(); std::unique_ptr layer = model_builder.CreateNNLayer(node); diff --git a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc index e685c09ef43ca..4f84f7c36259c 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc @@ -33,7 +33,6 @@ void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod const auto& input_defs(node.InputDefs()); const bool is_gemm = op == "Gemm"; -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { // we have to transpose the weight input of Gemm if transB is false, and potentially override the bias shape if (is_gemm) { @@ -58,9 +57,7 @@ void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod } } } - } else -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { // We have already embedded the weights (matrix B and C(if any)) into the coreml layer // No need to copy them later to reduce memory consumption model_builder.AddInitializerToSkip(input_defs[1]->Name()); @@ -123,7 +120,6 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N const auto K = transB ? b1 : b0; const auto N = transB ? b0 : b1; // we already checked it and dtype must be existed. 
-#if defined(COREML_ENABLE_MLPROGRAM) auto input_dtype = a.TypeAsProto()->tensor_type().elem_type(); if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -207,9 +203,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N AddOperationOutput(*matmul_op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(matmul_op)); } - } else -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { auto* coreml_inner_product = layer->mutable_innerproduct(); *layer->mutable_input()->Add() = a.Name(); diff --git a/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc index 6dcf14c16f111..f558f423752e8 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc @@ -42,7 +42,6 @@ class GridSampleOpBuilder : public BaseOpBuilder { Status GridSampleOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder, [[maybe_unused]] const Node& node, [[maybe_unused]] const logging::Logger& logger) const { -#if defined(COREML_ENABLE_MLPROGRAM) using namespace CoreML::Specification::MILSpec; // NOLINT // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing.resample @@ -80,7 +79,6 @@ Status GridSampleOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& AddOperationOutput(*op, *output_defs[0]); model_builder.AddOperation(std::move(op)); -#endif return Status::OK(); } diff --git a/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc index b4dc8d1647ad0..c0db144602ee2 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc @@ -49,7 +49,6 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl( if (node.OpType() == "GroupNormalization") { return AddGroupNormToModelBuilderImpl(model_builder, node, logger); } -#if defined(COREML_ENABLE_MLPROGRAM) const auto& input_defs = node.InputDefs(); NodeAttrHelper helper(node); const auto& scale_tensor = *model_builder.GetConstantInitializer(input_defs[1]->Name()); @@ -94,7 +93,6 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl( AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); } -#endif // (COREML_ENABLE_MLPROGRAM) return Status::OK(); } @@ -103,7 +101,6 @@ Status NormalizationOpBuilder::AddGroupNormToModelBuilderImpl( [[maybe_unused]] ModelBuilder& model_builder, [[maybe_unused]] const Node& node, [[maybe_unused]] const logging::Logger& logger) const { -#if defined(COREML_ENABLE_MLPROGRAM) const auto& input_defs = node.InputDefs(); NodeAttrHelper helper(node); // Coreml hasn't supported GroupNorm yet. 
@@ -184,7 +181,6 @@ Status NormalizationOpBuilder::AddGroupNormToModelBuilderImpl( model_builder.AddOperation(std::move(mul)); model_builder.AddOperation(std::move(add)); } -#endif // (COREML_ENABLE_MLPROGRAM) return Status::OK(); } diff --git a/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc index 17910ba6fd486..e43eef75007cc 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc @@ -29,7 +29,6 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const auto& op_type = node.OpType(); const auto& input_defs = node.InputDefs(); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -91,9 +90,7 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); - } else -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); auto* coreml_pool = layer->mutable_pooling(); diff --git a/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc index d533b867bd454..a4609eb2a0584 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc @@ -71,7 +71,6 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, co const bool keepdims = helper.Get("keepdims", 1) != 0; const bool noop_with_empty_axes = helper.Get("noop_with_empty_axes", 0) != 0; -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -103,9 +102,7 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, co AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); - } else -#endif // (COREML_ENABLE_MLPROGRAM) - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); if (op_type == "ReduceSum") { diff --git a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc index 27d24d9c21893..b35d6971623ed 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc @@ -50,7 +50,6 @@ Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // ReshapeHelper applies the ONNX rules to create the concrete output shape ReshapeHelper helper(TensorShape(input_shape), new_shape); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -64,9 +63,7 @@ Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, AddOperationOutput(*reshape_op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(reshape_op)); - } else -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); *layer->mutable_reshapestatic()->mutable_targetshape() = {new_shape.cbegin(), new_shape.cend()}; diff --git a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc 
b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc index 7ff66e4a79e37..837573003e515 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc @@ -212,7 +212,6 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const num_sizes = output_sizes.size(); } -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // NOLINT @@ -279,9 +278,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const AddOperationOutput(*op, *output_defs[0]); model_builder.AddOperation(std::move(op)); - } else // NOLINT -#endif - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); auto* coreml_upsample = layer->mutable_upsample(); diff --git a/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc index 243f949bdd48e..d1c87b033d323 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc @@ -25,7 +25,6 @@ Status ShapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const const logging::Logger& /*logger*/) const { const auto& input_defs = node.InputDefs(); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; NodeAttrHelper node_attr_helper{node}; @@ -63,9 +62,7 @@ Status ShapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const AddOperationOutput(*op, *node.OutputDefs()[0], output_datatype); model_builder.AddOperation(std::move(op)); } - } else // NOLINT -#endif - { + } else { auto layer = model_builder.CreateNNLayer(node); layer->mutable_getshape(); *layer->mutable_input()->Add() = input_defs[0]->Name(); diff --git a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc index 6b3fe75fa592d..368e47e40f831 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc @@ -127,7 +127,6 @@ Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const SliceOp::PrepareForComputeMetadata compute_metadata{data_shape}; ORT_RETURN_IF_ERROR(PrepareSliceComputeMetadata(node, model_builder.GetGraphViewer(), compute_metadata)); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // NOLINT // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.slice_by_index @@ -178,9 +177,7 @@ Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const model_builder.AddOperation(std::move(op)); - } else // NOLINT -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { auto layer = model_builder.CreateNNLayer(node); *layer->mutable_input()->Add() = input_defs[0]->Name(); *layer->mutable_output()->Add() = output_defs[0]->Name(); @@ -222,7 +219,6 @@ bool SliceOpBuilder::HasSupportedInputsImpl(const Node& node, return false; } -#ifdef COREML_ENABLE_MLPROGRAM // The 
[Doc](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.slice_by_index) // says ML Program slice_by_index supports fp16 in CoreML 5 (iOS 15). // It's incorrect and CoreML 6+ (iOS16, CoreML spec version >= 7) is required otherwise only float is supported. @@ -230,13 +226,11 @@ bool SliceOpBuilder::HasSupportedInputsImpl(const Node& node, // CoreML 6:https://github.com/apple/coremltools/blob/c3ea4cf56fef1176417246c1b85363417f3e713d/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py#L495 if (input_params.create_mlprogram && input_params.coreml_version >= 6 && input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) { - } else -#endif // nolint - if (input_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && - input_type != ONNX_NAMESPACE::TensorProto_DataType_INT64) { - LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not supported"; - return false; - } + } else if (input_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && + input_type != ONNX_NAMESPACE::TensorProto_DataType_INT64) { + LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not supported"; + return false; + } return true; } diff --git a/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc index c6e331feed326..2411cd459fecd 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc @@ -37,7 +37,6 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const auto axis = helper.Get("axis", axis_default_value); auto axis_nonnegative = HandleNegativeAxis(axis, data_shape.size()); -#if defined(COREML_ENABLE_MLPROGRAM) // CoreML's softmax match onnx's softmax behavior since opset 13. // For opset < 13, we need to reshape to 2D and set axis to -1 to simulate onnx softmax behavior. 
// [B,D,...](onnx softmax opset 12, axis=1)->[B,D*...](CoreML softmax, axis=-1)->[B,D,...](reshape back) @@ -78,9 +77,7 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, AddOperationOutput(*reshape2, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(reshape2)); } - } else // NOLINT -#endif - { + } else { if (node.SinceVersion() >= 13 || (data_shape.size() == 2)) { auto* coreml_softmaxnd = layer->mutable_softmaxnd(); coreml_softmaxnd->set_axis(axis); diff --git a/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc index 6372f3136123b..717d344982473 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc @@ -56,7 +56,6 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, return std::make_tuple(remainder, chunk_size); }; -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; std::unique_ptr split_op = model_builder.CreateOperation(node, "split"); @@ -95,9 +94,7 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, } model_builder.AddOperation(std::move(split_op)); - } else -#endif - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); auto* coreml_splitnd = layer->mutable_splitnd(); coreml_splitnd->set_axis(axis); diff --git a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc index a1b3a18265c70..81bef11906b74 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc @@ -58,7 +58,6 @@ void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const } } -#if defined(COREML_ENABLE_MLPROGRAM) void HandleX86ArchUnsqueezeScalarInput(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) { const auto& input_defs(node.InputDefs()); @@ -74,7 +73,6 @@ void HandleX86ArchUnsqueezeScalarInput(ModelBuilder& model_builder, AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); } -#endif Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, @@ -83,7 +81,7 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, auto* coreml_squeeze = layer->mutable_squeeze(); TensorShapeVector axes; GetAxes(model_builder, node, axes); -#if defined(COREML_ENABLE_MLPROGRAM) + const auto& input_defs(node.InputDefs()); if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -105,9 +103,7 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, } AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); - } else // NOLINT -#endif - { + } else { if (axes.empty()) { coreml_squeeze->set_squeezeall(true); } else { diff --git a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc index 831c4cf4d08ba..5bb7e4c11967a 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc @@ -34,7 +34,6 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, 
ORT_RETURN_IF_NOT(perm.size() == input_dims, "Perm and input should have same dimension"); } -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -44,9 +43,7 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); - } else -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); *layer->mutable_transpose()->mutable_axes() = {perm.cbegin(), perm.cend()}; diff --git a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc index bc3cad004aec1..dd495894ab8bb 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc @@ -25,7 +25,6 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const const auto& op_type(node.OpType()); const auto& input_defs(node.InputDefs()); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -58,9 +57,7 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); - } else // NOLINT -#endif // defined (COREML_ENABLE_MLPROGRAM) - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); if (op_type == "Sqrt") { diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc index f8952301d59a9..3551f5759201e 100644 --- a/onnxruntime/core/providers/coreml/builders/model_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc @@ -17,20 +17,17 @@ #include "core/providers/coreml/shape_utils.h" #include "core/optimizer/initializer.h" -#if defined(COREML_ENABLE_MLPROGRAM) // includes from coremltools-src in _deps #include "modelpackage/src/ModelPackage.hpp" #include "mlmodel/src/MILBlob/Blob/StorageWriter.hpp" using MILBlob::Blob::StorageWriter; -#endif - using namespace CoreML::Specification; namespace onnxruntime { namespace coreml { namespace { -#if defined(COREML_ENABLE_MLPROGRAM) + // Should the initializer be written to file or kept as an immediate value bool ShouldWriteInitializerToWeightsFile(const ONNX_NAMESPACE::TensorProto& tensor_proto) { // https://github.com/apple/coremltools/blob/dbb0094fd0cb936469e35320bf37e866ef7a1da4/coremltools/converters/mil/backend/mil/load.py#L51-L57 @@ -388,8 +385,6 @@ void CreateEmptyFile(const std::string& filename) { ORT_ENFORCE(file.is_open(), "Failed to open file ", filename); } -#endif // defined(COREML_ENABLE_MLPROGRAM) - std::string GetModelOutputPath(const CoreMLOptions& coreml_options, const GraphViewer& graph_viewer, const logging::Logger& logger) { @@ -479,7 +474,6 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge } if (create_ml_program_) { -#if defined(COREML_ENABLE_MLPROGRAM) coreml_model_->set_specificationversion(CoreMLSpecVersion()); MILSpec::Program& mlprogram = *coreml_model_->mutable_mlprogram(); mlprogram.set_version(1); @@ -503,12 +497,6 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge "CoreML Model Weights"); auto weights_info = mlpackage_->findItem(weights_id); weights_file_writer_ = 
std::make_unique(weights_info->path() + "/weight.bin"); -#else - // should never happen due to handling in coreml_execution_provider.cc - // throw here so all other code in this class can assume create_ml_program_ is only ever true in a build - // where ML Program support is enabled. - ORT_THROW("ML Program is not enabled in this build"); -#endif } else { // We support CorelML Specification Version 4 (Core ML 3) coreml_model_->set_specificationversion(4); @@ -561,7 +549,6 @@ void ModelBuilder::AddLayer(std::unique_ptr layer) { /* * ML Program related helpers */ -#if defined(COREML_ENABLE_MLPROGRAM) const std::string& ModelBuilder::GetSafeName(const std::string& name) { // Check the name is valid according to the MILSpec rules // `Identifiers, generally used for names and keys, must match the regular expression [A-Za-z\_][A-Za-z0-9\_@]*.` @@ -737,8 +724,6 @@ std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::st return AddTensorValueAsConstantOperation(op_type, value_type, std::move(input_value)); } -#endif // defined(COREML_ENABLE_MLPROGRAM) - /* * General implementation */ @@ -775,13 +760,10 @@ Status ModelBuilder::RegisterInitializers() { continue; } -#if defined(COREML_ENABLE_MLPROGRAM) if (create_ml_program_) { MILSpec::Value coreml_tensor = OnnxTensorToCoreMLTensor(tensor, *weights_file_writer_); ORT_IGNORE_RETURN_VALUE(AddConstantOperation(name, std::move(coreml_tensor))); - } else -#endif - { + } else { std::unique_ptr layer = std::make_unique(); layer->set_name(GetUniqueName("initializer_" + name)); @@ -915,7 +897,6 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i return Status::OK(); } -#if defined(COREML_ENABLE_MLPROGRAM) if (create_ml_program_) { if (is_input) { // the model inputs need to be wired up as args to the 'main' function. @@ -935,7 +916,6 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i *mlprogram_main_block_->mutable_outputs()->Add() = name; } } -#endif // defined(COREML_ENABLE_MLPROGRAM) return Status::OK(); } @@ -980,11 +960,9 @@ Status ModelBuilder::CreateModel() { ORT_RETURN_IF_ERROR(ProcessNodes()); ORT_RETURN_IF_ERROR(RegisterModelOutputs()); -#if defined(COREML_ENABLE_MLPROGRAM) if (create_ml_program_) { SanitizeNames(); } -#endif return Status::OK(); } @@ -992,7 +970,6 @@ Status ModelBuilder::CreateModel() { Status ModelBuilder::SaveModel() { std::string output_path = model_output_path_; -#if defined(COREML_ENABLE_MLPROGRAM) if (create_ml_program_) { // we need to jump through some hoops to get the model path the ML Program load wants. std::string tmp_model_path = model_output_path_ + "/tmp/model.mlmodel"; @@ -1003,7 +980,6 @@ Status ModelBuilder::SaveModel() { auto model_info = mlpackage_->findItem(model_id); output_path = model_info->path(); } -#endif // scope this so the stream is closed and flushed by the ofstream dtor { @@ -1012,19 +988,16 @@ Status ModelBuilder::SaveModel() { ORT_RETURN_IF_NOT(coreml_model_->SerializeToOstream(&stream), "Saving the CoreML model failed. Path=", output_path); } -#if defined(COREML_ENABLE_MLPROGRAM) // need to delete the ModelPackage instance for it to write out the manifest. clear out the other ML Program // related types as well. 
mlprogram_main_block_ = nullptr; mlpackage_.reset(); weights_file_writer_.reset(); -#endif return Status::OK(); } Status ModelBuilder::LoadModel(std::unique_ptr& model) { -#if defined(COREML_ENABLE_MLPROGRAM) if (create_ml_program_) { // we need to provide the sanitized names for model inputs/outputs so that info is captured. // the input/output matching when we execute the model from the CoreML EP is based on order, so the change @@ -1058,9 +1031,7 @@ Status ModelBuilder::LoadModel(std::unique_ptr& model) { std::move(scalar_outputs_), std::move(int64_outputs_), logger_, coreml_options_); - } else -#endif - { + } else { model = std::make_unique(model_output_path_, std::move(onnx_input_names_), std::move(onnx_output_names_), @@ -1073,7 +1044,6 @@ Status ModelBuilder::LoadModel(std::unique_ptr& model) { return model->LoadModel(); // load using CoreML API, including compilation } -#if defined(COREML_ENABLE_MLPROGRAM) std::string_view ModelBuilder::AddConstant(std::string_view op_type, std::string_view value_type, const ONNX_NAMESPACE::TensorProto& tensor, std::optional> shape) { @@ -1114,7 +1084,6 @@ std::string_view ModelBuilder::AddConstant(std::string_view op_type, std::string return ret; } -#endif // static Status ModelBuilder::Build(const GraphViewer& graph_viewer, const logging::Logger& logger, int32_t coreml_version, const CoreMLOptions& coreml_options, diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h index 28c7dc42da581..f3012e8137e8c 100644 --- a/onnxruntime/core/providers/coreml/builders/model_builder.h +++ b/onnxruntime/core/providers/coreml/builders/model_builder.h @@ -9,7 +9,6 @@ #include "core/providers/coreml/model/model.h" #include "core/providers/coreml/coreml_options.h" -#if defined(COREML_ENABLE_MLPROGRAM) // coremltools classes namespace MPL { class ModelPackage; @@ -20,7 +19,6 @@ namespace Blob { class StorageWriter; } } // namespace MILBlob -#endif namespace onnxruntime { namespace coreml { @@ -58,11 +56,7 @@ class ModelBuilder { // Returns true if we are creating an ML Program bool CreateMLProgram() const { -#if defined(COREML_ENABLE_MLPROGRAM) return create_ml_program_; -#else - return false; -#endif } /* @@ -76,7 +70,6 @@ class ModelBuilder { // Add layer to the Core ML NeuralNetwork model void AddLayer(std::unique_ptr layer); -#if defined(COREML_ENABLE_MLPROGRAM) /* * MLProgram helpers */ @@ -147,7 +140,6 @@ class ModelBuilder { // add the operation to the main function void AddOperation(std::unique_ptr operation); -#endif /* * General helpers @@ -176,7 +168,6 @@ class ModelBuilder { const logging::Logger& Logger() const { return logger_; } private: -#if defined(COREML_ENABLE_MLPROGRAM) template std::string_view AddConstantImpl(std::string_view op_type, std::string_view value_type, gsl::span value, std::optional> shape = std::nullopt); @@ -190,7 +181,6 @@ class ModelBuilder { const std::string& AddConstantOperation(std::string_view name, COREML_SPEC::MILSpec::Value&& initializer); const std::string& AddTensorValueAsConstantOperation(std::string_view op_type, std::string_view value_type, COREML_SPEC::MILSpec::Value&& input_value); -#endif // Convert the ONNX model in graph_viewer_ to a CoreML::Specification::Model and serialize to disk. // We then load it using CoreML in order compile it. @@ -237,7 +227,6 @@ class ModelBuilder { uint32_t name_token_{0}; std::unordered_set unique_names_; -#if defined(COREML_ENABLE_MLPROGRAM) // mlprogram_main_ is the main block of the CoreML ML Program. 
// It is set in CreateModel to the CoreML Model.mlprogram.functions['main'].block_specializations['CoreML'] // entry we create. @@ -254,7 +243,6 @@ class ModelBuilder { // This means an op builder author doesn't need to be aware of the renaming. // https://github.com/apple/coremltools/blob/8b37641f243b1a3e81452feea311c6e30dcc9287/coremltools/converters/mil/mil/passes/defs/preprocess.py#L146-L149 std::unordered_map values_to_rename_; -#endif }; } // namespace coreml diff --git a/onnxruntime/core/providers/coreml/coreml_options.cc b/onnxruntime/core/providers/coreml/coreml_options.cc index 14ae55de9266b..c441a2eff56e0 100644 --- a/onnxruntime/core/providers/coreml/coreml_options.cc +++ b/onnxruntime/core/providers/coreml/coreml_options.cc @@ -15,18 +15,6 @@ CoreMLOptions::CoreMLOptions(uint32_t coreml_flags) { create_mlprogram_ = (coreml_flags & COREML_FLAG_CREATE_MLPROGRAM) != 0; enable_on_subgraph_ = (coreml_flags & COREML_FLAG_ENABLE_ON_SUBGRAPH) != 0; -#if defined(COREML_ENABLE_MLPROGRAM) - if (coreml::util::CoreMLVersion() < MINIMUM_COREML_MLPROGRAM_VERSION && create_mlprogram_ != 0) { - LOGS_DEFAULT(WARNING) << "ML Program is not supported on this OS version. Falling back to NeuralNetwork."; - create_mlprogram_ = false; - } -#else - if (create_mlprogram_ != 0) { - LOGS_DEFAULT(WARNING) << "ML Program is not supported in this build. Falling back to NeuralNetwork."; - create_mlprogram_ = false; - } -#endif - compute_units_ = 0; // 0 for all if (coreml_flags & COREML_FLAG_USE_CPU_ONLY) { diff --git a/onnxruntime/core/providers/coreml/model/host_utils.h b/onnxruntime/core/providers/coreml/model/host_utils.h index 145c64e5320d3..f654b4d5701b9 100644 --- a/onnxruntime/core/providers/coreml/model/host_utils.h +++ b/onnxruntime/core/providers/coreml/model/host_utils.h @@ -43,7 +43,13 @@ #define API_AVAILABLE_COREML7 API_AVAILABLE(macos(14), ios(17)) #define API_AVAILABLE_COREML8 API_AVAILABLE(macos(15), ios(18)) -// @available is used in implementation code +// The previous macros are used in header files to declare the availability of the APIs. +// The following macros are used in build time checks to determine if the APIs are available. +#define CAN_BUILD_COREML8_OR_LATER (__MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 && __IPHONE_OS_VERSION_MAX_ALLOWED >= 180000) +#define CAN_BUILD_COREML7_OR_LATER (__MAC_OS_X_VERSION_MAX_ALLOWED >= 140000 && __IPHONE_OS_VERSION_MAX_ALLOWED >= 170000) +#define CAN_BUILD_COREML6_OR_LATER (__MAC_OS_X_VERSION_MAX_ALLOWED >= 130000 && __IPHONE_OS_VERSION_MAX_ALLOWED >= 160000) + +// @available is used in implementation code to check the availability of the APIs at runtime. 
// Base required OS to run CoreML Specification Version 4 (Core ML 3) #define HAS_COREML3_OR_LATER @available(macOS 10.15, iOS 13, *) #define HAS_COREML4_OR_LATER @available(macOS 11, iOS 14, *) @@ -54,8 +60,7 @@ #endif -#define MINIMUM_COREML_VERSION 3 // first version we support -#define MINIMUM_COREML_MLPROGRAM_VERSION 5 // first version where ML Program was available +#define MINIMUM_COREML_VERSION 5 // first version we support namespace onnxruntime { namespace coreml { diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm index 5211b89ec17c6..71664021ea2fb 100644 --- a/onnxruntime/core/providers/coreml/model/model.mm +++ b/onnxruntime/core/providers/coreml/model/model.mm @@ -363,13 +363,12 @@ void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) { #endif } -#if __has_include() +#if __has_include() && CAN_BUILD_COREML8_OR_LATER #define HAS_COREMLOPTIMIZATIONHINT 1 #else #define HAS_COREMLOPTIMIZATIONHINT 0 #endif -API_AVAILABLE_COREML8 void ConfigureOptimizationHints(MLModelConfiguration* config, const CoreMLOptions& coreml_options) { #if HAS_COREMLOPTIMIZATIONHINT MLOptimizationHints* optimizationHints = [[MLOptimizationHints alloc] init]; diff --git a/onnxruntime/core/providers/cpu/tensor/cast_op.cc b/onnxruntime/core/providers/cpu/tensor/cast_op.cc index 35f3b12aeba35..639a49cb43a4f 100644 --- a/onnxruntime/core/providers/cpu/tensor/cast_op.cc +++ b/onnxruntime/core/providers/cpu/tensor/cast_op.cc @@ -254,11 +254,32 @@ struct TensorCasterNoSat { // tensor MLFloat16 -> float template <> struct TensorCaster { - void Cast(const OpKernelContext&, const TensorShape& shape, const Tensor& in, Tensor& out) const { + void Cast(const OpKernelContext& ctx, const TensorShape& shape, const Tensor& in, Tensor& out) const { auto out_data = out.MutableData(); auto in_data = in.Data(); const size_t shape_size = narrow(shape.Size()); - MlasConvertHalfToFloatBuffer(in_data, out_data, shape_size); + + // Check if the tensor is long enough to use threads + if (shape_size <= 128000) { + MlasConvertHalfToFloatBuffer(in_data, out_data, shape_size); + return; + } + // Calculate the number of compute cycles per implementation + auto cpu_info = CPUIDInfo::GetCPUIDInfo(); + double num_compute_cycles; + if (cpu_info.HasSSE3()) { + num_compute_cycles = static_cast(shape_size >> 1); + } else if (cpu_info.HasAVX2()) { + num_compute_cycles = static_cast(shape_size >> 2); + } else { + num_compute_cycles = static_cast(shape_size * 10); + } + + concurrency::ThreadPool::TryParallelFor(ctx.GetOperatorThreadPool(), shape_size, + {shape_size * 2.f, shape_size * 4.f, num_compute_cycles}, + [in_data, out_data](std::ptrdiff_t first_span, std::ptrdiff_t last_span) { + MlasConvertHalfToFloatBuffer(in_data + first_span, out_data + first_span, static_cast(last_span - first_span)); + }); } }; diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh index 55935a9eae86d..2d2551a156099 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh @@ -441,6 +441,9 @@ __device__ __inline__ T _Sign(T a) { return _Signum(a, std::is_signed()); } template <> __device__ __inline__ half _Sign(half a) { return _Signum(a, std::true_type()); } +template <> +__device__ __inline__ BFloat16 _Sign(BFloat16 a) { return _Signum(static_cast(a), std::true_type()); } + template __device__ __inline__ T _Normcdf(T a); diff --git
a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 54fb4429c0536..886fddd8f8a27 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -1013,6 +1013,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, Abs); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, Abs); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, Abs); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, BFloat16, Abs); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int8_t, Neg); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int16_t, Neg); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int32_t, Neg); @@ -1188,6 +1189,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, Sign); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, Sign); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, Sign); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, BFloat16, Sign); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 13, BFloat16, Add); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 13, BFloat16, Sub); @@ -1996,6 +1998,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2169,6 +2172,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc index fb03b4326c4e8..86a1b0f5b6102 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc @@ -213,19 +213,19 @@ Status IsNaN::ComputeInternal(OpKernelContext* context) const { UNARY_OP_TYPED(name, ver, float) \ UNARY_OP_TYPED(name, ver, double) -#define UNARY_OP_CSILHFD(name, ver) \ +#define UNARY_OP_CSILHFDX(name, ver) \ UNARY_OP_TYPED(name, ver, int8_t) \ UNARY_OP_TYPED(name, ver, int16_t) \ UNARY_OP_TYPED(name, ver, int32_t) \ UNARY_OP_TYPED(name, ver, int64_t) \ UNARY_OP_HFDX(name, ver) -#define UNARY_OP_BWUZCSILHFD(name, ver) \ - UNARY_OP_TYPED(name, ver, uint8_t) \ - UNARY_OP_TYPED(name, ver, uint16_t) \ - UNARY_OP_TYPED(name, ver, uint32_t) \ - UNARY_OP_TYPED(name, ver, uint64_t) \ - UNARY_OP_CSILHFD(name, ver) +#define UNARY_OP_BWUZCSILHFDX(name, ver) \ + UNARY_OP_TYPED(name, ver, uint8_t) \ + UNARY_OP_TYPED(name, ver, uint16_t) \ + UNARY_OP_TYPED(name, ver, uint32_t) \ + UNARY_OP_TYPED(name, ver, uint64_t) \ + UNARY_OP_CSILHFDX(name, ver) UNARY_OP_VERSIONED_BWUZCSILHFD(Abs, 6, 12) 
UNARY_OP_VERSIONED_CSILHFD(Neg, 6, 12) @@ -237,8 +237,8 @@ UNARY_OP_VERSIONED_HFD(Log, 6, 12) UNARY_OP_VERSIONED_HFD(Exp, 6, 12) UNARY_OP_VERSIONED_HFD(Erf, 9, 12) -UNARY_OP_BWUZCSILHFD(Abs, 13) -UNARY_OP_CSILHFD(Neg, 13) +UNARY_OP_BWUZCSILHFDX(Abs, 13) +UNARY_OP_CSILHFDX(Neg, 13) UNARY_OP_HFD(Floor, 13) UNARY_OP_HFD(Ceil, 13) UNARY_OP_HFD(Reciprocal, 13) @@ -246,7 +246,7 @@ UNARY_OP_HFDX(Sqrt, 13) UNARY_OP_HFD(Log, 13) UNARY_OP_HFDX(Exp, 13) UNARY_OP_HFDX(Erf, 13) -UNARY_OP_BWUZCSILHFD(Sign, 13) +UNARY_OP_BWUZCSILHFDX(Sign, 13) UNARY_LOGICALOP_NOT_TYPED(1, bool) UNARY_OP_HFD(Round, 11) diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc index 3d66003fb2bca..77579dfc793ee 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc +++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc @@ -47,6 +47,7 @@ OpBuilderRegistrations::OpBuilderRegistrations() { CreateSimpleOpBuilder("Sin", *this); CreateSimpleOpBuilder("Sqrt", *this); CreateSimpleOpBuilder("Sub", *this); + CreateSimpleOpBuilder("Sum", *this); CreateSimpleOpBuilder("Tanh", *this); CreateSimpleOpBuilder("Concat", *this); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h index a79f4056083c5..df9d0de8e0e3e 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h @@ -158,6 +158,7 @@ class BaseOpBuilder : public IOpBuilder { {"Softmax", QNN_OP_SOFTMAX}, {"Sqrt", QNN_OP_ELEMENT_WISE_SQUARE_ROOT}, {"Sub", QNN_OP_ELEMENT_WISE_SUBTRACT}, + {"Sum", QNN_OP_ELEMENT_WISE_ADD}, {"Tanh", QNN_OP_TANH}, {"Transpose", QNN_OP_TRANSPOSE}, {"GridSample", QNN_OP_GRID_SAMPLE}, diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc index 48c637cd2e951..229d86082f6dc 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc @@ -56,11 +56,18 @@ Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper, padding_mode.c_str()); } - // ONNX's Min and Max operators accept a variable number of inputs (i.e., variadic). - // However, QNN's Min and Max operators must take in exactly two inputs. + // ONNX's Min, Max, and Sum operators accept a variable number of inputs (i.e., variadic). + // However, QNN's Min, Max, and Add operators must take in exactly two inputs. if (op_type == "Min" || op_type == "Max") { ORT_RETURN_IF_NOT(node_unit.Inputs().size() == 2, - "QNN EP only supports Min and Max operators with exactly 2 inputs."); + "QNN EP only supports ", op_type.c_str(), " operator with exactly 2 inputs."); + } + + if (op_type == "Sum") { + size_t inputs_num = node_unit.Inputs().size(); + ORT_RETURN_IF_NOT(inputs_num == 2, + "QNN EP supports Sum operator with QNN_OP_ELEMENT_WISE_ADD, which takes exactly 2 inputs. 
Got ONNX's Sum operator with ", + std::to_string(inputs_num).c_str(), " inputs."); } if (op_type == "DequantizeLinear") { diff --git a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc index 13004af25726d..6891b8159b090 100644 --- a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc +++ b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc @@ -4,15 +4,18 @@ #include "core/providers/common.h" #include "core/providers/webgpu/math/binary_elementwise_ops.h" #include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/string_macros.h" #include "core/providers/webgpu/webgpu_supported_types.h" namespace onnxruntime { namespace webgpu { Status BinaryElementwiseProgram::GenerateShaderCode(ShaderHelper& shader) const { - const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); - const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); + const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); const auto& c = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); + shader.AdditionalImplementation() << additional_impl_; + shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size"); // check whether can use element-wise mode. @@ -142,8 +145,15 @@ Status BinaryElementwise::ComputeInternal(ComputeContext& context) const { } uint32_t vec_size = onnxruntime::narrow((size + 3) / 4); + + std::string additional_impl; + if (get_additional_impl_) { + additional_impl = get_additional_impl_(lhs_tensor->GetElementType(), rhs_tensor->GetElementType()); + } + BinaryElementwiseProgram program{kernel_name_, expression_, + additional_impl, is_broadcast, is_lhs_scalar, is_rhs_scalar, @@ -273,7 +283,28 @@ WEBGPU_BINARY_VERSIONED_KERNEL(Sub, 7, 12, Sub, WebGpuSupportedNumberTypes()) WEBGPU_BINARY_VERSIONED_KERNEL(Sub, 13, 13, Sub, WebGpuSupportedNumberTypes()) WEBGPU_BINARY_KERNEL(Sub, 14, Sub, WebGpuSupportedNumberTypes()) -WEBGPU_BINARY_IMPL(Pow, "output_value_t(pow(vec4(a), vec4(b)))") +std::string GetPowImpl(int lhs_element_type, int /* rhs_element_type */) { + SS(s, 1024); + std::string round_str; + if (lhs_element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32) { + round_str = "round"; + } + + s << "fn pow_custom(a : input_a_element_t, b : f32) -> input_a_element_t {\n" + " if (b == 0.0) {\n" + " return input_a_element_t(1.0);\n" + " } else if (a < input_a_element_t(0.0) && b != floor(b)) {\n" + " return input_a_element_t(pow(f32(a), b)); // NaN\n" + " }\n" + << " return select(sign(a), input_a_element_t(1.0), round(abs(b) % 2.0) != 1.0) * input_a_element_t(" << round_str << "(pow(f32(abs(a)), b)));\n" + << "}\n" + "fn pow_v(a : vec4, b : vec4) -> vec4 {\n" + " return vec4(pow_custom(a.x, f32(b.x)), pow_custom(a.y, f32(b.y)), pow_custom(a.z, f32(b.z)), pow_custom(a.w, f32(b.w)));\n" + "}\n"; + return SS_GET(s); +} + +WEBGPU_BINARY_IMPL(Pow, "pow_v(a, b)", GetPowImpl) WEBGPU_BINARY_VERSIONED_KERNEL(Pow, 7, 11, Pow, WebGpuSupportedNumberTypes()) WEBGPU_BINARY_VERSIONED_KERNEL_2(Pow, 12, 12, Pow, WebGpuSupportedNumberTypes(), WebGpuSupportedNumberTypes()) WEBGPU_BINARY_VERSIONED_KERNEL_2(Pow, 13, 14, Pow, 
WebGpuSupportedNumberTypes(), WebGpuSupportedNumberTypes()) diff --git a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h index 84cbcdf3244d8..f80accfb934f8 100644 --- a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h +++ b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h @@ -14,11 +14,13 @@ class BinaryElementwiseProgram final : public Program public: BinaryElementwiseProgram(const std::string& kernel_name, const std::string& expression, + const std::string& additional_impl, const bool is_broadcast, const bool is_lhs_scalar, const bool is_rhs_scalar, const bool vectorize) : Program{kernel_name}, expression_{expression}, + additional_impl_{additional_impl}, is_broadcast_{is_broadcast}, is_lhs_scalar_{is_lhs_scalar}, is_rhs_scalar_{is_rhs_scalar}, @@ -29,7 +31,8 @@ class BinaryElementwiseProgram final : public Program WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"vec_size", ProgramUniformVariableDataType::Uint32}); private: - std::string expression_; + std::string_view expression_; + std::string_view additional_impl_; bool is_broadcast_; bool is_lhs_scalar_; bool is_rhs_scalar_; @@ -38,11 +41,15 @@ class BinaryElementwiseProgram final : public Program class BinaryElementwise : public WebGpuKernel { public: + using GetAdditionalImplementationFunction = std::string (*)(int lhs_element_type, int rhs_element_type); + BinaryElementwise(const OpKernelInfo& info, const std::string& kernel_name, - const std::string& expression) : WebGpuKernel{info}, - kernel_name_{kernel_name}, - expression_{expression} {} + const std::string& expression, + const GetAdditionalImplementationFunction get_additional_impl = nullptr) : WebGpuKernel{info}, + kernel_name_{kernel_name}, + expression_{expression}, + get_additional_impl_{get_additional_impl} {} protected: Status ComputeInternal(ComputeContext& context) const final; @@ -50,6 +57,7 @@ class BinaryElementwise : public WebGpuKernel { private: std::string kernel_name_; std::string expression_; + const GetAdditionalImplementationFunction get_additional_impl_; }; } // namespace webgpu diff --git a/onnxruntime/core/providers/webgpu/math/cum_sum.cc b/onnxruntime/core/providers/webgpu/math/cum_sum.cc new file mode 100644 index 0000000000000..bc4cd70a238fc --- /dev/null +++ b/onnxruntime/core/providers/webgpu/math/cum_sum.cc @@ -0,0 +1,98 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/webgpu/math/cum_sum.h" +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" + +namespace onnxruntime { +namespace webgpu { + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + CumSum, + kOnnxDomain, + 11, 13, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedFloatTypes()) + .TypeConstraint("T2", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}) + .InputMemoryType(OrtMemTypeCPU, 1), + CumSum); + +ONNX_OPERATOR_KERNEL_EX( + CumSum, + kOnnxDomain, + 14, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedFloatTypes()) + .TypeConstraint("T2", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}) + .InputMemoryType(OrtMemTypeCPU, 1), + CumSum); + +Status CumSumProgram::GenerateShaderCode(ShaderHelper& shader) const { + const ShaderVariableHelper& input = shader.AddInput("input", ShaderUsage::UseUniform); + const ShaderVariableHelper& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); + + shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size") + << "var input_indices = " << input.OffsetToIndices("global_idx") << ";\n" + << "var sum : output_value_t = 0;\n" + << "var first : i32 = 0;\n" + << "if (uniforms.reverse == 1) {\n" + << " first = i32(" + input.IndicesGet("input_indices", "uniforms.axis") + ");\n" + << " if (uniforms.exclusive == 1) { first += 1; }\n" + << "}\n\n" + << "var last : i32 = 0;\n" + << "if (uniforms.reverse == 1) {\n" + << " last = i32(" << GetElementAt("uniforms.input_shape", "uniforms.axis", input.Rank()) << ");\n" + << "} else {\n" + << " last = i32(" + input.IndicesGet("input_indices", "uniforms.axis") + ");\n" + << " if (uniforms.exclusive == 0) { last += 1; }\n" + << "}\n\n" + << "for (var i : i32 = first; i < last; i++) {\n" + << " " << input.IndicesSet("input_indices", "uniforms.axis", "u32(i)") << ";\n" + << " sum = sum + " << input.GetByIndices("input_indices") << ";\n" + << "}\n" + << output.SetByOffset("global_idx", "sum"); + + return Status::OK(); +} + +Status CumSum::ComputeInternal(ComputeContext& context) const { + const auto* input_tensor = context.Input(0); + const TensorShape& input_shape = input_tensor->Shape(); + int64_t input_rank = input_shape.NumDimensions(); + + const auto* axis_tensor = context.Input(1); + const auto* axis_data = axis_tensor->Data(); + int64_t axis = static_cast(axis_data[0]); + + ORT_ENFORCE(-input_rank <= axis && axis < input_rank, "Axes attribute must be within range -input_rank <= axis < input_rank."); + // Handle negative axis + if (axis < 0) { + axis += input_rank; + } + + auto* output_tensor = context.Output(0, input_shape); + int64_t output_size = output_tensor->Shape().Size(); + + if (output_size == 0) { + return Status::OK(); + } + + CumSumProgram program{}; + program + .AddInput({input_tensor}) + .AddOutput({output_tensor, ProgramTensorMetadataDependency::TypeAndRank}) + .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .AddUniformVariables({{static_cast(output_size)}, + {static_cast(axis)}, + {static_cast(exclusive_)}, + {static_cast(reverse_)}}); + return context.RunProgram(program); +} + +} // namespace webgpu +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/webgpu/math/cum_sum.h b/onnxruntime/core/providers/webgpu/math/cum_sum.h new file mode 100644 index 
0000000000000..6a66ee0ed7b04 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/math/cum_sum.h @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/webgpu/webgpu_kernel.h" +#include "core/providers/webgpu/program.h" + +namespace onnxruntime { +namespace webgpu { + +class CumSumProgram final : public Program { + public: + CumSumProgram() : Program{"CumSum"} {} + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32}, + {"axis", ProgramUniformVariableDataType::Uint32}, + {"exclusive", ProgramUniformVariableDataType::Uint32}, + {"reverse", ProgramUniformVariableDataType::Uint32}); +}; + +class CumSum final : public WebGpuKernel { + public: + CumSum(const OpKernelInfo& info) : WebGpuKernel(info) { + exclusive_ = info.GetAttrOrDefault("exclusive", 0); + reverse_ = info.GetAttrOrDefault("reverse", 0); + } + + Status ComputeInternal(ComputeContext& context) const override; + + private: + int64_t exclusive_; + int64_t reverse_; +}; + +} // namespace webgpu +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc index 1a56cafdb3952..11fa30c798809 100644 --- a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc @@ -11,7 +11,7 @@ namespace onnxruntime { namespace webgpu { -#define REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceOp, begin, end) \ +#define REGISTER_REDUCE_VERSIONED_KERNEL(ReduceOp, begin, end) \ ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ ReduceOp, \ kOnnxDomain, \ @@ -20,7 +20,16 @@ namespace webgpu { (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()), \ ReduceOp); -#define REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceOp, version) \ +#define REGISTER_REDUCE_VERSIONED_KERNEL_WITH_AXIS_IN_INPUT(ReduceOp, begin, end) \ + ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ + ReduceOp, \ + kOnnxDomain, \ + begin, end, \ + kWebGpuExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()).InputMemoryType(OrtMemTypeCPUInput, 1), \ + ReduceOp); + +#define REGISTER_REDUCE_KERNEL(ReduceOp, version) \ ONNX_OPERATOR_KERNEL_EX( \ ReduceOp, \ kOnnxDomain, \ @@ -29,58 +38,66 @@ namespace webgpu { (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()).InputMemoryType(OrtMemTypeCPUInput, 1), \ ReduceOp); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 11, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 13, 17); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMean, 18); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMean, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMean, 11, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMean, 13, 17); +REGISTER_REDUCE_KERNEL(ReduceMean, 18); + +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 11, 11); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 12, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 13, 17); +REGISTER_REDUCE_VERSIONED_KERNEL_WITH_AXIS_IN_INPUT(ReduceMax, 18, 19); +REGISTER_REDUCE_KERNEL(ReduceMax, 20); + +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 11, 11); 
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 12, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 13, 17); +REGISTER_REDUCE_VERSIONED_KERNEL_WITH_AXIS_IN_INPUT(ReduceMin, 18, 19); +REGISTER_REDUCE_KERNEL(ReduceMin, 20); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 11, 11); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 12, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 13, 17); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 18, 19); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMax, 20); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSum, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSum, 11, 12); +REGISTER_REDUCE_KERNEL(ReduceSum, 13); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 11, 11); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 12, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 13, 17); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 18, 19); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMin, 20); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceProd, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceProd, 11, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceProd, 13, 17); +REGISTER_REDUCE_KERNEL(ReduceProd, 18); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 11, 12); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSum, 13); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL1, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL1, 11, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL1, 13, 17); +REGISTER_REDUCE_KERNEL(ReduceL1, 18); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceProd, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceProd, 11, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceProd, 13, 17); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceProd, 18); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL2, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL2, 11, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL2, 13, 17); +REGISTER_REDUCE_KERNEL(ReduceL2, 18); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL1, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL1, 11, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL1, 13, 17); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceL1, 18); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSum, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSum, 11, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSum, 13, 17); +REGISTER_REDUCE_KERNEL(ReduceLogSum, 18); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL2, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL2, 11, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL2, 13, 17); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceL2, 18); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSumSquare, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSumSquare, 11, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSumSquare, 13, 17); +REGISTER_REDUCE_KERNEL(ReduceSumSquare, 18); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSum, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSum, 11, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSum, 13, 17); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceLogSum, 18); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSumExp, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSumExp, 11, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSumExp, 13, 17); +REGISTER_REDUCE_KERNEL(ReduceLogSumExp, 18); 
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSumSquare, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSumSquare, 11, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSumSquare, 13, 17); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSumSquare, 18); +REGISTER_REDUCE_VERSIONED_KERNEL(ArgMax, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ArgMax, 11, 12); +REGISTER_REDUCE_KERNEL(ArgMax, 13); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSumExp, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSumExp, 11, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSumExp, 13, 17); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceLogSumExp, 18); +REGISTER_REDUCE_VERSIONED_KERNEL(ArgMin, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ArgMin, 11, 12); +REGISTER_REDUCE_KERNEL(ArgMin, 13); Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); @@ -105,6 +122,9 @@ Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const { std::stringstream ss; std::string index = "i" + std::to_string(i); ss << "for (var " << index << " : u32 = 0; " << index << " < " << input.IndicesGet("uniforms.input_shape", i) << "; " << index << "++) {\n"; + if (loop_body.find("last_index") != std::string::npos) { + ss << "let last_index = " + index + ";\n"; + } ss << input.IndicesSet("input_indices", i, index) << ";\n"; ss << loop_body << "\n"; ss << "}\n"; @@ -171,12 +191,13 @@ Status ReduceKernel::ComputeInternal(ComputeContext& context) auto output = context.Output(0, input_tensor->Shape()); // We need to run the operation even for scalar inputs for these ops const auto code = GetOpSpecificCode(input_tensor); + constexpr uint32_t output_size = 1; + constexpr uint32_t reduce_axes = 0; ReduceKernelProgram program(name_, keepdims_, noop_with_empty_axes_, input_axes, code, false); - std::vector reduce_axes = {0}; program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank}) .AddOutput({output, ProgramTensorMetadataDependency::TypeAndRank}) .SetDispatchGroupSize(1) - .AddUniformVariables({{1}, {static_cast(noop_with_empty_axes_ ? 1 : 0)}, {reduce_axes}}); + .AddUniformVariables({{output_size}, {static_cast(noop_with_empty_axes_ ? 1 : 0)}, {reduce_axes}}); return context.RunProgram(program); } else { // For other ops, or when axes is empty with noop_with_empty_axes_ true, just copy the input @@ -328,5 +349,25 @@ ReduceOpSpecificCode ReduceLogSumExp::GetOpSpecificCode(const Tensor* input_tens return code; } +ReduceOpSpecificCode ArgMin::GetOpSpecificCode(const Tensor* input_tensor) const { + ORT_UNUSED_PARAMETER(input_tensor); + std::string op = (select_last_index_) ? "<=" : "<"; + std::string loop_header = "var best_element = first_element; var best_index = u32(0);"; + std::string loop_body = "if (current_element " + op + " best_element) { best_element = current_element; best_index = last_index; };"; + std::string loop_footer = "let output_value = output_value_t(best_index);"; + ReduceOpSpecificCode code({loop_header, loop_body, loop_footer}); + return code; +} + +ReduceOpSpecificCode ArgMax::GetOpSpecificCode(const Tensor* input_tensor) const { + ORT_UNUSED_PARAMETER(input_tensor); + std::string op = (select_last_index_) ? 
">=" : ">"; + std::string loop_header = "var best_element = first_element; var best_index = u32(0);"; + std::string loop_body = "if (current_element " + op + " best_element) { best_element = current_element; best_index = last_index; };"; + std::string loop_footer = "let output_value = output_value_t(best_index);"; + ReduceOpSpecificCode code({loop_header, loop_body, loop_footer}); + return code; +} + } // namespace webgpu -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h index 291d931f41c05..70ae6d3c71eb9 100644 --- a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h +++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h @@ -119,5 +119,17 @@ class ReduceLogSumExp final : public ReduceKernel { ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override; }; +class ArgMin final : public ReduceKernel { + public: + ArgMin(const OpKernelInfo& info) : ReduceKernel(info, "ArgMin", true) {} + ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override; +}; + +class ArgMax final : public ReduceKernel { + public: + ArgMax(const OpKernelInfo& info) : ReduceKernel(info, "ArgMax", true) {} + ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override; +}; + } // namespace webgpu } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index dfb2e4b6ce665..aacbcc5fb4f0a 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -297,12 +297,12 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 12, MatMul); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, MatMul); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, float, ArgMax); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, float, ArgMax); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, float, ArgMax); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, float, ArgMin); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, float, ArgMin); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, float, ArgMin); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ArgMax); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ArgMax); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ArgMax); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ArgMin); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ArgMin); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ArgMin); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Softmax); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 
11, 12, Softmax); @@ -624,13 +624,13 @@ std::unique_ptr RegisterKernels() { // BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -713,8 +713,8 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/webnn/allocator.cc b/onnxruntime/core/providers/webnn/allocator.cc index 9c5cd651e1f00..8cf5b8cd72a5c 100644 --- a/onnxruntime/core/providers/webnn/allocator.cc +++ b/onnxruntime/core/providers/webnn/allocator.cc @@ -16,7 +16,7 @@ void* WebNNTensorAllocator::Alloc(size_t size) { // We don't need to transfer the tensor to an MLTensor, so we don't need to allocate an MLTensor id. return nullptr; } - void* p = EM_ASM_PTR({ return Module.jsepReserveTensorId(); }); + void* p = EM_ASM_PTR({ return Module.webnnReserveTensorId(); }); allocations_[p] = size; stats_.num_allocs++; stats_.bytes_in_use += SafeInt(size); @@ -27,7 +27,7 @@ void WebNNTensorAllocator::Free(void* p) { if (p == nullptr) { return; } - EM_ASM({ Module.jsepReleaseTensorId($0); }, p); + EM_ASM({ Module.webnnReleaseTensorId($0); }, p); size_t size = allocations_[p]; stats_.bytes_in_use -= size; allocations_.erase(p); diff --git a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc index 6814b019f699c..08580ab2861d7 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc @@ -54,9 +54,9 @@ Status ArgMaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const auto& op_type = node.OpType(); if (op_type == "ArgMax") { - output = model_builder.GetBuilder().call("argMax", input, narrow(axis), options); + output = model_builder.GetBuilder().call("argMax", input, SafeInt(axis).Ref(), options); } else if (op_type == "ArgMin") { - output = model_builder.GetBuilder().call("argMin", input, narrow(axis), options); + output = model_builder.GetBuilder().call("argMin", input, SafeInt(axis).Ref(), options); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "ArgMaxMinOpBuilder, unknown op: ", op_type); } diff --git a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc index 113cc3df5438d..63e2345243282 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. 
-#include #include #include "core/providers/shared/utils/utils.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc index 1bbe56ef9b477..ee2512ddd8b5a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc index 1361b7dd5c14b..4c393e8a9bdba 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc index be30c5520d62e..99be8f75771ad 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/framework/tensorprotoutils.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" @@ -64,8 +63,8 @@ Status CumSumOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const options.set("label", node.Name()); emscripten::val output = emscripten::val::object(); - output = model_builder.GetBuilder().call("cumulativeSum", input, gsl::narrow(webnn_axis), - options); + output = model_builder.GetBuilder().call("cumulativeSum", input, + SafeInt(webnn_axis).Ref(), options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); } diff --git a/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc index 55746bb1f61f0..f3363b1e186d5 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc index 1f51e26fecfa5..6cee04bac3e2b 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/common/safeint.h" #include "core/framework/tensorprotoutils.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc index 8402f05d8e234..3f813f08279e7 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/framework/tensorprotoutils.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc index d0ece026a7048..c4ff280b95b6e 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index c5cc8e86bb308..1f24124745a19 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" @@ -86,9 +85,9 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N else if (extended_a_shape) { std::vector new_shape; for (size_t i = 0; i < b_shape.size() - 2; i++) { - new_shape.push_back(narrow(b_shape[i])); + new_shape.push_back(SafeInt(b_shape[i])); } - new_shape.push_back(narrow(b_shape.back())); + new_shape.push_back(SafeInt(b_shape.back())); output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(new_shape), @@ -98,7 +97,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N else if (extended_b_shape) { std::vector new_shape; for (size_t i = 0; i < a_shape.size() - 1; i++) { - new_shape.push_back(narrow(a_shape[i])); + new_shape.push_back(SafeInt(a_shape[i])); } output = model_builder.GetBuilder().call("reshape", output, diff --git a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc index a090c21fe3356..5b57df7f184e7 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. 
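Reviewer note (not part of the patch): several hunks in this area (ArgMax/ArgMin, CumSum, the Gemm reshape shape vector, and later Split and Triangular) swap narrow/gsl::narrow for ORT's SafeInt wrapper when converting ONNX int64 dimensions and attributes to the narrower integers WebNN expects, passing the result to emscripten via .Ref() as shown above. Both helpers fail loudly instead of truncating silently. The template arguments were stripped from the patch text here, so the destination types are not visible; the snippet below is a standalone sketch of checked narrowing using a hypothetical checked_narrow helper, not ORT's actual SafeInt implementation.

// Standalone sketch of checked narrowing (hypothetical helper, not ORT's SafeInt).
#include <cstdint>
#include <iostream>
#include <stdexcept>

template <typename To, typename From>
To checked_narrow(From value) {
  // Round-trip through the destination type; if the value changes or the sign
  // flips, the conversion would have truncated, so throw instead.
  const To narrowed = static_cast<To>(value);
  if (static_cast<From>(narrowed) != value ||
      ((value < From{}) != (narrowed < To{}))) {
    throw std::overflow_error("value does not fit in the destination type");
  }
  return narrowed;
}

int main() {
  const int64_t axis = 3;
  std::cout << "axis as uint32: " << checked_narrow<uint32_t>(axis) << "\n";
  try {
    checked_narrow<uint32_t>(int64_t{-1});  // negative value: throws
  } catch (const std::overflow_error& e) {
    std::cout << "rejected: " << e.what() << "\n";
  }
  return 0;
}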
-#include "core/common/safeint.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc index d8373a45e4423..e8f26af928ab3 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc index b338d27986279..79ad3574e07e9 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc index 1bb6523c6f86a..ed62b2bd69618 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc index 93ad933d71c34..b23fbeba1ddc8 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc index 0a438e98ad737..2fc47430a1c66 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/common/safeint.h" #include "core/framework/tensorprotoutils.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc index 58515d2df54ec..eec6911a686cf 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc @@ -4,7 +4,6 @@ #include -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/webnn/builders/helper.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc index 1688dfc97a0c4..0a84835ee9fc0 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc @@ -89,7 +89,7 @@ Status RotaryEmbeddingOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_build emscripten::val wnn_builder = model_builder.GetBuilder(); NodeAttrHelper helper(node); - const bool interleaved = gsl::narrow_cast(helper.Get("interleaved", 0)); + const bool interleaved = static_cast(helper.Get("interleaved", 0)); uint32_t num_heads = helper.Get("num_heads", 0); uint32_t rotary_embedding_dim = helper.Get("rotary_embedding_dim", 0); diff --git a/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc index 4adc5920de7fa..468c0e24a3e88 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/framework/tensorprotoutils.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc index b1b737b114998..0e754b53e78d1 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc index 06dbacf995a28..21b44b1066694 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" @@ -75,8 +74,8 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // Check that the splits evenly divide. 
if (split_count > 0 && splits.empty() && input_shape[axis] % split_count != 0) { // Divide inputs into variable size outputs: - splits.insert(splits.end(), split_count - 1, narrow(input_shape[axis]) / split_count); - splits.insert(splits.end(), narrow(input_shape[axis]) % split_count); + splits.insert(splits.end(), split_count - 1, SafeInt(input_shape[axis]) / split_count); + splits.insert(splits.end(), SafeInt(input_shape[axis]) % split_count); } if (splits.empty()) { diff --git a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc index a3be9d7e2ceee..5687b1133c628 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc index 672a3a510d54d..259bb0552b7c7 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/framework/tensorprotoutils.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc index ac440e0119bac..452071f469c4f 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/common/safeint.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" #include "core/providers/webnn/builders/model_builder.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc index 0c818533918a4..f2092d6163713 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc @@ -59,7 +59,7 @@ Status TriangularOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, std::vector unpacked_tensor; ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(diagonal_tensor, unpacked_tensor)); const auto diagonal = *reinterpret_cast(unpacked_tensor.data()); - options.set("diagonal", narrow(diagonal)); + options.set("diagonal", SafeInt(diagonal).Ref()); } output = model_builder.GetBuilder().call("triangular", input, options); diff --git a/onnxruntime/core/providers/webnn/builders/model.cc b/onnxruntime/core/providers/webnn/builders/model.cc index 35964d85862e4..40fdfc609e6a1 100644 --- a/onnxruntime/core/providers/webnn/builders/model.cc +++ b/onnxruntime/core/providers/webnn/builders/model.cc @@ -8,7 +8,6 @@ #include "core/common/common.h" #include "core/common/inlined_containers.h" #include "core/common/logging/logging.h" -#include "core/common/safeint.h" #include "core/graph/onnx_protobuf.h" #include "core/providers/common.h" #include "model.h" @@ -157,7 +156,7 @@ onnxruntime::common::Status Model::Compute(const InlinedHashMap& inputs, const InlinedHashMap& outputs) { - auto jsepEnsureTensor = emscripten::val::module_property("jsepEnsureTensor"); + auto webnnEnsureTensor = emscripten::val::module_property("webnnEnsureTensor"); auto promises = emscripten::val::array(); for (const auto& [_, tensor] : inputs) { emscripten::val shape = emscripten::val::array(); @@ -165,7 +164,7 @@ onnxruntime::common::Status Model::Dispatch(const InlinedHashMap(dim); shape.call("push", dim_val); } - auto ml_tensor = jsepEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, true); + auto ml_tensor = webnnEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, true); promises.call("push", ml_tensor); } for (const auto& [_, tensor] : outputs) { @@ -174,7 +173,7 @@ onnxruntime::common::Status Model::Dispatch(const InlinedHashMap(dim); shape.call("push", dim_val); } - auto ml_tensor = jsepEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, false); + auto ml_tensor = webnnEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, false); promises.call("push", ml_tensor); } auto ml_tensors = emscripten::val::global("Promise").call("all", promises).await(); diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.cc b/onnxruntime/core/providers/webnn/builders/model_builder.cc index 661b2ad7056c2..399cc5faf6273 100644 --- a/onnxruntime/core/providers/webnn/builders/model_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/model_builder.cc @@ -9,7 +9,6 @@ #include "helper.h" #include "op_builder_factory.h" -#include "core/common/safeint.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" @@ -140,13 +139,13 @@ Status 
ModelBuilder::RegisterInitializers() { ORT_RETURN_IF_ERROR(utils::GetExternalDataInfo( tensor, graph_viewer_.ModelPath(), external_file_path, data_offset, tensor_byte_size)); - auto jsepRegisterMLConstant = emscripten::val::module_property("jsepRegisterMLConstant"); - operand = jsepRegisterMLConstant(emscripten::val(external_file_path), - static_cast(data_offset), - static_cast(tensor_byte_size), - wnn_builder_, - desc, - should_convert_int64_to_int32); + auto webnnRegisterMLConstant = emscripten::val::module_property("webnnRegisterMLConstant"); + operand = webnnRegisterMLConstant(emscripten::val(external_file_path), + static_cast(data_offset), + static_cast(tensor_byte_size), + wnn_builder_, + desc, + should_convert_int64_to_int32); } else { if (tensor.has_raw_data()) { tensor_ptr = reinterpret_cast(const_cast(tensor.raw_data().c_str())); @@ -288,7 +287,7 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i desc.set("dataType", emscripten::val("int32")); } wnn_operands_.insert(std::make_pair(name, wnn_builder_.call("input", name, desc))); - emscripten::val::module_property("jsepRegisterGraphInput")(name); + emscripten::val::module_property("webnnRegisterGraphInput")(name); input_names_.push_back(name); } else { output_names_.push_back(name); diff --git a/onnxruntime/core/providers/webnn/data_transfer.cc b/onnxruntime/core/providers/webnn/data_transfer.cc index 44e9bf9edf3d9..aa85277b72453 100644 --- a/onnxruntime/core/providers/webnn/data_transfer.cc +++ b/onnxruntime/core/providers/webnn/data_transfer.cc @@ -29,11 +29,11 @@ common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const { const auto& dst_device = dst.Location().device; if (dst_device.Type() == OrtDevice::GPU) { - EM_ASM({ Module.jsepUploadTensor($0, HEAPU8.subarray($1, $1 + $2)); }, dst_data, reinterpret_cast(src_data), bytes); + EM_ASM({ Module.webnnUploadTensor($0, HEAPU8.subarray($1, $1 + $2)); }, dst_data, reinterpret_cast(src_data), bytes); } else { - auto jsepDownloadTensor = emscripten::val::module_property("jsepDownloadTensor"); + auto webnnDownloadTensor = emscripten::val::module_property("webnnDownloadTensor"); auto subarray = emscripten::typed_memory_view(bytes, static_cast(dst_data)); - jsepDownloadTensor(reinterpret_cast(src_data), subarray).await(); + webnnDownloadTensor(reinterpret_cast(src_data), subarray).await(); } } diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc index 7410ff66add30..2da7c6499933a 100644 --- a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc +++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc @@ -10,7 +10,6 @@ #include "core/framework/kernel_registry.h" #include "core/graph/graph_viewer.h" #include "core/session/onnxruntime_cxx_api.h" -#include "core/common/safeint.h" #include "core/providers/webnn/allocator.h" #include "core/providers/webnn/data_transfer.h" #include "core/providers/partitioning_utils.h" @@ -284,7 +283,7 @@ class WebNNMemcpy : public OpKernel { explicit WebNNMemcpy(const OpKernelInfo& info) : OpKernel(info) {} Status Compute(OpKernelContext* context) const override { - auto jsepEnsureTensor = emscripten::val::module_property("jsepEnsureTensor"); + auto webnnEnsureTensor = emscripten::val::module_property("webnnEnsureTensor"); const auto* X = context->Input(0); ORT_ENFORCE(X != nullptr, "Memcpy: input tensor is null"); auto* Y = context->Output(0, X->Shape()); @@ -294,10 +293,10 @@ class WebNNMemcpy : public 
OpKernel { shape.call("push", SafeInt(dim).Ref()); } - jsepEnsureTensor(emscripten::val::undefined(), - reinterpret_cast(Y->MutableDataRaw()), - Y->GetElementType(), - shape, false) + webnnEnsureTensor(emscripten::val::undefined(), + reinterpret_cast(Y->MutableDataRaw()), + Y->GetElementType(), + shape, false) .await(); const auto* data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device); diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc index c828ae9400174..8d972f7d63bc1 100644 --- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc @@ -57,7 +57,7 @@ bool MaxPool::IsOnnxNodeSupported(const NodeUnit& node_unit, // input of maxpool could be fp16/fp32/fp64,i8/u8 according to ONNX if (x_type == nullptr || (x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && -// because pool_fp16_op_test can be enabled by other preprocessor, for example, COREML_ENABLE_MLPROGRAM +// because pool_fp16_op_test can be enabled by other preprocessor, for example, USE_COREML #ifdef XNNPACK_FP16_SUPPORTED x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 && #endif diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py index 6b5f7526cc506..785eb9c485d25 100644 --- a/onnxruntime/python/onnxruntime_inference_collection.py +++ b/onnxruntime/python/onnxruntime_inference_collection.py @@ -15,6 +15,9 @@ from onnxruntime.capi import _pybind_state as C if typing.TYPE_CHECKING: + import numpy as np + import numpy.typing as npt + import onnxruntime @@ -59,22 +62,22 @@ def export_adapter(self, file_path: os.PathLike): """ self._adapter.export_adapter(file_path) - def get_format_version(self): + def get_format_version(self) -> int: return self._adapter.format_version - def set_adapter_version(self, adapter_version: int): + def set_adapter_version(self, adapter_version: int) -> None: self._adapter.adapter_version = adapter_version - def get_adapter_version(self): + def get_adapter_version(self) -> int: return self._adapter.adapter_version - def set_model_version(self, model_version: int): + def set_model_version(self, model_version: int) -> None: self._adapter.model_version = model_version - def get_model_version(self): + def get_model_version(self) -> int: return self._adapter.model_version - def set_parameters(self, params: dict[str, OrtValue]): + def set_parameters(self, params: dict[str, OrtValue]) -> None: self._adapter.parameters = {k: v._ortvalue for k, v in params.items()} def get_parameters(self) -> dict[str, OrtValue]: @@ -174,27 +177,27 @@ def __init__(self): self._sess = None self._enable_fallback = True - def get_session_options(self): + def get_session_options(self) -> onnxruntime.SessionOptions: "Return the session options. See :class:`onnxruntime.SessionOptions`." return self._sess_options - def get_inputs(self): + def get_inputs(self) -> Sequence[onnxruntime.NodeArg]: "Return the inputs metadata as a list of :class:`onnxruntime.NodeArg`." return self._inputs_meta - def get_outputs(self): + def get_outputs(self) -> Sequence[onnxruntime.NodeArg]: "Return the outputs metadata as a list of :class:`onnxruntime.NodeArg`." 
return self._outputs_meta - def get_overridable_initializers(self): + def get_overridable_initializers(self) -> Sequence[onnxruntime.NodeArg]: "Return the inputs (including initializers) metadata as a list of :class:`onnxruntime.NodeArg`." return self._overridable_initializers - def get_modelmeta(self): + def get_modelmeta(self) -> onnxruntime.ModelMetadata: "Return the metadata. See :class:`onnxruntime.ModelMetadata`." return self._model_meta - def get_providers(self): + def get_providers(self) -> Sequence[str]: "Return list of registered execution providers." return self._providers @@ -202,7 +205,7 @@ def get_provider_options(self): "Return registered execution providers' configurations." return self._provider_options - def set_providers(self, providers=None, provider_options=None): + def set_providers(self, providers=None, provider_options=None) -> None: """ Register the input list of execution providers. The underlying session is re-created. @@ -224,13 +227,13 @@ def set_providers(self, providers=None, provider_options=None): # recreate the underlying C.InferenceSession self._reset_session(providers, provider_options) - def disable_fallback(self): + def disable_fallback(self) -> None: """ Disable session.run() fallback mechanism. """ self._enable_fallback = False - def enable_fallback(self): + def enable_fallback(self) -> None: """ Enable session.Run() fallback mechanism. If session.Run() fails due to an internal Execution Provider failure, reset the Execution Providers enabled for this session. @@ -249,7 +252,7 @@ def _validate_input(self, feed_input_names): f"Required inputs ({missing_input_names}) are missing from input feed ({feed_input_names})." ) - def run(self, output_names, input_feed, run_options=None): + def run(self, output_names, input_feed, run_options=None) -> Sequence[np.ndarray | SparseTensor | list | dict]: """ Compute the predictions. @@ -308,7 +311,7 @@ def callback(results: np.ndarray, user_data: MyData, err: str) -> None: output_names = [output.name for output in self._outputs_meta] return self._sess.run_async(output_names, input_feed, callback, user_data, run_options) - def run_with_ort_values(self, output_names, input_dict_ort_values, run_options=None): + def run_with_ort_values(self, output_names, input_dict_ort_values, run_options=None) -> Sequence[OrtValue]: """ Compute the predictions. @@ -367,7 +370,7 @@ def get_profiling_start_time_ns(self): """ return self._sess.get_profiling_start_time_ns - def io_binding(self): + def io_binding(self) -> IOBinding: "Return an onnxruntime.IOBinding object`." return IOBinding(self) @@ -550,7 +553,7 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi self._provider_options = self._sess.get_provider_options() self._profiling_start_time_ns = self._sess.get_profiling_start_time_ns - def _reset_session(self, providers, provider_options): + def _reset_session(self, providers, provider_options) -> None: "release underlying session object." # meta data references session internal structures # so they must be set to None to decrement _sess reference count. @@ -721,7 +724,7 @@ class OrtValue: This class provides APIs to construct and deal with OrtValues. 
""" - def __init__(self, ortvalue, numpy_obj=None): + def __init__(self, ortvalue: C.OrtValue, numpy_obj: np.ndarray | None = None): if isinstance(ortvalue, C.OrtValue): self._ortvalue = ortvalue # Hold a ref count to the numpy object if the OrtValue is backed directly @@ -733,11 +736,11 @@ def __init__(self, ortvalue, numpy_obj=None): "`Provided ortvalue` needs to be of type `onnxruntime.capi.onnxruntime_pybind11_state.OrtValue`" ) - def _get_c_value(self): + def _get_c_value(self) -> C.OrtValue: return self._ortvalue - @staticmethod - def ortvalue_from_numpy(numpy_obj, device_type="cpu", device_id=0): + @classmethod + def ortvalue_from_numpy(cls, numpy_obj: np.ndarray, /, device_type="cpu", device_id=0) -> OrtValue: """ Factory method to construct an OrtValue (which holds a Tensor) from a given Numpy object A copy of the data in the Numpy object is held by the OrtValue only if the device is NOT cpu @@ -749,7 +752,7 @@ def ortvalue_from_numpy(numpy_obj, device_type="cpu", device_id=0): # Hold a reference to the numpy object (if device_type is 'cpu') as the OrtValue # is backed directly by the data buffer of the numpy object and so the numpy object # must be around until this OrtValue instance is around - return OrtValue( + return cls( C.OrtValue.ortvalue_from_numpy( numpy_obj, C.OrtDevice( @@ -761,8 +764,8 @@ def ortvalue_from_numpy(numpy_obj, device_type="cpu", device_id=0): numpy_obj if device_type.lower() == "cpu" else None, ) - @staticmethod - def ortvalue_from_numpy_with_onnx_type(data, onnx_element_type: int): + @classmethod + def ortvalue_from_numpy_with_onnx_type(cls, data: np.ndarray, /, onnx_element_type: int) -> OrtValue: """ This method creates an instance of OrtValue on top of the numpy array. No data copy is made and the lifespan of the resulting OrtValue should never @@ -771,12 +774,14 @@ def ortvalue_from_numpy_with_onnx_type(data, onnx_element_type: int): when we want to use an ONNX data type that is not supported by numpy. :param data: numpy.ndarray. - :param onnx_elemenet_type: a valid onnx TensorProto::DataType enum value + :param onnx_element_type: a valid onnx TensorProto::DataType enum value """ - return OrtValue(C.OrtValue.ortvalue_from_numpy_with_onnx_type(data, onnx_element_type), data) + return cls(C.OrtValue.ortvalue_from_numpy_with_onnx_type(data, onnx_element_type), data) - @staticmethod - def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu", device_id: int = 0): + @classmethod + def ortvalue_from_shape_and_type( + cls, shape: Sequence[int], element_type, device_type: str = "cpu", device_id: int = 0 + ) -> OrtValue: """ Factory method to construct an OrtValue (which holds a Tensor) from given shape and element_type @@ -788,7 +793,7 @@ def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu", # Integer for onnx element type (see https://onnx.ai/onnx/api/mapping.html). # This is helpful for some data type (like TensorProto.BFLOAT16) that is not available in numpy. 
if isinstance(element_type, int): - return OrtValue( + return cls( C.OrtValue.ortvalue_from_shape_and_onnx_type( shape, element_type, @@ -800,7 +805,7 @@ def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu", ) ) - return OrtValue( + return cls( C.OrtValue.ortvalue_from_shape_and_type( shape, element_type, @@ -812,77 +817,77 @@ def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu", ) ) - @staticmethod - def ort_value_from_sparse_tensor(sparse_tensor): + @classmethod + def ort_value_from_sparse_tensor(cls, sparse_tensor: SparseTensor) -> OrtValue: """ The function will construct an OrtValue instance from a valid SparseTensor The new instance of OrtValue will assume the ownership of sparse_tensor """ - return OrtValue(C.OrtValue.ort_value_from_sparse_tensor(sparse_tensor._get_c_tensor())) + return cls(C.OrtValue.ort_value_from_sparse_tensor(sparse_tensor._get_c_tensor())) - def as_sparse_tensor(self): + def as_sparse_tensor(self) -> SparseTensor: """ The function will return SparseTensor contained in this OrtValue """ return SparseTensor(self._ortvalue.as_sparse_tensor()) - def data_ptr(self): + def data_ptr(self) -> int: """ Returns the address of the first element in the OrtValue's data buffer """ return self._ortvalue.data_ptr() - def device_name(self): + def device_name(self) -> str: """ Returns the name of the device where the OrtValue's data buffer resides e.g. cpu, cuda, cann """ return self._ortvalue.device_name().lower() - def shape(self): + def shape(self) -> Sequence[int]: """ Returns the shape of the data in the OrtValue """ return self._ortvalue.shape() - def data_type(self): + def data_type(self) -> str: """ - Returns the data type of the data in the OrtValue + Returns the data type of the data in the OrtValue. E.g. 'tensor(int64)' """ return self._ortvalue.data_type() - def element_type(self): + def element_type(self) -> int: """ Returns the proto type of the data in the OrtValue if the OrtValue is a tensor. """ return self._ortvalue.element_type() - def has_value(self): + def has_value(self) -> bool: """ Returns True if the OrtValue corresponding to an optional type contains data, else returns False """ return self._ortvalue.has_value() - def is_tensor(self): + def is_tensor(self) -> bool: """ Returns True if the OrtValue contains a Tensor, else returns False """ return self._ortvalue.is_tensor() - def is_sparse_tensor(self): + def is_sparse_tensor(self) -> bool: """ Returns True if the OrtValue contains a SparseTensor, else returns False """ return self._ortvalue.is_sparse_tensor() - def is_tensor_sequence(self): + def is_tensor_sequence(self) -> bool: """ Returns True if the OrtValue contains a Tensor Sequence, else returns False """ return self._ortvalue.is_tensor_sequence() - def numpy(self): + def numpy(self) -> np.ndarray: """ Returns a Numpy object from the OrtValue. Valid only for OrtValues holding Tensors. Throws for OrtValues holding non-Tensors. @@ -890,7 +895,7 @@ def numpy(self): """ return self._ortvalue.numpy() - def update_inplace(self, np_arr): + def update_inplace(self, np_arr) -> None: """ Update the OrtValue in place with a new Numpy array. The numpy contents are copied over to the device memory backing the OrtValue. 
It can be used @@ -948,7 +953,7 @@ class SparseTensor: depending on the format """ - def __init__(self, sparse_tensor): + def __init__(self, sparse_tensor: C.SparseTensor): """ Internal constructor """ @@ -960,11 +965,17 @@ def __init__(self, sparse_tensor): "`Provided object` needs to be of type `onnxruntime.capi.onnxruntime_pybind11_state.SparseTensor`" ) - def _get_c_tensor(self): + def _get_c_tensor(self) -> C.SparseTensor: return self._tensor - @staticmethod - def sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device): + @classmethod + def sparse_coo_from_numpy( + cls, + dense_shape: npt.NDArray[np.int64], + values: np.ndarray, + coo_indices: npt.NDArray[np.int64], + ort_device: OrtDevice, + ) -> SparseTensor: """ Factory method to construct a SparseTensor in COO format from given arguments @@ -985,12 +996,17 @@ def sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device): For strings and objects, it will create a copy of the arrays in CPU memory as ORT does not support those on other devices and their memory can not be mapped. """ - return SparseTensor( - C.SparseTensor.sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device._get_c_device()) - ) + return cls(C.SparseTensor.sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device._get_c_device())) - @staticmethod - def sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices, ort_device): + @classmethod + def sparse_csr_from_numpy( + cls, + dense_shape: npt.NDArray[np.int64], + values: np.ndarray, + inner_indices: npt.NDArray[np.int64], + outer_indices: npt.NDArray[np.int64], + ort_device: OrtDevice, + ) -> SparseTensor: """ Factory method to construct a SparseTensor in CSR format from given arguments @@ -1011,7 +1027,7 @@ def sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices, ort For strings and objects, it will create a copy of the arrays in CPU memory as ORT does not support those on other devices and their memory can not be mapped. """ - return SparseTensor( + return cls( C.SparseTensor.sparse_csr_from_numpy( dense_shape, values, @@ -1021,7 +1037,7 @@ def sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices, ort ) ) - def values(self): + def values(self) -> np.ndarray: """ The method returns a numpy array that is backed by the native memory if the data type is numeric. Otherwise, the returned numpy array that contains @@ -1093,19 +1109,19 @@ def format(self): """ return self._tensor.format - def dense_shape(self): + def dense_shape(self) -> npt.NDArray[np.int64]: """ Returns a numpy array(int64) containing a dense shape of a sparse tensor """ return self._tensor.dense_shape() - def data_type(self): + def data_type(self) -> str: """ Returns a string data type of the data in the OrtValue """ return self._tensor.data_type() - def device_name(self): + def device_name(self) -> str: """ Returns the name of the device where the SparseTensor data buffers reside e.g. cpu, cuda """ diff --git a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc index 4611dc9082734..e22445edc0f5b 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc +++ b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc @@ -404,7 +404,7 @@ TYPED_TEST(LayerNormTest, LayerNorm17_opset) { // Execution provider entry invalid. // when other EPs support layer-norm fp16, this test should be updated to include them. 
if (std::is_same::value) { -#if !defined(COREML_ENABLE_MLPROGRAM) +#if !defined(USE_COREML) return; #endif } diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index b1779ded4a675..81323cb51a887 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -389,6 +389,7 @@ TEST(MatMulNBits, Float32_Accuracy4) { TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); @@ -458,6 +459,7 @@ TEST(MatMulNBits, Float16_Accuracy4) { TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); @@ -528,8 +530,10 @@ TEST(MatMulNBits, Float16Cuda) { for (auto K : {16, 32, 64, 128, 256, 1024, 93, 1234}) { for (auto block_size : {16, 32, 64, 128}) { for (auto has_gidx : has_gidx_options) { -#ifdef USE_DML +#if defined(USE_DML) RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f); +#elif defined(USE_WEBGPU) + RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.03f); #else RunTest(M, N, K, block_size, 0, false, true, has_gidx); RunTest(M, N, K, block_size, 0, true, true, has_gidx, false); diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index a9aa78b7a3229..3505193b77683 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -246,7 +246,7 @@ TEST(CoreMLExecutionProviderTest, TestOrtFormatModel) { #endif } -#if defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_COREML) // Names in CoreML cannot start with [0-9] or contain anything but "[a-z][A-Z][0-9]_" // Test that we fix invalid names in model inputs, initializers and outputs. // This is only enforced for ML Program, so we only do name sanitization when creating an ML Program format model. 
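Reviewer note (not part of the patch): the test changes in this region rename the COREML_ENABLE_MLPROGRAM guard to USE_COREML, add a WebGPU tolerance branch to the 4-bit MatMul test above, and below extend the bfloat16 Abs/Sign tests to CUDA by building an explicit execution-provider list per enabled backend instead of relying on the default run path. The sketch below shows the shape of that selection pattern using plain strings; the USE_CUDA/USE_COREML macro names and the provider names are the real ones, while the SelectProviders helper and the build line are illustrative rather than ORT's OpTester machinery.

// Standalone sketch of macro-gated provider selection, e.g.
//   g++ -std=c++17 -DUSE_COREML select_providers.cc && ./a.out
#include <iostream>
#include <string>
#include <vector>

std::vector<std::string> SelectProviders() {
  std::vector<std::string> providers;
#if defined(USE_CUDA)
  providers.push_back("CUDAExecutionProvider");
#endif
#if defined(USE_COREML)
  providers.push_back("CoreMLExecutionProvider");
#endif
  // Fall back to the CPU provider when no accelerator backend was compiled in.
  if (providers.empty()) {
    providers.push_back("CPUExecutionProvider");
  }
  return providers;
}

int main() {
  for (const auto& provider : SelectProviders()) {
    std::cout << provider << "\n";
  }
  return 0;
}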
diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc index 724118d7419d2..9201da348e75c 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc @@ -125,7 +125,7 @@ TEST_F(ActivationOpTest, Relu) { {}, {}, /*is_tensorrt_supported=*/false, /*opset_version= */ 14); -#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) TestActivationOp( "Relu", input_values_fp16, @@ -139,7 +139,7 @@ TEST_F(ActivationOpTest, Relu) { #endif // MLAS_F16VEC_INTRINSICS_SUPPORTED } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) TEST_F(ActivationOpTest, Sigmoid_fp16) { #ifdef USE_CUDA int min_cuda_architecture = 530; @@ -413,7 +413,7 @@ TEST_F(ActivationOpTest, LeakyRelu) { {{"alpha", alpha}}, {}); } -#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) TEST_F(ActivationOpTest, LeakyRelu_fp16) { OpTester test("LeakyRelu", 11); float alpha = 0.01f; // oneDNN set alpha equal to 0.01 diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.h b/onnxruntime/test/providers/cpu/activation/activation_op_test.h index 59813f433dc41..04d116e29d3b0 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.h +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.h @@ -105,7 +105,7 @@ class ActivationOpTest : public ::testing::Test { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution dist(low, high); -#ifdef COREML_ENABLE_MLPROGRAM +#ifdef USE_COREML // please check onnxruntime/onnxruntime/core/providers/coreml/builders/helper.cc:81 std::vector batch_size_list = {1, 2, 4, 9, 100}; #else diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index d87ee861752c7..fbd9d10a56c77 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -32,7 +32,7 @@ void TestBinaryFloat16(const char* op_name, bool enable_bf16 = true) { { std::vector> execution_providers; -#ifdef COREML_ENABLE_MLPROGRAM +#ifdef USE_COREML execution_providers.push_back(DefaultCoreMLExecutionProvider(true)); #elif USE_CUDA execution_providers.push_back(DefaultCudaExecutionProvider()); @@ -76,7 +76,7 @@ void TestUnaryFloat16(const char* op_name, bool run_bf16 = true) { { std::vector> execution_providers; -#ifdef COREML_ENABLE_MLPROGRAM +#ifdef USE_COREML execution_providers.push_back(DefaultCoreMLExecutionProvider(true)); #elif USE_CUDA execution_providers.push_back(DefaultCudaExecutionProvider()); @@ -968,8 +968,15 @@ TEST(MathOpTest, Abs) { test.Run(); } -#ifdef USE_DNNL +#if defined(USE_CUDA) || defined(USE_DNNL) TEST(MathOpTest, Abs_bfloat16) { +#ifdef USE_CUDA + int min_cuda_architecture = 530; + if (!HasCudaEnvironment(min_cuda_architecture)) { + LOGS_DEFAULT(WARNING) << "Hardware does NOT support BF16"; + return; + } +#endif #ifdef USE_DNNL if (!DnnlHasBF16Support()) { LOGS_DEFAULT(WARNING) << "Hardware does NOT support BF16"; @@ -980,9 +987,19 @@ TEST(MathOpTest, Abs_bfloat16) { std::vector dims{2, 2}; test_bf16.AddInput("X", dims, MakeBFloat16({1.0f, -2.0f, 
-0.0f, -10.0f})); test_bf16.AddOutput("Y", dims, MakeBFloat16({1.0f, 2.0f, 0.0f, 10.0f})); - test_bf16.Run(); + + std::vector> execution_providers; +#if defined(USE_CUDA) + execution_providers.push_back(DefaultCudaExecutionProvider()); +#endif + +#if defined(USE_DNNL) + execution_providers.push_back(DefaultDnnlExecutionProvider()); +#endif + + test_bf16.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } -#endif // USE_DNNL +#endif // USE_CUDA || USE_DNNL TEST(MathOpTest, Abs_int8) { OpTester test("Abs"); @@ -1409,7 +1426,7 @@ TEST(MathOpTest, Pow_float16_float16) { dims, {1.0f, 256.0f, 2.0f, 1.0f}, false); } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) TEST(MathOpTest, Pow_float_float16) { OpTester test("Pow", 12); std::vector dims{4}; @@ -1423,7 +1440,7 @@ TEST(MathOpTest, Pow_float_float16) { execution_providers.push_back(DefaultCudaExecutionProvider()); #elif USE_ROCM execution_providers.push_back(DefaultRocmExecutionProvider()); -#elif COREML_ENABLE_MLPROGRAM +#elif USE_COREML execution_providers.push_back(DefaultCoreMLExecutionProvider(true)); #endif test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); diff --git a/onnxruntime/test/providers/cpu/math/matmul_test.cc b/onnxruntime/test/providers/cpu/math/matmul_test.cc index 298e870f348fc..dd8cbed15e5ef 100644 --- a/onnxruntime/test/providers/cpu/math/matmul_test.cc +++ b/onnxruntime/test/providers/cpu/math/matmul_test.cc @@ -210,7 +210,7 @@ TEST(MathOpTest, MatMulFloatType) { RunMatMulTest(7, false, true); } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) || defined(USE_XNNPACK) TEST(MathOpTest, MatMulFloat16) { #ifdef USE_CUDA int min_cuda_architecture = 530; @@ -276,7 +276,7 @@ TEST(MathOpTest, MatMulZeroKInt32Type) { RunMatMulZeroKTest(); } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) || defined(USE_XNNPACK) TEST(MathOpTest, MatMul_Float16) { #ifdef USE_CUDA int min_cuda_architecture = 530; diff --git a/onnxruntime/test/providers/cpu/math/sign_test.cc b/onnxruntime/test/providers/cpu/math/sign_test.cc index a01c2b26ea8b5..0da6a2ed55f2c 100644 --- a/onnxruntime/test/providers/cpu/math/sign_test.cc +++ b/onnxruntime/test/providers/cpu/math/sign_test.cc @@ -207,7 +207,7 @@ TEST(MathOpTest, Sign_MLFloat16) { // test.Run(OpTester::ExpectResult::kExpectSuccess); //} -#if defined(USE_DNNL) +#if defined(USE_CUDA) || defined(USE_DNNL) TEST(MathOpTest, Sign_bfloat16) { #ifdef USE_DNNL if (!DnnlHasBF16Support()) { @@ -228,9 +228,15 @@ TEST(MathOpTest, Sign_bfloat16) { TestImpl(input.cbegin(), input.cend(), std::back_inserter(output)); test.AddOutput("output", input_dims, output); std::vector> execution_providers; + +#if defined(USE_CUDA) + execution_providers.push_back(DefaultCudaExecutionProvider()); +#endif + #if defined(USE_DNNL) execution_providers.push_back(DefaultDnnlExecutionProvider()); -#endif // USE_DNNL +#endif + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } #endif diff --git a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc index f8ebca5ff9a1b..a529d572d7cca 100644 --- 
a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc @@ -704,7 +704,7 @@ TEST(BatchNormTest, NonSpatial_Complicated) { } // Only CUDA and ROCm kernels have float 16 support -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) TEST(BatchNormTest, BatchNorm2d_fp16) { vector X{-0.91221f, -0.283559f, 0.937637f, 2.09818f, -0.100199f, -0.608113f, 0.444562f, -1.07505f, 0.940591f, -0.922262f, 0.0931303f, 0.69611f, 1.55187f, 0.159808f, 0.914874f, -1.24856f, -1.98928f, -0.331621f, diff --git a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc index 4253e36e02548..d1350db8ec12e 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc @@ -3,7 +3,7 @@ #include "core/mlas/inc/mlas.h" -#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK) +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) || defined(USE_XNNPACK) #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" @@ -30,7 +30,7 @@ struct ConvOpAndTestAttributes { /* Please notice that, we have predefined macros in the head of the file -#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) When we have these two macro defines, this UT will turn into green light and work. If attributes.activation is set the NhwcFusedConv contrib op is used. diff --git a/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc index ac517193a2c77..3d8d188867023 100644 --- a/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc @@ -6,7 +6,7 @@ #include "test/common/tensor_op_test_utils.h" #include "test/util/include/default_providers.h" -#ifdef COREML_ENABLE_MLPROGRAM +#ifdef USE_COREML using namespace std; namespace onnxruntime { namespace test { diff --git a/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc index 341bb8a4fc957..46b74f2c2eb9d 100644 --- a/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc @@ -121,7 +121,7 @@ TEST(InstanceNormalizationOpTest, InstanceNormBatch2) { } // Only CUDA and ROCm kernels have float 16 support -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) TEST(InstanceNormalizationOpTest, InstanceNormBatch1_fp16) { OpTester test("InstanceNormalization"); diff --git a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc index d4e0af5011525..c14fc1fb62ae5 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc @@ -3,7 +3,7 @@ #include "core/mlas/inc/mlas.h" -#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK) +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) || defined(USE_XNNPACK) #include "core/providers/cpu/nn/pool.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc 
b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc index 24a8c8491b632..f1d612276174f 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc @@ -70,7 +70,7 @@ TEST(PoolTest, MaxPool) { // Only CUDA kernel has float 16 support // Disable for now, still investigating the issue with cudnn lib -#if defined(USE_CUDA) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) || defined(USE_COREML) TEST(PoolTest, MaxPool_F16) { #if defined(USE_CUDA) int min_cuda_architecture = 530; diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index 4bc97d035c7f7..92cd82c2c9420 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -1375,7 +1375,7 @@ TEST(ReductionOpTest, ReduceMax_double) { test.Run(); } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) TEST(ReductionOpTest, ReduceMax_half) { OpTester test("ReduceMax"); test.AddAttribute("axes", std::vector{1, 2}); @@ -2158,7 +2158,7 @@ TEST(ReductionOpTest, ReduceMin_double) { test.Run(); } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) TEST(ReductionOpTest, ReduceMin_half) { OpTester test("ReduceMin"); test.AddAttribute("axes", std::vector{0, 2}); @@ -2356,7 +2356,7 @@ TEST(ReductionOpTest, ReduceSum_int32) { test.Run(); } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) TEST(ReductionOpTest, ReduceSumHalfHalf) { OpTester test("ReduceSum"); test.AddAttribute("keepdims", (int64_t)0); diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index e261d66a0d22a..d62ffe644e4cc 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -694,7 +694,9 @@ "^test_gelu_tanh_2_expanded_cpu", "^test_dynamicquantizelinear_expanded_cpu", "^test_center_crop_pad_crop_negative_axes_hwc*", // failed due to new types or shape infer with negative axis for CenterCropPad. - "^test_center_crop_pad_crop_negative_axes_hwc_expanded*" // failed due to new types or shape infer with negative axis for CenterCropPad. + "^test_center_crop_pad_crop_negative_axes_hwc_expanded*", // failed due to new types or shape infer with negative axis for CenterCropPad. + "^test_reduce_max_empty_set", + "^test_reduce_min_empty_set" ], "current_failing_tests_pure_DML": [ "^test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_cpu", diff --git a/onnxruntime/wasm/pre-jsep.js b/onnxruntime/wasm/pre-jsep.js index a11c6d741d110..cca8da0525fbe 100644 --- a/onnxruntime/wasm/pre-jsep.js +++ b/onnxruntime/wasm/pre-jsep.js @@ -97,41 +97,40 @@ Module["jsepInit"] = (name, params) => { // Functions called via emscripten::val::module_property need to be assigned by name so that the minifier doesn't // change the name. 
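Reviewer note (not part of the patch): the comment just above states the contract the C++ renames earlier in this diff rely on: anything fetched with emscripten::val::module_property("...") must be attached to Module under a quoted string key so a minifier cannot rename it, which is why the hunk below assigns Module["webnnEnsureTensor"] and friends explicitly. A minimal sketch of the C++ side, assuming an Emscripten build where pre-jsep.js has made that assignment; the wrapper function is illustrative, not ORT's code.

// Sketch only (emcc build assumed). Looks up a JS function attached to Module
// under a quoted, minifier-safe name; "webnnEnsureTensor" matches the export
// assigned in pre-jsep.js in this diff.
#include <cstdint>
#include <emscripten/val.h>

emscripten::val EnsureTensorFor(void* data, int data_type, emscripten::val shape) {
  // module_property() is a string lookup on Module, so the JS side must have
  // assigned Module["webnnEnsureTensor"] = ... under the quoted name.
  auto ensure = emscripten::val::module_property("webnnEnsureTensor");
  // The call returns a promise; the provider either awaits it (WebNNMemcpy) or
  // collects it into Promise.all (Model::Dispatch), as in the hunks above.
  return ensure(emscripten::val::undefined(),
                reinterpret_cast<std::intptr_t>(data),
                data_type, shape,
                false);  // trailing bool mirrors the value passed for outputs above
}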
+ const backend = params[0]; [ - Module.jsepBackend, - Module.jsepReserveTensorId, - Module.jsepReleaseTensorId, - Module["jsepEnsureTensor"], - Module.jsepUploadTensor, - Module["jsepDownloadTensor"], - ] = params; + Module.webnnReserveTensorId, + Module.webnnReleaseTensorId, + Module["webnnEnsureTensor"], + Module.webnnUploadTensor, + Module["webnnDownloadTensor"], + ] = params.slice(1); // This function is called from both JS and an EM_ASM block, it needs both a minifiable name and an explicit name. - Module["jsepReleaseTensorId"] = Module.jsepReleaseTensorId; - Module["jsepUploadTensor"] = Module.jsepUploadTensor; + Module["webnnReleaseTensorId"] = Module.webnnReleaseTensorId; + Module["webnnUploadTensor"] = Module.webnnUploadTensor; // Functions called from JS also need to have explicit names. - const backend = Module.jsepBackend; - Module["jsepOnRunStart"] = (sessionId) => { + Module["webnnOnRunStart"] = (sessionId) => { return backend["onRunStart"](sessionId); }; - Module["jsepOnRunEnd"] = backend["onRunEnd"].bind(backend); - Module["jsepRegisterMLContext"] = (sessionId, mlContext) => { + Module["webnnOnRunEnd"] = backend["onRunEnd"].bind(backend); + Module["webnnRegisterMLContext"] = (sessionId, mlContext) => { backend["registerMLContext"](sessionId, mlContext); }; - Module["jsepOnReleaseSession"] = (sessionId) => { + Module["webnnOnReleaseSession"] = (sessionId) => { backend["onReleaseSession"](sessionId); }; - Module["jsepCreateMLTensorDownloader"] = (tensorId, type) => { + Module["webnnCreateMLTensorDownloader"] = (tensorId, type) => { return backend["createMLTensorDownloader"](tensorId, type); }; - Module["jsepRegisterMLTensor"] = (sessionId, tensor, dataType, shape) => { + Module["webnnRegisterMLTensor"] = (sessionId, tensor, dataType, shape) => { return backend["registerMLTensor"](sessionId, tensor, dataType, shape); }; - Module["jsepCreateMLContext"] = (optionsOrGpuDevice) => { + Module["webnnCreateMLContext"] = (optionsOrGpuDevice) => { return backend["createMLContext"](optionsOrGpuDevice); }; - Module["jsepRegisterMLConstant"] = ( + Module["webnnRegisterMLConstant"] = ( externalFilePath, dataOffset, dataLength, @@ -149,9 +148,12 @@ Module["jsepInit"] = (name, params) => { shouldConvertInt64ToInt32, ); }; - Module['jsepRegisterGraphInput'] = backend['registerGraphInput'].bind(backend); - Module['jsepIsGraphInput'] = backend['isGraphInput'].bind(backend); - Module['jsepCreateTemporaryTensor'] = backend['createTemporaryTensor'].bind(backend); - Module['jsepIsInt64Supported'] = backend['isInt64Supported'].bind(backend); + Module["webnnRegisterGraphInput"] = + backend["registerGraphInput"].bind(backend); + Module["webnnIsGraphInput"] = backend["isGraphInput"].bind(backend); + + Module["webnnCreateTemporaryTensor"] = + backend["createTemporaryTensor"].bind(backend); + Module["webnnIsInt64Supported"] = backend["isInt64Supported"].bind(backend); } }; diff --git a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml deleted file mode 100644 index 3cceadd1b8ef5..0000000000000 --- a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml +++ /dev/null @@ -1,241 +0,0 @@ -##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### -### please do rerun set-trigger-rules.py ### -trigger: - branches: - include: - - main - - rel-* - paths: - exclude: - - docs/** - - README.md - - CONTRIBUTING.md - - BUILD.md - - 'js/web' - - 
'onnxruntime/core/providers/js' -pr: - branches: - include: - - main - - rel-* - paths: - exclude: - - docs/** - - README.md - - CONTRIBUTING.md - - BUILD.md - - 'js/web' - - 'onnxruntime/core/providers/js' -#### end trigger #### - -# Known Limits -# 1. Anchors are not supported in GHA -# https://github.community/t/support-for-yaml-anchors/16128/90 -# 2. today most cloud-based CI services are still lacking hardware acceleration support from the host VM, -# which is the no.1 blocker for running tests on modern Android Emulators (especially on recent API levels) on CI. - -# It'd better to check out https://github.com/microsoft/onnxruntime/wiki/Leverage-Existing-Artifacts -# to save debugging time. -parameters: -- name: specificArtifact - displayName: Use Specific Artifact - type: boolean - default: false -- name: runId - displayName: Specific Artifact's RunId - type: number - default: 0 - -stages: -# Separate stage for building CPU vs NNAPI as we only want CodeQL to run on one of them so we don't get duplicate -# issues for code that is built in both. We pick NNAPI as that includes the NNAPI EP code. -- stage: BUILD_AND_TEST_CPU - dependsOn: [] - variables: - Codeql.Enabled: false - ANDROID_AVD_HOME: $(Agent.TempDirectory) - jobs: - - job: BUILD_AND_TEST_CPU - pool: onnxruntime-Ubuntu2204-AMD-CPU - workspace: - clean: all - timeoutInMinutes: 30 - steps: - - task: JavaToolInstaller@0 - displayName: Use jdk 17 - inputs: - versionSpec: '17' - jdkArchitectureOption: 'x64' - jdkSourceOption: 'PreInstalled' - - - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build - displayName: Install coreutils and ninja - - - template: templates/use-android-ndk.yml - - template: templates/use-android-emulator.yml - parameters: - create: true - start: true - - script: | - env | grep ANDROID - displayName: View Android ENVs - - script: | - python3 tools/ci_build/build.py \ - --enable_lto \ - --android \ - --build_dir build \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=30 \ - --skip_submodule_sync \ - --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache \ - --cmake_generator=Ninja \ - --build_java - displayName: CPU EP, Build and Test - - template: templates/use-android-emulator.yml - parameters: - stop: true - - - template: templates/clean-agent-build-directory-step.yml - -- stage: BUILD_AND_TEST_NNAPI_EP - dependsOn: [] - condition: notIn(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') - variables: - ANDROID_AVD_HOME: $(Agent.TempDirectory) - Codeql.ProjectConfigPath: .github/workflows - Codeql.Enabled: true - Codeql.Language: cpp - ${{ if variables['Codeql.Enabled'] }}: - JobsTimeout: 120 - ${{ else }}: - JobsTimeout: 60 - jobs: - - job: BUILD_AND_TEST_NNAPI_EP - pool: onnxruntime-Ubuntu2204-AMD-CPU - timeoutInMinutes: ${{ variables.JobsTimeout }} - workspace: - clean: all - steps: - - task: JavaToolInstaller@0 - displayName: Use jdk 17 - inputs: - versionSpec: '17' - jdkArchitectureOption: 'x64' - jdkSourceOption: 'PreInstalled' - - - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build - displayName: Install coreutils and ninja - - template: templates/use-android-emulator.yml - parameters: - create: true - start: true - - - script: | - env | grep ANDROID - displayName: View Android ENVs - - - script: | - python3 tools/ci_build/build.py \ - --enable_lto \ - --android \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path 
$ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=29 \ - --skip_submodule_sync \ - --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache \ - --use_nnapi \ - --build_shared_lib \ - --cmake_generator=Ninja \ - --build_java - displayName: NNAPI EP, Build, Test on Android Emulator - - - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) - # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator - displayName: Build Minimal ORT with NNAPI and run tests - - - template: templates/use-android-emulator.yml - parameters: - stop: true - - - template: templates/clean-agent-build-directory-step.yml - -- stage: MAIN_BUILD_STAGE - # The below jobs only run on build of main branch. - # because coverage report is hard to support in cross machines. - displayName: NNAPI MAIN BUILD&TEST - dependsOn: [] - condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') - variables: - ANDROID_AVD_HOME: $(Agent.TempDirectory) - jobs: - - job: NNAPI_EP_MASTER - pool: onnxruntime-Ubuntu2204-AMD-CPU - timeoutInMinutes: 180 - workspace: - clean: all - condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') - steps: - - task: JavaToolInstaller@0 - displayName: Use jdk 17 - inputs: - versionSpec: '17' - jdkArchitectureOption: 'x64' - jdkSourceOption: 'PreInstalled' - - - template: templates/use-android-ndk.yml - - - template: templates/use-android-emulator.yml - parameters: - create: true - start: true - - - script: | - python3 tools/ci_build/build.py \ - --enable_lto \ - --android \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=29 \ - --skip_submodule_sync \ - --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache \ - --use_nnapi \ - --build_shared_lib \ - --cmake_generator=Ninja \ - --build_java \ - --code_coverage - displayName: NNAPI EP, Build, Test, CodeCoverage on Android Emulator - - # We need to use llvm-cov from the NDK. 
- - script: | - export GCOV="$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-cov gcov" - python3 -m pip install gcovr - python3 tools/ci_build/coverage.py --build_dir build_nnapi --android_sdk_path $ANDROID_HOME - displayName: Retrieve runtime code coverage files from the emulator and analyze - - - script: cat '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt' - displayName: Print coverage report - - # - task: AzureCLI@2 - # displayName: 'Post Android Code Coverage To DashBoard' - # inputs: - # azureSubscription: AIInfraBuild - # scriptType: bash - # scriptPath: $(Build.SourcesDirectory)/tools/ci_build/github/linux/upload_code_coverage_data.sh - # arguments: '"$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt" "https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=$(Build.BuildId)" arm android nnapi' - # workingDirectory: '$(Build.BinariesDirectory)' - - - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) - # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator - displayName: Build Minimal ORT with NNAPI and run tests - - - template: templates/use-android-emulator.yml - parameters: - stop: true - - - template: templates/clean-agent-build-directory-step.yml diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml index 23c968f35a27f..70d8e954808a5 100644 --- a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml @@ -56,7 +56,7 @@ extends: # Update the pool with your team's 1ES hosted pool. pool: name: "Azure Pipelines" - image: "macOS-13" + image: "macOS-14" os: macOS sdl: sourceAnalysisPool: diff --git a/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml b/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml index 4518a168879a2..a2a0e3bcace9f 100644 --- a/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml +++ b/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml @@ -11,7 +11,7 @@ stages: clean: all timeoutInMinutes: 120 pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' variables: - name: OnnxRuntimeBuildDirectory diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml index 07d21333270a8..a6e38d0bc93f3 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml @@ -11,7 +11,7 @@ stages: workspace: clean: all pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' variables: - name: OnnxRuntimeBuildDirectory diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index 6b421184c490e..78c07c28d6f4e 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -427,7 +427,7 @@ stages: - job: IosDynamicFramework timeoutInMinutes: 120 pool: - vmImage: "macOS-13" + vmImage: "macOS-14" steps: - task: UsePythonVersion@0 @@ -463,7 +463,7 @@ stages: - job: IosMinimalTrainingBuild timeoutInMinutes: 120 pool: - vmImage: "macOS-13" + vmImage: "macOS-14" steps: - task: UsePythonVersion@0 diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml 
b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml index 7a78c6ba0fcdf..01c1366107292 100644 --- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml @@ -29,7 +29,7 @@ stages: parameters: job_name: Test_MAC_Wheels machine_pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' itemPattern: '*/*mac*x86_64.whl' arch: 'x86_64' - template: templates/py-package-smoking-test.yml diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml index 5e783607e3622..d64ee07aa2131 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml @@ -201,7 +201,7 @@ stages: clean: all pool: name: "Azure Pipelines" - image: "macOS-13" + image: "macOS-14" os: macOS variables: MACOSX_DEPLOYMENT_TARGET: '13.3' diff --git a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml index 3886ceb1ed58f..366ee3fcf4e92 100644 --- a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml +++ b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml @@ -24,6 +24,8 @@ jobs: pool: 'onnxruntime-Ubuntu2204-AMD-CPU' workspace: clean: all + pool: + vmImage: 'macOS-14' variables: runCodesignValidationInjection: false ANDROID_AVD_HOME: $(Agent.TempDirectory) diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index c4559d4e0b918..7a46bdc7cde0a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -117,7 +117,7 @@ stages: workspace: clean: all pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' timeoutInMinutes: 300 steps: - template: set-version-number-variables-step.yml @@ -788,7 +788,7 @@ stages: - template: ../nuget/templates/test_macos.yml parameters: - AgentPool : macOS-13 + AgentPool : macOS-14 ArtifactSuffix: 'CPU' - template: ../nodejs/templates/test_win.yml @@ -824,4 +824,4 @@ stages: OS: MacOS BuildId: ${{ parameters.BuildId }} SpecificArtifact: ${{ parameters.SpecificArtifact }} - PoolName: 'macOS-13' + PoolName: 'macOS-14' diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml index ab31e592d7d71..8c725c1d6b9a0 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml @@ -68,7 +68,7 @@ stages: jobs: - job: MacOS_C_API_Package_Publish pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' steps: - checkout: none - template: flex-downloadPipelineArtifact.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml index 32908753f2909..c7ae7bb3a0026 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml @@ -37,7 +37,7 @@ jobs: PROTO_CACHE_DIR: $(Pipeline.Workspace)/ccache_proto ORT_CACHE_DIR: $(Pipeline.Workspace)/ccache_ort pool: - vmImage: 'macOS-13' + vmImage: 
'macOS-14' timeoutInMinutes: 300 steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml index 58ebdd52998ea..c1309d345d819 100644 --- a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml @@ -64,11 +64,11 @@ stages: - job: ReactNative_CI_iOS ${{ if eq(parameters.is1ES, false) }}: pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' ${{ if eq(parameters.is1ES, true) }}: pool: name: 'Azure Pipelines' - image: 'macOS-13' + image: 'macOS-14' os: 'macOS' timeoutInMinutes: 120 @@ -212,7 +212,7 @@ stages: scheme: 'OnnxruntimeModuleTest' packageApp: false destinationPlatformOption: 'iOS' - destinationSimulators: 'iPhone 14,OS=16.4' + destinationSimulators: 'iPhone 15,OS=17.4' workingDirectory: '$(Build.SourcesDirectory)/js/react_native/ios' xcprettyArgs: '--output build/reports/test-results.xml' publishJUnitResults: true diff --git a/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml b/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml index 2cf698aefa8bd..3c1bfcd60fedd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml +++ b/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml @@ -3,7 +3,7 @@ parameters: - name: xcodeVersion type: string - default: "14.3.1" + default: "15.3.0" steps: - bash: | diff --git a/tools/python/util/android/android.py b/tools/python/util/android/android.py index 8f3ed97cae53f..cd420ca1483c7 100644 --- a/tools/python/util/android/android.py +++ b/tools/python/util/android/android.py @@ -46,18 +46,36 @@ def filename(name, windows_extension): def create_virtual_device(sdk_tool_paths: SdkToolPaths, system_image_package_name: str, avd_name: str): run(sdk_tool_paths.sdkmanager, "--install", system_image_package_name, input=b"y") - - run( - sdk_tool_paths.avdmanager, - "create", - "avd", - "--name", - avd_name, - "--package", - system_image_package_name, - "--force", - input=b"no", - ) + android_avd_home = os.environ.get("ANDROID_AVD_HOME") + + if android_avd_home is not None: + if not os.path.exists(android_avd_home): + os.makedirs(android_avd_home) + run( + sdk_tool_paths.avdmanager, + "create", + "avd", + "--name", + avd_name, + "--package", + system_image_package_name, + "--force", + "--path", + android_avd_home, + input=b"no", + ) + else: + run( + sdk_tool_paths.avdmanager, + "create", + "avd", + "--name", + avd_name, + "--package", + system_image_package_name, + "--force", + input=b"no", + ) _process_creationflags = subprocess.CREATE_NEW_PROCESS_GROUP if is_windows() else 0
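
Note on the new create_virtual_device behavior: when ANDROID_AVD_HOME is set to a per-job temporary directory, the AVD is created with an explicit --path under that directory instead of the default ~/.android/avd, so emulator state stays inside the CI workspace and is removed with it. The following is a minimal standalone sketch of the equivalent avdmanager invocation, not code taken from this change: the SDK root is read from ANDROID_SDK_ROOT, the AVD name ort_ci_avd is a placeholder, and the system image package is assumed to already be installed via sdkmanager.

    # Sketch only: mirrors the avdmanager call made by the ANDROID_AVD_HOME-aware code path above.
    import os
    import subprocess

    sdk_root = os.environ["ANDROID_SDK_ROOT"]
    avdmanager = os.path.join(sdk_root, "cmdline-tools", "latest", "bin", "avdmanager")

    avd_home = os.environ.get("ANDROID_AVD_HOME")
    args = [
        avdmanager,
        "create", "avd",
        "--name", "ort_ci_avd",  # placeholder AVD name
        "--package", "system-images;android-31;default;x86_64",  # assumed to be installed already
        "--force",
    ]
    if avd_home:
        os.makedirs(avd_home, exist_ok=True)
        args += ["--path", avd_home]  # keep the AVD under the CI-controlled directory

    subprocess.run(args, input=b"no", check=True)  # "no" declines the custom hardware profile prompt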