diff --git a/.github/actions/setup-android-ndk/action.yml b/.github/actions/setup-android-ndk/action.yml
new file mode 100644
index 0000000000000..fea9745396e81
--- /dev/null
+++ b/.github/actions/setup-android-ndk/action.yml
@@ -0,0 +1,98 @@
+# .github/actions/setup-android-ndk/action.yml
+name: 'Setup Android NDK'
+description: 'Installs and configures a specific version of the Android NDK'
+inputs:
+ ndk-version:
+ description: 'The version of the Android NDK to install (e.g., 27.2.12479018)'
+ required: true
+ default: '27.2.12479018'
+ android-sdk-root:
+ description: 'The root directory of the Android SDK'
+ required: true
+ default: '/usr/local/lib/android/sdk'
+
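+# Example usage from a workflow in this repo (matches .github/workflows/android.yml):
+#   - name: Setup Android NDK
+#     uses: ./.github/actions/setup-android-ndk
+#     with:
+#       ndk-version: 27.2.12479018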
+runs:
+ using: "composite" # Use a composite action for multiple shell commands
+ steps:
+ - name: Install coreutils and ninja
+ shell: bash
+ run: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build
+
+ - name: Install Android NDK
+ shell: bash
+ run: |
+ set -e
+ "${{ inputs.android-sdk-root }}/cmdline-tools/latest/bin/sdkmanager" --install "ndk;${{ inputs.ndk-version }}"
+
+ NDK_PATH="${{ inputs.android-sdk-root }}/ndk/${{ inputs.ndk-version }}"
+ if [[ ! -d "${NDK_PATH}" ]]; then
+ echo "NDK directory is not in expected location: ${NDK_PATH}"
+ exit 1
+ fi
+
+ # Export the NDK location via GITHUB_ENV so later steps can use it
+ echo "ANDROID_NDK_HOME=${NDK_PATH}" >> $GITHUB_ENV
+ echo "ANDROID_NDK_ROOT=${NDK_PATH}" >> $GITHUB_ENV
+ echo "ANDROID_NDK_HOME: ${NDK_PATH}"
+ echo "ANDROID_NDK_ROOT: ${NDK_PATH}"
+
+ - name: Check if emulator is installed and add to PATH
+ shell: bash
+ run: |
+ if [[ ":$PATH:" == *":${ANDROID_SDK_ROOT}/emulator:"* ]]; then
+ echo "${ANDROID_SDK_ROOT}/emulator is in PATH"
+ else
+ ${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/sdkmanager --install "emulator"
+ echo "${ANDROID_SDK_ROOT}/emulator" >> $GITHUB_PATH
+ fi
+
+ - name: Check if platform tools are installed and add to PATH
+ shell: bash
+ run: |
+ if [[ ":$PATH:" == *":${ANDROID_SDK_ROOT}/platform-tools:"* ]]; then
+ echo "${ANDROID_SDK_ROOT}/platform-tools is in PATH"
+ else
+ ${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/sdkmanager --install "platform-tools"
+ echo "${ANDROID_SDK_ROOT}/platform-tools" >> $GITHUB_PATH
+ fi
+ ls -R "${ANDROID_SDK_ROOT}/platform-tools"
+
+ - name: Create Android Emulator
+ shell: bash
+ env:
+ ANDROID_AVD_HOME: ${{ runner.temp }}/android-avd
+ run: |
+ python3 tools/python/run_android_emulator.py \
+ --android-sdk-root "${ANDROID_SDK_ROOT}" \
+ --create-avd --system-image "system-images;android-31;default;x86_64"
+
+ - name: List Android AVDs
+ shell: bash
+ env:
+ ANDROID_AVD_HOME: ${{ runner.temp }}/android-avd
+ run: |
+ "${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/avdmanager" list avd
+
+ - name: Check emulator.pid does not exist
+ shell: bash
+ run: |
+ if test -f ./emulator.pid; then
+ echo "Emulator PID file was not expected to exist but does and has pid: `cat ./emulator.pid`"
+ exit 1
+ fi
+
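+ # run_android_emulator.py writes the emulator's PID to ./emulator.pid; the calling
+ # workflow's "Stop Android Emulator" step reads that file to shut the emulator down.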
+ - name: Start Android Emulator
+ shell: bash
+ env:
+ ANDROID_AVD_HOME: ${{ runner.temp }}/android-avd
+ run: |
+ set -e -x
+ python3 tools/python/run_android_emulator.py \
+ --android-sdk-root "${ANDROID_SDK_ROOT}" \
+ --start --emulator-extra-args="-partition-size 2047" \
+ --emulator-pid-file ./emulator.pid
+ echo "Emulator PID: `cat ./emulator.pid`"
+
+ - name: View Android ENVs
+ shell: bash
+ run: env | grep ANDROID
\ No newline at end of file
diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml
new file mode 100644
index 0000000000000..64c40946c49c5
--- /dev/null
+++ b/.github/workflows/android.yml
@@ -0,0 +1,147 @@
+name: Android CI
+# This workflow builds and tests on the Android Emulator on Linux
+
+on:
+ push:
+ branches:
+ - main
+ - rel-*
+ pull_request:
+ branches:
+ - main
+ - rel-*
+ workflow_dispatch:
+
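+# Cancel in-progress runs that are superseded by a newer run on the same ref,
+# except on main; manually dispatched runs are grouped separately.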
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' }}
+ cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
+jobs:
+ android_nnapi_ep:
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Use JDK 17
+ uses: actions/setup-java@v4
+ with:
+ distribution: 'temurin'
+ java-version: '17'
+ architecture: x64
+
+ - name: Setup Android NDK
+ uses: ./.github/actions/setup-android-ndk
+ with:
+ ndk-version: 27.2.12479018
+
+ - name: Export GitHub Actions cache environment variables
+ uses: actions/github-script@v7
+ with:
+ script: |
+ core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
+ core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
+
+ - name: NNAPI EP, Build, Test on Android Emulator
+ run: >-
+ python3 tools/ci_build/build.py
+ --enable_lto
+ --android
+ --build_dir build_nnapi
+ --android_sdk_path "$ANDROID_HOME"
+ --android_ndk_path "$ANDROID_NDK_HOME"
+ --android_abi=x86_64
+ --android_api=29
+ --skip_submodule_sync
+ --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache
+ --use_nnapi
+ --build_shared_lib
+ --cmake_generator=Ninja
+ --build_java
+ shell: bash
+
+
+ - name: Build Minimal ORT with NNAPI and run tests
+ run: tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh "$(pwd)"
+ shell: bash
+
+ - name: Install psutil for emulator shutdown by run_android_emulator.py
+ if: always()
+ run: python3 -m pip install psutil
+ shell: bash
+
+ - name: Stop Android Emulator
+ if: always()
+ run: |
+ env | grep ANDROID
+ if test -f ${{ github.workspace }}/emulator.pid; then
+ echo "Emulator PID:"`cat ${{ github.workspace }}/emulator.pid`
+ python3 tools/python/run_android_emulator.py \
+ --android-sdk-root "${ANDROID_SDK_ROOT}" \
+ --stop \
+ --emulator-pid-file ${{ github.workspace }}/emulator.pid
+ rm ${{ github.workspace }}/emulator.pid
+ else
+ echo "Emulator PID file was expected to exist but does not."
+ fi
+ shell: bash
+
+ android_cpu_ep:
+ name: Android CI Pipeline
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Use JDK 17
+ uses: actions/setup-java@v4
+ with:
+ distribution: 'temurin'
+ java-version: '17'
+ architecture: x64
+
+ - name: Setup Android NDK
+ uses: ./.github/actions/setup-android-ndk
+ with:
+ ndk-version: 27.2.12479018
+
+ - name: Export GitHub Actions cache environment variables
+ uses: actions/github-script@v7
+ with:
+ script: |
+ core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
+ core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
+
+ - name: CPU EP, Build and Test
+ run: >-
+ python3 tools/ci_build/build.py
+ --enable_lto
+ --android
+ --build_dir build
+ --android_sdk_path $ANDROID_HOME
+ --android_ndk_path $ANDROID_NDK_HOME
+ --android_abi=x86_64
+ --android_api=30
+ --skip_submodule_sync
+ --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache
+ --cmake_generator=Ninja
+ --build_java
+ shell: bash
+
+ - name: Install psutil for emulator shutdown by run_android_emulator.py
+ if: always()
+ run: python3 -m pip install psutil
+ shell: bash
+
+ - name: Stop Android Emulator
+ if: always()
+ run: |
+ if test -f ${{ github.workspace }}/emulator.pid; then
+ echo "Emulator PID:"`cat ${{ github.workspace }}/emulator.pid`
+ python3 tools/python/run_android_emulator.py \
+ --android-sdk-root "${ANDROID_SDK_ROOT}" \
+ --stop \
+ --emulator-pid-file ${{ github.workspace }}/emulator.pid
+ rm ${{ github.workspace }}/emulator.pid
+ else
+ echo "Emulator PID file was expected to exist but does not."
+ fi
+ shell: bash
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 50dd25898ad35..86b1cd5ee90e7 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -154,6 +154,46 @@ jobs:
--use_xnnpack \
--use_binskim_compliant_compile_flags
+ ARM64-Xcode16-webgpu:
+ runs-on: macos-15
+
+ env:
+ xcode_version: 16
+
+ timeout-minutes: 60
+
+ steps:
+ - uses: actions/setup-python@v5
+ with:
+ python-version: ${{ env.python_version }}
+
+ - name: Verify ARM64 machine
+ shell: python
+ run: |
+ import platform
+ assert platform.machine() == "arm64", "This job expects to be run on an ARM64 machine."
+
+ - name: Use Xcode ${{ env.xcode_version }}
+ shell: bash
+ run: |
+ XCODE_DEVELOPER_DIR="/Applications/Xcode_${{ env.xcode_version }}.app/Contents/Developer"
+ sudo xcode-select --switch "${XCODE_DEVELOPER_DIR}"
+
+ - uses: actions/checkout@v4
+
+ - name: Build and test
+ shell: bash
+ run: |
+ python ./tools/ci_build/build.py \
+ --build_dir ./build \
+ --update \
+ --build --parallel \
+ --test \
+ --build_shared_lib \
+ --build_nodejs \
+ --use_webgpu \
+ --use_binskim_compliant_compile_flags
+
ARM64-Xcode16-targeting-iphonesimulator:
runs-on: macos-15
@@ -164,7 +204,7 @@ jobs:
matrix:
target_arch: [x86_64, arm64]
- timeout-minutes: 75
+ timeout-minutes: 90
steps:
- uses: actions/setup-python@v5
diff --git a/cmake/CMakePresets.json b/cmake/CMakePresets.json
index 4987edaf85513..8d63912f6eaee 100644
--- a/cmake/CMakePresets.json
+++ b/cmake/CMakePresets.json
@@ -109,6 +109,29 @@
"rhs": "Darwin"
}
},
+ {
+ "name": "arm64-osx",
+ "inherits": [
+ "unit-test"
+ ],
+ "generator": "Xcode",
+ "binaryDir": "${sourceParentDir}/cmake_build/arm64-osx",
+ "installDir": "${sourceParentDir}/cmake_build/out",
+ "cacheVariables": {
+ "CMAKE_OSX_ARCHITECTURES": "arm64",
+ "onnxruntime_BUILD_SHARED_LIB": true,
+ "onnxruntime_USE_XNNPACK": false,
+ "onnxruntime_USE_COREML": true,
+ "onnxruntime_BUILD_OBJC": true,
+ "onnxruntime_BUILD_APPLE_FRAMEWORK": true,
+ "CMAKE_CONFIGURATION_TYPES": "Debug;Release"
+ },
+ "condition": {
+ "type": "equals",
+ "lhs": "${hostSystemName}",
+ "rhs": "Darwin"
+ }
+ },
{
"name": "x64-osx-vcpkg",
"inherits": [
diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index b64641230f249..f582abca34706 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -1625,7 +1625,7 @@ This version of the operator has been available since version 1 of the 'com.micr
#### Type Constraints
-- T : tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
+- T : tensor(bool), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
- Constrain input and output types.
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 91c68a1f5afb6..8d256a2088279 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -582,7 +582,7 @@ Do not modify directly.*
| Op Name | Parameters | OpSet Version | Types Supported |
|---------|------------|---------------|-----------------|
|**Operator Domain:** *ai.onnx*||||
-|Abs|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Abs|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Add|*in* A:**T**<br> *in* B:**T**<br> *out* C:**T**|14+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||13|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
@@ -839,7 +839,7 @@ Do not modify directly.*
|Shrink|*in* input:**T**<br> *out* output:**T**|9+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Sigmoid|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
-|Sign|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Sign|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|SimplifiedLayerNormalization|*in* X:**T**<br> *in* scale:**V**<br> *out* Y:**V**<br> *out* inv_std_var:**U**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)<br> **U** = tensor(double), tensor(float)<br> **V** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|Sin|*in* input:**T**<br> *out* output:**T**|7+|**T** = tensor(double), tensor(float), tensor(float16)|
|Size|*in* data:**T**<br> *out* size:**T1**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br> **T1** = tensor(int64)|
diff --git a/js/react_native/android/src/main/AndroidManifest.xml b/js/react_native/android/src/main/AndroidManifest.xml
index c6e3cb45e16e5..a2f47b6057db7 100644
--- a/js/react_native/android/src/main/AndroidManifest.xml
+++ b/js/react_native/android/src/main/AndroidManifest.xml
@@ -1,3 +1,2 @@
-
+
diff --git a/js/react_native/e2e/.detoxrc.js b/js/react_native/e2e/.detoxrc.js
index e886a363d378b..1d49f06213e51 100644
--- a/js/react_native/e2e/.detoxrc.js
+++ b/js/react_native/e2e/.detoxrc.js
@@ -38,8 +38,8 @@ module.exports = {
simulator: {
type: 'ios.simulator',
device: {
- type: 'iPhone 14',
- os: 'iOS 16.4',
+ type: 'iPhone 15',
+ os: 'iOS 17.4',
},
},
attached: {
diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts
index 3979af7fa1ec9..bb532e0fbae74 100644
--- a/js/web/lib/wasm/wasm-core-impl.ts
+++ b/js/web/lib/wasm/wasm-core-impl.ts
@@ -309,12 +309,12 @@ export const createSession = async (
if (context) {
wasm.currentContext = context as MLContext;
} else if (gpuDevice) {
- wasm.currentContext = await wasm.jsepCreateMLContext!(gpuDevice);
+ wasm.currentContext = await wasm.webnnCreateMLContext!(gpuDevice);
} else {
- wasm.currentContext = await wasm.jsepCreateMLContext!({ deviceType, powerPreference });
+ wasm.currentContext = await wasm.webnnCreateMLContext!({ deviceType, powerPreference });
}
} else {
- wasm.currentContext = await wasm.jsepCreateMLContext!();
+ wasm.currentContext = await wasm.webnnCreateMLContext!();
}
break;
}
@@ -330,7 +330,7 @@ export const createSession = async (
// clear current MLContext after session creation
if (wasm.currentContext) {
- wasm.jsepRegisterMLContext!(sessionHandle, wasm.currentContext);
+ wasm.webnnRegisterMLContext!(sessionHandle, wasm.currentContext);
wasm.currentContext = undefined;
wasm.shouldTransferToMLTensor = true;
}
@@ -454,6 +454,7 @@ export const releaseSession = (sessionId: number): void => {
}
wasm.jsepOnReleaseSession?.(sessionId);
+ wasm.webnnOnReleaseSession?.(sessionId);
wasm.webgpuOnReleaseSession?.(sessionId);
inputNamesUTF8Encoded.forEach((buf) => wasm._OrtFree(buf));
@@ -520,7 +521,7 @@ export const prepareInputOutputTensor = async (
const mlTensor = tensor[2].mlTensor as MLTensor;
dataByteLength = calculateTensorSizeInBytes(tensorDataTypeStringToEnum(dataType), dims)!;
- const registerMLTensor = wasm.jsepRegisterMLTensor;
+ const registerMLTensor = wasm.webnnRegisterMLTensor;
if (!registerMLTensor) {
throw new Error('Tensor location "ml-tensor" is not supported without using WebNN.');
}
@@ -540,7 +541,7 @@ export const prepareInputOutputTensor = async (
wasm.setValue(rawData + i * ptrSize, allocWasmString(data[i], allocs), '*');
}
} else {
- const isGraphInput = wasm.jsepIsGraphInput;
+ const isGraphInput = wasm.webnnIsGraphInput;
if (dataType !== 'string' && isGraphInput) {
const tensorNameUTF8 = wasm._OrtGetInputName(sessionId, index);
const tensorName = wasm.UTF8ToString(tensorNameUTF8);
@@ -549,8 +550,8 @@ export const prepareInputOutputTensor = async (
const dataTypeEnum = tensorDataTypeStringToEnum(dataType);
dataByteLength = calculateTensorSizeInBytes(dataTypeEnum, dims)!;
actualLocation = 'ml-tensor';
- const createTemporaryTensor = wasm.jsepCreateTemporaryTensor;
- const uploadTensor = wasm.jsepUploadTensor;
+ const createTemporaryTensor = wasm.webnnCreateTemporaryTensor;
+ const uploadTensor = wasm.webnnUploadTensor;
if (!createTemporaryTensor || !uploadTensor) {
throw new Error('Tensor location "ml-tensor" is not supported without using WebNN.');
}
@@ -722,6 +723,7 @@ export const run = async (
}
wasm.jsepOnRunStart?.(sessionHandle);
+ wasm.webnnOnRunStart?.(sessionHandle);
let errorCode: number;
if (!BUILD_DEFS.DISABLE_JSEP && ioBindingState) {
@@ -862,8 +864,8 @@ export const run = async (
]);
}
} else if (preferredLocation === 'ml-tensor' && size > 0) {
- const ensureTensor = wasm.jsepEnsureTensor;
- const isInt64Supported = wasm.jsepIsInt64Supported;
+ const ensureTensor = wasm.webnnEnsureTensor;
+ const isInt64Supported = wasm.webnnIsInt64Supported;
if (!ensureTensor || !isInt64Supported) {
throw new Error('preferredLocation "ml-tensor" is not supported without using WebNN.');
}
@@ -890,9 +892,9 @@ export const run = async (
dims,
{
mlTensor,
- download: wasm.jsepCreateMLTensorDownloader!(dataOffset, type),
+ download: wasm.webnnCreateMLTensorDownloader!(dataOffset, type),
dispose: () => {
- wasm.jsepReleaseTensorId!(dataOffset);
+ wasm.webnnReleaseTensorId!(dataOffset);
wasm._OrtReleaseTensor(tensor);
},
},
@@ -915,7 +917,7 @@ export const run = async (
if (!keepOutputTensor) {
wasm._OrtReleaseTensor(tensor);
}
- wasm.jsepOnRunEnd?.(sessionHandle);
+ wasm.webnnOnRunEnd?.(sessionHandle);
}
}
diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts
index 6de54078af031..752bac28d7efb 100644
--- a/js/web/lib/wasm/wasm-types.ts
+++ b/js/web/lib/wasm/wasm-types.ts
@@ -156,12 +156,26 @@ export declare namespace JSEP {
*/
shouldTransferToMLTensor: boolean;
+ /**
+ * [exported from pre-jsep.js] Called when InferenceSession.run started. This function will be called before
+ * _OrtRun[WithBinding]() is called.
+ * @param sessionId - specify the session ID.
+ */
+ webnnOnRunStart: (sessionId: number) => void;
+ /**
+ * [exported from pre-jsep.js] Release a session. This function will be called before _OrtReleaseSession() is
+ * called.
+ * @param sessionId - specify the session ID.
+ * @returns
+ */
+ webnnOnReleaseSession: (sessionId: number) => void;
+
/**
* [exported from pre-jsep.js] Called when InferenceSession.run finished. This function will be called after
* _OrtRun[WithBinding]() is called.
* @param sessionId - specify the session ID.
*/
- jsepOnRunEnd: (sessionId: number) => void;
+ webnnOnRunEnd: (sessionId: number) => void;
/**
* [exported from pre-jsep.js] Register MLContext for a session.
@@ -169,18 +183,18 @@ export declare namespace JSEP {
* @param context - specify the MLContext.
* @returns
*/
- jsepRegisterMLContext: (sessionId: number, context: MLContext) => void;
+ webnnRegisterMLContext: (sessionId: number, context: MLContext) => void;
/**
* [exported from pre-jsep.js] Reserve a MLTensor ID attached to the current session.
* @returns the MLTensor ID.
*/
- jsepReserveTensorId: () => number;
+ webnnReserveTensorId: () => number;
/**
* [exported from pre-jsep.js] Release an MLTensor ID from use and destroys underlying MLTensor if no longer in use.
* @param tensorId - specify the MLTensor ID.
* @returns
*/
- jsepReleaseTensorId: (tensorId: number) => void;
+ webnnReleaseTensorId: (tensorId: number) => void;
/**
* [exported from pre-jsep.js] Ensure that an MLTensor of a given type and shape exists for a MLTensor ID.
* @param sessionId - specify the session ID or current active session ID if undefined.
@@ -190,7 +204,7 @@ export declare namespace JSEP {
* @param copyOld - specify whether to copy the old tensor if a new tensor was created.
* @returns the MLTensor associated with the tensor ID.
*/
- jsepEnsureTensor: (
+ webnnEnsureTensor: (
sessionId: number | undefined,
tensorId: number,
dataType: DataType,
@@ -203,20 +217,20 @@ export declare namespace JSEP {
* @param data - specify the data to upload. It can be a TensorProto::data_type or a WebNN MLOperandDataType.
* @returns
*/
- jsepUploadTensor: (tensorId: number, data: Uint8Array) => void;
+ webnnUploadTensor: (tensorId: number, data: Uint8Array) => void;
/**
* [exported from pre-jsep.js] Download data from an MLTensor.
* @param tensorId - specify the MLTensor ID.
* @returns the downloaded data.
*/
- jsepDownloadTensor: (tensorId: number, dstBuffer: ArrayBufferView | ArrayBuffer) => Promise;
+ webnnDownloadTensor: (tensorId: number, dstBuffer: ArrayBufferView | ArrayBuffer) => Promise;
/**
* [exported from pre-jsep.js] Creates a downloader function to download data from an MLTensor.
* @param tensorId - specify the MLTensor ID.
* @param type - specify the data type.
* @returns the downloader function.
*/
- jsepCreateMLTensorDownloader: (
+ webnnCreateMLTensorDownloader: (
tensorId: number,
type: Tensor.MLTensorDataTypes,
) => () => Promise;
@@ -228,7 +242,7 @@ export declare namespace JSEP {
* @param dimensions - specify the dimensions.
* @returns the MLTensor ID for the external MLTensor.
*/
- jsepRegisterMLTensor: (
+ webnnRegisterMLTensor: (
sessionId: number,
tensor: MLTensor,
onnxDataType: DataType,
@@ -240,7 +254,7 @@ export declare namespace JSEP {
* @param optionsOrGpuDevice - specify the options or GPUDevice.
* @returns
*/
- jsepCreateMLContext(optionsOrGpuDevice?: MLContextOptions | GPUDevice): Promise;
+ webnnCreateMLContext(optionsOrGpuDevice?: MLContextOptions | GPUDevice): Promise;
/**
* [exported from pre-jsep.js] Register a WebNN Constant operand from external data.
@@ -252,7 +266,7 @@ export declare namespace JSEP {
* @param shouldConvertInt64ToInt32 - specify whether to convert int64 to int32.
* @returns the WebNN Constant operand for the specified external data.
*/
- jsepRegisterMLConstant(
+ webnnRegisterMLConstant(
externalFilePath: string,
dataOffset: number,
dataLength: number,
@@ -265,14 +279,14 @@ export declare namespace JSEP {
* [exported from pre-jsep.js] Register a WebNN graph input.
* @param inputName - specify the input name.
*/
- jsepRegisterGraphInput: (inputName: string) => void;
+ webnnRegisterGraphInput: (inputName: string) => void;
/**
* [exported from pre-jsep.js] Check if a graph input is a WebNN graph input.
* @param sessionId - specify the session ID.
* @param inputName - specify the input name.
* @returns whether the input is a WebNN graph input.
*/
- jsepIsGraphInput: (sessionId: number, inputName: string) => boolean;
+ webnnIsGraphInput: (sessionId: number, inputName: string) => boolean;
/**
* [exported from pre-jsep.js] Create a temporary MLTensor for a session.
* @param sessionId - specify the session ID.
@@ -280,13 +294,13 @@ export declare namespace JSEP {
* @param shape - specify the shape.
* @returns the MLTensor ID for the temporary MLTensor.
*/
- jsepCreateTemporaryTensor: (sessionId: number, dataType: DataType, shape: readonly number[]) => Promise;
+ webnnCreateTemporaryTensor: (sessionId: number, dataType: DataType, shape: readonly number[]) => Promise;
/**
* [exported from pre-jsep.js] Check if a session's associated WebNN Context supports int64.
* @param sessionId - specify the session ID.
* @returns whether the WebNN Context supports int64.
*/
- jsepIsInt64Supported: (sessionId: number) => boolean;
+ webnnIsInt64Supported: (sessionId: number) => boolean;
}
}
diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
index 58ddf60df79f0..52c705abb1003 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
+++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
@@ -224,12 +224,12 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const {
// Shader is designed to be dispatched as Dispatch(num_heads, new_sequence_length / workgroup_size_x, 1)
// Each lane/thread is responsible for a single q.
shader.MainFunctionBody() << R"MAIN_FN(
- let head_idx = workgroup_id.x;
+ let head_idx = u32(workgroup_idx / uniforms.num_seq_tile);
let capped_sg_id = min(sg_id, max_k_step);
let capped_sg_size = min(sg_size, max_k_step);
// Load Q
- let q_idx_global = workgroup_id.y * workgroup_size_x + local_idx;
+ let q_idx_global = (workgroup_idx % uniforms.num_seq_tile) * workgroup_size_x + local_idx;
let valid_q = q_idx_global < uniforms.new_sequence_length;
if (valid_q)
{
@@ -445,7 +445,8 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
std::string cache_hint = std::to_string(has_attention_bias) +
std::to_string(parameters.head_size_) +
std::to_string(parameters.num_heads_);
- program.SetDispatchGroupSize(parameters.num_heads_, (parameters.sequence_length_ + tile_size - 1) / tile_size, 1)
+ const uint32_t num_seq_tile = (parameters.sequence_length_ + tile_size - 1) / tile_size;
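+ // The dispatch is flattened to one dimension (num_heads * num_seq_tile); the shader
+ // recovers head_idx and the sequence tile from workgroup_idx using the num_seq_tile uniform.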
+ program.SetDispatchGroupSize(parameters.num_heads_ * num_seq_tile)
.SetWorkgroupSize(tile_size)
.CacheHint(cache_hint)
.AddUniformVariables({{static_cast(parameters.sequence_length_)},
@@ -454,7 +455,8 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
{static_cast(parameters.total_sequence_length_ - parameters.kv_sequence_length_)},
{static_cast(parameters.is_gqa_ ? 1 : 0)},
{static_cast(parameters.n_reps)},
- {alpha}});
+ {alpha},
+ {num_seq_tile}});
return context.RunProgram(program);
}
diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
index 2c2b888538843..8931403641a81 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
+++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
@@ -52,7 +52,8 @@ class FlashAttentionProgram final : public Program {
{"past_sequence_length", ProgramUniformVariableDataType::Uint32},
{"is_gqa", ProgramUniformVariableDataType::Uint32},
{"n_reps", ProgramUniformVariableDataType::Uint32},
- {"alpha", ProgramUniformVariableDataType::Float32});
+ {"alpha", ProgramUniformVariableDataType::Float32},
+ {"num_seq_tile", ProgramUniformVariableDataType::Uint32});
private:
bool has_attention_bias_;
diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc
index 05cbfb1f99c48..a25d8e68f11cd 100644
--- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc
@@ -12,21 +12,12 @@ Status DP4AMatMulQuantizeProgram::GenerateShaderCode(ShaderHelper& shader) const
shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias);
shader.AddOutput("output", ShaderUsage::UseUniform);
shader.AddOutput("scales", ShaderUsage::UseUniform);
- shader.AdditionalImplementation() << R"ADDNL_FN(
- fn readInput(offset: u32) -> input_a_value_t
- {
- if (offset > uniforms.input_size) {
- return input_a_value_t(0);
- }
- return input_a[offset];
- }
- )ADDNL_FN";
shader.MainFunctionBody() << R"MAIN_FN(
var local_a : array, 32>;
var max_value:vec4 = vec4(0);
for (var idx:u32=0;idx<32;idx+=1)
{
- local_a[idx] = readInput(workgroup_idx*32 + idx);
+ local_a[idx] = input_a[workgroup_idx*32 + idx];
max_value = max(max_value, abs(local_a[idx]));
}
var scale = max(max_value.x, max_value.y);
@@ -147,8 +138,8 @@ Status DP4AMatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const {
shader.MainFunctionBody() << R"MAIN_FN(
// During the load phase we use all 256 threads to load 64 rows of A/B.
// For each row we load tile_size_k_vec (2) vectorized elements, which are 32 elements of K.
- let a_global_base = workgroup_id.x * tile_size;
- let b_global_base = workgroup_id.y * tile_size;
+ let a_global_base = u32(workgroup_idx / uniforms.num_N_tile) * tile_size;
+ let b_global_base = (workgroup_idx % uniforms.num_N_tile) * tile_size;
let load_AorB = u32(local_idx/128);
let load_row = u32((local_idx%128)/2);
let load_col = u32(local_idx%2);
@@ -279,17 +270,16 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor
Tensor a_scale = context.CreateGPUTensor(a->DataType(), a_scales_dims);
quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kVec4Components)}})
.AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), 1},
- {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), 1}})
- .AddUniformVariable({static_cast(M * K / kVec4Components)});
+ {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), 1}});
ORT_RETURN_IF_ERROR(context.RunProgram(quantize_program));
constexpr uint32_t kTileSize = 64;
TensorShape reshaped_y_shape{1, M, N / kVec4Components};
+ uint32_t num_M_tile = (M + kTileSize - 1) / kTileSize;
+ uint32_t num_N_tile = (N + kTileSize - 1) / kTileSize;
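+ // The (M tile, N tile) dispatch is flattened into a single dimension; the shader derives
+ // a_global_base and b_global_base from workgroup_idx and the num_N_tile uniform.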
DP4AMatMulNBitsProgram mul_program{block_size};
mul_program.SetWorkgroupSize(256);
- mul_program.SetDispatchGroupSize(
- (M + kTileSize - 1) / kTileSize,
- (N + kTileSize - 1) / kTileSize, 1);
+ mul_program.SetDispatchGroupSize(num_M_tile * num_N_tile);
mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kVec4Components)},
{&a_scale, ProgramTensorMetadataDependency::TypeAndRank, 1},
{b, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kVec2Components * kU32Components)},
@@ -298,7 +288,8 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor
{static_cast(N)},
{static_cast(K)},
{static_cast(K / 8)},
- {static_cast(K / 16)}})
+ {static_cast(K / 16)},
+ {num_N_tile}})
.AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, static_cast(kVec4Components)})
.CacheHint("Block" + std::to_string(block_size));
return context.RunProgram(mul_program);
@@ -317,7 +308,7 @@ bool CanApplyDP4AMatrixMatMulNBits(onnxruntime::webgpu::ComputeContext& context,
bool use_dp4a = context.Device().HasFeature(wgpu::FeatureName::Subgroups) &&
context.AdapterInfo().backendType != wgpu::BackendType::Metal;
return (accuracy_level == 4 && block_size % 32 == 0 &&
- batch_count == 1 && components_k == 4 && K % 64 == 0 && N % 16 == 0 &&
+ batch_count == 1 && components_k == 4 && K % 128 == 0 && N % 16 == 0 &&
!has_zero_points && use_dp4a);
}
diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h
index 15b86d78301ad..f0157ca3e8c97 100644
--- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h
+++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h
@@ -16,7 +16,6 @@ class DP4AMatMulQuantizeProgram final : public Program {
@@ -28,7 +27,8 @@ class DP4AMatMulNBitsProgram final : public Program {
{"N", ProgramUniformVariableDataType::Uint32},
{"K", ProgramUniformVariableDataType::Uint32},
{"K8", ProgramUniformVariableDataType::Uint32},
- {"K16", ProgramUniformVariableDataType::Uint32});
+ {"K16", ProgramUniformVariableDataType::Uint32},
+ {"num_N_tile", ProgramUniformVariableDataType::Uint32});
private:
uint32_t block_size_;
diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
index e45787299f3ad..7b4a45ce8aa0f 100644
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
@@ -3361,7 +3361,8 @@ void RegisterContribSchemas() {
OpSchema::NonDifferentiable)
.TypeConstraint(
"T",
- {"tensor(int8)",
+ {"tensor(bool)",
+ "tensor(int8)",
"tensor(int16)",
"tensor(int32)",
"tensor(int64)",
diff --git a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc
index 4481a5172966b..3fffc6d0a68c4 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc
@@ -97,7 +97,6 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const logging::Logger& logger) const {
const auto& op_type(node.OpType());
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.activation
@@ -166,9 +165,7 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
model_builder.AddOperation(std::move(op));
- } else
-#endif // (COREML_ENABLE_MLPROGRAM)
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
if (op_type == "Sigmoid") {
diff --git a/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc
index 6169090a36014..dfa01c8187741 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc
@@ -32,7 +32,6 @@ Status ArgMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const int64_t keepdims = helper.Get("keepdims", 1);
const bool removedim = keepdims != 1;
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.reduction
@@ -46,9 +45,7 @@ Status ArgMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
// the output of ArgMax must be int32
AddOperationOutput(*op, *node.OutputDefs()[0], output_datatype);
model_builder.AddOperation(std::move(op));
- } else
-#endif // (COREML_ENABLE_MLPROGRAM)
- {
+ } else {
auto* coreml_argmax = layer->mutable_argmax();
coreml_argmax->set_axis(axis);
coreml_argmax->set_removedim(removedim);
@@ -91,11 +88,9 @@ bool ArgMaxOpBuilder::IsOpSupportedImpl(const Node& node,
return false;
}
-#if defined(COREML_ENABLE_MLPROGRAM)
if (input_params.create_mlprogram) {
return true;
}
-#endif
// If there are multiple downstream nodes and cast (toint32) is one of them
// not supported, exit here
diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
index 2817f34bc64f2..9e7fcd788664c 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
@@ -6,6 +6,7 @@
#include "core/providers/coreml/builders/helper.h"
#include "core/providers/coreml/builders/impl/base_op_builder.h"
#include "core/providers/coreml/builders/model_builder.h"
+#include "core/providers/coreml/model/host_utils.h"
#include "core/providers/shared/utils/utils.h"
using namespace CoreML::Specification;
@@ -113,10 +114,12 @@ bool BaseOpBuilder::IsInputDtypeSupport(const Node& node, size_t idx,
return true;
}
+#if CAN_BUILD_COREML6_OR_LATER
// only MLProgram support FP16
if (input_params.create_mlprogram && input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
return true;
}
+#endif
LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not currently supported";
return false;
diff --git a/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc
index 442194cb31cbc..e547f2e42e527 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc
@@ -57,7 +57,6 @@ Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_bu
const auto eps = helper.Get("epsilon", 1e-5f);
const auto channels = scale_tensor.dims()[0];
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.normalization.batch_norm
@@ -78,9 +77,7 @@ Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_bu
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
- } else
-#endif // (COREML_ENABLE_MLPROGRAM)
- {
+ } else {
auto* coreml_batch_norm = layer->mutable_batchnorm();
coreml_batch_norm->set_channels(channels);
coreml_batch_norm->set_epsilon(eps);
diff --git a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
index 0482620b269a4..d7c78e05362ed 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
@@ -56,7 +56,6 @@ bool CheckIfBothInputShapesMatch(const Node& node, const logging::Logger& logger
}
} // namespace
-#if defined(COREML_ENABLE_MLPROGRAM)
static std::vector InferOutputShape(const std::vector& a, const std::vector& b) {
std::vector output_shape;
int64_t i_a = 0, j_b = 0;
@@ -112,14 +111,12 @@ static void AddVariadicInputs(std::unique_ptr layer = model_builder.CreateNNLayer(node);
if (op_type == "Add") {
diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
index 6f9bb35c27d80..684653aa21273 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
@@ -150,7 +150,6 @@ void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span data);
-#if defined(COREML_ENABLE_MLPROGRAM)
//
// MLProgram utils
//
@@ -174,6 +173,5 @@ void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& outp
/// Number of spatial dims in input. Generally rank - 2 (ignore N and C dims).
void AddPadTypeAndPads(COREML_SPEC::MILSpec::Operation& op, ModelBuilder& model_builder, std::string_view op_type,
const NodeAttrHelper& helper, int num_spatial_dims);
-#endif // defined(COREML_ENABLE_MLPROGRAM)
} // namespace coreml
} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc
index 7c7363d4c81ad..8abee92451338 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc
@@ -27,9 +27,8 @@ class CastOpBuilder : public BaseOpBuilder {
Status CastOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder,
[[maybe_unused]] const Node& node,
[[maybe_unused]] const logging::Logger& logger) const {
-// This is a special handling case for ArgMax Op, where argmax is followed by a cast to int32 type.
-// The ArgMax is fused with the Cast node and produces an int32 output.
-#if defined(COREML_ENABLE_MLPROGRAM)
+ // This is a special handling case for ArgMax Op, where argmax is followed by a cast to int32 type.
+ // The ArgMax is fused with the Cast node and produces an int32 output.
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary.cast
@@ -73,7 +72,6 @@ Status CastOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model
AddOperationOutput(*op, *node.OutputDefs()[0], cast_to_type);
model_builder.AddOperation(std::move(op));
}
-#endif
return Status::OK();
}
@@ -134,7 +132,6 @@ bool CastOpBuilder::HasSupportedInputsImpl(const Node& node, [[maybe_unused]] co
return false;
}
-#if defined(COREML_ENABLE_MLPROGRAM)
if (input_params.create_mlprogram) {
if ((input_type == ONNX_NAMESPACE::TensorProto_DataType_INT32 ||
input_type == ONNX_NAMESPACE::TensorProto_DataType_INT64 ||
@@ -152,7 +149,6 @@ bool CastOpBuilder::HasSupportedInputsImpl(const Node& node, [[maybe_unused]] co
return false;
}
}
-#endif
// only support int64 coming from ArgMax (check for ArgMax is done in IsOpSupportedImpl())
if (input_type != ONNX_NAMESPACE::TensorProto_DataType_INT64) {
diff --git a/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc
index f7046c213a8cb..9e68070a0e693 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc
@@ -64,7 +64,6 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
bool has_min = min != std::numeric_limits::lowest();
bool has_max = max != std::numeric_limits::max();
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -121,9 +120,7 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
AddOperationOutput(*op, output);
model_builder.AddOperation(std::move(op));
- } else
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
// TODO: CoreML has a Clip layer for NeuralNetwork. Added in CoreML 4. We could potentially use that if available
// to simplify.
// https://apple.github.io/coremltools/mlmodel/Format/NeuralNetwork.html#cliplayerparams
diff --git a/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc
index 9ea0030290abd..34ce2438095ad 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc
@@ -26,7 +26,6 @@ class ConcatOpBuilder : public BaseOpBuilder {
Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const Node& node,
const logging::Logger& logger) const {
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec; // NOLINT
@@ -45,7 +44,6 @@ Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
} else // NOLINT
-#endif // defined(COREML_ENABLE_MLPROGRAM)
{
std::unique_ptr layer = model_builder.CreateNNLayer(node);
diff --git a/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc
index 38125957bf481..18823bcc78d19 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc
@@ -52,7 +52,6 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
NodeAttrHelper helper(node);
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -89,9 +88,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
AddOperationOutput(*conv_op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(conv_op));
- } else
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
auto strides = helper.Get("strides", std::vector{1, 1});
@@ -225,14 +222,11 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara
const auto& weight_name = input_defs[1]->Name();
const auto* weight = input_params.graph_viewer.GetConstantInitializer(weight_name);
-#if defined(COREML_ENABLE_MLPROGRAM)
if (input_params.create_mlprogram) {
// ML Program supports non-const weight, 1D, 2D and 3D.
// keep to 1D and 2D for consistency with the NeuralNetwork implementation for now.
// add 3D support as/when needed.
- } else
-#endif // defined (COREML_ENABLE_MLPROGRAM)
- {
+ } else {
if (!weight) {
LOGS(logger, VERBOSE) << "The weight of Conv [" << name << "] must be a constant initializer";
return false;
@@ -257,7 +251,6 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara
NodeAttrHelper helper(node);
-#if defined(COREML_ENABLE_MLPROGRAM)
// spec says same_lower is supported in CoreML 5. it lies. CoreML 6 is required otherwise you get
// `Unexpected value for parameter pad_type[0] "same_lower" not in ("custom", "same", "valid").`
// We _could_ manually calculate the pads, but not implementing that until we have a real use case to justify
@@ -269,7 +262,6 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara
return false;
}
}
-#endif
// there's no equivalent to allow a manual kernel shape in CoreML.
// it's OK if a specified kernel_shape matches kH and kW dims of the weight input.
diff --git a/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc
index 5b6d9d72ab3c9..2e2c898b0e10a 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc
@@ -28,7 +28,6 @@ class ConvTransposeOpBuilder : public BaseOpBuilder {
Status ConvTransposeOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder,
[[maybe_unused]] const Node& node,
const logging::Logger& /*logger*/) const {
-#if defined(COREML_ENABLE_MLPROGRAM)
using namespace CoreML::Specification::MILSpec; // NOLINT
const auto input_defs = node.InputDefs();
const auto output_defs = node.OutputDefs();
@@ -80,7 +79,6 @@ Status ConvTransposeOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuild
AddOperationOutput(*op, *output_defs[0]);
model_builder.AddOperation(std::move(op));
-#endif // defined(COREML_ENABLE_MLPROGRAM)
return Status::OK();
}
diff --git a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc
index fec14dfd093a0..1a74b1eea97fe 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc
@@ -33,7 +33,6 @@ Status DepthToSpaceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
NodeAttrHelper helper(node);
int64_t blocksize = *helper.GetInt64("blocksize"); // required attribute
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec; // NOLINT
@@ -105,7 +104,6 @@ Status DepthToSpaceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
model_builder.AddOperation(std::move(reshape2));
}
} else // NOLINT
-#endif // if defined(COREML_ENABLE_MLPROGRAM)
{
const auto& output_name = output_defs[0]->Name();
std::unique_ptr layer = model_builder.CreateNNLayer(node);
diff --git a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
index e685c09ef43ca..4f84f7c36259c 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
@@ -33,7 +33,6 @@ void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod
const auto& input_defs(node.InputDefs());
const bool is_gemm = op == "Gemm";
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
// we have to transpose the weight input of Gemm if transB is false, and potentially override the bias shape
if (is_gemm) {
@@ -58,9 +57,7 @@ void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod
}
}
}
- } else
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
// We have already embedded the weights (matrix B and C(if any)) into the coreml layer
// No need to copy them later to reduce memory consumption
model_builder.AddInitializerToSkip(input_defs[1]->Name());
@@ -123,7 +120,6 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
const auto K = transB ? b1 : b0;
const auto N = transB ? b0 : b1;
// we already checked it and dtype must be existed.
-#if defined(COREML_ENABLE_MLPROGRAM)
auto input_dtype = a.TypeAsProto()->tensor_type().elem_type();
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -207,9 +203,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
AddOperationOutput(*matmul_op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(matmul_op));
}
- } else
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
auto* coreml_inner_product = layer->mutable_innerproduct();
*layer->mutable_input()->Add() = a.Name();
diff --git a/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc
index 6dcf14c16f111..f558f423752e8 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc
@@ -42,7 +42,6 @@ class GridSampleOpBuilder : public BaseOpBuilder {
Status GridSampleOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder,
[[maybe_unused]] const Node& node,
[[maybe_unused]] const logging::Logger& logger) const {
-#if defined(COREML_ENABLE_MLPROGRAM)
using namespace CoreML::Specification::MILSpec; // NOLINT
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing.resample
@@ -80,7 +79,6 @@ Status GridSampleOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder&
AddOperationOutput(*op, *output_defs[0]);
model_builder.AddOperation(std::move(op));
-#endif
return Status::OK();
}
diff --git a/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc
index b4dc8d1647ad0..c0db144602ee2 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc
@@ -49,7 +49,6 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(
if (node.OpType() == "GroupNormalization") {
return AddGroupNormToModelBuilderImpl(model_builder, node, logger);
}
-#if defined(COREML_ENABLE_MLPROGRAM)
const auto& input_defs = node.InputDefs();
NodeAttrHelper helper(node);
const auto& scale_tensor = *model_builder.GetConstantInitializer(input_defs[1]->Name());
@@ -94,7 +93,6 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
}
-#endif // (COREML_ENABLE_MLPROGRAM)
return Status::OK();
}
@@ -103,7 +101,6 @@ Status NormalizationOpBuilder::AddGroupNormToModelBuilderImpl(
[[maybe_unused]] ModelBuilder& model_builder,
[[maybe_unused]] const Node& node,
[[maybe_unused]] const logging::Logger& logger) const {
-#if defined(COREML_ENABLE_MLPROGRAM)
const auto& input_defs = node.InputDefs();
NodeAttrHelper helper(node);
// Coreml hasn't supported GroupNorm yet.
@@ -184,7 +181,6 @@ Status NormalizationOpBuilder::AddGroupNormToModelBuilderImpl(
model_builder.AddOperation(std::move(mul));
model_builder.AddOperation(std::move(add));
}
-#endif // (COREML_ENABLE_MLPROGRAM)
return Status::OK();
}
diff --git a/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc
index 17910ba6fd486..e43eef75007cc 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc
@@ -29,7 +29,6 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const auto& op_type = node.OpType();
const auto& input_defs = node.InputDefs();
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -91,9 +90,7 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
- } else
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
auto* coreml_pool = layer->mutable_pooling();
diff --git a/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc
index d533b867bd454..a4609eb2a0584 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc
@@ -71,7 +71,6 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, co
const bool keepdims = helper.Get("keepdims", 1) != 0;
const bool noop_with_empty_axes = helper.Get("noop_with_empty_axes", 0) != 0;
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -103,9 +102,7 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, co
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
- } else
-#endif // (COREML_ENABLE_MLPROGRAM)
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
if (op_type == "ReduceSum") {
diff --git a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc
index 27d24d9c21893..b35d6971623ed 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc
@@ -50,7 +50,6 @@ Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
// ReshapeHelper applies the ONNX rules to create the concrete output shape
ReshapeHelper helper(TensorShape(input_shape), new_shape);
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -64,9 +63,7 @@ Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
AddOperationOutput(*reshape_op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(reshape_op));
- } else
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
*layer->mutable_reshapestatic()->mutable_targetshape() = {new_shape.cbegin(), new_shape.cend()};
diff --git a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc
index 7ff66e4a79e37..837573003e515 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc
@@ -212,7 +212,6 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
num_sizes = output_sizes.size();
}
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec; // NOLINT
@@ -279,9 +278,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
AddOperationOutput(*op, *output_defs[0]);
model_builder.AddOperation(std::move(op));
- } else // NOLINT
-#endif
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
auto* coreml_upsample = layer->mutable_upsample();
diff --git a/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc
index 243f949bdd48e..d1c87b033d323 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc
@@ -25,7 +25,6 @@ Status ShapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
const logging::Logger& /*logger*/) const {
const auto& input_defs = node.InputDefs();
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
NodeAttrHelper node_attr_helper{node};
@@ -63,9 +62,7 @@ Status ShapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
AddOperationOutput(*op, *node.OutputDefs()[0], output_datatype);
model_builder.AddOperation(std::move(op));
}
- } else // NOLINT
-#endif
- {
+ } else {
auto layer = model_builder.CreateNNLayer(node);
layer->mutable_getshape();
*layer->mutable_input()->Add() = input_defs[0]->Name();
diff --git a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc
index 6b3fe75fa592d..368e47e40f831 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc
@@ -127,7 +127,6 @@ Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
SliceOp::PrepareForComputeMetadata compute_metadata{data_shape};
ORT_RETURN_IF_ERROR(PrepareSliceComputeMetadata(node, model_builder.GetGraphViewer(), compute_metadata));
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec; // NOLINT
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.slice_by_index
@@ -178,9 +177,7 @@ Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
model_builder.AddOperation(std::move(op));
- } else // NOLINT
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
auto layer = model_builder.CreateNNLayer(node);
*layer->mutable_input()->Add() = input_defs[0]->Name();
*layer->mutable_output()->Add() = output_defs[0]->Name();
@@ -222,7 +219,6 @@ bool SliceOpBuilder::HasSupportedInputsImpl(const Node& node,
return false;
}
-#ifdef COREML_ENABLE_MLPROGRAM
// The [Doc](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.slice_by_index)
// says ML Program slice_by_index supports fp16 in CoreML 5 (iOS 15).
// That is incorrect: CoreML 6+ (iOS 16, CoreML spec version >= 7) is required; otherwise only float is supported.
@@ -230,13 +226,11 @@ bool SliceOpBuilder::HasSupportedInputsImpl(const Node& node,
// CoreML 6:https://github.com/apple/coremltools/blob/c3ea4cf56fef1176417246c1b85363417f3e713d/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py#L495
if (input_params.create_mlprogram && input_params.coreml_version >= 6 &&
input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
- } else
-#endif // nolint
- if (input_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT &&
- input_type != ONNX_NAMESPACE::TensorProto_DataType_INT64) {
- LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not supported";
- return false;
- }
+ } else if (input_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT &&
+ input_type != ONNX_NAMESPACE::TensorProto_DataType_INT64) {
+ LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not supported";
+ return false;
+ }
return true;
}
diff --git a/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc
index c6e331feed326..2411cd459fecd 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc
@@ -37,7 +37,6 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const auto axis = helper.Get("axis", axis_default_value);
auto axis_nonnegative = HandleNegativeAxis(axis, data_shape.size());
-#if defined(COREML_ENABLE_MLPROGRAM)
// CoreML's softmax matches ONNX's softmax behavior since opset 13.
// For opset < 13, we need to reshape to 2D and set axis to -1 to simulate onnx softmax behavior.
// [B,D,...](onnx softmax opset 12, axis=1)->[B,D*...](CoreML softmax, axis=-1)->[B,D,...](reshape back)
@@ -78,9 +77,7 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
AddOperationOutput(*reshape2, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(reshape2));
}
- } else // NOLINT
-#endif
- {
+ } else {
if (node.SinceVersion() >= 13 || (data_shape.size() == 2)) {
auto* coreml_softmaxnd = layer->mutable_softmaxnd();
coreml_softmaxnd->set_axis(axis);
diff --git a/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc
index 6372f3136123b..717d344982473 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc
@@ -56,7 +56,6 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
return std::make_tuple(remainder, chunk_size);
};
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
std::unique_ptr split_op = model_builder.CreateOperation(node, "split");
@@ -95,9 +94,7 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
}
model_builder.AddOperation(std::move(split_op));
- } else
-#endif
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
auto* coreml_splitnd = layer->mutable_splitnd();
coreml_splitnd->set_axis(axis);
diff --git a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
index a1b3a18265c70..81bef11906b74 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
@@ -58,7 +58,6 @@ void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const
}
}
-#if defined(COREML_ENABLE_MLPROGRAM)
void HandleX86ArchUnsqueezeScalarInput(ModelBuilder& model_builder,
const Node& node, const logging::Logger& logger) {
const auto& input_defs(node.InputDefs());
@@ -74,7 +73,6 @@ void HandleX86ArchUnsqueezeScalarInput(ModelBuilder& model_builder,
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
}
-#endif
Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const Node& node,
@@ -83,7 +81,7 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
auto* coreml_squeeze = layer->mutable_squeeze();
TensorShapeVector axes;
GetAxes(model_builder, node, axes);
-#if defined(COREML_ENABLE_MLPROGRAM)
+
const auto& input_defs(node.InputDefs());
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -105,9 +103,7 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
}
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
- } else // NOLINT
-#endif
- {
+ } else {
if (axes.empty()) {
coreml_squeeze->set_squeezeall(true);
} else {
diff --git a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc
index 831c4cf4d08ba..5bb7e4c11967a 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc
@@ -34,7 +34,6 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
ORT_RETURN_IF_NOT(perm.size() == input_dims, "Perm and input should have same dimension");
}
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -44,9 +43,7 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
- } else
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
*layer->mutable_transpose()->mutable_axes() = {perm.cbegin(), perm.cend()};
diff --git a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
index bc3cad004aec1..dd495894ab8bb 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
@@ -25,7 +25,6 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
const auto& op_type(node.OpType());
const auto& input_defs(node.InputDefs());
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -58,9 +57,7 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
- } else // NOLINT
-#endif // defined (COREML_ENABLE_MLPROGRAM)
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
if (op_type == "Sqrt") {
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc
index f8952301d59a9..3551f5759201e 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc
@@ -17,20 +17,17 @@
#include "core/providers/coreml/shape_utils.h"
#include "core/optimizer/initializer.h"
-#if defined(COREML_ENABLE_MLPROGRAM)
// includes from coremltools-src in _deps
#include "modelpackage/src/ModelPackage.hpp"
#include "mlmodel/src/MILBlob/Blob/StorageWriter.hpp"
using MILBlob::Blob::StorageWriter;
-#endif
-
using namespace CoreML::Specification;
namespace onnxruntime {
namespace coreml {
namespace {
-#if defined(COREML_ENABLE_MLPROGRAM)
+
// Should the initializer be written to file or kept as an immediate value
bool ShouldWriteInitializerToWeightsFile(const ONNX_NAMESPACE::TensorProto& tensor_proto) {
// https://github.com/apple/coremltools/blob/dbb0094fd0cb936469e35320bf37e866ef7a1da4/coremltools/converters/mil/backend/mil/load.py#L51-L57
@@ -388,8 +385,6 @@ void CreateEmptyFile(const std::string& filename) {
ORT_ENFORCE(file.is_open(), "Failed to open file ", filename);
}
-#endif // defined(COREML_ENABLE_MLPROGRAM)
-
std::string GetModelOutputPath(const CoreMLOptions& coreml_options,
const GraphViewer& graph_viewer,
const logging::Logger& logger) {
@@ -479,7 +474,6 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge
}
if (create_ml_program_) {
-#if defined(COREML_ENABLE_MLPROGRAM)
coreml_model_->set_specificationversion(CoreMLSpecVersion());
MILSpec::Program& mlprogram = *coreml_model_->mutable_mlprogram();
mlprogram.set_version(1);
@@ -503,12 +497,6 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge
"CoreML Model Weights");
auto weights_info = mlpackage_->findItem(weights_id);
weights_file_writer_ = std::make_unique(weights_info->path() + "/weight.bin");
-#else
- // should never happen due to handling in coreml_execution_provider.cc
- // throw here so all other code in this class can assume create_ml_program_ is only ever true in a build
- // where ML Program support is enabled.
- ORT_THROW("ML Program is not enabled in this build");
-#endif
} else {
// We support CoreML Specification Version 4 (Core ML 3)
coreml_model_->set_specificationversion(4);
@@ -561,7 +549,6 @@ void ModelBuilder::AddLayer(std::unique_ptr layer) {
/*
* ML Program related helpers
*/
-#if defined(COREML_ENABLE_MLPROGRAM)
const std::string& ModelBuilder::GetSafeName(const std::string& name) {
// Check the name is valid according to the MILSpec rules
// `Identifiers, generally used for names and keys, must match the regular expression [A-Za-z\_][A-Za-z0-9\_@]*.`
@@ -737,8 +724,6 @@ std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::st
return AddTensorValueAsConstantOperation(op_type, value_type, std::move(input_value));
}
-#endif // defined(COREML_ENABLE_MLPROGRAM)
-
/*
* General implementation
*/
@@ -775,13 +760,10 @@ Status ModelBuilder::RegisterInitializers() {
continue;
}
-#if defined(COREML_ENABLE_MLPROGRAM)
if (create_ml_program_) {
MILSpec::Value coreml_tensor = OnnxTensorToCoreMLTensor(tensor, *weights_file_writer_);
ORT_IGNORE_RETURN_VALUE(AddConstantOperation(name, std::move(coreml_tensor)));
- } else
-#endif
- {
+ } else {
std::unique_ptr layer = std::make_unique();
layer->set_name(GetUniqueName("initializer_" + name));
@@ -915,7 +897,6 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i
return Status::OK();
}
-#if defined(COREML_ENABLE_MLPROGRAM)
if (create_ml_program_) {
if (is_input) {
// the model inputs need to be wired up as args to the 'main' function.
@@ -935,7 +916,6 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i
*mlprogram_main_block_->mutable_outputs()->Add() = name;
}
}
-#endif // defined(COREML_ENABLE_MLPROGRAM)
return Status::OK();
}
@@ -980,11 +960,9 @@ Status ModelBuilder::CreateModel() {
ORT_RETURN_IF_ERROR(ProcessNodes());
ORT_RETURN_IF_ERROR(RegisterModelOutputs());
-#if defined(COREML_ENABLE_MLPROGRAM)
if (create_ml_program_) {
SanitizeNames();
}
-#endif
return Status::OK();
}
@@ -992,7 +970,6 @@ Status ModelBuilder::CreateModel() {
Status ModelBuilder::SaveModel() {
std::string output_path = model_output_path_;
-#if defined(COREML_ENABLE_MLPROGRAM)
if (create_ml_program_) {
// we need to jump through some hoops to get the model path the ML Program load wants.
std::string tmp_model_path = model_output_path_ + "/tmp/model.mlmodel";
@@ -1003,7 +980,6 @@ Status ModelBuilder::SaveModel() {
auto model_info = mlpackage_->findItem(model_id);
output_path = model_info->path();
}
-#endif
// scope this so the stream is closed and flushed by the ofstream dtor
{
@@ -1012,19 +988,16 @@ Status ModelBuilder::SaveModel() {
ORT_RETURN_IF_NOT(coreml_model_->SerializeToOstream(&stream), "Saving the CoreML model failed. Path=", output_path);
}
-#if defined(COREML_ENABLE_MLPROGRAM)
// need to delete the ModelPackage instance for it to write out the manifest. clear out the other ML Program
// related types as well.
mlprogram_main_block_ = nullptr;
mlpackage_.reset();
weights_file_writer_.reset();
-#endif
return Status::OK();
}
Status ModelBuilder::LoadModel(std::unique_ptr& model) {
-#if defined(COREML_ENABLE_MLPROGRAM)
if (create_ml_program_) {
// we need to provide the sanitized names for model inputs/outputs so that info is captured.
// the input/output matching when we execute the model from the CoreML EP is based on order, so the change
@@ -1058,9 +1031,7 @@ Status ModelBuilder::LoadModel(std::unique_ptr& model) {
std::move(scalar_outputs_),
std::move(int64_outputs_),
logger_, coreml_options_);
- } else
-#endif
- {
+ } else {
model = std::make_unique(model_output_path_,
std::move(onnx_input_names_),
std::move(onnx_output_names_),
@@ -1073,7 +1044,6 @@ Status ModelBuilder::LoadModel(std::unique_ptr& model) {
return model->LoadModel(); // load using CoreML API, including compilation
}
-#if defined(COREML_ENABLE_MLPROGRAM)
std::string_view ModelBuilder::AddConstant(std::string_view op_type, std::string_view value_type,
const ONNX_NAMESPACE::TensorProto& tensor,
std::optional> shape) {
@@ -1114,7 +1084,6 @@ std::string_view ModelBuilder::AddConstant(std::string_view op_type, std::string
return ret;
}
-#endif
// static
Status ModelBuilder::Build(const GraphViewer& graph_viewer, const logging::Logger& logger,
int32_t coreml_version, const CoreMLOptions& coreml_options,
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h
index 28c7dc42da581..f3012e8137e8c 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.h
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.h
@@ -9,7 +9,6 @@
#include "core/providers/coreml/model/model.h"
#include "core/providers/coreml/coreml_options.h"
-#if defined(COREML_ENABLE_MLPROGRAM)
// coremltools classes
namespace MPL {
class ModelPackage;
@@ -20,7 +19,6 @@ namespace Blob {
class StorageWriter;
}
} // namespace MILBlob
-#endif
namespace onnxruntime {
namespace coreml {
@@ -58,11 +56,7 @@ class ModelBuilder {
// Returns true if we are creating an ML Program
bool CreateMLProgram() const {
-#if defined(COREML_ENABLE_MLPROGRAM)
return create_ml_program_;
-#else
- return false;
-#endif
}
/*
@@ -76,7 +70,6 @@ class ModelBuilder {
// Add layer to the Core ML NeuralNetwork model
void AddLayer(std::unique_ptr layer);
-#if defined(COREML_ENABLE_MLPROGRAM)
/*
* MLProgram helpers
*/
@@ -147,7 +140,6 @@ class ModelBuilder {
// add the operation to the main function
void AddOperation(std::unique_ptr operation);
-#endif
/*
* General helpers
@@ -176,7 +168,6 @@ class ModelBuilder {
const logging::Logger& Logger() const { return logger_; }
private:
-#if defined(COREML_ENABLE_MLPROGRAM)
template
std::string_view AddConstantImpl(std::string_view op_type, std::string_view value_type, gsl::span value,
std::optional> shape = std::nullopt);
@@ -190,7 +181,6 @@ class ModelBuilder {
const std::string& AddConstantOperation(std::string_view name, COREML_SPEC::MILSpec::Value&& initializer);
const std::string& AddTensorValueAsConstantOperation(std::string_view op_type, std::string_view value_type,
COREML_SPEC::MILSpec::Value&& input_value);
-#endif
// Convert the ONNX model in graph_viewer_ to a CoreML::Specification::Model and serialize to disk.
// We then load it using CoreML in order to compile it.
@@ -237,7 +227,6 @@ class ModelBuilder {
uint32_t name_token_{0};
std::unordered_set unique_names_;
-#if defined(COREML_ENABLE_MLPROGRAM)
// mlprogram_main_ is the main block of the CoreML ML Program.
// It is set in CreateModel to the CoreML Model.mlprogram.functions['main'].block_specializations['CoreML']
// entry we create.
@@ -254,7 +243,6 @@ class ModelBuilder {
// This means an op builder author doesn't need to be aware of the renaming.
// https://github.com/apple/coremltools/blob/8b37641f243b1a3e81452feea311c6e30dcc9287/coremltools/converters/mil/mil/passes/defs/preprocess.py#L146-L149
std::unordered_map values_to_rename_;
-#endif
};
} // namespace coreml
diff --git a/onnxruntime/core/providers/coreml/coreml_options.cc b/onnxruntime/core/providers/coreml/coreml_options.cc
index 14ae55de9266b..c441a2eff56e0 100644
--- a/onnxruntime/core/providers/coreml/coreml_options.cc
+++ b/onnxruntime/core/providers/coreml/coreml_options.cc
@@ -15,18 +15,6 @@ CoreMLOptions::CoreMLOptions(uint32_t coreml_flags) {
create_mlprogram_ = (coreml_flags & COREML_FLAG_CREATE_MLPROGRAM) != 0;
enable_on_subgraph_ = (coreml_flags & COREML_FLAG_ENABLE_ON_SUBGRAPH) != 0;
-#if defined(COREML_ENABLE_MLPROGRAM)
- if (coreml::util::CoreMLVersion() < MINIMUM_COREML_MLPROGRAM_VERSION && create_mlprogram_ != 0) {
- LOGS_DEFAULT(WARNING) << "ML Program is not supported on this OS version. Falling back to NeuralNetwork.";
- create_mlprogram_ = false;
- }
-#else
- if (create_mlprogram_ != 0) {
- LOGS_DEFAULT(WARNING) << "ML Program is not supported in this build. Falling back to NeuralNetwork.";
- create_mlprogram_ = false;
- }
-#endif
-
compute_units_ = 0; // 0 for all
if (coreml_flags & COREML_FLAG_USE_CPU_ONLY) {
diff --git a/onnxruntime/core/providers/coreml/model/host_utils.h b/onnxruntime/core/providers/coreml/model/host_utils.h
index 145c64e5320d3..f654b4d5701b9 100644
--- a/onnxruntime/core/providers/coreml/model/host_utils.h
+++ b/onnxruntime/core/providers/coreml/model/host_utils.h
@@ -43,7 +43,13 @@
#define API_AVAILABLE_COREML7 API_AVAILABLE(macos(14), ios(17))
#define API_AVAILABLE_COREML8 API_AVAILABLE(macos(15), ios(18))
-// @available is used in implementation code
+// The macros above are used in header files to declare API availability.
+// The macros below are used in build-time checks to determine whether the APIs are available.
+#define CAN_BUILD_COREML8_OR_LATER (__MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 && __IPHONE_OS_VERSION_MAX_ALLOWED >= 180000)
+#define CAN_BUILD_COREML7_OR_LATER (__MAC_OS_X_VERSION_MAX_ALLOWED >= 140000 && __IPHONE_OS_VERSION_MAX_ALLOWED >= 170000)
+#define CAN_BUILD_COREML6_OR_LATER (__MAC_OS_X_VERSION_MAX_ALLOWED >= 130000 && __IPHONE_OS_VERSION_MAX_ALLOWED >= 160000)
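+// For example, CAN_BUILD_COREML8_OR_LATER is true only when building against the macOS 15 / iOS 18
+// (or newer) SDK, so code that requires CoreML 8-only headers can be excluded from older toolchains.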
+
+// @available is used in implementation code to check the availability of the APIs at runtime.
// Base required OS to run CoreML Specification Version 4 (Core ML 3)
#define HAS_COREML3_OR_LATER @available(macOS 10.15, iOS 13, *)
#define HAS_COREML4_OR_LATER @available(macOS 11, iOS 14, *)
@@ -54,8 +60,7 @@
#endif
-#define MINIMUM_COREML_VERSION 3 // first version we support
-#define MINIMUM_COREML_MLPROGRAM_VERSION 5 // first version where ML Program was available
+#define MINIMUM_COREML_VERSION 5 // first version we support
namespace onnxruntime {
namespace coreml {
diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm
index 5211b89ec17c6..71664021ea2fb 100644
--- a/onnxruntime/core/providers/coreml/model/model.mm
+++ b/onnxruntime/core/providers/coreml/model/model.mm
@@ -363,13 +363,12 @@ void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) {
#endif
}
-#if __has_include(<CoreML/MLOptimizationHints.h>)
+#if __has_include(<CoreML/MLOptimizationHints.h>) && CAN_BUILD_COREML8_OR_LATER
#define HAS_COREMLOPTIMIZATIONHINT 1
#else
#define HAS_COREMLOPTIMIZATIONHINT 0
#endif
-API_AVAILABLE_COREML8
void ConfigureOptimizationHints(MLModelConfiguration* config, const CoreMLOptions& coreml_options) {
#if HAS_COREMLOPTIMIZATIONHINT
MLOptimizationHints* optimizationHints = [[MLOptimizationHints alloc] init];
diff --git a/onnxruntime/core/providers/cpu/tensor/cast_op.cc b/onnxruntime/core/providers/cpu/tensor/cast_op.cc
index 35f3b12aeba35..639a49cb43a4f 100644
--- a/onnxruntime/core/providers/cpu/tensor/cast_op.cc
+++ b/onnxruntime/core/providers/cpu/tensor/cast_op.cc
@@ -254,11 +254,32 @@ struct TensorCasterNoSat {
// tensor MLFloat16 -> float
template <>
struct TensorCaster<MLFloat16, float> {
- void Cast(const OpKernelContext&, const TensorShape& shape, const Tensor& in, Tensor& out) const {
+ void Cast(const OpKernelContext& ctx, const TensorShape& shape, const Tensor& in, Tensor& out) const {
auto out_data = out.MutableData<float>();
auto in_data = in.Data<MLFloat16>();
const size_t shape_size = narrow<size_t>(shape.Size());
- MlasConvertHalfToFloatBuffer(in_data, out_data, shape_size);
+
+ // Only use the thread pool when the tensor is large enough to amortize the threading overhead.
+ if (shape_size <= 128000) {
+ MlasConvertHalfToFloatBuffer(in_data, out_data, shape_size);
+ return;
+ }
+ // Estimate the number of compute cycles for the available SIMD implementation.
+ const auto& cpu_info = CPUIDInfo::GetCPUIDInfo();
+ double num_compute_cycles;
+ if (cpu_info.HasAVX2()) {
+ num_compute_cycles = static_cast<double>(shape_size >> 2);
+ } else if (cpu_info.HasSSE3()) {
+ num_compute_cycles = static_cast<double>(shape_size >> 1);
+ } else {
+ num_compute_cycles = static_cast<double>(shape_size * 10);
+ }
+
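+ // Cost hint for the thread pool: roughly 2 bytes read per fp16 element and 4 bytes written per
+ // float element, plus the estimated compute cycles, so it can decide how to partition the work.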
+ concurrency::ThreadPool::TryParallelFor(ctx.GetOperatorThreadPool(), shape_size,
+ {shape_size * 2.f, shape_size * 4.f, num_compute_cycles},
+ [in_data, out_data](std::ptrdiff_t first_span, std::ptrdiff_t last_span) {
+ MlasConvertHalfToFloatBuffer(in_data + first_span, out_data + first_span, static_cast<size_t>(last_span - first_span));
+ });
}
};
diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh
index 55935a9eae86d..2d2551a156099 100644
--- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh
+++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh
@@ -441,6 +441,9 @@ __device__ __inline__ T _Sign(T a) { return _Signum(a, std::is_signed()); }
template <>
__device__ __inline__ half _Sign(half a) { return _Signum(a, std::true_type()); }
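+// BFloat16 is a class type rather than a built-in arithmetic type, so force the signed _Signum
+// overload explicitly instead of relying on std::is_signed, which would be false for it.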
+template <>
+__device__ __inline__ BFloat16 _Sign(BFloat16 a) { return _Signum(static_cast<float>(a), std::true_type()); }
+
template
__device__ __inline__ T _Normcdf(T a);
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index 54fb4429c0536..886fddd8f8a27 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -1013,6 +1013,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, Abs);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, Abs);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, Abs);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, BFloat16, Abs);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int8_t, Neg);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int16_t, Neg);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int32_t, Neg);
@@ -1188,6 +1189,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, Sign);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, Sign);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, Sign);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, BFloat16, Sign);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 13, BFloat16, Add);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 13, BFloat16, Sub);
@@ -1996,6 +1998,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
@@ -2169,6 +2172,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc
index fb03b4326c4e8..86a1b0f5b6102 100644
--- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc
+++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc
@@ -213,19 +213,19 @@ Status IsNaN::ComputeInternal(OpKernelContext* context) const {
UNARY_OP_TYPED(name, ver, float) \
UNARY_OP_TYPED(name, ver, double)
-#define UNARY_OP_CSILHFD(name, ver) \
+#define UNARY_OP_CSILHFDX(name, ver) \
UNARY_OP_TYPED(name, ver, int8_t) \
UNARY_OP_TYPED(name, ver, int16_t) \
UNARY_OP_TYPED(name, ver, int32_t) \
UNARY_OP_TYPED(name, ver, int64_t) \
UNARY_OP_HFDX(name, ver)
-#define UNARY_OP_BWUZCSILHFD(name, ver) \
- UNARY_OP_TYPED(name, ver, uint8_t) \
- UNARY_OP_TYPED(name, ver, uint16_t) \
- UNARY_OP_TYPED(name, ver, uint32_t) \
- UNARY_OP_TYPED(name, ver, uint64_t) \
- UNARY_OP_CSILHFD(name, ver)
+#define UNARY_OP_BWUZCSILHFDX(name, ver) \
+ UNARY_OP_TYPED(name, ver, uint8_t) \
+ UNARY_OP_TYPED(name, ver, uint16_t) \
+ UNARY_OP_TYPED(name, ver, uint32_t) \
+ UNARY_OP_TYPED(name, ver, uint64_t) \
+ UNARY_OP_CSILHFDX(name, ver)
UNARY_OP_VERSIONED_BWUZCSILHFD(Abs, 6, 12)
UNARY_OP_VERSIONED_CSILHFD(Neg, 6, 12)
@@ -237,8 +237,8 @@ UNARY_OP_VERSIONED_HFD(Log, 6, 12)
UNARY_OP_VERSIONED_HFD(Exp, 6, 12)
UNARY_OP_VERSIONED_HFD(Erf, 9, 12)
-UNARY_OP_BWUZCSILHFD(Abs, 13)
-UNARY_OP_CSILHFD(Neg, 13)
+UNARY_OP_BWUZCSILHFDX(Abs, 13)
+UNARY_OP_CSILHFDX(Neg, 13)
UNARY_OP_HFD(Floor, 13)
UNARY_OP_HFD(Ceil, 13)
UNARY_OP_HFD(Reciprocal, 13)
@@ -246,7 +246,7 @@ UNARY_OP_HFDX(Sqrt, 13)
UNARY_OP_HFD(Log, 13)
UNARY_OP_HFDX(Exp, 13)
UNARY_OP_HFDX(Erf, 13)
-UNARY_OP_BWUZCSILHFD(Sign, 13)
+UNARY_OP_BWUZCSILHFDX(Sign, 13)
UNARY_LOGICALOP_NOT_TYPED(1, bool)
UNARY_OP_HFD(Round, 11)
diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
index 3d66003fb2bca..77579dfc793ee 100644
--- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
+++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
@@ -47,6 +47,7 @@ OpBuilderRegistrations::OpBuilderRegistrations() {
CreateSimpleOpBuilder("Sin", *this);
CreateSimpleOpBuilder("Sqrt", *this);
CreateSimpleOpBuilder("Sub", *this);
+ CreateSimpleOpBuilder("Sum", *this);
CreateSimpleOpBuilder("Tanh", *this);
CreateSimpleOpBuilder("Concat", *this);
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
index a79f4056083c5..df9d0de8e0e3e 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
@@ -158,6 +158,7 @@ class BaseOpBuilder : public IOpBuilder {
{"Softmax", QNN_OP_SOFTMAX},
{"Sqrt", QNN_OP_ELEMENT_WISE_SQUARE_ROOT},
{"Sub", QNN_OP_ELEMENT_WISE_SUBTRACT},
+ {"Sum", QNN_OP_ELEMENT_WISE_ADD},
{"Tanh", QNN_OP_TANH},
{"Transpose", QNN_OP_TRANSPOSE},
{"GridSample", QNN_OP_GRID_SAMPLE},
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
index 48c637cd2e951..229d86082f6dc 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
@@ -56,11 +56,18 @@ Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper,
padding_mode.c_str());
}
- // ONNX's Min and Max operators accept a variable number of inputs (i.e., variadic).
- // However, QNN's Min and Max operators must take in exactly two inputs.
+ // ONNX's Min, Max, and Sum operators accept a variable number of inputs (i.e., variadic).
+ // However, QNN's Min, Max, and Add operators must take in exactly two inputs.
if (op_type == "Min" || op_type == "Max") {
ORT_RETURN_IF_NOT(node_unit.Inputs().size() == 2,
- "QNN EP only supports Min and Max operators with exactly 2 inputs.");
+ "QNN EP only supports ", op_type.c_str(), " operator with exactly 2 inputs.");
+ }
+
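+ // ONNX Sum maps to QNN_OP_ELEMENT_WISE_ADD (see base_op_builder.h), which is strictly binary,
+ // so only the two-input form of Sum is supported.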
+ if (op_type == "Sum") {
+ size_t inputs_num = node_unit.Inputs().size();
+ ORT_RETURN_IF_NOT(inputs_num == 2,
+ "QNN EP supports Sum operator with QNN_OP_ELEMENT_WISE_ADD, which takes exactly 2 inputs. Got ONNX's Sum operator with ",
+ std::to_string(inputs_num).c_str(), " inputs.");
}
if (op_type == "DequantizeLinear") {
diff --git a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc
index 13004af25726d..6891b8159b090 100644
--- a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc
+++ b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc
@@ -4,15 +4,18 @@
#include "core/providers/common.h"
#include "core/providers/webgpu/math/binary_elementwise_ops.h"
#include "core/providers/webgpu/shader_helper.h"
+#include "core/providers/webgpu/string_macros.h"
#include "core/providers/webgpu/webgpu_supported_types.h"
namespace onnxruntime {
namespace webgpu {
Status BinaryElementwiseProgram::GenerateShaderCode(ShaderHelper& shader) const {
- const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
- const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
+ const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias);
+ const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias);
const auto& c = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
+ shader.AdditionalImplementation() << additional_impl_;
+
shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size");
// check whether can use element-wise mode.
@@ -142,8 +145,15 @@ Status BinaryElementwise::ComputeInternal(ComputeContext& context) const {
}
uint32_t vec_size = onnxruntime::narrow((size + 3) / 4);
+
+ std::string additional_impl;
+ if (get_additional_impl_) {
+ additional_impl = get_additional_impl_(lhs_tensor->GetElementType(), rhs_tensor->GetElementType());
+ }
+
BinaryElementwiseProgram program{kernel_name_,
expression_,
+ additional_impl,
is_broadcast,
is_lhs_scalar,
is_rhs_scalar,
@@ -273,7 +283,28 @@ WEBGPU_BINARY_VERSIONED_KERNEL(Sub, 7, 12, Sub, WebGpuSupportedNumberTypes())
WEBGPU_BINARY_VERSIONED_KERNEL(Sub, 13, 13, Sub, WebGpuSupportedNumberTypes())
WEBGPU_BINARY_KERNEL(Sub, 14, Sub, WebGpuSupportedNumberTypes())
-WEBGPU_BINARY_IMPL(Pow, "output_value_t(pow(vec4(a), vec4(b)))")
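+// WGSL's built-in pow() is not defined for negative bases, so Pow is emulated with a helper that
+// returns 1 for a zero exponent, NaN for a negative base with a fractional exponent, and applies
+// the base's sign for odd integer exponents. For int32 inputs the result is rounded back to an integer.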
+std::string GetPowImpl(int lhs_element_type, int /* rhs_element_type */) {
+ SS(s, 1024);
+ std::string round_str;
+ if (lhs_element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32) {
+ round_str = "round";
+ }
+
+ s << "fn pow_custom(a : input_a_element_t, b : f32) -> input_a_element_t {\n"
+ " if (b == 0.0) {\n"
+ " return input_a_element_t(1.0);\n"
+ " } else if (a < input_a_element_t(0.0) && b != floor(b)) {\n"
+ " return input_a_element_t(pow(f32(a), b)); // NaN\n"
+ " }\n"
+ << " return select(sign(a), input_a_element_t(1.0), round(abs(b) % 2.0) != 1.0) * input_a_element_t(" << round_str << "(pow(f32(abs(a)), b)));\n"
+ << "}\n"
+ "fn pow_v(a : vec4, b : vec4) -> vec4 {\n"
+ " return vec4(pow_custom(a.x, f32(b.x)), pow_custom(a.y, f32(b.y)), pow_custom(a.z, f32(b.z)), pow_custom(a.w, f32(b.w)));\n"
+ "}\n";
+ return SS_GET(s);
+}
+
+WEBGPU_BINARY_IMPL(Pow, "pow_v(a, b)", GetPowImpl)
WEBGPU_BINARY_VERSIONED_KERNEL(Pow, 7, 11, Pow, WebGpuSupportedNumberTypes())
WEBGPU_BINARY_VERSIONED_KERNEL_2(Pow, 12, 12, Pow, WebGpuSupportedNumberTypes(), WebGpuSupportedNumberTypes())
WEBGPU_BINARY_VERSIONED_KERNEL_2(Pow, 13, 14, Pow, WebGpuSupportedNumberTypes(), WebGpuSupportedNumberTypes())
diff --git a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h
index 84cbcdf3244d8..f80accfb934f8 100644
--- a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h
+++ b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h
@@ -14,11 +14,13 @@ class BinaryElementwiseProgram final : public Program
public:
BinaryElementwiseProgram(const std::string& kernel_name,
const std::string& expression,
+ const std::string& additional_impl,
const bool is_broadcast,
const bool is_lhs_scalar,
const bool is_rhs_scalar,
const bool vectorize) : Program{kernel_name},
expression_{expression},
+ additional_impl_{additional_impl},
is_broadcast_{is_broadcast},
is_lhs_scalar_{is_lhs_scalar},
is_rhs_scalar_{is_rhs_scalar},
@@ -29,7 +31,8 @@ class BinaryElementwiseProgram final : public Program
WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"vec_size", ProgramUniformVariableDataType::Uint32});
private:
- std::string expression_;
+ std::string_view expression_;
+ std::string_view additional_impl_;
bool is_broadcast_;
bool is_lhs_scalar_;
bool is_rhs_scalar_;
@@ -38,11 +41,15 @@ class BinaryElementwiseProgram final : public Program
class BinaryElementwise : public WebGpuKernel {
public:
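+ // Optional hook that returns extra WGSL helper code (e.g., the custom Pow implementation) for the
+ // given input element types; the returned snippet is injected via ShaderHelper::AdditionalImplementation().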
+ using GetAdditionalImplementationFunction = std::string (*)(int lhs_element_type, int rhs_element_type);
+
BinaryElementwise(const OpKernelInfo& info,
const std::string& kernel_name,
- const std::string& expression) : WebGpuKernel{info},
- kernel_name_{kernel_name},
- expression_{expression} {}
+ const std::string& expression,
+ const GetAdditionalImplementationFunction get_additional_impl = nullptr) : WebGpuKernel{info},
+ kernel_name_{kernel_name},
+ expression_{expression},
+ get_additional_impl_{get_additional_impl} {}
protected:
Status ComputeInternal(ComputeContext& context) const final;
@@ -50,6 +57,7 @@ class BinaryElementwise : public WebGpuKernel {
private:
std::string kernel_name_;
std::string expression_;
+ const GetAdditionalImplementationFunction get_additional_impl_;
};
} // namespace webgpu
diff --git a/onnxruntime/core/providers/webgpu/math/cum_sum.cc b/onnxruntime/core/providers/webgpu/math/cum_sum.cc
new file mode 100644
index 0000000000000..bc4cd70a238fc
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/math/cum_sum.cc
@@ -0,0 +1,98 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/webgpu/math/cum_sum.h"
+#include "core/providers/webgpu/shader_helper.h"
+#include "core/providers/webgpu/webgpu_supported_types.h"
+
+namespace onnxruntime {
+namespace webgpu {
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+ CumSum,
+ kOnnxDomain,
+ 11, 13,
+ kWebGpuExecutionProvider,
+ (*KernelDefBuilder::Create())
+ .TypeConstraint("T", WebGpuSupportedFloatTypes())
+ .TypeConstraint("T2", {DataTypeImpl::GetTensorType(),
+ DataTypeImpl::GetTensorType()})
+ .InputMemoryType(OrtMemTypeCPU, 1),
+ CumSum);
+
+ONNX_OPERATOR_KERNEL_EX(
+ CumSum,
+ kOnnxDomain,
+ 14,
+ kWebGpuExecutionProvider,
+ (*KernelDefBuilder::Create())
+ .TypeConstraint("T", WebGpuSupportedFloatTypes())
+ .TypeConstraint("T2", {DataTypeImpl::GetTensorType(),
+ DataTypeImpl::GetTensorType()})
+ .InputMemoryType(OrtMemTypeCPU, 1),
+ CumSum);
+
+Status CumSumProgram::GenerateShaderCode(ShaderHelper& shader) const {
+ const ShaderVariableHelper& input = shader.AddInput("input", ShaderUsage::UseUniform);
+ const ShaderVariableHelper& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
+
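+ // Each invocation produces one output element by accumulating the input along `axis` over the
+ // [first, last) range implied by the `exclusive` and `reverse` uniforms.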
+ shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size")
+ << "var input_indices = " << input.OffsetToIndices("global_idx") << ";\n"
+ << "var sum : output_value_t = 0;\n"
+ << "var first : i32 = 0;\n"
+ << "if (uniforms.reverse == 1) {\n"
+ << " first = i32(" + input.IndicesGet("input_indices", "uniforms.axis") + ");\n"
+ << " if (uniforms.exclusive == 1) { first += 1; }\n"
+ << "}\n\n"
+ << "var last : i32 = 0;\n"
+ << "if (uniforms.reverse == 1) {\n"
+ << " last = i32(" << GetElementAt("uniforms.input_shape", "uniforms.axis", input.Rank()) << ");\n"
+ << "} else {\n"
+ << " last = i32(" + input.IndicesGet("input_indices", "uniforms.axis") + ");\n"
+ << " if (uniforms.exclusive == 0) { last += 1; }\n"
+ << "}\n\n"
+ << "for (var i : i32 = first; i < last; i++) {\n"
+ << " " << input.IndicesSet("input_indices", "uniforms.axis", "u32(i)") << ";\n"
+ << " sum = sum + " << input.GetByIndices("input_indices") << ";\n"
+ << "}\n"
+ << output.SetByOffset("global_idx", "sum");
+
+ return Status::OK();
+}
+
+Status CumSum::ComputeInternal(ComputeContext& context) const {
+ const auto* input_tensor = context.Input(0);
+ const TensorShape& input_shape = input_tensor->Shape();
+ int64_t input_rank = input_shape.NumDimensions();
+
+ const auto* axis_tensor = context.Input(1);
+ const auto* axis_data = axis_tensor->Data();
+ int64_t axis = static_cast(axis_data[0]);
+
+ ORT_ENFORCE(-input_rank <= axis && axis < input_rank, "Axis input must be in the range [-input_rank, input_rank).");
+ // Handle negative axis
+ if (axis < 0) {
+ axis += input_rank;
+ }
+
+ auto* output_tensor = context.Output(0, input_shape);
+ int64_t output_size = output_tensor->Shape().Size();
+
+ if (output_size == 0) {
+ return Status::OK();
+ }
+
+ CumSumProgram program{};
+ program
+ .AddInput({input_tensor})
+ .AddOutput({output_tensor, ProgramTensorMetadataDependency::TypeAndRank})
+ .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
+ .AddUniformVariables({{static_cast<uint32_t>(output_size)},
+ {static_cast<uint32_t>(axis)},
+ {static_cast<uint32_t>(exclusive_)},
+ {static_cast<uint32_t>(reverse_)}});
+ return context.RunProgram(program);
+}
+
+} // namespace webgpu
+} // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/providers/webgpu/math/cum_sum.h b/onnxruntime/core/providers/webgpu/math/cum_sum.h
new file mode 100644
index 0000000000000..6a66ee0ed7b04
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/math/cum_sum.h
@@ -0,0 +1,39 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/providers/webgpu/webgpu_kernel.h"
+#include "core/providers/webgpu/program.h"
+
+namespace onnxruntime {
+namespace webgpu {
+
+class CumSumProgram final : public Program {
+ public:
+ CumSumProgram() : Program{"CumSum"} {}
+
+ Status GenerateShaderCode(ShaderHelper& sh) const override;
+
+ WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32},
+ {"axis", ProgramUniformVariableDataType::Uint32},
+ {"exclusive", ProgramUniformVariableDataType::Uint32},
+ {"reverse", ProgramUniformVariableDataType::Uint32});
+};
+
+class CumSum final : public WebGpuKernel {
+ public:
+ CumSum(const OpKernelInfo& info) : WebGpuKernel(info) {
+ exclusive_ = info.GetAttrOrDefault("exclusive", 0);
+ reverse_ = info.GetAttrOrDefault("reverse", 0);
+ }
+
+ Status ComputeInternal(ComputeContext& context) const override;
+
+ private:
+ int64_t exclusive_;
+ int64_t reverse_;
+};
+
+} // namespace webgpu
+} // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
index 1a56cafdb3952..11fa30c798809 100644
--- a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
@@ -11,7 +11,7 @@
namespace onnxruntime {
namespace webgpu {
-#define REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceOp, begin, end) \
+#define REGISTER_REDUCE_VERSIONED_KERNEL(ReduceOp, begin, end) \
ONNX_OPERATOR_VERSIONED_KERNEL_EX( \
ReduceOp, \
kOnnxDomain, \
@@ -20,7 +20,16 @@ namespace webgpu {
(*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()), \
ReduceOp);
-#define REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceOp, version) \
+#define REGISTER_REDUCE_VERSIONED_KERNEL_WITH_AXIS_IN_INPUT(ReduceOp, begin, end) \
+ ONNX_OPERATOR_VERSIONED_KERNEL_EX( \
+ ReduceOp, \
+ kOnnxDomain, \
+ begin, end, \
+ kWebGpuExecutionProvider, \
+ (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()).InputMemoryType(OrtMemTypeCPUInput, 1), \
+ ReduceOp);
+
+#define REGISTER_REDUCE_KERNEL(ReduceOp, version) \
ONNX_OPERATOR_KERNEL_EX( \
ReduceOp, \
kOnnxDomain, \
@@ -29,58 +38,66 @@ namespace webgpu {
(*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()).InputMemoryType(OrtMemTypeCPUInput, 1), \
ReduceOp);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMean, 18);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMean, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMean, 11, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMean, 13, 17);
+REGISTER_REDUCE_KERNEL(ReduceMean, 18);
+
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 11, 11);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 12, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 13, 17);
+REGISTER_REDUCE_VERSIONED_KERNEL_WITH_AXIS_IN_INPUT(ReduceMax, 18, 19);
+REGISTER_REDUCE_KERNEL(ReduceMax, 20);
+
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 11, 11);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 12, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 13, 17);
+REGISTER_REDUCE_VERSIONED_KERNEL_WITH_AXIS_IN_INPUT(ReduceMin, 18, 19);
+REGISTER_REDUCE_KERNEL(ReduceMin, 20);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 11, 11);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 12, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 18, 19);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMax, 20);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSum, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSum, 11, 12);
+REGISTER_REDUCE_KERNEL(ReduceSum, 13);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 11, 11);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 12, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 18, 19);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMin, 20);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceProd, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceProd, 11, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceProd, 13, 17);
+REGISTER_REDUCE_KERNEL(ReduceProd, 18);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSum, 13);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL1, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL1, 11, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL1, 13, 17);
+REGISTER_REDUCE_KERNEL(ReduceL1, 18);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceProd, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceProd, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceProd, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceProd, 18);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL2, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL2, 11, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL2, 13, 17);
+REGISTER_REDUCE_KERNEL(ReduceL2, 18);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL1, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL1, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL1, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceL1, 18);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSum, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSum, 11, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSum, 13, 17);
+REGISTER_REDUCE_KERNEL(ReduceLogSum, 18);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL2, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL2, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL2, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceL2, 18);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSumSquare, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSumSquare, 11, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSumSquare, 13, 17);
+REGISTER_REDUCE_KERNEL(ReduceSumSquare, 18);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSum, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSum, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSum, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceLogSum, 18);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSumExp, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSumExp, 11, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSumExp, 13, 17);
+REGISTER_REDUCE_KERNEL(ReduceLogSumExp, 18);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSumSquare, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSumSquare, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSumSquare, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSumSquare, 18);
+REGISTER_REDUCE_VERSIONED_KERNEL(ArgMax, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ArgMax, 11, 12);
+REGISTER_REDUCE_KERNEL(ArgMax, 13);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSumExp, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSumExp, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSumExp, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceLogSumExp, 18);
+REGISTER_REDUCE_VERSIONED_KERNEL(ArgMin, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ArgMin, 11, 12);
+REGISTER_REDUCE_KERNEL(ArgMin, 13);
Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
@@ -105,6 +122,9 @@ Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
std::stringstream ss;
std::string index = "i" + std::to_string(i);
ss << "for (var " << index << " : u32 = 0; " << index << " < " << input.IndicesGet("uniforms.input_shape", i) << "; " << index << "++) {\n";
+ if (loop_body.find("last_index") != std::string::npos) {
+ ss << "let last_index = " + index + ";\n";
+ }
ss << input.IndicesSet("input_indices", i, index) << ";\n";
ss << loop_body << "\n";
ss << "}\n";
@@ -171,12 +191,13 @@ Status ReduceKernel::ComputeInternal(ComputeContext& context)
auto output = context.Output(0, input_tensor->Shape());
// We need to run the operation even for scalar inputs for these ops
const auto code = GetOpSpecificCode(input_tensor);
+ constexpr uint32_t output_size = 1;
+ constexpr uint32_t reduce_axes = 0;
ReduceKernelProgram program(name_, keepdims_, noop_with_empty_axes_, input_axes, code, false);
- std::vector reduce_axes = {0};
program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank})
.AddOutput({output, ProgramTensorMetadataDependency::TypeAndRank})
.SetDispatchGroupSize(1)
- .AddUniformVariables({{1}, {static_cast(noop_with_empty_axes_ ? 1 : 0)}, {reduce_axes}});
+ .AddUniformVariables({{output_size}, {static_cast<uint32_t>(noop_with_empty_axes_ ? 1 : 0)}, {reduce_axes}});
return context.RunProgram(program);
} else {
// For other ops, or when axes is empty with noop_with_empty_axes_ true, just copy the input
@@ -328,5 +349,25 @@ ReduceOpSpecificCode ReduceLogSumExp::GetOpSpecificCode(const Tensor* input_tens
return code;
}
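+
+// ArgMin/ArgMax reuse the generic reduce loop: the header seeds the best element and index, the body
+// updates them via `last_index`, and select_last_index switches the comparison to <=/>= so ties keep
+// the last matching index.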
+ReduceOpSpecificCode ArgMin::GetOpSpecificCode(const Tensor* input_tensor) const {
+ ORT_UNUSED_PARAMETER(input_tensor);
+ std::string op = (select_last_index_) ? "<=" : "<";
+ std::string loop_header = "var best_element = first_element; var best_index = u32(0);";
+ std::string loop_body = "if (current_element " + op + " best_element) { best_element = current_element; best_index = last_index; };";
+ std::string loop_footer = "let output_value = output_value_t(best_index);";
+ ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
+ return code;
+}
+
+ReduceOpSpecificCode ArgMax::GetOpSpecificCode(const Tensor* input_tensor) const {
+ ORT_UNUSED_PARAMETER(input_tensor);
+ std::string op = (select_last_index_) ? ">=" : ">";
+ std::string loop_header = "var best_element = first_element; var best_index = u32(0);";
+ std::string loop_body = "if (current_element " + op + " best_element) { best_element = current_element; best_index = last_index; };";
+ std::string loop_footer = "let output_value = output_value_t(best_index);";
+ ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
+ return code;
+}
+
} // namespace webgpu
-} // namespace onnxruntime
\ No newline at end of file
+} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
index 291d931f41c05..70ae6d3c71eb9 100644
--- a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
+++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
@@ -119,5 +119,17 @@ class ReduceLogSumExp final : public ReduceKernel {
ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
};
+class ArgMin final : public ReduceKernel {
+ public:
+ ArgMin(const OpKernelInfo& info) : ReduceKernel(info, "ArgMin", true) {}
+ ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
+};
+
+class ArgMax final : public ReduceKernel {
+ public:
+ ArgMax(const OpKernelInfo& info) : ReduceKernel(info, "ArgMax", true) {}
+ ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
+};
+
} // namespace webgpu
} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
index dfb2e4b6ce665..aacbcc5fb4f0a 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -297,12 +297,12 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13,
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 12, MatMul);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, MatMul);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, float, ArgMax);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, float, ArgMax);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, float, ArgMax);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, float, ArgMin);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, float, ArgMin);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, float, ArgMin);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ArgMax);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ArgMax);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ArgMax);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ArgMin);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ArgMin);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ArgMin);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Softmax);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, Softmax);
@@ -624,13 +624,13 @@ std::unique_ptr RegisterKernels() {
// BuildKernelCreateInfo,
// BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
@@ -713,8 +713,8 @@ std::unique_ptr RegisterKernels() {
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
// BuildKernelCreateInfo,
// BuildKernelCreateInfo,
// BuildKernelCreateInfo,
diff --git a/onnxruntime/core/providers/webnn/allocator.cc b/onnxruntime/core/providers/webnn/allocator.cc
index 9c5cd651e1f00..8cf5b8cd72a5c 100644
--- a/onnxruntime/core/providers/webnn/allocator.cc
+++ b/onnxruntime/core/providers/webnn/allocator.cc
@@ -16,7 +16,7 @@ void* WebNNTensorAllocator::Alloc(size_t size) {
// We don't need to transfer the tensor to an MLTensor, so we don't need to allocate an MLTensor id.
return nullptr;
}
- void* p = EM_ASM_PTR({ return Module.jsepReserveTensorId(); });
+ void* p = EM_ASM_PTR({ return Module.webnnReserveTensorId(); });
allocations_[p] = size;
stats_.num_allocs++;
stats_.bytes_in_use += SafeInt(size);
@@ -27,7 +27,7 @@ void WebNNTensorAllocator::Free(void* p) {
if (p == nullptr) {
return;
}
- EM_ASM({ Module.jsepReleaseTensorId($0); }, p);
+ EM_ASM({ Module.webnnReleaseTensorId($0); }, p);
size_t size = allocations_[p];
stats_.bytes_in_use -= size;
allocations_.erase(p);
diff --git a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc
index 6814b019f699c..08580ab2861d7 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc
@@ -54,9 +54,9 @@ Status ArgMaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const auto& op_type = node.OpType();
if (op_type == "ArgMax") {
- output = model_builder.GetBuilder().call("argMax", input, narrow(axis), options);
+ output = model_builder.GetBuilder().call("argMax", input, SafeInt(axis).Ref(), options);
} else if (op_type == "ArgMin") {
- output = model_builder.GetBuilder().call("argMin", input, narrow(axis), options);
+ output = model_builder.GetBuilder().call("argMin", input, SafeInt(axis).Ref(), options);
} else {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "ArgMaxMinOpBuilder, unknown op: ", op_type);
}
diff --git a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc
index 113cc3df5438d..63e2345243282 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include
#include
#include "core/providers/shared/utils/utils.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc
index 1bbe56ef9b477..ee2512ddd8b5a 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc
index 1361b7dd5c14b..4c393e8a9bdba 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc
index be30c5520d62e..99be8f75771ad 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/framework/tensorprotoutils.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
@@ -64,8 +63,8 @@ Status CumSumOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
options.set("label", node.Name());
emscripten::val output = emscripten::val::object();
- output = model_builder.GetBuilder().call("cumulativeSum", input, gsl::narrow(webnn_axis),
- options);
+ output = model_builder.GetBuilder().call("cumulativeSum", input,
+ SafeInt(webnn_axis).Ref(), options);
model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output));
return Status::OK();
}
diff --git a/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc
index 55746bb1f61f0..f3363b1e186d5 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc
index 1f51e26fecfa5..6cee04bac3e2b 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/framework/tensorprotoutils.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc
index 8402f05d8e234..3f813f08279e7 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/framework/tensorprotoutils.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc
index d0ece026a7048..c4ff280b95b6e 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc
index c5cc8e86bb308..1f24124745a19 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
@@ -86,9 +85,9 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
else if (extended_a_shape) {
std::vector new_shape;
for (size_t i = 0; i < b_shape.size() - 2; i++) {
- new_shape.push_back(narrow(b_shape[i]));
+ new_shape.push_back(SafeInt(b_shape[i]));
}
- new_shape.push_back(narrow(b_shape.back()));
+ new_shape.push_back(SafeInt(b_shape.back()));
output = model_builder.GetBuilder().call("reshape",
output,
emscripten::val::array(new_shape),
@@ -98,7 +97,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
else if (extended_b_shape) {
std::vector new_shape;
for (size_t i = 0; i < a_shape.size() - 1; i++) {
- new_shape.push_back(narrow(a_shape[i]));
+ new_shape.push_back(SafeInt(a_shape[i]));
}
output = model_builder.GetBuilder().call("reshape",
output,
diff --git a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc
index a090c21fe3356..5b57df7f184e7 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc
index d8373a45e4423..e8f26af928ab3 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc
index b338d27986279..79ad3574e07e9 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc
index 1bb6523c6f86a..ed62b2bd69618 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc
index 93ad933d71c34..b23fbeba1ddc8 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc
index 0a438e98ad737..2fc47430a1c66 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/framework/tensorprotoutils.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc
index 58515d2df54ec..eec6911a686cf 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc
@@ -4,7 +4,6 @@
#include
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/framework/tensorprotoutils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc
index 1688dfc97a0c4..0a84835ee9fc0 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc
@@ -89,7 +89,7 @@ Status RotaryEmbeddingOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_build
emscripten::val wnn_builder = model_builder.GetBuilder();
NodeAttrHelper helper(node);
- const bool interleaved = gsl::narrow_cast(helper.Get("interleaved", 0));
+ const bool interleaved = static_cast(helper.Get("interleaved", 0));
uint32_t num_heads = helper.Get("num_heads", 0);
uint32_t rotary_embedding_dim = helper.Get("rotary_embedding_dim", 0);
diff --git a/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc
index 4adc5920de7fa..468c0e24a3e88 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/framework/tensorprotoutils.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc
index b1b737b114998..0e754b53e78d1 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc
index 06dbacf995a28..21b44b1066694 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
@@ -75,8 +74,8 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
// Check that the splits evenly divide.
if (split_count > 0 && splits.empty() && input_shape[axis] % split_count != 0) {
// Divide inputs into variable size outputs:
- splits.insert(splits.end(), split_count - 1, narrow(input_shape[axis]) / split_count);
- splits.insert(splits.end(), narrow(input_shape[axis]) % split_count);
+ splits.insert(splits.end(), split_count - 1, SafeInt(input_shape[axis]) / split_count);
+ splits.insert(splits.end(), SafeInt(input_shape[axis]) % split_count);
}
if (splits.empty()) {
diff --git a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc
index a3be9d7e2ceee..5687b1133c628 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc
index 672a3a510d54d..259bb0552b7c7 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/framework/tensorprotoutils.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc
index ac440e0119bac..452071f469c4f 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
#include "core/providers/webnn/builders/model_builder.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc
index 0c818533918a4..f2092d6163713 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc
@@ -59,7 +59,7 @@ Status TriangularOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
std::vector unpacked_tensor;
ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(diagonal_tensor, unpacked_tensor));
const auto diagonal = *reinterpret_cast(unpacked_tensor.data());
- options.set("diagonal", narrow(diagonal));
+ options.set("diagonal", SafeInt(diagonal).Ref());
}
output = model_builder.GetBuilder().call("triangular", input, options);
diff --git a/onnxruntime/core/providers/webnn/builders/model.cc b/onnxruntime/core/providers/webnn/builders/model.cc
index 35964d85862e4..40fdfc609e6a1 100644
--- a/onnxruntime/core/providers/webnn/builders/model.cc
+++ b/onnxruntime/core/providers/webnn/builders/model.cc
@@ -8,7 +8,6 @@
#include "core/common/common.h"
#include "core/common/inlined_containers.h"
#include "core/common/logging/logging.h"
-#include "core/common/safeint.h"
#include "core/graph/onnx_protobuf.h"
#include "core/providers/common.h"
#include "model.h"
@@ -157,7 +156,7 @@ onnxruntime::common::Status Model::Compute(const InlinedHashMap& inputs,
const InlinedHashMap& outputs) {
- auto jsepEnsureTensor = emscripten::val::module_property("jsepEnsureTensor");
+ auto webnnEnsureTensor = emscripten::val::module_property("webnnEnsureTensor");
auto promises = emscripten::val::array();
for (const auto& [_, tensor] : inputs) {
emscripten::val shape = emscripten::val::array();
@@ -165,7 +164,7 @@ onnxruntime::common::Status Model::Dispatch(const InlinedHashMap(dim);
shape.call("push", dim_val);
}
- auto ml_tensor = jsepEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, true);
+ auto ml_tensor = webnnEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, true);
promises.call("push", ml_tensor);
}
for (const auto& [_, tensor] : outputs) {
@@ -174,7 +173,7 @@ onnxruntime::common::Status Model::Dispatch(const InlinedHashMap(dim);
shape.call("push", dim_val);
}
- auto ml_tensor = jsepEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, false);
+ auto ml_tensor = webnnEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, false);
promises.call("push", ml_tensor);
}
auto ml_tensors = emscripten::val::global("Promise").call("all", promises).await();
diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.cc b/onnxruntime/core/providers/webnn/builders/model_builder.cc
index 661b2ad7056c2..399cc5faf6273 100644
--- a/onnxruntime/core/providers/webnn/builders/model_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/model_builder.cc
@@ -9,7 +9,6 @@
#include "helper.h"
#include "op_builder_factory.h"
-#include "core/common/safeint.h"
#include "core/framework/tensorprotoutils.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
@@ -140,13 +139,13 @@ Status ModelBuilder::RegisterInitializers() {
ORT_RETURN_IF_ERROR(utils::GetExternalDataInfo(
tensor, graph_viewer_.ModelPath(), external_file_path, data_offset, tensor_byte_size));
- auto jsepRegisterMLConstant = emscripten::val::module_property("jsepRegisterMLConstant");
- operand = jsepRegisterMLConstant(emscripten::val(external_file_path),
- static_cast(data_offset),
- static_cast(tensor_byte_size),
- wnn_builder_,
- desc,
- should_convert_int64_to_int32);
+ auto webnnRegisterMLConstant = emscripten::val::module_property("webnnRegisterMLConstant");
+ operand = webnnRegisterMLConstant(emscripten::val(external_file_path),
+ static_cast(data_offset),
+ static_cast(tensor_byte_size),
+ wnn_builder_,
+ desc,
+ should_convert_int64_to_int32);
} else {
if (tensor.has_raw_data()) {
tensor_ptr = reinterpret_cast(const_cast(tensor.raw_data().c_str()));
@@ -288,7 +287,7 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i
desc.set("dataType", emscripten::val("int32"));
}
wnn_operands_.insert(std::make_pair(name, wnn_builder_.call("input", name, desc)));
- emscripten::val::module_property("jsepRegisterGraphInput")(name);
+ emscripten::val::module_property("webnnRegisterGraphInput")(name);
input_names_.push_back(name);
} else {
output_names_.push_back(name);
diff --git a/onnxruntime/core/providers/webnn/data_transfer.cc b/onnxruntime/core/providers/webnn/data_transfer.cc
index 44e9bf9edf3d9..aa85277b72453 100644
--- a/onnxruntime/core/providers/webnn/data_transfer.cc
+++ b/onnxruntime/core/providers/webnn/data_transfer.cc
@@ -29,11 +29,11 @@ common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const {
const auto& dst_device = dst.Location().device;
if (dst_device.Type() == OrtDevice::GPU) {
- EM_ASM({ Module.jsepUploadTensor($0, HEAPU8.subarray($1, $1 + $2)); }, dst_data, reinterpret_cast(src_data), bytes);
+ EM_ASM({ Module.webnnUploadTensor($0, HEAPU8.subarray($1, $1 + $2)); }, dst_data, reinterpret_cast(src_data), bytes);
} else {
- auto jsepDownloadTensor = emscripten::val::module_property("jsepDownloadTensor");
+ auto webnnDownloadTensor = emscripten::val::module_property("webnnDownloadTensor");
auto subarray = emscripten::typed_memory_view(bytes, static_cast(dst_data));
- jsepDownloadTensor(reinterpret_cast(src_data), subarray).await();
+ webnnDownloadTensor(reinterpret_cast(src_data), subarray).await();
}
}
diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc
index 7410ff66add30..2da7c6499933a 100644
--- a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc
+++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc
@@ -10,7 +10,6 @@
#include "core/framework/kernel_registry.h"
#include "core/graph/graph_viewer.h"
#include "core/session/onnxruntime_cxx_api.h"
-#include "core/common/safeint.h"
#include "core/providers/webnn/allocator.h"
#include "core/providers/webnn/data_transfer.h"
#include "core/providers/partitioning_utils.h"
@@ -284,7 +283,7 @@ class WebNNMemcpy : public OpKernel {
explicit WebNNMemcpy(const OpKernelInfo& info) : OpKernel(info) {}
Status Compute(OpKernelContext* context) const override {
- auto jsepEnsureTensor = emscripten::val::module_property("jsepEnsureTensor");
+ auto webnnEnsureTensor = emscripten::val::module_property("webnnEnsureTensor");
const auto* X = context->Input(0);
ORT_ENFORCE(X != nullptr, "Memcpy: input tensor is null");
auto* Y = context->Output(0, X->Shape());
@@ -294,10 +293,10 @@ class WebNNMemcpy : public OpKernel {
shape.call("push", SafeInt(dim).Ref());
}
- jsepEnsureTensor(emscripten::val::undefined(),
- reinterpret_cast(Y->MutableDataRaw()),
- Y->GetElementType(),
- shape, false)
+ webnnEnsureTensor(emscripten::val::undefined(),
+ reinterpret_cast(Y->MutableDataRaw()),
+ Y->GetElementType(),
+ shape, false)
.await();
const auto* data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc
index c828ae9400174..8d972f7d63bc1 100644
--- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc
+++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc
@@ -57,7 +57,7 @@ bool MaxPool::IsOnnxNodeSupported(const NodeUnit& node_unit,
// input of maxpool could be fp16/fp32/fp64,i8/u8 according to ONNX
if (x_type == nullptr ||
(x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT &&
-// because pool_fp16_op_test can be enabled by other preprocessor, for example, COREML_ENABLE_MLPROGRAM
+// because pool_fp16_op_test can be enabled by other preprocessor, for example, USE_COREML
#ifdef XNNPACK_FP16_SUPPORTED
x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 &&
#endif
diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py
index 6b5f7526cc506..785eb9c485d25 100644
--- a/onnxruntime/python/onnxruntime_inference_collection.py
+++ b/onnxruntime/python/onnxruntime_inference_collection.py
@@ -15,6 +15,9 @@
from onnxruntime.capi import _pybind_state as C
if typing.TYPE_CHECKING:
+ import numpy as np
+ import numpy.typing as npt
+
import onnxruntime
@@ -59,22 +62,22 @@ def export_adapter(self, file_path: os.PathLike):
"""
self._adapter.export_adapter(file_path)
- def get_format_version(self):
+ def get_format_version(self) -> int:
return self._adapter.format_version
- def set_adapter_version(self, adapter_version: int):
+ def set_adapter_version(self, adapter_version: int) -> None:
self._adapter.adapter_version = adapter_version
- def get_adapter_version(self):
+ def get_adapter_version(self) -> int:
return self._adapter.adapter_version
- def set_model_version(self, model_version: int):
+ def set_model_version(self, model_version: int) -> None:
self._adapter.model_version = model_version
- def get_model_version(self):
+ def get_model_version(self) -> int:
return self._adapter.model_version
- def set_parameters(self, params: dict[str, OrtValue]):
+ def set_parameters(self, params: dict[str, OrtValue]) -> None:
self._adapter.parameters = {k: v._ortvalue for k, v in params.items()}
def get_parameters(self) -> dict[str, OrtValue]:
@@ -174,27 +177,27 @@ def __init__(self):
self._sess = None
self._enable_fallback = True
- def get_session_options(self):
+ def get_session_options(self) -> onnxruntime.SessionOptions:
"Return the session options. See :class:`onnxruntime.SessionOptions`."
return self._sess_options
- def get_inputs(self):
+ def get_inputs(self) -> Sequence[onnxruntime.NodeArg]:
"Return the inputs metadata as a list of :class:`onnxruntime.NodeArg`."
return self._inputs_meta
- def get_outputs(self):
+ def get_outputs(self) -> Sequence[onnxruntime.NodeArg]:
"Return the outputs metadata as a list of :class:`onnxruntime.NodeArg`."
return self._outputs_meta
- def get_overridable_initializers(self):
+ def get_overridable_initializers(self) -> Sequence[onnxruntime.NodeArg]:
"Return the inputs (including initializers) metadata as a list of :class:`onnxruntime.NodeArg`."
return self._overridable_initializers
- def get_modelmeta(self):
+ def get_modelmeta(self) -> onnxruntime.ModelMetadata:
"Return the metadata. See :class:`onnxruntime.ModelMetadata`."
return self._model_meta
- def get_providers(self):
+ def get_providers(self) -> Sequence[str]:
"Return list of registered execution providers."
return self._providers
@@ -202,7 +205,7 @@ def get_provider_options(self):
"Return registered execution providers' configurations."
return self._provider_options
- def set_providers(self, providers=None, provider_options=None):
+ def set_providers(self, providers=None, provider_options=None) -> None:
"""
Register the input list of execution providers. The underlying session is re-created.
@@ -224,13 +227,13 @@ def set_providers(self, providers=None, provider_options=None):
# recreate the underlying C.InferenceSession
self._reset_session(providers, provider_options)
- def disable_fallback(self):
+ def disable_fallback(self) -> None:
"""
Disable session.run() fallback mechanism.
"""
self._enable_fallback = False
- def enable_fallback(self):
+ def enable_fallback(self) -> None:
"""
Enable session.Run() fallback mechanism. If session.Run() fails due to an internal Execution Provider failure,
reset the Execution Providers enabled for this session.
@@ -249,7 +252,7 @@ def _validate_input(self, feed_input_names):
f"Required inputs ({missing_input_names}) are missing from input feed ({feed_input_names})."
)
- def run(self, output_names, input_feed, run_options=None):
+ def run(self, output_names, input_feed, run_options=None) -> Sequence[np.ndarray | SparseTensor | list | dict]:
"""
Compute the predictions.
@@ -308,7 +311,7 @@ def callback(results: np.ndarray, user_data: MyData, err: str) -> None:
output_names = [output.name for output in self._outputs_meta]
return self._sess.run_async(output_names, input_feed, callback, user_data, run_options)
- def run_with_ort_values(self, output_names, input_dict_ort_values, run_options=None):
+ def run_with_ort_values(self, output_names, input_dict_ort_values, run_options=None) -> Sequence[OrtValue]:
"""
Compute the predictions.
@@ -367,7 +370,7 @@ def get_profiling_start_time_ns(self):
"""
return self._sess.get_profiling_start_time_ns
- def io_binding(self):
+ def io_binding(self) -> IOBinding:
"Return an onnxruntime.IOBinding object`."
return IOBinding(self)
@@ -550,7 +553,7 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi
self._provider_options = self._sess.get_provider_options()
self._profiling_start_time_ns = self._sess.get_profiling_start_time_ns
- def _reset_session(self, providers, provider_options):
+ def _reset_session(self, providers, provider_options) -> None:
"release underlying session object."
# meta data references session internal structures
# so they must be set to None to decrement _sess reference count.
@@ -721,7 +724,7 @@ class OrtValue:
This class provides APIs to construct and deal with OrtValues.
"""
- def __init__(self, ortvalue, numpy_obj=None):
+ def __init__(self, ortvalue: C.OrtValue, numpy_obj: np.ndarray | None = None):
if isinstance(ortvalue, C.OrtValue):
self._ortvalue = ortvalue
# Hold a ref count to the numpy object if the OrtValue is backed directly
@@ -733,11 +736,11 @@ def __init__(self, ortvalue, numpy_obj=None):
"`Provided ortvalue` needs to be of type `onnxruntime.capi.onnxruntime_pybind11_state.OrtValue`"
)
- def _get_c_value(self):
+ def _get_c_value(self) -> C.OrtValue:
return self._ortvalue
- @staticmethod
- def ortvalue_from_numpy(numpy_obj, device_type="cpu", device_id=0):
+ @classmethod
+ def ortvalue_from_numpy(cls, numpy_obj: np.ndarray, /, device_type="cpu", device_id=0) -> OrtValue:
"""
Factory method to construct an OrtValue (which holds a Tensor) from a given Numpy object
A copy of the data in the Numpy object is held by the OrtValue only if the device is NOT cpu
@@ -749,7 +752,7 @@ def ortvalue_from_numpy(numpy_obj, device_type="cpu", device_id=0):
# Hold a reference to the numpy object (if device_type is 'cpu') as the OrtValue
# is backed directly by the data buffer of the numpy object and so the numpy object
# must be around until this OrtValue instance is around
- return OrtValue(
+ return cls(
C.OrtValue.ortvalue_from_numpy(
numpy_obj,
C.OrtDevice(
@@ -761,8 +764,8 @@ def ortvalue_from_numpy(numpy_obj, device_type="cpu", device_id=0):
numpy_obj if device_type.lower() == "cpu" else None,
)
- @staticmethod
- def ortvalue_from_numpy_with_onnx_type(data, onnx_element_type: int):
+ @classmethod
+ def ortvalue_from_numpy_with_onnx_type(cls, data: np.ndarray, /, onnx_element_type: int) -> OrtValue:
"""
This method creates an instance of OrtValue on top of the numpy array.
No data copy is made and the lifespan of the resulting OrtValue should never
@@ -771,12 +774,14 @@ def ortvalue_from_numpy_with_onnx_type(data, onnx_element_type: int):
when we want to use an ONNX data type that is not supported by numpy.
:param data: numpy.ndarray.
- :param onnx_elemenet_type: a valid onnx TensorProto::DataType enum value
+ :param onnx_element_type: a valid onnx TensorProto::DataType enum value
"""
- return OrtValue(C.OrtValue.ortvalue_from_numpy_with_onnx_type(data, onnx_element_type), data)
+ return cls(C.OrtValue.ortvalue_from_numpy_with_onnx_type(data, onnx_element_type), data)
- @staticmethod
- def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu", device_id: int = 0):
+ @classmethod
+ def ortvalue_from_shape_and_type(
+ cls, shape: Sequence[int], element_type, device_type: str = "cpu", device_id: int = 0
+ ) -> OrtValue:
"""
Factory method to construct an OrtValue (which holds a Tensor) from given shape and element_type
@@ -788,7 +793,7 @@ def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu",
# Integer for onnx element type (see https://onnx.ai/onnx/api/mapping.html).
# This is helpful for some data type (like TensorProto.BFLOAT16) that is not available in numpy.
if isinstance(element_type, int):
- return OrtValue(
+ return cls(
C.OrtValue.ortvalue_from_shape_and_onnx_type(
shape,
element_type,
@@ -800,7 +805,7 @@ def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu",
)
)
- return OrtValue(
+ return cls(
C.OrtValue.ortvalue_from_shape_and_type(
shape,
element_type,
@@ -812,77 +817,77 @@ def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu",
)
)
- @staticmethod
- def ort_value_from_sparse_tensor(sparse_tensor):
+ @classmethod
+ def ort_value_from_sparse_tensor(cls, sparse_tensor: SparseTensor) -> OrtValue:
"""
The function will construct an OrtValue instance from a valid SparseTensor
The new instance of OrtValue will assume the ownership of sparse_tensor
"""
- return OrtValue(C.OrtValue.ort_value_from_sparse_tensor(sparse_tensor._get_c_tensor()))
+ return cls(C.OrtValue.ort_value_from_sparse_tensor(sparse_tensor._get_c_tensor()))
- def as_sparse_tensor(self):
+ def as_sparse_tensor(self) -> SparseTensor:
"""
The function will return SparseTensor contained in this OrtValue
"""
return SparseTensor(self._ortvalue.as_sparse_tensor())
- def data_ptr(self):
+ def data_ptr(self) -> int:
"""
Returns the address of the first element in the OrtValue's data buffer
"""
return self._ortvalue.data_ptr()
- def device_name(self):
+ def device_name(self) -> str:
"""
Returns the name of the device where the OrtValue's data buffer resides e.g. cpu, cuda, cann
"""
return self._ortvalue.device_name().lower()
- def shape(self):
+ def shape(self) -> Sequence[int]:
"""
Returns the shape of the data in the OrtValue
"""
return self._ortvalue.shape()
- def data_type(self):
+ def data_type(self) -> str:
"""
- Returns the data type of the data in the OrtValue
+ Returns the data type of the data in the OrtValue. E.g. 'tensor(int64)'
"""
return self._ortvalue.data_type()
- def element_type(self):
+ def element_type(self) -> int:
"""
Returns the proto type of the data in the OrtValue
if the OrtValue is a tensor.
"""
return self._ortvalue.element_type()
- def has_value(self):
+ def has_value(self) -> bool:
"""
Returns True if the OrtValue corresponding to an
optional type contains data, else returns False
"""
return self._ortvalue.has_value()
- def is_tensor(self):
+ def is_tensor(self) -> bool:
"""
Returns True if the OrtValue contains a Tensor, else returns False
"""
return self._ortvalue.is_tensor()
- def is_sparse_tensor(self):
+ def is_sparse_tensor(self) -> bool:
"""
Returns True if the OrtValue contains a SparseTensor, else returns False
"""
return self._ortvalue.is_sparse_tensor()
- def is_tensor_sequence(self):
+ def is_tensor_sequence(self) -> bool:
"""
Returns True if the OrtValue contains a Tensor Sequence, else returns False
"""
return self._ortvalue.is_tensor_sequence()
- def numpy(self):
+ def numpy(self) -> np.ndarray:
"""
Returns a Numpy object from the OrtValue.
Valid only for OrtValues holding Tensors. Throws for OrtValues holding non-Tensors.
@@ -890,7 +895,7 @@ def numpy(self):
"""
return self._ortvalue.numpy()
- def update_inplace(self, np_arr):
+ def update_inplace(self, np_arr) -> None:
"""
Update the OrtValue in place with a new Numpy array. The numpy contents
are copied over to the device memory backing the OrtValue. It can be used
@@ -948,7 +953,7 @@ class SparseTensor:
depending on the format
"""
- def __init__(self, sparse_tensor):
+ def __init__(self, sparse_tensor: C.SparseTensor):
"""
Internal constructor
"""
@@ -960,11 +965,17 @@ def __init__(self, sparse_tensor):
"`Provided object` needs to be of type `onnxruntime.capi.onnxruntime_pybind11_state.SparseTensor`"
)
- def _get_c_tensor(self):
+ def _get_c_tensor(self) -> C.SparseTensor:
return self._tensor
- @staticmethod
- def sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device):
+ @classmethod
+ def sparse_coo_from_numpy(
+ cls,
+ dense_shape: npt.NDArray[np.int64],
+ values: np.ndarray,
+ coo_indices: npt.NDArray[np.int64],
+ ort_device: OrtDevice,
+ ) -> SparseTensor:
"""
Factory method to construct a SparseTensor in COO format from given arguments
@@ -985,12 +996,17 @@ def sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device):
For strings and objects, it will create a copy of the arrays in CPU memory as ORT does not support those
on other devices and their memory can not be mapped.
"""
- return SparseTensor(
- C.SparseTensor.sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device._get_c_device())
- )
+ return cls(C.SparseTensor.sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device._get_c_device()))
- @staticmethod
- def sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices, ort_device):
+ @classmethod
+ def sparse_csr_from_numpy(
+ cls,
+ dense_shape: npt.NDArray[np.int64],
+ values: np.ndarray,
+ inner_indices: npt.NDArray[np.int64],
+ outer_indices: npt.NDArray[np.int64],
+ ort_device: OrtDevice,
+ ) -> SparseTensor:
"""
Factory method to construct a SparseTensor in CSR format from given arguments
@@ -1011,7 +1027,7 @@ def sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices, ort
For strings and objects, it will create a copy of the arrays in CPU memory as ORT does not support those
on other devices and their memory can not be mapped.
"""
- return SparseTensor(
+ return cls(
C.SparseTensor.sparse_csr_from_numpy(
dense_shape,
values,
@@ -1021,7 +1037,7 @@ def sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices, ort
)
)
- def values(self):
+ def values(self) -> np.ndarray:
"""
The method returns a numpy array that is backed by the native memory
if the data type is numeric. Otherwise, the returned numpy array that contains
@@ -1093,19 +1109,19 @@ def format(self):
"""
return self._tensor.format
- def dense_shape(self):
+ def dense_shape(self) -> npt.NDArray[np.int64]:
"""
Returns a numpy array(int64) containing a dense shape of a sparse tensor
"""
return self._tensor.dense_shape()
- def data_type(self):
+ def data_type(self) -> str:
"""
Returns a string data type of the data in the OrtValue
"""
return self._tensor.data_type()
- def device_name(self):
+ def device_name(self) -> str:
"""
Returns the name of the device where the SparseTensor data buffers reside e.g. cpu, cuda
"""
diff --git a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc
index 4611dc9082734..e22445edc0f5b 100644
--- a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc
+++ b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc
@@ -404,7 +404,7 @@ TYPED_TEST(LayerNormTest, LayerNorm17_opset) {
// Execution provider entry invalid.
// when other EPs support layer-norm fp16, this test should be updated to include them.
if (std::is_same::value) {
-#if !defined(COREML_ENABLE_MLPROGRAM)
+#if !defined(USE_COREML)
return;
#endif
}
diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
index b1779ded4a675..81323cb51a887 100644
--- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
@@ -389,6 +389,7 @@ TEST(MatMulNBits, Float32_Accuracy4) {
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
+ TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
@@ -458,6 +459,7 @@ TEST(MatMulNBits, Float16_Accuracy4) {
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
+ TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
@@ -528,8 +530,10 @@ TEST(MatMulNBits, Float16Cuda) {
for (auto K : {16, 32, 64, 128, 256, 1024, 93, 1234}) {
for (auto block_size : {16, 32, 64, 128}) {
for (auto has_gidx : has_gidx_options) {
-#ifdef USE_DML
+#if defined(USE_DML)
RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f);
+#elif defined(USE_WEBGPU)
+ RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.03f);
#else
RunTest(M, N, K, block_size, 0, false, true, has_gidx);
RunTest(M, N, K, block_size, 0, true, true, has_gidx, false);
diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
index a9aa78b7a3229..3505193b77683 100644
--- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc
+++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
@@ -246,7 +246,7 @@ TEST(CoreMLExecutionProviderTest, TestOrtFormatModel) {
#endif
}
-#if defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_COREML)
// Names in CoreML cannot start with [0-9] or contain anything but "[a-z][A-Z][0-9]_"
// Test that we fix invalid names in model inputs, initializers and outputs.
// This is only enforced for ML Program, so we only do name sanitization when creating an ML Program format model.
diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc
index 724118d7419d2..9201da348e75c 100644
--- a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc
+++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc
@@ -125,7 +125,7 @@ TEST_F(ActivationOpTest, Relu) {
{}, {},
/*is_tensorrt_supported=*/false,
/*opset_version= */ 14);
-#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML)
TestActivationOp(
"Relu",
input_values_fp16,
@@ -139,7 +139,7 @@ TEST_F(ActivationOpTest, Relu) {
#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED
}
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML)
TEST_F(ActivationOpTest, Sigmoid_fp16) {
#ifdef USE_CUDA
int min_cuda_architecture = 530;
@@ -413,7 +413,7 @@ TEST_F(ActivationOpTest, LeakyRelu) {
{{"alpha", alpha}}, {});
}
-#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML)
TEST_F(ActivationOpTest, LeakyRelu_fp16) {
OpTester test("LeakyRelu", 11);
float alpha = 0.01f; // oneDNN set alpha equal to 0.01
diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.h b/onnxruntime/test/providers/cpu/activation/activation_op_test.h
index 59813f433dc41..04d116e29d3b0 100644
--- a/onnxruntime/test/providers/cpu/activation/activation_op_test.h
+++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.h
@@ -105,7 +105,7 @@ class ActivationOpTest : public ::testing::Test {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution dist(low, high);
-#ifdef COREML_ENABLE_MLPROGRAM
+#ifdef USE_COREML
// please check onnxruntime/onnxruntime/core/providers/coreml/builders/helper.cc:81
std::vector batch_size_list = {1, 2, 4, 9, 100};
#else
diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
index d87ee861752c7..fbd9d10a56c77 100644
--- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
@@ -32,7 +32,7 @@ void TestBinaryFloat16(const char* op_name,
bool enable_bf16 = true) {
{
std::vector> execution_providers;
-#ifdef COREML_ENABLE_MLPROGRAM
+#ifdef USE_COREML
execution_providers.push_back(DefaultCoreMLExecutionProvider(true));
#elif USE_CUDA
execution_providers.push_back(DefaultCudaExecutionProvider());
@@ -76,7 +76,7 @@ void TestUnaryFloat16(const char* op_name,
bool run_bf16 = true) {
{
std::vector> execution_providers;
-#ifdef COREML_ENABLE_MLPROGRAM
+#ifdef USE_COREML
execution_providers.push_back(DefaultCoreMLExecutionProvider(true));
#elif USE_CUDA
execution_providers.push_back(DefaultCudaExecutionProvider());
@@ -968,8 +968,15 @@ TEST(MathOpTest, Abs) {
test.Run();
}
-#ifdef USE_DNNL
+#if defined(USE_CUDA) || defined(USE_DNNL)
TEST(MathOpTest, Abs_bfloat16) {
+#ifdef USE_CUDA
+ int min_cuda_architecture = 530;
+ if (!HasCudaEnvironment(min_cuda_architecture)) {
+ LOGS_DEFAULT(WARNING) << "Hardware does NOT support BF16";
+ return;
+ }
+#endif
#ifdef USE_DNNL
if (!DnnlHasBF16Support()) {
LOGS_DEFAULT(WARNING) << "Hardware does NOT support BF16";
@@ -980,9 +987,19 @@ TEST(MathOpTest, Abs_bfloat16) {
std::vector dims{2, 2};
test_bf16.AddInput("X", dims, MakeBFloat16({1.0f, -2.0f, -0.0f, -10.0f}));
test_bf16.AddOutput("Y", dims, MakeBFloat16({1.0f, 2.0f, 0.0f, 10.0f}));
- test_bf16.Run();
+
+ std::vector> execution_providers;
+#if defined(USE_CUDA)
+ execution_providers.push_back(DefaultCudaExecutionProvider());
+#endif
+
+#if defined(USE_DNNL)
+ execution_providers.push_back(DefaultDnnlExecutionProvider());
+#endif
+
+ test_bf16.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
}
-#endif // USE_DNNL
+#endif // USE_CUDA || USE_DNNL
TEST(MathOpTest, Abs_int8) {
OpTester test("Abs");
@@ -1409,7 +1426,7 @@ TEST(MathOpTest, Pow_float16_float16) {
dims, {1.0f, 256.0f, 2.0f, 1.0f}, false);
}
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML)
TEST(MathOpTest, Pow_float_float16) {
OpTester test("Pow", 12);
std::vector dims{4};
@@ -1423,7 +1440,7 @@ TEST(MathOpTest, Pow_float_float16) {
execution_providers.push_back(DefaultCudaExecutionProvider());
#elif USE_ROCM
execution_providers.push_back(DefaultRocmExecutionProvider());
-#elif COREML_ENABLE_MLPROGRAM
+#elif USE_COREML
execution_providers.push_back(DefaultCoreMLExecutionProvider(true));
#endif
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
diff --git a/onnxruntime/test/providers/cpu/math/matmul_test.cc b/onnxruntime/test/providers/cpu/math/matmul_test.cc
index 298e870f348fc..dd8cbed15e5ef 100644
--- a/onnxruntime/test/providers/cpu/math/matmul_test.cc
+++ b/onnxruntime/test/providers/cpu/math/matmul_test.cc
@@ -210,7 +210,7 @@ TEST(MathOpTest, MatMulFloatType) {
RunMatMulTest(7, false, true);
}
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) || defined(USE_XNNPACK)
TEST(MathOpTest, MatMulFloat16) {
#ifdef USE_CUDA
int min_cuda_architecture = 530;
@@ -276,7 +276,7 @@ TEST(MathOpTest, MatMulZeroKInt32Type) {
RunMatMulZeroKTest();
}
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) || defined(USE_XNNPACK)
TEST(MathOpTest, MatMul_Float16) {
#ifdef USE_CUDA
int min_cuda_architecture = 530;
diff --git a/onnxruntime/test/providers/cpu/math/sign_test.cc b/onnxruntime/test/providers/cpu/math/sign_test.cc
index a01c2b26ea8b5..0da6a2ed55f2c 100644
--- a/onnxruntime/test/providers/cpu/math/sign_test.cc
+++ b/onnxruntime/test/providers/cpu/math/sign_test.cc
@@ -207,7 +207,7 @@ TEST(MathOpTest, Sign_MLFloat16) {
// test.Run(OpTester::ExpectResult::kExpectSuccess);
//}
-#if defined(USE_DNNL)
+#if defined(USE_CUDA) || defined(USE_DNNL)
TEST(MathOpTest, Sign_bfloat16) {
#ifdef USE_DNNL
if (!DnnlHasBF16Support()) {
@@ -228,9 +228,15 @@ TEST(MathOpTest, Sign_bfloat16) {
TestImpl(input.cbegin(), input.cend(), std::back_inserter(output));
test.AddOutput("output", input_dims, output);
std::vector> execution_providers;
+
+#if defined(USE_CUDA)
+ execution_providers.push_back(DefaultCudaExecutionProvider());
+#endif
+
#if defined(USE_DNNL)
execution_providers.push_back(DefaultDnnlExecutionProvider());
-#endif // USE_DNNL
+#endif
+
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
}
#endif
diff --git a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc
index f8ebca5ff9a1b..a529d572d7cca 100644
--- a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc
@@ -704,7 +704,7 @@ TEST(BatchNormTest, NonSpatial_Complicated) {
}
// Only CUDA and ROCm kernels have float 16 support
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML)
TEST(BatchNormTest, BatchNorm2d_fp16) {
vector X{-0.91221f, -0.283559f, 0.937637f, 2.09818f, -0.100199f, -0.608113f, 0.444562f, -1.07505f, 0.940591f,
-0.922262f, 0.0931303f, 0.69611f, 1.55187f, 0.159808f, 0.914874f, -1.24856f, -1.98928f, -0.331621f,
diff --git a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
index 4253e36e02548..d1350db8ec12e 100644
--- a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
@@ -3,7 +3,7 @@
#include "core/mlas/inc/mlas.h"
-#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK)
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) || defined(USE_XNNPACK)
#include "gtest/gtest.h"
#include "test/providers/provider_test_utils.h"
@@ -30,7 +30,7 @@ struct ConvOpAndTestAttributes {
/*
Please notice that, we have predefined macros in the head of the file
-#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML)
When we have these two macro defines, this UT will turn into green light and work.
If attributes.activation is set the NhwcFusedConv contrib op is used.
diff --git a/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc
index ac517193a2c77..3d8d188867023 100644
--- a/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc
@@ -6,7 +6,7 @@
#include "test/common/tensor_op_test_utils.h"
#include "test/util/include/default_providers.h"
-#ifdef COREML_ENABLE_MLPROGRAM
+#ifdef USE_COREML
using namespace std;
namespace onnxruntime {
namespace test {
diff --git a/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc
index 341bb8a4fc957..46b74f2c2eb9d 100644
--- a/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc
@@ -121,7 +121,7 @@ TEST(InstanceNormalizationOpTest, InstanceNormBatch2) {
}
// Only CUDA and ROCm kernels have float 16 support
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML)
TEST(InstanceNormalizationOpTest, InstanceNormBatch1_fp16) {
OpTester test("InstanceNormalization");
diff --git a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc
index d4e0af5011525..c14fc1fb62ae5 100644
--- a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc
@@ -3,7 +3,7 @@
#include "core/mlas/inc/mlas.h"
-#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK)
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) || defined(USE_XNNPACK)
#include "core/providers/cpu/nn/pool.h"
#include "gtest/gtest.h"
diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
index 24a8c8491b632..f1d612276174f 100644
--- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
@@ -70,7 +70,7 @@ TEST(PoolTest, MaxPool) {
// Only CUDA kernel has float 16 support
// Disable for now, still investigating the issue with cudnn lib
-#if defined(USE_CUDA) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_COREML)
TEST(PoolTest, MaxPool_F16) {
#if defined(USE_CUDA)
int min_cuda_architecture = 530;
diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
index 4bc97d035c7f7..92cd82c2c9420 100644
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@@ -1375,7 +1375,7 @@ TEST(ReductionOpTest, ReduceMax_double) {
test.Run();
}
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML)
TEST(ReductionOpTest, ReduceMax_half) {
OpTester test("ReduceMax");
test.AddAttribute("axes", std::vector{1, 2});
@@ -2158,7 +2158,7 @@ TEST(ReductionOpTest, ReduceMin_double) {
test.Run();
}
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML)
TEST(ReductionOpTest, ReduceMin_half) {
OpTester test("ReduceMin");
test.AddAttribute("axes", std::vector{0, 2});
@@ -2356,7 +2356,7 @@ TEST(ReductionOpTest, ReduceSum_int32) {
test.Run();
}
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML)
TEST(ReductionOpTest, ReduceSumHalfHalf) {
OpTester test("ReduceSum");
test.AddAttribute("keepdims", (int64_t)0);
diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
index e261d66a0d22a..d62ffe644e4cc 100644
--- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
+++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
@@ -694,7 +694,9 @@
"^test_gelu_tanh_2_expanded_cpu",
"^test_dynamicquantizelinear_expanded_cpu",
"^test_center_crop_pad_crop_negative_axes_hwc*", // failed due to new types or shape infer with negative axis for CenterCropPad.
- "^test_center_crop_pad_crop_negative_axes_hwc_expanded*" // failed due to new types or shape infer with negative axis for CenterCropPad.
+ "^test_center_crop_pad_crop_negative_axes_hwc_expanded*", // failed due to new types or shape infer with negative axis for CenterCropPad.
+ "^test_reduce_max_empty_set",
+ "^test_reduce_min_empty_set"
],
"current_failing_tests_pure_DML": [
"^test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_cpu",
diff --git a/onnxruntime/wasm/pre-jsep.js b/onnxruntime/wasm/pre-jsep.js
index a11c6d741d110..cca8da0525fbe 100644
--- a/onnxruntime/wasm/pre-jsep.js
+++ b/onnxruntime/wasm/pre-jsep.js
@@ -97,41 +97,40 @@ Module["jsepInit"] = (name, params) => {
// Functions called via emscripten::val::module_property need to be assigned by name so that the minifier doesn't
// change the name.
+ const backend = params[0];
[
- Module.jsepBackend,
- Module.jsepReserveTensorId,
- Module.jsepReleaseTensorId,
- Module["jsepEnsureTensor"],
- Module.jsepUploadTensor,
- Module["jsepDownloadTensor"],
- ] = params;
+ Module.webnnReserveTensorId,
+ Module.webnnReleaseTensorId,
+ Module["webnnEnsureTensor"],
+ Module.webnnUploadTensor,
+ Module["webnnDownloadTensor"],
+ ] = params.slice(1);
// This function is called from both JS and an EM_ASM block, it needs both a minifiable name and an explicit name.
- Module["jsepReleaseTensorId"] = Module.jsepReleaseTensorId;
- Module["jsepUploadTensor"] = Module.jsepUploadTensor;
+ Module["webnnReleaseTensorId"] = Module.webnnReleaseTensorId;
+ Module["webnnUploadTensor"] = Module.webnnUploadTensor;
// Functions called from JS also need to have explicit names.
- const backend = Module.jsepBackend;
- Module["jsepOnRunStart"] = (sessionId) => {
+ Module["webnnOnRunStart"] = (sessionId) => {
return backend["onRunStart"](sessionId);
};
- Module["jsepOnRunEnd"] = backend["onRunEnd"].bind(backend);
- Module["jsepRegisterMLContext"] = (sessionId, mlContext) => {
+ Module["webnnOnRunEnd"] = backend["onRunEnd"].bind(backend);
+ Module["webnnRegisterMLContext"] = (sessionId, mlContext) => {
backend["registerMLContext"](sessionId, mlContext);
};
- Module["jsepOnReleaseSession"] = (sessionId) => {
+ Module["webnnOnReleaseSession"] = (sessionId) => {
backend["onReleaseSession"](sessionId);
};
- Module["jsepCreateMLTensorDownloader"] = (tensorId, type) => {
+ Module["webnnCreateMLTensorDownloader"] = (tensorId, type) => {
return backend["createMLTensorDownloader"](tensorId, type);
};
- Module["jsepRegisterMLTensor"] = (sessionId, tensor, dataType, shape) => {
+ Module["webnnRegisterMLTensor"] = (sessionId, tensor, dataType, shape) => {
return backend["registerMLTensor"](sessionId, tensor, dataType, shape);
};
- Module["jsepCreateMLContext"] = (optionsOrGpuDevice) => {
+ Module["webnnCreateMLContext"] = (optionsOrGpuDevice) => {
return backend["createMLContext"](optionsOrGpuDevice);
};
- Module["jsepRegisterMLConstant"] = (
+ Module["webnnRegisterMLConstant"] = (
externalFilePath,
dataOffset,
dataLength,
@@ -149,9 +148,12 @@ Module["jsepInit"] = (name, params) => {
shouldConvertInt64ToInt32,
);
};
- Module['jsepRegisterGraphInput'] = backend['registerGraphInput'].bind(backend);
- Module['jsepIsGraphInput'] = backend['isGraphInput'].bind(backend);
- Module['jsepCreateTemporaryTensor'] = backend['createTemporaryTensor'].bind(backend);
- Module['jsepIsInt64Supported'] = backend['isInt64Supported'].bind(backend);
+ Module["webnnRegisterGraphInput"] =
+ backend["registerGraphInput"].bind(backend);
+ Module["webnnIsGraphInput"] = backend["isGraphInput"].bind(backend);
+
+ Module["webnnCreateTemporaryTensor"] =
+ backend["createTemporaryTensor"].bind(backend);
+ Module["webnnIsInt64Supported"] = backend["isInt64Supported"].bind(backend);
}
};
diff --git a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml
deleted file mode 100644
index 3cceadd1b8ef5..0000000000000
--- a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml
+++ /dev/null
@@ -1,241 +0,0 @@
-##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py ####
-### please do rerun set-trigger-rules.py ###
-trigger:
- branches:
- include:
- - main
- - rel-*
- paths:
- exclude:
- - docs/**
- - README.md
- - CONTRIBUTING.md
- - BUILD.md
- - 'js/web'
- - 'onnxruntime/core/providers/js'
-pr:
- branches:
- include:
- - main
- - rel-*
- paths:
- exclude:
- - docs/**
- - README.md
- - CONTRIBUTING.md
- - BUILD.md
- - 'js/web'
- - 'onnxruntime/core/providers/js'
-#### end trigger ####
-
-# Known Limits
-# 1. Anchors are not supported in GHA
-# https://github.community/t/support-for-yaml-anchors/16128/90
-# 2. today most cloud-based CI services are still lacking hardware acceleration support from the host VM,
-# which is the no.1 blocker for running tests on modern Android Emulators (especially on recent API levels) on CI.
-
-# It'd better to check out https://github.com/microsoft/onnxruntime/wiki/Leverage-Existing-Artifacts
-# to save debugging time.
-parameters:
-- name: specificArtifact
- displayName: Use Specific Artifact
- type: boolean
- default: false
-- name: runId
- displayName: Specific Artifact's RunId
- type: number
- default: 0
-
-stages:
-# Separate stage for building CPU vs NNAPI as we only want CodeQL to run on one of them so we don't get duplicate
-# issues for code that is built in both. We pick NNAPI as that includes the NNAPI EP code.
-- stage: BUILD_AND_TEST_CPU
- dependsOn: []
- variables:
- Codeql.Enabled: false
- ANDROID_AVD_HOME: $(Agent.TempDirectory)
- jobs:
- - job: BUILD_AND_TEST_CPU
- pool: onnxruntime-Ubuntu2204-AMD-CPU
- workspace:
- clean: all
- timeoutInMinutes: 30
- steps:
- - task: JavaToolInstaller@0
- displayName: Use jdk 17
- inputs:
- versionSpec: '17'
- jdkArchitectureOption: 'x64'
- jdkSourceOption: 'PreInstalled'
-
- - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build
- displayName: Install coreutils and ninja
-
- - template: templates/use-android-ndk.yml
- - template: templates/use-android-emulator.yml
- parameters:
- create: true
- start: true
- - script: |
- env | grep ANDROID
- displayName: View Android ENVs
- - script: |
- python3 tools/ci_build/build.py \
- --enable_lto \
- --android \
- --build_dir build \
- --android_sdk_path $ANDROID_HOME \
- --android_ndk_path $ANDROID_NDK_HOME \
- --android_abi=x86_64 \
- --android_api=30 \
- --skip_submodule_sync \
- --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache \
- --cmake_generator=Ninja \
- --build_java
- displayName: CPU EP, Build and Test
- - template: templates/use-android-emulator.yml
- parameters:
- stop: true
-
- - template: templates/clean-agent-build-directory-step.yml
-
-- stage: BUILD_AND_TEST_NNAPI_EP
- dependsOn: []
- condition: notIn(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')
- variables:
- ANDROID_AVD_HOME: $(Agent.TempDirectory)
- Codeql.ProjectConfigPath: .github/workflows
- Codeql.Enabled: true
- Codeql.Language: cpp
- ${{ if variables['Codeql.Enabled'] }}:
- JobsTimeout: 120
- ${{ else }}:
- JobsTimeout: 60
- jobs:
- - job: BUILD_AND_TEST_NNAPI_EP
- pool: onnxruntime-Ubuntu2204-AMD-CPU
- timeoutInMinutes: ${{ variables.JobsTimeout }}
- workspace:
- clean: all
- steps:
- - task: JavaToolInstaller@0
- displayName: Use jdk 17
- inputs:
- versionSpec: '17'
- jdkArchitectureOption: 'x64'
- jdkSourceOption: 'PreInstalled'
-
- - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build
- displayName: Install coreutils and ninja
- - template: templates/use-android-emulator.yml
- parameters:
- create: true
- start: true
-
- - script: |
- env | grep ANDROID
- displayName: View Android ENVs
-
- - script: |
- python3 tools/ci_build/build.py \
- --enable_lto \
- --android \
- --build_dir build_nnapi \
- --android_sdk_path $ANDROID_HOME \
- --android_ndk_path $ANDROID_NDK_HOME \
- --android_abi=x86_64 \
- --android_api=29 \
- --skip_submodule_sync \
- --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache \
- --use_nnapi \
- --build_shared_lib \
- --cmake_generator=Ninja \
- --build_java
- displayName: NNAPI EP, Build, Test on Android Emulator
-
- - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd)
- # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator
- displayName: Build Minimal ORT with NNAPI and run tests
-
- - template: templates/use-android-emulator.yml
- parameters:
- stop: true
-
- - template: templates/clean-agent-build-directory-step.yml
-
-- stage: MAIN_BUILD_STAGE
- # The below jobs only run on build of main branch.
- # because coverage report is hard to support in cross machines.
- displayName: NNAPI MAIN BUILD&TEST
- dependsOn: []
- condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')
- variables:
- ANDROID_AVD_HOME: $(Agent.TempDirectory)
- jobs:
- - job: NNAPI_EP_MASTER
- pool: onnxruntime-Ubuntu2204-AMD-CPU
- timeoutInMinutes: 180
- workspace:
- clean: all
- condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')
- steps:
- - task: JavaToolInstaller@0
- displayName: Use jdk 17
- inputs:
- versionSpec: '17'
- jdkArchitectureOption: 'x64'
- jdkSourceOption: 'PreInstalled'
-
- - template: templates/use-android-ndk.yml
-
- - template: templates/use-android-emulator.yml
- parameters:
- create: true
- start: true
-
- - script: |
- python3 tools/ci_build/build.py \
- --enable_lto \
- --android \
- --build_dir build_nnapi \
- --android_sdk_path $ANDROID_HOME \
- --android_ndk_path $ANDROID_NDK_HOME \
- --android_abi=x86_64 \
- --android_api=29 \
- --skip_submodule_sync \
- --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache \
- --use_nnapi \
- --build_shared_lib \
- --cmake_generator=Ninja \
- --build_java \
- --code_coverage
- displayName: NNAPI EP, Build, Test, CodeCoverage on Android Emulator
-
- # We need to use llvm-cov from the NDK.
- - script: |
- export GCOV="$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-cov gcov"
- python3 -m pip install gcovr
- python3 tools/ci_build/coverage.py --build_dir build_nnapi --android_sdk_path $ANDROID_HOME
- displayName: Retrieve runtime code coverage files from the emulator and analyze
-
- - script: cat '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt'
- displayName: Print coverage report
-
- # - task: AzureCLI@2
- # displayName: 'Post Android Code Coverage To DashBoard'
- # inputs:
- # azureSubscription: AIInfraBuild
- # scriptType: bash
- # scriptPath: $(Build.SourcesDirectory)/tools/ci_build/github/linux/upload_code_coverage_data.sh
- # arguments: '"$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt" "https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=$(Build.BuildId)" arm android nnapi'
- # workingDirectory: '$(Build.BinariesDirectory)'
-
- - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd)
- # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator
- displayName: Build Minimal ORT with NNAPI and run tests
-
- - template: templates/use-android-emulator.yml
- parameters:
- stop: true
-
- - template: templates/clean-agent-build-directory-step.yml
diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml
index 23c968f35a27f..70d8e954808a5 100644
--- a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml
@@ -56,7 +56,7 @@ extends:
# Update the pool with your team's 1ES hosted pool.
pool:
name: "Azure Pipelines"
- image: "macOS-13"
+ image: "macOS-14"
os: macOS
sdl:
sourceAnalysisPool:
diff --git a/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml b/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml
index 4518a168879a2..a2a0e3bcace9f 100644
--- a/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml
+++ b/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml
@@ -11,7 +11,7 @@ stages:
clean: all
timeoutInMinutes: 120
pool:
- vmImage: 'macOS-13'
+ vmImage: 'macOS-14'
variables:
- name: OnnxRuntimeBuildDirectory
diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml
index 07d21333270a8..a6e38d0bc93f3 100644
--- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml
+++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml
@@ -11,7 +11,7 @@ stages:
workspace:
clean: all
pool:
- vmImage: 'macOS-13'
+ vmImage: 'macOS-14'
variables:
- name: OnnxRuntimeBuildDirectory
diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
index 6b421184c490e..78c07c28d6f4e 100644
--- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
+++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
@@ -427,7 +427,7 @@ stages:
- job: IosDynamicFramework
timeoutInMinutes: 120
pool:
- vmImage: "macOS-13"
+ vmImage: "macOS-14"
steps:
- task: UsePythonVersion@0
@@ -463,7 +463,7 @@ stages:
- job: IosMinimalTrainingBuild
timeoutInMinutes: 120
pool:
- vmImage: "macOS-13"
+ vmImage: "macOS-14"
steps:
- task: UsePythonVersion@0
diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
index 7a78c6ba0fcdf..01c1366107292 100644
--- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
@@ -29,7 +29,7 @@ stages:
parameters:
job_name: Test_MAC_Wheels
machine_pool:
- vmImage: 'macOS-13'
+ vmImage: 'macOS-14'
itemPattern: '*/*mac*x86_64.whl'
arch: 'x86_64'
- template: templates/py-package-smoking-test.yml
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml
index 5e783607e3622..d64ee07aa2131 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml
@@ -201,7 +201,7 @@ stages:
clean: all
pool:
name: "Azure Pipelines"
- image: "macOS-13"
+ image: "macOS-14"
os: macOS
variables:
MACOSX_DEPLOYMENT_TARGET: '13.3'
diff --git a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml
index 3886ceb1ed58f..366ee3fcf4e92 100644
--- a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml
@@ -24,6 +24,7 @@ jobs:
-  pool: 'onnxruntime-Ubuntu2204-AMD-CPU'
+  pool:
+    vmImage: 'macOS-14'
   workspace:
     clean: all
variables:
runCodesignValidationInjection: false
ANDROID_AVD_HOME: $(Agent.TempDirectory)
diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
index c4559d4e0b918..7a46bdc7cde0a 100644
--- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
@@ -117,7 +117,7 @@ stages:
workspace:
clean: all
pool:
- vmImage: 'macOS-13'
+ vmImage: 'macOS-14'
timeoutInMinutes: 300
steps:
- template: set-version-number-variables-step.yml
@@ -788,7 +788,7 @@ stages:
- template: ../nuget/templates/test_macos.yml
parameters:
- AgentPool : macOS-13
+ AgentPool : macOS-14
ArtifactSuffix: 'CPU'
- template: ../nodejs/templates/test_win.yml
@@ -824,4 +824,4 @@ stages:
OS: MacOS
BuildId: ${{ parameters.BuildId }}
SpecificArtifact: ${{ parameters.SpecificArtifact }}
- PoolName: 'macOS-13'
+ PoolName: 'macOS-14'
diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml
index ab31e592d7d71..8c725c1d6b9a0 100644
--- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml
@@ -68,7 +68,7 @@ stages:
jobs:
- job: MacOS_C_API_Package_Publish
pool:
- vmImage: 'macOS-13'
+ vmImage: 'macOS-14'
steps:
- checkout: none
- template: flex-downloadPipelineArtifact.yml
diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml
index 32908753f2909..c7ae7bb3a0026 100644
--- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml
@@ -37,7 +37,7 @@ jobs:
PROTO_CACHE_DIR: $(Pipeline.Workspace)/ccache_proto
ORT_CACHE_DIR: $(Pipeline.Workspace)/ccache_ort
pool:
- vmImage: 'macOS-13'
+ vmImage: 'macOS-14'
timeoutInMinutes: 300
steps:
- checkout: self
diff --git a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml
index 58ebdd52998ea..c1309d345d819 100644
--- a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml
@@ -64,11 +64,11 @@ stages:
- job: ReactNative_CI_iOS
${{ if eq(parameters.is1ES, false) }}:
pool:
- vmImage: 'macOS-13'
+ vmImage: 'macOS-14'
${{ if eq(parameters.is1ES, true) }}:
pool:
name: 'Azure Pipelines'
- image: 'macOS-13'
+ image: 'macOS-14'
os: 'macOS'
timeoutInMinutes: 120
@@ -212,7 +212,7 @@ stages:
scheme: 'OnnxruntimeModuleTest'
packageApp: false
destinationPlatformOption: 'iOS'
- destinationSimulators: 'iPhone 14,OS=16.4'
+ destinationSimulators: 'iPhone 15,OS=17.4'
workingDirectory: '$(Build.SourcesDirectory)/js/react_native/ios'
xcprettyArgs: '--output build/reports/test-results.xml'
publishJUnitResults: true
diff --git a/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml b/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml
index 2cf698aefa8bd..3c1bfcd60fedd 100644
--- a/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml
@@ -3,7 +3,7 @@
parameters:
- name: xcodeVersion
type: string
- default: "14.3.1"
+ default: "15.3.0"
steps:
- bash: |
diff --git a/tools/python/util/android/android.py b/tools/python/util/android/android.py
index 8f3ed97cae53f..cd420ca1483c7 100644
--- a/tools/python/util/android/android.py
+++ b/tools/python/util/android/android.py
@@ -46,18 +46,36 @@ def filename(name, windows_extension):
def create_virtual_device(sdk_tool_paths: SdkToolPaths, system_image_package_name: str, avd_name: str):
run(sdk_tool_paths.sdkmanager, "--install", system_image_package_name, input=b"y")
-
- run(
- sdk_tool_paths.avdmanager,
- "create",
- "avd",
- "--name",
- avd_name,
- "--package",
- system_image_package_name,
- "--force",
- input=b"no",
- )
+ android_avd_home = os.environ.get("ANDROID_AVD_HOME")
+
+ if android_avd_home is not None:
+ if not os.path.exists(android_avd_home):
+ os.makedirs(android_avd_home)
+ run(
+ sdk_tool_paths.avdmanager,
+ "create",
+ "avd",
+ "--name",
+ avd_name,
+ "--package",
+ system_image_package_name,
+ "--force",
+ "--path",
+ android_avd_home,
+ input=b"no",
+ )
+ else:
+ run(
+ sdk_tool_paths.avdmanager,
+ "create",
+ "avd",
+ "--name",
+ avd_name,
+ "--package",
+ system_image_package_name,
+ "--force",
+ input=b"no",
+ )
_process_creationflags = subprocess.CREATE_NEW_PROCESS_GROUP if is_windows() else 0
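The new branch repeats the avdmanager invocation for the with-path and without-path cases. A possible alternative, shown here only as a hedged sketch (it reuses the module's existing run() helper and SdkToolPaths and is not part of this change), builds the argument list once and appends --path only when ANDROID_AVD_HOME is set:

# Hypothetical alternative to the branching above, not the actual change.
import os

def create_virtual_device_alt(sdk_tool_paths, system_image_package_name, avd_name):
    run(sdk_tool_paths.sdkmanager, "--install", system_image_package_name, input=b"y")

    args = [
        sdk_tool_paths.avdmanager,
        "create",
        "avd",
        "--name",
        avd_name,
        "--package",
        system_image_package_name,
        "--force",
    ]
    # Use .get() so a missing ANDROID_AVD_HOME falls through to the default location
    # instead of raising KeyError.
    android_avd_home = os.environ.get("ANDROID_AVD_HOME")
    if android_avd_home:
        os.makedirs(android_avd_home, exist_ok=True)
        args += ["--path", android_avd_home]
    run(*args, input=b"no")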