diff --git a/.github/actions/setup-android-ndk/action.yml b/.github/actions/setup-android-ndk/action.yml
new file mode 100644
index 0000000000000..fea9745396e81
--- /dev/null
+++ b/.github/actions/setup-android-ndk/action.yml
@@ -0,0 +1,98 @@
+# .github/actions/setup-android-ndk/action.yml
+name: 'Setup Android NDK'
+description: 'Installs and configures a specific version of the Android NDK'
+inputs:
+ ndk-version:
+ description: 'The version of the Android NDK to install (e.g., 27.2.12479018)'
+ required: true
+ default: '27.2.12479018'
+ android-sdk-root:
+ description: 'The root directory of the Android SDK'
+ required: true
+ default: '/usr/local/lib/android/sdk'
+
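+# Example usage from a workflow in this repo (matches .github/workflows/android.yml):
+#   - name: Setup Android NDK
+#     uses: ./.github/actions/setup-android-ndk
+#     with:
+#       ndk-version: 27.2.12479018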
+runs:
+ using: "composite" # Use a composite action for multiple shell commands
+ steps:
+ - name: Install coreutils and ninja
+ shell: bash
+ run: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build
+
+ - name: Install Android NDK
+ shell: bash
+ run: |
+ set -e
+ "${{ inputs.android-sdk-root }}/cmdline-tools/latest/bin/sdkmanager" --install "ndk;${{ inputs.ndk-version }}"
+
+ NDK_PATH="${{ inputs.android-sdk-root }}/ndk/${{ inputs.ndk-version }}"
+ if [[ ! -d "${NDK_PATH}" ]]; then
+ echo "NDK directory is not in expected location: ${NDK_PATH}"
+ exit 1
+ fi
+
+ # Export the NDK location via GITHUB_ENV so later steps can use it
+ echo "ANDROID_NDK_HOME=${NDK_PATH}" >> $GITHUB_ENV
+ echo "ANDROID_NDK_ROOT=${NDK_PATH}" >> $GITHUB_ENV
+ echo "ANDROID_NDK_HOME: ${NDK_PATH}"
+ echo "ANDROID_NDK_ROOT: ${NDK_PATH}"
+
+ - name: Check if emulator is installed and add to PATH
+ shell: bash
+ run: |
+ if [[ ":$PATH:" == *":${ANDROID_SDK_ROOT}/emulator:"* ]]; then
+ echo "${ANDROID_SDK_ROOT}/emulator is in PATH"
+ else
+ ${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/sdkmanager --install "emulator"
+ echo "${ANDROID_SDK_ROOT}/emulator" >> $GITHUB_PATH
+ fi
+
+ - name: Check if platform tools are installed and add to PATH
+ shell: bash
+ run: |
+ if [[ ":$PATH:" == *":${ANDROID_SDK_ROOT}/platform-tools:"* ]]; then
+ echo "${ANDROID_SDK_ROOT}/platform-tools is in PATH"
+ else
+ ${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/sdkmanager --install "platform-tools"
+ echo "${ANDROID_SDK_ROOT}/platform-tools" >> $GITHUB_PATH
+ fi
+ ls -R "${ANDROID_SDK_ROOT}/platform-tools"
+
+ - name: Create Android Emulator
+ shell: bash
+ env:
+ ANDROID_AVD_HOME: ${{ runner.temp }}/android-avd
+ run: |
+ python3 tools/python/run_android_emulator.py \
+ --android-sdk-root "${ANDROID_SDK_ROOT}" \
+ --create-avd --system-image "system-images;android-31;default;x86_64"
+
+ - name: List Android AVDs
+ shell: bash
+ env:
+ ANDROID_AVD_HOME: ${{ runner.temp }}/android-avd
+ run: |
+ "${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/avdmanager" list avd
+
+ - name: Check emulator.pid does not exist
+ shell: bash
+ run: |
+ if test -f ./emulator.pid; then
+ echo "Emulator PID file was not expected to exist but does and has pid: `cat ./emulator.pid`"
+ exit 1
+ fi
+
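+ # run_android_emulator.py writes the emulator's PID to ./emulator.pid; the calling
+ # workflow's "Stop Android Emulator" step reads that file to shut the emulator down.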
+ - name: Start Android Emulator
+ shell: bash
+ env:
+ ANDROID_AVD_HOME: ${{ runner.temp }}/android-avd
+ run: |
+ set -e -x
+ python3 tools/python/run_android_emulator.py \
+ --android-sdk-root "${ANDROID_SDK_ROOT}" \
+ --start --emulator-extra-args="-partition-size 2047" \
+ --emulator-pid-file ./emulator.pid
+ echo "Emulator PID: `cat ./emulator.pid`"
+
+ - name: View Android ENVs
+ shell: bash
+ run: env | grep ANDROID
\ No newline at end of file
diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml
new file mode 100644
index 0000000000000..64c40946c49c5
--- /dev/null
+++ b/.github/workflows/android.yml
@@ -0,0 +1,147 @@
+name: Android CI
+# This workflow builds and tests on the Android Emulator on Linux
+
+on:
+ push:
+ branches:
+ - main
+ - rel-*
+ pull_request:
+ branches:
+ - main
+ - rel-*
+ workflow_dispatch:
+
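+# Cancel in-progress runs that are superseded by a newer run on the same ref,
+# except on main; manually dispatched runs are grouped separately.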
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' }}
+ cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
+jobs:
+ android_nnapi_ep:
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Use JDK 17
+ uses: actions/setup-java@v4
+ with:
+ distribution: 'temurin'
+ java-version: '17'
+ architecture: x64
+
+ - name: Setup Android NDK
+ uses: ./.github/actions/setup-android-ndk
+ with:
+ ndk-version: 27.2.12479018
+
+ - name: Export GitHub Actions cache environment variables
+ uses: actions/github-script@v7
+ with:
+ script: |
+ core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
+ core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
+
+ - name: NNAPI EP, Build, Test on Android Emulator
+ run: >-
+ python3 tools/ci_build/build.py
+ --enable_lto
+ --android
+ --build_dir build_nnapi
+ --android_sdk_path "$ANDROID_HOME"
+ --android_ndk_path "$ANDROID_NDK_HOME"
+ --android_abi=x86_64
+ --android_api=29
+ --skip_submodule_sync
+ --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache
+ --use_nnapi
+ --build_shared_lib
+ --cmake_generator=Ninja
+ --build_java
+ shell: bash
+
+
+ - name: Build Minimal ORT with NNAPI and run tests
+ run: tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh "$(pwd)"
+ shell: bash
+
+ - name: Install psutil for emulator shutdown by run_android_emulator.py
+ if: always()
+ run: python3 -m pip install psutil
+ shell: bash
+
+ - name: Stop Android Emulator
+ if: always()
+ run: |
+ env | grep ANDROID
+ if test -f ${{ github.workspace }}/emulator.pid; then
+ echo "Emulator PID:"`cat ${{ github.workspace }}/emulator.pid`
+ python3 tools/python/run_android_emulator.py \
+ --android-sdk-root "${ANDROID_SDK_ROOT}" \
+ --stop \
+ --emulator-pid-file ${{ github.workspace }}/emulator.pid
+ rm ${{ github.workspace }}/emulator.pid
+ else
+ echo "Emulator PID file was expected to exist but does not."
+ fi
+ shell: bash
+
+ android_cpu_ep:
+ name: Android CI Pipeline
+ runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"]
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Use JDK 17
+ uses: actions/setup-java@v4
+ with:
+ distribution: 'temurin'
+ java-version: '17'
+ architecture: x64
+
+ - name: Setup Android NDK
+ uses: ./.github/actions/setup-android-ndk
+ with:
+ ndk-version: 27.2.12479018
+
+ - name: Export GitHub Actions cache environment variables
+ uses: actions/github-script@v7
+ with:
+ script: |
+ core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
+ core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
+
+ - name: CPU EP, Build and Test
+ run: >-
+ python3 tools/ci_build/build.py
+ --enable_lto
+ --android
+ --build_dir build
+ --android_sdk_path $ANDROID_HOME
+ --android_ndk_path $ANDROID_NDK_HOME
+ --android_abi=x86_64
+ --android_api=30
+ --skip_submodule_sync
+ --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache
+ --cmake_generator=Ninja
+ --build_java
+ shell: bash
+
+ - name: Install psutil for emulator shutdown by run_android_emulator.py
+ if: always()
+ run: python3 -m pip install psutil
+ shell: bash
+
+ - name: Stop Android Emulator
+ if: always()
+ run: |
+ if test -f ${{ github.workspace }}/emulator.pid; then
+ echo "Emulator PID:"`cat ${{ github.workspace }}/emulator.pid`
+ python3 tools/python/run_android_emulator.py \
+ --android-sdk-root "${ANDROID_SDK_ROOT}" \
+ --stop \
+ --emulator-pid-file ${{ github.workspace }}/emulator.pid
+ rm ${{ github.workspace }}/emulator.pid
+ else
+ echo "Emulator PID file was expected to exist but does not."
+ fi
+ shell: bash
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 50dd25898ad35..86b1cd5ee90e7 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -154,6 +154,46 @@ jobs:
--use_xnnpack \
--use_binskim_compliant_compile_flags
+ ARM64-Xcode16-webgpu:
+ runs-on: macos-15
+
+ env:
+ xcode_version: 16
+
+ timeout-minutes: 60
+
+ steps:
+ - uses: actions/setup-python@v5
+ with:
+ python-version: ${{ env.python_version }}
+
+ - name: Verify ARM64 machine
+ shell: python
+ run: |
+ import platform
+ assert platform.machine() == "arm64", "This job expects to be run on an ARM64 machine."
+
+ - name: Use Xcode ${{ env.xcode_version }}
+ shell: bash
+ run: |
+ XCODE_DEVELOPER_DIR="/Applications/Xcode_${{ env.xcode_version }}.app/Contents/Developer"
+ sudo xcode-select --switch "${XCODE_DEVELOPER_DIR}"
+
+ - uses: actions/checkout@v4
+
+ - name: Build and test
+ shell: bash
+ run: |
+ python ./tools/ci_build/build.py \
+ --build_dir ./build \
+ --update \
+ --build --parallel \
+ --test \
+ --build_shared_lib \
+ --build_nodejs \
+ --use_webgpu \
+ --use_binskim_compliant_compile_flags
+
ARM64-Xcode16-targeting-iphonesimulator:
runs-on: macos-15
@@ -164,7 +204,7 @@ jobs:
matrix:
target_arch: [x86_64, arm64]
- timeout-minutes: 75
+ timeout-minutes: 90
steps:
- uses: actions/setup-python@v5
diff --git a/cmake/CMakePresets.json b/cmake/CMakePresets.json
index 4987edaf85513..8d63912f6eaee 100644
--- a/cmake/CMakePresets.json
+++ b/cmake/CMakePresets.json
@@ -109,6 +109,29 @@
"rhs": "Darwin"
}
},
+ {
+ "name": "arm64-osx",
+ "inherits": [
+ "unit-test"
+ ],
+ "generator": "Xcode",
+ "binaryDir": "${sourceParentDir}/cmake_build/arm64-osx",
+ "installDir": "${sourceParentDir}/cmake_build/out",
+ "cacheVariables": {
+ "CMAKE_OSX_ARCHITECTURES": "arm64",
+ "onnxruntime_BUILD_SHARED_LIB": true,
+ "onnxruntime_USE_XNNPACK": false,
+ "onnxruntime_USE_COREML": true,
+ "onnxruntime_BUILD_OBJC": true,
+ "onnxruntime_BUILD_APPLE_FRAMEWORK": true,
+ "CMAKE_CONFIGURATION_TYPES": "Debug;Release"
+ },
+ "condition": {
+ "type": "equals",
+ "lhs": "${hostSystemName}",
+ "rhs": "Darwin"
+ }
+ },
{
"name": "x64-osx-vcpkg",
"inherits": [
diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index b64641230f249..f582abca34706 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -1625,7 +1625,7 @@ This version of the operator has been available since version 1 of the 'com.micr
#### Type Constraints
-- T : tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
+- T : tensor(bool), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)
- Constrain input and output types.
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 91c68a1f5afb6..8d256a2088279 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -582,7 +582,7 @@ Do not modify directly.*
| Op Name | Parameters | OpSet Version | Types Supported |
|---------|------------|---------------|-----------------|
|**Operator Domain:** *ai.onnx*||||
-|Abs|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Abs|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Add|*in* A:**T**<br> *in* B:**T**<br> *out* C:**T**|14+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||13|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
@@ -839,7 +839,7 @@ Do not modify directly.*
|Shrink|*in* input:**T**<br> *out* output:**T**|9+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Sigmoid|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
-|Sign|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Sign|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|SimplifiedLayerNormalization|*in* X:**T**<br> *in* scale:**V**<br> *out* Y:**V**<br> *out* inv_std_var:**U**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)<br> **U** = tensor(double), tensor(float)<br> **V** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|Sin|*in* input:**T**<br> *out* output:**T**|7+|**T** = tensor(double), tensor(float), tensor(float16)|
|Size|*in* data:**T**<br> *out* size:**T1**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br> **T1** = tensor(int64)|
diff --git a/js/react_native/android/src/main/AndroidManifest.xml b/js/react_native/android/src/main/AndroidManifest.xml
index c6e3cb45e16e5..a2f47b6057db7 100644
--- a/js/react_native/android/src/main/AndroidManifest.xml
+++ b/js/react_native/android/src/main/AndroidManifest.xml
@@ -1,3 +1,2 @@
-
+
diff --git a/js/react_native/e2e/.detoxrc.js b/js/react_native/e2e/.detoxrc.js
index e886a363d378b..1d49f06213e51 100644
--- a/js/react_native/e2e/.detoxrc.js
+++ b/js/react_native/e2e/.detoxrc.js
@@ -38,8 +38,8 @@ module.exports = {
simulator: {
type: 'ios.simulator',
device: {
- type: 'iPhone 14',
- os: 'iOS 16.4',
+ type: 'iPhone 15',
+ os: 'iOS 17.4',
},
},
attached: {
diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts
index 3979af7fa1ec9..bb532e0fbae74 100644
--- a/js/web/lib/wasm/wasm-core-impl.ts
+++ b/js/web/lib/wasm/wasm-core-impl.ts
@@ -309,12 +309,12 @@ export const createSession = async (
if (context) {
wasm.currentContext = context as MLContext;
} else if (gpuDevice) {
- wasm.currentContext = await wasm.jsepCreateMLContext!(gpuDevice);
+ wasm.currentContext = await wasm.webnnCreateMLContext!(gpuDevice);
} else {
- wasm.currentContext = await wasm.jsepCreateMLContext!({ deviceType, powerPreference });
+ wasm.currentContext = await wasm.webnnCreateMLContext!({ deviceType, powerPreference });
}
} else {
- wasm.currentContext = await wasm.jsepCreateMLContext!();
+ wasm.currentContext = await wasm.webnnCreateMLContext!();
}
break;
}
@@ -330,7 +330,7 @@ export const createSession = async (
// clear current MLContext after session creation
if (wasm.currentContext) {
- wasm.jsepRegisterMLContext!(sessionHandle, wasm.currentContext);
+ wasm.webnnRegisterMLContext!(sessionHandle, wasm.currentContext);
wasm.currentContext = undefined;
wasm.shouldTransferToMLTensor = true;
}
@@ -454,6 +454,7 @@ export const releaseSession = (sessionId: number): void => {
}
wasm.jsepOnReleaseSession?.(sessionId);
+ wasm.webnnOnReleaseSession?.(sessionId);
wasm.webgpuOnReleaseSession?.(sessionId);
inputNamesUTF8Encoded.forEach((buf) => wasm._OrtFree(buf));
@@ -520,7 +521,7 @@ export const prepareInputOutputTensor = async (
const mlTensor = tensor[2].mlTensor as MLTensor;
dataByteLength = calculateTensorSizeInBytes(tensorDataTypeStringToEnum(dataType), dims)!;
- const registerMLTensor = wasm.jsepRegisterMLTensor;
+ const registerMLTensor = wasm.webnnRegisterMLTensor;
if (!registerMLTensor) {
throw new Error('Tensor location "ml-tensor" is not supported without using WebNN.');
}
@@ -540,7 +541,7 @@ export const prepareInputOutputTensor = async (
wasm.setValue(rawData + i * ptrSize, allocWasmString(data[i], allocs), '*');
}
} else {
- const isGraphInput = wasm.jsepIsGraphInput;
+ const isGraphInput = wasm.webnnIsGraphInput;
if (dataType !== 'string' && isGraphInput) {
const tensorNameUTF8 = wasm._OrtGetInputName(sessionId, index);
const tensorName = wasm.UTF8ToString(tensorNameUTF8);
@@ -549,8 +550,8 @@ export const prepareInputOutputTensor = async (
const dataTypeEnum = tensorDataTypeStringToEnum(dataType);
dataByteLength = calculateTensorSizeInBytes(dataTypeEnum, dims)!;
actualLocation = 'ml-tensor';
- const createTemporaryTensor = wasm.jsepCreateTemporaryTensor;
- const uploadTensor = wasm.jsepUploadTensor;
+ const createTemporaryTensor = wasm.webnnCreateTemporaryTensor;
+ const uploadTensor = wasm.webnnUploadTensor;
if (!createTemporaryTensor || !uploadTensor) {
throw new Error('Tensor location "ml-tensor" is not supported without using WebNN.');
}
@@ -722,6 +723,7 @@ export const run = async (
}
wasm.jsepOnRunStart?.(sessionHandle);
+ wasm.webnnOnRunStart?.(sessionHandle);
let errorCode: number;
if (!BUILD_DEFS.DISABLE_JSEP && ioBindingState) {
@@ -862,8 +864,8 @@ export const run = async (
]);
}
} else if (preferredLocation === 'ml-tensor' && size > 0) {
- const ensureTensor = wasm.jsepEnsureTensor;
- const isInt64Supported = wasm.jsepIsInt64Supported;
+ const ensureTensor = wasm.webnnEnsureTensor;
+ const isInt64Supported = wasm.webnnIsInt64Supported;
if (!ensureTensor || !isInt64Supported) {
throw new Error('preferredLocation "ml-tensor" is not supported without using WebNN.');
}
@@ -890,9 +892,9 @@ export const run = async (
dims,
{
mlTensor,
- download: wasm.jsepCreateMLTensorDownloader!(dataOffset, type),
+ download: wasm.webnnCreateMLTensorDownloader!(dataOffset, type),
dispose: () => {
- wasm.jsepReleaseTensorId!(dataOffset);
+ wasm.webnnReleaseTensorId!(dataOffset);
wasm._OrtReleaseTensor(tensor);
},
},
@@ -915,7 +917,7 @@ export const run = async (
if (!keepOutputTensor) {
wasm._OrtReleaseTensor(tensor);
}
- wasm.jsepOnRunEnd?.(sessionHandle);
+ wasm.webnnOnRunEnd?.(sessionHandle);
}
}
diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts
index 6de54078af031..752bac28d7efb 100644
--- a/js/web/lib/wasm/wasm-types.ts
+++ b/js/web/lib/wasm/wasm-types.ts
@@ -156,12 +156,26 @@ export declare namespace JSEP {
*/
shouldTransferToMLTensor: boolean;
+ /**
+ * [exported from pre-jsep.js] Called when InferenceSession.run started. This function will be called before
+ * _OrtRun[WithBinding]() is called.
+ * @param sessionId - specify the session ID.
+ */
+ webnnOnRunStart: (sessionId: number) => void;
+ /**
+ * [exported from pre-jsep.js] Release a session. This function will be called before _OrtReleaseSession() is
+ * called.
+ * @param sessionId - specify the session ID.
+ * @returns
+ */
+ webnnOnReleaseSession: (sessionId: number) => void;
+
/**
* [exported from pre-jsep.js] Called when InferenceSession.run finished. This function will be called after
* _OrtRun[WithBinding]() is called.
* @param sessionId - specify the session ID.
*/
- jsepOnRunEnd: (sessionId: number) => void;
+ webnnOnRunEnd: (sessionId: number) => void;
/**
* [exported from pre-jsep.js] Register MLContext for a session.
@@ -169,18 +183,18 @@ export declare namespace JSEP {
* @param context - specify the MLContext.
* @returns
*/
- jsepRegisterMLContext: (sessionId: number, context: MLContext) => void;
+ webnnRegisterMLContext: (sessionId: number, context: MLContext) => void;
/**
* [exported from pre-jsep.js] Reserve a MLTensor ID attached to the current session.
* @returns the MLTensor ID.
*/
- jsepReserveTensorId: () => number;
+ webnnReserveTensorId: () => number;
/**
* [exported from pre-jsep.js] Release an MLTensor ID from use and destroys underlying MLTensor if no longer in use.
* @param tensorId - specify the MLTensor ID.
* @returns
*/
- jsepReleaseTensorId: (tensorId: number) => void;
+ webnnReleaseTensorId: (tensorId: number) => void;
/**
* [exported from pre-jsep.js] Ensure that an MLTensor of a given type and shape exists for a MLTensor ID.
* @param sessionId - specify the session ID or current active session ID if undefined.
@@ -190,7 +204,7 @@ export declare namespace JSEP {
* @param copyOld - specify whether to copy the old tensor if a new tensor was created.
* @returns the MLTensor associated with the tensor ID.
*/
- jsepEnsureTensor: (
+ webnnEnsureTensor: (
sessionId: number | undefined,
tensorId: number,
dataType: DataType,
@@ -203,20 +217,20 @@ export declare namespace JSEP {
* @param data - specify the data to upload. It can be a TensorProto::data_type or a WebNN MLOperandDataType.
* @returns
*/
- jsepUploadTensor: (tensorId: number, data: Uint8Array) => void;
+ webnnUploadTensor: (tensorId: number, data: Uint8Array) => void;
/**
* [exported from pre-jsep.js] Download data from an MLTensor.
* @param tensorId - specify the MLTensor ID.
* @returns the downloaded data.
*/
- jsepDownloadTensor: (tensorId: number, dstBuffer: ArrayBufferView | ArrayBuffer) => Promise;
+ webnnDownloadTensor: (tensorId: number, dstBuffer: ArrayBufferView | ArrayBuffer) => Promise;
/**
* [exported from pre-jsep.js] Creates a downloader function to download data from an MLTensor.
* @param tensorId - specify the MLTensor ID.
* @param type - specify the data type.
* @returns the downloader function.
*/
- jsepCreateMLTensorDownloader: (
+ webnnCreateMLTensorDownloader: (
tensorId: number,
type: Tensor.MLTensorDataTypes,
) => () => Promise;
@@ -228,7 +242,7 @@ export declare namespace JSEP {
* @param dimensions - specify the dimensions.
* @returns the MLTensor ID for the external MLTensor.
*/
- jsepRegisterMLTensor: (
+ webnnRegisterMLTensor: (
sessionId: number,
tensor: MLTensor,
onnxDataType: DataType,
@@ -240,7 +254,7 @@ export declare namespace JSEP {
* @param optionsOrGpuDevice - specify the options or GPUDevice.
* @returns
*/
- jsepCreateMLContext(optionsOrGpuDevice?: MLContextOptions | GPUDevice): Promise;
+ webnnCreateMLContext(optionsOrGpuDevice?: MLContextOptions | GPUDevice): Promise;
/**
* [exported from pre-jsep.js] Register a WebNN Constant operand from external data.
@@ -252,7 +266,7 @@ export declare namespace JSEP {
* @param shouldConvertInt64ToInt32 - specify whether to convert int64 to int32.
* @returns the WebNN Constant operand for the specified external data.
*/
- jsepRegisterMLConstant(
+ webnnRegisterMLConstant(
externalFilePath: string,
dataOffset: number,
dataLength: number,
@@ -265,14 +279,14 @@ export declare namespace JSEP {
* [exported from pre-jsep.js] Register a WebNN graph input.
* @param inputName - specify the input name.
*/
- jsepRegisterGraphInput: (inputName: string) => void;
+ webnnRegisterGraphInput: (inputName: string) => void;
/**
* [exported from pre-jsep.js] Check if a graph input is a WebNN graph input.
* @param sessionId - specify the session ID.
* @param inputName - specify the input name.
* @returns whether the input is a WebNN graph input.
*/
- jsepIsGraphInput: (sessionId: number, inputName: string) => boolean;
+ webnnIsGraphInput: (sessionId: number, inputName: string) => boolean;
/**
* [exported from pre-jsep.js] Create a temporary MLTensor for a session.
* @param sessionId - specify the session ID.
@@ -280,13 +294,13 @@ export declare namespace JSEP {
* @param shape - specify the shape.
* @returns the MLTensor ID for the temporary MLTensor.
*/
- jsepCreateTemporaryTensor: (sessionId: number, dataType: DataType, shape: readonly number[]) => Promise;
+ webnnCreateTemporaryTensor: (sessionId: number, dataType: DataType, shape: readonly number[]) => Promise;
/**
* [exported from pre-jsep.js] Check if a session's associated WebNN Context supports int64.
* @param sessionId - specify the session ID.
* @returns whether the WebNN Context supports int64.
*/
- jsepIsInt64Supported: (sessionId: number) => boolean;
+ webnnIsInt64Supported: (sessionId: number) => boolean;
}
}
diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
index 58ddf60df79f0..52c705abb1003 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
+++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
@@ -224,12 +224,12 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const {
// Shader is designed to be dispatched as Dispatch(num_heads, new_sequence_length / workgroup_size_x, 1)
// Each lane/thread is responsible for a single q.
shader.MainFunctionBody() << R"MAIN_FN(
- let head_idx = workgroup_id.x;
+ let head_idx = u32(workgroup_idx / uniforms.num_seq_tile);
let capped_sg_id = min(sg_id, max_k_step);
let capped_sg_size = min(sg_size, max_k_step);
// Load Q
- let q_idx_global = workgroup_id.y * workgroup_size_x + local_idx;
+ let q_idx_global = (workgroup_idx % uniforms.num_seq_tile) * workgroup_size_x + local_idx;
let valid_q = q_idx_global < uniforms.new_sequence_length;
if (valid_q)
{
@@ -445,7 +445,8 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
std::string cache_hint = std::to_string(has_attention_bias) +
std::to_string(parameters.head_size_) +
std::to_string(parameters.num_heads_);
- program.SetDispatchGroupSize(parameters.num_heads_, (parameters.sequence_length_ + tile_size - 1) / tile_size, 1)
+ const uint32_t num_seq_tile = (parameters.sequence_length_ + tile_size - 1) / tile_size;
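+ // The dispatch is flattened to one dimension (num_heads * num_seq_tile); the shader
+ // recovers head_idx and the sequence tile from workgroup_idx using the num_seq_tile uniform.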
+ program.SetDispatchGroupSize(parameters.num_heads_ * num_seq_tile)
.SetWorkgroupSize(tile_size)
.CacheHint(cache_hint)
.AddUniformVariables({{static_cast(parameters.sequence_length_)},
@@ -454,7 +455,8 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
{static_cast(parameters.total_sequence_length_ - parameters.kv_sequence_length_)},
{static_cast(parameters.is_gqa_ ? 1 : 0)},
{static_cast(parameters.n_reps)},
- {alpha}});
+ {alpha},
+ {num_seq_tile}});
return context.RunProgram(program);
}
diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
index 2c2b888538843..8931403641a81 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
+++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
@@ -52,7 +52,8 @@ class FlashAttentionProgram final : public Program {
{"past_sequence_length", ProgramUniformVariableDataType::Uint32},
{"is_gqa", ProgramUniformVariableDataType::Uint32},
{"n_reps", ProgramUniformVariableDataType::Uint32},
- {"alpha", ProgramUniformVariableDataType::Float32});
+ {"alpha", ProgramUniformVariableDataType::Float32},
+ {"num_seq_tile", ProgramUniformVariableDataType::Uint32});
private:
bool has_attention_bias_;
diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc
index 05cbfb1f99c48..a25d8e68f11cd 100644
--- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc
@@ -12,21 +12,12 @@ Status DP4AMatMulQuantizeProgram::GenerateShaderCode(ShaderHelper& shader) const
shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias);
shader.AddOutput("output", ShaderUsage::UseUniform);
shader.AddOutput("scales", ShaderUsage::UseUniform);
- shader.AdditionalImplementation() << R"ADDNL_FN(
- fn readInput(offset: u32) -> input_a_value_t
- {
- if (offset > uniforms.input_size) {
- return input_a_value_t(0);
- }
- return input_a[offset];
- }
- )ADDNL_FN";
shader.MainFunctionBody() << R"MAIN_FN(
var local_a : array, 32>;
var max_value:vec4 = vec4(0);
for (var idx:u32=0;idx<32;idx+=1)
{
- local_a[idx] = readInput(workgroup_idx*32 + idx);
+ local_a[idx] = input_a[workgroup_idx*32 + idx];
max_value = max(max_value, abs(local_a[idx]));
}
var scale = max(max_value.x, max_value.y);
@@ -147,8 +138,8 @@ Status DP4AMatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const {
shader.MainFunctionBody() << R"MAIN_FN(
// During the load phase we use all 256 threads to load 64 rows of A/B.
// For each row we load tile_size_k_vec (2) vectorized elements, which are 32 elements of K.
- let a_global_base = workgroup_id.x * tile_size;
- let b_global_base = workgroup_id.y * tile_size;
+ let a_global_base = u32(workgroup_idx / uniforms.num_N_tile) * tile_size;
+ let b_global_base = (workgroup_idx % uniforms.num_N_tile) * tile_size;
let load_AorB = u32(local_idx/128);
let load_row = u32((local_idx%128)/2);
let load_col = u32(local_idx%2);
@@ -279,17 +270,16 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor
Tensor a_scale = context.CreateGPUTensor(a->DataType(), a_scales_dims);
quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kVec4Components)}})
.AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), 1},
- {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), 1}})
- .AddUniformVariable({static_cast(M * K / kVec4Components)});
+ {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), 1}});
ORT_RETURN_IF_ERROR(context.RunProgram(quantize_program));
constexpr uint32_t kTileSize = 64;
TensorShape reshaped_y_shape{1, M, N / kVec4Components};
+ uint32_t num_M_tile = (M + kTileSize - 1) / kTileSize;
+ uint32_t num_N_tile = (N + kTileSize - 1) / kTileSize;
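+ // The (M tile, N tile) dispatch is flattened into a single dimension; the shader derives
+ // a_global_base and b_global_base from workgroup_idx and the num_N_tile uniform.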
DP4AMatMulNBitsProgram mul_program{block_size};
mul_program.SetWorkgroupSize(256);
- mul_program.SetDispatchGroupSize(
- (M + kTileSize - 1) / kTileSize,
- (N + kTileSize - 1) / kTileSize, 1);
+ mul_program.SetDispatchGroupSize(num_M_tile * num_N_tile);
mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kVec4Components)},
{&a_scale, ProgramTensorMetadataDependency::TypeAndRank, 1},
{b, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kVec2Components * kU32Components)},
@@ -298,7 +288,8 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor
{static_cast(N)},
{static_cast(K)},
{static_cast(K / 8)},
- {static_cast(K / 16)}})
+ {static_cast(K / 16)},
+ {num_N_tile}})
.AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, static_cast(kVec4Components)})
.CacheHint("Block" + std::to_string(block_size));
return context.RunProgram(mul_program);
@@ -317,7 +308,7 @@ bool CanApplyDP4AMatrixMatMulNBits(onnxruntime::webgpu::ComputeContext& context,
bool use_dp4a = context.Device().HasFeature(wgpu::FeatureName::Subgroups) &&
context.AdapterInfo().backendType != wgpu::BackendType::Metal;
return (accuracy_level == 4 && block_size % 32 == 0 &&
- batch_count == 1 && components_k == 4 && K % 64 == 0 && N % 16 == 0 &&
+ batch_count == 1 && components_k == 4 && K % 128 == 0 && N % 16 == 0 &&
!has_zero_points && use_dp4a);
}
diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h
index 15b86d78301ad..f0157ca3e8c97 100644
--- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h
+++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h
@@ -16,7 +16,6 @@ class DP4AMatMulQuantizeProgram final : public Program {
@@ -28,7 +27,8 @@ class DP4AMatMulNBitsProgram final : public Program {
{"N", ProgramUniformVariableDataType::Uint32},
{"K", ProgramUniformVariableDataType::Uint32},
{"K8", ProgramUniformVariableDataType::Uint32},
- {"K16", ProgramUniformVariableDataType::Uint32});
+ {"K16", ProgramUniformVariableDataType::Uint32},
+ {"num_N_tile", ProgramUniformVariableDataType::Uint32});
private:
uint32_t block_size_;
diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
index e45787299f3ad..7b4a45ce8aa0f 100644
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
@@ -3361,7 +3361,8 @@ void RegisterContribSchemas() {
OpSchema::NonDifferentiable)
.TypeConstraint(
"T",
- {"tensor(int8)",
+ {"tensor(bool)",
+ "tensor(int8)",
"tensor(int16)",
"tensor(int32)",
"tensor(int64)",
diff --git a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc
index 4481a5172966b..3fffc6d0a68c4 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc
@@ -97,7 +97,6 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const logging::Logger& logger) const {
const auto& op_type(node.OpType());
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.activation
@@ -166,9 +165,7 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
model_builder.AddOperation(std::move(op));
- } else
-#endif // (COREML_ENABLE_MLPROGRAM)
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
if (op_type == "Sigmoid") {
diff --git a/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc
index 6169090a36014..dfa01c8187741 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc
@@ -32,7 +32,6 @@ Status ArgMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const int64_t keepdims = helper.Get("keepdims", 1);
const bool removedim = keepdims != 1;
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.reduction
@@ -46,9 +45,7 @@ Status ArgMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
// the output of ArgMax must be int32
AddOperationOutput(*op, *node.OutputDefs()[0], output_datatype);
model_builder.AddOperation(std::move(op));
- } else
-#endif // (COREML_ENABLE_MLPROGRAM)
- {
+ } else {
auto* coreml_argmax = layer->mutable_argmax();
coreml_argmax->set_axis(axis);
coreml_argmax->set_removedim(removedim);
@@ -91,11 +88,9 @@ bool ArgMaxOpBuilder::IsOpSupportedImpl(const Node& node,
return false;
}
-#if defined(COREML_ENABLE_MLPROGRAM)
if (input_params.create_mlprogram) {
return true;
}
-#endif
// If there are multiple downstream nodes and cast (toint32) is one of them
// not supported, exit here
diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
index 2817f34bc64f2..9e7fcd788664c 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc
@@ -6,6 +6,7 @@
#include "core/providers/coreml/builders/helper.h"
#include "core/providers/coreml/builders/impl/base_op_builder.h"
#include "core/providers/coreml/builders/model_builder.h"
+#include "core/providers/coreml/model/host_utils.h"
#include "core/providers/shared/utils/utils.h"
using namespace CoreML::Specification;
@@ -113,10 +114,12 @@ bool BaseOpBuilder::IsInputDtypeSupport(const Node& node, size_t idx,
return true;
}
+#if CAN_BUILD_COREML6_OR_LATER
// only MLProgram support FP16
if (input_params.create_mlprogram && input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
return true;
}
+#endif
LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not currently supported";
return false;
diff --git a/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc
index 442194cb31cbc..e547f2e42e527 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc
@@ -57,7 +57,6 @@ Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_bu
const auto eps = helper.Get("epsilon", 1e-5f);
const auto channels = scale_tensor.dims()[0];
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.normalization.batch_norm
@@ -78,9 +77,7 @@ Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_bu
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
- } else
-#endif // (COREML_ENABLE_MLPROGRAM)
- {
+ } else {
auto* coreml_batch_norm = layer->mutable_batchnorm();
coreml_batch_norm->set_channels(channels);
coreml_batch_norm->set_epsilon(eps);
diff --git a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
index 0482620b269a4..d7c78e05362ed 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc
@@ -56,7 +56,6 @@ bool CheckIfBothInputShapesMatch(const Node& node, const logging::Logger& logger
}
} // namespace
-#if defined(COREML_ENABLE_MLPROGRAM)
static std::vector InferOutputShape(const std::vector& a, const std::vector& b) {
std::vector output_shape;
int64_t i_a = 0, j_b = 0;
@@ -112,14 +111,12 @@ static void AddVariadicInputs(std::unique_ptr layer = model_builder.CreateNNLayer(node);
if (op_type == "Add") {
diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
index 6f9bb35c27d80..684653aa21273 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc
@@ -150,7 +150,6 @@ void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span data);
-#if defined(COREML_ENABLE_MLPROGRAM)
//
// MLProgram utils
//
@@ -174,6 +173,5 @@ void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& outp
/// Number of spatial dims in input. Generally rank - 2 (ignore N and C dims).
void AddPadTypeAndPads(COREML_SPEC::MILSpec::Operation& op, ModelBuilder& model_builder, std::string_view op_type,
const NodeAttrHelper& helper, int num_spatial_dims);
-#endif // defined(COREML_ENABLE_MLPROGRAM)
} // namespace coreml
} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc
index 7c7363d4c81ad..8abee92451338 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc
@@ -27,9 +27,8 @@ class CastOpBuilder : public BaseOpBuilder {
Status CastOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder,
[[maybe_unused]] const Node& node,
[[maybe_unused]] const logging::Logger& logger) const {
-// This is a special handling case for ArgMax Op, where argmax is followed by a cast to int32 type.
-// The ArgMax is fused with the Cast node and produces an int32 output.
-#if defined(COREML_ENABLE_MLPROGRAM)
+ // This is a special handling case for ArgMax Op, where argmax is followed by a cast to int32 type.
+ // The ArgMax is fused with the Cast node and produces an int32 output.
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary.cast
@@ -73,7 +72,6 @@ Status CastOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model
AddOperationOutput(*op, *node.OutputDefs()[0], cast_to_type);
model_builder.AddOperation(std::move(op));
}
-#endif
return Status::OK();
}
@@ -134,7 +132,6 @@ bool CastOpBuilder::HasSupportedInputsImpl(const Node& node, [[maybe_unused]] co
return false;
}
-#if defined(COREML_ENABLE_MLPROGRAM)
if (input_params.create_mlprogram) {
if ((input_type == ONNX_NAMESPACE::TensorProto_DataType_INT32 ||
input_type == ONNX_NAMESPACE::TensorProto_DataType_INT64 ||
@@ -152,7 +149,6 @@ bool CastOpBuilder::HasSupportedInputsImpl(const Node& node, [[maybe_unused]] co
return false;
}
}
-#endif
// only support int64 coming from ArgMax (check for ArgMax is done in IsOpSupportedImpl())
if (input_type != ONNX_NAMESPACE::TensorProto_DataType_INT64) {
diff --git a/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc
index f7046c213a8cb..9e68070a0e693 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc
@@ -64,7 +64,6 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
bool has_min = min != std::numeric_limits::lowest();
bool has_max = max != std::numeric_limits::max();
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -121,9 +120,7 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
AddOperationOutput(*op, output);
model_builder.AddOperation(std::move(op));
- } else
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
// TODO: CoreML has a Clip layer for NeuralNetwork. Added in CoreML 4. We could potentially use that if available
// to simplify.
// https://apple.github.io/coremltools/mlmodel/Format/NeuralNetwork.html#cliplayerparams
diff --git a/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc
index 9ea0030290abd..34ce2438095ad 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc
@@ -26,7 +26,6 @@ class ConcatOpBuilder : public BaseOpBuilder {
Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const Node& node,
const logging::Logger& logger) const {
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec; // NOLINT
@@ -45,7 +44,6 @@ Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
} else // NOLINT
-#endif // defined(COREML_ENABLE_MLPROGRAM)
{
std::unique_ptr layer = model_builder.CreateNNLayer(node);
diff --git a/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc
index 38125957bf481..18823bcc78d19 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc
@@ -52,7 +52,6 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
NodeAttrHelper helper(node);
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -89,9 +88,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
AddOperationOutput(*conv_op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(conv_op));
- } else
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
auto strides = helper.Get("strides", std::vector{1, 1});
@@ -225,14 +222,11 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara
const auto& weight_name = input_defs[1]->Name();
const auto* weight = input_params.graph_viewer.GetConstantInitializer(weight_name);
-#if defined(COREML_ENABLE_MLPROGRAM)
if (input_params.create_mlprogram) {
// ML Program supports non-const weight, 1D, 2D and 3D.
// keep to 1D and 2D for consistency with the NeuralNetwork implementation for now.
// add 3D support as/when needed.
- } else
-#endif // defined (COREML_ENABLE_MLPROGRAM)
- {
+ } else {
if (!weight) {
LOGS(logger, VERBOSE) << "The weight of Conv [" << name << "] must be a constant initializer";
return false;
@@ -257,7 +251,6 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara
NodeAttrHelper helper(node);
-#if defined(COREML_ENABLE_MLPROGRAM)
// spec says same_lower is supported in CoreML 5. it lies. CoreML 6 is required otherwise you get
// `Unexpected value for parameter pad_type[0] "same_lower" not in ("custom", "same", "valid").`
// We _could_ manually calculate the pads, but not implementing that until we have a real use case to justify
@@ -269,7 +262,6 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara
return false;
}
}
-#endif
// there's no equivalent to allow a manual kernel shape in CoreML.
// it's OK if a specified kernel_shape matches kH and kW dims of the weight input.
diff --git a/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc
index 5b6d9d72ab3c9..2e2c898b0e10a 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc
@@ -28,7 +28,6 @@ class ConvTransposeOpBuilder : public BaseOpBuilder {
Status ConvTransposeOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder,
[[maybe_unused]] const Node& node,
const logging::Logger& /*logger*/) const {
-#if defined(COREML_ENABLE_MLPROGRAM)
using namespace CoreML::Specification::MILSpec; // NOLINT
const auto input_defs = node.InputDefs();
const auto output_defs = node.OutputDefs();
@@ -80,7 +79,6 @@ Status ConvTransposeOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuild
AddOperationOutput(*op, *output_defs[0]);
model_builder.AddOperation(std::move(op));
-#endif // defined(COREML_ENABLE_MLPROGRAM)
return Status::OK();
}
diff --git a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc
index fec14dfd093a0..1a74b1eea97fe 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc
@@ -33,7 +33,6 @@ Status DepthToSpaceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
NodeAttrHelper helper(node);
int64_t blocksize = *helper.GetInt64("blocksize"); // required attribute
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec; // NOLINT
@@ -105,7 +104,6 @@ Status DepthToSpaceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
model_builder.AddOperation(std::move(reshape2));
}
} else // NOLINT
-#endif // if defined(COREML_ENABLE_MLPROGRAM)
{
const auto& output_name = output_defs[0]->Name();
std::unique_ptr layer = model_builder.CreateNNLayer(node);
diff --git a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
index e685c09ef43ca..4f84f7c36259c 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc
@@ -33,7 +33,6 @@ void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod
const auto& input_defs(node.InputDefs());
const bool is_gemm = op == "Gemm";
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
// we have to transpose the weight input of Gemm if transB is false, and potentially override the bias shape
if (is_gemm) {
@@ -58,9 +57,7 @@ void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod
}
}
}
- } else
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
// We have already embedded the weights (matrix B and C(if any)) into the coreml layer
// No need to copy them later to reduce memory consumption
model_builder.AddInitializerToSkip(input_defs[1]->Name());
@@ -123,7 +120,6 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
const auto K = transB ? b1 : b0;
const auto N = transB ? b0 : b1;
// we already checked it and dtype must be existed.
-#if defined(COREML_ENABLE_MLPROGRAM)
auto input_dtype = a.TypeAsProto()->tensor_type().elem_type();
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -207,9 +203,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
AddOperationOutput(*matmul_op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(matmul_op));
}
- } else
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
auto* coreml_inner_product = layer->mutable_innerproduct();
*layer->mutable_input()->Add() = a.Name();
diff --git a/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc
index 6dcf14c16f111..f558f423752e8 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc
@@ -42,7 +42,6 @@ class GridSampleOpBuilder : public BaseOpBuilder {
Status GridSampleOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder,
[[maybe_unused]] const Node& node,
[[maybe_unused]] const logging::Logger& logger) const {
-#if defined(COREML_ENABLE_MLPROGRAM)
using namespace CoreML::Specification::MILSpec; // NOLINT
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing.resample
@@ -80,7 +79,6 @@ Status GridSampleOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder&
AddOperationOutput(*op, *output_defs[0]);
model_builder.AddOperation(std::move(op));
-#endif
return Status::OK();
}
diff --git a/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc
index b4dc8d1647ad0..c0db144602ee2 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc
@@ -49,7 +49,6 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(
if (node.OpType() == "GroupNormalization") {
return AddGroupNormToModelBuilderImpl(model_builder, node, logger);
}
-#if defined(COREML_ENABLE_MLPROGRAM)
const auto& input_defs = node.InputDefs();
NodeAttrHelper helper(node);
const auto& scale_tensor = *model_builder.GetConstantInitializer(input_defs[1]->Name());
@@ -94,7 +93,6 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
}
-#endif // (COREML_ENABLE_MLPROGRAM)
return Status::OK();
}
@@ -103,7 +101,6 @@ Status NormalizationOpBuilder::AddGroupNormToModelBuilderImpl(
[[maybe_unused]] ModelBuilder& model_builder,
[[maybe_unused]] const Node& node,
[[maybe_unused]] const logging::Logger& logger) const {
-#if defined(COREML_ENABLE_MLPROGRAM)
const auto& input_defs = node.InputDefs();
NodeAttrHelper helper(node);
// Coreml hasn't supported GroupNorm yet.
@@ -184,7 +181,6 @@ Status NormalizationOpBuilder::AddGroupNormToModelBuilderImpl(
model_builder.AddOperation(std::move(mul));
model_builder.AddOperation(std::move(add));
}
-#endif // (COREML_ENABLE_MLPROGRAM)
return Status::OK();
}
diff --git a/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc
index 17910ba6fd486..e43eef75007cc 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc
@@ -29,7 +29,6 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const auto& op_type = node.OpType();
const auto& input_defs = node.InputDefs();
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -91,9 +90,7 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
- } else
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
auto* coreml_pool = layer->mutable_pooling();
diff --git a/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc
index d533b867bd454..a4609eb2a0584 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc
@@ -71,7 +71,6 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, co
const bool keepdims = helper.Get("keepdims", 1) != 0;
const bool noop_with_empty_axes = helper.Get("noop_with_empty_axes", 0) != 0;
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -103,9 +102,7 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, co
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
- } else
-#endif // (COREML_ENABLE_MLPROGRAM)
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
if (op_type == "ReduceSum") {
diff --git a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc
index 27d24d9c21893..b35d6971623ed 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc
@@ -50,7 +50,6 @@ Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
// ReshapeHelper applies the ONNX rules to create the concrete output shape
ReshapeHelper helper(TensorShape(input_shape), new_shape);
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -64,9 +63,7 @@ Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
AddOperationOutput(*reshape_op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(reshape_op));
- } else
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
*layer->mutable_reshapestatic()->mutable_targetshape() = {new_shape.cbegin(), new_shape.cend()};
diff --git a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc
index 7ff66e4a79e37..837573003e515 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc
@@ -212,7 +212,6 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
num_sizes = output_sizes.size();
}
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec; // NOLINT
@@ -279,9 +278,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
AddOperationOutput(*op, *output_defs[0]);
model_builder.AddOperation(std::move(op));
- } else // NOLINT
-#endif
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
auto* coreml_upsample = layer->mutable_upsample();
diff --git a/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc
index 243f949bdd48e..d1c87b033d323 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc
@@ -25,7 +25,6 @@ Status ShapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
const logging::Logger& /*logger*/) const {
const auto& input_defs = node.InputDefs();
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
NodeAttrHelper node_attr_helper{node};
@@ -63,9 +62,7 @@ Status ShapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
AddOperationOutput(*op, *node.OutputDefs()[0], output_datatype);
model_builder.AddOperation(std::move(op));
}
- } else // NOLINT
-#endif
- {
+ } else {
auto layer = model_builder.CreateNNLayer(node);
layer->mutable_getshape();
*layer->mutable_input()->Add() = input_defs[0]->Name();
diff --git a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc
index 6b3fe75fa592d..368e47e40f831 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc
@@ -127,7 +127,6 @@ Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
SliceOp::PrepareForComputeMetadata compute_metadata{data_shape};
ORT_RETURN_IF_ERROR(PrepareSliceComputeMetadata(node, model_builder.GetGraphViewer(), compute_metadata));
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec; // NOLINT
// https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.slice_by_index
@@ -178,9 +177,7 @@ Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
model_builder.AddOperation(std::move(op));
- } else // NOLINT
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
auto layer = model_builder.CreateNNLayer(node);
*layer->mutable_input()->Add() = input_defs[0]->Name();
*layer->mutable_output()->Add() = output_defs[0]->Name();
@@ -222,7 +219,6 @@ bool SliceOpBuilder::HasSupportedInputsImpl(const Node& node,
return false;
}
-#ifdef COREML_ENABLE_MLPROGRAM
// The [Doc](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.slice_by_index)
// says ML Program slice_by_index supports fp16 in CoreML 5 (iOS 15).
// That is incorrect: CoreML 6+ (iOS 16, CoreML spec version >= 7) is required; otherwise only float is supported.
@@ -230,13 +226,11 @@ bool SliceOpBuilder::HasSupportedInputsImpl(const Node& node,
// CoreML 6:https://github.com/apple/coremltools/blob/c3ea4cf56fef1176417246c1b85363417f3e713d/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py#L495
if (input_params.create_mlprogram && input_params.coreml_version >= 6 &&
input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
- } else
-#endif // nolint
- if (input_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT &&
- input_type != ONNX_NAMESPACE::TensorProto_DataType_INT64) {
- LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not supported";
- return false;
- }
+ } else if (input_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT &&
+ input_type != ONNX_NAMESPACE::TensorProto_DataType_INT64) {
+ LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not supported";
+ return false;
+ }
return true;
}
diff --git a/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc
index c6e331feed326..2411cd459fecd 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc
@@ -37,7 +37,6 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const auto axis = helper.Get("axis", axis_default_value);
auto axis_nonnegative = HandleNegativeAxis(axis, data_shape.size());
-#if defined(COREML_ENABLE_MLPROGRAM)
// CoreML's softmax matches ONNX's softmax behavior since opset 13.
// For opset < 13, we need to reshape to 2D and set axis to -1 to simulate onnx softmax behavior.
// [B,D,...](onnx softmax opset 12, axis=1)->[B,D*...](CoreML softmax, axis=-1)->[B,D,...](reshape back)
@@ -78,9 +77,7 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
AddOperationOutput(*reshape2, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(reshape2));
}
- } else // NOLINT
-#endif
- {
+ } else {
if (node.SinceVersion() >= 13 || (data_shape.size() == 2)) {
auto* coreml_softmaxnd = layer->mutable_softmaxnd();
coreml_softmaxnd->set_axis(axis);
diff --git a/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc
index 6372f3136123b..717d344982473 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc
@@ -56,7 +56,6 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
return std::make_tuple(remainder, chunk_size);
};
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
std::unique_ptr split_op = model_builder.CreateOperation(node, "split");
@@ -95,9 +94,7 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
}
model_builder.AddOperation(std::move(split_op));
- } else
-#endif
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
auto* coreml_splitnd = layer->mutable_splitnd();
coreml_splitnd->set_axis(axis);
diff --git a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
index a1b3a18265c70..81bef11906b74 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
@@ -58,7 +58,6 @@ void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const
}
}
-#if defined(COREML_ENABLE_MLPROGRAM)
void HandleX86ArchUnsqueezeScalarInput(ModelBuilder& model_builder,
const Node& node, const logging::Logger& logger) {
const auto& input_defs(node.InputDefs());
@@ -74,7 +73,6 @@ void HandleX86ArchUnsqueezeScalarInput(ModelBuilder& model_builder,
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
}
-#endif
Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const Node& node,
@@ -83,7 +81,7 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
auto* coreml_squeeze = layer->mutable_squeeze();
TensorShapeVector axes;
GetAxes(model_builder, node, axes);
-#if defined(COREML_ENABLE_MLPROGRAM)
+
const auto& input_defs(node.InputDefs());
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -105,9 +103,7 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
}
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
- } else // NOLINT
-#endif
- {
+ } else {
if (axes.empty()) {
coreml_squeeze->set_squeezeall(true);
} else {
diff --git a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc
index 831c4cf4d08ba..5bb7e4c11967a 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc
@@ -34,7 +34,6 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
ORT_RETURN_IF_NOT(perm.size() == input_dims, "Perm and input should have same dimension");
}
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -44,9 +43,7 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
- } else
-#endif // defined(COREML_ENABLE_MLPROGRAM)
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
*layer->mutable_transpose()->mutable_axes() = {perm.cbegin(), perm.cend()};
diff --git a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
index bc3cad004aec1..dd495894ab8bb 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc
@@ -25,7 +25,6 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
const auto& op_type(node.OpType());
const auto& input_defs(node.InputDefs());
-#if defined(COREML_ENABLE_MLPROGRAM)
if (model_builder.CreateMLProgram()) {
using namespace CoreML::Specification::MILSpec;
@@ -58,9 +57,7 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
AddOperationOutput(*op, *node.OutputDefs()[0]);
model_builder.AddOperation(std::move(op));
- } else // NOLINT
-#endif // defined (COREML_ENABLE_MLPROGRAM)
- {
+ } else {
std::unique_ptr layer = model_builder.CreateNNLayer(node);
if (op_type == "Sqrt") {
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc
index f8952301d59a9..3551f5759201e 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc
@@ -17,20 +17,17 @@
#include "core/providers/coreml/shape_utils.h"
#include "core/optimizer/initializer.h"
-#if defined(COREML_ENABLE_MLPROGRAM)
// includes from coremltools-src in _deps
#include "modelpackage/src/ModelPackage.hpp"
#include "mlmodel/src/MILBlob/Blob/StorageWriter.hpp"
using MILBlob::Blob::StorageWriter;
-#endif
-
using namespace CoreML::Specification;
namespace onnxruntime {
namespace coreml {
namespace {
-#if defined(COREML_ENABLE_MLPROGRAM)
+
// Should the initializer be written to file or kept as an immediate value
bool ShouldWriteInitializerToWeightsFile(const ONNX_NAMESPACE::TensorProto& tensor_proto) {
// https://github.com/apple/coremltools/blob/dbb0094fd0cb936469e35320bf37e866ef7a1da4/coremltools/converters/mil/backend/mil/load.py#L51-L57
@@ -388,8 +385,6 @@ void CreateEmptyFile(const std::string& filename) {
ORT_ENFORCE(file.is_open(), "Failed to open file ", filename);
}
-#endif // defined(COREML_ENABLE_MLPROGRAM)
-
std::string GetModelOutputPath(const CoreMLOptions& coreml_options,
const GraphViewer& graph_viewer,
const logging::Logger& logger) {
@@ -479,7 +474,6 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge
}
if (create_ml_program_) {
-#if defined(COREML_ENABLE_MLPROGRAM)
coreml_model_->set_specificationversion(CoreMLSpecVersion());
MILSpec::Program& mlprogram = *coreml_model_->mutable_mlprogram();
mlprogram.set_version(1);
@@ -503,12 +497,6 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge
"CoreML Model Weights");
auto weights_info = mlpackage_->findItem(weights_id);
weights_file_writer_ = std::make_unique(weights_info->path() + "/weight.bin");
-#else
- // should never happen due to handling in coreml_execution_provider.cc
- // throw here so all other code in this class can assume create_ml_program_ is only ever true in a build
- // where ML Program support is enabled.
- ORT_THROW("ML Program is not enabled in this build");
-#endif
} else {
// We support CoreML Specification Version 4 (Core ML 3)
coreml_model_->set_specificationversion(4);
@@ -561,7 +549,6 @@ void ModelBuilder::AddLayer(std::unique_ptr layer) {
/*
* ML Program related helpers
*/
-#if defined(COREML_ENABLE_MLPROGRAM)
const std::string& ModelBuilder::GetSafeName(const std::string& name) {
// Check the name is valid according to the MILSpec rules
// `Identifiers, generally used for names and keys, must match the regular expression [A-Za-z\_][A-Za-z0-9\_@]*.`
@@ -737,8 +724,6 @@ std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::st
return AddTensorValueAsConstantOperation(op_type, value_type, std::move(input_value));
}
-#endif // defined(COREML_ENABLE_MLPROGRAM)
-
/*
* General implementation
*/
@@ -775,13 +760,10 @@ Status ModelBuilder::RegisterInitializers() {
continue;
}
-#if defined(COREML_ENABLE_MLPROGRAM)
if (create_ml_program_) {
MILSpec::Value coreml_tensor = OnnxTensorToCoreMLTensor(tensor, *weights_file_writer_);
ORT_IGNORE_RETURN_VALUE(AddConstantOperation(name, std::move(coreml_tensor)));
- } else
-#endif
- {
+ } else {
std::unique_ptr layer = std::make_unique();
layer->set_name(GetUniqueName("initializer_" + name));
@@ -915,7 +897,6 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i
return Status::OK();
}
-#if defined(COREML_ENABLE_MLPROGRAM)
if (create_ml_program_) {
if (is_input) {
// the model inputs need to be wired up as args to the 'main' function.
@@ -935,7 +916,6 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i
*mlprogram_main_block_->mutable_outputs()->Add() = name;
}
}
-#endif // defined(COREML_ENABLE_MLPROGRAM)
return Status::OK();
}
@@ -980,11 +960,9 @@ Status ModelBuilder::CreateModel() {
ORT_RETURN_IF_ERROR(ProcessNodes());
ORT_RETURN_IF_ERROR(RegisterModelOutputs());
-#if defined(COREML_ENABLE_MLPROGRAM)
if (create_ml_program_) {
SanitizeNames();
}
-#endif
return Status::OK();
}
@@ -992,7 +970,6 @@ Status ModelBuilder::CreateModel() {
Status ModelBuilder::SaveModel() {
std::string output_path = model_output_path_;
-#if defined(COREML_ENABLE_MLPROGRAM)
if (create_ml_program_) {
// we need to jump through some hoops to get the model path the ML Program load wants.
std::string tmp_model_path = model_output_path_ + "/tmp/model.mlmodel";
@@ -1003,7 +980,6 @@ Status ModelBuilder::SaveModel() {
auto model_info = mlpackage_->findItem(model_id);
output_path = model_info->path();
}
-#endif
// scope this so the stream is closed and flushed by the ofstream dtor
{
@@ -1012,19 +988,16 @@ Status ModelBuilder::SaveModel() {
ORT_RETURN_IF_NOT(coreml_model_->SerializeToOstream(&stream), "Saving the CoreML model failed. Path=", output_path);
}
-#if defined(COREML_ENABLE_MLPROGRAM)
// need to delete the ModelPackage instance for it to write out the manifest. clear out the other ML Program
// related types as well.
mlprogram_main_block_ = nullptr;
mlpackage_.reset();
weights_file_writer_.reset();
-#endif
return Status::OK();
}
Status ModelBuilder::LoadModel(std::unique_ptr& model) {
-#if defined(COREML_ENABLE_MLPROGRAM)
if (create_ml_program_) {
// we need to provide the sanitized names for model inputs/outputs so that info is captured.
// the input/output matching when we execute the model from the CoreML EP is based on order, so the change
@@ -1058,9 +1031,7 @@ Status ModelBuilder::LoadModel(std::unique_ptr& model) {
std::move(scalar_outputs_),
std::move(int64_outputs_),
logger_, coreml_options_);
- } else
-#endif
- {
+ } else {
model = std::make_unique(model_output_path_,
std::move(onnx_input_names_),
std::move(onnx_output_names_),
@@ -1073,7 +1044,6 @@ Status ModelBuilder::LoadModel(std::unique_ptr& model) {
return model->LoadModel(); // load using CoreML API, including compilation
}
-#if defined(COREML_ENABLE_MLPROGRAM)
std::string_view ModelBuilder::AddConstant(std::string_view op_type, std::string_view value_type,
const ONNX_NAMESPACE::TensorProto& tensor,
std::optional> shape) {
@@ -1114,7 +1084,6 @@ std::string_view ModelBuilder::AddConstant(std::string_view op_type, std::string
return ret;
}
-#endif
// static
Status ModelBuilder::Build(const GraphViewer& graph_viewer, const logging::Logger& logger,
int32_t coreml_version, const CoreMLOptions& coreml_options,
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h
index 28c7dc42da581..f3012e8137e8c 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.h
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.h
@@ -9,7 +9,6 @@
#include "core/providers/coreml/model/model.h"
#include "core/providers/coreml/coreml_options.h"
-#if defined(COREML_ENABLE_MLPROGRAM)
// coremltools classes
namespace MPL {
class ModelPackage;
@@ -20,7 +19,6 @@ namespace Blob {
class StorageWriter;
}
} // namespace MILBlob
-#endif
namespace onnxruntime {
namespace coreml {
@@ -58,11 +56,7 @@ class ModelBuilder {
// Returns true if we are creating an ML Program
bool CreateMLProgram() const {
-#if defined(COREML_ENABLE_MLPROGRAM)
return create_ml_program_;
-#else
- return false;
-#endif
}
/*
@@ -76,7 +70,6 @@ class ModelBuilder {
// Add layer to the Core ML NeuralNetwork model
void AddLayer(std::unique_ptr layer);
-#if defined(COREML_ENABLE_MLPROGRAM)
/*
* MLProgram helpers
*/
@@ -147,7 +140,6 @@ class ModelBuilder {
// add the operation to the main function
void AddOperation(std::unique_ptr operation);
-#endif
/*
* General helpers
@@ -176,7 +168,6 @@ class ModelBuilder {
const logging::Logger& Logger() const { return logger_; }
private:
-#if defined(COREML_ENABLE_MLPROGRAM)
template
std::string_view AddConstantImpl(std::string_view op_type, std::string_view value_type, gsl::span value,
std::optional> shape = std::nullopt);
@@ -190,7 +181,6 @@ class ModelBuilder {
const std::string& AddConstantOperation(std::string_view name, COREML_SPEC::MILSpec::Value&& initializer);
const std::string& AddTensorValueAsConstantOperation(std::string_view op_type, std::string_view value_type,
COREML_SPEC::MILSpec::Value&& input_value);
-#endif
// Convert the ONNX model in graph_viewer_ to a CoreML::Specification::Model and serialize to disk.
// We then load it using CoreML in order to compile it.
@@ -237,7 +227,6 @@ class ModelBuilder {
uint32_t name_token_{0};
std::unordered_set unique_names_;
-#if defined(COREML_ENABLE_MLPROGRAM)
// mlprogram_main_ is the main block of the CoreML ML Program.
// It is set in CreateModel to the CoreML Model.mlprogram.functions['main'].block_specializations['CoreML']
// entry we create.
@@ -254,7 +243,6 @@ class ModelBuilder {
// This means an op builder author doesn't need to be aware of the renaming.
// https://github.com/apple/coremltools/blob/8b37641f243b1a3e81452feea311c6e30dcc9287/coremltools/converters/mil/mil/passes/defs/preprocess.py#L146-L149
std::unordered_map values_to_rename_;
-#endif
};
} // namespace coreml
diff --git a/onnxruntime/core/providers/coreml/coreml_options.cc b/onnxruntime/core/providers/coreml/coreml_options.cc
index 14ae55de9266b..c441a2eff56e0 100644
--- a/onnxruntime/core/providers/coreml/coreml_options.cc
+++ b/onnxruntime/core/providers/coreml/coreml_options.cc
@@ -15,18 +15,6 @@ CoreMLOptions::CoreMLOptions(uint32_t coreml_flags) {
create_mlprogram_ = (coreml_flags & COREML_FLAG_CREATE_MLPROGRAM) != 0;
enable_on_subgraph_ = (coreml_flags & COREML_FLAG_ENABLE_ON_SUBGRAPH) != 0;
-#if defined(COREML_ENABLE_MLPROGRAM)
- if (coreml::util::CoreMLVersion() < MINIMUM_COREML_MLPROGRAM_VERSION && create_mlprogram_ != 0) {
- LOGS_DEFAULT(WARNING) << "ML Program is not supported on this OS version. Falling back to NeuralNetwork.";
- create_mlprogram_ = false;
- }
-#else
- if (create_mlprogram_ != 0) {
- LOGS_DEFAULT(WARNING) << "ML Program is not supported in this build. Falling back to NeuralNetwork.";
- create_mlprogram_ = false;
- }
-#endif
-
compute_units_ = 0; // 0 for all
if (coreml_flags & COREML_FLAG_USE_CPU_ONLY) {
diff --git a/onnxruntime/core/providers/coreml/model/host_utils.h b/onnxruntime/core/providers/coreml/model/host_utils.h
index 145c64e5320d3..f654b4d5701b9 100644
--- a/onnxruntime/core/providers/coreml/model/host_utils.h
+++ b/onnxruntime/core/providers/coreml/model/host_utils.h
@@ -43,7 +43,13 @@
#define API_AVAILABLE_COREML7 API_AVAILABLE(macos(14), ios(17))
#define API_AVAILABLE_COREML8 API_AVAILABLE(macos(15), ios(18))
-// @available is used in implementation code
+// The macros above are used in header files to declare API availability.
+// The macros below are used in build-time checks to determine whether the APIs are available.
+#define CAN_BUILD_COREML8_OR_LATER (__MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 && __IPHONE_OS_VERSION_MAX_ALLOWED >= 180000)
+#define CAN_BUILD_COREML7_OR_LATER (__MAC_OS_X_VERSION_MAX_ALLOWED >= 140000 && __IPHONE_OS_VERSION_MAX_ALLOWED >= 170000)
+#define CAN_BUILD_COREML6_OR_LATER (__MAC_OS_X_VERSION_MAX_ALLOWED >= 130000 && __IPHONE_OS_VERSION_MAX_ALLOWED >= 160000)
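+// For example, CAN_BUILD_COREML8_OR_LATER is true only when building against the macOS 15 / iOS 18
+// (or newer) SDK, so code that requires CoreML 8-only headers can be excluded from older toolchains.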
+
+// @available is used in implementation code to check the availability of the APIs at runtime.
// Base required OS to run CoreML Specification Version 4 (Core ML 3)
#define HAS_COREML3_OR_LATER @available(macOS 10.15, iOS 13, *)
#define HAS_COREML4_OR_LATER @available(macOS 11, iOS 14, *)
@@ -54,8 +60,7 @@
#endif
-#define MINIMUM_COREML_VERSION 3 // first version we support
-#define MINIMUM_COREML_MLPROGRAM_VERSION 5 // first version where ML Program was available
+#define MINIMUM_COREML_VERSION 5 // first version we support
namespace onnxruntime {
namespace coreml {
diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm
index 5211b89ec17c6..71664021ea2fb 100644
--- a/onnxruntime/core/providers/coreml/model/model.mm
+++ b/onnxruntime/core/providers/coreml/model/model.mm
@@ -363,13 +363,12 @@ void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) {
#endif
}
-#if __has_include(<CoreML/MLOptimizationHints.h>)
+#if __has_include(<CoreML/MLOptimizationHints.h>) && CAN_BUILD_COREML8_OR_LATER
#define HAS_COREMLOPTIMIZATIONHINT 1
#else
#define HAS_COREMLOPTIMIZATIONHINT 0
#endif
-API_AVAILABLE_COREML8
void ConfigureOptimizationHints(MLModelConfiguration* config, const CoreMLOptions& coreml_options) {
#if HAS_COREMLOPTIMIZATIONHINT
MLOptimizationHints* optimizationHints = [[MLOptimizationHints alloc] init];
diff --git a/onnxruntime/core/providers/cpu/tensor/cast_op.cc b/onnxruntime/core/providers/cpu/tensor/cast_op.cc
index 35f3b12aeba35..639a49cb43a4f 100644
--- a/onnxruntime/core/providers/cpu/tensor/cast_op.cc
+++ b/onnxruntime/core/providers/cpu/tensor/cast_op.cc
@@ -254,11 +254,32 @@ struct TensorCasterNoSat {
// tensor MLFloat16 -> float
template <>
struct TensorCaster<MLFloat16, float> {
- void Cast(const OpKernelContext&, const TensorShape& shape, const Tensor& in, Tensor& out) const {
+ void Cast(const OpKernelContext& ctx, const TensorShape& shape, const Tensor& in, Tensor& out) const {
auto out_data = out.MutableData<float>();
auto in_data = in.Data<MLFloat16>();
const size_t shape_size = narrow<size_t>(shape.Size());
- MlasConvertHalfToFloatBuffer(in_data, out_data, shape_size);
+
+ // Only use the thread pool when the tensor is large enough to amortize the threading overhead.
+ if (shape_size <= 128000) {
+ MlasConvertHalfToFloatBuffer(in_data, out_data, shape_size);
+ return;
+ }
+ // Estimate the number of compute cycles for the available SIMD implementation.
+ const auto& cpu_info = CPUIDInfo::GetCPUIDInfo();
+ double num_compute_cycles;
+ if (cpu_info.HasAVX2()) {
+ num_compute_cycles = static_cast<double>(shape_size >> 2);
+ } else if (cpu_info.HasSSE3()) {
+ num_compute_cycles = static_cast<double>(shape_size >> 1);
+ } else {
+ num_compute_cycles = static_cast<double>(shape_size * 10);
+ }
+
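+ // Cost hint for the thread pool: roughly 2 bytes read per fp16 element and 4 bytes written per
+ // float element, plus the estimated compute cycles, so it can decide how to partition the work.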
+ concurrency::ThreadPool::TryParallelFor(ctx.GetOperatorThreadPool(), shape_size,
+ {shape_size * 2.f, shape_size * 4.f, num_compute_cycles},
+ [in_data, out_data](std::ptrdiff_t first_span, std::ptrdiff_t last_span) {
+ MlasConvertHalfToFloatBuffer(in_data + first_span, out_data + first_span, static_cast<size_t>(last_span - first_span));
+ });
}
};
diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh
index 55935a9eae86d..2d2551a156099 100644
--- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh
+++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh
@@ -441,6 +441,9 @@ __device__ __inline__ T _Sign(T a) { return _Signum(a, std::is_signed()); }
template <>
__device__ __inline__ half _Sign(half a) { return _Signum(a, std::true_type()); }
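+// BFloat16 is a class type rather than a built-in arithmetic type, so force the signed _Signum
+// overload explicitly instead of relying on std::is_signed, which would be false for it.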
+template <>
+__device__ __inline__ BFloat16 _Sign(BFloat16 a) { return _Signum(static_cast<float>(a), std::true_type()); }
+
template
__device__ __inline__ T _Normcdf(T a);
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index 54fb4429c0536..886fddd8f8a27 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -1013,6 +1013,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, Abs);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, Abs);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, Abs);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, BFloat16, Abs);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int8_t, Neg);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int16_t, Neg);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int32_t, Neg);
@@ -1188,6 +1189,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, Sign);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, Sign);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, Sign);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, BFloat16, Sign);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 13, BFloat16, Add);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 13, BFloat16, Sub);
@@ -1996,6 +1998,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
@@ -2169,6 +2172,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc
index fb03b4326c4e8..86a1b0f5b6102 100644
--- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc
+++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc
@@ -213,19 +213,19 @@ Status IsNaN::ComputeInternal(OpKernelContext* context) const {
UNARY_OP_TYPED(name, ver, float) \
UNARY_OP_TYPED(name, ver, double)
-#define UNARY_OP_CSILHFD(name, ver) \
+#define UNARY_OP_CSILHFDX(name, ver) \
UNARY_OP_TYPED(name, ver, int8_t) \
UNARY_OP_TYPED(name, ver, int16_t) \
UNARY_OP_TYPED(name, ver, int32_t) \
UNARY_OP_TYPED(name, ver, int64_t) \
UNARY_OP_HFDX(name, ver)
-#define UNARY_OP_BWUZCSILHFD(name, ver) \
- UNARY_OP_TYPED(name, ver, uint8_t) \
- UNARY_OP_TYPED(name, ver, uint16_t) \
- UNARY_OP_TYPED(name, ver, uint32_t) \
- UNARY_OP_TYPED(name, ver, uint64_t) \
- UNARY_OP_CSILHFD(name, ver)
+#define UNARY_OP_BWUZCSILHFDX(name, ver) \
+ UNARY_OP_TYPED(name, ver, uint8_t) \
+ UNARY_OP_TYPED(name, ver, uint16_t) \
+ UNARY_OP_TYPED(name, ver, uint32_t) \
+ UNARY_OP_TYPED(name, ver, uint64_t) \
+ UNARY_OP_CSILHFDX(name, ver)
UNARY_OP_VERSIONED_BWUZCSILHFD(Abs, 6, 12)
UNARY_OP_VERSIONED_CSILHFD(Neg, 6, 12)
@@ -237,8 +237,8 @@ UNARY_OP_VERSIONED_HFD(Log, 6, 12)
UNARY_OP_VERSIONED_HFD(Exp, 6, 12)
UNARY_OP_VERSIONED_HFD(Erf, 9, 12)
-UNARY_OP_BWUZCSILHFD(Abs, 13)
-UNARY_OP_CSILHFD(Neg, 13)
+UNARY_OP_BWUZCSILHFDX(Abs, 13)
+UNARY_OP_CSILHFDX(Neg, 13)
UNARY_OP_HFD(Floor, 13)
UNARY_OP_HFD(Ceil, 13)
UNARY_OP_HFD(Reciprocal, 13)
@@ -246,7 +246,7 @@ UNARY_OP_HFDX(Sqrt, 13)
UNARY_OP_HFD(Log, 13)
UNARY_OP_HFDX(Exp, 13)
UNARY_OP_HFDX(Erf, 13)
-UNARY_OP_BWUZCSILHFD(Sign, 13)
+UNARY_OP_BWUZCSILHFDX(Sign, 13)
UNARY_LOGICALOP_NOT_TYPED(1, bool)
UNARY_OP_HFD(Round, 11)
diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
index 3d66003fb2bca..77579dfc793ee 100644
--- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
+++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
@@ -47,6 +47,7 @@ OpBuilderRegistrations::OpBuilderRegistrations() {
CreateSimpleOpBuilder("Sin", *this);
CreateSimpleOpBuilder("Sqrt", *this);
CreateSimpleOpBuilder("Sub", *this);
+ CreateSimpleOpBuilder("Sum", *this);
CreateSimpleOpBuilder("Tanh", *this);
CreateSimpleOpBuilder("Concat", *this);
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
index a79f4056083c5..df9d0de8e0e3e 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
@@ -158,6 +158,7 @@ class BaseOpBuilder : public IOpBuilder {
{"Softmax", QNN_OP_SOFTMAX},
{"Sqrt", QNN_OP_ELEMENT_WISE_SQUARE_ROOT},
{"Sub", QNN_OP_ELEMENT_WISE_SUBTRACT},
+ {"Sum", QNN_OP_ELEMENT_WISE_ADD},
{"Tanh", QNN_OP_TANH},
{"Transpose", QNN_OP_TRANSPOSE},
{"GridSample", QNN_OP_GRID_SAMPLE},
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
index 48c637cd2e951..229d86082f6dc 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
@@ -56,11 +56,18 @@ Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper,
padding_mode.c_str());
}
- // ONNX's Min and Max operators accept a variable number of inputs (i.e., variadic).
- // However, QNN's Min and Max operators must take in exactly two inputs.
+ // ONNX's Min, Max, and Sum operators accept a variable number of inputs (i.e., variadic).
+ // However, QNN's Min, Max, and Add operators must take in exactly two inputs.
if (op_type == "Min" || op_type == "Max") {
ORT_RETURN_IF_NOT(node_unit.Inputs().size() == 2,
- "QNN EP only supports Min and Max operators with exactly 2 inputs.");
+ "QNN EP only supports ", op_type.c_str(), " operator with exactly 2 inputs.");
+ }
+
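+ // ONNX Sum maps to QNN_OP_ELEMENT_WISE_ADD (see base_op_builder.h), which is strictly binary,
+ // so only the two-input form of Sum is supported.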
+ if (op_type == "Sum") {
+ size_t inputs_num = node_unit.Inputs().size();
+ ORT_RETURN_IF_NOT(inputs_num == 2,
+ "QNN EP supports Sum operator with QNN_OP_ELEMENT_WISE_ADD, which takes exactly 2 inputs. Got ONNX's Sum operator with ",
+ std::to_string(inputs_num).c_str(), " inputs.");
}
if (op_type == "DequantizeLinear") {
diff --git a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc
index 13004af25726d..6891b8159b090 100644
--- a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc
+++ b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc
@@ -4,15 +4,18 @@
#include "core/providers/common.h"
#include "core/providers/webgpu/math/binary_elementwise_ops.h"
#include "core/providers/webgpu/shader_helper.h"
+#include "core/providers/webgpu/string_macros.h"
#include "core/providers/webgpu/webgpu_supported_types.h"
namespace onnxruntime {
namespace webgpu {
Status BinaryElementwiseProgram::GenerateShaderCode(ShaderHelper& shader) const {
- const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
- const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
+ const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias);
+ const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias);
const auto& c = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
+ shader.AdditionalImplementation() << additional_impl_;
+
shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size");
// check whether can use element-wise mode.
@@ -142,8 +145,15 @@ Status BinaryElementwise::ComputeInternal(ComputeContext& context) const {
}
uint32_t vec_size = onnxruntime::narrow((size + 3) / 4);
+
+ std::string additional_impl;
+ if (get_additional_impl_) {
+ additional_impl = get_additional_impl_(lhs_tensor->GetElementType(), rhs_tensor->GetElementType());
+ }
+
BinaryElementwiseProgram program{kernel_name_,
expression_,
+ additional_impl,
is_broadcast,
is_lhs_scalar,
is_rhs_scalar,
@@ -273,7 +283,28 @@ WEBGPU_BINARY_VERSIONED_KERNEL(Sub, 7, 12, Sub, WebGpuSupportedNumberTypes())
WEBGPU_BINARY_VERSIONED_KERNEL(Sub, 13, 13, Sub, WebGpuSupportedNumberTypes())
WEBGPU_BINARY_KERNEL(Sub, 14, Sub, WebGpuSupportedNumberTypes())
-WEBGPU_BINARY_IMPL(Pow, "output_value_t(pow(vec4(a), vec4(b)))")
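+// WGSL's built-in pow() is not defined for negative bases, so Pow is emulated with a helper that
+// returns 1 for a zero exponent, NaN for a negative base with a fractional exponent, and applies
+// the base's sign for odd integer exponents. For int32 inputs the result is rounded back to an integer.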
+std::string GetPowImpl(int lhs_element_type, int /* rhs_element_type */) {
+ SS(s, 1024);
+ std::string round_str;
+ if (lhs_element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32) {
+ round_str = "round";
+ }
+
+ s << "fn pow_custom(a : input_a_element_t, b : f32) -> input_a_element_t {\n"
+ " if (b == 0.0) {\n"
+ " return input_a_element_t(1.0);\n"
+ " } else if (a < input_a_element_t(0.0) && b != floor(b)) {\n"
+ " return input_a_element_t(pow(f32(a), b)); // NaN\n"
+ " }\n"
+ << " return select(sign(a), input_a_element_t(1.0), round(abs(b) % 2.0) != 1.0) * input_a_element_t(" << round_str << "(pow(f32(abs(a)), b)));\n"
+ << "}\n"
+ "fn pow_v(a : vec4, b : vec4) -> vec4 {\n"
+ " return vec4(pow_custom(a.x, f32(b.x)), pow_custom(a.y, f32(b.y)), pow_custom(a.z, f32(b.z)), pow_custom(a.w, f32(b.w)));\n"
+ "}\n";
+ return SS_GET(s);
+}
+
+WEBGPU_BINARY_IMPL(Pow, "pow_v(a, b)", GetPowImpl)
WEBGPU_BINARY_VERSIONED_KERNEL(Pow, 7, 11, Pow, WebGpuSupportedNumberTypes())
WEBGPU_BINARY_VERSIONED_KERNEL_2(Pow, 12, 12, Pow, WebGpuSupportedNumberTypes(), WebGpuSupportedNumberTypes())
WEBGPU_BINARY_VERSIONED_KERNEL_2(Pow, 13, 14, Pow, WebGpuSupportedNumberTypes(), WebGpuSupportedNumberTypes())
diff --git a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h
index 84cbcdf3244d8..f80accfb934f8 100644
--- a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h
+++ b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h
@@ -14,11 +14,13 @@ class BinaryElementwiseProgram final : public Program
public:
BinaryElementwiseProgram(const std::string& kernel_name,
const std::string& expression,
+ const std::string& additional_impl,
const bool is_broadcast,
const bool is_lhs_scalar,
const bool is_rhs_scalar,
const bool vectorize) : Program{kernel_name},
expression_{expression},
+ additional_impl_{additional_impl},
is_broadcast_{is_broadcast},
is_lhs_scalar_{is_lhs_scalar},
is_rhs_scalar_{is_rhs_scalar},
@@ -29,7 +31,8 @@ class BinaryElementwiseProgram final : public Program
WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"vec_size", ProgramUniformVariableDataType::Uint32});
private:
- std::string expression_;
+ std::string_view expression_;
+ std::string_view additional_impl_;
bool is_broadcast_;
bool is_lhs_scalar_;
bool is_rhs_scalar_;
@@ -38,11 +41,15 @@ class BinaryElementwiseProgram final : public Program
class BinaryElementwise : public WebGpuKernel {
public:
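+ // Optional hook that returns extra WGSL helper code (e.g., the custom Pow implementation) for the
+ // given input element types; the returned snippet is injected via ShaderHelper::AdditionalImplementation().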
+ using GetAdditionalImplementationFunction = std::string (*)(int lhs_element_type, int rhs_element_type);
+
BinaryElementwise(const OpKernelInfo& info,
const std::string& kernel_name,
- const std::string& expression) : WebGpuKernel{info},
- kernel_name_{kernel_name},
- expression_{expression} {}
+ const std::string& expression,
+ const GetAdditionalImplementationFunction get_additional_impl = nullptr) : WebGpuKernel{info},
+ kernel_name_{kernel_name},
+ expression_{expression},
+ get_additional_impl_{get_additional_impl} {}
protected:
Status ComputeInternal(ComputeContext& context) const final;
@@ -50,6 +57,7 @@ class BinaryElementwise : public WebGpuKernel {
private:
std::string kernel_name_;
std::string expression_;
+ const GetAdditionalImplementationFunction get_additional_impl_;
};
} // namespace webgpu
diff --git a/onnxruntime/core/providers/webgpu/math/cum_sum.cc b/onnxruntime/core/providers/webgpu/math/cum_sum.cc
new file mode 100644
index 0000000000000..bc4cd70a238fc
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/math/cum_sum.cc
@@ -0,0 +1,98 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/webgpu/math/cum_sum.h"
+#include "core/providers/webgpu/shader_helper.h"
+#include "core/providers/webgpu/webgpu_supported_types.h"
+
+namespace onnxruntime {
+namespace webgpu {
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+ CumSum,
+ kOnnxDomain,
+ 11, 13,
+ kWebGpuExecutionProvider,
+ (*KernelDefBuilder::Create())
+ .TypeConstraint("T", WebGpuSupportedFloatTypes())
+ .TypeConstraint("T2", {DataTypeImpl::GetTensorType(),
+ DataTypeImpl::GetTensorType()})
+ .InputMemoryType(OrtMemTypeCPU, 1),
+ CumSum);
+
+ONNX_OPERATOR_KERNEL_EX(
+ CumSum,
+ kOnnxDomain,
+ 14,
+ kWebGpuExecutionProvider,
+ (*KernelDefBuilder::Create())
+ .TypeConstraint("T", WebGpuSupportedFloatTypes())
+ .TypeConstraint("T2", {DataTypeImpl::GetTensorType(),
+ DataTypeImpl::GetTensorType()})
+ .InputMemoryType(OrtMemTypeCPU, 1),
+ CumSum);
+
+Status CumSumProgram::GenerateShaderCode(ShaderHelper& shader) const {
+ const ShaderVariableHelper& input = shader.AddInput("input", ShaderUsage::UseUniform);
+ const ShaderVariableHelper& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
+
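+ // Each invocation produces one output element by accumulating the input along `axis` over the
+ // [first, last) range implied by the `exclusive` and `reverse` uniforms.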
+ shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size")
+ << "var input_indices = " << input.OffsetToIndices("global_idx") << ";\n"
+ << "var sum : output_value_t = 0;\n"
+ << "var first : i32 = 0;\n"
+ << "if (uniforms.reverse == 1) {\n"
+ << " first = i32(" + input.IndicesGet("input_indices", "uniforms.axis") + ");\n"
+ << " if (uniforms.exclusive == 1) { first += 1; }\n"
+ << "}\n\n"
+ << "var last : i32 = 0;\n"
+ << "if (uniforms.reverse == 1) {\n"
+ << " last = i32(" << GetElementAt("uniforms.input_shape", "uniforms.axis", input.Rank()) << ");\n"
+ << "} else {\n"
+ << " last = i32(" + input.IndicesGet("input_indices", "uniforms.axis") + ");\n"
+ << " if (uniforms.exclusive == 0) { last += 1; }\n"
+ << "}\n\n"
+ << "for (var i : i32 = first; i < last; i++) {\n"
+ << " " << input.IndicesSet("input_indices", "uniforms.axis", "u32(i)") << ";\n"
+ << " sum = sum + " << input.GetByIndices("input_indices") << ";\n"
+ << "}\n"
+ << output.SetByOffset("global_idx", "sum");
+
+ return Status::OK();
+}
+
+Status CumSum::ComputeInternal(ComputeContext& context) const {
+ const auto* input_tensor = context.Input(0);
+ const TensorShape& input_shape = input_tensor->Shape();
+ int64_t input_rank = input_shape.NumDimensions();
+
+ const auto* axis_tensor = context.Input(1);
+ const auto* axis_data = axis_tensor->Data();
+ int64_t axis = static_cast(axis_data[0]);
+
+ ORT_ENFORCE(-input_rank <= axis && axis < input_rank, "Axis input must be in the range [-input_rank, input_rank).");
+ // Handle negative axis
+ if (axis < 0) {
+ axis += input_rank;
+ }
+
+ auto* output_tensor = context.Output(0, input_shape);
+ int64_t output_size = output_tensor->Shape().Size();
+
+ if (output_size == 0) {
+ return Status::OK();
+ }
+
+ CumSumProgram program{};
+ program
+ .AddInput({input_tensor})
+ .AddOutput({output_tensor, ProgramTensorMetadataDependency::TypeAndRank})
+ .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
+ .AddUniformVariables({{static_cast<uint32_t>(output_size)},
+ {static_cast<uint32_t>(axis)},
+ {static_cast<uint32_t>(exclusive_)},
+ {static_cast<uint32_t>(reverse_)}});
+ return context.RunProgram(program);
+}
+
+} // namespace webgpu
+} // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/providers/webgpu/math/cum_sum.h b/onnxruntime/core/providers/webgpu/math/cum_sum.h
new file mode 100644
index 0000000000000..6a66ee0ed7b04
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/math/cum_sum.h
@@ -0,0 +1,39 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/providers/webgpu/webgpu_kernel.h"
+#include "core/providers/webgpu/program.h"
+
+namespace onnxruntime {
+namespace webgpu {
+
+class CumSumProgram final : public Program {
+ public:
+ CumSumProgram() : Program{"CumSum"} {}
+
+ Status GenerateShaderCode(ShaderHelper& sh) const override;
+
+ WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32},
+ {"axis", ProgramUniformVariableDataType::Uint32},
+ {"exclusive", ProgramUniformVariableDataType::Uint32},
+ {"reverse", ProgramUniformVariableDataType::Uint32});
+};
+
+class CumSum final : public WebGpuKernel {
+ public:
+ CumSum(const OpKernelInfo& info) : WebGpuKernel(info) {
+ exclusive_ = info.GetAttrOrDefault("exclusive", 0);
+ reverse_ = info.GetAttrOrDefault("reverse", 0);
+ }
+
+ Status ComputeInternal(ComputeContext& context) const override;
+
+ private:
+ int64_t exclusive_;
+ int64_t reverse_;
+};
+
+} // namespace webgpu
+} // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
index 1a56cafdb3952..11fa30c798809 100644
--- a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc
@@ -11,7 +11,7 @@
namespace onnxruntime {
namespace webgpu {
-#define REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceOp, begin, end) \
+#define REGISTER_REDUCE_VERSIONED_KERNEL(ReduceOp, begin, end) \
ONNX_OPERATOR_VERSIONED_KERNEL_EX( \
ReduceOp, \
kOnnxDomain, \
@@ -20,7 +20,16 @@ namespace webgpu {
(*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()), \
ReduceOp);
-#define REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceOp, version) \
+#define REGISTER_REDUCE_VERSIONED_KERNEL_WITH_AXIS_IN_INPUT(ReduceOp, begin, end) \
+ ONNX_OPERATOR_VERSIONED_KERNEL_EX( \
+ ReduceOp, \
+ kOnnxDomain, \
+ begin, end, \
+ kWebGpuExecutionProvider, \
+ (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()).InputMemoryType(OrtMemTypeCPUInput, 1), \
+ ReduceOp);
+
+#define REGISTER_REDUCE_KERNEL(ReduceOp, version) \
ONNX_OPERATOR_KERNEL_EX( \
ReduceOp, \
kOnnxDomain, \
@@ -29,58 +38,66 @@ namespace webgpu {
(*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()).InputMemoryType(OrtMemTypeCPUInput, 1), \
ReduceOp);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMean, 18);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMean, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMean, 11, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMean, 13, 17);
+REGISTER_REDUCE_KERNEL(ReduceMean, 18);
+
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 11, 11);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 12, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 13, 17);
+REGISTER_REDUCE_VERSIONED_KERNEL_WITH_AXIS_IN_INPUT(ReduceMax, 18, 19);
+REGISTER_REDUCE_KERNEL(ReduceMax, 20);
+
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 11, 11);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 12, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 13, 17);
+REGISTER_REDUCE_VERSIONED_KERNEL_WITH_AXIS_IN_INPUT(ReduceMin, 18, 19);
+REGISTER_REDUCE_KERNEL(ReduceMin, 20);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 11, 11);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 12, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 18, 19);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMax, 20);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSum, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSum, 11, 12);
+REGISTER_REDUCE_KERNEL(ReduceSum, 13);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 11, 11);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 12, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 18, 19);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMin, 20);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceProd, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceProd, 11, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceProd, 13, 17);
+REGISTER_REDUCE_KERNEL(ReduceProd, 18);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSum, 13);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL1, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL1, 11, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL1, 13, 17);
+REGISTER_REDUCE_KERNEL(ReduceL1, 18);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceProd, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceProd, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceProd, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceProd, 18);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL2, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL2, 11, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL2, 13, 17);
+REGISTER_REDUCE_KERNEL(ReduceL2, 18);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL1, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL1, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL1, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceL1, 18);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSum, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSum, 11, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSum, 13, 17);
+REGISTER_REDUCE_KERNEL(ReduceLogSum, 18);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL2, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL2, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL2, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceL2, 18);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSumSquare, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSumSquare, 11, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSumSquare, 13, 17);
+REGISTER_REDUCE_KERNEL(ReduceSumSquare, 18);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSum, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSum, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSum, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceLogSum, 18);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSumExp, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSumExp, 11, 12);
+REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSumExp, 13, 17);
+REGISTER_REDUCE_KERNEL(ReduceLogSumExp, 18);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSumSquare, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSumSquare, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSumSquare, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSumSquare, 18);
+REGISTER_REDUCE_VERSIONED_KERNEL(ArgMax, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ArgMax, 11, 12);
+REGISTER_REDUCE_KERNEL(ArgMax, 13);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSumExp, 1, 10);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSumExp, 11, 12);
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSumExp, 13, 17);
-REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceLogSumExp, 18);
+REGISTER_REDUCE_VERSIONED_KERNEL(ArgMin, 1, 10);
+REGISTER_REDUCE_VERSIONED_KERNEL(ArgMin, 11, 12);
+REGISTER_REDUCE_KERNEL(ArgMin, 13);
Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias);
@@ -105,6 +122,9 @@ Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const {
std::stringstream ss;
std::string index = "i" + std::to_string(i);
ss << "for (var " << index << " : u32 = 0; " << index << " < " << input.IndicesGet("uniforms.input_shape", i) << "; " << index << "++) {\n";
+ if (loop_body.find("last_index") != std::string::npos) {
+ ss << "let last_index = " + index + ";\n";
+ }
ss << input.IndicesSet("input_indices", i, index) << ";\n";
ss << loop_body << "\n";
ss << "}\n";
@@ -171,12 +191,13 @@ Status ReduceKernel::ComputeInternal(ComputeContext& context)
auto output = context.Output(0, input_tensor->Shape());
// We need to run the operation even for scalar inputs for these ops
const auto code = GetOpSpecificCode(input_tensor);
+ constexpr uint32_t output_size = 1;
+ constexpr uint32_t reduce_axes = 0;
ReduceKernelProgram program(name_, keepdims_, noop_with_empty_axes_, input_axes, code, false);
- std::vector reduce_axes = {0};
program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank})
.AddOutput({output, ProgramTensorMetadataDependency::TypeAndRank})
.SetDispatchGroupSize(1)
- .AddUniformVariables({{1}, {static_cast(noop_with_empty_axes_ ? 1 : 0)}, {reduce_axes}});
+ .AddUniformVariables({{output_size}, {static_cast<uint32_t>(noop_with_empty_axes_ ? 1 : 0)}, {reduce_axes}});
return context.RunProgram(program);
} else {
// For other ops, or when axes is empty with noop_with_empty_axes_ true, just copy the input
@@ -328,5 +349,25 @@ ReduceOpSpecificCode ReduceLogSumExp::GetOpSpecificCode(const Tensor* input_tens
return code;
}
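+
+// ArgMin/ArgMax reuse the generic reduce loop: the header seeds the best element and index, the body
+// updates them via `last_index`, and select_last_index switches the comparison to <=/>= so ties keep
+// the last matching index.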
+ReduceOpSpecificCode ArgMin::GetOpSpecificCode(const Tensor* input_tensor) const {
+ ORT_UNUSED_PARAMETER(input_tensor);
+ std::string op = (select_last_index_) ? "<=" : "<";
+ std::string loop_header = "var best_element = first_element; var best_index = u32(0);";
+ std::string loop_body = "if (current_element " + op + " best_element) { best_element = current_element; best_index = last_index; };";
+ std::string loop_footer = "let output_value = output_value_t(best_index);";
+ ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
+ return code;
+}
+
+ReduceOpSpecificCode ArgMax::GetOpSpecificCode(const Tensor* input_tensor) const {
+ ORT_UNUSED_PARAMETER(input_tensor);
+ std::string op = (select_last_index_) ? ">=" : ">";
+ std::string loop_header = "var best_element = first_element; var best_index = u32(0);";
+ std::string loop_body = "if (current_element " + op + " best_element) { best_element = current_element; best_index = last_index; };";
+ std::string loop_footer = "let output_value = output_value_t(best_index);";
+ ReduceOpSpecificCode code({loop_header, loop_body, loop_footer});
+ return code;
+}
+
} // namespace webgpu
-} // namespace onnxruntime
\ No newline at end of file
+} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
index 291d931f41c05..70ae6d3c71eb9 100644
--- a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
+++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h
@@ -119,5 +119,17 @@ class ReduceLogSumExp final : public ReduceKernel {
ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
};
+class ArgMin final : public ReduceKernel {
+ public:
+ ArgMin(const OpKernelInfo& info) : ReduceKernel(info, "ArgMin", true) {}
+ ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
+};
+
+class ArgMax final : public ReduceKernel {
+ public:
+ ArgMax(const OpKernelInfo& info) : ReduceKernel(info, "ArgMax", true) {}
+ ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override;
+};
+
} // namespace webgpu
} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
index dfb2e4b6ce665..aacbcc5fb4f0a 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -297,12 +297,12 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13,
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 12, MatMul);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, MatMul);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, float, ArgMax);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, float, ArgMax);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, float, ArgMax);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, float, ArgMin);
-class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, float, ArgMin);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, float, ArgMin);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ArgMax);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ArgMax);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ArgMax);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ArgMin);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ArgMin);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ArgMin);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Softmax);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, Softmax);
@@ -624,13 +624,13 @@ std::unique_ptr RegisterKernels() {
// BuildKernelCreateInfo,
// BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
@@ -713,8 +713,8 @@ std::unique_ptr RegisterKernels() {
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
- // BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
+ BuildKernelCreateInfo,
// BuildKernelCreateInfo,
// BuildKernelCreateInfo,
// BuildKernelCreateInfo,
diff --git a/onnxruntime/core/providers/webnn/allocator.cc b/onnxruntime/core/providers/webnn/allocator.cc
index 9c5cd651e1f00..8cf5b8cd72a5c 100644
--- a/onnxruntime/core/providers/webnn/allocator.cc
+++ b/onnxruntime/core/providers/webnn/allocator.cc
@@ -16,7 +16,7 @@ void* WebNNTensorAllocator::Alloc(size_t size) {
// We don't need to transfer the tensor to an MLTensor, so we don't need to allocate an MLTensor id.
return nullptr;
}
- void* p = EM_ASM_PTR({ return Module.jsepReserveTensorId(); });
+ void* p = EM_ASM_PTR({ return Module.webnnReserveTensorId(); });
allocations_[p] = size;
stats_.num_allocs++;
stats_.bytes_in_use += SafeInt(size);
@@ -27,7 +27,7 @@ void WebNNTensorAllocator::Free(void* p) {
if (p == nullptr) {
return;
}
- EM_ASM({ Module.jsepReleaseTensorId($0); }, p);
+ EM_ASM({ Module.webnnReleaseTensorId($0); }, p);
size_t size = allocations_[p];
stats_.bytes_in_use -= size;
allocations_.erase(p);
diff --git a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc
index 6814b019f699c..08580ab2861d7 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc
@@ -54,9 +54,9 @@ Status ArgMaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
const auto& op_type = node.OpType();
if (op_type == "ArgMax") {
- output = model_builder.GetBuilder().call("argMax", input, narrow(axis), options);
+ output = model_builder.GetBuilder().call("argMax", input, SafeInt(axis).Ref(), options);
} else if (op_type == "ArgMin") {
- output = model_builder.GetBuilder().call("argMin", input, narrow(axis), options);
+ output = model_builder.GetBuilder().call("argMin", input, SafeInt(axis).Ref(), options);
} else {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "ArgMaxMinOpBuilder, unknown op: ", op_type);
}
diff --git a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc
index 113cc3df5438d..63e2345243282 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include
#include
#include "core/providers/shared/utils/utils.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc
index 1bbe56ef9b477..ee2512ddd8b5a 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc
index 1361b7dd5c14b..4c393e8a9bdba 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc
index be30c5520d62e..99be8f75771ad 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/framework/tensorprotoutils.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
@@ -64,8 +63,8 @@ Status CumSumOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
options.set("label", node.Name());
emscripten::val output = emscripten::val::object();
- output = model_builder.GetBuilder().call("cumulativeSum", input, gsl::narrow(webnn_axis),
- options);
+ output = model_builder.GetBuilder().call("cumulativeSum", input,
+ SafeInt(webnn_axis).Ref(), options);
model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output));
return Status::OK();
}
diff --git a/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc
index 55746bb1f61f0..f3363b1e186d5 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc
index 1f51e26fecfa5..6cee04bac3e2b 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/framework/tensorprotoutils.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc
index 8402f05d8e234..3f813f08279e7 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/framework/tensorprotoutils.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc
index d0ece026a7048..c4ff280b95b6e 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc
index c5cc8e86bb308..1f24124745a19 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
@@ -86,9 +85,9 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
else if (extended_a_shape) {
std::vector new_shape;
for (size_t i = 0; i < b_shape.size() - 2; i++) {
- new_shape.push_back(narrow(b_shape[i]));
+ new_shape.push_back(SafeInt(b_shape[i]));
}
- new_shape.push_back(narrow(b_shape.back()));
+ new_shape.push_back(SafeInt(b_shape.back()));
output = model_builder.GetBuilder().call("reshape",
output,
emscripten::val::array(new_shape),
@@ -98,7 +97,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
else if (extended_b_shape) {
std::vector new_shape;
for (size_t i = 0; i < a_shape.size() - 1; i++) {
- new_shape.push_back(narrow(a_shape[i]));
+ new_shape.push_back(SafeInt(a_shape[i]));
}
output = model_builder.GetBuilder().call("reshape",
output,
diff --git a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc
index a090c21fe3356..5b57df7f184e7 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc
index d8373a45e4423..e8f26af928ab3 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc
index b338d27986279..79ad3574e07e9 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc
index 1bb6523c6f86a..ed62b2bd69618 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc
index 93ad933d71c34..b23fbeba1ddc8 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc
index 0a438e98ad737..2fc47430a1c66 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/framework/tensorprotoutils.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc
index 58515d2df54ec..eec6911a686cf 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc
@@ -4,7 +4,6 @@
#include
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/framework/tensorprotoutils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc
index 1688dfc97a0c4..0a84835ee9fc0 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc
@@ -89,7 +89,7 @@ Status RotaryEmbeddingOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_build
emscripten::val wnn_builder = model_builder.GetBuilder();
NodeAttrHelper helper(node);
- const bool interleaved = gsl::narrow_cast(helper.Get("interleaved", 0));
+ const bool interleaved = static_cast(helper.Get("interleaved", 0));
uint32_t num_heads = helper.Get("num_heads", 0);
uint32_t rotary_embedding_dim = helper.Get("rotary_embedding_dim", 0);
diff --git a/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc
index 4adc5920de7fa..468c0e24a3e88 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/framework/tensorprotoutils.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc
index b1b737b114998..0e754b53e78d1 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc
index 06dbacf995a28..21b44b1066694 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
@@ -75,8 +74,8 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
// Check that the splits evenly divide.
if (split_count > 0 && splits.empty() && input_shape[axis] % split_count != 0) {
// Divide inputs into variable size outputs:
- splits.insert(splits.end(), split_count - 1, narrow(input_shape[axis]) / split_count);
- splits.insert(splits.end(), narrow(input_shape[axis]) % split_count);
+ splits.insert(splits.end(), split_count - 1, SafeInt(input_shape[axis]) / split_count);
+ splits.insert(splits.end(), SafeInt(input_shape[axis]) % split_count);
}
if (splits.empty()) {
diff --git a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc
index a3be9d7e2ceee..5687b1133c628 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc
index 672a3a510d54d..259bb0552b7c7 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/framework/tensorprotoutils.h"
#include "core/optimizer/initializer.h"
#include "core/providers/common.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc
index ac440e0119bac..452071f469c4f 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc
@@ -2,7 +2,6 @@
// Copyright (c) Intel Corporation. All rights reserved.
// Licensed under the MIT License.
-#include "core/common/safeint.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/webnn/builders/helper.h"
#include "core/providers/webnn/builders/model_builder.h"
diff --git a/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc
index 0c818533918a4..f2092d6163713 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc
@@ -59,7 +59,7 @@ Status TriangularOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
std::vector unpacked_tensor;
ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(diagonal_tensor, unpacked_tensor));
const auto diagonal = *reinterpret_cast(unpacked_tensor.data());
- options.set("diagonal", narrow(diagonal));
+ options.set("diagonal", SafeInt(diagonal).Ref());
}
output = model_builder.GetBuilder().call("triangular", input, options);
diff --git a/onnxruntime/core/providers/webnn/builders/model.cc b/onnxruntime/core/providers/webnn/builders/model.cc
index 35964d85862e4..40fdfc609e6a1 100644
--- a/onnxruntime/core/providers/webnn/builders/model.cc
+++ b/onnxruntime/core/providers/webnn/builders/model.cc
@@ -8,7 +8,6 @@
#include "core/common/common.h"
#include "core/common/inlined_containers.h"
#include "core/common/logging/logging.h"
-#include "core/common/safeint.h"
#include "core/graph/onnx_protobuf.h"
#include "core/providers/common.h"
#include "model.h"
@@ -157,7 +156,7 @@ onnxruntime::common::Status Model::Compute(const InlinedHashMap& inputs,
const InlinedHashMap& outputs) {
- auto jsepEnsureTensor = emscripten::val::module_property("jsepEnsureTensor");
+ auto webnnEnsureTensor = emscripten::val::module_property("webnnEnsureTensor");
auto promises = emscripten::val::array();
for (const auto& [_, tensor] : inputs) {
emscripten::val shape = emscripten::val::array();
@@ -165,7 +164,7 @@ onnxruntime::common::Status Model::Dispatch(const InlinedHashMap(dim);
shape.call("push", dim_val);
}
- auto ml_tensor = jsepEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, true);
+ auto ml_tensor = webnnEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, true);
promises.call("push", ml_tensor);
}
for (const auto& [_, tensor] : outputs) {
@@ -174,7 +173,7 @@ onnxruntime::common::Status Model::Dispatch(const InlinedHashMap(dim);
shape.call("push", dim_val);
}
- auto ml_tensor = jsepEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, false);
+ auto ml_tensor = webnnEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, false);
promises.call("push", ml_tensor);
}
auto ml_tensors = emscripten::val::global("Promise").call("all", promises).await();
diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.cc b/onnxruntime/core/providers/webnn/builders/model_builder.cc
index 661b2ad7056c2..399cc5faf6273 100644
--- a/onnxruntime/core/providers/webnn/builders/model_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/model_builder.cc
@@ -9,7 +9,6 @@
#include "helper.h"
#include "op_builder_factory.h"
-#include "core/common/safeint.h"
#include "core/framework/tensorprotoutils.h"
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
@@ -140,13 +139,13 @@ Status ModelBuilder::RegisterInitializers() {
ORT_RETURN_IF_ERROR(utils::GetExternalDataInfo(
tensor, graph_viewer_.ModelPath(), external_file_path, data_offset, tensor_byte_size));
- auto jsepRegisterMLConstant = emscripten::val::module_property("jsepRegisterMLConstant");
- operand = jsepRegisterMLConstant(emscripten::val(external_file_path),
- static_cast(data_offset),
- static_cast(tensor_byte_size),
- wnn_builder_,
- desc,
- should_convert_int64_to_int32);
+ auto webnnRegisterMLConstant = emscripten::val::module_property("webnnRegisterMLConstant");
+ operand = webnnRegisterMLConstant(emscripten::val(external_file_path),
+ static_cast(data_offset),
+ static_cast(tensor_byte_size),
+ wnn_builder_,
+ desc,
+ should_convert_int64_to_int32);
} else {
if (tensor.has_raw_data()) {
tensor_ptr = reinterpret_cast(const_cast(tensor.raw_data().c_str()));
@@ -288,7 +287,7 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i
desc.set("dataType", emscripten::val("int32"));
}
wnn_operands_.insert(std::make_pair(name, wnn_builder_.call("input", name, desc)));
- emscripten::val::module_property("jsepRegisterGraphInput")(name);
+ emscripten::val::module_property("webnnRegisterGraphInput")(name);
input_names_.push_back(name);
} else {
output_names_.push_back(name);
diff --git a/onnxruntime/core/providers/webnn/data_transfer.cc b/onnxruntime/core/providers/webnn/data_transfer.cc
index 44e9bf9edf3d9..aa85277b72453 100644
--- a/onnxruntime/core/providers/webnn/data_transfer.cc
+++ b/onnxruntime/core/providers/webnn/data_transfer.cc
@@ -29,11 +29,11 @@ common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const {
const auto& dst_device = dst.Location().device;
if (dst_device.Type() == OrtDevice::GPU) {
- EM_ASM({ Module.jsepUploadTensor($0, HEAPU8.subarray($1, $1 + $2)); }, dst_data, reinterpret_cast(src_data), bytes);
+ EM_ASM({ Module.webnnUploadTensor($0, HEAPU8.subarray($1, $1 + $2)); }, dst_data, reinterpret_cast(src_data), bytes);
} else {
- auto jsepDownloadTensor = emscripten::val::module_property("jsepDownloadTensor");
+ auto webnnDownloadTensor = emscripten::val::module_property("webnnDownloadTensor");
auto subarray = emscripten::typed_memory_view(bytes, static_cast(dst_data));
- jsepDownloadTensor(reinterpret_cast(src_data), subarray).await();
+ webnnDownloadTensor(reinterpret_cast(src_data), subarray).await();
}
}
diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc
index 7410ff66add30..2da7c6499933a 100644
--- a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc
+++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc
@@ -10,7 +10,6 @@
#include "core/framework/kernel_registry.h"
#include "core/graph/graph_viewer.h"
#include "core/session/onnxruntime_cxx_api.h"
-#include "core/common/safeint.h"
#include "core/providers/webnn/allocator.h"
#include "core/providers/webnn/data_transfer.h"
#include "core/providers/partitioning_utils.h"
@@ -284,7 +283,7 @@ class WebNNMemcpy : public OpKernel {
explicit WebNNMemcpy(const OpKernelInfo& info) : OpKernel(info) {}
Status Compute(OpKernelContext* context) const override {
- auto jsepEnsureTensor = emscripten::val::module_property("jsepEnsureTensor");
+ auto webnnEnsureTensor = emscripten::val::module_property("webnnEnsureTensor");
const auto* X = context->Input(0);
ORT_ENFORCE(X != nullptr, "Memcpy: input tensor is null");
auto* Y = context->Output(0, X->Shape());
@@ -294,10 +293,10 @@ class WebNNMemcpy : public OpKernel {
shape.call("push", SafeInt(dim).Ref());
}
- jsepEnsureTensor(emscripten::val::undefined(),
- reinterpret_cast(Y->MutableDataRaw()),
- Y->GetElementType(),
- shape, false)
+ webnnEnsureTensor(emscripten::val::undefined(),
+ reinterpret_cast(Y->MutableDataRaw()),
+ Y->GetElementType(),
+ shape, false)
.await();
const auto* data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc
index c828ae9400174..8d972f7d63bc1 100644
--- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc
+++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc
@@ -57,7 +57,7 @@ bool MaxPool::IsOnnxNodeSupported(const NodeUnit& node_unit,
// input of maxpool could be fp16/fp32/fp64,i8/u8 according to ONNX
if (x_type == nullptr ||
(x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT &&
-// because pool_fp16_op_test can be enabled by other preprocessor, for example, COREML_ENABLE_MLPROGRAM
+// because pool_fp16_op_test can be enabled by other preprocessor, for example, USE_COREML
#ifdef XNNPACK_FP16_SUPPORTED
x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 &&
#endif
diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py
index 6b5f7526cc506..785eb9c485d25 100644
--- a/onnxruntime/python/onnxruntime_inference_collection.py
+++ b/onnxruntime/python/onnxruntime_inference_collection.py
@@ -15,6 +15,9 @@
from onnxruntime.capi import _pybind_state as C
if typing.TYPE_CHECKING:
+ import numpy as np
+ import numpy.typing as npt
+
import onnxruntime
@@ -59,22 +62,22 @@ def export_adapter(self, file_path: os.PathLike):
"""
self._adapter.export_adapter(file_path)
- def get_format_version(self):
+ def get_format_version(self) -> int:
return self._adapter.format_version
- def set_adapter_version(self, adapter_version: int):
+ def set_adapter_version(self, adapter_version: int) -> None:
self._adapter.adapter_version = adapter_version
- def get_adapter_version(self):
+ def get_adapter_version(self) -> int:
return self._adapter.adapter_version
- def set_model_version(self, model_version: int):
+ def set_model_version(self, model_version: int) -> None:
self._adapter.model_version = model_version
- def get_model_version(self):
+ def get_model_version(self) -> int:
return self._adapter.model_version
- def set_parameters(self, params: dict[str, OrtValue]):
+ def set_parameters(self, params: dict[str, OrtValue]) -> None:
self._adapter.parameters = {k: v._ortvalue for k, v in params.items()}
def get_parameters(self) -> dict[str, OrtValue]:
@@ -174,27 +177,27 @@ def __init__(self):
self._sess = None
self._enable_fallback = True
- def get_session_options(self):
+ def get_session_options(self) -> onnxruntime.SessionOptions:
"Return the session options. See :class:`onnxruntime.SessionOptions`."
return self._sess_options
- def get_inputs(self):
+ def get_inputs(self) -> Sequence[onnxruntime.NodeArg]:
"Return the inputs metadata as a list of :class:`onnxruntime.NodeArg`."
return self._inputs_meta
- def get_outputs(self):
+ def get_outputs(self) -> Sequence[onnxruntime.NodeArg]:
"Return the outputs metadata as a list of :class:`onnxruntime.NodeArg`."
return self._outputs_meta
- def get_overridable_initializers(self):
+ def get_overridable_initializers(self) -> Sequence[onnxruntime.NodeArg]:
"Return the inputs (including initializers) metadata as a list of :class:`onnxruntime.NodeArg`."
return self._overridable_initializers
- def get_modelmeta(self):
+ def get_modelmeta(self) -> onnxruntime.ModelMetadata:
"Return the metadata. See :class:`onnxruntime.ModelMetadata`."
return self._model_meta
- def get_providers(self):
+ def get_providers(self) -> Sequence[str]:
"Return list of registered execution providers."
return self._providers
@@ -202,7 +205,7 @@ def get_provider_options(self):
"Return registered execution providers' configurations."
return self._provider_options
- def set_providers(self, providers=None, provider_options=None):
+ def set_providers(self, providers=None, provider_options=None) -> None:
"""
Register the input list of execution providers. The underlying session is re-created.
@@ -224,13 +227,13 @@ def set_providers(self, providers=None, provider_options=None):
# recreate the underlying C.InferenceSession
self._reset_session(providers, provider_options)
- def disable_fallback(self):
+ def disable_fallback(self) -> None:
"""
Disable session.run() fallback mechanism.
"""
self._enable_fallback = False
- def enable_fallback(self):
+ def enable_fallback(self) -> None:
"""
Enable session.Run() fallback mechanism. If session.Run() fails due to an internal Execution Provider failure,
reset the Execution Providers enabled for this session.
@@ -249,7 +252,7 @@ def _validate_input(self, feed_input_names):
f"Required inputs ({missing_input_names}) are missing from input feed ({feed_input_names})."
)
- def run(self, output_names, input_feed, run_options=None):
+ def run(self, output_names, input_feed, run_options=None) -> Sequence[np.ndarray | SparseTensor | list | dict]:
"""
Compute the predictions.
@@ -308,7 +311,7 @@ def callback(results: np.ndarray, user_data: MyData, err: str) -> None:
output_names = [output.name for output in self._outputs_meta]
return self._sess.run_async(output_names, input_feed, callback, user_data, run_options)
- def run_with_ort_values(self, output_names, input_dict_ort_values, run_options=None):
+ def run_with_ort_values(self, output_names, input_dict_ort_values, run_options=None) -> Sequence[OrtValue]:
"""
Compute the predictions.
@@ -367,7 +370,7 @@ def get_profiling_start_time_ns(self):
"""
return self._sess.get_profiling_start_time_ns
- def io_binding(self):
+ def io_binding(self) -> IOBinding:
"Return an onnxruntime.IOBinding object`."
return IOBinding(self)
@@ -550,7 +553,7 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi
self._provider_options = self._sess.get_provider_options()
self._profiling_start_time_ns = self._sess.get_profiling_start_time_ns
- def _reset_session(self, providers, provider_options):
+ def _reset_session(self, providers, provider_options) -> None:
"release underlying session object."
# meta data references session internal structures
# so they must be set to None to decrement _sess reference count.
@@ -721,7 +724,7 @@ class OrtValue:
This class provides APIs to construct and deal with OrtValues.
"""
- def __init__(self, ortvalue, numpy_obj=None):
+ def __init__(self, ortvalue: C.OrtValue, numpy_obj: np.ndarray | None = None):
if isinstance(ortvalue, C.OrtValue):
self._ortvalue = ortvalue
# Hold a ref count to the numpy object if the OrtValue is backed directly
@@ -733,11 +736,11 @@ def __init__(self, ortvalue, numpy_obj=None):
"`Provided ortvalue` needs to be of type `onnxruntime.capi.onnxruntime_pybind11_state.OrtValue`"
)
- def _get_c_value(self):
+ def _get_c_value(self) -> C.OrtValue:
return self._ortvalue
- @staticmethod
- def ortvalue_from_numpy(numpy_obj, device_type="cpu", device_id=0):
+ @classmethod
+ def ortvalue_from_numpy(cls, numpy_obj: np.ndarray, /, device_type="cpu", device_id=0) -> OrtValue:
"""
Factory method to construct an OrtValue (which holds a Tensor) from a given Numpy object
A copy of the data in the Numpy object is held by the OrtValue only if the device is NOT cpu
@@ -749,7 +752,7 @@ def ortvalue_from_numpy(numpy_obj, device_type="cpu", device_id=0):
# Hold a reference to the numpy object (if device_type is 'cpu') as the OrtValue
# is backed directly by the data buffer of the numpy object and so the numpy object
# must be around until this OrtValue instance is around
- return OrtValue(
+ return cls(
C.OrtValue.ortvalue_from_numpy(
numpy_obj,
C.OrtDevice(
@@ -761,8 +764,8 @@ def ortvalue_from_numpy(numpy_obj, device_type="cpu", device_id=0):
numpy_obj if device_type.lower() == "cpu" else None,
)
- @staticmethod
- def ortvalue_from_numpy_with_onnx_type(data, onnx_element_type: int):
+ @classmethod
+ def ortvalue_from_numpy_with_onnx_type(cls, data: np.ndarray, /, onnx_element_type: int) -> OrtValue:
"""
This method creates an instance of OrtValue on top of the numpy array.
No data copy is made and the lifespan of the resulting OrtValue should never
@@ -771,12 +774,14 @@ def ortvalue_from_numpy_with_onnx_type(data, onnx_element_type: int):
when we want to use an ONNX data type that is not supported by numpy.
:param data: numpy.ndarray.
- :param onnx_elemenet_type: a valid onnx TensorProto::DataType enum value
+ :param onnx_element_type: a valid onnx TensorProto::DataType enum value
"""
- return OrtValue(C.OrtValue.ortvalue_from_numpy_with_onnx_type(data, onnx_element_type), data)
+ return cls(C.OrtValue.ortvalue_from_numpy_with_onnx_type(data, onnx_element_type), data)
- @staticmethod
- def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu", device_id: int = 0):
+ @classmethod
+ def ortvalue_from_shape_and_type(
+ cls, shape: Sequence[int], element_type, device_type: str = "cpu", device_id: int = 0
+ ) -> OrtValue:
"""
Factory method to construct an OrtValue (which holds a Tensor) from given shape and element_type
@@ -788,7 +793,7 @@ def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu",
# Integer for onnx element type (see https://onnx.ai/onnx/api/mapping.html).
# This is helpful for some data type (like TensorProto.BFLOAT16) that is not available in numpy.
if isinstance(element_type, int):
- return OrtValue(
+ return cls(
C.OrtValue.ortvalue_from_shape_and_onnx_type(
shape,
element_type,
@@ -800,7 +805,7 @@ def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu",
)
)
- return OrtValue(
+ return cls(
C.OrtValue.ortvalue_from_shape_and_type(
shape,
element_type,
@@ -812,77 +817,77 @@ def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu",
)
)
- @staticmethod
- def ort_value_from_sparse_tensor(sparse_tensor):
+ @classmethod
+ def ort_value_from_sparse_tensor(cls, sparse_tensor: SparseTensor) -> OrtValue:
"""
The function will construct an OrtValue instance from a valid SparseTensor
The new instance of OrtValue will assume the ownership of sparse_tensor
"""
- return OrtValue(C.OrtValue.ort_value_from_sparse_tensor(sparse_tensor._get_c_tensor()))
+ return cls(C.OrtValue.ort_value_from_sparse_tensor(sparse_tensor._get_c_tensor()))
- def as_sparse_tensor(self):
+ def as_sparse_tensor(self) -> SparseTensor:
"""
The function will return SparseTensor contained in this OrtValue
"""
return SparseTensor(self._ortvalue.as_sparse_tensor())
- def data_ptr(self):
+ def data_ptr(self) -> int:
"""
Returns the address of the first element in the OrtValue's data buffer
"""
return self._ortvalue.data_ptr()
- def device_name(self):
+ def device_name(self) -> str:
"""
Returns the name of the device where the OrtValue's data buffer resides e.g. cpu, cuda, cann
"""
return self._ortvalue.device_name().lower()
- def shape(self):
+ def shape(self) -> Sequence[int]:
"""
Returns the shape of the data in the OrtValue
"""
return self._ortvalue.shape()
- def data_type(self):
+ def data_type(self) -> str:
"""
- Returns the data type of the data in the OrtValue
+ Returns the data type of the data in the OrtValue. E.g. 'tensor(int64)'
"""
return self._ortvalue.data_type()
- def element_type(self):
+ def element_type(self) -> int:
"""
Returns the proto type of the data in the OrtValue
if the OrtValue is a tensor.
"""
return self._ortvalue.element_type()
- def has_value(self):
+ def has_value(self) -> bool:
"""
Returns True if the OrtValue corresponding to an
optional type contains data, else returns False
"""
return self._ortvalue.has_value()
- def is_tensor(self):
+ def is_tensor(self) -> bool:
"""
Returns True if the OrtValue contains a Tensor, else returns False
"""
return self._ortvalue.is_tensor()
- def is_sparse_tensor(self):
+ def is_sparse_tensor(self) -> bool:
"""
Returns True if the OrtValue contains a SparseTensor, else returns False
"""
return self._ortvalue.is_sparse_tensor()
- def is_tensor_sequence(self):
+ def is_tensor_sequence(self) -> bool:
"""
Returns True if the OrtValue contains a Tensor Sequence, else returns False
"""
return self._ortvalue.is_tensor_sequence()
- def numpy(self):
+ def numpy(self) -> np.ndarray:
"""
Returns a Numpy object from the OrtValue.
Valid only for OrtValues holding Tensors. Throws for OrtValues holding non-Tensors.
@@ -890,7 +895,7 @@ def numpy(self):
"""
return self._ortvalue.numpy()
- def update_inplace(self, np_arr):
+ def update_inplace(self, np_arr) -> None:
"""
Update the OrtValue in place with a new Numpy array. The numpy contents
are copied over to the device memory backing the OrtValue. It can be used
@@ -948,7 +953,7 @@ class SparseTensor:
depending on the format
"""
- def __init__(self, sparse_tensor):
+ def __init__(self, sparse_tensor: C.SparseTensor):
"""
Internal constructor
"""
@@ -960,11 +965,17 @@ def __init__(self, sparse_tensor):
"`Provided object` needs to be of type `onnxruntime.capi.onnxruntime_pybind11_state.SparseTensor`"
)
- def _get_c_tensor(self):
+ def _get_c_tensor(self) -> C.SparseTensor:
return self._tensor
- @staticmethod
- def sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device):
+ @classmethod
+ def sparse_coo_from_numpy(
+ cls,
+ dense_shape: npt.NDArray[np.int64],
+ values: np.ndarray,
+ coo_indices: npt.NDArray[np.int64],
+ ort_device: OrtDevice,
+ ) -> SparseTensor:
"""
Factory method to construct a SparseTensor in COO format from given arguments
@@ -985,12 +996,17 @@ def sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device):
For strings and objects, it will create a copy of the arrays in CPU memory as ORT does not support those
on other devices and their memory can not be mapped.
"""
- return SparseTensor(
- C.SparseTensor.sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device._get_c_device())
- )
+ return cls(C.SparseTensor.sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device._get_c_device()))
- @staticmethod
- def sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices, ort_device):
+ @classmethod
+ def sparse_csr_from_numpy(
+ cls,
+ dense_shape: npt.NDArray[np.int64],
+ values: np.ndarray,
+ inner_indices: npt.NDArray[np.int64],
+ outer_indices: npt.NDArray[np.int64],
+ ort_device: OrtDevice,
+ ) -> SparseTensor:
"""
Factory method to construct a SparseTensor in CSR format from given arguments
@@ -1011,7 +1027,7 @@ def sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices, ort
For strings and objects, it will create a copy of the arrays in CPU memory as ORT does not support those
on other devices and their memory can not be mapped.
"""
- return SparseTensor(
+ return cls(
C.SparseTensor.sparse_csr_from_numpy(
dense_shape,
values,
@@ -1021,7 +1037,7 @@ def sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices, ort
)
)
- def values(self):
+ def values(self) -> np.ndarray:
"""
The method returns a numpy array that is backed by the native memory
if the data type is numeric. Otherwise, the returned numpy array that contains
@@ -1093,19 +1109,19 @@ def format(self):
"""
return self._tensor.format
- def dense_shape(self):
+ def dense_shape(self) -> npt.NDArray[np.int64]:
"""
Returns a numpy array(int64) containing a dense shape of a sparse tensor
"""
return self._tensor.dense_shape()
- def data_type(self):
+ def data_type(self) -> str:
"""
Returns a string data type of the data in the OrtValue
"""
return self._tensor.data_type()
- def device_name(self):
+ def device_name(self) -> str:
"""
Returns the name of the device where the SparseTensor data buffers reside e.g. cpu, cuda
"""
diff --git a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc
index 4611dc9082734..e22445edc0f5b 100644
--- a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc
+++ b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc
@@ -404,7 +404,7 @@ TYPED_TEST(LayerNormTest, LayerNorm17_opset) {
// Execution provider entry invalid.
// when other EPs support layer-norm fp16, this test should be updated to include them.
if (std::is_same::value) {
-#if !defined(COREML_ENABLE_MLPROGRAM)
+#if !defined(USE_COREML)
return;
#endif
}
diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
index b1779ded4a675..81323cb51a887 100644
--- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
@@ -389,6 +389,7 @@ TEST(MatMulNBits, Float32_Accuracy4) {
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
+ TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
@@ -458,6 +459,7 @@ TEST(MatMulNBits, Float16_Accuracy4) {
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
+ TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
TestMatMulNBitsTyped();
@@ -528,8 +530,10 @@ TEST(MatMulNBits, Float16Cuda) {
for (auto K : {16, 32, 64, 128, 256, 1024, 93, 1234}) {
for (auto block_size : {16, 32, 64, 128}) {
for (auto has_gidx : has_gidx_options) {
-#ifdef USE_DML
+#if defined(USE_DML)
RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f);
+#elif defined(USE_WEBGPU)
+ RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.03f);
#else
RunTest(M, N, K, block_size, 0, false, true, has_gidx);
RunTest(M, N, K, block_size, 0, true, true, has_gidx, false);
diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
index a9aa78b7a3229..3505193b77683 100644
--- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc
+++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
@@ -246,7 +246,7 @@ TEST(CoreMLExecutionProviderTest, TestOrtFormatModel) {
#endif
}
-#if defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_COREML)
// Names in CoreML cannot start with [0-9] or contain anything but "[a-z][A-Z][0-9]_"
// Test that we fix invalid names in model inputs, initializers and outputs.
// This is only enforced for ML Program, so we only do name sanitization when creating an ML Program format model.
diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc
index 724118d7419d2..9201da348e75c 100644
--- a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc
+++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc
@@ -125,7 +125,7 @@ TEST_F(ActivationOpTest, Relu) {
{}, {},
/*is_tensorrt_supported=*/false,
/*opset_version= */ 14);
-#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML)
TestActivationOp(
"Relu",
input_values_fp16,
@@ -139,7 +139,7 @@ TEST_F(ActivationOpTest, Relu) {
#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED
}
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML)
TEST_F(ActivationOpTest, Sigmoid_fp16) {
#ifdef USE_CUDA
int min_cuda_architecture = 530;
@@ -413,7 +413,7 @@ TEST_F(ActivationOpTest, LeakyRelu) {
{{"alpha", alpha}}, {});
}
-#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML)
TEST_F(ActivationOpTest, LeakyRelu_fp16) {
OpTester test("LeakyRelu", 11);
float alpha = 0.01f; // oneDNN set alpha equal to 0.01
diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.h b/onnxruntime/test/providers/cpu/activation/activation_op_test.h
index 59813f433dc41..04d116e29d3b0 100644
--- a/onnxruntime/test/providers/cpu/activation/activation_op_test.h
+++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.h
@@ -105,7 +105,7 @@ class ActivationOpTest : public ::testing::Test {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution dist(low, high);
-#ifdef COREML_ENABLE_MLPROGRAM
+#ifdef USE_COREML
// please check onnxruntime/onnxruntime/core/providers/coreml/builders/helper.cc:81
std::vector batch_size_list = {1, 2, 4, 9, 100};
#else
diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
index d87ee861752c7..fbd9d10a56c77 100644
--- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc
@@ -32,7 +32,7 @@ void TestBinaryFloat16(const char* op_name,
bool enable_bf16 = true) {
{
std::vector> execution_providers;
-#ifdef COREML_ENABLE_MLPROGRAM
+#ifdef USE_COREML
execution_providers.push_back(DefaultCoreMLExecutionProvider(true));
#elif USE_CUDA
execution_providers.push_back(DefaultCudaExecutionProvider());
@@ -76,7 +76,7 @@ void TestUnaryFloat16(const char* op_name,
bool run_bf16 = true) {
{
std::vector> execution_providers;
-#ifdef COREML_ENABLE_MLPROGRAM
+#ifdef USE_COREML
execution_providers.push_back(DefaultCoreMLExecutionProvider(true));
#elif USE_CUDA
execution_providers.push_back(DefaultCudaExecutionProvider());
@@ -968,8 +968,15 @@ TEST(MathOpTest, Abs) {
test.Run();
}
-#ifdef USE_DNNL
+#if defined(USE_CUDA) || defined(USE_DNNL)
TEST(MathOpTest, Abs_bfloat16) {
+#ifdef USE_CUDA
+ int min_cuda_architecture = 530;
+ if (!HasCudaEnvironment(min_cuda_architecture)) {
+ LOGS_DEFAULT(WARNING) << "Hardware does NOT support BF16";
+ return;
+ }
+#endif
#ifdef USE_DNNL
if (!DnnlHasBF16Support()) {
LOGS_DEFAULT(WARNING) << "Hardware does NOT support BF16";
@@ -980,9 +987,19 @@ TEST(MathOpTest, Abs_bfloat16) {
std::vector dims{2, 2};
test_bf16.AddInput("X", dims, MakeBFloat16({1.0f, -2.0f, -0.0f, -10.0f}));
test_bf16.AddOutput("Y", dims, MakeBFloat16({1.0f, 2.0f, 0.0f, 10.0f}));
- test_bf16.Run();
+
+ std::vector> execution_providers;
+#if defined(USE_CUDA)
+ execution_providers.push_back(DefaultCudaExecutionProvider());
+#endif
+
+#if defined(USE_DNNL)
+ execution_providers.push_back(DefaultDnnlExecutionProvider());
+#endif
+
+ test_bf16.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
}
-#endif // USE_DNNL
+#endif // USE_CUDA || USE_DNNL
TEST(MathOpTest, Abs_int8) {
OpTester test("Abs");
@@ -1409,7 +1426,7 @@ TEST(MathOpTest, Pow_float16_float16) {
dims, {1.0f, 256.0f, 2.0f, 1.0f}, false);
}
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML)
TEST(MathOpTest, Pow_float_float16) {
OpTester test("Pow", 12);
std::vector dims{4};
@@ -1423,7 +1440,7 @@ TEST(MathOpTest, Pow_float_float16) {
execution_providers.push_back(DefaultCudaExecutionProvider());
#elif USE_ROCM
execution_providers.push_back(DefaultRocmExecutionProvider());
-#elif COREML_ENABLE_MLPROGRAM
+#elif USE_COREML
execution_providers.push_back(DefaultCoreMLExecutionProvider(true));
#endif
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
diff --git a/onnxruntime/test/providers/cpu/math/matmul_test.cc b/onnxruntime/test/providers/cpu/math/matmul_test.cc
index 298e870f348fc..dd8cbed15e5ef 100644
--- a/onnxruntime/test/providers/cpu/math/matmul_test.cc
+++ b/onnxruntime/test/providers/cpu/math/matmul_test.cc
@@ -210,7 +210,7 @@ TEST(MathOpTest, MatMulFloatType) {
RunMatMulTest(7, false, true);
}
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) || defined(USE_XNNPACK)
TEST(MathOpTest, MatMulFloat16) {
#ifdef USE_CUDA
int min_cuda_architecture = 530;
@@ -276,7 +276,7 @@ TEST(MathOpTest, MatMulZeroKInt32Type) {
RunMatMulZeroKTest();
}
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) || defined(USE_XNNPACK)
TEST(MathOpTest, MatMul_Float16) {
#ifdef USE_CUDA
int min_cuda_architecture = 530;
diff --git a/onnxruntime/test/providers/cpu/math/sign_test.cc b/onnxruntime/test/providers/cpu/math/sign_test.cc
index a01c2b26ea8b5..0da6a2ed55f2c 100644
--- a/onnxruntime/test/providers/cpu/math/sign_test.cc
+++ b/onnxruntime/test/providers/cpu/math/sign_test.cc
@@ -207,7 +207,7 @@ TEST(MathOpTest, Sign_MLFloat16) {
// test.Run(OpTester::ExpectResult::kExpectSuccess);
//}
-#if defined(USE_DNNL)
+#if defined(USE_CUDA) || defined(USE_DNNL)
TEST(MathOpTest, Sign_bfloat16) {
#ifdef USE_DNNL
if (!DnnlHasBF16Support()) {
@@ -228,9 +228,15 @@ TEST(MathOpTest, Sign_bfloat16) {
TestImpl(input.cbegin(), input.cend(), std::back_inserter(output));
test.AddOutput("output", input_dims, output);
std::vector> execution_providers;
+
+#if defined(USE_CUDA)
+ execution_providers.push_back(DefaultCudaExecutionProvider());
+#endif
+
#if defined(USE_DNNL)
execution_providers.push_back(DefaultDnnlExecutionProvider());
-#endif // USE_DNNL
+#endif
+
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
}
#endif
diff --git a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc
index f8ebca5ff9a1b..a529d572d7cca 100644
--- a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc
@@ -704,7 +704,7 @@ TEST(BatchNormTest, NonSpatial_Complicated) {
}
// Only CUDA and ROCm kernels have float 16 support
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML)
TEST(BatchNormTest, BatchNorm2d_fp16) {
vector X{-0.91221f, -0.283559f, 0.937637f, 2.09818f, -0.100199f, -0.608113f, 0.444562f, -1.07505f, 0.940591f,
-0.922262f, 0.0931303f, 0.69611f, 1.55187f, 0.159808f, 0.914874f, -1.24856f, -1.98928f, -0.331621f,
diff --git a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
index 4253e36e02548..d1350db8ec12e 100644
--- a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
@@ -3,7 +3,7 @@
#include "core/mlas/inc/mlas.h"
-#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK)
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) || defined(USE_XNNPACK)
#include "gtest/gtest.h"
#include "test/providers/provider_test_utils.h"
@@ -30,7 +30,7 @@ struct ConvOpAndTestAttributes {
/*
Please notice that, we have predefined macros in the head of the file
-#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML)
When we have these two macro defines, this UT will turn into green light and work.
If attributes.activation is set the NhwcFusedConv contrib op is used.
diff --git a/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc
index ac517193a2c77..3d8d188867023 100644
--- a/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc
@@ -6,7 +6,7 @@
#include "test/common/tensor_op_test_utils.h"
#include "test/util/include/default_providers.h"
-#ifdef COREML_ENABLE_MLPROGRAM
+#ifdef USE_COREML
using namespace std;
namespace onnxruntime {
namespace test {
diff --git a/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc
index 341bb8a4fc957..46b74f2c2eb9d 100644
--- a/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc
@@ -121,7 +121,7 @@ TEST(InstanceNormalizationOpTest, InstanceNormBatch2) {
}
// Only CUDA and ROCm kernels have float 16 support
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML)
TEST(InstanceNormalizationOpTest, InstanceNormBatch1_fp16) {
OpTester test("InstanceNormalization");
diff --git a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc
index d4e0af5011525..c14fc1fb62ae5 100644
--- a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc
@@ -3,7 +3,7 @@
#include "core/mlas/inc/mlas.h"
-#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK)
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) || defined(USE_XNNPACK)
#include "core/providers/cpu/nn/pool.h"
#include "gtest/gtest.h"
diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
index 24a8c8491b632..f1d612276174f 100644
--- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
@@ -70,7 +70,7 @@ TEST(PoolTest, MaxPool) {
// Only CUDA kernel has float 16 support
// Disable for now, still investigating the issue with cudnn lib
-#if defined(USE_CUDA) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_COREML)
TEST(PoolTest, MaxPool_F16) {
#if defined(USE_CUDA)
int min_cuda_architecture = 530;
diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
index 4bc97d035c7f7..92cd82c2c9420 100644
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@@ -1375,7 +1375,7 @@ TEST(ReductionOpTest, ReduceMax_double) {
test.Run();
}
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML)
TEST(ReductionOpTest, ReduceMax_half) {
OpTester test("ReduceMax");
test.AddAttribute("axes", std::vector{1, 2});
@@ -2158,7 +2158,7 @@ TEST(ReductionOpTest, ReduceMin_double) {
test.Run();
}
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML)
TEST(ReductionOpTest, ReduceMin_half) {
OpTester test("ReduceMin");
test.AddAttribute("axes", std::vector{0, 2});
@@ -2356,7 +2356,7 @@ TEST(ReductionOpTest, ReduceSum_int32) {
test.Run();
}
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML)
TEST(ReductionOpTest, ReduceSumHalfHalf) {
OpTester test("ReduceSum");
test.AddAttribute("keepdims", (int64_t)0);
diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
index e261d66a0d22a..d62ffe644e4cc 100644
--- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
+++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
@@ -694,7 +694,9 @@
"^test_gelu_tanh_2_expanded_cpu",
"^test_dynamicquantizelinear_expanded_cpu",
"^test_center_crop_pad_crop_negative_axes_hwc*", // failed due to new types or shape infer with negative axis for CenterCropPad.
- "^test_center_crop_pad_crop_negative_axes_hwc_expanded*" // failed due to new types or shape infer with negative axis for CenterCropPad.
+ "^test_center_crop_pad_crop_negative_axes_hwc_expanded*", // failed due to new types or shape infer with negative axis for CenterCropPad.
+ "^test_reduce_max_empty_set",
+ "^test_reduce_min_empty_set"
],
"current_failing_tests_pure_DML": [
"^test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_cpu",
diff --git a/onnxruntime/wasm/pre-jsep.js b/onnxruntime/wasm/pre-jsep.js
index a11c6d741d110..cca8da0525fbe 100644
--- a/onnxruntime/wasm/pre-jsep.js
+++ b/onnxruntime/wasm/pre-jsep.js
@@ -97,41 +97,40 @@ Module["jsepInit"] = (name, params) => {
// Functions called via emscripten::val::module_property need to be assigned by name so that the minifier doesn't
// change the name.
+ const backend = params[0];
[
- Module.jsepBackend,
- Module.jsepReserveTensorId,
- Module.jsepReleaseTensorId,
- Module["jsepEnsureTensor"],
- Module.jsepUploadTensor,
- Module["jsepDownloadTensor"],
- ] = params;
+ Module.webnnReserveTensorId,
+ Module.webnnReleaseTensorId,
+ Module["webnnEnsureTensor"],
+ Module.webnnUploadTensor,
+ Module["webnnDownloadTensor"],
+ ] = params.slice(1);
// This function is called from both JS and an EM_ASM block, it needs both a minifiable name and an explicit name.
- Module["jsepReleaseTensorId"] = Module.jsepReleaseTensorId;
- Module["jsepUploadTensor"] = Module.jsepUploadTensor;
+ Module["webnnReleaseTensorId"] = Module.webnnReleaseTensorId;
+ Module["webnnUploadTensor"] = Module.webnnUploadTensor;
// Functions called from JS also need to have explicit names.
- const backend = Module.jsepBackend;
- Module["jsepOnRunStart"] = (sessionId) => {
+ Module["webnnOnRunStart"] = (sessionId) => {
return backend["onRunStart"](sessionId);
};
- Module["jsepOnRunEnd"] = backend["onRunEnd"].bind(backend);
- Module["jsepRegisterMLContext"] = (sessionId, mlContext) => {
+ Module["webnnOnRunEnd"] = backend["onRunEnd"].bind(backend);
+ Module["webnnRegisterMLContext"] = (sessionId, mlContext) => {
backend["registerMLContext"](sessionId, mlContext);
};
- Module["jsepOnReleaseSession"] = (sessionId) => {
+ Module["webnnOnReleaseSession"] = (sessionId) => {
backend["onReleaseSession"](sessionId);
};
- Module["jsepCreateMLTensorDownloader"] = (tensorId, type) => {
+ Module["webnnCreateMLTensorDownloader"] = (tensorId, type) => {
return backend["createMLTensorDownloader"](tensorId, type);
};
- Module["jsepRegisterMLTensor"] = (sessionId, tensor, dataType, shape) => {
+ Module["webnnRegisterMLTensor"] = (sessionId, tensor, dataType, shape) => {
return backend["registerMLTensor"](sessionId, tensor, dataType, shape);
};
- Module["jsepCreateMLContext"] = (optionsOrGpuDevice) => {
+ Module["webnnCreateMLContext"] = (optionsOrGpuDevice) => {
return backend["createMLContext"](optionsOrGpuDevice);
};
- Module["jsepRegisterMLConstant"] = (
+ Module["webnnRegisterMLConstant"] = (
externalFilePath,
dataOffset,
dataLength,
@@ -149,9 +148,12 @@ Module["jsepInit"] = (name, params) => {
shouldConvertInt64ToInt32,
);
};
- Module['jsepRegisterGraphInput'] = backend['registerGraphInput'].bind(backend);
- Module['jsepIsGraphInput'] = backend['isGraphInput'].bind(backend);
- Module['jsepCreateTemporaryTensor'] = backend['createTemporaryTensor'].bind(backend);
- Module['jsepIsInt64Supported'] = backend['isInt64Supported'].bind(backend);
+ Module["webnnRegisterGraphInput"] =
+ backend["registerGraphInput"].bind(backend);
+ Module["webnnIsGraphInput"] = backend["isGraphInput"].bind(backend);
+
+ Module["webnnCreateTemporaryTensor"] =
+ backend["createTemporaryTensor"].bind(backend);
+ Module["webnnIsInt64Supported"] = backend["isInt64Supported"].bind(backend);
}
};
diff --git a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml
deleted file mode 100644
index 3cceadd1b8ef5..0000000000000
--- a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml
+++ /dev/null
@@ -1,241 +0,0 @@
-##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py ####
-### please do rerun set-trigger-rules.py ###
-trigger:
- branches:
- include:
- - main
- - rel-*
- paths:
- exclude:
- - docs/**
- - README.md
- - CONTRIBUTING.md
- - BUILD.md
- - 'js/web'
- - 'onnxruntime/core/providers/js'
-pr:
- branches:
- include:
- - main
- - rel-*
- paths:
- exclude:
- - docs/**
- - README.md
- - CONTRIBUTING.md
- - BUILD.md
- - 'js/web'
- - 'onnxruntime/core/providers/js'
-#### end trigger ####
-
-# Known Limits
-# 1. Anchors are not supported in GHA
-# https://github.community/t/support-for-yaml-anchors/16128/90
-# 2. today most cloud-based CI services are still lacking hardware acceleration support from the host VM,
-# which is the no.1 blocker for running tests on modern Android Emulators (especially on recent API levels) on CI.
-
-# It'd better to check out https://github.com/microsoft/onnxruntime/wiki/Leverage-Existing-Artifacts
-# to save debugging time.
-parameters:
-- name: specificArtifact
- displayName: Use Specific Artifact
- type: boolean
- default: false
-- name: runId
- displayName: Specific Artifact's RunId
- type: number
- default: 0
-
-stages:
-# Separate stage for building CPU vs NNAPI as we only want CodeQL to run on one of them so we don't get duplicate
-# issues for code that is built in both. We pick NNAPI as that includes the NNAPI EP code.
-- stage: BUILD_AND_TEST_CPU
- dependsOn: []
- variables:
- Codeql.Enabled: false
- ANDROID_AVD_HOME: $(Agent.TempDirectory)
- jobs:
- - job: BUILD_AND_TEST_CPU
- pool: onnxruntime-Ubuntu2204-AMD-CPU
- workspace:
- clean: all
- timeoutInMinutes: 30
- steps:
- - task: JavaToolInstaller@0
- displayName: Use jdk 17
- inputs:
- versionSpec: '17'
- jdkArchitectureOption: 'x64'
- jdkSourceOption: 'PreInstalled'
-
- - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build
- displayName: Install coreutils and ninja
-
- - template: templates/use-android-ndk.yml
- - template: templates/use-android-emulator.yml
- parameters:
- create: true
- start: true
- - script: |
- env | grep ANDROID
- displayName: View Android ENVs
- - script: |
- python3 tools/ci_build/build.py \
- --enable_lto \
- --android \
- --build_dir build \
- --android_sdk_path $ANDROID_HOME \
- --android_ndk_path $ANDROID_NDK_HOME \
- --android_abi=x86_64 \
- --android_api=30 \
- --skip_submodule_sync \
- --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache \
- --cmake_generator=Ninja \
- --build_java
- displayName: CPU EP, Build and Test
- - template: templates/use-android-emulator.yml
- parameters:
- stop: true
-
- - template: templates/clean-agent-build-directory-step.yml
-
-- stage: BUILD_AND_TEST_NNAPI_EP
- dependsOn: []
- condition: notIn(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')
- variables:
- ANDROID_AVD_HOME: $(Agent.TempDirectory)
- Codeql.ProjectConfigPath: .github/workflows
- Codeql.Enabled: true
- Codeql.Language: cpp
- ${{ if variables['Codeql.Enabled'] }}:
- JobsTimeout: 120
- ${{ else }}:
- JobsTimeout: 60
- jobs:
- - job: BUILD_AND_TEST_NNAPI_EP
- pool: onnxruntime-Ubuntu2204-AMD-CPU
- timeoutInMinutes: ${{ variables.JobsTimeout }}
- workspace:
- clean: all
- steps:
- - task: JavaToolInstaller@0
- displayName: Use jdk 17
- inputs:
- versionSpec: '17'
- jdkArchitectureOption: 'x64'
- jdkSourceOption: 'PreInstalled'
-
- - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build
- displayName: Install coreutils and ninja
- - template: templates/use-android-emulator.yml
- parameters:
- create: true
- start: true
-
- - script: |
- env | grep ANDROID
- displayName: View Android ENVs
-
- - script: |
- python3 tools/ci_build/build.py \
- --enable_lto \
- --android \
- --build_dir build_nnapi \
- --android_sdk_path $ANDROID_HOME \
- --android_ndk_path $ANDROID_NDK_HOME \
- --android_abi=x86_64 \
- --android_api=29 \
- --skip_submodule_sync \
- --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache \
- --use_nnapi \
- --build_shared_lib \
- --cmake_generator=Ninja \
- --build_java
- displayName: NNAPI EP, Build, Test on Android Emulator
-
- - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd)
- # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator
- displayName: Build Minimal ORT with NNAPI and run tests
-
- - template: templates/use-android-emulator.yml
- parameters:
- stop: true
-
- - template: templates/clean-agent-build-directory-step.yml
-
-- stage: MAIN_BUILD_STAGE
- # The below jobs only run on build of main branch.
- # because coverage report is hard to support in cross machines.
- displayName: NNAPI MAIN BUILD&TEST
- dependsOn: []
- condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')
- variables:
- ANDROID_AVD_HOME: $(Agent.TempDirectory)
- jobs:
- - job: NNAPI_EP_MASTER
- pool: onnxruntime-Ubuntu2204-AMD-CPU
- timeoutInMinutes: 180
- workspace:
- clean: all
- condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')
- steps:
- - task: JavaToolInstaller@0
- displayName: Use jdk 17
- inputs:
- versionSpec: '17'
- jdkArchitectureOption: 'x64'
- jdkSourceOption: 'PreInstalled'
-
- - template: templates/use-android-ndk.yml
-
- - template: templates/use-android-emulator.yml
- parameters:
- create: true
- start: true
-
- - script: |
- python3 tools/ci_build/build.py \
- --enable_lto \
- --android \
- --build_dir build_nnapi \
- --android_sdk_path $ANDROID_HOME \
- --android_ndk_path $ANDROID_NDK_HOME \
- --android_abi=x86_64 \
- --android_api=29 \
- --skip_submodule_sync \
- --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache \
- --use_nnapi \
- --build_shared_lib \
- --cmake_generator=Ninja \
- --build_java \
- --code_coverage
- displayName: NNAPI EP, Build, Test, CodeCoverage on Android Emulator
-
- # We need to use llvm-cov from the NDK.
- - script: |
- export GCOV="$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-cov gcov"
- python3 -m pip install gcovr
- python3 tools/ci_build/coverage.py --build_dir build_nnapi --android_sdk_path $ANDROID_HOME
- displayName: Retrieve runtime code coverage files from the emulator and analyze
-
- - script: cat '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt'
- displayName: Print coverage report
-
- # - task: AzureCLI@2
- # displayName: 'Post Android Code Coverage To DashBoard'
- # inputs:
- # azureSubscription: AIInfraBuild
- # scriptType: bash
- # scriptPath: $(Build.SourcesDirectory)/tools/ci_build/github/linux/upload_code_coverage_data.sh
- # arguments: '"$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt" "https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=$(Build.BuildId)" arm android nnapi'
- # workingDirectory: '$(Build.BinariesDirectory)'
-
- - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd)
- # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator
- displayName: Build Minimal ORT with NNAPI and run tests
-
- - template: templates/use-android-emulator.yml
- parameters:
- stop: true
-
- - template: templates/clean-agent-build-directory-step.yml
diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml
index 23c968f35a27f..70d8e954808a5 100644
--- a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml
@@ -56,7 +56,7 @@ extends:
# Update the pool with your team's 1ES hosted pool.
pool:
name: "Azure Pipelines"
- image: "macOS-13"
+ image: "macOS-14"
os: macOS
sdl:
sourceAnalysisPool:
diff --git a/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml b/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml
index 4518a168879a2..a2a0e3bcace9f 100644
--- a/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml
+++ b/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml
@@ -11,7 +11,7 @@ stages:
clean: all
timeoutInMinutes: 120
pool:
- vmImage: 'macOS-13'
+ vmImage: 'macOS-14'
variables:
- name: OnnxRuntimeBuildDirectory
diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml
index 07d21333270a8..a6e38d0bc93f3 100644
--- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml
+++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml
@@ -11,7 +11,7 @@ stages:
workspace:
clean: all
pool:
- vmImage: 'macOS-13'
+ vmImage: 'macOS-14'
variables:
- name: OnnxRuntimeBuildDirectory
diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
index 6b421184c490e..78c07c28d6f4e 100644
--- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
+++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
@@ -427,7 +427,7 @@ stages:
- job: IosDynamicFramework
timeoutInMinutes: 120
pool:
- vmImage: "macOS-13"
+ vmImage: "macOS-14"
steps:
- task: UsePythonVersion@0
@@ -463,7 +463,7 @@ stages:
- job: IosMinimalTrainingBuild
timeoutInMinutes: 120
pool:
- vmImage: "macOS-13"
+ vmImage: "macOS-14"
steps:
- task: UsePythonVersion@0
diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
index 7a78c6ba0fcdf..01c1366107292 100644
--- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
@@ -29,7 +29,7 @@ stages:
parameters:
job_name: Test_MAC_Wheels
machine_pool:
- vmImage: 'macOS-13'
+ vmImage: 'macOS-14'
itemPattern: '*/*mac*x86_64.whl'
arch: 'x86_64'
- template: templates/py-package-smoking-test.yml
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml
index 5e783607e3622..d64ee07aa2131 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml
@@ -201,7 +201,7 @@ stages:
clean: all
pool:
name: "Azure Pipelines"
- image: "macOS-13"
+ image: "macOS-14"
os: macOS
variables:
MACOSX_DEPLOYMENT_TARGET: '13.3'
diff --git a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml
index 3886ceb1ed58f..366ee3fcf4e92 100644
--- a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml
@@ -24,6 +24,7 @@ jobs:
-  pool: 'onnxruntime-Ubuntu2204-AMD-CPU'
+  pool:
+    vmImage: 'macOS-14'
   workspace:
     clean: all
variables:
runCodesignValidationInjection: false
ANDROID_AVD_HOME: $(Agent.TempDirectory)
diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
index c4559d4e0b918..7a46bdc7cde0a 100644
--- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
@@ -117,7 +117,7 @@ stages:
workspace:
clean: all
pool:
- vmImage: 'macOS-13'
+ vmImage: 'macOS-14'
timeoutInMinutes: 300
steps:
- template: set-version-number-variables-step.yml
@@ -788,7 +788,7 @@ stages:
- template: ../nuget/templates/test_macos.yml
parameters:
- AgentPool : macOS-13
+ AgentPool : macOS-14
ArtifactSuffix: 'CPU'
- template: ../nodejs/templates/test_win.yml
@@ -824,4 +824,4 @@ stages:
OS: MacOS
BuildId: ${{ parameters.BuildId }}
SpecificArtifact: ${{ parameters.SpecificArtifact }}
- PoolName: 'macOS-13'
+ PoolName: 'macOS-14'
diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml
index ab31e592d7d71..8c725c1d6b9a0 100644
--- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml
@@ -68,7 +68,7 @@ stages:
jobs:
- job: MacOS_C_API_Package_Publish
pool:
- vmImage: 'macOS-13'
+ vmImage: 'macOS-14'
steps:
- checkout: none
- template: flex-downloadPipelineArtifact.yml
diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml
index 32908753f2909..c7ae7bb3a0026 100644
--- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml
@@ -37,7 +37,7 @@ jobs:
PROTO_CACHE_DIR: $(Pipeline.Workspace)/ccache_proto
ORT_CACHE_DIR: $(Pipeline.Workspace)/ccache_ort
pool:
- vmImage: 'macOS-13'
+ vmImage: 'macOS-14'
timeoutInMinutes: 300
steps:
- checkout: self
diff --git a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml
index 58ebdd52998ea..c1309d345d819 100644
--- a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml
@@ -64,11 +64,11 @@ stages:
- job: ReactNative_CI_iOS
${{ if eq(parameters.is1ES, false) }}:
pool:
- vmImage: 'macOS-13'
+ vmImage: 'macOS-14'
${{ if eq(parameters.is1ES, true) }}:
pool:
name: 'Azure Pipelines'
- image: 'macOS-13'
+ image: 'macOS-14'
os: 'macOS'
timeoutInMinutes: 120
@@ -212,7 +212,7 @@ stages:
scheme: 'OnnxruntimeModuleTest'
packageApp: false
destinationPlatformOption: 'iOS'
- destinationSimulators: 'iPhone 14,OS=16.4'
+ destinationSimulators: 'iPhone 15,OS=17.4'
workingDirectory: '$(Build.SourcesDirectory)/js/react_native/ios'
xcprettyArgs: '--output build/reports/test-results.xml'
publishJUnitResults: true
diff --git a/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml b/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml
index 2cf698aefa8bd..3c1bfcd60fedd 100644
--- a/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml
@@ -3,7 +3,7 @@
parameters:
- name: xcodeVersion
type: string
- default: "14.3.1"
+ default: "15.3.0"
steps:
- bash: |
diff --git a/tools/python/util/android/android.py b/tools/python/util/android/android.py
index 8f3ed97cae53f..cd420ca1483c7 100644
--- a/tools/python/util/android/android.py
+++ b/tools/python/util/android/android.py
@@ -46,18 +46,36 @@ def filename(name, windows_extension):
def create_virtual_device(sdk_tool_paths: SdkToolPaths, system_image_package_name: str, avd_name: str):
run(sdk_tool_paths.sdkmanager, "--install", system_image_package_name, input=b"y")
-
- run(
- sdk_tool_paths.avdmanager,
- "create",
- "avd",
- "--name",
- avd_name,
- "--package",
- system_image_package_name,
- "--force",
- input=b"no",
- )
+ android_avd_home = os.environ.get("ANDROID_AVD_HOME")
+
+ if android_avd_home is not None:
+ if not os.path.exists(android_avd_home):
+ os.makedirs(android_avd_home)
+ run(
+ sdk_tool_paths.avdmanager,
+ "create",
+ "avd",
+ "--name",
+ avd_name,
+ "--package",
+ system_image_package_name,
+ "--force",
+ "--path",
+ android_avd_home,
+ input=b"no",
+ )
+ else:
+ run(
+ sdk_tool_paths.avdmanager,
+ "create",
+ "avd",
+ "--name",
+ avd_name,
+ "--package",
+ system_image_package_name,
+ "--force",
+ input=b"no",
+ )
_process_creationflags = subprocess.CREATE_NEW_PROCESS_GROUP if is_windows() else 0
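The new branch repeats the avdmanager invocation for the with-path and without-path cases. A possible alternative, shown here only as a hedged sketch (it reuses the module's existing run() helper and SdkToolPaths and is not part of this change), builds the argument list once and appends --path only when ANDROID_AVD_HOME is set:

# Hypothetical alternative to the branching above, not the actual change.
import os

def create_virtual_device_alt(sdk_tool_paths, system_image_package_name, avd_name):
    run(sdk_tool_paths.sdkmanager, "--install", system_image_package_name, input=b"y")

    args = [
        sdk_tool_paths.avdmanager,
        "create",
        "avd",
        "--name",
        avd_name,
        "--package",
        system_image_package_name,
        "--force",
    ]
    # Use .get() so a missing ANDROID_AVD_HOME falls through to the default location
    # instead of raising KeyError.
    android_avd_home = os.environ.get("ANDROID_AVD_HOME")
    if android_avd_home:
        os.makedirs(android_avd_home, exist_ok=True)
        args += ["--path", android_avd_home]
    run(*args, input=b"no")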