From a46d2127fb0b1a1a75b771f98810c01235f7be06 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Tue, 18 Mar 2025 08:22:40 -0700 Subject: [PATCH 01/21] add bool support to EPContext schema to unblock some models (#24065) ### Description add bool support to EPContext schema to unblock some models --- docs/ContribOperators.md | 2 +- onnxruntime/core/graph/contrib_ops/contrib_defs.cc | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index b64641230f249..f582abca34706 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -1625,7 +1625,7 @@ This version of the operator has been available since version 1 of the 'com.micr #### Type Constraints
 <dl>
-<dt><tt>T</tt> : tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)</dt>
+<dt><tt>T</tt> : tensor(bool), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(float16), tensor(float), tensor(double), tensor(bfloat16)</dt>
 <dd>Constrain input and output types.</dd>
 </dl>
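A minimal sketch (illustrative, not part of this patch) of the kind of model this change unblocks: an `EPContext` node whose input/output type is `tensor(bool)`, which the previous `T` constraint rejected. The attribute values below are placeholders, not real cached-context data.

```python
# Sketch: build an EPContext node with bool input/output. The cached-context
# attributes are placeholders; real values come from the compiling EP.
import onnx
from onnx import TensorProto, helper

node = helper.make_node(
    "EPContext",
    inputs=["x"],
    outputs=["y"],
    domain="com.microsoft",
    embed_mode=1,                        # 1: context blob embedded in the model
    ep_cache_context="<compiled blob>",  # placeholder payload
    source="QNN",                        # placeholder EP name
)
graph = helper.make_graph(
    [node], "epcontext_bool",
    [helper.make_tensor_value_info("x", TensorProto.BOOL, [1, 4])],
    [helper.make_tensor_value_info("y", TensorProto.BOOL, [1, 4])],
)
model = helper.make_model(
    graph,
    opset_imports=[helper.make_opsetid("", 17), helper.make_opsetid("com.microsoft", 1)],
)
onnx.save(model, "epcontext_bool.onnx")
```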
diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index e45787299f3ad..7b4a45ce8aa0f 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -3361,7 +3361,8 @@ void RegisterContribSchemas() { OpSchema::NonDifferentiable) .TypeConstraint( "T", - {"tensor(int8)", + {"tensor(bool)", + "tensor(int8)", "tensor(int16)", "tensor(int32)", "tensor(int64)", From b3aa5a3c21d3a5fd0e22e6ae0907086ddf582c6f Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Tue, 18 Mar 2025 14:50:28 -0700 Subject: [PATCH 02/21] [WebGPU EP] fix for reduce min/max error on MacOS CI (#24077) ### Error ```Traceback /onnxruntime/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc:146 [allow_multi_axes = true] Axes values must be in the range [-rank, rank-1]. Got: 446098880 ``` --- .../webgpu/reduction/reduction_ops.cc | 99 ++++++++++--------- .../onnx_backend_test_series_filters.jsonc | 4 +- 2 files changed, 57 insertions(+), 46 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc index 1a56cafdb3952..a0213f63494d3 100644 --- a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc @@ -11,7 +11,7 @@ namespace onnxruntime { namespace webgpu { -#define REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceOp, begin, end) \ +#define REGISTER_REDUCE_VERSIONED_KERNEL(ReduceOp, begin, end) \ ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ ReduceOp, \ kOnnxDomain, \ @@ -20,7 +20,16 @@ namespace webgpu { (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()), \ ReduceOp); -#define REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceOp, version) \ +#define REGISTER_REDUCE_VERSIONED_KERNEL_WITH_AXIS_IN_INPUT(ReduceOp, begin, end) \ + ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ + ReduceOp, \ + kOnnxDomain, \ + begin, end, \ + kWebGpuExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()).InputMemoryType(OrtMemTypeCPUInput, 1), \ + ReduceOp); + +#define REGISTER_REDUCE_KERNEL(ReduceOp, version) \ ONNX_OPERATOR_KERNEL_EX( \ ReduceOp, \ kOnnxDomain, \ @@ -29,58 +38,58 @@ namespace webgpu { (*KernelDefBuilder::Create()).TypeConstraint("T", WebGpuSupportedNumberTypes()).InputMemoryType(OrtMemTypeCPUInput, 1), \ ReduceOp); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 11, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMean, 13, 17); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMean, 18); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMean, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMean, 11, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMean, 13, 17); +REGISTER_REDUCE_KERNEL(ReduceMean, 18); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 11, 11); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 12, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 13, 17); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMax, 18, 19); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMax, 20); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 11, 11); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 12, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMax, 13, 17); 
+REGISTER_REDUCE_VERSIONED_KERNEL_WITH_AXIS_IN_INPUT(ReduceMax, 18, 19); +REGISTER_REDUCE_KERNEL(ReduceMax, 20); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 11, 11); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 12, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 13, 17); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceMin, 18, 19); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceMin, 20); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 11, 11); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 12, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceMin, 13, 17); +REGISTER_REDUCE_VERSIONED_KERNEL_WITH_AXIS_IN_INPUT(ReduceMin, 18, 19); +REGISTER_REDUCE_KERNEL(ReduceMin, 20); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSum, 11, 12); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSum, 13); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSum, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSum, 11, 12); +REGISTER_REDUCE_KERNEL(ReduceSum, 13); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceProd, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceProd, 11, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceProd, 13, 17); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceProd, 18); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceProd, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceProd, 11, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceProd, 13, 17); +REGISTER_REDUCE_KERNEL(ReduceProd, 18); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL1, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL1, 11, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL1, 13, 17); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceL1, 18); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL1, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL1, 11, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL1, 13, 17); +REGISTER_REDUCE_KERNEL(ReduceL1, 18); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL2, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL2, 11, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceL2, 13, 17); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceL2, 18); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL2, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL2, 11, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceL2, 13, 17); +REGISTER_REDUCE_KERNEL(ReduceL2, 18); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSum, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSum, 11, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSum, 13, 17); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceLogSum, 18); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSum, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSum, 11, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSum, 13, 17); +REGISTER_REDUCE_KERNEL(ReduceLogSum, 18); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSumSquare, 1, 10); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSumSquare, 11, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceSumSquare, 13, 17); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSumSquare, 18); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSumSquare, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSumSquare, 11, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceSumSquare, 13, 17); +REGISTER_REDUCE_KERNEL(ReduceSumSquare, 18); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSumExp, 1, 10); 
-REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSumExp, 11, 12); -REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ReduceLogSumExp, 13, 17); -REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceLogSumExp, 18); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSumExp, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSumExp, 11, 12); +REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSumExp, 13, 17); +REGISTER_REDUCE_KERNEL(ReduceLogSumExp, 18); Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index e261d66a0d22a..d62ffe644e4cc 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -694,7 +694,9 @@ "^test_gelu_tanh_2_expanded_cpu", "^test_dynamicquantizelinear_expanded_cpu", "^test_center_crop_pad_crop_negative_axes_hwc*", // failed due to new types or shape infer with negative axis for CenterCropPad. - "^test_center_crop_pad_crop_negative_axes_hwc_expanded*" // failed due to new types or shape infer with negative axis for CenterCropPad. + "^test_center_crop_pad_crop_negative_axes_hwc_expanded*", // failed due to new types or shape infer with negative axis for CenterCropPad. + "^test_reduce_max_empty_set", + "^test_reduce_min_empty_set" ], "current_failing_tests_pure_DML": [ "^test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_cpu", From e495750a3413daaa9d98aa8639c61ad28a8a3516 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 18 Mar 2025 18:24:34 -0400 Subject: [PATCH 03/21] Upgrade current MacOS-13 to 14 (#23293) ### Description Upgrade current MacOS-13 to 14 ### Motivation and Context - [x] Update the RN to 0.73.x+ to have the newer version of boost --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- cmake/CMakePresets.json | 23 +++++++++++++++++++ js/react_native/e2e/.detoxrc.js | 4 ++-- .../coreml/builders/impl/base_op_builder.cc | 3 +++ .../core/providers/coreml/model/host_utils.h | 8 ++++++- .../core/providers/coreml/model/model.mm | 3 +-- .../mac-ios-packaging-pipeline.yml | 2 +- .../nodejs/templates/test_macos.yml | 2 +- .../nuget/templates/test_macos.yml | 2 +- .../azure-pipelines/post-merge-jobs.yml | 4 ++-- .../py-package-test-pipeline.yml | 2 +- .../stages/py-cpu-packaging-stage.yml | 2 +- .../templates/android-java-api-aar-test.yml | 2 ++ .../azure-pipelines/templates/c-api-cpu.yml | 6 ++--- .../templates/mac-cpu-packaging-pipeline.yml | 2 +- .../templates/mac-cpu-packing-jobs.yml | 2 +- .../templates/react-native-ci.yml | 6 ++--- .../templates/use-xcode-version.yml | 2 +- 17 files changed, 54 insertions(+), 21 deletions(-) diff --git a/cmake/CMakePresets.json b/cmake/CMakePresets.json index 4987edaf85513..8d63912f6eaee 100644 --- a/cmake/CMakePresets.json +++ b/cmake/CMakePresets.json @@ -109,6 +109,29 @@ "rhs": "Darwin" } }, + { + "name": "arm64-osx", + "inherits": [ + "unit-test" + ], + "generator": "Xcode", + "binaryDir": "${sourceParentDir}/cmake_build/arm64-osx", + "installDir": "${sourceParentDir}/cmake_build/out", + "cacheVariables": { + "CMAKE_OSX_ARCHITECTURES": "arm64", + "onnxruntime_BUILD_SHARED_LIB": true, + "onnxruntime_USE_XNNPACK": false, + "onnxruntime_USE_COREML": true, + 
"onnxruntime_BUILD_OBJC": true, + "onnxruntime_BUILD_APPLE_FRAMEWORK": true, + "CMAKE_CONFIGURATION_TYPES": "Debug;Release" + }, + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Darwin" + } + }, { "name": "x64-osx-vcpkg", "inherits": [ diff --git a/js/react_native/e2e/.detoxrc.js b/js/react_native/e2e/.detoxrc.js index e886a363d378b..1d49f06213e51 100644 --- a/js/react_native/e2e/.detoxrc.js +++ b/js/react_native/e2e/.detoxrc.js @@ -38,8 +38,8 @@ module.exports = { simulator: { type: 'ios.simulator', device: { - type: 'iPhone 14', - os: 'iOS 16.4', + type: 'iPhone 15', + os: 'iOS 17.4', }, }, attached: { diff --git a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc index 2817f34bc64f2..9e7fcd788664c 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/base_op_builder.cc @@ -6,6 +6,7 @@ #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" #include "core/providers/coreml/builders/model_builder.h" +#include "core/providers/coreml/model/host_utils.h" #include "core/providers/shared/utils/utils.h" using namespace CoreML::Specification; @@ -113,10 +114,12 @@ bool BaseOpBuilder::IsInputDtypeSupport(const Node& node, size_t idx, return true; } +#if CAN_BUILD_COREML6_OR_LATER // only MLProgram support FP16 if (input_params.create_mlprogram && input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) { return true; } +#endif LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not currently supported"; return false; diff --git a/onnxruntime/core/providers/coreml/model/host_utils.h b/onnxruntime/core/providers/coreml/model/host_utils.h index 145c64e5320d3..31ee2bd3e2494 100644 --- a/onnxruntime/core/providers/coreml/model/host_utils.h +++ b/onnxruntime/core/providers/coreml/model/host_utils.h @@ -43,7 +43,13 @@ #define API_AVAILABLE_COREML7 API_AVAILABLE(macos(14), ios(17)) #define API_AVAILABLE_COREML8 API_AVAILABLE(macos(15), ios(18)) -// @available is used in implementation code +// The previous macros are used in header files to declare the availability of the APIs. +// The following macros are used in build time checks to determine if the APIs are available. +#define CAN_BUILD_COREML8_OR_LATER (__MAC_OS_X_VERSION_MAX_ALLOWED >= 150000 && __IPHONE_OS_VERSION_MAX_ALLOWED >= 180000) +#define CAN_BUILD_COREML7_OR_LATER (__MAC_OS_X_VERSION_MAX_ALLOWED >= 140000 && __IPHONE_OS_VERSION_MAX_ALLOWED >= 170000) +#define CAN_BUILD_COREML6_OR_LATER (__MAC_OS_X_VERSION_MAX_ALLOWED >= 130000 && __IPHONE_OS_VERSION_MAX_ALLOWED >= 160000) + +// @available is used in implementation code to check the availability of the APIs at runtime. 
// Base required OS to run CoreML Specification Version 4 (Core ML 3) #define HAS_COREML3_OR_LATER @available(macOS 10.15, iOS 13, *) #define HAS_COREML4_OR_LATER @available(macOS 11, iOS 14, *) diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm index 5211b89ec17c6..71664021ea2fb 100644 --- a/onnxruntime/core/providers/coreml/model/model.mm +++ b/onnxruntime/core/providers/coreml/model/model.mm @@ -363,13 +363,12 @@ void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) { #endif } -#if __has_include() +#if __has_include() && CAN_BUILD_COREML8_OR_LATER #define HAS_COREMLOPTIMIZATIONHINT 1 #else #define HAS_COREMLOPTIMIZATIONHINT 0 #endif -API_AVAILABLE_COREML8 void ConfigureOptimizationHints(MLModelConfiguration* config, const CoreMLOptions& coreml_options) { #if HAS_COREMLOPTIMIZATIONHINT MLOptimizationHints* optimizationHints = [[MLOptimizationHints alloc] init]; diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml index 23c968f35a27f..70d8e954808a5 100644 --- a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml @@ -56,7 +56,7 @@ extends: # Update the pool with your team's 1ES hosted pool. pool: name: "Azure Pipelines" - image: "macOS-13" + image: "macOS-14" os: macOS sdl: sourceAnalysisPool: diff --git a/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml b/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml index 4518a168879a2..a2a0e3bcace9f 100644 --- a/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml +++ b/tools/ci_build/github/azure-pipelines/nodejs/templates/test_macos.yml @@ -11,7 +11,7 @@ stages: clean: all timeoutInMinutes: 120 pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' variables: - name: OnnxRuntimeBuildDirectory diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml index 07d21333270a8..a6e38d0bc93f3 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_macos.yml @@ -11,7 +11,7 @@ stages: workspace: clean: all pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' variables: - name: OnnxRuntimeBuildDirectory diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index 6b421184c490e..78c07c28d6f4e 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -427,7 +427,7 @@ stages: - job: IosDynamicFramework timeoutInMinutes: 120 pool: - vmImage: "macOS-13" + vmImage: "macOS-14" steps: - task: UsePythonVersion@0 @@ -463,7 +463,7 @@ stages: - job: IosMinimalTrainingBuild timeoutInMinutes: 120 pool: - vmImage: "macOS-13" + vmImage: "macOS-14" steps: - task: UsePythonVersion@0 diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml index 7a78c6ba0fcdf..01c1366107292 100644 --- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml @@ -29,7 +29,7 @@ stages: parameters: job_name: Test_MAC_Wheels machine_pool: - vmImage: 'macOS-13' + vmImage: 
'macOS-14' itemPattern: '*/*mac*x86_64.whl' arch: 'x86_64' - template: templates/py-package-smoking-test.yml diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml index 5e783607e3622..d64ee07aa2131 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml @@ -201,7 +201,7 @@ stages: clean: all pool: name: "Azure Pipelines" - image: "macOS-13" + image: "macOS-14" os: macOS variables: MACOSX_DEPLOYMENT_TARGET: '13.3' diff --git a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml index 3886ceb1ed58f..366ee3fcf4e92 100644 --- a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml +++ b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml @@ -24,6 +24,8 @@ jobs: pool: 'onnxruntime-Ubuntu2204-AMD-CPU' workspace: clean: all + pool: + vmImage: 'macOS-14' variables: runCodesignValidationInjection: false ANDROID_AVD_HOME: $(Agent.TempDirectory) diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index c4559d4e0b918..7a46bdc7cde0a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -117,7 +117,7 @@ stages: workspace: clean: all pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' timeoutInMinutes: 300 steps: - template: set-version-number-variables-step.yml @@ -788,7 +788,7 @@ stages: - template: ../nuget/templates/test_macos.yml parameters: - AgentPool : macOS-13 + AgentPool : macOS-14 ArtifactSuffix: 'CPU' - template: ../nodejs/templates/test_win.yml @@ -824,4 +824,4 @@ stages: OS: MacOS BuildId: ${{ parameters.BuildId }} SpecificArtifact: ${{ parameters.SpecificArtifact }} - PoolName: 'macOS-13' + PoolName: 'macOS-14' diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml index ab31e592d7d71..8c725c1d6b9a0 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml @@ -68,7 +68,7 @@ stages: jobs: - job: MacOS_C_API_Package_Publish pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' steps: - checkout: none - template: flex-downloadPipelineArtifact.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml index 32908753f2909..c7ae7bb3a0026 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml @@ -37,7 +37,7 @@ jobs: PROTO_CACHE_DIR: $(Pipeline.Workspace)/ccache_proto ORT_CACHE_DIR: $(Pipeline.Workspace)/ccache_ort pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' timeoutInMinutes: 300 steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml index 58ebdd52998ea..c1309d345d819 100644 --- a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml +++ 
b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml @@ -64,11 +64,11 @@ stages: - job: ReactNative_CI_iOS ${{ if eq(parameters.is1ES, false) }}: pool: - vmImage: 'macOS-13' + vmImage: 'macOS-14' ${{ if eq(parameters.is1ES, true) }}: pool: name: 'Azure Pipelines' - image: 'macOS-13' + image: 'macOS-14' os: 'macOS' timeoutInMinutes: 120 @@ -212,7 +212,7 @@ stages: scheme: 'OnnxruntimeModuleTest' packageApp: false destinationPlatformOption: 'iOS' - destinationSimulators: 'iPhone 14,OS=16.4' + destinationSimulators: 'iPhone 15,OS=17.4' workingDirectory: '$(Build.SourcesDirectory)/js/react_native/ios' xcprettyArgs: '--output build/reports/test-results.xml' publishJUnitResults: true diff --git a/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml b/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml index 2cf698aefa8bd..3c1bfcd60fedd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml +++ b/tools/ci_build/github/azure-pipelines/templates/use-xcode-version.yml @@ -3,7 +3,7 @@ parameters: - name: xcodeVersion type: string - default: "14.3.1" + default: "15.3.0" steps: - bash: | From c6a267543a4e8dce5bab0168e3ac2d4acc98a352 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 19 Mar 2025 08:50:28 +1000 Subject: [PATCH 04/21] Fix CUDA EP Abs and Sign bfloat16 support (#23914) ### Description Abs and Sign had bfloat16 kernels created but not registered with the CUDA EP. Additionally Sign bfloat16 didn't work. * register bfloat16 kernels with CUDA EP * fix incorrectly named macro by adding 'X' as they add bfloat16 registration * add specialization for bfloat16 to _Sign * copied existing pattern. not sure if there's a better way * update tests ### Motivation and Context #23875 --- docs/OperatorKernels.md | 4 ++-- .../core/providers/cuda/cu_inc/common.cuh | 3 +++ .../providers/cuda/cuda_execution_provider.cc | 4 ++++ .../cuda/math/unary_elementwise_ops.cc | 20 ++++++++-------- .../cpu/math/element_wise_ops_test.cc | 23 ++++++++++++++++--- .../test/providers/cpu/math/sign_test.cc | 10 ++++++-- 6 files changed, 47 insertions(+), 17 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 91c68a1f5afb6..8d256a2088279 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -582,7 +582,7 @@ Do not modify directly.* | Op Name | Parameters | OpSet Version | Types Supported | |---------|------------|---------------|-----------------| |**Operator Domain:** *ai.onnx*|||| -|Abs|*in* X:**T**
<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Abs|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |Add|*in* A:**T**<br> *in* B:**T**<br> *out* C:**T**|14+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |||13|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
@@ -839,7 +839,7 @@ Do not modify directly.*
 |Shrink|*in* input:**T**<br> *out* output:**T**|9+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |Sigmoid|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
 |||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
-|Sign|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Sign|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |SimplifiedLayerNormalization|*in* X:**T**<br> *in* scale:**V**<br> *out* Y:**V**<br> *out* inv_std_var:**U**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)<br> **U** = tensor(double), tensor(float)<br> **V** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
 |Sin|*in* input:**T**<br> *out* output:**T**|7+|**T** = tensor(double), tensor(float), tensor(float16)|
 |Size|*in* data:**T**<br> *out* size:**T1**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br>
**T1** = tensor(int64)| diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh index 55935a9eae86d..2d2551a156099 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh @@ -441,6 +441,9 @@ __device__ __inline__ T _Sign(T a) { return _Signum(a, std::is_signed()); } template <> __device__ __inline__ half _Sign(half a) { return _Signum(a, std::true_type()); } +template <> +__device__ __inline__ BFloat16 _Sign(BFloat16 a) { return _Signum(static_cast(a), std::true_type()); } + template __device__ __inline__ T _Normcdf(T a); diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 54fb4429c0536..886fddd8f8a27 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -1013,6 +1013,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, Abs); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, Abs); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, Abs); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, BFloat16, Abs); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int8_t, Neg); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int16_t, Neg); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int32_t, Neg); @@ -1188,6 +1189,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, Sign); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, Sign); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, Sign); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, BFloat16, Sign); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 13, BFloat16, Add); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 13, BFloat16, Sub); @@ -1996,6 +1998,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2169,6 +2172,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc index fb03b4326c4e8..86a1b0f5b6102 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc @@ -213,19 +213,19 @@ Status IsNaN::ComputeInternal(OpKernelContext* context) const { UNARY_OP_TYPED(name, ver, float) \ UNARY_OP_TYPED(name, ver, double) -#define UNARY_OP_CSILHFD(name, ver) \ +#define UNARY_OP_CSILHFDX(name, ver) \ UNARY_OP_TYPED(name, 
ver, int8_t) \ UNARY_OP_TYPED(name, ver, int16_t) \ UNARY_OP_TYPED(name, ver, int32_t) \ UNARY_OP_TYPED(name, ver, int64_t) \ UNARY_OP_HFDX(name, ver) -#define UNARY_OP_BWUZCSILHFD(name, ver) \ - UNARY_OP_TYPED(name, ver, uint8_t) \ - UNARY_OP_TYPED(name, ver, uint16_t) \ - UNARY_OP_TYPED(name, ver, uint32_t) \ - UNARY_OP_TYPED(name, ver, uint64_t) \ - UNARY_OP_CSILHFD(name, ver) +#define UNARY_OP_BWUZCSILHFDX(name, ver) \ + UNARY_OP_TYPED(name, ver, uint8_t) \ + UNARY_OP_TYPED(name, ver, uint16_t) \ + UNARY_OP_TYPED(name, ver, uint32_t) \ + UNARY_OP_TYPED(name, ver, uint64_t) \ + UNARY_OP_CSILHFDX(name, ver) UNARY_OP_VERSIONED_BWUZCSILHFD(Abs, 6, 12) UNARY_OP_VERSIONED_CSILHFD(Neg, 6, 12) @@ -237,8 +237,8 @@ UNARY_OP_VERSIONED_HFD(Log, 6, 12) UNARY_OP_VERSIONED_HFD(Exp, 6, 12) UNARY_OP_VERSIONED_HFD(Erf, 9, 12) -UNARY_OP_BWUZCSILHFD(Abs, 13) -UNARY_OP_CSILHFD(Neg, 13) +UNARY_OP_BWUZCSILHFDX(Abs, 13) +UNARY_OP_CSILHFDX(Neg, 13) UNARY_OP_HFD(Floor, 13) UNARY_OP_HFD(Ceil, 13) UNARY_OP_HFD(Reciprocal, 13) @@ -246,7 +246,7 @@ UNARY_OP_HFDX(Sqrt, 13) UNARY_OP_HFD(Log, 13) UNARY_OP_HFDX(Exp, 13) UNARY_OP_HFDX(Erf, 13) -UNARY_OP_BWUZCSILHFD(Sign, 13) +UNARY_OP_BWUZCSILHFDX(Sign, 13) UNARY_LOGICALOP_NOT_TYPED(1, bool) UNARY_OP_HFD(Round, 11) diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index d87ee861752c7..30e7f63919216 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -968,8 +968,15 @@ TEST(MathOpTest, Abs) { test.Run(); } -#ifdef USE_DNNL +#if defined(USE_CUDA) || defined(USE_DNNL) TEST(MathOpTest, Abs_bfloat16) { +#ifdef USE_CUDA + int min_cuda_architecture = 530; + if (!HasCudaEnvironment(min_cuda_architecture)) { + LOGS_DEFAULT(WARNING) << "Hardware does NOT support BF16"; + return; + } +#endif #ifdef USE_DNNL if (!DnnlHasBF16Support()) { LOGS_DEFAULT(WARNING) << "Hardware does NOT support BF16"; @@ -980,9 +987,19 @@ TEST(MathOpTest, Abs_bfloat16) { std::vector dims{2, 2}; test_bf16.AddInput("X", dims, MakeBFloat16({1.0f, -2.0f, -0.0f, -10.0f})); test_bf16.AddOutput("Y", dims, MakeBFloat16({1.0f, 2.0f, 0.0f, 10.0f})); - test_bf16.Run(); + + std::vector> execution_providers; +#if defined(USE_CUDA) + execution_providers.push_back(DefaultCudaExecutionProvider()); +#endif + +#if defined(USE_DNNL) + execution_providers.push_back(DefaultDnnlExecutionProvider()); +#endif + + test_bf16.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } -#endif // USE_DNNL +#endif // USE_CUDA || USE_DNNL TEST(MathOpTest, Abs_int8) { OpTester test("Abs"); diff --git a/onnxruntime/test/providers/cpu/math/sign_test.cc b/onnxruntime/test/providers/cpu/math/sign_test.cc index a01c2b26ea8b5..0da6a2ed55f2c 100644 --- a/onnxruntime/test/providers/cpu/math/sign_test.cc +++ b/onnxruntime/test/providers/cpu/math/sign_test.cc @@ -207,7 +207,7 @@ TEST(MathOpTest, Sign_MLFloat16) { // test.Run(OpTester::ExpectResult::kExpectSuccess); //} -#if defined(USE_DNNL) +#if defined(USE_CUDA) || defined(USE_DNNL) TEST(MathOpTest, Sign_bfloat16) { #ifdef USE_DNNL if (!DnnlHasBF16Support()) { @@ -228,9 +228,15 @@ TEST(MathOpTest, Sign_bfloat16) { TestImpl(input.cbegin(), input.cend(), std::back_inserter(output)); test.AddOutput("output", input_dims, output); std::vector> execution_providers; + +#if defined(USE_CUDA) + execution_providers.push_back(DefaultCudaExecutionProvider()); +#endif + #if defined(USE_DNNL) 
execution_providers.push_back(DefaultDnnlExecutionProvider()); -#endif // USE_DNNL +#endif + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } #endif From 12fea57253942991cf4f3de15f505cd03eaa9f2f Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Tue, 18 Mar 2025 16:04:13 -0700 Subject: [PATCH 05/21] Improve typing for OrtValue and other public Python interfaces (#24086) ### Description Improve the OrtValue interface typing and changed `staticmethod` to `classmethod` for constructors to follow python conventions (https://google.github.io/styleguide/pyguide.html#2174-decision). --- .../onnxruntime_inference_collection.py | 140 ++++++++++-------- 1 file changed, 78 insertions(+), 62 deletions(-) diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py index 6b5f7526cc506..785eb9c485d25 100644 --- a/onnxruntime/python/onnxruntime_inference_collection.py +++ b/onnxruntime/python/onnxruntime_inference_collection.py @@ -15,6 +15,9 @@ from onnxruntime.capi import _pybind_state as C if typing.TYPE_CHECKING: + import numpy as np + import numpy.typing as npt + import onnxruntime @@ -59,22 +62,22 @@ def export_adapter(self, file_path: os.PathLike): """ self._adapter.export_adapter(file_path) - def get_format_version(self): + def get_format_version(self) -> int: return self._adapter.format_version - def set_adapter_version(self, adapter_version: int): + def set_adapter_version(self, adapter_version: int) -> None: self._adapter.adapter_version = adapter_version - def get_adapter_version(self): + def get_adapter_version(self) -> int: return self._adapter.adapter_version - def set_model_version(self, model_version: int): + def set_model_version(self, model_version: int) -> None: self._adapter.model_version = model_version - def get_model_version(self): + def get_model_version(self) -> int: return self._adapter.model_version - def set_parameters(self, params: dict[str, OrtValue]): + def set_parameters(self, params: dict[str, OrtValue]) -> None: self._adapter.parameters = {k: v._ortvalue for k, v in params.items()} def get_parameters(self) -> dict[str, OrtValue]: @@ -174,27 +177,27 @@ def __init__(self): self._sess = None self._enable_fallback = True - def get_session_options(self): + def get_session_options(self) -> onnxruntime.SessionOptions: "Return the session options. See :class:`onnxruntime.SessionOptions`." return self._sess_options - def get_inputs(self): + def get_inputs(self) -> Sequence[onnxruntime.NodeArg]: "Return the inputs metadata as a list of :class:`onnxruntime.NodeArg`." return self._inputs_meta - def get_outputs(self): + def get_outputs(self) -> Sequence[onnxruntime.NodeArg]: "Return the outputs metadata as a list of :class:`onnxruntime.NodeArg`." return self._outputs_meta - def get_overridable_initializers(self): + def get_overridable_initializers(self) -> Sequence[onnxruntime.NodeArg]: "Return the inputs (including initializers) metadata as a list of :class:`onnxruntime.NodeArg`." return self._overridable_initializers - def get_modelmeta(self): + def get_modelmeta(self) -> onnxruntime.ModelMetadata: "Return the metadata. See :class:`onnxruntime.ModelMetadata`." return self._model_meta - def get_providers(self): + def get_providers(self) -> Sequence[str]: "Return list of registered execution providers." return self._providers @@ -202,7 +205,7 @@ def get_provider_options(self): "Return registered execution providers' configurations." 
return self._provider_options - def set_providers(self, providers=None, provider_options=None): + def set_providers(self, providers=None, provider_options=None) -> None: """ Register the input list of execution providers. The underlying session is re-created. @@ -224,13 +227,13 @@ def set_providers(self, providers=None, provider_options=None): # recreate the underlying C.InferenceSession self._reset_session(providers, provider_options) - def disable_fallback(self): + def disable_fallback(self) -> None: """ Disable session.run() fallback mechanism. """ self._enable_fallback = False - def enable_fallback(self): + def enable_fallback(self) -> None: """ Enable session.Run() fallback mechanism. If session.Run() fails due to an internal Execution Provider failure, reset the Execution Providers enabled for this session. @@ -249,7 +252,7 @@ def _validate_input(self, feed_input_names): f"Required inputs ({missing_input_names}) are missing from input feed ({feed_input_names})." ) - def run(self, output_names, input_feed, run_options=None): + def run(self, output_names, input_feed, run_options=None) -> Sequence[np.ndarray | SparseTensor | list | dict]: """ Compute the predictions. @@ -308,7 +311,7 @@ def callback(results: np.ndarray, user_data: MyData, err: str) -> None: output_names = [output.name for output in self._outputs_meta] return self._sess.run_async(output_names, input_feed, callback, user_data, run_options) - def run_with_ort_values(self, output_names, input_dict_ort_values, run_options=None): + def run_with_ort_values(self, output_names, input_dict_ort_values, run_options=None) -> Sequence[OrtValue]: """ Compute the predictions. @@ -367,7 +370,7 @@ def get_profiling_start_time_ns(self): """ return self._sess.get_profiling_start_time_ns - def io_binding(self): + def io_binding(self) -> IOBinding: "Return an onnxruntime.IOBinding object`." return IOBinding(self) @@ -550,7 +553,7 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi self._provider_options = self._sess.get_provider_options() self._profiling_start_time_ns = self._sess.get_profiling_start_time_ns - def _reset_session(self, providers, provider_options): + def _reset_session(self, providers, provider_options) -> None: "release underlying session object." # meta data references session internal structures # so they must be set to None to decrement _sess reference count. @@ -721,7 +724,7 @@ class OrtValue: This class provides APIs to construct and deal with OrtValues. 
""" - def __init__(self, ortvalue, numpy_obj=None): + def __init__(self, ortvalue: C.OrtValue, numpy_obj: np.ndarray | None = None): if isinstance(ortvalue, C.OrtValue): self._ortvalue = ortvalue # Hold a ref count to the numpy object if the OrtValue is backed directly @@ -733,11 +736,11 @@ def __init__(self, ortvalue, numpy_obj=None): "`Provided ortvalue` needs to be of type `onnxruntime.capi.onnxruntime_pybind11_state.OrtValue`" ) - def _get_c_value(self): + def _get_c_value(self) -> C.OrtValue: return self._ortvalue - @staticmethod - def ortvalue_from_numpy(numpy_obj, device_type="cpu", device_id=0): + @classmethod + def ortvalue_from_numpy(cls, numpy_obj: np.ndarray, /, device_type="cpu", device_id=0) -> OrtValue: """ Factory method to construct an OrtValue (which holds a Tensor) from a given Numpy object A copy of the data in the Numpy object is held by the OrtValue only if the device is NOT cpu @@ -749,7 +752,7 @@ def ortvalue_from_numpy(numpy_obj, device_type="cpu", device_id=0): # Hold a reference to the numpy object (if device_type is 'cpu') as the OrtValue # is backed directly by the data buffer of the numpy object and so the numpy object # must be around until this OrtValue instance is around - return OrtValue( + return cls( C.OrtValue.ortvalue_from_numpy( numpy_obj, C.OrtDevice( @@ -761,8 +764,8 @@ def ortvalue_from_numpy(numpy_obj, device_type="cpu", device_id=0): numpy_obj if device_type.lower() == "cpu" else None, ) - @staticmethod - def ortvalue_from_numpy_with_onnx_type(data, onnx_element_type: int): + @classmethod + def ortvalue_from_numpy_with_onnx_type(cls, data: np.ndarray, /, onnx_element_type: int) -> OrtValue: """ This method creates an instance of OrtValue on top of the numpy array. No data copy is made and the lifespan of the resulting OrtValue should never @@ -771,12 +774,14 @@ def ortvalue_from_numpy_with_onnx_type(data, onnx_element_type: int): when we want to use an ONNX data type that is not supported by numpy. :param data: numpy.ndarray. - :param onnx_elemenet_type: a valid onnx TensorProto::DataType enum value + :param onnx_element_type: a valid onnx TensorProto::DataType enum value """ - return OrtValue(C.OrtValue.ortvalue_from_numpy_with_onnx_type(data, onnx_element_type), data) + return cls(C.OrtValue.ortvalue_from_numpy_with_onnx_type(data, onnx_element_type), data) - @staticmethod - def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu", device_id: int = 0): + @classmethod + def ortvalue_from_shape_and_type( + cls, shape: Sequence[int], element_type, device_type: str = "cpu", device_id: int = 0 + ) -> OrtValue: """ Factory method to construct an OrtValue (which holds a Tensor) from given shape and element_type @@ -788,7 +793,7 @@ def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu", # Integer for onnx element type (see https://onnx.ai/onnx/api/mapping.html). # This is helpful for some data type (like TensorProto.BFLOAT16) that is not available in numpy. 
if isinstance(element_type, int): - return OrtValue( + return cls( C.OrtValue.ortvalue_from_shape_and_onnx_type( shape, element_type, @@ -800,7 +805,7 @@ def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu", ) ) - return OrtValue( + return cls( C.OrtValue.ortvalue_from_shape_and_type( shape, element_type, @@ -812,77 +817,77 @@ def ortvalue_from_shape_and_type(shape, element_type, device_type: str = "cpu", ) ) - @staticmethod - def ort_value_from_sparse_tensor(sparse_tensor): + @classmethod + def ort_value_from_sparse_tensor(cls, sparse_tensor: SparseTensor) -> OrtValue: """ The function will construct an OrtValue instance from a valid SparseTensor The new instance of OrtValue will assume the ownership of sparse_tensor """ - return OrtValue(C.OrtValue.ort_value_from_sparse_tensor(sparse_tensor._get_c_tensor())) + return cls(C.OrtValue.ort_value_from_sparse_tensor(sparse_tensor._get_c_tensor())) - def as_sparse_tensor(self): + def as_sparse_tensor(self) -> SparseTensor: """ The function will return SparseTensor contained in this OrtValue """ return SparseTensor(self._ortvalue.as_sparse_tensor()) - def data_ptr(self): + def data_ptr(self) -> int: """ Returns the address of the first element in the OrtValue's data buffer """ return self._ortvalue.data_ptr() - def device_name(self): + def device_name(self) -> str: """ Returns the name of the device where the OrtValue's data buffer resides e.g. cpu, cuda, cann """ return self._ortvalue.device_name().lower() - def shape(self): + def shape(self) -> Sequence[int]: """ Returns the shape of the data in the OrtValue """ return self._ortvalue.shape() - def data_type(self): + def data_type(self) -> str: """ - Returns the data type of the data in the OrtValue + Returns the data type of the data in the OrtValue. E.g. 'tensor(int64)' """ return self._ortvalue.data_type() - def element_type(self): + def element_type(self) -> int: """ Returns the proto type of the data in the OrtValue if the OrtValue is a tensor. """ return self._ortvalue.element_type() - def has_value(self): + def has_value(self) -> bool: """ Returns True if the OrtValue corresponding to an optional type contains data, else returns False """ return self._ortvalue.has_value() - def is_tensor(self): + def is_tensor(self) -> bool: """ Returns True if the OrtValue contains a Tensor, else returns False """ return self._ortvalue.is_tensor() - def is_sparse_tensor(self): + def is_sparse_tensor(self) -> bool: """ Returns True if the OrtValue contains a SparseTensor, else returns False """ return self._ortvalue.is_sparse_tensor() - def is_tensor_sequence(self): + def is_tensor_sequence(self) -> bool: """ Returns True if the OrtValue contains a Tensor Sequence, else returns False """ return self._ortvalue.is_tensor_sequence() - def numpy(self): + def numpy(self) -> np.ndarray: """ Returns a Numpy object from the OrtValue. Valid only for OrtValues holding Tensors. Throws for OrtValues holding non-Tensors. @@ -890,7 +895,7 @@ def numpy(self): """ return self._ortvalue.numpy() - def update_inplace(self, np_arr): + def update_inplace(self, np_arr) -> None: """ Update the OrtValue in place with a new Numpy array. The numpy contents are copied over to the device memory backing the OrtValue. 
It can be used @@ -948,7 +953,7 @@ class SparseTensor: depending on the format """ - def __init__(self, sparse_tensor): + def __init__(self, sparse_tensor: C.SparseTensor): """ Internal constructor """ @@ -960,11 +965,17 @@ def __init__(self, sparse_tensor): "`Provided object` needs to be of type `onnxruntime.capi.onnxruntime_pybind11_state.SparseTensor`" ) - def _get_c_tensor(self): + def _get_c_tensor(self) -> C.SparseTensor: return self._tensor - @staticmethod - def sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device): + @classmethod + def sparse_coo_from_numpy( + cls, + dense_shape: npt.NDArray[np.int64], + values: np.ndarray, + coo_indices: npt.NDArray[np.int64], + ort_device: OrtDevice, + ) -> SparseTensor: """ Factory method to construct a SparseTensor in COO format from given arguments @@ -985,12 +996,17 @@ def sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device): For strings and objects, it will create a copy of the arrays in CPU memory as ORT does not support those on other devices and their memory can not be mapped. """ - return SparseTensor( - C.SparseTensor.sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device._get_c_device()) - ) + return cls(C.SparseTensor.sparse_coo_from_numpy(dense_shape, values, coo_indices, ort_device._get_c_device())) - @staticmethod - def sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices, ort_device): + @classmethod + def sparse_csr_from_numpy( + cls, + dense_shape: npt.NDArray[np.int64], + values: np.ndarray, + inner_indices: npt.NDArray[np.int64], + outer_indices: npt.NDArray[np.int64], + ort_device: OrtDevice, + ) -> SparseTensor: """ Factory method to construct a SparseTensor in CSR format from given arguments @@ -1011,7 +1027,7 @@ def sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices, ort For strings and objects, it will create a copy of the arrays in CPU memory as ORT does not support those on other devices and their memory can not be mapped. """ - return SparseTensor( + return cls( C.SparseTensor.sparse_csr_from_numpy( dense_shape, values, @@ -1021,7 +1037,7 @@ def sparse_csr_from_numpy(dense_shape, values, inner_indices, outer_indices, ort ) ) - def values(self): + def values(self) -> np.ndarray: """ The method returns a numpy array that is backed by the native memory if the data type is numeric. Otherwise, the returned numpy array that contains @@ -1093,19 +1109,19 @@ def format(self): """ return self._tensor.format - def dense_shape(self): + def dense_shape(self) -> npt.NDArray[np.int64]: """ Returns a numpy array(int64) containing a dense shape of a sparse tensor """ return self._tensor.dense_shape() - def data_type(self): + def data_type(self) -> str: """ Returns a string data type of the data in the OrtValue """ return self._tensor.data_type() - def device_name(self): + def device_name(self) -> str: """ Returns the name of the device where the SparseTensor data buffers reside e.g. cpu, cuda """ From a85977dd4f1b66ced92276e8fe22ace2b82465ee Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Wed, 19 Mar 2025 07:46:10 +0800 Subject: [PATCH 06/21] [webgpu] Limit that K must be divisible by 128 to apply dp4a matmul (#24078) The DP4AMatMulQuantize shader needs to make sure that K is divisible by 128. Otherwise, we need align the scale to have shape [M, ceil(K / 128)]. To simplify the shader, we limit that K must be divisible by 128 to apply dp4a matmul. 
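For reference, a simplified restatement (a Python sketch, not the ORT source) of the eligibility predicate `CanApplyDP4AMatrixMatMulNBits` after this change; the only functional difference from before is that `K % 128 == 0` replaces `K % 64 == 0`, so the quantize shader never sees a partial 128-element block when computing per-block scales:

```python
# Sketch of the tightened dp4a eligibility check. Parameters mirror the C++
# predicate; the device capability flags are passed in explicitly here.
def can_apply_dp4a_matmul_nbits(accuracy_level: int, block_size: int,
                                batch_count: int, K: int, N: int,
                                has_zero_points: bool,
                                has_subgroups: bool,
                                is_metal_backend: bool) -> bool:
    use_dp4a = has_subgroups and not is_metal_backend
    return (accuracy_level == 4
            and block_size % 32 == 0
            and batch_count == 1
            and K % 128 == 0   # tightened from 64; also implies components_k == 4
            and N % 16 == 0
            and not has_zero_points
            and use_dp4a)
```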
--- .../webgpu/quantization/dp4a_matmul_nbits.cc | 16 +++------------- .../webgpu/quantization/dp4a_matmul_nbits.h | 1 - .../test/contrib_ops/matmul_4bits_test.cc | 2 ++ 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc index 05cbfb1f99c48..65807b072bc80 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc @@ -12,21 +12,12 @@ Status DP4AMatMulQuantizeProgram::GenerateShaderCode(ShaderHelper& shader) const shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); shader.AddOutput("output", ShaderUsage::UseUniform); shader.AddOutput("scales", ShaderUsage::UseUniform); - shader.AdditionalImplementation() << R"ADDNL_FN( - fn readInput(offset: u32) -> input_a_value_t - { - if (offset > uniforms.input_size) { - return input_a_value_t(0); - } - return input_a[offset]; - } - )ADDNL_FN"; shader.MainFunctionBody() << R"MAIN_FN( var local_a : array, 32>; var max_value:vec4 = vec4(0); for (var idx:u32=0;idx<32;idx+=1) { - local_a[idx] = readInput(workgroup_idx*32 + idx); + local_a[idx] = input_a[workgroup_idx*32 + idx]; max_value = max(max_value, abs(local_a[idx])); } var scale = max(max_value.x, max_value.y); @@ -279,8 +270,7 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor Tensor a_scale = context.CreateGPUTensor(a->DataType(), a_scales_dims); quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, static_cast(kVec4Components)}}) .AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), 1}, - {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), 1}}) - .AddUniformVariable({static_cast(M * K / kVec4Components)}); + {&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), 1}}); ORT_RETURN_IF_ERROR(context.RunProgram(quantize_program)); constexpr uint32_t kTileSize = 64; @@ -317,7 +307,7 @@ bool CanApplyDP4AMatrixMatMulNBits(onnxruntime::webgpu::ComputeContext& context, bool use_dp4a = context.Device().HasFeature(wgpu::FeatureName::Subgroups) && context.AdapterInfo().backendType != wgpu::BackendType::Metal; return (accuracy_level == 4 && block_size % 32 == 0 && - batch_count == 1 && components_k == 4 && K % 64 == 0 && N % 16 == 0 && + batch_count == 1 && components_k == 4 && K % 128 == 0 && N % 16 == 0 && !has_zero_points && use_dp4a); } diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h index 15b86d78301ad..7e4a8f5d68437 100644 --- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h +++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h @@ -16,7 +16,6 @@ class DP4AMatMulQuantizeProgram final : public Program { diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index b1779ded4a675..8187253311ed3 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -389,6 +389,7 @@ TEST(MatMulNBits, Float32_Accuracy4) { TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); @@ -458,6 +459,7 @@ TEST(MatMulNBits, 
Float16_Accuracy4) { TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); From d98046b36f208da486032fe71de3b76fa28826ed Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 18 Mar 2025 17:53:57 -0700 Subject: [PATCH 07/21] Add macOS ARM64 pipeline for webgpu (#24060) ### Description Add macOS ARM64 pipeline for webgpu. This pipeline is a temporary one. I created this pipeline because the current code already fails on macOS ARM64 for WebGPU EP. Adding this pipeline allows to check the status of the fix, and eventually when the build passes, this pipeline will be merged with the existing macOS arm64 pipeline. --- .github/workflows/mac.yml | 40 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 50dd25898ad35..fe7c7fb7ab4c8 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -154,6 +154,46 @@ jobs: --use_xnnpack \ --use_binskim_compliant_compile_flags + ARM64-Xcode16-webgpu: + runs-on: macos-15 + + env: + xcode_version: 16 + + timeout-minutes: 60 + + steps: + - uses: actions/setup-python@v5 + with: + python-version: ${{ env.python_version }} + + - name: Verify ARM64 machine + shell: python + run: | + import platform + assert platform.machine() == "arm64", "This job expects to be run on an ARM64 machine." + + - name: Use Xcode ${{ env.xcode_version }} + shell: bash + run: | + XCODE_DEVELOPER_DIR="/Applications/Xcode_${{ env.xcode_version }}.app/Contents/Developer" + sudo xcode-select --switch "${XCODE_DEVELOPER_DIR}" + + - uses: actions/checkout@v4 + + - name: Build and test + shell: bash + run: | + python ./tools/ci_build/build.py \ + --build_dir ./build \ + --update \ + --build --parallel \ + --test \ + --build_shared_lib \ + --build_nodejs \ + --use_webgpu \ + --use_binskim_compliant_compile_flags + ARM64-Xcode16-targeting-iphonesimulator: runs-on: macos-15 From eceae8b2e7447820c22c4a4cf73f7bbc6717c807 Mon Sep 17 00:00:00 2001 From: Enrico Galli Date: Tue, 18 Mar 2025 23:09:06 -0700 Subject: [PATCH 08/21] [WebNN/WebGPU JS] Fix shared Module methods overriding each other (#23998) - Renamed all conflicting WebNN methods from `jsep*` to `webnn*`. - WebNN doesn't need flush(), therefore it doesn't need to set `jsepBackend`. 
This PR addresses issue microsoft/webnn-developer-preview#78 --- js/web/lib/wasm/wasm-core-impl.ts | 28 +++++------ js/web/lib/wasm/wasm-types.ts | 44 ++++++++++++------ onnxruntime/core/providers/webnn/allocator.cc | 4 +- .../core/providers/webnn/builders/model.cc | 6 +-- .../providers/webnn/builders/model_builder.cc | 16 +++---- .../core/providers/webnn/data_transfer.cc | 6 +-- .../webnn/webnn_execution_provider.cc | 10 ++-- onnxruntime/wasm/pre-jsep.js | 46 ++++++++++--------- 8 files changed, 89 insertions(+), 71 deletions(-) diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index 3979af7fa1ec9..bb532e0fbae74 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -309,12 +309,12 @@ export const createSession = async ( if (context) { wasm.currentContext = context as MLContext; } else if (gpuDevice) { - wasm.currentContext = await wasm.jsepCreateMLContext!(gpuDevice); + wasm.currentContext = await wasm.webnnCreateMLContext!(gpuDevice); } else { - wasm.currentContext = await wasm.jsepCreateMLContext!({ deviceType, powerPreference }); + wasm.currentContext = await wasm.webnnCreateMLContext!({ deviceType, powerPreference }); } } else { - wasm.currentContext = await wasm.jsepCreateMLContext!(); + wasm.currentContext = await wasm.webnnCreateMLContext!(); } break; } @@ -330,7 +330,7 @@ export const createSession = async ( // clear current MLContext after session creation if (wasm.currentContext) { - wasm.jsepRegisterMLContext!(sessionHandle, wasm.currentContext); + wasm.webnnRegisterMLContext!(sessionHandle, wasm.currentContext); wasm.currentContext = undefined; wasm.shouldTransferToMLTensor = true; } @@ -454,6 +454,7 @@ export const releaseSession = (sessionId: number): void => { } wasm.jsepOnReleaseSession?.(sessionId); + wasm.webnnOnReleaseSession?.(sessionId); wasm.webgpuOnReleaseSession?.(sessionId); inputNamesUTF8Encoded.forEach((buf) => wasm._OrtFree(buf)); @@ -520,7 +521,7 @@ export const prepareInputOutputTensor = async ( const mlTensor = tensor[2].mlTensor as MLTensor; dataByteLength = calculateTensorSizeInBytes(tensorDataTypeStringToEnum(dataType), dims)!; - const registerMLTensor = wasm.jsepRegisterMLTensor; + const registerMLTensor = wasm.webnnRegisterMLTensor; if (!registerMLTensor) { throw new Error('Tensor location "ml-tensor" is not supported without using WebNN.'); } @@ -540,7 +541,7 @@ export const prepareInputOutputTensor = async ( wasm.setValue(rawData + i * ptrSize, allocWasmString(data[i], allocs), '*'); } } else { - const isGraphInput = wasm.jsepIsGraphInput; + const isGraphInput = wasm.webnnIsGraphInput; if (dataType !== 'string' && isGraphInput) { const tensorNameUTF8 = wasm._OrtGetInputName(sessionId, index); const tensorName = wasm.UTF8ToString(tensorNameUTF8); @@ -549,8 +550,8 @@ export const prepareInputOutputTensor = async ( const dataTypeEnum = tensorDataTypeStringToEnum(dataType); dataByteLength = calculateTensorSizeInBytes(dataTypeEnum, dims)!; actualLocation = 'ml-tensor'; - const createTemporaryTensor = wasm.jsepCreateTemporaryTensor; - const uploadTensor = wasm.jsepUploadTensor; + const createTemporaryTensor = wasm.webnnCreateTemporaryTensor; + const uploadTensor = wasm.webnnUploadTensor; if (!createTemporaryTensor || !uploadTensor) { throw new Error('Tensor location "ml-tensor" is not supported without using WebNN.'); } @@ -722,6 +723,7 @@ export const run = async ( } wasm.jsepOnRunStart?.(sessionHandle); + wasm.webnnOnRunStart?.(sessionHandle); let errorCode: number; if 
(!BUILD_DEFS.DISABLE_JSEP && ioBindingState) { @@ -862,8 +864,8 @@ export const run = async ( ]); } } else if (preferredLocation === 'ml-tensor' && size > 0) { - const ensureTensor = wasm.jsepEnsureTensor; - const isInt64Supported = wasm.jsepIsInt64Supported; + const ensureTensor = wasm.webnnEnsureTensor; + const isInt64Supported = wasm.webnnIsInt64Supported; if (!ensureTensor || !isInt64Supported) { throw new Error('preferredLocation "ml-tensor" is not supported without using WebNN.'); } @@ -890,9 +892,9 @@ export const run = async ( dims, { mlTensor, - download: wasm.jsepCreateMLTensorDownloader!(dataOffset, type), + download: wasm.webnnCreateMLTensorDownloader!(dataOffset, type), dispose: () => { - wasm.jsepReleaseTensorId!(dataOffset); + wasm.webnnReleaseTensorId!(dataOffset); wasm._OrtReleaseTensor(tensor); }, }, @@ -915,7 +917,7 @@ export const run = async ( if (!keepOutputTensor) { wasm._OrtReleaseTensor(tensor); } - wasm.jsepOnRunEnd?.(sessionHandle); + wasm.webnnOnRunEnd?.(sessionHandle); } } diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts index 6de54078af031..752bac28d7efb 100644 --- a/js/web/lib/wasm/wasm-types.ts +++ b/js/web/lib/wasm/wasm-types.ts @@ -156,12 +156,26 @@ export declare namespace JSEP { */ shouldTransferToMLTensor: boolean; + /** + * [exported from pre-jsep.js] Called when InferenceSession.run started. This function will be called before + * _OrtRun[WithBinding]() is called. + * @param sessionId - specify the session ID. + */ + webnnOnRunStart: (sessionId: number) => void; + /** + * [exported from pre-jsep.js] Release a session. This function will be called before _OrtReleaseSession() is + * called. + * @param sessionId - specify the session ID. + * @returns + */ + webnnOnReleaseSession: (sessionId: number) => void; + /** * [exported from pre-jsep.js] Called when InferenceSession.run finished. This function will be called after * _OrtRun[WithBinding]() is called. * @param sessionId - specify the session ID. */ - jsepOnRunEnd: (sessionId: number) => void; + webnnOnRunEnd: (sessionId: number) => void; /** * [exported from pre-jsep.js] Register MLContext for a session. @@ -169,18 +183,18 @@ export declare namespace JSEP { * @param context - specify the MLContext. * @returns */ - jsepRegisterMLContext: (sessionId: number, context: MLContext) => void; + webnnRegisterMLContext: (sessionId: number, context: MLContext) => void; /** * [exported from pre-jsep.js] Reserve a MLTensor ID attached to the current session. * @returns the MLTensor ID. */ - jsepReserveTensorId: () => number; + webnnReserveTensorId: () => number; /** * [exported from pre-jsep.js] Release an MLTensor ID from use and destroys underlying MLTensor if no longer in use. * @param tensorId - specify the MLTensor ID. * @returns */ - jsepReleaseTensorId: (tensorId: number) => void; + webnnReleaseTensorId: (tensorId: number) => void; /** * [exported from pre-jsep.js] Ensure that an MLTensor of a given type and shape exists for a MLTensor ID. * @param sessionId - specify the session ID or current active session ID if undefined. @@ -190,7 +204,7 @@ export declare namespace JSEP { * @param copyOld - specify whether to copy the old tensor if a new tensor was created. * @returns the MLTensor associated with the tensor ID. */ - jsepEnsureTensor: ( + webnnEnsureTensor: ( sessionId: number | undefined, tensorId: number, dataType: DataType, @@ -203,20 +217,20 @@ export declare namespace JSEP { * @param data - specify the data to upload. 
It can be a TensorProto::data_type or a WebNN MLOperandDataType. * @returns */ - jsepUploadTensor: (tensorId: number, data: Uint8Array) => void; + webnnUploadTensor: (tensorId: number, data: Uint8Array) => void; /** * [exported from pre-jsep.js] Download data from an MLTensor. * @param tensorId - specify the MLTensor ID. * @returns the downloaded data. */ - jsepDownloadTensor: (tensorId: number, dstBuffer: ArrayBufferView | ArrayBuffer) => Promise; + webnnDownloadTensor: (tensorId: number, dstBuffer: ArrayBufferView | ArrayBuffer) => Promise; /** * [exported from pre-jsep.js] Creates a downloader function to download data from an MLTensor. * @param tensorId - specify the MLTensor ID. * @param type - specify the data type. * @returns the downloader function. */ - jsepCreateMLTensorDownloader: ( + webnnCreateMLTensorDownloader: ( tensorId: number, type: Tensor.MLTensorDataTypes, ) => () => Promise; @@ -228,7 +242,7 @@ export declare namespace JSEP { * @param dimensions - specify the dimensions. * @returns the MLTensor ID for the external MLTensor. */ - jsepRegisterMLTensor: ( + webnnRegisterMLTensor: ( sessionId: number, tensor: MLTensor, onnxDataType: DataType, @@ -240,7 +254,7 @@ export declare namespace JSEP { * @param optionsOrGpuDevice - specify the options or GPUDevice. * @returns */ - jsepCreateMLContext(optionsOrGpuDevice?: MLContextOptions | GPUDevice): Promise; + webnnCreateMLContext(optionsOrGpuDevice?: MLContextOptions | GPUDevice): Promise; /** * [exported from pre-jsep.js] Register a WebNN Constant operand from external data. @@ -252,7 +266,7 @@ export declare namespace JSEP { * @param shouldConvertInt64ToInt32 - specify whether to convert int64 to int32. * @returns the WebNN Constant operand for the specified external data. */ - jsepRegisterMLConstant( + webnnRegisterMLConstant( externalFilePath: string, dataOffset: number, dataLength: number, @@ -265,14 +279,14 @@ export declare namespace JSEP { * [exported from pre-jsep.js] Register a WebNN graph input. * @param inputName - specify the input name. */ - jsepRegisterGraphInput: (inputName: string) => void; + webnnRegisterGraphInput: (inputName: string) => void; /** * [exported from pre-jsep.js] Check if a graph input is a WebNN graph input. * @param sessionId - specify the session ID. * @param inputName - specify the input name. * @returns whether the input is a WebNN graph input. */ - jsepIsGraphInput: (sessionId: number, inputName: string) => boolean; + webnnIsGraphInput: (sessionId: number, inputName: string) => boolean; /** * [exported from pre-jsep.js] Create a temporary MLTensor for a session. * @param sessionId - specify the session ID. @@ -280,13 +294,13 @@ export declare namespace JSEP { * @param shape - specify the shape. * @returns the MLTensor ID for the temporary MLTensor. */ - jsepCreateTemporaryTensor: (sessionId: number, dataType: DataType, shape: readonly number[]) => Promise; + webnnCreateTemporaryTensor: (sessionId: number, dataType: DataType, shape: readonly number[]) => Promise; /** * [exported from pre-jsep.js] Check if a session's associated WebNN Context supports int64. * @param sessionId - specify the session ID. * @returns whether the WebNN Context supports int64. 
*/ - jsepIsInt64Supported: (sessionId: number) => boolean; + webnnIsInt64Supported: (sessionId: number) => boolean; } } diff --git a/onnxruntime/core/providers/webnn/allocator.cc b/onnxruntime/core/providers/webnn/allocator.cc index 9c5cd651e1f00..8cf5b8cd72a5c 100644 --- a/onnxruntime/core/providers/webnn/allocator.cc +++ b/onnxruntime/core/providers/webnn/allocator.cc @@ -16,7 +16,7 @@ void* WebNNTensorAllocator::Alloc(size_t size) { // We don't need to transfer the tensor to an MLTensor, so we don't need to allocate an MLTensor id. return nullptr; } - void* p = EM_ASM_PTR({ return Module.jsepReserveTensorId(); }); + void* p = EM_ASM_PTR({ return Module.webnnReserveTensorId(); }); allocations_[p] = size; stats_.num_allocs++; stats_.bytes_in_use += SafeInt(size); @@ -27,7 +27,7 @@ void WebNNTensorAllocator::Free(void* p) { if (p == nullptr) { return; } - EM_ASM({ Module.jsepReleaseTensorId($0); }, p); + EM_ASM({ Module.webnnReleaseTensorId($0); }, p); size_t size = allocations_[p]; stats_.bytes_in_use -= size; allocations_.erase(p); diff --git a/onnxruntime/core/providers/webnn/builders/model.cc b/onnxruntime/core/providers/webnn/builders/model.cc index 35964d85862e4..492e2f717e30e 100644 --- a/onnxruntime/core/providers/webnn/builders/model.cc +++ b/onnxruntime/core/providers/webnn/builders/model.cc @@ -157,7 +157,7 @@ onnxruntime::common::Status Model::Compute(const InlinedHashMap& inputs, const InlinedHashMap& outputs) { - auto jsepEnsureTensor = emscripten::val::module_property("jsepEnsureTensor"); + auto webnnEnsureTensor = emscripten::val::module_property("webnnEnsureTensor"); auto promises = emscripten::val::array(); for (const auto& [_, tensor] : inputs) { emscripten::val shape = emscripten::val::array(); @@ -165,7 +165,7 @@ onnxruntime::common::Status Model::Dispatch(const InlinedHashMap(dim); shape.call("push", dim_val); } - auto ml_tensor = jsepEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, true); + auto ml_tensor = webnnEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, true); promises.call("push", ml_tensor); } for (const auto& [_, tensor] : outputs) { @@ -174,7 +174,7 @@ onnxruntime::common::Status Model::Dispatch(const InlinedHashMap(dim); shape.call("push", dim_val); } - auto ml_tensor = jsepEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, false); + auto ml_tensor = webnnEnsureTensor(emscripten::val::undefined(), reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, false); promises.call("push", ml_tensor); } auto ml_tensors = emscripten::val::global("Promise").call("all", promises).await(); diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.cc b/onnxruntime/core/providers/webnn/builders/model_builder.cc index 661b2ad7056c2..ed6ab7d2d7115 100644 --- a/onnxruntime/core/providers/webnn/builders/model_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/model_builder.cc @@ -140,13 +140,13 @@ Status ModelBuilder::RegisterInitializers() { ORT_RETURN_IF_ERROR(utils::GetExternalDataInfo( tensor, graph_viewer_.ModelPath(), external_file_path, data_offset, tensor_byte_size)); - auto jsepRegisterMLConstant = emscripten::val::module_property("jsepRegisterMLConstant"); - operand = jsepRegisterMLConstant(emscripten::val(external_file_path), - static_cast(data_offset), - static_cast(tensor_byte_size), - wnn_builder_, - desc, - should_convert_int64_to_int32); 
+ auto webnnRegisterMLConstant = emscripten::val::module_property("webnnRegisterMLConstant"); + operand = webnnRegisterMLConstant(emscripten::val(external_file_path), + static_cast(data_offset), + static_cast(tensor_byte_size), + wnn_builder_, + desc, + should_convert_int64_to_int32); } else { if (tensor.has_raw_data()) { tensor_ptr = reinterpret_cast(const_cast(tensor.raw_data().c_str())); @@ -288,7 +288,7 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i desc.set("dataType", emscripten::val("int32")); } wnn_operands_.insert(std::make_pair(name, wnn_builder_.call("input", name, desc))); - emscripten::val::module_property("jsepRegisterGraphInput")(name); + emscripten::val::module_property("webnnRegisterGraphInput")(name); input_names_.push_back(name); } else { output_names_.push_back(name); diff --git a/onnxruntime/core/providers/webnn/data_transfer.cc b/onnxruntime/core/providers/webnn/data_transfer.cc index 44e9bf9edf3d9..aa85277b72453 100644 --- a/onnxruntime/core/providers/webnn/data_transfer.cc +++ b/onnxruntime/core/providers/webnn/data_transfer.cc @@ -29,11 +29,11 @@ common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const { const auto& dst_device = dst.Location().device; if (dst_device.Type() == OrtDevice::GPU) { - EM_ASM({ Module.jsepUploadTensor($0, HEAPU8.subarray($1, $1 + $2)); }, dst_data, reinterpret_cast(src_data), bytes); + EM_ASM({ Module.webnnUploadTensor($0, HEAPU8.subarray($1, $1 + $2)); }, dst_data, reinterpret_cast(src_data), bytes); } else { - auto jsepDownloadTensor = emscripten::val::module_property("jsepDownloadTensor"); + auto webnnDownloadTensor = emscripten::val::module_property("webnnDownloadTensor"); auto subarray = emscripten::typed_memory_view(bytes, static_cast(dst_data)); - jsepDownloadTensor(reinterpret_cast(src_data), subarray).await(); + webnnDownloadTensor(reinterpret_cast(src_data), subarray).await(); } } diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc index 7410ff66add30..c527ba213e55b 100644 --- a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc +++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc @@ -284,7 +284,7 @@ class WebNNMemcpy : public OpKernel { explicit WebNNMemcpy(const OpKernelInfo& info) : OpKernel(info) {} Status Compute(OpKernelContext* context) const override { - auto jsepEnsureTensor = emscripten::val::module_property("jsepEnsureTensor"); + auto webnnEnsureTensor = emscripten::val::module_property("webnnEnsureTensor"); const auto* X = context->Input(0); ORT_ENFORCE(X != nullptr, "Memcpy: input tensor is null"); auto* Y = context->Output(0, X->Shape()); @@ -294,10 +294,10 @@ class WebNNMemcpy : public OpKernel { shape.call("push", SafeInt(dim).Ref()); } - jsepEnsureTensor(emscripten::val::undefined(), - reinterpret_cast(Y->MutableDataRaw()), - Y->GetElementType(), - shape, false) + webnnEnsureTensor(emscripten::val::undefined(), + reinterpret_cast(Y->MutableDataRaw()), + Y->GetElementType(), + shape, false) .await(); const auto* data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device); diff --git a/onnxruntime/wasm/pre-jsep.js b/onnxruntime/wasm/pre-jsep.js index a11c6d741d110..cca8da0525fbe 100644 --- a/onnxruntime/wasm/pre-jsep.js +++ b/onnxruntime/wasm/pre-jsep.js @@ -97,41 +97,40 @@ Module["jsepInit"] = (name, params) => { // Functions called via emscripten::val::module_property need to be assigned by name so 
that the minifier doesn't // change the name. + const backend = params[0]; [ - Module.jsepBackend, - Module.jsepReserveTensorId, - Module.jsepReleaseTensorId, - Module["jsepEnsureTensor"], - Module.jsepUploadTensor, - Module["jsepDownloadTensor"], - ] = params; + Module.webnnReserveTensorId, + Module.webnnReleaseTensorId, + Module["webnnEnsureTensor"], + Module.webnnUploadTensor, + Module["webnnDownloadTensor"], + ] = params.slice(1); // This function is called from both JS and an EM_ASM block, it needs both a minifiable name and an explicit name. - Module["jsepReleaseTensorId"] = Module.jsepReleaseTensorId; - Module["jsepUploadTensor"] = Module.jsepUploadTensor; + Module["webnnReleaseTensorId"] = Module.webnnReleaseTensorId; + Module["webnnUploadTensor"] = Module.webnnUploadTensor; // Functions called from JS also need to have explicit names. - const backend = Module.jsepBackend; - Module["jsepOnRunStart"] = (sessionId) => { + Module["webnnOnRunStart"] = (sessionId) => { return backend["onRunStart"](sessionId); }; - Module["jsepOnRunEnd"] = backend["onRunEnd"].bind(backend); - Module["jsepRegisterMLContext"] = (sessionId, mlContext) => { + Module["webnnOnRunEnd"] = backend["onRunEnd"].bind(backend); + Module["webnnRegisterMLContext"] = (sessionId, mlContext) => { backend["registerMLContext"](sessionId, mlContext); }; - Module["jsepOnReleaseSession"] = (sessionId) => { + Module["webnnOnReleaseSession"] = (sessionId) => { backend["onReleaseSession"](sessionId); }; - Module["jsepCreateMLTensorDownloader"] = (tensorId, type) => { + Module["webnnCreateMLTensorDownloader"] = (tensorId, type) => { return backend["createMLTensorDownloader"](tensorId, type); }; - Module["jsepRegisterMLTensor"] = (sessionId, tensor, dataType, shape) => { + Module["webnnRegisterMLTensor"] = (sessionId, tensor, dataType, shape) => { return backend["registerMLTensor"](sessionId, tensor, dataType, shape); }; - Module["jsepCreateMLContext"] = (optionsOrGpuDevice) => { + Module["webnnCreateMLContext"] = (optionsOrGpuDevice) => { return backend["createMLContext"](optionsOrGpuDevice); }; - Module["jsepRegisterMLConstant"] = ( + Module["webnnRegisterMLConstant"] = ( externalFilePath, dataOffset, dataLength, @@ -149,9 +148,12 @@ Module["jsepInit"] = (name, params) => { shouldConvertInt64ToInt32, ); }; - Module['jsepRegisterGraphInput'] = backend['registerGraphInput'].bind(backend); - Module['jsepIsGraphInput'] = backend['isGraphInput'].bind(backend); - Module['jsepCreateTemporaryTensor'] = backend['createTemporaryTensor'].bind(backend); - Module['jsepIsInt64Supported'] = backend['isInt64Supported'].bind(backend); + Module["webnnRegisterGraphInput"] = + backend["registerGraphInput"].bind(backend); + Module["webnnIsGraphInput"] = backend["isGraphInput"].bind(backend); + + Module["webnnCreateTemporaryTensor"] = + backend["createTemporaryTensor"].bind(backend); + Module["webnnIsInt64Supported"] = backend["isInt64Supported"].bind(backend); } }; From 7fc7d5ec750decb1ff546184f94d90719ccdb00c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erick=20Mu=C3=B1oz?= Date: Wed, 19 Mar 2025 02:27:25 -0600 Subject: [PATCH 09/21] Enable multithreading on FP16 to FP32 cast operator (#23619) ### Description Enables multithreading on the FP16 to FP32 cast operator. ### Motivation and Context Improves CPU performance on FP16 models that require casting to FP32.
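The new dispatch keeps small tensors on the calling thread and only parallelizes above a size threshold, feeding a per-element cost estimate (derived from the available SIMD ISA) to the thread pool. Below is a minimal standalone sketch of the span-splitting idea, assuming plain `std::thread` in place of ORT's `concurrency::ThreadPool` cost model and a caller-supplied callback standing in for `MlasConvertHalfToFloatBuffer`; the names are illustrative, not the actual kernel:

```cpp
#include <algorithm>
#include <cstddef>
#include <thread>
#include <vector>

constexpr std::size_t kSingleThreadMax = 128000;  // size threshold used by the patch

// Convert n elements from src into dst, splitting the buffer into contiguous
// spans and converting each span on its own worker when the tensor is large
// enough to amortize the threading overhead.
template <typename Src, typename Dst, typename ConvertFn>
void ParallelConvert(const Src* src, Dst* dst, std::size_t n, ConvertFn convert) {
  if (n <= kSingleThreadMax) {
    convert(src, dst, n);  // small tensor: threading would cost more than it saves
    return;
  }
  const std::size_t workers =
      std::max<std::size_t>(1, std::thread::hardware_concurrency());
  const std::size_t span = (n + workers - 1) / workers;  // ceil(n / workers)
  std::vector<std::thread> pool;
  for (std::size_t first = 0; first < n; first += span) {
    const std::size_t count = std::min(span, n - first);
    // Spans are disjoint, so workers write to dst without synchronization.
    pool.emplace_back([=] { convert(src + first, dst + first, count); });
  }
  for (auto& t : pool) t.join();
}
```

In the actual kernel the same idea goes through `concurrency::ThreadPool::TryParallelFor`, which additionally weighs the estimated compute cycles against the pool's scheduling overhead before deciding how many spans to create.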
--- .../core/providers/cpu/tensor/cast_op.cc | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/cast_op.cc b/onnxruntime/core/providers/cpu/tensor/cast_op.cc index 35f3b12aeba35..639a49cb43a4f 100644 --- a/onnxruntime/core/providers/cpu/tensor/cast_op.cc +++ b/onnxruntime/core/providers/cpu/tensor/cast_op.cc @@ -254,11 +254,32 @@ struct TensorCasterNoSat { // tensor MLFloat16 -> float template <> struct TensorCaster<MLFloat16, float> { - void Cast(const OpKernelContext&, const TensorShape& shape, const Tensor& in, Tensor& out) const { + void Cast(const OpKernelContext& ctx, const TensorShape& shape, const Tensor& in, Tensor& out) const { auto out_data = out.MutableData<float>(); auto in_data = in.Data<MLFloat16>(); const size_t shape_size = narrow<size_t>(shape.Size()); - MlasConvertHalfToFloatBuffer(in_data, out_data, shape_size); + + // Check if the tensor is long enough to use threads + if (shape_size <= 128000) { + MlasConvertHalfToFloatBuffer(in_data, out_data, shape_size); + return; + } + // Calculate the number of compute cycles per implementation + auto cpu_info = CPUIDInfo::GetCPUIDInfo(); + double num_compute_cycles; + if (cpu_info.HasSSE3()) { + num_compute_cycles = static_cast<double>(shape_size >> 1); + } else if (cpu_info.HasAVX2()) { + num_compute_cycles = static_cast<double>(shape_size >> 2); + } else { + num_compute_cycles = static_cast<double>(shape_size * 10); + } + + concurrency::ThreadPool::TryParallelFor(ctx.GetOperatorThreadPool(), shape_size, + {shape_size * 2.f, shape_size * 4.f, num_compute_cycles}, + [in_data, out_data](std::ptrdiff_t first_span, std::ptrdiff_t last_span) { + MlasConvertHalfToFloatBuffer(in_data + first_span, out_data + first_span, static_cast<size_t>(last_span - first_span)); + }); } }; From 3488ba39f7fc8c847cdf55f44f5ebe09c3026614 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 19 Mar 2025 09:32:50 -0700 Subject: [PATCH 10/21] Move Android CI Pipeline to Github Actions (#24094) ### Description Move the Android CI pipeline to GitHub Actions. --- .github/actions/setup-android-ndk/action.yml | 98 +++++++ .github/workflows/android.yml | 147 +++++++++++ ...ndroid-x86_64-crosscompile-ci-pipeline.yml | 241 ------------------ tools/python/util/android/android.py | 42 ++- 4 files changed, 275 insertions(+), 253 deletions(-) create mode 100644 .github/actions/setup-android-ndk/action.yml create mode 100644 .github/workflows/android.yml delete mode 100644 tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml diff --git a/.github/actions/setup-android-ndk/action.yml b/.github/actions/setup-android-ndk/action.yml new file mode 100644 index 0000000000000..fea9745396e81 --- /dev/null +++ b/.github/actions/setup-android-ndk/action.yml @@ -0,0 +1,98 @@ +# .github/actions/setup-android-ndk/action.yml +name: 'Setup Android NDK' +description: 'Installs and configures a specific version of the Android NDK' +inputs: + ndk-version: + description: 'The version of the Android NDK to install (e.g., 27.2.12479018)' + required: true + default: '27.2.12479018' + android-sdk-root: + description: 'The root directory of the Android SDK' + required: true + default: '/usr/local/lib/android/sdk' + +runs: + using: "composite" # Use a composite action for multiple shell commands + steps: + - name: Install coreutils and ninja + shell: bash + run: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build + + - name: Install Android NDK + shell: bash + run: | + set -e + "${{ inputs.android-sdk-root }}/cmdline-tools/latest/bin/sdkmanager"
--install "ndk;${{ inputs.ndk-version }}" + + NDK_PATH="${{ inputs.android-sdk-root }}/ndk/${{ inputs.ndk-version }}" + if [[ ! -d "${NDK_PATH}" ]]; then + echo "NDK directory is not in expected location: ${NDK_PATH}" + exit 1 + fi + + # Use standard environment variable setting in bash and add to GITHUB_ENV + echo "ANDROID_NDK_HOME=${NDK_PATH}" >> $GITHUB_ENV + echo "ANDROID_NDK_ROOT=${NDK_PATH}" >> $GITHUB_ENV + echo "ANDROID_NDK_HOME: ${NDK_PATH}" + echo "ANDROID_NDK_ROOT: ${NDK_PATH}" + + - name: Check if emulator are installed and add to PATH + shell: bash + run: | + if [[ ":$PATH:" == *":${ANDROID_SDK_ROOT}/emulator:"* ]]; then + echo "${ANDROID_SDK_ROOT}/emulator is in PATH" + else + ${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/sdkmanager --install "emulator" + echo "${ANDROID_SDK_ROOT}/emulator" >> $GITHUB_PATH + fi + + - name: Check if platform tools are installed and add to PATH + shell: bash + run: | + if [[ ":$PATH:" == *":${ANDROID_SDK_ROOT}/platform-tools:"* ]]; then + echo "${ANDROID_SDK_ROOT}/platform-tools is in PATH" + else + ${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/sdkmanager --install "platform-tools" + echo "${ANDROID_SDK_ROOT}/platform-tools" >> $GITHUB_PATH + fi + ls -R "${ANDROID_SDK_ROOT}/platform-tools" + + - name: Create Android Emulator + shell: bash + env: + ANDROID_AVD_HOME: ${{ runner.temp }}/android-avd + run: | + python3 tools/python/run_android_emulator.py \ + --android-sdk-root "${ANDROID_SDK_ROOT}" \ + --create-avd --system-image "system-images;android-31;default;x86_64" + + - name: List Android AVDs + shell: bash + env: + ANDROID_AVD_HOME: ${{ runner.temp }}/android-avd + run: | + "${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/avdmanager" list avd + + - name: Check emulator.pid does not exist + shell: bash + run: | + if test -f ./emulator.pid; then + echo "Emulator PID file was not expected to exist but does and has pid: `cat ./emulator.pid`" + exit 1 + fi + + - name: Start Android Emulator + shell: bash + env: + ANDROID_AVD_HOME: ${{ runner.temp }}/android-avd + run: | + set -e -x + python3 tools/python/run_android_emulator.py \ + --android-sdk-root "${ANDROID_SDK_ROOT}" \ + --start --emulator-extra-args="-partition-size 2047" \ + --emulator-pid-file ./emulator.pid + echo "Emulator PID: `cat ./emulator.pid`" + + - name: View Android ENVs + shell: bash + run: env | grep ANDROID \ No newline at end of file diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml new file mode 100644 index 0000000000000..64c40946c49c5 --- /dev/null +++ b/.github/workflows/android.yml @@ -0,0 +1,147 @@ +name: Android CI +# This workflow is used to build and test on Android Emulator on Linux + +on: + push: + branches: + - main + - rel-* + pull_request: + branches: + - main + - rel-* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +jobs: + android_nnapi_ep: + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"] + steps: + - uses: actions/checkout@v4 + + - name: Use jdk 17 + uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '17' + architecture: x64 + + - name: Setup Android NDK + uses: ./.github/actions/setup-android-ndk + with: + ndk-version: 27.2.12479018 + + - name: Export GitHub Actions cache environment variables + uses: actions/github-script@v7 + with: + script: | + core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || ''); + 
core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || ''); + + - name: NNAPI EP, Build, Test on Android Emulator + run: >- + python3 tools/ci_build/build.py + --enable_lto + --android + --build_dir build_nnapi + --android_sdk_path "$ANDROID_HOME" + --android_ndk_path "$ANDROID_NDK_HOME" + --android_abi=x86_64 + --android_api=29 + --skip_submodule_sync + --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache + --use_nnapi + --build_shared_lib + --cmake_generator=Ninja + --build_java + shell: bash + + + - name: Build Minimal ORT with NNAPI and run tests + run: tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh "$(pwd)" + shell: bash + + - name: Install psutil for emulator shutdown by run_android_emulator.py + if: always() + run: python3 -m pip install psutil + shell: bash + + - name: Stop Android Emulator + if: always() + run: | + env | grep ANDROID + if test -f ${{ github.workspace }}/emulator.pid; then + echo "Emulator PID:"`cat ${{ github.workspace }}/emulator.pid` + python3 tools/python/run_android_emulator.py \ + --android-sdk-root "${ANDROID_SDK_ROOT}" \ + --stop \ + --emulator-pid-file ${{ github.workspace }}/emulator.pid + rm ${{ github.workspace }}/emulator.pid + else + echo "Emulator PID file was expected to exist but does not." + fi + shell: bash + + android_cpu_ep: + name: Android CI Pipeline + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"] + steps: + - uses: actions/checkout@v4 + + - name: Use jdk 17 + uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '17' + architecture: x64 + + - name: Setup Android NDK + uses: ./.github/actions/setup-android-ndk + with: + ndk-version: 27.2.12479018 + + - name: Export GitHub Actions cache environment variables + uses: actions/github-script@v7 + with: + script: | + core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || ''); + core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || ''); + + - name: CPU EP, Build and Test + run: >- + python3 tools/ci_build/build.py + --enable_lto + --android + --build_dir build + --android_sdk_path $ANDROID_HOME + --android_ndk_path $ANDROID_NDK_HOME + --android_abi=x86_64 + --android_api=30 + --skip_submodule_sync + --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache + --cmake_generator=Ninja + --build_java + shell: bash + + - name: Install psutil for emulator shutdown by run_android_emulator.py + if: always() + run: python3 -m pip install psutil + shell: bash + + - name: Stop Android Emulator + if: always() + run: | + if test -f ${{ github.workspace }}/emulator.pid; then + echo "Emulator PID:"`cat ${{ github.workspace }}/emulator.pid` + python3 tools/python/run_android_emulator.py \ + --android-sdk-root "${ANDROID_SDK_ROOT}" \ + --stop \ + --emulator-pid-file ${{ github.workspace }}/emulator.pid + rm ${{ github.workspace }}/emulator.pid + else + echo "Emulator PID file was expected to exist but does not." 
+ fi + shell: bash diff --git a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml deleted file mode 100644 index 3cceadd1b8ef5..0000000000000 --- a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml +++ /dev/null @@ -1,241 +0,0 @@ -##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### -### please do rerun set-trigger-rules.py ### -trigger: - branches: - include: - - main - - rel-* - paths: - exclude: - - docs/** - - README.md - - CONTRIBUTING.md - - BUILD.md - - 'js/web' - - 'onnxruntime/core/providers/js' -pr: - branches: - include: - - main - - rel-* - paths: - exclude: - - docs/** - - README.md - - CONTRIBUTING.md - - BUILD.md - - 'js/web' - - 'onnxruntime/core/providers/js' -#### end trigger #### - -# Known Limits -# 1. Anchors are not supported in GHA -# https://github.community/t/support-for-yaml-anchors/16128/90 -# 2. today most cloud-based CI services are still lacking hardware acceleration support from the host VM, -# which is the no.1 blocker for running tests on modern Android Emulators (especially on recent API levels) on CI. - -# It'd better to check out https://github.com/microsoft/onnxruntime/wiki/Leverage-Existing-Artifacts -# to save debugging time. -parameters: -- name: specificArtifact - displayName: Use Specific Artifact - type: boolean - default: false -- name: runId - displayName: Specific Artifact's RunId - type: number - default: 0 - -stages: -# Separate stage for building CPU vs NNAPI as we only want CodeQL to run on one of them so we don't get duplicate -# issues for code that is built in both. We pick NNAPI as that includes the NNAPI EP code. -- stage: BUILD_AND_TEST_CPU - dependsOn: [] - variables: - Codeql.Enabled: false - ANDROID_AVD_HOME: $(Agent.TempDirectory) - jobs: - - job: BUILD_AND_TEST_CPU - pool: onnxruntime-Ubuntu2204-AMD-CPU - workspace: - clean: all - timeoutInMinutes: 30 - steps: - - task: JavaToolInstaller@0 - displayName: Use jdk 17 - inputs: - versionSpec: '17' - jdkArchitectureOption: 'x64' - jdkSourceOption: 'PreInstalled' - - - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build - displayName: Install coreutils and ninja - - - template: templates/use-android-ndk.yml - - template: templates/use-android-emulator.yml - parameters: - create: true - start: true - - script: | - env | grep ANDROID - displayName: View Android ENVs - - script: | - python3 tools/ci_build/build.py \ - --enable_lto \ - --android \ - --build_dir build \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=30 \ - --skip_submodule_sync \ - --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache \ - --cmake_generator=Ninja \ - --build_java - displayName: CPU EP, Build and Test - - template: templates/use-android-emulator.yml - parameters: - stop: true - - - template: templates/clean-agent-build-directory-step.yml - -- stage: BUILD_AND_TEST_NNAPI_EP - dependsOn: [] - condition: notIn(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') - variables: - ANDROID_AVD_HOME: $(Agent.TempDirectory) - Codeql.ProjectConfigPath: .github/workflows - Codeql.Enabled: true - Codeql.Language: cpp - ${{ if variables['Codeql.Enabled'] }}: - JobsTimeout: 120 - ${{ else }}: - JobsTimeout: 60 - jobs: - - job: BUILD_AND_TEST_NNAPI_EP - pool: onnxruntime-Ubuntu2204-AMD-CPU - timeoutInMinutes: ${{ variables.JobsTimeout }} - 
workspace: - clean: all - steps: - - task: JavaToolInstaller@0 - displayName: Use jdk 17 - inputs: - versionSpec: '17' - jdkArchitectureOption: 'x64' - jdkSourceOption: 'PreInstalled' - - - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build - displayName: Install coreutils and ninja - - template: templates/use-android-emulator.yml - parameters: - create: true - start: true - - - script: | - env | grep ANDROID - displayName: View Android ENVs - - - script: | - python3 tools/ci_build/build.py \ - --enable_lto \ - --android \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=29 \ - --skip_submodule_sync \ - --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache \ - --use_nnapi \ - --build_shared_lib \ - --cmake_generator=Ninja \ - --build_java - displayName: NNAPI EP, Build, Test on Android Emulator - - - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) - # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator - displayName: Build Minimal ORT with NNAPI and run tests - - - template: templates/use-android-emulator.yml - parameters: - stop: true - - - template: templates/clean-agent-build-directory-step.yml - -- stage: MAIN_BUILD_STAGE - # The below jobs only run on build of main branch. - # because coverage report is hard to support in cross machines. - displayName: NNAPI MAIN BUILD&TEST - dependsOn: [] - condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') - variables: - ANDROID_AVD_HOME: $(Agent.TempDirectory) - jobs: - - job: NNAPI_EP_MASTER - pool: onnxruntime-Ubuntu2204-AMD-CPU - timeoutInMinutes: 180 - workspace: - clean: all - condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') - steps: - - task: JavaToolInstaller@0 - displayName: Use jdk 17 - inputs: - versionSpec: '17' - jdkArchitectureOption: 'x64' - jdkSourceOption: 'PreInstalled' - - - template: templates/use-android-ndk.yml - - - template: templates/use-android-emulator.yml - parameters: - create: true - start: true - - - script: | - python3 tools/ci_build/build.py \ - --enable_lto \ - --android \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=29 \ - --skip_submodule_sync \ - --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache \ - --use_nnapi \ - --build_shared_lib \ - --cmake_generator=Ninja \ - --build_java \ - --code_coverage - displayName: NNAPI EP, Build, Test, CodeCoverage on Android Emulator - - # We need to use llvm-cov from the NDK. 
- - script: | - export GCOV="$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-cov gcov" - python3 -m pip install gcovr - python3 tools/ci_build/coverage.py --build_dir build_nnapi --android_sdk_path $ANDROID_HOME - displayName: Retrieve runtime code coverage files from the emulator and analyze - - script: cat '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt' - displayName: Print coverage report - # - task: AzureCLI@2 - # displayName: 'Post Android Code Coverage To DashBoard' - # inputs: - # azureSubscription: AIInfraBuild - # scriptType: bash - # scriptPath: $(Build.SourcesDirectory)/tools/ci_build/github/linux/upload_code_coverage_data.sh - # arguments: '"$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt" "https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=$(Build.BuildId)" arm android nnapi' - # workingDirectory: '$(Build.BinariesDirectory)' - - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) - # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator - displayName: Build Minimal ORT with NNAPI and run tests - - template: templates/use-android-emulator.yml - parameters: - stop: true - - template: templates/clean-agent-build-directory-step.yml diff --git a/tools/python/util/android/android.py b/tools/python/util/android/android.py index 8f3ed97cae53f..cd420ca1483c7 100644 --- a/tools/python/util/android/android.py +++ b/tools/python/util/android/android.py @@ -46,18 +46,36 @@ def filename(name, windows_extension): def create_virtual_device(sdk_tool_paths: SdkToolPaths, system_image_package_name: str, avd_name: str): run(sdk_tool_paths.sdkmanager, "--install", system_image_package_name, input=b"y") - - run( - sdk_tool_paths.avdmanager, - "create", - "avd", - "--name", - avd_name, - "--package", - system_image_package_name, - "--force", - input=b"no", - ) + android_avd_home = os.environ.get("ANDROID_AVD_HOME") + + if android_avd_home is not None: + if not os.path.exists(android_avd_home): + os.makedirs(android_avd_home) + run( + sdk_tool_paths.avdmanager, + "create", + "avd", + "--name", + avd_name, + "--package", + system_image_package_name, + "--force", + "--path", + android_avd_home, + input=b"no", + ) + else: + run( + sdk_tool_paths.avdmanager, + "create", + "avd", + "--name", + avd_name, + "--package", + system_image_package_name, + "--force", + input=b"no", + ) _process_creationflags = subprocess.CREATE_NEW_PROCESS_GROUP if is_windows() else 0 From 7444feebe4465743bc36e6a644239ca66e7eda26 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 19 Mar 2025 09:42:31 -0700 Subject: [PATCH 11/21] Cleanup CoreML EP's code to remove COREML_ENABLE_MLPROGRAM (#23490) ### Description Clean up the CoreML EP's code to remove the COREML_ENABLE_MLPROGRAM macro. Also, increase MINIMUM_COREML_VERSION (the first version we support) to 5.
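The cleanup is mechanical across the op builders: every `#if defined(COREML_ENABLE_MLPROGRAM)` / `#endif` pair guarding the ML Program path is dropped, leaving the runtime check as the only dispatch point. A reduced illustration of the pattern, where `ModelBuilderLike` and the two `Build*` helpers are hypothetical stand-ins for the real types:

```cpp
#include <iostream>

struct ModelBuilderLike {
  bool create_mlprogram = true;
  bool CreateMLProgram() const { return create_mlprogram; }
};

void BuildMLProgramOp(ModelBuilderLike&) { std::cout << "MILSpec path\n"; }
void BuildNeuralNetworkLayer(ModelBuilderLike&) { std::cout << "NeuralNetwork path\n"; }

void AddToModelBuilder(ModelBuilderLike& mb) {
  // Before this patch the first branch and its `} else` were wrapped in
  // #if defined(COREML_ENABLE_MLPROGRAM) ... #endif; with ML Program support
  // always compiled in, the runtime check alone selects the path.
  if (mb.CreateMLProgram()) {
    BuildMLProgramOp(mb);
  } else {
    BuildNeuralNetworkLayer(mb);
  }
}

int main() {
  ModelBuilderLike mb;
  AddToModelBuilder(mb);         // takes the MILSpec path
  mb.create_mlprogram = false;
  AddToModelBuilder(mb);         // takes the NeuralNetwork path
}
```

The per-file diffs below all reduce to this shape.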
--- .../builders/impl/activation_op_builder.cc | 5 +-- .../coreml/builders/impl/argmax_op_builder.cc | 7 +--- .../builders/impl/batch_norm_op_builder.cc | 5 +-- .../coreml/builders/impl/binary_op_builder.cc | 7 +--- .../coreml/builders/impl/builder_utils.cc | 2 - .../coreml/builders/impl/builder_utils.h | 2 - .../coreml/builders/impl/cast_op_builder.cc | 8 +--- .../coreml/builders/impl/clip_op_builder.cc | 5 +-- .../coreml/builders/impl/concat_op_builder.cc | 2 - .../coreml/builders/impl/conv_op_builder.cc | 12 +----- .../builders/impl/convtranspose_op_builder.cc | 2 - .../builders/impl/depthtospace_op_builder.cc | 2 - .../coreml/builders/impl/gemm_op_builder.cc | 10 +---- .../builders/impl/gridsample_op_builder.cc | 2 - .../builders/impl/normalization_op_builder.cc | 4 -- .../coreml/builders/impl/pool_op_builder.cc | 5 +-- .../builders/impl/reduction_op_builder.cc | 5 +-- .../builders/impl/reshape_op_builder.cc | 5 +-- .../coreml/builders/impl/resize_op_builder.cc | 5 +-- .../coreml/builders/impl/shape_op_builder.cc | 5 +-- .../coreml/builders/impl/slice_op_builder.cc | 18 +++------ .../builders/impl/softmax_op_builder.cc | 5 +-- .../coreml/builders/impl/split_op_builder.cc | 5 +-- .../builders/impl/squeeze_op_builder.cc | 8 +--- .../builders/impl/transpose_op_builder.cc | 5 +-- .../coreml/builders/impl/unary_op_builder.cc | 5 +-- .../coreml/builders/model_builder.cc | 37 ++----------------- .../providers/coreml/builders/model_builder.h | 12 ------ .../core/providers/coreml/coreml_options.cc | 12 ------ .../core/providers/coreml/model/host_utils.h | 3 +- .../core/providers/xnnpack/nn/max_pool.cc | 2 +- .../test/contrib_ops/layer_norm_op_test.cc | 2 +- .../providers/coreml/coreml_basic_test.cc | 2 +- .../cpu/activation/activation_op_test.cc | 6 +-- .../cpu/activation/activation_op_test.h | 2 +- .../cpu/math/element_wise_ops_test.cc | 8 ++-- .../test/providers/cpu/math/matmul_test.cc | 4 +- .../providers/cpu/nn/batch_norm_op_test.cc | 2 +- .../test/providers/cpu/nn/conv_fp16_test.cc | 4 +- .../providers/cpu/nn/group_norm_op_test.cc | 2 +- .../providers/cpu/nn/instance_norm_op_test.cc | 2 +- .../providers/cpu/nn/pool_fp16_op_test.cc | 2 +- .../test/providers/cpu/nn/pool_op_test.cc | 2 +- .../cpu/reduction/reduction_ops_test.cc | 6 +-- 44 files changed, 55 insertions(+), 201 deletions(-) diff --git a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc index 4481a5172966b..3fffc6d0a68c4 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc @@ -97,7 +97,6 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const logging::Logger& logger) const { const auto& op_type(node.OpType()); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.activation @@ -166,9 +165,7 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, model_builder.AddOperation(std::move(op)); - } else -#endif // (COREML_ENABLE_MLPROGRAM) - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); if (op_type == "Sigmoid") { diff --git a/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc 
b/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc index 6169090a36014..dfa01c8187741 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/argmax_op_builder.cc @@ -32,7 +32,6 @@ Status ArgMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const int64_t keepdims = helper.Get("keepdims", 1); const bool removedim = keepdims != 1; -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.reduction @@ -46,9 +45,7 @@ Status ArgMaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // the output of ArgMax must be int32 AddOperationOutput(*op, *node.OutputDefs()[0], output_datatype); model_builder.AddOperation(std::move(op)); - } else -#endif // (COREML_ENABLE_MLPROGRAM) - { + } else { auto* coreml_argmax = layer->mutable_argmax(); coreml_argmax->set_axis(axis); coreml_argmax->set_removedim(removedim); @@ -91,11 +88,9 @@ bool ArgMaxOpBuilder::IsOpSupportedImpl(const Node& node, return false; } -#if defined(COREML_ENABLE_MLPROGRAM) if (input_params.create_mlprogram) { return true; } -#endif // If there are multiple downstream nodes and cast (toint32) is one of them // not supported, exit here diff --git a/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc index 442194cb31cbc..e547f2e42e527 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc @@ -57,7 +57,6 @@ Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_bu const auto eps = helper.Get("epsilon", 1e-5f); const auto channels = scale_tensor.dims()[0]; -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.normalization.batch_norm @@ -78,9 +77,7 @@ Status BatchNormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_bu AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); - } else -#endif // (COREML_ENABLE_MLPROGRAM) - { + } else { auto* coreml_batch_norm = layer->mutable_batchnorm(); coreml_batch_norm->set_channels(channels); coreml_batch_norm->set_epsilon(eps); diff --git a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc index 0482620b269a4..d7c78e05362ed 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/binary_op_builder.cc @@ -56,7 +56,6 @@ bool CheckIfBothInputShapesMatch(const Node& node, const logging::Logger& logger } } // namespace -#if defined(COREML_ENABLE_MLPROGRAM) static std::vector InferOutputShape(const std::vector& a, const std::vector& b) { std::vector output_shape; int64_t i_a = 0, j_b = 0; @@ -112,14 +111,12 @@ static void AddVariadicInputs(std::unique_ptr layer = model_builder.CreateNNLayer(node); if (op_type == "Add") { diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc 
b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc index 6f9bb35c27d80..684653aa21273 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc @@ -150,7 +150,6 @@ void CreateCoreMLWeight(CoreML::Specification::WeightParams& weight, gsl::span data); -#if defined(COREML_ENABLE_MLPROGRAM) // // MLProgram utils // @@ -174,6 +173,5 @@ void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& outp /// Number of spatial dims in input. Generally rank - 2 (ignore N and C dims). void AddPadTypeAndPads(COREML_SPEC::MILSpec::Operation& op, ModelBuilder& model_builder, std::string_view op_type, const NodeAttrHelper& helper, int num_spatial_dims); -#endif // defined(COREML_ENABLE_MLPROGRAM) } // namespace coreml } // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc index 7c7363d4c81ad..8abee92451338 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/cast_op_builder.cc @@ -27,9 +27,8 @@ class CastOpBuilder : public BaseOpBuilder { Status CastOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder, [[maybe_unused]] const Node& node, [[maybe_unused]] const logging::Logger& logger) const { -// This is a special handling case for ArgMax Op, where argmax is followed by a cast to int32 type. -// The ArgMax is fused with the Cast node and produces an int32 output. -#if defined(COREML_ENABLE_MLPROGRAM) + // This is a special handling case for ArgMax Op, where argmax is followed by a cast to int32 type. + // The ArgMax is fused with the Cast node and produces an int32 output. 
if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.elementwise_unary.cast @@ -73,7 +72,6 @@ Status CastOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model AddOperationOutput(*op, *node.OutputDefs()[0], cast_to_type); model_builder.AddOperation(std::move(op)); } -#endif return Status::OK(); } @@ -134,7 +132,6 @@ bool CastOpBuilder::HasSupportedInputsImpl(const Node& node, [[maybe_unused]] co return false; } -#if defined(COREML_ENABLE_MLPROGRAM) if (input_params.create_mlprogram) { if ((input_type == ONNX_NAMESPACE::TensorProto_DataType_INT32 || input_type == ONNX_NAMESPACE::TensorProto_DataType_INT64 || @@ -152,7 +149,6 @@ bool CastOpBuilder::HasSupportedInputsImpl(const Node& node, [[maybe_unused]] co return false; } } -#endif // only support int64 coming from ArgMax (check for ArgMax is done in IsOpSupportedImpl()) if (input_type != ONNX_NAMESPACE::TensorProto_DataType_INT64) { diff --git a/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc index f7046c213a8cb..9e68070a0e693 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/clip_op_builder.cc @@ -64,7 +64,6 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, bool has_min = min != std::numeric_limits::lowest(); bool has_max = max != std::numeric_limits::max(); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -121,9 +120,7 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, AddOperationOutput(*op, output); model_builder.AddOperation(std::move(op)); - } else -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { // TODO: CoreML has a Clip layer for NeuralNetwork. Added in CoreML 4. We could potentially use that if available // to simplify. 
// https://apple.github.io/coremltools/mlmodel/Format/NeuralNetwork.html#cliplayerparams diff --git a/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc index 9ea0030290abd..34ce2438095ad 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc @@ -26,7 +26,6 @@ class ConcatOpBuilder : public BaseOpBuilder { Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // NOLINT @@ -45,7 +44,6 @@ Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); } else // NOLINT -#endif // defined(COREML_ENABLE_MLPROGRAM) { std::unique_ptr layer = model_builder.CreateNNLayer(node); diff --git a/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc index 38125957bf481..18823bcc78d19 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc @@ -52,7 +52,6 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N NodeAttrHelper helper(node); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -89,9 +88,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N AddOperationOutput(*conv_op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(conv_op)); - } else -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); auto strides = helper.Get("strides", std::vector{1, 1}); @@ -225,14 +222,11 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara const auto& weight_name = input_defs[1]->Name(); const auto* weight = input_params.graph_viewer.GetConstantInitializer(weight_name); -#if defined(COREML_ENABLE_MLPROGRAM) if (input_params.create_mlprogram) { // ML Program supports non-const weight, 1D, 2D and 3D. // keep to 1D and 2D for consistency with the NeuralNetwork implementation for now. // add 3D support as/when needed. - } else -#endif // defined (COREML_ENABLE_MLPROGRAM) - { + } else { if (!weight) { LOGS(logger, VERBOSE) << "The weight of Conv [" << name << "] must be a constant initializer"; return false; @@ -257,7 +251,6 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara NodeAttrHelper helper(node); -#if defined(COREML_ENABLE_MLPROGRAM) // spec says same_lower is supported in CoreML 5. it lies. CoreML 6 is required otherwise you get // `Unexpected value for parameter pad_type[0] "same_lower" not in ("custom", "same", "valid").` // We _could_ manually calculate the pads, but not implementing that until we have a real use case to justify @@ -269,7 +262,6 @@ bool ConvOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPara return false; } } -#endif // there's no equivalent to allow a manual kernel shape in CoreML. // it's OK if a specified kernel_shape matches kH and kW dims of the weight input. 
diff --git a/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc index 5b6d9d72ab3c9..2e2c898b0e10a 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc @@ -28,7 +28,6 @@ class ConvTransposeOpBuilder : public BaseOpBuilder { Status ConvTransposeOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder, [[maybe_unused]] const Node& node, const logging::Logger& /*logger*/) const { -#if defined(COREML_ENABLE_MLPROGRAM) using namespace CoreML::Specification::MILSpec; // NOLINT const auto input_defs = node.InputDefs(); const auto output_defs = node.OutputDefs(); @@ -80,7 +79,6 @@ Status ConvTransposeOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuild AddOperationOutput(*op, *output_defs[0]); model_builder.AddOperation(std::move(op)); -#endif // defined(COREML_ENABLE_MLPROGRAM) return Status::OK(); } diff --git a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc index fec14dfd093a0..1a74b1eea97fe 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc @@ -33,7 +33,6 @@ Status DepthToSpaceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); int64_t blocksize = *helper.GetInt64("blocksize"); // required attribute -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // NOLINT @@ -105,7 +104,6 @@ Status DepthToSpaceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, model_builder.AddOperation(std::move(reshape2)); } } else // NOLINT -#endif // if defined(COREML_ENABLE_MLPROGRAM) { const auto& output_name = output_defs[0]->Name(); std::unique_ptr layer = model_builder.CreateNNLayer(node); diff --git a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc index e685c09ef43ca..4f84f7c36259c 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/gemm_op_builder.cc @@ -33,7 +33,6 @@ void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod const auto& input_defs(node.InputDefs()); const bool is_gemm = op == "Gemm"; -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { // we have to transpose the weight input of Gemm if transB is false, and potentially override the bias shape if (is_gemm) { @@ -58,9 +57,7 @@ void GemmOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Nod } } } - } else -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { // We have already embedded the weights (matrix B and C(if any)) into the coreml layer // No need to copy them later to reduce memory consumption model_builder.AddInitializerToSkip(input_defs[1]->Name()); @@ -123,7 +120,6 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N const auto K = transB ? b1 : b0; const auto N = transB ? b0 : b1; // we already checked it and dtype must be existed. 
-#if defined(COREML_ENABLE_MLPROGRAM) auto input_dtype = a.TypeAsProto()->tensor_type().elem_type(); if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -207,9 +203,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N AddOperationOutput(*matmul_op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(matmul_op)); } - } else -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { auto* coreml_inner_product = layer->mutable_innerproduct(); *layer->mutable_input()->Add() = a.Name(); diff --git a/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc index 6dcf14c16f111..f558f423752e8 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc @@ -42,7 +42,6 @@ class GridSampleOpBuilder : public BaseOpBuilder { Status GridSampleOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder, [[maybe_unused]] const Node& node, [[maybe_unused]] const logging::Logger& logger) const { -#if defined(COREML_ENABLE_MLPROGRAM) using namespace CoreML::Specification::MILSpec; // NOLINT // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing.resample @@ -80,7 +79,6 @@ Status GridSampleOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& AddOperationOutput(*op, *output_defs[0]); model_builder.AddOperation(std::move(op)); -#endif return Status::OK(); } diff --git a/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc index b4dc8d1647ad0..c0db144602ee2 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/normalization_op_builder.cc @@ -49,7 +49,6 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl( if (node.OpType() == "GroupNormalization") { return AddGroupNormToModelBuilderImpl(model_builder, node, logger); } -#if defined(COREML_ENABLE_MLPROGRAM) const auto& input_defs = node.InputDefs(); NodeAttrHelper helper(node); const auto& scale_tensor = *model_builder.GetConstantInitializer(input_defs[1]->Name()); @@ -94,7 +93,6 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl( AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); } -#endif // (COREML_ENABLE_MLPROGRAM) return Status::OK(); } @@ -103,7 +101,6 @@ Status NormalizationOpBuilder::AddGroupNormToModelBuilderImpl( [[maybe_unused]] ModelBuilder& model_builder, [[maybe_unused]] const Node& node, [[maybe_unused]] const logging::Logger& logger) const { -#if defined(COREML_ENABLE_MLPROGRAM) const auto& input_defs = node.InputDefs(); NodeAttrHelper helper(node); // Coreml hasn't supported GroupNorm yet. 
@@ -184,7 +181,6 @@ Status NormalizationOpBuilder::AddGroupNormToModelBuilderImpl( model_builder.AddOperation(std::move(mul)); model_builder.AddOperation(std::move(add)); } -#endif // (COREML_ENABLE_MLPROGRAM) return Status::OK(); } diff --git a/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc index 17910ba6fd486..e43eef75007cc 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/pool_op_builder.cc @@ -29,7 +29,6 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const auto& op_type = node.OpType(); const auto& input_defs = node.InputDefs(); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -91,9 +90,7 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); - } else -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); auto* coreml_pool = layer->mutable_pooling(); diff --git a/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc index d533b867bd454..a4609eb2a0584 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc @@ -71,7 +71,6 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, co const bool keepdims = helper.Get("keepdims", 1) != 0; const bool noop_with_empty_axes = helper.Get("noop_with_empty_axes", 0) != 0; -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -103,9 +102,7 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, co AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); - } else -#endif // (COREML_ENABLE_MLPROGRAM) - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); if (op_type == "ReduceSum") { diff --git a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc index 27d24d9c21893..b35d6971623ed 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/reshape_op_builder.cc @@ -50,7 +50,6 @@ Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // ReshapeHelper applies the ONNX rules to create the concrete output shape ReshapeHelper helper(TensorShape(input_shape), new_shape); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -64,9 +63,7 @@ Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, AddOperationOutput(*reshape_op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(reshape_op)); - } else -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); *layer->mutable_reshapestatic()->mutable_targetshape() = {new_shape.cbegin(), new_shape.cend()}; diff --git a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc 
b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc index 7ff66e4a79e37..837573003e515 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc @@ -212,7 +212,6 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const num_sizes = output_sizes.size(); } -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // NOLINT @@ -279,9 +278,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const AddOperationOutput(*op, *output_defs[0]); model_builder.AddOperation(std::move(op)); - } else // NOLINT -#endif - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); auto* coreml_upsample = layer->mutable_upsample(); diff --git a/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc index 243f949bdd48e..d1c87b033d323 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/shape_op_builder.cc @@ -25,7 +25,6 @@ Status ShapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const const logging::Logger& /*logger*/) const { const auto& input_defs = node.InputDefs(); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; NodeAttrHelper node_attr_helper{node}; @@ -63,9 +62,7 @@ Status ShapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const AddOperationOutput(*op, *node.OutputDefs()[0], output_datatype); model_builder.AddOperation(std::move(op)); } - } else // NOLINT -#endif - { + } else { auto layer = model_builder.CreateNNLayer(node); layer->mutable_getshape(); *layer->mutable_input()->Add() = input_defs[0]->Name(); diff --git a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc index 6b3fe75fa592d..368e47e40f831 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/slice_op_builder.cc @@ -127,7 +127,6 @@ Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const SliceOp::PrepareForComputeMetadata compute_metadata{data_shape}; ORT_RETURN_IF_ERROR(PrepareSliceComputeMetadata(node, model_builder.GetGraphViewer(), compute_metadata)); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; // NOLINT // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.slice_by_index @@ -178,9 +177,7 @@ Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const model_builder.AddOperation(std::move(op)); - } else // NOLINT -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { auto layer = model_builder.CreateNNLayer(node); *layer->mutable_input()->Add() = input_defs[0]->Name(); *layer->mutable_output()->Add() = output_defs[0]->Name(); @@ -222,7 +219,6 @@ bool SliceOpBuilder::HasSupportedInputsImpl(const Node& node, return false; } -#ifdef COREML_ENABLE_MLPROGRAM // The 
[Doc](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.slice_by_index) // says ML Program slice_by_index supports fp16 in CoreML 5 (iOS 15). // It's incorrect and CoreML 6+ (iOS16, CoreML spec version >= 7) is required otherwise only float is supported. @@ -230,13 +226,11 @@ bool SliceOpBuilder::HasSupportedInputsImpl(const Node& node, // CoreML 6:https://github.com/apple/coremltools/blob/c3ea4cf56fef1176417246c1b85363417f3e713d/coremltools/converters/mil/mil/ops/defs/iOS15/tensor_transformation.py#L495 if (input_params.create_mlprogram && input_params.coreml_version >= 6 && input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) { - } else -#endif // nolint - if (input_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && - input_type != ONNX_NAMESPACE::TensorProto_DataType_INT64) { - LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not supported"; - return false; - } + } else if (input_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && + input_type != ONNX_NAMESPACE::TensorProto_DataType_INT64) { + LOGS(logger, VERBOSE) << "[" << node.OpType() << "] Input type: [" << input_type << "] is not supported"; + return false; + } return true; } diff --git a/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc index c6e331feed326..2411cd459fecd 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/softmax_op_builder.cc @@ -37,7 +37,6 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const auto axis = helper.Get("axis", axis_default_value); auto axis_nonnegative = HandleNegativeAxis(axis, data_shape.size()); -#if defined(COREML_ENABLE_MLPROGRAM) // CoreML's softmax match onnx's softmax behavior since opset 13. // For opset < 13, we need to reshape to 2D and set axis to -1 to simulate onnx softmax behavior. 
// [B,D,...](onnx softmax opset 12, axis=1)->[B,D*...](CoreML softmax, axis=-1)->[B,D,...](reshape back) @@ -78,9 +77,7 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, AddOperationOutput(*reshape2, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(reshape2)); } - } else // NOLINT -#endif - { + } else { if (node.SinceVersion() >= 13 || (data_shape.size() == 2)) { auto* coreml_softmaxnd = layer->mutable_softmaxnd(); coreml_softmaxnd->set_axis(axis); diff --git a/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc index 6372f3136123b..717d344982473 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc @@ -56,7 +56,6 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, return std::make_tuple(remainder, chunk_size); }; -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; std::unique_ptr split_op = model_builder.CreateOperation(node, "split"); @@ -95,9 +94,7 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, } model_builder.AddOperation(std::move(split_op)); - } else -#endif - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); auto* coreml_splitnd = layer->mutable_splitnd(); coreml_splitnd->set_axis(axis); diff --git a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc index a1b3a18265c70..81bef11906b74 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc @@ -58,7 +58,6 @@ void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const } } -#if defined(COREML_ENABLE_MLPROGRAM) void HandleX86ArchUnsqueezeScalarInput(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) { const auto& input_defs(node.InputDefs()); @@ -74,7 +73,6 @@ void HandleX86ArchUnsqueezeScalarInput(ModelBuilder& model_builder, AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); } -#endif Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, @@ -83,7 +81,7 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, auto* coreml_squeeze = layer->mutable_squeeze(); TensorShapeVector axes; GetAxes(model_builder, node, axes); -#if defined(COREML_ENABLE_MLPROGRAM) + const auto& input_defs(node.InputDefs()); if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -105,9 +103,7 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, } AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); - } else // NOLINT -#endif - { + } else { if (axes.empty()) { coreml_squeeze->set_squeezeall(true); } else { diff --git a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc index 831c4cf4d08ba..5bb7e4c11967a 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc @@ -34,7 +34,6 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, 
ORT_RETURN_IF_NOT(perm.size() == input_dims, "Perm and input should have same dimension"); } -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -44,9 +43,7 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); - } else -#endif // defined(COREML_ENABLE_MLPROGRAM) - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); *layer->mutable_transpose()->mutable_axes() = {perm.cbegin(), perm.cend()}; diff --git a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc index bc3cad004aec1..dd495894ab8bb 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/unary_op_builder.cc @@ -25,7 +25,6 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const const auto& op_type(node.OpType()); const auto& input_defs(node.InputDefs()); -#if defined(COREML_ENABLE_MLPROGRAM) if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; @@ -58,9 +57,7 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); - } else // NOLINT -#endif // defined (COREML_ENABLE_MLPROGRAM) - { + } else { std::unique_ptr layer = model_builder.CreateNNLayer(node); if (op_type == "Sqrt") { diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc index f8952301d59a9..3551f5759201e 100644 --- a/onnxruntime/core/providers/coreml/builders/model_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc @@ -17,20 +17,17 @@ #include "core/providers/coreml/shape_utils.h" #include "core/optimizer/initializer.h" -#if defined(COREML_ENABLE_MLPROGRAM) // includes from coremltools-src in _deps #include "modelpackage/src/ModelPackage.hpp" #include "mlmodel/src/MILBlob/Blob/StorageWriter.hpp" using MILBlob::Blob::StorageWriter; -#endif - using namespace CoreML::Specification; namespace onnxruntime { namespace coreml { namespace { -#if defined(COREML_ENABLE_MLPROGRAM) + // Should the initializer be written to file or kept as an immediate value bool ShouldWriteInitializerToWeightsFile(const ONNX_NAMESPACE::TensorProto& tensor_proto) { // https://github.com/apple/coremltools/blob/dbb0094fd0cb936469e35320bf37e866ef7a1da4/coremltools/converters/mil/backend/mil/load.py#L51-L57 @@ -388,8 +385,6 @@ void CreateEmptyFile(const std::string& filename) { ORT_ENFORCE(file.is_open(), "Failed to open file ", filename); } -#endif // defined(COREML_ENABLE_MLPROGRAM) - std::string GetModelOutputPath(const CoreMLOptions& coreml_options, const GraphViewer& graph_viewer, const logging::Logger& logger) { @@ -479,7 +474,6 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge } if (create_ml_program_) { -#if defined(COREML_ENABLE_MLPROGRAM) coreml_model_->set_specificationversion(CoreMLSpecVersion()); MILSpec::Program& mlprogram = *coreml_model_->mutable_mlprogram(); mlprogram.set_version(1); @@ -503,12 +497,6 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge "CoreML Model Weights"); auto weights_info = mlpackage_->findItem(weights_id); weights_file_writer_ = 
std::make_unique(weights_info->path() + "/weight.bin"); -#else - // should never happen due to handling in coreml_execution_provider.cc - // throw here so all other code in this class can assume create_ml_program_ is only ever true in a build - // where ML Program support is enabled. - ORT_THROW("ML Program is not enabled in this build"); -#endif } else { // We support CorelML Specification Version 4 (Core ML 3) coreml_model_->set_specificationversion(4); @@ -561,7 +549,6 @@ void ModelBuilder::AddLayer(std::unique_ptr layer) { /* * ML Program related helpers */ -#if defined(COREML_ENABLE_MLPROGRAM) const std::string& ModelBuilder::GetSafeName(const std::string& name) { // Check the name is valid according to the MILSpec rules // `Identifiers, generally used for names and keys, must match the regular expression [A-Za-z\_][A-Za-z0-9\_@]*.` @@ -737,8 +724,6 @@ std::string_view ModelBuilder::AddConstantImpl(std::string_view op_type, std::st return AddTensorValueAsConstantOperation(op_type, value_type, std::move(input_value)); } -#endif // defined(COREML_ENABLE_MLPROGRAM) - /* * General implementation */ @@ -775,13 +760,10 @@ Status ModelBuilder::RegisterInitializers() { continue; } -#if defined(COREML_ENABLE_MLPROGRAM) if (create_ml_program_) { MILSpec::Value coreml_tensor = OnnxTensorToCoreMLTensor(tensor, *weights_file_writer_); ORT_IGNORE_RETURN_VALUE(AddConstantOperation(name, std::move(coreml_tensor))); - } else -#endif - { + } else { std::unique_ptr layer = std::make_unique(); layer->set_name(GetUniqueName("initializer_" + name)); @@ -915,7 +897,6 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i return Status::OK(); } -#if defined(COREML_ENABLE_MLPROGRAM) if (create_ml_program_) { if (is_input) { // the model inputs need to be wired up as args to the 'main' function. @@ -935,7 +916,6 @@ Status ModelBuilder::RegisterModelInputOutput(const NodeArg& node_arg, bool is_i *mlprogram_main_block_->mutable_outputs()->Add() = name; } } -#endif // defined(COREML_ENABLE_MLPROGRAM) return Status::OK(); } @@ -980,11 +960,9 @@ Status ModelBuilder::CreateModel() { ORT_RETURN_IF_ERROR(ProcessNodes()); ORT_RETURN_IF_ERROR(RegisterModelOutputs()); -#if defined(COREML_ENABLE_MLPROGRAM) if (create_ml_program_) { SanitizeNames(); } -#endif return Status::OK(); } @@ -992,7 +970,6 @@ Status ModelBuilder::CreateModel() { Status ModelBuilder::SaveModel() { std::string output_path = model_output_path_; -#if defined(COREML_ENABLE_MLPROGRAM) if (create_ml_program_) { // we need to jump through some hoops to get the model path the ML Program load wants. std::string tmp_model_path = model_output_path_ + "/tmp/model.mlmodel"; @@ -1003,7 +980,6 @@ Status ModelBuilder::SaveModel() { auto model_info = mlpackage_->findItem(model_id); output_path = model_info->path(); } -#endif // scope this so the stream is closed and flushed by the ofstream dtor { @@ -1012,19 +988,16 @@ Status ModelBuilder::SaveModel() { ORT_RETURN_IF_NOT(coreml_model_->SerializeToOstream(&stream), "Saving the CoreML model failed. Path=", output_path); } -#if defined(COREML_ENABLE_MLPROGRAM) // need to delete the ModelPackage instance for it to write out the manifest. clear out the other ML Program // related types as well. 
mlprogram_main_block_ = nullptr; mlpackage_.reset(); weights_file_writer_.reset(); -#endif return Status::OK(); } Status ModelBuilder::LoadModel(std::unique_ptr& model) { -#if defined(COREML_ENABLE_MLPROGRAM) if (create_ml_program_) { // we need to provide the sanitized names for model inputs/outputs so that info is captured. // the input/output matching when we execute the model from the CoreML EP is based on order, so the change @@ -1058,9 +1031,7 @@ Status ModelBuilder::LoadModel(std::unique_ptr& model) { std::move(scalar_outputs_), std::move(int64_outputs_), logger_, coreml_options_); - } else -#endif - { + } else { model = std::make_unique(model_output_path_, std::move(onnx_input_names_), std::move(onnx_output_names_), @@ -1073,7 +1044,6 @@ Status ModelBuilder::LoadModel(std::unique_ptr& model) { return model->LoadModel(); // load using CoreML API, including compilation } -#if defined(COREML_ENABLE_MLPROGRAM) std::string_view ModelBuilder::AddConstant(std::string_view op_type, std::string_view value_type, const ONNX_NAMESPACE::TensorProto& tensor, std::optional> shape) { @@ -1114,7 +1084,6 @@ std::string_view ModelBuilder::AddConstant(std::string_view op_type, std::string return ret; } -#endif // static Status ModelBuilder::Build(const GraphViewer& graph_viewer, const logging::Logger& logger, int32_t coreml_version, const CoreMLOptions& coreml_options, diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h index 28c7dc42da581..f3012e8137e8c 100644 --- a/onnxruntime/core/providers/coreml/builders/model_builder.h +++ b/onnxruntime/core/providers/coreml/builders/model_builder.h @@ -9,7 +9,6 @@ #include "core/providers/coreml/model/model.h" #include "core/providers/coreml/coreml_options.h" -#if defined(COREML_ENABLE_MLPROGRAM) // coremltools classes namespace MPL { class ModelPackage; @@ -20,7 +19,6 @@ namespace Blob { class StorageWriter; } } // namespace MILBlob -#endif namespace onnxruntime { namespace coreml { @@ -58,11 +56,7 @@ class ModelBuilder { // Returns true if we are creating an ML Program bool CreateMLProgram() const { -#if defined(COREML_ENABLE_MLPROGRAM) return create_ml_program_; -#else - return false; -#endif } /* @@ -76,7 +70,6 @@ class ModelBuilder { // Add layer to the Core ML NeuralNetwork model void AddLayer(std::unique_ptr layer); -#if defined(COREML_ENABLE_MLPROGRAM) /* * MLProgram helpers */ @@ -147,7 +140,6 @@ class ModelBuilder { // add the operation to the main function void AddOperation(std::unique_ptr operation); -#endif /* * General helpers @@ -176,7 +168,6 @@ class ModelBuilder { const logging::Logger& Logger() const { return logger_; } private: -#if defined(COREML_ENABLE_MLPROGRAM) template std::string_view AddConstantImpl(std::string_view op_type, std::string_view value_type, gsl::span value, std::optional> shape = std::nullopt); @@ -190,7 +181,6 @@ class ModelBuilder { const std::string& AddConstantOperation(std::string_view name, COREML_SPEC::MILSpec::Value&& initializer); const std::string& AddTensorValueAsConstantOperation(std::string_view op_type, std::string_view value_type, COREML_SPEC::MILSpec::Value&& input_value); -#endif // Convert the ONNX model in graph_viewer_ to a CoreML::Specification::Model and serialize to disk. // We then load it using CoreML in order compile it. @@ -237,7 +227,6 @@ class ModelBuilder { uint32_t name_token_{0}; std::unordered_set unique_names_; -#if defined(COREML_ENABLE_MLPROGRAM) // mlprogram_main_ is the main block of the CoreML ML Program. 
// It is set in CreateModel to the CoreML Model.mlprogram.functions['main'].block_specializations['CoreML'] // entry we create. @@ -254,7 +243,6 @@ class ModelBuilder { // This means an op builder author doesn't need to be aware of the renaming. // https://github.com/apple/coremltools/blob/8b37641f243b1a3e81452feea311c6e30dcc9287/coremltools/converters/mil/mil/passes/defs/preprocess.py#L146-L149 std::unordered_map values_to_rename_; -#endif }; } // namespace coreml diff --git a/onnxruntime/core/providers/coreml/coreml_options.cc b/onnxruntime/core/providers/coreml/coreml_options.cc index 14ae55de9266b..c441a2eff56e0 100644 --- a/onnxruntime/core/providers/coreml/coreml_options.cc +++ b/onnxruntime/core/providers/coreml/coreml_options.cc @@ -15,18 +15,6 @@ CoreMLOptions::CoreMLOptions(uint32_t coreml_flags) { create_mlprogram_ = (coreml_flags & COREML_FLAG_CREATE_MLPROGRAM) != 0; enable_on_subgraph_ = (coreml_flags & COREML_FLAG_ENABLE_ON_SUBGRAPH) != 0; -#if defined(COREML_ENABLE_MLPROGRAM) - if (coreml::util::CoreMLVersion() < MINIMUM_COREML_MLPROGRAM_VERSION && create_mlprogram_ != 0) { - LOGS_DEFAULT(WARNING) << "ML Program is not supported on this OS version. Falling back to NeuralNetwork."; - create_mlprogram_ = false; - } -#else - if (create_mlprogram_ != 0) { - LOGS_DEFAULT(WARNING) << "ML Program is not supported in this build. Falling back to NeuralNetwork."; - create_mlprogram_ = false; - } -#endif - compute_units_ = 0; // 0 for all if (coreml_flags & COREML_FLAG_USE_CPU_ONLY) { diff --git a/onnxruntime/core/providers/coreml/model/host_utils.h b/onnxruntime/core/providers/coreml/model/host_utils.h index 31ee2bd3e2494..f654b4d5701b9 100644 --- a/onnxruntime/core/providers/coreml/model/host_utils.h +++ b/onnxruntime/core/providers/coreml/model/host_utils.h @@ -60,8 +60,7 @@ #endif -#define MINIMUM_COREML_VERSION 3 // first version we support -#define MINIMUM_COREML_MLPROGRAM_VERSION 5 // first version where ML Program was available +#define MINIMUM_COREML_VERSION 5 // first version we support namespace onnxruntime { namespace coreml { diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc index c828ae9400174..8d972f7d63bc1 100644 --- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc @@ -57,7 +57,7 @@ bool MaxPool::IsOnnxNodeSupported(const NodeUnit& node_unit, // input of maxpool could be fp16/fp32/fp64,i8/u8 according to ONNX if (x_type == nullptr || (x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && -// because pool_fp16_op_test can be enabled by other preprocessor, for example, COREML_ENABLE_MLPROGRAM +// because pool_fp16_op_test can be enabled by other preprocessor, for example, USE_COREML #ifdef XNNPACK_FP16_SUPPORTED x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 && #endif diff --git a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc index 4611dc9082734..e22445edc0f5b 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc +++ b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc @@ -404,7 +404,7 @@ TYPED_TEST(LayerNormTest, LayerNorm17_opset) { // Execution provider entry invalid. // when other EPs support layer-norm fp16, this test should be updated to include them. 
if (std::is_same::value) { -#if !defined(COREML_ENABLE_MLPROGRAM) +#if !defined(USE_COREML) return; #endif } diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index a9aa78b7a3229..3505193b77683 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -246,7 +246,7 @@ TEST(CoreMLExecutionProviderTest, TestOrtFormatModel) { #endif } -#if defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_COREML) // Names in CoreML cannot start with [0-9] or contain anything but "[a-z][A-Z][0-9]_" // Test that we fix invalid names in model inputs, initializers and outputs. // This is only enforced for ML Program, so we only do name sanitization when creating an ML Program format model. diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc index 724118d7419d2..9201da348e75c 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc @@ -125,7 +125,7 @@ TEST_F(ActivationOpTest, Relu) { {}, {}, /*is_tensorrt_supported=*/false, /*opset_version= */ 14); -#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) TestActivationOp( "Relu", input_values_fp16, @@ -139,7 +139,7 @@ TEST_F(ActivationOpTest, Relu) { #endif // MLAS_F16VEC_INTRINSICS_SUPPORTED } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) TEST_F(ActivationOpTest, Sigmoid_fp16) { #ifdef USE_CUDA int min_cuda_architecture = 530; @@ -413,7 +413,7 @@ TEST_F(ActivationOpTest, LeakyRelu) { {{"alpha", alpha}}, {}); } -#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) TEST_F(ActivationOpTest, LeakyRelu_fp16) { OpTester test("LeakyRelu", 11); float alpha = 0.01f; // oneDNN set alpha equal to 0.01 diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.h b/onnxruntime/test/providers/cpu/activation/activation_op_test.h index 59813f433dc41..04d116e29d3b0 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.h +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.h @@ -105,7 +105,7 @@ class ActivationOpTest : public ::testing::Test { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution dist(low, high); -#ifdef COREML_ENABLE_MLPROGRAM +#ifdef USE_COREML // please check onnxruntime/onnxruntime/core/providers/coreml/builders/helper.cc:81 std::vector batch_size_list = {1, 2, 4, 9, 100}; #else diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index 30e7f63919216..fbd9d10a56c77 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -32,7 +32,7 @@ void TestBinaryFloat16(const char* op_name, bool enable_bf16 = true) { { std::vector> execution_providers; -#ifdef COREML_ENABLE_MLPROGRAM +#ifdef USE_COREML execution_providers.push_back(DefaultCoreMLExecutionProvider(true)); #elif USE_CUDA execution_providers.push_back(DefaultCudaExecutionProvider()); @@ -76,7 +76,7 @@ void TestUnaryFloat16(const char* op_name, bool 
run_bf16 = true) { { std::vector> execution_providers; -#ifdef COREML_ENABLE_MLPROGRAM +#ifdef USE_COREML execution_providers.push_back(DefaultCoreMLExecutionProvider(true)); #elif USE_CUDA execution_providers.push_back(DefaultCudaExecutionProvider()); @@ -1426,7 +1426,7 @@ TEST(MathOpTest, Pow_float16_float16) { dims, {1.0f, 256.0f, 2.0f, 1.0f}, false); } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) TEST(MathOpTest, Pow_float_float16) { OpTester test("Pow", 12); std::vector dims{4}; @@ -1440,7 +1440,7 @@ TEST(MathOpTest, Pow_float_float16) { execution_providers.push_back(DefaultCudaExecutionProvider()); #elif USE_ROCM execution_providers.push_back(DefaultRocmExecutionProvider()); -#elif COREML_ENABLE_MLPROGRAM +#elif USE_COREML execution_providers.push_back(DefaultCoreMLExecutionProvider(true)); #endif test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); diff --git a/onnxruntime/test/providers/cpu/math/matmul_test.cc b/onnxruntime/test/providers/cpu/math/matmul_test.cc index 298e870f348fc..dd8cbed15e5ef 100644 --- a/onnxruntime/test/providers/cpu/math/matmul_test.cc +++ b/onnxruntime/test/providers/cpu/math/matmul_test.cc @@ -210,7 +210,7 @@ TEST(MathOpTest, MatMulFloatType) { RunMatMulTest(7, false, true); } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) || defined(USE_XNNPACK) TEST(MathOpTest, MatMulFloat16) { #ifdef USE_CUDA int min_cuda_architecture = 530; @@ -276,7 +276,7 @@ TEST(MathOpTest, MatMulZeroKInt32Type) { RunMatMulZeroKTest(); } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) || defined(USE_XNNPACK) TEST(MathOpTest, MatMul_Float16) { #ifdef USE_CUDA int min_cuda_architecture = 530; diff --git a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc index f8ebca5ff9a1b..a529d572d7cca 100644 --- a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc @@ -704,7 +704,7 @@ TEST(BatchNormTest, NonSpatial_Complicated) { } // Only CUDA and ROCm kernels have float 16 support -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) TEST(BatchNormTest, BatchNorm2d_fp16) { vector X{-0.91221f, -0.283559f, 0.937637f, 2.09818f, -0.100199f, -0.608113f, 0.444562f, -1.07505f, 0.940591f, -0.922262f, 0.0931303f, 0.69611f, 1.55187f, 0.159808f, 0.914874f, -1.24856f, -1.98928f, -0.331621f, diff --git a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc index 4253e36e02548..d1350db8ec12e 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc @@ -3,7 +3,7 @@ #include "core/mlas/inc/mlas.h" -#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK) +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) || defined(USE_XNNPACK) #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" @@ -30,7 +30,7 @@ struct ConvOpAndTestAttributes { /* Please notice that, we have predefined macros in the head of the file -#if 
defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) When we have these two macro defines, this UT will turn into green light and work. If attributes.activation is set the NhwcFusedConv contrib op is used. diff --git a/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc index ac517193a2c77..3d8d188867023 100644 --- a/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/group_norm_op_test.cc @@ -6,7 +6,7 @@ #include "test/common/tensor_op_test_utils.h" #include "test/util/include/default_providers.h" -#ifdef COREML_ENABLE_MLPROGRAM +#ifdef USE_COREML using namespace std; namespace onnxruntime { namespace test { diff --git a/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc index 341bb8a4fc957..46b74f2c2eb9d 100644 --- a/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/instance_norm_op_test.cc @@ -121,7 +121,7 @@ TEST(InstanceNormalizationOpTest, InstanceNormBatch2) { } // Only CUDA and ROCm kernels have float 16 support -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) TEST(InstanceNormalizationOpTest, InstanceNormBatch1_fp16) { OpTester test("InstanceNormalization"); diff --git a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc index d4e0af5011525..c14fc1fb62ae5 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc @@ -3,7 +3,7 @@ #include "core/mlas/inc/mlas.h" -#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK) +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_COREML) || defined(USE_XNNPACK) #include "core/providers/cpu/nn/pool.h" #include "gtest/gtest.h" diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc index 24a8c8491b632..f1d612276174f 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc @@ -70,7 +70,7 @@ TEST(PoolTest, MaxPool) { // Only CUDA kernel has float 16 support // Disable for now, still investigating the issue with cudnn lib -#if defined(USE_CUDA) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) || defined(USE_COREML) TEST(PoolTest, MaxPool_F16) { #if defined(USE_CUDA) int min_cuda_architecture = 530; diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index 4bc97d035c7f7..92cd82c2c9420 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -1375,7 +1375,7 @@ TEST(ReductionOpTest, ReduceMax_double) { test.Run(); } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) TEST(ReductionOpTest, ReduceMax_half) { OpTester test("ReduceMax"); test.AddAttribute("axes", std::vector{1, 2}); @@ -2158,7 +2158,7 @@ TEST(ReductionOpTest, ReduceMin_double) { test.Run(); } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) 
|| defined(USE_ROCM) || defined(USE_COREML) TEST(ReductionOpTest, ReduceMin_half) { OpTester test("ReduceMin"); test.AddAttribute("axes", std::vector{0, 2}); @@ -2356,7 +2356,7 @@ TEST(ReductionOpTest, ReduceSum_int32) { test.Run(); } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(COREML_ENABLE_MLPROGRAM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_COREML) TEST(ReductionOpTest, ReduceSumHalfHalf) { OpTester test("ReduceSum"); test.AddAttribute("keepdims", (int64_t)0); From b626409ee4ef0e659fb16461b96d4a1d266933c3 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Wed, 19 Mar 2025 09:52:26 -0700 Subject: [PATCH 12/21] webgpu ep support for argmax/argmin (#24089) --- .../webgpu/reduction/reduction_ops.cc | 33 ++++++++++++++++++- .../webgpu/reduction/reduction_ops.h | 12 +++++++ .../webgpu/webgpu_execution_provider.cc | 24 +++++++------- 3 files changed, 56 insertions(+), 13 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc index a0213f63494d3..9548386ded06c 100644 --- a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc @@ -91,6 +91,14 @@ REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSumExp, 11, 12); REGISTER_REDUCE_VERSIONED_KERNEL(ReduceLogSumExp, 13, 17); REGISTER_REDUCE_KERNEL(ReduceLogSumExp, 18); +REGISTER_REDUCE_VERSIONED_KERNEL(ArgMax, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ArgMax, 11, 12); +REGISTER_REDUCE_KERNEL(ArgMax, 13); + +REGISTER_REDUCE_VERSIONED_KERNEL(ArgMin, 1, 10); +REGISTER_REDUCE_VERSIONED_KERNEL(ArgMin, 11, 12); +REGISTER_REDUCE_KERNEL(ArgMin, 13); + Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); if (is_input_empty_) { @@ -114,6 +122,9 @@ Status ReduceKernelProgram::GenerateShaderCode(ShaderHelper& shader) const { std::stringstream ss; std::string index = "i" + std::to_string(i); ss << "for (var " << index << " : u32 = 0; " << index << " < " << input.IndicesGet("uniforms.input_shape", i) << "; " << index << "++) {\n"; + if (loop_body.find("last_index") != std::string::npos) { + ss << "let last_index = " + index + ";\n"; + } ss << input.IndicesSet("input_indices", i, index) << ";\n"; ss << loop_body << "\n"; ss << "}\n"; @@ -337,5 +348,25 @@ ReduceOpSpecificCode ReduceLogSumExp::GetOpSpecificCode(const Tensor* input_tens return code; } +ReduceOpSpecificCode ArgMin::GetOpSpecificCode(const Tensor* input_tensor) const { + ORT_UNUSED_PARAMETER(input_tensor); + std::string op = (select_last_index_) ? "<=" : "<"; + std::string loop_header = "var best_element = first_element; var best_index = u32(0);"; + std::string loop_body = "if (current_element " + op + " best_element) { best_element = current_element; best_index = last_index; };"; + std::string loop_footer = "let output_value = output_value_t(best_index);"; + ReduceOpSpecificCode code({loop_header, loop_body, loop_footer}); + return code; +} + +ReduceOpSpecificCode ArgMax::GetOpSpecificCode(const Tensor* input_tensor) const { + ORT_UNUSED_PARAMETER(input_tensor); + std::string op = (select_last_index_) ? 
">=" : ">"; + std::string loop_header = "var best_element = first_element; var best_index = u32(0);"; + std::string loop_body = "if (current_element " + op + " best_element) { best_element = current_element; best_index = last_index; };"; + std::string loop_footer = "let output_value = output_value_t(best_index);"; + ReduceOpSpecificCode code({loop_header, loop_body, loop_footer}); + return code; +} + } // namespace webgpu -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h index 291d931f41c05..70ae6d3c71eb9 100644 --- a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h +++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.h @@ -119,5 +119,17 @@ class ReduceLogSumExp final : public ReduceKernel { ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override; }; +class ArgMin final : public ReduceKernel { + public: + ArgMin(const OpKernelInfo& info) : ReduceKernel(info, "ArgMin", true) {} + ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override; +}; + +class ArgMax final : public ReduceKernel { + public: + ArgMax(const OpKernelInfo& info) : ReduceKernel(info, "ArgMax", true) {} + ReduceOpSpecificCode GetOpSpecificCode(const Tensor* input_tensor) const override; +}; + } // namespace webgpu } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index dfb2e4b6ce665..6f81bead5e5b1 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -297,12 +297,12 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 12, MatMul); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, MatMul); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, float, ArgMax); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, float, ArgMax); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, float, ArgMax); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, float, ArgMin); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, float, ArgMin); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, float, ArgMin); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ArgMax); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ArgMax); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ArgMax); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, ArgMin); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, ArgMin); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, ArgMin); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 10, Softmax); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 
11, 12, Softmax); @@ -624,13 +624,13 @@ std::unique_ptr RegisterKernels() { // BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, From d8ed4da1dfee781919247d9ce001f246489c8f90 Mon Sep 17 00:00:00 2001 From: Caroline Zhu Date: Wed, 19 Mar 2025 10:30:44 -0700 Subject: [PATCH 13/21] [mobile/reactnative] Remove namespace from AndroidManifest.XML to resolve warning (#23847) ### Description Removes namespace from AndroidManifest.XML ### Motivation and Context - Resolves #21681 --- js/react_native/android/src/main/AndroidManifest.xml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/js/react_native/android/src/main/AndroidManifest.xml b/js/react_native/android/src/main/AndroidManifest.xml index c6e3cb45e16e5..a2f47b6057db7 100644 --- a/js/react_native/android/src/main/AndroidManifest.xml +++ b/js/react_native/android/src/main/AndroidManifest.xml @@ -1,3 +1,2 @@ - + From 80441e4ec8a3bc9e74eb251ffac4b9456ba0b5f3 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 19 Mar 2025 11:03:14 -0700 Subject: [PATCH 14/21] [WebGPU EP] fix implementation of Pow (#24088) ### Description Use custom implementation for Pow to fix test failures. --- .../webgpu/math/binary_elementwise_ops.cc | 37 +++++++++++++++++-- .../webgpu/math/binary_elementwise_ops.h | 16 ++++++-- 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc index 13004af25726d..6891b8159b090 100644 --- a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc +++ b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.cc @@ -4,15 +4,18 @@ #include "core/providers/common.h" #include "core/providers/webgpu/math/binary_elementwise_ops.h" #include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/string_macros.h" #include "core/providers/webgpu/webgpu_supported_types.h" namespace onnxruntime { namespace webgpu { Status BinaryElementwiseProgram::GenerateShaderCode(ShaderHelper& shader) const { - const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); - const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); + const auto& a = shader.AddInput("input_a", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); + const auto& b = shader.AddInput("input_b", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias); const auto& c = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); + shader.AdditionalImplementation() << additional_impl_; + shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size"); // check whether can use element-wise mode. 
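For reference while reviewing: below is a minimal host-side C++ sketch of the edge-case semantics that the custom WGSL Pow implementation added by this patch (GetPowImpl, later in this diff) encodes. It is an illustration only; `pow_custom_ref` is a hypothetical name and the function is not part of the patch.

```cpp
#include <cmath>

// Mirrors the WGSL pow_custom semantics:
// - b == 0 always yields 1 (including 0^0)
// - a < 0 with a non-integer b yields NaN
// - a < 0 with an integer b computes pow(|a|, b) and restores the sign by parity
float pow_custom_ref(float a, float b) {
  if (b == 0.0f) {
    return 1.0f;
  }
  if (a < 0.0f && b != std::floor(b)) {
    return std::pow(a, b);  // NaN: negative base, non-integer exponent
  }
  // An odd integer exponent keeps the sign of a; an even one gives a positive result.
  const bool odd_exponent = std::fmod(std::round(std::fabs(b)), 2.0f) == 1.0f;
  const float sign = odd_exponent ? std::copysign(1.0f, a) : 1.0f;
  return sign * std::pow(std::fabs(a), b);
}
```

For int32 inputs the shader additionally rounds the result back to an integer, which GetPowImpl handles by emitting `round` around the pow expression.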
@@ -142,8 +145,15 @@ Status BinaryElementwise::ComputeInternal(ComputeContext& context) const { } uint32_t vec_size = onnxruntime::narrow((size + 3) / 4); + + std::string additional_impl; + if (get_additional_impl_) { + additional_impl = get_additional_impl_(lhs_tensor->GetElementType(), rhs_tensor->GetElementType()); + } + BinaryElementwiseProgram program{kernel_name_, expression_, + additional_impl, is_broadcast, is_lhs_scalar, is_rhs_scalar, @@ -273,7 +283,28 @@ WEBGPU_BINARY_VERSIONED_KERNEL(Sub, 7, 12, Sub, WebGpuSupportedNumberTypes()) WEBGPU_BINARY_VERSIONED_KERNEL(Sub, 13, 13, Sub, WebGpuSupportedNumberTypes()) WEBGPU_BINARY_KERNEL(Sub, 14, Sub, WebGpuSupportedNumberTypes()) -WEBGPU_BINARY_IMPL(Pow, "output_value_t(pow(vec4(a), vec4(b)))") +std::string GetPowImpl(int lhs_element_type, int /* rhs_element_type */) { + SS(s, 1024); + std::string round_str; + if (lhs_element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32) { + round_str = "round"; + } + + s << "fn pow_custom(a : input_a_element_t, b : f32) -> input_a_element_t {\n" + " if (b == 0.0) {\n" + " return input_a_element_t(1.0);\n" + " } else if (a < input_a_element_t(0.0) && b != floor(b)) {\n" + " return input_a_element_t(pow(f32(a), b)); // NaN\n" + " }\n" + << " return select(sign(a), input_a_element_t(1.0), round(abs(b) % 2.0) != 1.0) * input_a_element_t(" << round_str << "(pow(f32(abs(a)), b)));\n" + << "}\n" + "fn pow_v(a : vec4, b : vec4) -> vec4 {\n" + " return vec4(pow_custom(a.x, f32(b.x)), pow_custom(a.y, f32(b.y)), pow_custom(a.z, f32(b.z)), pow_custom(a.w, f32(b.w)));\n" + "}\n"; + return SS_GET(s); +} + +WEBGPU_BINARY_IMPL(Pow, "pow_v(a, b)", GetPowImpl) WEBGPU_BINARY_VERSIONED_KERNEL(Pow, 7, 11, Pow, WebGpuSupportedNumberTypes()) WEBGPU_BINARY_VERSIONED_KERNEL_2(Pow, 12, 12, Pow, WebGpuSupportedNumberTypes(), WebGpuSupportedNumberTypes()) WEBGPU_BINARY_VERSIONED_KERNEL_2(Pow, 13, 14, Pow, WebGpuSupportedNumberTypes(), WebGpuSupportedNumberTypes()) diff --git a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h index 84cbcdf3244d8..f80accfb934f8 100644 --- a/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h +++ b/onnxruntime/core/providers/webgpu/math/binary_elementwise_ops.h @@ -14,11 +14,13 @@ class BinaryElementwiseProgram final : public Program public: BinaryElementwiseProgram(const std::string& kernel_name, const std::string& expression, + const std::string& additional_impl, const bool is_broadcast, const bool is_lhs_scalar, const bool is_rhs_scalar, const bool vectorize) : Program{kernel_name}, expression_{expression}, + additional_impl_{additional_impl}, is_broadcast_{is_broadcast}, is_lhs_scalar_{is_lhs_scalar}, is_rhs_scalar_{is_rhs_scalar}, @@ -29,7 +31,8 @@ class BinaryElementwiseProgram final : public Program WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"vec_size", ProgramUniformVariableDataType::Uint32}); private: - std::string expression_; + std::string_view expression_; + std::string_view additional_impl_; bool is_broadcast_; bool is_lhs_scalar_; bool is_rhs_scalar_; @@ -38,11 +41,15 @@ class BinaryElementwiseProgram final : public Program class BinaryElementwise : public WebGpuKernel { public: + using GetAdditionalImplementationFunction = std::string (*)(int lhs_element_type, int rhs_element_type); + BinaryElementwise(const OpKernelInfo& info, const std::string& kernel_name, - const std::string& expression) : WebGpuKernel{info}, - kernel_name_{kernel_name}, - expression_{expression} {} + const std::string& 
expression, + const GetAdditionalImplementationFunction get_additional_impl = nullptr) : WebGpuKernel{info}, + kernel_name_{kernel_name}, + expression_{expression}, + get_additional_impl_{get_additional_impl} {} protected: Status ComputeInternal(ComputeContext& context) const final; @@ -50,6 +57,7 @@ class BinaryElementwise : public WebGpuKernel { private: std::string kernel_name_; std::string expression_; + const GetAdditionalImplementationFunction get_additional_impl_; }; } // namespace webgpu From 731b27e25d95258b959fa963117e2a6ef11f0050 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 19 Mar 2025 15:48:51 -0700 Subject: [PATCH 15/21] Increase timeout to 90min for ARM64-Xcode16-targeting-iphonesimulator (#24091) ### Description There are still some timeouts in this pipeline. Further extend the timeout to 90 minutes for ARM64-Xcode16-targeting-iphonesimulator, since the build takes quite a while when the build cache is entirely missing. ### Motivation and Context The pipeline sometimes fails because of timeouts. A previous PR (#24030) increased the timeout from 60 min to 75 min, but that turned out not to be enough. --- .github/workflows/mac.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index fe7c7fb7ab4c8..86b1cd5ee90e7 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -204,7 +204,7 @@ jobs: matrix: target_arch: [x86_64, arm64] - timeout-minutes: 75 + timeout-minutes: 90 steps: - uses: actions/setup-python@v5 From da7874c856019a9bb8e841f88de42162e8af3e8d Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 19 Mar 2025 16:23:18 -0700 Subject: [PATCH 16/21] [WebGPU] fix test failure in Reduce operators on macOS ARM64 (#24108) ### Description Fix a test failure in the Reduce operators on macOS ARM64: ``` [E:onnxruntime:ReduceL1, sequential_executor.cc:572 ExecuteKernel] Non-zero status code returned while running ReduceL1 node. Name:'node1' Status Message: webgpu_context.cc:259 Run Uniform variable[0] (output_size) data type mismatch in program "ReduceL1", Expected: u32, Actual: i32 ``` --- onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc index 9548386ded06c..11fa30c798809 100644 --- a/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/webgpu/reduction/reduction_ops.cc @@ -191,12 +191,13 @@ Status ReduceKernel::ComputeInternal(ComputeContext& context) auto output = context.Output(0, input_tensor->Shape()); // We need to run the operation even for scalar inputs for these ops const auto code = GetOpSpecificCode(input_tensor); + constexpr uint32_t output_size = 1; + constexpr uint32_t reduce_axes = 0; ReduceKernelProgram program(name_, keepdims_, noop_with_empty_axes_, input_axes, code, false); - std::vector reduce_axes = {0}; program.AddInput({input_tensor, ProgramTensorMetadataDependency::TypeAndRank}) .AddOutput({output, ProgramTensorMetadataDependency::TypeAndRank}) .SetDispatchGroupSize(1) - .AddUniformVariables({{1}, {static_cast(noop_with_empty_axes_ ? 1 : 0)}, {reduce_axes}}); + .AddUniformVariables({{output_size}, {static_cast(noop_with_empty_axes_ ?
1 : 0)}, {reduce_axes}});
    return context.RunProgram(program);
  } else {
    // For other ops, or when axes is empty with noop_with_empty_axes_ true, just copy the input

From 8d21bf727c96a59425d8e80e4ca1a0a321f220ef Mon Sep 17 00:00:00 2001
From: Prathik Rao
Date: Wed, 19 Mar 2025 16:28:31 -0700
Subject: [PATCH 17/21] [WebGPU EP] Implements CumSum Operator (#24047)

Increases WebGPU EP op coverage.
---
 .../core/providers/webgpu/math/cum_sum.cc     | 98 +++++++++++++++++++
 .../core/providers/webgpu/math/cum_sum.h      | 39 ++++++++
 .../webgpu/webgpu_execution_provider.cc       |  4 +-
 3 files changed, 139 insertions(+), 2 deletions(-)
 create mode 100644 onnxruntime/core/providers/webgpu/math/cum_sum.cc
 create mode 100644 onnxruntime/core/providers/webgpu/math/cum_sum.h

diff --git a/onnxruntime/core/providers/webgpu/math/cum_sum.cc b/onnxruntime/core/providers/webgpu/math/cum_sum.cc
new file mode 100644
index 0000000000000..bc4cd70a238fc
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/math/cum_sum.cc
@@ -0,0 +1,98 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/webgpu/math/cum_sum.h"
+#include "core/providers/webgpu/shader_helper.h"
+#include "core/providers/webgpu/webgpu_supported_types.h"
+
+namespace onnxruntime {
+namespace webgpu {
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+    CumSum,
+    kOnnxDomain,
+    11, 13,
+    kWebGpuExecutionProvider,
+    (*KernelDefBuilder::Create())
+        .TypeConstraint("T", WebGpuSupportedFloatTypes())
+        .TypeConstraint("T2", {DataTypeImpl::GetTensorType<int32_t>(),
+                               DataTypeImpl::GetTensorType<int64_t>()})
+        .InputMemoryType(OrtMemTypeCPU, 1),
+    CumSum);
+
+ONNX_OPERATOR_KERNEL_EX(
+    CumSum,
+    kOnnxDomain,
+    14,
+    kWebGpuExecutionProvider,
+    (*KernelDefBuilder::Create())
+        .TypeConstraint("T", WebGpuSupportedFloatTypes())
+        .TypeConstraint("T2", {DataTypeImpl::GetTensorType<int32_t>(),
+                               DataTypeImpl::GetTensorType<int64_t>()})
+        .InputMemoryType(OrtMemTypeCPU, 1),
+    CumSum);
+
+Status CumSumProgram::GenerateShaderCode(ShaderHelper& shader) const {
+  const ShaderVariableHelper& input = shader.AddInput("input", ShaderUsage::UseUniform);
+  const ShaderVariableHelper& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
+
+  shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size")
+                            << "var input_indices = " << input.OffsetToIndices("global_idx") << ";\n"
+                            << "var sum : output_value_t = 0;\n"
+                            << "var first : i32 = 0;\n"
+                            << "if (uniforms.reverse == 1) {\n"
+                            << "  first = i32(" + input.IndicesGet("input_indices", "uniforms.axis") + ");\n"
+                            << "  if (uniforms.exclusive == 1) { first += 1; }\n"
+                            << "}\n\n"
+                            << "var last : i32 = 0;\n"
+                            << "if (uniforms.reverse == 1) {\n"
+                            << "  last = i32(" << GetElementAt("uniforms.input_shape", "uniforms.axis", input.Rank()) << ");\n"
+                            << "} else {\n"
+                            << "  last = i32(" + input.IndicesGet("input_indices", "uniforms.axis") + ");\n"
+                            << "  if (uniforms.exclusive == 0) { last += 1; }\n"
+                            << "}\n\n"
+                            << "for (var i : i32 = first; i < last; i++) {\n"
+                            << "  " << input.IndicesSet("input_indices", "uniforms.axis", "u32(i)") << ";\n"
+                            << "  sum = sum + " << input.GetByIndices("input_indices") << ";\n"
+                            << "}\n"
+                            << output.SetByOffset("global_idx", "sum");
+
+  return Status::OK();
+}
+
+Status CumSum::ComputeInternal(ComputeContext& context) const {
+  const auto* input_tensor = context.Input<Tensor>(0);
+  const TensorShape& input_shape = input_tensor->Shape();
+  int64_t input_rank = input_shape.NumDimensions();
+
+  const auto* axis_tensor = context.Input<Tensor>(1);
+  const auto* axis_data = axis_tensor->Data<int64_t>();
+  int64_t axis = static_cast<int64_t>(axis_data[0]);
+
+  ORT_ENFORCE(-input_rank <= axis && axis < input_rank, "Axis input must be within range -input_rank <= axis < input_rank.");
+  // Handle negative axis
+  if (axis < 0) {
+    axis += input_rank;
+  }
+
+  auto* output_tensor = context.Output(0, input_shape);
+  int64_t output_size = output_tensor->Shape().Size();
+
+  if (output_size == 0) {
+    return Status::OK();
+  }
+
+  CumSumProgram program{};
+  program
+      .AddInput({input_tensor})
+      .AddOutput({output_tensor, ProgramTensorMetadataDependency::TypeAndRank})
+      .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
+      .AddUniformVariables({{static_cast<uint32_t>(output_size)},
+                            {static_cast<uint32_t>(axis)},
+                            {static_cast<uint32_t>(exclusive_)},
+                            {static_cast<uint32_t>(reverse_)}});
+  return context.RunProgram(program);
+}
+
+}  // namespace webgpu
+}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/providers/webgpu/math/cum_sum.h b/onnxruntime/core/providers/webgpu/math/cum_sum.h
new file mode 100644
index 0000000000000..6a66ee0ed7b04
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/math/cum_sum.h
@@ -0,0 +1,39 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/providers/webgpu/webgpu_kernel.h"
+#include "core/providers/webgpu/program.h"
+
+namespace onnxruntime {
+namespace webgpu {
+
+class CumSumProgram final : public Program<CumSumProgram> {
+ public:
+  CumSumProgram() : Program{"CumSum"} {}
+
+  Status GenerateShaderCode(ShaderHelper& sh) const override;
+
+  WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32},
+                                          {"axis", ProgramUniformVariableDataType::Uint32},
+                                          {"exclusive", ProgramUniformVariableDataType::Uint32},
+                                          {"reverse", ProgramUniformVariableDataType::Uint32});
+};
+
+class CumSum final : public WebGpuKernel {
+ public:
+  CumSum(const OpKernelInfo& info) : WebGpuKernel(info) {
+    exclusive_ = info.GetAttrOrDefault<int64_t>("exclusive", 0);
+    reverse_ = info.GetAttrOrDefault<int64_t>("reverse", 0);
+  }
+
+  Status ComputeInternal(ComputeContext& context) const override;
+
+ private:
+  int64_t exclusive_;
+  int64_t reverse_;
+};
+
+}  // namespace webgpu
+}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
index 6f81bead5e5b1..aacbcc5fb4f0a 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -713,8 +713,8 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
       BuildKernelCreateInfo,
       BuildKernelCreateInfo,
       BuildKernelCreateInfo,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 13, CumSum)>,
-      // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 14, CumSum)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 13, CumSum)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 14, CumSum)>,
       // BuildKernelCreateInfo,
       // BuildKernelCreateInfo,
       // BuildKernelCreateInfo,

From 81a892042d06c7fcab0aca0f0b724fa1a4b92630 Mon Sep 17 00:00:00 2001
From: Jiajia Qin
Date: Thu, 20 Mar 2025 07:55:30 +0800
Subject: [PATCH 18/21] [webgpu] Use 1d dispatch group size (#24084)

This PR uses a 1D dispatch group size and indexes with workgroup_idx instead
of workgroup_id.x|workgroup_id.y, in case those values are normalized.
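The 2D-to-1D mapping is small but easy to get wrong, so here is the equivalent
host-side index math as a sketch (illustrative C++ only; `TileCoords` and
`DecomposeWorkgroupIdx` are made-up names, not symbols from this change):

```cpp
#include <cstdint>

// Recovers the logical 2D workgroup coordinates from the flattened 1D index.
// Before: Dispatch(num_heads, num_seq_tile, 1), indexed by workgroup_id.x/.y.
// After:  Dispatch(num_heads * num_seq_tile), indexed by workgroup_idx alone.
struct TileCoords {
  uint32_t head_idx;  // former workgroup_id.x
  uint32_t seq_tile;  // former workgroup_id.y
};

inline TileCoords DecomposeWorkgroupIdx(uint32_t workgroup_idx, uint32_t num_seq_tile) {
  return {workgroup_idx / num_seq_tile, workgroup_idx % num_seq_tile};
}
```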
---
 .../contrib_ops/webgpu/bert/flash_attention.cc       | 10 ++++++----
 .../contrib_ops/webgpu/bert/flash_attention.h        |  3 ++-
 .../webgpu/quantization/dp4a_matmul_nbits.cc         | 13 +++++++------
 .../webgpu/quantization/dp4a_matmul_nbits.h          |  3 ++-
 4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
index 58ddf60df79f0..52c705abb1003 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
+++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.cc
@@ -224,12 +224,12 @@ Status FlashAttentionProgram::GenerateShaderCode(ShaderHelper& shader) const {
   // Shader is designed to be dispatched as Dispatch(num_heads, new_sequence_length / workgroup_size_x, 1)
   // Each lane/thread is responsible for a single q.
   shader.MainFunctionBody() << R"MAIN_FN(
-  let head_idx = workgroup_id.x;
+  let head_idx = u32(workgroup_idx / uniforms.num_seq_tile);
   let capped_sg_id = min(sg_id, max_k_step);
   let capped_sg_size = min(sg_size, max_k_step);

   // Load Q
-  let q_idx_global = workgroup_id.y * workgroup_size_x + local_idx;
+  let q_idx_global = (workgroup_idx % uniforms.num_seq_tile) * workgroup_size_x + local_idx;
   let valid_q = q_idx_global < uniforms.new_sequence_length;
   if (valid_q) {
@@ -445,7 +445,8 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
   std::string cache_hint = std::to_string(has_attention_bias) +
                            std::to_string(parameters.head_size_) +
                            std::to_string(parameters.num_heads_);
-  program.SetDispatchGroupSize(parameters.num_heads_, (parameters.sequence_length_ + tile_size - 1) / tile_size, 1)
+  const uint32_t num_seq_tile = (parameters.sequence_length_ + tile_size - 1) / tile_size;
+  program.SetDispatchGroupSize(parameters.num_heads_ * num_seq_tile)
       .SetWorkgroupSize(tile_size)
       .CacheHint(cache_hint)
       .AddUniformVariables({{static_cast<uint32_t>(parameters.sequence_length_)},
@@ -454,7 +455,8 @@ Status ApplyFlashAttention(const Tensor* Q, const Tensor* K, const Tensor* V, co
                             {static_cast<uint32_t>(parameters.total_sequence_length_ - parameters.kv_sequence_length_)},
                             {static_cast<uint32_t>(parameters.is_gqa_ ? 1 : 0)},
                             {static_cast<uint32_t>(parameters.n_reps)},
-                            {alpha}});
+                            {alpha},
+                            {num_seq_tile}});

   return context.RunProgram(program);
 }

diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
index 2c2b888538843..8931403641a81 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
+++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
@@ -52,7 +52,8 @@ class FlashAttentionProgram final : public Program<FlashAttentionProgram> {
                                           {"past_sequence_length", ProgramUniformVariableDataType::Uint32},
                                           {"is_gqa", ProgramUniformVariableDataType::Uint32},
                                           {"n_reps", ProgramUniformVariableDataType::Uint32},
-                                          {"alpha", ProgramUniformVariableDataType::Float32});
+                                          {"alpha", ProgramUniformVariableDataType::Float32},
+                                          {"num_seq_tile", ProgramUniformVariableDataType::Uint32});

  private:
   bool has_attention_bias_;

diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc
index 65807b072bc80..a25d8e68f11cd 100644
--- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc
@@ -138,8 +138,8 @@ Status DP4AMatMulNBitsProgram::GenerateShaderCode(ShaderHelper& shader) const {
   shader.MainFunctionBody() << R"MAIN_FN(
   // During the load phase we use all 256 threads to load 64 rows of A/B.
  // For each row we load tile_size_k_vec (2) vectorized elements, which are 32 elements of K.
-  let a_global_base = workgroup_id.x * tile_size;
-  let b_global_base = workgroup_id.y * tile_size;
+  let a_global_base = u32(workgroup_idx / uniforms.num_N_tile) * tile_size;
+  let b_global_base = (workgroup_idx % uniforms.num_N_tile) * tile_size;
   let load_AorB = u32(local_idx/128);
   let load_row = u32((local_idx%128)/2);
   let load_col = u32(local_idx%2);
@@ -275,11 +275,11 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor
   constexpr uint32_t kTileSize = 64;
   TensorShape reshaped_y_shape{1, M, N / kVec4Components};
+  uint32_t num_M_tile = (M + kTileSize - 1) / kTileSize;
+  uint32_t num_N_tile = (N + kTileSize - 1) / kTileSize;
   DP4AMatMulNBitsProgram mul_program{block_size};
   mul_program.SetWorkgroupSize(256);
-  mul_program.SetDispatchGroupSize(
-      (M + kTileSize - 1) / kTileSize,
-      (N + kTileSize - 1) / kTileSize, 1);
+  mul_program.SetDispatchGroupSize(num_M_tile * num_N_tile);
   mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(kVec4Components)},
                          {&a_scale, ProgramTensorMetadataDependency::TypeAndRank, 1},
                          {b, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(kVec2Components * kU32Components)},
@@ -288,7 +288,8 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor
                           {static_cast<uint32_t>(N)},
                           {static_cast<uint32_t>(K)},
                           {static_cast<uint32_t>(K / 8)},
-                          {static_cast<uint32_t>(K / 16)}})
+                          {static_cast<uint32_t>(K / 16)},
+                          {num_N_tile}})
       .AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, reshaped_y_shape, static_cast<int>(kVec4Components)})
       .CacheHint("Block" + std::to_string(block_size));
   return context.RunProgram(mul_program);

diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h
index 7e4a8f5d68437..f0157ca3e8c97 100644
--- a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h
+++ b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h
@@ -27,7 +27,8 @@ class DP4AMatMulNBitsProgram final : public Program<DP4AMatMulNBitsProgram> {
                                           {"N", ProgramUniformVariableDataType::Uint32},
                                           {"K", ProgramUniformVariableDataType::Uint32},
                                           {"K8", ProgramUniformVariableDataType::Uint32},
-                                          {"K16", ProgramUniformVariableDataType::Uint32});
+                                          {"K16", ProgramUniformVariableDataType::Uint32},
+                                          {"num_N_tile", ProgramUniformVariableDataType::Uint32});

  private:
   uint32_t block_size_;

From 9dcb99cdf73f112e9bf2c6c1613897ffc31b6477 Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Wed, 19 Mar 2025 17:32:11 -0700
Subject: [PATCH 19/21] [WebGPU] fix test failure in MatMulNBits on macOS ARM64 (#24109)

### Description

abs_error is slightly loosened from 0.02 to 0.03 to allow the test cases on macOS ARM64 to pass.
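For context, abs_error bounds an element-wise absolute difference between
actual and expected outputs, roughly as in the sketch below (illustrative
only; the real comparison is done by the test framework's tensor checkers,
and `WithinAbsError` is a made-up helper):

```cpp
#include <cmath>

// An output element passes when its deviation from the expected value is at
// most abs_error (0.03f for WebGPU on macOS ARM64 after this change).
inline bool WithinAbsError(float actual, float expected, float abs_error) {
  return std::fabs(actual - expected) <= abs_error;
}
```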
---
 onnxruntime/test/contrib_ops/matmul_4bits_test.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
index 8187253311ed3..81323cb51a887 100644
--- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
@@ -530,8 +530,10 @@ TEST(MatMulNBits, Float16Cuda) {
   for (auto K : {16, 32, 64, 128, 256, 1024, 93, 1234}) {
     for (auto block_size : {16, 32, 64, 128}) {
       for (auto has_gidx : has_gidx_options) {
-#ifdef USE_DML
+#if defined(USE_DML)
         RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f);
+#elif defined(USE_WEBGPU)
+        RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.03f);
 #else
         RunTest(M, N, K, block_size, 0, false, true, has_gidx);
         RunTest(M, N, K, block_size, 0, true, true, has_gidx, false);

From 4d5e274f0e45533227cbc2a2eed2bec83d42949b Mon Sep 17 00:00:00 2001
From: chuteng-quic
Date: Thu, 20 Mar 2025 10:37:16 +0800
Subject: [PATCH 20/21] [QNN-EP] Add support for Sum operator with 2 inputs (#24098)

### Description
* Add Sum to the op builder in QNN-EP.
* Support is currently limited to Sum with exactly 2 inputs.

### Motivation and Context
* Enhance QNN-EP support for Sum with two inputs.
---
 .../providers/qnn/builder/op_builder_factory.cc       |  1 +
 .../qnn/builder/opbuilder/base_op_builder.h           |  1 +
 .../qnn/builder/opbuilder/simple_op_builder.cc        | 13 ++++++++++---
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
index 3d66003fb2bca..77579dfc793ee 100644
--- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
+++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
@@ -47,6 +47,7 @@ OpBuilderRegistrations::OpBuilderRegistrations() {
     CreateSimpleOpBuilder("Sin", *this);
     CreateSimpleOpBuilder("Sqrt", *this);
     CreateSimpleOpBuilder("Sub", *this);
+    CreateSimpleOpBuilder("Sum", *this);
     CreateSimpleOpBuilder("Tanh", *this);

     CreateSimpleOpBuilder("Concat", *this);

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
index a79f4056083c5..df9d0de8e0e3e 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
@@ -158,6 +158,7 @@ class BaseOpBuilder : public IOpBuilder {
         {"Softmax", QNN_OP_SOFTMAX},
         {"Sqrt", QNN_OP_ELEMENT_WISE_SQUARE_ROOT},
         {"Sub", QNN_OP_ELEMENT_WISE_SUBTRACT},
+        {"Sum", QNN_OP_ELEMENT_WISE_ADD},
         {"Tanh", QNN_OP_TANH},
         {"Transpose", QNN_OP_TRANSPOSE},
         {"GridSample", QNN_OP_GRID_SAMPLE},

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
index 48c637cd2e951..229d86082f6dc 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
@@ -56,11 +56,18 @@ Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper,
                            padding_mode.c_str());
   }

-  // ONNX's Min and Max operators accept a variable number of inputs (i.e., variadic).
-  // However, QNN's Min and Max operators must take in exactly two inputs.
+  // ONNX's Min, Max, and Sum operators accept a variable number of inputs (i.e., variadic).
+  // However, QNN's Min, Max, and Add operators must take in exactly two inputs.
   if (op_type == "Min" || op_type == "Max") {
     ORT_RETURN_IF_NOT(node_unit.Inputs().size() == 2,
-                      "QNN EP only supports Min and Max operators with exactly 2 inputs.");
+                      "QNN EP only supports ", op_type.c_str(), " operator with exactly 2 inputs.");
+  }
+
+  if (op_type == "Sum") {
+    size_t inputs_num = node_unit.Inputs().size();
+    ORT_RETURN_IF_NOT(inputs_num == 2,
+                      "QNN EP supports Sum operator with QNN_OP_ELEMENT_WISE_ADD, which takes exactly 2 inputs. Got ONNX's Sum operator with ",
+                      std::to_string(inputs_num).c_str(), " inputs.");
   }

   if (op_type == "DequantizeLinear") {

From 5d43f0ab1997d7f99b4aca94e49e24707c2a7566 Mon Sep 17 00:00:00 2001
From: Wanming Lin
Date: Thu, 20 Mar 2025 16:10:54 +0800
Subject: [PATCH 21/21] [WebNN] Replace narrow with SafeInt for consistency in
 integer handling (#24059)

Also removes redundant header includes.
---
 .../providers/webnn/builders/impl/argmax_min_op_builder.cc  | 4 ++--
 .../core/providers/webnn/builders/impl/builder_utils.cc     | 1 -
 .../providers/webnn/builders/impl/concat_op_builder.cc      | 1 -
 .../core/providers/webnn/builders/impl/conv_op_builder.cc   | 1 -
 .../providers/webnn/builders/impl/cumsum_op_builder.cc      | 5 ++---
 .../builders/impl/dynamicQuantizeLinear_op_builder.cc       | 1 -
 .../providers/webnn/builders/impl/einsum_op_builder.cc      | 1 -
 .../providers/webnn/builders/impl/expand_op_builder.cc      | 1 -
 .../providers/webnn/builders/impl/flatten_op_builder.cc     | 1 -
 .../core/providers/webnn/builders/impl/gemm_op_builder.cc   | 7 +++----
 .../webnn/builders/impl/normalization_op_builder.cc         | 1 -
 .../core/providers/webnn/builders/impl/pad_op_builder.cc    | 1 -
 .../core/providers/webnn/builders/impl/pool_op_builder.cc   | 1 -
 .../core/providers/webnn/builders/impl/qdq_op_builder.cc    | 1 -
 .../providers/webnn/builders/impl/reduction_op_builder.cc   | 1 -
 .../providers/webnn/builders/impl/reshape_op_builder.cc     | 1 -
 .../providers/webnn/builders/impl/resize_op_builder.cc      | 1 -
 .../webnn/builders/impl/rotaryEmbedding_op_builder.cc       | 2 +-
 .../core/providers/webnn/builders/impl/slice_op_builder.cc  | 1 -
 .../providers/webnn/builders/impl/softmax_op_builder.cc     | 1 -
 .../core/providers/webnn/builders/impl/split_op_builder.cc  | 5 ++---
 .../webnn/builders/impl/squeeze_unsqueeze_op_builder.cc     | 1 -
 .../core/providers/webnn/builders/impl/tile_op_builder.cc   | 1 -
 .../providers/webnn/builders/impl/transpose_op_builder.cc   | 1 -
 .../providers/webnn/builders/impl/triangular_op_builder.cc  | 2 +-
 onnxruntime/core/providers/webnn/builders/model.cc          | 1 -
 onnxruntime/core/providers/webnn/builders/model_builder.cc  | 1 -
 .../core/providers/webnn/webnn_execution_provider.cc        | 1 -
 28 files changed, 11 insertions(+), 36 deletions(-)

diff --git a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc
index 6814b019f699c..08580ab2861d7 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc
@@ -54,9 +54,9 @@ Status ArgMaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
   const auto& op_type = node.OpType();
   if (op_type == "ArgMax") {
-    output = model_builder.GetBuilder().call<emscripten::val>("argMax", input, narrow<uint32_t>(axis), options);
+    output = model_builder.GetBuilder().call<emscripten::val>("argMax", input, SafeInt<uint32_t>(axis).Ref(), options);
   } else if (op_type == "ArgMin") {
-    output = model_builder.GetBuilder().call<emscripten::val>("argMin", input, narrow<uint32_t>(axis),
-                                                              options);
+    output = model_builder.GetBuilder().call<emscripten::val>("argMin", input, SafeInt<uint32_t>(axis).Ref(), options);
   } else {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "ArgMaxMinOpBuilder, unknown op: ", op_type);
   }

diff --git a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc
index 113cc3df5438d..63e2345243282 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc
@@ -2,7 +2,6 @@
-#include
 #include "core/providers/shared/utils/utils.h"

diff --git a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc
index 1bbe56ef9b477..ee2512ddd8b5a 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc
@@ -2,7 +2,6 @@
 // Copyright (c) Intel Corporation. All rights reserved.
 // Licensed under the MIT License.

-#include "core/common/safeint.h"
 #include "core/providers/common.h"
 #include "core/providers/shared/utils/utils.h"
 #include "core/providers/webnn/builders/helper.h"

diff --git a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc
index 1361b7dd5c14b..4c393e8a9bdba 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc
@@ -2,7 +2,6 @@
 // Copyright (c) Intel Corporation. All rights reserved.
 // Licensed under the MIT License.

-#include "core/common/safeint.h"
 #include "core/optimizer/initializer.h"
 #include "core/providers/common.h"
 #include "core/providers/shared/utils/utils.h"

diff --git a/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc
index be30c5520d62e..99be8f75771ad 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/cumsum_op_builder.cc
@@ -2,7 +2,6 @@
 // Copyright (c) Intel Corporation. All rights reserved.
 // Licensed under the MIT License.

-#include "core/common/safeint.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/optimizer/initializer.h"
 #include "core/providers/common.h"
@@ -64,8 +63,8 @@ Status CumSumOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
   options.set("label", node.Name());

   emscripten::val output = emscripten::val::object();
-  output = model_builder.GetBuilder().call<emscripten::val>("cumulativeSum", input, gsl::narrow<uint32_t>(webnn_axis),
-                                                            options);
+  output = model_builder.GetBuilder().call<emscripten::val>("cumulativeSum", input,
+                                                            SafeInt<uint32_t>(webnn_axis).Ref(), options);
   model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output));
   return Status::OK();
 }

diff --git a/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc
index 55746bb1f61f0..f3363b1e186d5 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc
@@ -2,7 +2,6 @@
 // Copyright (c) Intel Corporation. All rights reserved.
 // Licensed under the MIT License.
-#include "core/common/safeint.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc index 1f51e26fecfa5..6cee04bac3e2b 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/framework/tensorprotoutils.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc index 8402f05d8e234..3f813f08279e7 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/framework/tensorprotoutils.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc index d0ece026a7048..c4ff280b95b6e 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index c5cc8e86bb308..1f24124745a19 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" @@ -86,9 +85,9 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N else if (extended_a_shape) { std::vector new_shape; for (size_t i = 0; i < b_shape.size() - 2; i++) { - new_shape.push_back(narrow(b_shape[i])); + new_shape.push_back(SafeInt(b_shape[i])); } - new_shape.push_back(narrow(b_shape.back())); + new_shape.push_back(SafeInt(b_shape.back())); output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(new_shape), @@ -98,7 +97,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N else if (extended_b_shape) { std::vector new_shape; for (size_t i = 0; i < a_shape.size() - 1; i++) { - new_shape.push_back(narrow(a_shape[i])); + new_shape.push_back(SafeInt(a_shape[i])); } output = model_builder.GetBuilder().call("reshape", output, diff --git a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc index a090c21fe3356..5b57df7f184e7 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc index d8373a45e4423..e8f26af928ab3 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc index b338d27986279..79ad3574e07e9 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc index 1bb6523c6f86a..ed62b2bd69618 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/qdq_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/common/safeint.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc index 93ad933d71c34..b23fbeba1ddc8 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc index 0a438e98ad737..2fc47430a1c66 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/framework/tensorprotoutils.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc index 58515d2df54ec..eec6911a686cf 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc @@ -4,7 +4,6 @@ #include -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/webnn/builders/helper.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc index 1688dfc97a0c4..0a84835ee9fc0 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/rotaryEmbedding_op_builder.cc @@ -89,7 +89,7 @@ Status RotaryEmbeddingOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_build emscripten::val wnn_builder = model_builder.GetBuilder(); NodeAttrHelper helper(node); - const bool interleaved = gsl::narrow_cast(helper.Get("interleaved", 0)); + const bool interleaved = static_cast(helper.Get("interleaved", 0)); uint32_t num_heads = helper.Get("num_heads", 0); uint32_t rotary_embedding_dim = helper.Get("rotary_embedding_dim", 0); diff --git a/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc index 4adc5920de7fa..468c0e24a3e88 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/common/safeint.h" #include "core/framework/tensorprotoutils.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc index b1b737b114998..0e754b53e78d1 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc index 06dbacf995a28..21b44b1066694 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" @@ -75,8 +74,8 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // Check that the splits evenly divide. if (split_count > 0 && splits.empty() && input_shape[axis] % split_count != 0) { // Divide inputs into variable size outputs: - splits.insert(splits.end(), split_count - 1, narrow(input_shape[axis]) / split_count); - splits.insert(splits.end(), narrow(input_shape[axis]) % split_count); + splits.insert(splits.end(), split_count - 1, SafeInt(input_shape[axis]) / split_count); + splits.insert(splits.end(), SafeInt(input_shape[axis]) % split_count); } if (splits.empty()) { diff --git a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc index a3be9d7e2ceee..5687b1133c628 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc index 672a3a510d54d..259bb0552b7c7 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/tile_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "core/common/safeint.h" #include "core/framework/tensorprotoutils.h" #include "core/optimizer/initializer.h" #include "core/providers/common.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc index ac440e0119bac..452071f469c4f 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc @@ -2,7 +2,6 @@ // Copyright (c) Intel Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/safeint.h" #include "core/providers/shared/utils/utils.h" #include "core/providers/webnn/builders/helper.h" #include "core/providers/webnn/builders/model_builder.h" diff --git a/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc index 0c818533918a4..f2092d6163713 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc @@ -59,7 +59,7 @@ Status TriangularOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, std::vector unpacked_tensor; ORT_RETURN_IF_ERROR(onnxruntime::utils::UnpackInitializerData(diagonal_tensor, unpacked_tensor)); const auto diagonal = *reinterpret_cast(unpacked_tensor.data()); - options.set("diagonal", narrow(diagonal)); + options.set("diagonal", SafeInt(diagonal).Ref()); } output = model_builder.GetBuilder().call("triangular", input, options); diff --git a/onnxruntime/core/providers/webnn/builders/model.cc b/onnxruntime/core/providers/webnn/builders/model.cc index 492e2f717e30e..40fdfc609e6a1 100644 --- a/onnxruntime/core/providers/webnn/builders/model.cc +++ b/onnxruntime/core/providers/webnn/builders/model.cc @@ -8,7 +8,6 @@ #include "core/common/common.h" #include "core/common/inlined_containers.h" #include "core/common/logging/logging.h" -#include "core/common/safeint.h" #include "core/graph/onnx_protobuf.h" #include "core/providers/common.h" #include "model.h" diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.cc b/onnxruntime/core/providers/webnn/builders/model_builder.cc index ed6ab7d2d7115..399cc5faf6273 100644 --- a/onnxruntime/core/providers/webnn/builders/model_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/model_builder.cc @@ -9,7 +9,6 @@ #include "helper.h" #include "op_builder_factory.h" -#include "core/common/safeint.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc index c527ba213e55b..2da7c6499933a 100644 --- a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc +++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc @@ -10,7 +10,6 @@ #include "core/framework/kernel_registry.h" #include "core/graph/graph_viewer.h" #include "core/session/onnxruntime_cxx_api.h" -#include "core/common/safeint.h" #include "core/providers/webnn/allocator.h" #include "core/providers/webnn/data_transfer.h" #include "core/providers/partitioning_utils.h"