From 5981814c5f6291f0440f90d7ab13e949a1b44cc7 Mon Sep 17 00:00:00 2001
From: Tianlei WU <tlwu@microsoft.com>
Date: Thu, 26 Feb 2026 14:11:56 -0800
Subject: [PATCH 01/21] Update version number to 1.24.3

---
 VERSION_NUMBER                                | 2 +-
 docs/python/README.rst                        | 5 +++++
 js/common/lib/version.ts                      | 2 +-
 js/common/package-lock.json                   | 4 ++--
 js/common/package.json                        | 2 +-
 js/node/lib/version.ts                        | 2 +-
 js/node/package-lock.json                     | 6 +++---
 js/node/package.json                          | 2 +-
 js/node/script/install-metadata-versions.js   | 2 +-
 js/react_native/lib/version.ts                | 2 +-
 js/react_native/package-lock.json             | 6 +++---
 js/react_native/package.json                  | 2 +-
 js/web/lib/version.ts                         | 2 +-
 js/web/package-lock.json                      | 6 +++---
 js/web/package.json                           | 2 +-
 onnxruntime/__init__.py                       | 2 +-
 onnxruntime/core/session/onnxruntime_c_api.cc | 2 +-
 17 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/VERSION_NUMBER b/VERSION_NUMBER
index e4a973f913f4f..ae96cc7310aaa 100644
--- a/VERSION_NUMBER
+++ b/VERSION_NUMBER
@@ -1 +1 @@
-1.24.2
+1.24.3
diff --git a/docs/python/README.rst b/docs/python/README.rst
index 6124ffe8d27b4..06f5b0ebf3094 100644
--- a/docs/python/README.rst
+++ b/docs/python/README.rst
@@ -8,6 +8,11 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://ak
 Changes
 -------
 
+1.24.3
+^^^^^^
+
+Release Notes : https://github.com/Microsoft/onnxruntime/releases/tag/v1.24.3
+
 1.24.2
 ^^^^^^
 
diff --git a/js/common/lib/version.ts b/js/common/lib/version.ts
index 67d95af396d53..9e819d0a932ef 100644
--- a/js/common/lib/version.ts
+++ b/js/common/lib/version.ts
@@ -4,4 +4,4 @@
 // This file is generated by /js/scripts/update-version.ts
 // Do not modify file content manually.
 
-export const version = '1.24.2';
+export const version = '1.24.3';
diff --git a/js/common/package-lock.json b/js/common/package-lock.json
index e0be9c1a1e1e8..c647d7b75ca38 100644
--- a/js/common/package-lock.json
+++ b/js/common/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "onnxruntime-common",
-  "version": "1.24.2",
+  "version": "1.24.3",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
       "name": "onnxruntime-common",
-      "version": "1.24.2",
+      "version": "1.24.3",
       "license": "MIT",
       "devDependencies": {
         "globby": "^15.0.0",
diff --git a/js/common/package.json b/js/common/package.json
index 25f71c7260c6a..17a651d6ac995 100644
--- a/js/common/package.json
+++ b/js/common/package.json
@@ -2,7 +2,7 @@
   "license": "MIT",
   "type": "module",
   "name": "onnxruntime-common",
-  "version": "1.24.2",
+  "version": "1.24.3",
   "repository": {
     "url": "https://github.com/Microsoft/onnxruntime.git",
     "type": "git"
diff --git a/js/node/lib/version.ts b/js/node/lib/version.ts
index 67d95af396d53..9e819d0a932ef 100644
--- a/js/node/lib/version.ts
+++ b/js/node/lib/version.ts
@@ -4,4 +4,4 @@
 // This file is generated by /js/scripts/update-version.ts
 // Do not modify file content manually.
 
-export const version = '1.24.2';
+export const version = '1.24.3';
diff --git a/js/node/package-lock.json b/js/node/package-lock.json
index b2281eb1de5ce..8d890063b532b 100644
--- a/js/node/package-lock.json
+++ b/js/node/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "onnxruntime-node",
-  "version": "1.24.2",
+  "version": "1.24.3",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
       "name": "onnxruntime-node",
-      "version": "1.24.2",
+      "version": "1.24.3",
       "hasInstallScript": true,
       "license": "MIT",
       "os": [
@@ -30,7 +30,7 @@
     },
     "../common": {
       "name": "onnxruntime-common",
-      "version": "1.24.2",
+      "version": "1.24.3",
       "license": "MIT",
       "devDependencies": {
         "globby": "^15.0.0",
diff --git a/js/node/package.json b/js/node/package.json
index 6e94e027f1a30..96be689947352 100644
--- a/js/node/package.json
+++ b/js/node/package.json
@@ -11,7 +11,7 @@
       6
     ]
   },
-  "version": "1.24.2",
+  "version": "1.24.3",
   "dependencies": {
     "adm-zip": "^0.5.16",
     "global-agent": "^3.0.0",
diff --git a/js/node/script/install-metadata-versions.js b/js/node/script/install-metadata-versions.js
index e9d30780834f4..1747fbc37259e 100644
--- a/js/node/script/install-metadata-versions.js
+++ b/js/node/script/install-metadata-versions.js
@@ -4,4 +4,4 @@
 // This file is generated by /js/scripts/update-version.ts
 // Do not modify file content manually.
 
-module.exports = { nuget: [{ feed: 'nuget', version: '1.24.2' }] };
+module.exports = { nuget: [{ feed: 'nuget', version: '1.24.3' }] };
diff --git a/js/react_native/lib/version.ts b/js/react_native/lib/version.ts
index 67d95af396d53..9e819d0a932ef 100644
--- a/js/react_native/lib/version.ts
+++ b/js/react_native/lib/version.ts
@@ -4,4 +4,4 @@
 // This file is generated by /js/scripts/update-version.ts
 // Do not modify file content manually.
 
-export const version = '1.24.2';
+export const version = '1.24.3';
diff --git a/js/react_native/package-lock.json b/js/react_native/package-lock.json
index 57f40050e6aa6..61f389ecbd618 100644
--- a/js/react_native/package-lock.json
+++ b/js/react_native/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "onnxruntime-react-native",
-  "version": "1.24.2",
+  "version": "1.24.3",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "onnxruntime-react-native",
-      "version": "1.24.2",
+      "version": "1.24.3",
       "license": "MIT",
       "dependencies": {
         "onnxruntime-common": "file:../common"
@@ -30,7 +30,7 @@
     },
     "../common": {
       "name": "onnxruntime-common",
-      "version": "1.24.2",
+      "version": "1.24.3",
       "license": "MIT",
       "devDependencies": {
         "globby": "^15.0.0",
diff --git a/js/react_native/package.json b/js/react_native/package.json
index 4d7a03a68a264..7c846efae8b62 100644
--- a/js/react_native/package.json
+++ b/js/react_native/package.json
@@ -37,7 +37,7 @@
     "registry": "https://registry.npmjs.org/"
   },
   "source": "lib/index",
-  "version": "1.24.2",
+  "version": "1.24.3",
   "main": "dist/commonjs/index",
   "homepage": "https://github.com/microsoft/onnxruntime/blob/main/js/react_native/README.md",
   "files": [
diff --git a/js/web/lib/version.ts b/js/web/lib/version.ts
index 67d95af396d53..9e819d0a932ef 100644
--- a/js/web/lib/version.ts
+++ b/js/web/lib/version.ts
@@ -4,4 +4,4 @@
 // This file is generated by /js/scripts/update-version.ts
 // Do not modify file content manually.
 
-export const version = '1.24.2';
+export const version = '1.24.3';
diff --git a/js/web/package-lock.json b/js/web/package-lock.json
index 5782c9b2bfcc3..01a6e497a9a75 100644
--- a/js/web/package-lock.json
+++ b/js/web/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "onnxruntime-web",
-  "version": "1.24.2",
+  "version": "1.24.3",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
       "name": "onnxruntime-web",
-      "version": "1.24.2",
+      "version": "1.24.3",
       "license": "MIT",
       "dependencies": {
         "flatbuffers": "^25.1.24",
@@ -50,7 +50,7 @@
     },
     "../common": {
       "name": "onnxruntime-common",
-      "version": "1.24.2",
+      "version": "1.24.3",
       "license": "MIT",
       "devDependencies": {
         "globby": "^15.0.0",
diff --git a/js/web/package.json b/js/web/package.json
index f0a2dcb410a2a..23d89ca57199f 100644
--- a/js/web/package.json
+++ b/js/web/package.json
@@ -7,7 +7,7 @@
     "type": "git"
   },
   "author": "fs-eire",
-  "version": "1.24.2",
+  "version": "1.24.3",
   "jsdelivr": "dist/ort.min.js",
   "dependencies": {
     "flatbuffers": "^25.1.24",
diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py
index 98096104ed304..ace3e4dd4bf46 100644
--- a/onnxruntime/__init__.py
+++ b/onnxruntime/__init__.py
@@ -10,7 +10,7 @@
 
 import contextlib
 
-__version__ = "1.24.2"
+__version__ = "1.24.3"
 __author__ = "Microsoft"
 
 # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
index 2806eb7a7a8d8..aa6a69adb549c 100644
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -4843,7 +4843,7 @@ static_assert(offsetof(OrtApi, CreateExternalInitializerInfo) / sizeof(void*) ==
 static_assert(offsetof(OrtApi, GetTensorElementTypeAndShapeDataReference) / sizeof(void*) == 414, "Size of version 24 API cannot change");
 
 // So that nobody forgets to finish an API version, this check will serve as a reminder:
-static_assert(std::string_view(ORT_VERSION) == "1.24.2",
+static_assert(std::string_view(ORT_VERSION) == "1.24.3",
               "ORT_Version change detected, please follow below steps to ensure OrtApi is updated properly");
 // 1. Update the hardcoded version string in above static_assert to silence it
 // 2. If there were any APIs added to ort_api_1_to_24 above:

From 9e0e9c366ead6baac63b7e521b02cc78e0e46d94 Mon Sep 17 00:00:00 2001
From: patryk-kaiser-ARM <patryk.kaiser@arm.com>
Date: Sat, 21 Feb 2026 00:11:09 +0000
Subject: [PATCH 02/21] [MLAS] Adding DynamicQGemm function pointers and
 ukernel interface (#27403)

### Description
* Adding function pointer overrides to KleidiAI DynamicQGemm
* Making use of ukernel interface for DynamicQGemm to select between SME
and SME2 variants

### Motivation and Context
Fixes https://github.com/microsoft/onnxruntime/issues/26377
---
 .../core/mlas/lib/kai_ukernel_interface.cpp   | 61 ++++++++++++++
 .../core/mlas/lib/kai_ukernel_interface.h     |  4 +
 .../core/mlas/lib/kleidiai/mlasi_kleidiai.h   |  4 +-
 .../core/mlas/lib/kleidiai/qgemm_kleidiai.cpp | 83 +++++--------------
 onnxruntime/core/mlas/lib/mlasi.h             | 66 ++++++++-------
 onnxruntime/core/mlas/lib/platform.cpp        |  9 +-
 onnxruntime/core/mlas/lib/qgemm.cpp           | 13 ++-
 onnxruntime/core/mlas/lib/sgemm.cpp           | 12 +--
 8 files changed, 145 insertions(+), 107 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp b/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp
index a406f371a3bd2..06f9d97b872c7 100644
--- a/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp
+++ b/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp
@@ -19,8 +19,12 @@
 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h"
 #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa.h"
 #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa.h"
+
+#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa.h"
+#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h"
 #if defined(ENABLE_QMX_KERNELS)
 #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa.h"
+#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa.h"
 #endif // ENABLE_QMX_KERNELS
 
 const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod =
@@ -125,6 +129,32 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_sme2 =
     kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa,
     kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa};
 
+const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel qgemm_gemm_sme =
+    {kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa,
+    kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa,
+    kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa,
+    kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa,
+    kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa,
+    kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa,
+    kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa,
+    kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa,
+    kai_get_dst_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa,
+    kai_get_dst_size_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa,
+    kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa};
+
+const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel qgemm_gemm_sme2 =
+    {kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+    kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+    kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+    kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+    kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+    kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+    kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+    kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+    kai_get_dst_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+    kai_get_dst_size_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa,
+    kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa};
+
 #if defined(ENABLE_QMX_KERNELS)
 const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_qmx =
     {kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
@@ -138,6 +168,19 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_qmx =
     kai_get_dst_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
     kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
     kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa};
+
+const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel qgemm_gemm_qmx =
+    {kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa,
+    kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa,
+    kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa,
+    kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa,
+    kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa,
+    kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa,
+    kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa,
+    kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa,
+    kai_get_dst_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa,
+    kai_get_dst_size_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa,
+    kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa};
 #endif // ENABLE_QMX_KERNELS
 
 const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemmUKernel() {
@@ -181,3 +224,21 @@ const kai_matmul_clamp_f32_f32_f32p_ukernel& GetKleidiAISGemvUKernel() {
         return sgemm_gemv_sme;
     }
 }
+
+const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel& GetKleidiAIQGemmUKernel() {
+    if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2()) {
+        return qgemm_gemm_sme2;
+    } else {
+#if defined(ENABLE_QMX_KERNELS)
+        if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0)
+        {
+            KLEIDIAI_KERNEL_LOG("QGEMM: Using QMX Kernel");
+            return qgemm_gemm_qmx;
+        } else {
+            return qgemm_gemm_sme;
+        }
+#else
+        return qgemm_gemm_sme;
+#endif // ENABLE_QMX_KERNELS
+    }
+}
diff --git a/onnxruntime/core/mlas/lib/kai_ukernel_interface.h b/onnxruntime/core/mlas/lib/kai_ukernel_interface.h
index e69c72329d64b..7bd8959b0b5bd 100644
--- a/onnxruntime/core/mlas/lib/kai_ukernel_interface.h
+++ b/onnxruntime/core/mlas/lib/kai_ukernel_interface.h
@@ -12,8 +12,12 @@
 
 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p_interface.h"
 
+#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp_qsi8cxp_interface.h"
+
 const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemmUKernel();
 const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemvUKernel();
 
 const kai_matmul_clamp_f32_f32p_f32p_ukernel& GetKleidiAISGemmUKernel();
 const kai_matmul_clamp_f32_f32_f32p_ukernel& GetKleidiAISGemvUKernel();
+
+const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel& GetKleidiAIQGemmUKernel();
\ No newline at end of file
diff --git a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h
index d4df09bb94a93..4c088e8660874 100644
--- a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h
+++ b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h
@@ -107,14 +107,14 @@ MlasGemmBatch(
 
 size_t
 MLASCALL
-MlasDynamicQgemmPackBSize(
+MlasDynamicQGemmPackBSize(
     size_t N,
     size_t K
 );
 
 void
 MLASCALL
-MlasDynamicQgemmPackB(
+MlasDynamicQGemmPackB(
     size_t N,
     size_t K,
     const int8_t* B,
diff --git a/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp
index 9b2deac69ff63..b6a23735bd131 100644
--- a/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp
+++ b/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp
@@ -10,9 +10,7 @@
 #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h"
 #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon.h"
 
-#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h"
-#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa.h"
-#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.h"
+#include "kai_ukernel_interface.h"
 #if defined(ENABLE_QMX_KERNELS)
 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa.h"
 #endif // ENABLE_QMX_KERNELS
@@ -26,11 +24,13 @@ struct KaiTlsBuffersQgemm {
 };
 static thread_local KaiTlsBuffersQgemm g_kai_tls_qgemm;
 
+const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel qgemm_gemm = GetKleidiAIQGemmUKernel();
+
 // Matmul with float output of dynamic-quantized A and symmetric-quantized B.
 
 size_t
 MLASCALL
-ArmKleidiAI::MlasDynamicQgemmPackBSize(
+ArmKleidiAI::MlasDynamicQGemmPackBSize(
     size_t N,
     size_t K
 ) {
@@ -39,10 +39,9 @@ ArmKleidiAI::MlasDynamicQgemmPackBSize(
         return 0;
     }
 
-    // Default to sme2_mopa, but this may not always be the most optimal kernel variant to use.
-    auto nr = kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa();
-    auto kr = kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa();
-    auto sr = kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa();
+    auto nr = qgemm_gemm.get_nr();
+    auto kr = qgemm_gemm.get_kr();
+    auto sr = qgemm_gemm.get_sr();
 
     // Regardless of kernel variant, use the NEON packing variant.
     KLEIDIAI_KERNEL_LOG("kai_run_rhs_pack_kxn_qsi8cxp_qsi8cx_neon Groups=1"
@@ -52,7 +51,7 @@ ArmKleidiAI::MlasDynamicQgemmPackBSize(
 
 void
 MLASCALL
-ArmKleidiAI::MlasDynamicQgemmPackB(
+ArmKleidiAI::MlasDynamicQGemmPackB(
     size_t N,
     size_t K,
     const int8_t* B,
@@ -65,10 +64,9 @@ ArmKleidiAI::MlasDynamicQgemmPackB(
         return;
     }
 
-    // Default to sme2_mopa, but this may not always be the most optimal kernel variant to use.
-    auto nr = kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa();
-    auto kr = kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa();
-    auto sr = kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa();
+    auto nr = qgemm_gemm.get_nr();
+    auto kr = qgemm_gemm.get_kr();
+    auto sr = qgemm_gemm.get_sr();
 
     // y - float output
     // scale_factor_lhs - lhs scaling factor
@@ -105,17 +103,12 @@ ArmKleidiAI::MlasDynamicQGemmBatch(
     MLAS_THREADPOOL* ThreadPool
 ) {
 
-    const size_t mr = UseSME2 ? kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa()
-                              : kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa();
-    const size_t kr = UseSME2 ? kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa()
-                              : kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa();
-    const size_t sr = UseSME2 ? kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa()
-                              : kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa();
+    const size_t mr = qgemm_gemm.get_mr();
+    const size_t kr = qgemm_gemm.get_kr();
+    const size_t sr = qgemm_gemm.get_sr();
 
-    size_t m_step = UseSME2 ? kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa()
-                            : kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa();
-    size_t n_step = UseSME2 ? kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa()
-                            : kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa();
+    size_t m_step = qgemm_gemm.get_m_step();
+    size_t n_step = qgemm_gemm.get_n_step();
 
     if (BatchSize == 0 || Shape.M == 0 || Shape.N == 0 || Shape.K == 0) {
         return;
@@ -216,17 +209,13 @@ ArmKleidiAI::MlasDynamicQGemmBatch(
         ptrdiff_t NIdx = (tid % (dim[1] * dim[2])) % dim[2];
 
         // Get rhs tile, B
-        const size_t rhs_packed_offset =
-           UseSME2 ? kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(NIdx * n_step, Shape.K)
-                   : kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(NIdx * n_step, Shape.K);
+        const size_t rhs_packed_offset = qgemm_gemm.get_rhs_packed_offset(NIdx * n_step, Shape.K);
 
         const std::byte* B_base = reinterpret_cast<const std::byte*>(DataParams[BIdx].PackedB);
         auto BTile = reinterpret_cast<const void*>(B_base + rhs_packed_offset);
 
         // Get lhs tile, A
-        const size_t lhs_packed_offset =
-            UseSME2 ? kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(MIdx * m_step, Shape.K)
-                    : kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(MIdx * m_step, Shape.K);
+        const size_t lhs_packed_offset =qgemm_gemm.get_lhs_packed_offset(MIdx * m_step, Shape.K);
 
         const std::byte* A_base = tls_lhs_base[BIdx]; // LhsPackedData + LhsPackedStride * BIdx; OR DataParams[batch_idx].Workspace;
         auto ATile = reinterpret_cast<const std::byte*>(A_base + lhs_packed_offset);
@@ -240,46 +229,12 @@ ArmKleidiAI::MlasDynamicQGemmBatch(
         NIdx * n_step * sizeof(float)
         );
 
-        if (UseSME2) {
-            kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(
+        qgemm_gemm.run_matmul(
                 TileSizeM, TileSizeN, Shape.K, ATile, BTile,
                 dst_tile,
                 DataParams[BIdx].ldc * sizeof(float),
                 sizeof(float),
                 -std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
                 );
-            }
-        else {
-            #if defined(ENABLE_QMX_KERNELS)
-                if(ArmKleidiAI::vendor_name.compare("Qualcomm") == 0)
-                {
-                    kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa(
-                        TileSizeM, TileSizeN, Shape.K, ATile, BTile,
-                        dst_tile,
-                        DataParams[BIdx].ldc * sizeof(float),
-                        sizeof(float),
-                        -std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
-                        );
-                }
-                else
-                {
-                    kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(
-                        TileSizeM, TileSizeN, Shape.K, ATile, BTile,
-                        dst_tile,
-                        DataParams[BIdx].ldc * sizeof(float),
-                        sizeof(float),
-                        -std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
-                        );
-                }
-            #else
-                kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(
-                        TileSizeM, TileSizeN, Shape.K, ATile, BTile,
-                        dst_tile,
-                        DataParams[BIdx].ldc * sizeof(float),
-                        sizeof(float),
-                        -std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
-                        );
-            #endif // ENABLE_QMX_KERNELS
-        }
     });
 }
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
index e75ca3dc90e60..ac7528853c596 100644
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@@ -851,17 +851,9 @@ bool
     MLAS_THREADPOOL* ThreadPool
     );
 
-typedef void (MLASCALL MLAS_GEMM_BATCH)(
-    CBLAS_TRANSPOSE TransA,
-    CBLAS_TRANSPOSE TransB,
-    size_t M,
-    size_t N,
-    size_t K,
-    const MLAS_SGEMM_DATA_PARAMS* Data,
-    size_t BatchSize,
-    MLAS_THREADPOOL* ThreadPool);
-
-typedef bool (MLASCALL MLAS_GEMM_BATCH_OVERRIDE)(
+typedef
+bool
+(MLASCALL MLAS_SGEMM_BATCH_OVERRIDE)(
     CBLAS_TRANSPOSE TransA,
     CBLAS_TRANSPOSE TransB,
     size_t M,
@@ -871,19 +863,17 @@ typedef bool (MLASCALL MLAS_GEMM_BATCH_OVERRIDE)(
     size_t BatchSize,
     MLAS_THREADPOOL* ThreadPool);
 
-typedef size_t (MLASCALL MLAS_GEMM_PACK_B_SIZE)(
-    CBLAS_TRANSPOSE TransA,
-    CBLAS_TRANSPOSE TransB,
-    size_t N,
-    size_t K);
-
-typedef size_t (MLASCALL MLAS_GEMM_PACK_B_SIZE_OVERRIDE)(
+typedef
+size_t
+(MLASCALL MLAS_SGEMM_PACK_B_SIZE_OVERRIDE)(
     CBLAS_TRANSPOSE TransA,
     CBLAS_TRANSPOSE TransB,
     size_t N,
     size_t K);
 
-typedef void (MLASCALL MLAS_GEMM_PACK_B)(
+typedef
+bool
+(MLASCALL MLAS_SGEMM_PACK_B_OVERRIDE)(
     CBLAS_TRANSPOSE TransA,
     CBLAS_TRANSPOSE TransB,
     size_t N,
@@ -892,13 +882,28 @@ typedef void (MLASCALL MLAS_GEMM_PACK_B)(
     size_t ldb,
     void* PackedB);
 
-typedef bool (MLASCALL MLAS_GEMM_PACK_B_OVERRIDE)(
-    CBLAS_TRANSPOSE TransA,
-    CBLAS_TRANSPOSE TransB,
+typedef
+void
+(MLASCALL MLAS_DYNAMIC_QGEMM_BATCH_OVERRIDE)(
+    const MLAS_GEMM_DYN_QUANT_SHAPE_PARAMS& Shape,
+    const MLAS_GEMM_DYN_QUANT_DATA_PARAMS* DataParams,
+    const size_t BatchN,
+    MLAS_THREADPOOL* ThreadPool);
+
+typedef
+size_t
+(MLASCALL MLAS_DYNAMIC_QGEMM_PACK_B_SIZE_OVERRIDE)(
+    size_t N,
+    size_t K);
+
+typedef
+void
+(MLASCALL MLAS_DYNAMIC_QGEMM_PACK_B_OVERRIDE)(
     size_t N,
     size_t K,
-    const float* B,
-    size_t ldb,
+    const int8_t* B,
+    const float* Scales,
+    const float* Bias,
     void* PackedB);
 
 extern "C" {
@@ -1333,10 +1338,15 @@ struct MLAS_PLATFORM {
     bool Avx512Supported_ = false;
     bool ArmNeonIsQuantActivationsUnsigned = false;
 
-    // Mlas overrides initialisation
-    MLAS_GEMM_BATCH_OVERRIDE* MlasGemmBatchOverride = nullptr;
-    MLAS_GEMM_PACK_B_SIZE_OVERRIDE* MlasGemmPackBSizeOverride = nullptr;
-    MLAS_GEMM_PACK_B_OVERRIDE* MlasGemmPackBOverride = nullptr;
+    // MLAS SGemm overrides
+    MLAS_SGEMM_BATCH_OVERRIDE* MlasSGemmBatchOverride = nullptr;
+    MLAS_SGEMM_PACK_B_SIZE_OVERRIDE* MlasSGemmPackBSizeOverride = nullptr;
+    MLAS_SGEMM_PACK_B_OVERRIDE* MlasSGemmPackBOverride = nullptr;
+    // MLAS Dynamic QGemm overrides
+    MLAS_DYNAMIC_QGEMM_BATCH_OVERRIDE* MlasDynamicQGemmBatchOverride = nullptr;
+    MLAS_DYNAMIC_QGEMM_PACK_B_SIZE_OVERRIDE* MlasDynamicQGemmPackBSizeOverride = nullptr;
+    MLAS_DYNAMIC_QGEMM_PACK_B_OVERRIDE* MlasDynamicQGemmPackBOverride = nullptr;
+    // MLAS Conv overrides
     MLAS_CONV_PREPARE_FLOAT_OVERRIDE* MlasConvPrepareOverride = nullptr;
     MLAS_CONV_FLOAT_OVERRIDE* MlasConvOverride = nullptr;
 
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index b913b1c3b8c26..6ebd6be068b12 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -610,9 +610,12 @@ Return Value:
 
 #if defined(USE_KLEIDIAI)
     if(MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()){
-        this->MlasGemmBatchOverride = ArmKleidiAI::MlasGemmBatch;
-        this->MlasGemmPackBSizeOverride = ArmKleidiAI::MlasGemmPackBSize;
-        this->MlasGemmPackBOverride = ArmKleidiAI::MlasGemmPackB;
+        this->MlasSGemmBatchOverride = ArmKleidiAI::MlasGemmBatch;
+        this->MlasSGemmPackBSizeOverride = ArmKleidiAI::MlasGemmPackBSize;
+        this->MlasSGemmPackBOverride = ArmKleidiAI::MlasGemmPackB;
+        this->MlasDynamicQGemmBatchOverride = ArmKleidiAI::MlasDynamicQGemmBatch;
+        this->MlasDynamicQGemmPackBSizeOverride = ArmKleidiAI::MlasDynamicQGemmPackBSize;
+        this->MlasDynamicQGemmPackBOverride = ArmKleidiAI::MlasDynamicQGemmPackB;
         this->MlasConvPrepareOverride = ArmKleidiAI::MlasConvPrepare;
         this->MlasConvOverride = ArmKleidiAI::MlasConv;
     }
diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp
index 3fc69a607de3f..e3e3acc147e49 100644
--- a/onnxruntime/core/mlas/lib/qgemm.cpp
+++ b/onnxruntime/core/mlas/lib/qgemm.cpp
@@ -224,7 +224,9 @@ MlasDynamicQGemmBatch (
 
 #if defined(USE_KLEIDIAI)
     //No fallback
-    ArmKleidiAI::MlasDynamicQGemmBatch(Shape, DataParams, BatchN, ThreadPool);
+    if (GetMlasPlatform().MlasDynamicQGemmBatchOverride != nullptr) {
+        GetMlasPlatform().MlasDynamicQGemmBatchOverride(Shape, DataParams, BatchN, ThreadPool);
+    }
 #endif
 
     MLAS_UNREFERENCED_PARAMETER(Shape);
@@ -348,8 +350,9 @@ MlasDynamicQgemmPackBSize(
     size_t bytes = 0;
 #if defined(USE_KLEIDIAI)
     //No fallback available
-    //TODO: Insert Override
-    bytes = ArmKleidiAI::MlasDynamicQgemmPackBSize(N, K);
+    if (GetMlasPlatform().MlasDynamicQGemmPackBSizeOverride != nullptr) {
+        GetMlasPlatform().MlasDynamicQGemmPackBSizeOverride(N, K);
+    }
 #endif
 
     MLAS_UNREFERENCED_PARAMETER(N);
@@ -442,7 +445,9 @@ MlasDynamicQgemmPackB(
 
 #if defined(USE_KLEIDIAI)
     //No fallback
-    ArmKleidiAI::MlasDynamicQgemmPackB(N, K, B, Scales, Bias, PackedB);
+    if (GetMlasPlatform().MlasDynamicQGemmPackBOverride != nullptr) {
+        GetMlasPlatform().MlasDynamicQGemmPackBOverride(N, K, B, Scales, Bias, PackedB);
+    }
 #endif
 
     MLAS_UNREFERENCED_PARAMETER(N);
diff --git a/onnxruntime/core/mlas/lib/sgemm.cpp b/onnxruntime/core/mlas/lib/sgemm.cpp
index 02e38b6ef432e..7117f20b82ce5 100644
--- a/onnxruntime/core/mlas/lib/sgemm.cpp
+++ b/onnxruntime/core/mlas/lib/sgemm.cpp
@@ -1573,10 +1573,10 @@ MlasGemmBatch(
     )
 {
     // Override
-    if(GetMlasPlatform().MlasGemmBatchOverride != nullptr &&
+    if(GetMlasPlatform().MlasSGemmBatchOverride != nullptr &&
         // TODO: Remove once KAI supports transposing for A
         TransA != CBLAS_TRANSPOSE::CblasTrans &&
-        GetMlasPlatform().MlasGemmBatchOverride(TransA, TransB, M, N, K, Data, BatchSize, ThreadPool)){
+        GetMlasPlatform().MlasSGemmBatchOverride(TransA, TransB, M, N, K, Data, BatchSize, ThreadPool)){
         return;
     }
     //
@@ -1671,12 +1671,12 @@ Return Value:
     //
     // KleidiAI or other override
     #if defined(USE_KLEIDIAI)
-    if (GetMlasPlatform().MlasGemmPackBSizeOverride != nullptr &&
+    if (GetMlasPlatform().MlasSGemmPackBSizeOverride != nullptr &&
         // TODO: Remove once KAI supports transposing for A
         TransA != CBLAS_TRANSPOSE::CblasTrans) {
         size_t bytes_required;
         //TODO pass status by reference to indicate success/fail
-        bytes_required = GetMlasPlatform().MlasGemmPackBSizeOverride(TransA, TransB, N, K);
+        bytes_required = GetMlasPlatform().MlasSGemmPackBSizeOverride(TransA, TransB, N, K);
         if (bytes_required != 0){// If ArmKleidiAI::MlasGemmPackBSize ran to completion
             return bytes_required;
         }
@@ -1738,10 +1738,10 @@ Return Value:
 --*/
 {
 #if defined(USE_KLEIDIAI)
-    if (GetMlasPlatform().MlasGemmPackBOverride != nullptr  &&
+    if (GetMlasPlatform().MlasSGemmPackBOverride != nullptr  &&
         // TODO: Remove once KAI supports transposing for A
         TransA != CBLAS_TRANSPOSE::CblasTrans    &&
-        GetMlasPlatform().MlasGemmPackBOverride(TransA, TransB, N, K, B, ldb, PackedB)){
+        GetMlasPlatform().MlasSGemmPackBOverride(TransA, TransB, N, K, B, ldb, PackedB)){
          return;
     }
 #endif

From c0373adc80aa35a126b793f975a152b60545b1ac Mon Sep 17 00:00:00 2001
From: don <70039285+0-don@users.noreply.github.com>
Date: Sat, 21 Feb 2026 04:20:22 +0100
Subject: [PATCH 03/21] [js/web] Use embedded WASM module in Blob URL workers
 when wasmBinary is provided (#27318)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes #27317

When running inside a Blob URL Web Worker with `wasmBinary` provided and
`numThreads=1`, `isSameOrigin(scriptSrc)` can fail because blob: URLs
have opaque origins. This causes a fallback to dynamic
`import('./ort-wasm-simd-threaded.mjs')` which doesn't exist in that
context.

Since `wasmBinary` is already provided and no worker spawning is needed
(single-threaded), the embedded Emscripten module can be used directly —
no URL resolution or same-origin check is needed.

**Change:** One line in `wasm-utils-import.ts` line 275:
```typescript
// Before:
useEmbeddedModule = isSameOrigin(scriptSrc);

// After:
useEmbeddedModule = isSameOrigin(scriptSrc) || (isWasmOverridden && !isMultiThreaded);
```

This extends the existing pattern from the `!scriptSrc` case (line 268)
to also apply when `scriptSrc` is available but fails same-origin
checks. The condition (`wasmBinary` provided + single-threaded)
guarantees no file resolution or worker spawning is needed.
---
 js/web/lib/wasm/wasm-utils-import.ts | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/js/web/lib/wasm/wasm-utils-import.ts b/js/web/lib/wasm/wasm-utils-import.ts
index e2e46bb37dcfc..6c899d1ae9cf5 100644
--- a/js/web/lib/wasm/wasm-utils-import.ts
+++ b/js/web/lib/wasm/wasm-utils-import.ts
@@ -272,7 +272,9 @@ export const importWasmModule = async (
       }
     } else {
       // if the script source is available, we can check if it is from the same origin.
-      useEmbeddedModule = isSameOrigin(scriptSrc);
+      // Also use the embedded module when wasmBinary is provided and single-threaded (eg. Blob URL workers
+      // where isSameOrigin fails but no file resolution or worker spawning is needed).
+      useEmbeddedModule = isSameOrigin(scriptSrc) || (isWasmOverridden && !isMultiThreaded);
     }
   }
   if (useEmbeddedModule) {

From 6521e5e4f7b01f7d6632c6b4e76d15917bdc127a Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Mon, 23 Feb 2026 09:05:09 -0800
Subject: [PATCH 04/21] Fix refcount bug in map input conversion that caused
 shutdown segfault (#27413)

Fix Python refcount bug in map input conversion that caused shutdown
segfault in `onnxruntime_test_python_mlops.py` ( see
https://github.com/microsoft/onnxruntime/issues/27392).

## Summary
This PR fixes a Python reference-count ownership bug in the map
conversion path in `onnxruntime/python/onnxruntime_pybind_mlvalue.cc`.

In Python 3.14 test runs, the bug could manifest as a segmentation fault
after tests completed (typically at interpreter shutdown), even when
test assertions passed. .

## Root Cause
In `CreateMapMLValue_LoopIntoMap`, error paths decremented `item`
unconditionally.

- In single-map flow, `item` is a **borrowed reference** (must not be
decref'd there).
- In iterator/vector-map flow, `item` is an **owned reference** (must be
decref'd).

The unconditional decref in borrowed-reference flow caused refcount
corruption and eventually a crash.

## Fix
Add explicit ownership handling for `item`:

- `CreateMapMLValue_LoopIntoMap(..., bool owns_item_ref, ...)`
- Pass `owns_item_ref = false` from single-map path
(`CreateMapMLValue_Map`)
- Pass `owns_item_ref = true` from vector-map path
(`CreateMapMLValue_VectorMap`)
- Only `Py_XDECREF(item)` on error when `owns_item_ref` is true

This preserves existing behavior while correcting reference ownership.

## Validation
```bash
cd onnxruntime/test/python
python onnxruntime_test_python_mlops.py
```

Result:
- `OK`
- Exit code `0` (no shutdown segfault)

## Notes
- Although this became reproducible in Python 3.14, the underlying
refcount bug is version-agnostic C-extension undefined behavior.
---
 onnxruntime/python/onnxruntime_pybind_mlvalue.cc | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
index d96d229c942cb..89651c2d955de 100644
--- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
+++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
@@ -794,7 +794,7 @@ std::string _get_type_name(std::string&) {
 #if !defined(DISABLE_ML_OPS)
 template <typename KeyType, typename ValueType, typename KeyGetterType, typename ValueGetterType>
 static void CreateMapMLValue_LoopIntoMap(Py_ssize_t& pos, PyObject*& key, const std::string& name_input, PyObject*& value,
-                                         PyObject* item, std::map<KeyType, ValueType>& current,
+                                         PyObject* item, bool owns_item_ref, std::map<KeyType, ValueType>& current,
                                          KeyGetterType keyGetter, ValueGetterType valueGetter) {
   KeyType ckey;
   ValueType cvalue;
@@ -806,7 +806,9 @@ static void CreateMapMLValue_LoopIntoMap(Py_ssize_t& pos, PyObject*& key, const
       std::string sType = spyType;
       Py_XDECREF(pStr);
       Py_XDECREF(pType);
-      Py_XDECREF(item);
+      if (owns_item_ref) {
+        Py_XDECREF(item);
+      }
       throw std::runtime_error(std::string("Unexpected key type  ") + sType +
                                std::string(", it cannot be linked to C type ") +
                                _get_type_name(ckey) + std::string(" for input '") +
@@ -820,7 +822,9 @@ static void CreateMapMLValue_LoopIntoMap(Py_ssize_t& pos, PyObject*& key, const
       std::string sType = spyType;
       Py_XDECREF(pStr);
       Py_XDECREF(pType);
-      Py_XDECREF(item);
+      if (owns_item_ref) {
+        Py_XDECREF(item);
+      }
       throw std::runtime_error(std::string("Unexpected value type  ") + sType +
                                std::string(", it cannot be linked to C type ") +
                                _get_type_name(ckey) + std::string(" for input '") +
@@ -836,7 +840,7 @@ static void CreateMapMLValue_Map(Py_ssize_t& pos, PyObject*& key, const std::str
                                  ValueGetterType valueGetter) {
   std::unique_ptr<std::map<KeyType, ValueType>> dst;
   dst = std::make_unique<std::map<KeyType, ValueType>>();
-  CreateMapMLValue_LoopIntoMap(pos, key, name_input, value, item, *dst, keyGetter, valueGetter);
+  CreateMapMLValue_LoopIntoMap(pos, key, name_input, value, item, false, *dst, keyGetter, valueGetter);
   p_mlvalue->Init(dst.release(), DataTypeImpl::GetType<std::map<KeyType, ValueType>>(),
                   DataTypeImpl::GetType<std::map<KeyType, ValueType>>()->GetDeleteFunc());
 }
@@ -850,7 +854,7 @@ void CreateMapMLValue_VectorMap(Py_ssize_t& pos, PyObject*& key, const std::stri
   int index = 0;
   do {
     dstVector->push_back(std::map<KeyType, ValueType>());
-    CreateMapMLValue_LoopIntoMap(pos, key, name_input, value, item, (*dstVector)[index], keyGetter, valueGetter);
+    CreateMapMLValue_LoopIntoMap(pos, key, name_input, value, item, true, (*dstVector)[index], keyGetter, valueGetter);
     Py_DECREF(item);
     ++index;
     item = iterator == NULL ? NULL : PyIter_Next(iterator);

From 9f7316dacd14f26a713994973d3ec77fa7764800 Mon Sep 17 00:00:00 2001
From: Jonathan Clohessy <jonathan.clohessy@arm.com>
Date: Mon, 23 Feb 2026 19:56:58 +0000
Subject: [PATCH 05/21] Fix error where bytes is not assigned for dynamic qgemm
 pack b size (#27421)

### Description
Fix for dynamic qgemm pack B size. Byte assignment accidentally removed
in previous commit. Which causes test failures with the following error
message

C++ exception with description "Dynamic QGEMM requires non-null PackedB
pointer." thrown in the test body.

Signed-off-by: Jonathan Clohessy <Jonathan.Clohessy@arm.com>
---
 onnxruntime/core/mlas/lib/qgemm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp
index e3e3acc147e49..5f6a8f8394470 100644
--- a/onnxruntime/core/mlas/lib/qgemm.cpp
+++ b/onnxruntime/core/mlas/lib/qgemm.cpp
@@ -351,7 +351,7 @@ MlasDynamicQgemmPackBSize(
 #if defined(USE_KLEIDIAI)
     //No fallback available
     if (GetMlasPlatform().MlasDynamicQGemmPackBSizeOverride != nullptr) {
-        GetMlasPlatform().MlasDynamicQGemmPackBSizeOverride(N, K);
+       bytes = GetMlasPlatform().MlasDynamicQGemmPackBSizeOverride(N, K);
     }
 #endif
 

From 933077e8aa0b5fecb4720a33e4ec98808e4fecc5 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Mon, 23 Feb 2026 16:43:25 -0800
Subject: [PATCH 06/21] Fix DllImportResolver (#27397)

### Description
This PR addresses two issues related to the newly added
`DllImportResolver` for `.NET` native library loading:

1. **Fix IL3000 Warning during Native AOT / Single-File Publish**
When publishing projects that reference `Microsoft.ML.OnnxRuntime` as a
single file or using Native AOT in .NET 9, the compiler reports an
`IL3000` warning/error because `DllImportResolver` accesses
`Assembly.Location`. In these deployment models, `Assembly.Location`
always returns an empty string or throws.
Since `DllImportResolver` already correctly handles the empty string
failure and falls back to `AppContext.BaseDirectory` (which is fully
supported), this PR adds the `[UnconditionalSuppressMessage]` attribute
to suppress the build warning statically.

2. **Fix `TypeInitializationException` in `NativeMethods` Static
Constructor**
Users reported a `System.TypeInitializationException: The type
initializer for 'Microsoft.ML.OnnxRuntime.NativeMethods' threw an
exception.` when initializing the ONNX Runtime environment.
This occurs because the `DllImportResolver` (registered in the static
constructor) is invoked on the first P/Invoke (`OrtGetApiBase`). If any
API within the resolver throws an unhandled exception (for instance,
`AppContext.BaseDirectory` throwing `AppDomainUnloadedException` in
sandboxed AppDomains or `Environment.GetEnvironmentVariable` throwing
`SecurityException`), the exception bubbles up and crashes the
application with a type initialization failure.
This PR wraps the `DllImportResolver` logic in a `try-catch` block
(specifically handling `AppContext.BaseDirectory` edge cases) so that
any resolution failure safely swallows the error and falls back to
`IntPtr.Zero`, allowing the default .NET Platform Invoke mechanism to
take over and throw a standard `DllNotFoundException` instead of a fatal
type initialization crash.
A unit test (`TestDllImportResolverDoesNotThrow`) has been added to
`OrtEnvTests.cs` to verify that `DllImportResolver` successfully
swallows internal exceptions without crashing the initialization
process.

### Motivation and Context
These changes ensure that .NET developers can safely compile Native
AOT/Single-File applications without build errors and prevent hard
application crashes in environments with restricted permissions.
---
 .../NativeMethods.shared.cs                   | 156 ++++++++++--------
 .../OrtEnvTests.cs                            |  43 +++++
 2 files changed, 129 insertions(+), 70 deletions(-)

diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
index abe73b77f4071..a6b267c6802cf 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
@@ -892,99 +892,115 @@ internal class NativeLib
         /// On Windows, it explicitly loads the library with a lowercase .dll extension to handle
         /// case-sensitive filesystems.
         /// </summary>
+#if NET5_0_OR_GREATER
+        [System.Diagnostics.CodeAnalysis.UnconditionalSuppressMessage("SingleFile", "IL3000:Avoid accessing Assembly file path when publishing as a single file", Justification = "We also check AppContext.BaseDirectory as a fallback")]
+#endif
         private static IntPtr DllImportResolver(string libraryName, Assembly assembly, DllImportSearchPath? searchPath)
         {
-            if (libraryName == NativeLib.DllName || libraryName == OrtExtensionsNativeMethods.ExtensionsDllName)
+            try
             {
-                string mappedName = null;
-                if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
-                {
-                    // Explicitly load with .dll extension to avoid issues where the OS might try .DLL
-                    mappedName = libraryName + ".dll";
-                }
-                else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
+                if (libraryName == NativeLib.DllName || libraryName == OrtExtensionsNativeMethods.ExtensionsDllName)
                 {
-                    // Explicitly load with .so extension and lib prefix
-                    mappedName = "lib" + libraryName + ".so";
-                }
-                else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
-                {
-                    // Explicitly load with .dylib extension and lib prefix
-                    mappedName = "lib" + libraryName + ".dylib";
-                }
-
-                if (mappedName != null)
-                {
-                    // 1. Try default loading (name only)
-                    if (NativeLibrary.TryLoad(mappedName, assembly, searchPath, out IntPtr handle))
+                    string mappedName = null;
+                    if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
                     {
-                        return handle;
+                        // Explicitly load with .dll extension to avoid issues where the OS might try .DLL
+                        mappedName = libraryName + ".dll";
                     }
-
-                    // 2. Try relative to assembly location (look into runtimes subfolders)
-                    string assemblyLocation = null;
-                    try { assemblyLocation = assembly.Location; } catch { }
-                    if (!string.IsNullOrEmpty(assemblyLocation))
+                    else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
+                    {
+                        // Explicitly load with .so extension and lib prefix
+                        mappedName = "lib" + libraryName + ".so";
+                    }
+                    else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
                     {
-                        string assemblyDir = System.IO.Path.GetDirectoryName(assemblyLocation);
-                        string rid = RuntimeInformation.RuntimeIdentifier;
+                        // Explicitly load with .dylib extension and lib prefix
+                        mappedName = "lib" + libraryName + ".dylib";
+                    }
 
-                        // Probe the specific RID first, then common fallbacks for the current OS
-                        string[] ridsToTry;
-                        if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
-                        {
-                            ridsToTry = new[] { rid, "win-x64", "win-arm64" };
-                        }
-                        else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
-                        {
-                            ridsToTry = new[] { rid, "linux-x64", "linux-arm64" };
-                        }
-                        else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
-                        {
-                            // We no longer provide osx-x64 in official package since 1.24.
-                            // However, we keep it in the list for build-from-source users.
-                            ridsToTry = new[] { rid, "osx-arm64", "osx-x64" };
-                        }
-                        else
+                    if (mappedName != null)
+                    {
+                        // 1. Try default loading (name only)
+                        if (NativeLibrary.TryLoad(mappedName, assembly, searchPath, out IntPtr handle))
                         {
-                            ridsToTry = new[] { rid };
+                            return handle;
                         }
 
-                        foreach (var tryRid in ridsToTry)
+                        // 2. Try relative to assembly location (look into runtimes subfolders)
+                        string assemblyLocation = null;
+                        try { assemblyLocation = assembly.Location; } catch { }
+                        if (!string.IsNullOrEmpty(assemblyLocation))
                         {
-                            string probePath = System.IO.Path.Combine(assemblyDir, "runtimes", tryRid, "native", mappedName);
-                            if (System.IO.File.Exists(probePath) && NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle))
+                            string assemblyDir = System.IO.Path.GetDirectoryName(assemblyLocation);
+                            string rid = RuntimeInformation.RuntimeIdentifier;
+
+                            // Probe the specific RID first, then common fallbacks for the current OS
+                            string[] ridsToTry;
+                            if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
                             {
-                                LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}");
-                                return handle;
+                                ridsToTry = new[] { rid, "win-x64", "win-arm64" };
+                            }
+                            else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
+                            {
+                                ridsToTry = new[] { rid, "linux-x64", "linux-arm64" };
+                            }
+                            else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
+                            {
+                                // We no longer provide osx-x64 in official package since 1.24.
+                                // However, we keep it in the list for build-from-source users.
+                                ridsToTry = new[] { rid, "osx-arm64", "osx-x64" };
+                            }
+                            else
+                            {
+                                ridsToTry = new[] { rid };
                             }
-                        }
-                    }
 
-                    // 3. Try AppContext.BaseDirectory as a fallback
-                    string baseDir = AppContext.BaseDirectory;
-                    if (!string.IsNullOrEmpty(baseDir))
-                    {
-                        string probePath = System.IO.Path.Combine(baseDir, mappedName);
-                        if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle))
-                        {
-                            LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}");
-                            return handle;
+                            foreach (var tryRid in ridsToTry)
+                            {
+                                string probePath = System.IO.Path.Combine(assemblyDir, "runtimes", tryRid, "native", mappedName);
+                                if (System.IO.File.Exists(probePath) && NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle))
+                                {
+                                    LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}");
+                                    return handle;
+                                }
+                            }
                         }
 
-                        string rid = RuntimeInformation.RuntimeIdentifier;
-                        probePath = System.IO.Path.Combine(baseDir, "runtimes", rid, "native", mappedName);
-                        if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle))
+                        // 3. Try AppContext.BaseDirectory as a fallback
+                        try
                         {
-                            LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}");
-                            return handle;
+                            string baseDir = AppContext.BaseDirectory;
+                            if (!string.IsNullOrEmpty(baseDir))
+                            {
+                                string probePath = System.IO.Path.Combine(baseDir, mappedName);
+                                if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle))
+                                {
+                                    LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}");
+                                    return handle;
+                                }
+
+                                string rid = RuntimeInformation.RuntimeIdentifier;
+                                probePath = System.IO.Path.Combine(baseDir, "runtimes", rid, "native", mappedName);
+                                if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle))
+                                {
+                                    LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}");
+                                    return handle;
+                                }
+                            }
                         }
-                    }
+                        catch { } // Ignore AppDomainUnloadedException or similar from AppContext.BaseDirectory
 
-                    LogLibLoad($"[DllImportResolver] Failed loading {mappedName} (RID: {RuntimeInformation.RuntimeIdentifier}, Assembly: {assemblyLocation})");
+                        LogLibLoad($"[DllImportResolver] Failed loading {mappedName} (RID: {RuntimeInformation.RuntimeIdentifier}, Assembly: {assemblyLocation})");
 
+                    }
                 }
             }
+            catch (Exception ex)
+            {
+                // Unhandled exceptions inside DllImportResolver can result in TypeInitializationException.
+                // Log and swallow the error, returning IntPtr.Zero to fall back to default CLR logic.
+                try { System.Diagnostics.Trace.WriteLine($"[DllImportResolver] Exception during resolution: {ex}"); } catch { }
+            }
 
             // Fall back to default resolution
             return IntPtr.Zero;
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtEnvTests.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtEnvTests.cs
index 94f8e927c1331..aa1b683acd668 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtEnvTests.cs
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtEnvTests.cs
@@ -489,4 +489,47 @@ void TestCopyTensors()
             }
         }
     }
+
+    [Collection("Ort Inference Tests")]
+    public class OrtEnvDllImportResolverTest
+    {
+        [Fact(DisplayName = "TestDllImportResolverDoesNotThrow")]
+        public void TestDllImportResolverDoesNotThrow()
+        {
+            // The DllImportResolver is a private static method in NativeMethods.
+            var nativeMethodsType = typeof(OrtEnv).Assembly.GetType("Microsoft.ML.OnnxRuntime.NativeMethods");
+            Assert.NotNull(nativeMethodsType);
+
+            // It might not be defined on all platforms (defined when !NETSTANDARD2_0 && !__ANDROID__ && !__IOS__).
+            var resolverMethod = nativeMethodsType.GetMethod("DllImportResolver", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static);
+
+            if (resolverMethod != null)
+            {
+                try
+                {
+                    // Invoke with null assembly to force it into edge cases where assembly.Location would throw NullReferenceException.
+                    // It should catch the exception and return IntPtr.Zero gracefully rather than throwing.
+                    var result = resolverMethod.Invoke(null, new object[] { "onnxruntime", null, null });
+
+                    // If it reaches here without throwing TargetInvocationException, the try-catch in DllImportResolver works.
+                    Assert.True(result is IntPtr);
+                }
+                catch (System.Reflection.TargetInvocationException ex)
+                {
+                    // If NativeMethods..cctor() threw because the native library is missing,
+                    // we will get a TypeInitializationException wrapping a DllNotFoundException (or DllImportException).
+                    // This is acceptable locally. What we want to avoid is NullReferenceException from DllImportResolver.
+                    if (ex.InnerException is TypeInitializationException typeInitEx)
+                    {
+                        Assert.IsNotType<NullReferenceException>(typeInitEx.InnerException);
+                    }
+                    else
+                    {
+                        Assert.IsNotType<NullReferenceException>(ex.InnerException);
+                        throw;
+                    }
+                }
+            }
+        }
+    }
 }

From 101edf81b6db73ff15ae3d410d4b134e98f3ed93 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri <shariharan91@gmail.com>
Date: Tue, 24 Feb 2026 08:17:47 -0800
Subject: [PATCH 07/21] MatmulNBits prepacking scales fix (#27412)

### Description
Fix incorrect scales element count while pre-packing scales while we
processing the B input in the Prepack() method of MatmulNBits operator


### Motivation and Context
Fix potential crash due to incorrect element count

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 .../contrib_ops/cpu/quantization/matmul_nbits.cc       |  6 +++---
 onnxruntime/test/contrib_ops/matmul_4bits_test.cc      | 10 ++++++++++
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
index 53e95bd8c5627..cc93799059f43 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc
@@ -359,9 +359,9 @@ Status MatMulNBits<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, /*ou
     OpKernel::Info().TryGetConstantInput(InputIndex::scales, &scales);
     if (scales && MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, compute_type_, has_zp_input_)) {
       auto sptr = scales->Data<MLFloat16>();
-      auto tensor_size = static_cast<size_t>(tensor.Shape().Size());
-      auto ptr = IAllocator::MakeUniquePtr<float>(alloc, tensor_size, true);
-      MlasConvertHalfToFloatBuffer(sptr, ptr.get(), tensor_size);
+      auto scales_size = static_cast<size_t>(scales->Shape().Size());
+      auto ptr = IAllocator::MakeUniquePtr<float>(alloc, scales_size, true);
+      MlasConvertHalfToFloatBuffer(sptr, ptr.get(), scales_size);
       scales_fp32_ = std::move(ptr);
     }
 
diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
index 8cbdc381a1a70..66f87142d3a34 100644
--- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc
@@ -495,6 +495,16 @@ TEST(MatMulNBits, Float16_4b_Accuracy4) {
   TestMatMulNBitsTyped<MLFloat16, 100, 288, 93, 32, 4>();
   TestMatMulNBitsTyped<MLFloat16, 100, 288, 93, 128, 4>();
   TestMatMulNBitsTyped<MLFloat16, 100, 288, 1234, 16, 4>();
+
+  // See PR #27412 for details on the following test case,
+  // which is added to cover a specific failure case in the past.
+  // 6144, 2048
+
+  // Since K is larger (more change of larger error),
+  // and N is larger (more chance of havinga value with larger error),
+  // we set a higher tolerance for this case to avoid false positives
+  // and flaky failures.
+  TestMatMulNBitsTyped<MLFloat16, 369, 6144, 2048, 32, 4>(0.2f, 0.03f);
 }
 
 TEST(MatMulNBits, LegacyShape_4b) {

From ca18b54124746274096b45602b28c84835e70229 Mon Sep 17 00:00:00 2001
From: Adrian Lizarraga <adlizarraga@microsoft.com>
Date: Tue, 24 Feb 2026 12:12:23 -0800
Subject: [PATCH 08/21] Fix validation for external data paths for models
 loaded from bytes (#27430)

### Description
This PR fixes the validation of external data paths when ONNX models are
loaded from bytes (in-memory). Previously, when a model was loaded from
bytes without an explicit external data folder path being set, path
using ".." sequences were not properly validated, potentially allowing
access to arbitrary files on the filesystem.


### Motivation and Context
Address a security concern
---
 .../core/framework/tensorprotoutils.cc        | 15 ++++
 .../test/framework/tensorutils_test.cc        | 26 +++++-
 onnxruntime/test/shared_lib/test_inference.cc | 87 +++++++++++++++++++
 3 files changed, 125 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc
index 961012536126b..e4c7830ffbb55 100644
--- a/onnxruntime/core/framework/tensorprotoutils.cc
+++ b/onnxruntime/core/framework/tensorprotoutils.cc
@@ -371,6 +371,21 @@ Status ValidateExternalDataPath(const std::filesystem::path& base_dir,
                              "External data path: ", location, " (resolved path: ", resolved,
                              ") escapes model directory: ", base_dir);
     }
+  } else {
+    // The basedir is empty, which occurs when 1) the session loads a model from bytes and 2) the application does not
+    // set an external file folder path via the session config option
+    // `kOrtSessionOptionsModelExternalInitializersFileFolderPath`.
+
+    // We conservatively check that the normalized relative path does not contain ".." path components that would allow
+    // access to arbitrary files outside of the current working directory. Based on ONNX checker validation.
+    auto norm_location = location.lexically_normal();
+
+    for (const auto& path_component : norm_location) {
+      if (path_component == ORT_TSTR("..")) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "External data path: ", location,
+                               " (model loaded from bytes) escapes working directory");
+      }
+    }
   }
   return Status::OK();
 }
diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc
index 0d7b583faf27b..c9b61a7a39632 100644
--- a/onnxruntime/test/framework/tensorutils_test.cc
+++ b/onnxruntime/test/framework/tensorutils_test.cc
@@ -530,9 +530,6 @@ TEST_F(PathValidationTest, ValidateExternalDataPath) {
   // Valid relative path.
   ASSERT_STATUS_OK(utils::ValidateExternalDataPath(base_dir_, "data.bin"));
 
-  // Empty base directory.
-  ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "data.bin"));
-
   // Empty location.
   // Only validate it is not an absolute path.
   ASSERT_TRUE(utils::ValidateExternalDataPath(base_dir_, "").IsOK());
@@ -555,6 +552,29 @@ TEST_F(PathValidationTest, ValidateExternalDataPath) {
 
   // Base directory does not exist.
   ASSERT_STATUS_OK(utils::ValidateExternalDataPath("non_existent_dir", "data.bin"));
+
+  //
+  // Tests for an empty base directory.
+  // The base directory would be empty when 1) the session loads a model from bytes and 2) the application does not
+  // set an external file folder path via the session config option
+  // kOrtSessionOptionsModelExternalInitializersFileFolderPath.
+  //
+
+  // A simple filename is ok (would not escape current working directory).
+  ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "data.bin"));
+  ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "./data.bin"));
+
+  // A ".." that is not a path component (part of the filename) is ok
+  ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "data..bin"));
+
+  // A path that would escape the current working directory is invalid.
+  ASSERT_FALSE(utils::ValidateExternalDataPath("", "../data.bin").IsOK());
+
+  // A path that uses ".." but would not escape the current working directory should be fine.
+  ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "a/../data.bin"));
+
+  // A path with multiple internal ".." that would escape current working direction should fail.
+  ASSERT_FALSE(utils::ValidateExternalDataPath("", "a/../../data.bin").IsOK());
 }
 
 TEST_F(PathValidationTest, ValidateExternalDataPathWithSymlinkInside) {
diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc
index 4e991716dd108..e472cbcee12d6 100644
--- a/onnxruntime/test/shared_lib/test_inference.cc
+++ b/onnxruntime/test/shared_lib/test_inference.cc
@@ -4918,6 +4918,93 @@ TEST(CApiTest, ModelWithExternalDataOutsideModelDirectoryShouldFailToLoad) {
       << "Exception message should indicate external data or security issue. Got: " << exception_message;
 }
 
+TEST(CApiTest, InMemoryModel_ExternalDataOutsideWorkingDirectory_FailToLoad) {
+  // Attempt to create an ORT session with the malicious model (loaded from bytes).
+  // This should fail due to the use of an external file path that is not under current working directory.
+  // i.e. ../../../../etc/passwd
+  constexpr const ORTCHAR_T* model_path = TSTR("testdata/test_arbitrary_external_file.onnx");
+
+  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
+  Ort::SessionOptions session_options;
+
+  // Load model contents into array
+  std::ifstream model_file_stream(model_path, std::ios::in | std::ios::binary);
+  ASSERT_TRUE(model_file_stream.good());
+  model_file_stream.seekg(0, std::ios::end);
+  const auto file_contents_size = onnxruntime::narrow<size_t>(model_file_stream.tellg());
+  model_file_stream.seekg(0, std::ios::beg);
+  std::vector<char> file_contents(file_contents_size, 0);
+  model_file_stream.read(&file_contents[0], file_contents_size);
+  model_file_stream.close();
+
+  bool exception_thrown = false;
+  std::string exception_message;
+
+  try {
+    // This should throw an exception due to malicious external data
+    Ort::Session session(env, file_contents.data(), file_contents_size, session_options);
+  } catch (const Ort::Exception& e) {
+    exception_thrown = true;
+    exception_message = e.what();
+  } catch (const std::exception& e) {
+    exception_thrown = true;
+    exception_message = e.what();
+  }
+
+  // Verify that loading the model failed
+  EXPECT_TRUE(exception_thrown) << "Expected model loading to fail due to malicious external data path";
+
+  // Verify that the exception message indicates security or external data issues
+  EXPECT_TRUE(exception_message.find("External data path") != std::string::npos &&
+              exception_message.find("escapes working directory") != std::string::npos)
+      << "Exception message should indicate external data or security issue. Got: " << exception_message;
+}
+
+TEST(CApiTest, InMemoryModel_SessionConfigExternalFileFolder_ExternalDataOutsideModelDirectory_FailToLoad) {
+  // Attempt to create an ORT session with the malicious model (loaded from bytes).
+  // A valid external file folder path is explicitly set via session options.
+  // However, this should still fail due to the use of an external file path that escapes the set directory.
+  // i.e. ../../../../etc/passwd
+  constexpr const ORTCHAR_T* model_path = TSTR("testdata/test_arbitrary_external_file.onnx");
+
+  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test");
+  Ort::SessionOptions session_options;
+  session_options.AddConfigEntry(kOrtSessionOptionsModelExternalInitializersFileFolderPath, "testdata");
+
+  // Load model contents into array
+  std::ifstream model_file_stream(model_path, std::ios::in | std::ios::binary);
+  ASSERT_TRUE(model_file_stream.good());
+  model_file_stream.seekg(0, std::ios::end);
+  const auto file_contents_size = onnxruntime::narrow<size_t>(model_file_stream.tellg());
+  model_file_stream.seekg(0, std::ios::beg);
+  std::vector<char> file_contents(file_contents_size, 0);
+  model_file_stream.read(&file_contents[0], file_contents_size);
+  model_file_stream.close();
+
+  bool exception_thrown = false;
+  std::string exception_message;
+
+  try {
+    // This should throw an exception due to malicious external data
+    Ort::Session session(env, file_contents.data(), file_contents_size, session_options);
+  } catch (const Ort::Exception& e) {
+    exception_thrown = true;
+    exception_message = e.what();
+  } catch (const std::exception& e) {
+    exception_thrown = true;
+    exception_message = e.what();
+  }
+
+  // Verify that loading the model failed
+  EXPECT_TRUE(exception_thrown) << "Expected model loading to fail due to malicious external data path";
+
+  // Verify that the exception message indicates security or external data issues
+  EXPECT_TRUE(exception_message.find("External data path") != std::string::npos &&
+              exception_message.find("escapes both model directory") != std::string::npos &&
+              exception_message.find("and real model directory") != std::string::npos)
+      << "Exception message should indicate external data or security issue. Got: " << exception_message;
+}
+
 #ifdef ORT_ENABLE_STREAM
 #if USE_CUDA
 

From 2dffaaba0463a64ae76869624a79157df56e3b93 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Tue, 24 Feb 2026 15:13:44 -0800
Subject: [PATCH 09/21] Enable Python 3.14 CI and Upgrade Dependencies (#27401)

This pull request enables Python 3.14 testing in the CI pipelines and
upgrades several key dependencies to support the new Python version.

Previously, python 3.14 CI was not enabled since some dependent packages
not support python 3.14 at that time. Now it is the time to upgrade
them.
Key Python dependencies have been updated to versions that support
Python 3.14. The conditional version checks (based on `python_version`)
have been removed in favor of these updated versions across all
environments:
- **pybind**: Upgraded to `3.0.2`.
- **numpy**: Upgraded to `2.4.2`.
- **onnxscript**: Upgraded to `0.6.2`.
- **onnx-ir**: Upgraded to `0.1.16`.
- **onnx**: Standardized on `1.20.1`.
- **torch**: Upgraded to `2.10.0`.
- **triton**: Upgraded to `3.5.0`.
These updates affect multiple `requirements.txt` files across Linux and
Windows Docker images and build stages.
- Use `dynamo=False` for onnx export in failed python tests since
PyTorch 2.10 changed `dynamo=True` as default, which broke a few test
cases.
The conditional logic that previously skipped Python 3.14 tests has been
removed from the Azure Pipelines configuration.
- **Python 3.14 Tests Enabled**: Removed `condition: and(succeeded(),
ne('${{ parameters.PYTHON_VERSION }}', '3.14'))` from
`py-win-webgpu-stage.yml`.
- **Test Execution Flow**: Updated `py-win-cpu.yml` to remove the
restriction that prevented `onnxruntime` tests and
`onnx_backend_test_series.py` from running on Python 3.14.
https://github.com/microsoft/onnxruntime/issues/27392
---
 .github/actions/macos-ci-setup/action.yml     |  2 +-
 .github/workflows/linux_ci.yml                | 17 ++++++++++-
 .github/workflows/mac.yml                     |  6 +++-
 .../macos-ci-build-and-test-workflow.yml      |  2 +-
 .github/workflows/windows_cuda.yml            |  4 +--
 cmake/deps.txt                                |  2 +-
 cmake/external/pybind11.cmake                 |  3 +-
 cmake/vcpkg-ports/pybind11/portfile.cmake     |  3 +-
 cmake/vcpkg-ports/pybind11/vcpkg.json         |  2 +-
 .../tools/pytorch_export_contrib_ops.py       | 30 ++++++++++++-------
 .../tools/transformers/fusion_attention.py    |  6 ++--
 .../transformers/large_model_exporter.py      |  1 +
 .../transformers/models/gpt2/gpt2_helper.py   |  2 +-
 .../models/llama/convert_to_onnx.py           |  2 ++
 .../models/whisper/whisper_decoder.py         |  3 +-
 .../models/whisper/whisper_encoder.py         |  3 +-
 .../whisper/whisper_encoder_decoder_init.py   |  3 +-
 .../transformers/torch_onnx_export_helper.py  |  1 +
 .../python/test_pytorch_export_contrib_ops.py | 15 ++++++----
 .../python/transformers/parity_utilities.py   |  1 +
 .../python/transformers/test_gelu_fusions.py  |  7 ++++-
 .../test_parity_huggingface_gpt_attention.py  |  1 +
 .../python/transformers/test_phi_vision.py    |  2 ++
 .../test/python/transformers/test_whisper.py  |  7 ++---
 tools/ci_build/build.py                       |  6 ++--
 .../stages/py-win-gpu-stage.yml               |  2 --
 .../stages/py-win-webgpu-stage.yml            |  2 --
 .../azure-pipelines/templates/py-win-cpu.yml  | 24 +++++++--------
 .../python/cpu/scripts/requirements.txt       |  6 ++--
 .../docker/scripts/lort/requirements.txt      |  6 ++--
 .../docker/scripts/manylinux/requirements.txt |  9 +++---
 .../linux/docker/scripts/requirements.txt     | 10 +++----
 .../github/linux/python/requirements.txt      | 10 +++----
 .../github/windows/python/requirements.txt    | 10 +++----
 .../transformers-test/requirements.txt        | 12 ++++----
 35 files changed, 130 insertions(+), 92 deletions(-)

diff --git a/.github/actions/macos-ci-setup/action.yml b/.github/actions/macos-ci-setup/action.yml
index 0d60eeae8aee3..054676d301820 100644
--- a/.github/actions/macos-ci-setup/action.yml
+++ b/.github/actions/macos-ci-setup/action.yml
@@ -8,7 +8,7 @@ inputs:
   python_version:
     required: false
     type: string
-    default: "3.11"
+    default: "3.14"
   node_version:
     required: false
     type: string
diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml
index 9aa8418c55a40..dd8cbfdc71a9c 100644
--- a/.github/workflows/linux_ci.yml
+++ b/.github/workflows/linux_ci.yml
@@ -68,6 +68,21 @@ jobs:
     secrets:
       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
+  build-linux-x64-release-py314:
+    name: Build Linux x64 Release (Python 3.14)
+    uses: ./.github/workflows/reusable_linux_build.yml
+    with:
+      pool_name: "onnxruntime-github-Ubuntu2204-AMD-CPU"
+      build_config: Release
+      architecture: x64
+      dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu
+      docker_image_repo: onnxruntimecpubuildpythonx64
+      extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --build_nuget --enable_transformers_tool_test --cmake_extra_defines onnxruntime_BUILD_BENCHMARKS=ON'
+      python_path_prefix: 'PATH=/opt/python/cp314-cp314/bin:$PATH' # $ needs escaping in single quotes
+      job_identifier: build-linux-x64-release-py314
+    secrets:
+      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
   orttraining-linux-ci-pipeline:
     name: Build Linux x64 Release with training
     uses: ./.github/workflows/reusable_linux_build.yml
@@ -109,7 +124,7 @@ jobs:
       dockerfile_path: tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
       docker_image_repo: onnxruntimecpubuildpythonaarch64
       extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cmake_extra_defines onnxruntime_BUILD_BENCHMARKS=ON'
-      python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' # $ needs escaping in single quotes
+      python_path_prefix: 'PATH=/opt/python/cp314-cp314/bin:$PATH' # $ needs escaping in single quotes
       job_identifier: build-linux-arm64-release
     secrets:
       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 32e126c34ac39..0f8b4a42f48ae 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -16,7 +16,7 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  python_version: 3.11
+  python_version: "3.14"
 
 jobs:
   cpu:
@@ -28,6 +28,7 @@ jobs:
           {"machine": "arm64", "target": "arm64", "build_config": "Debug"},
           {"machine": "arm64", "target": "arm64", "build_config": "Release"}
         ]
+      python_version: "3.14"
 
   coreml:
     uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml
@@ -39,6 +40,7 @@ jobs:
           {"machine": "arm64", "target": "arm64", "build_config": "Debug"},
           {"machine": "arm64", "target": "arm64", "build_config": "Release"}
         ]
+      python_version: "3.14"
 
   xnnpack:
     uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml
@@ -49,6 +51,7 @@ jobs:
         [
           {"machine": "arm64", "target": "arm64", "build_config": "Debug"}
         ]
+      python_version: "3.14"
 
   webgpu:
     uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml
@@ -60,6 +63,7 @@ jobs:
           {"machine": "arm64", "target": "arm64", "build_config": "Debug"},
           {"machine": "arm64", "target": "arm64", "build_config": "Release"}
         ]
+      python_version: "3.14"
 
   iphone_simulator:
     runs-on: macos-15
diff --git a/.github/workflows/macos-ci-build-and-test-workflow.yml b/.github/workflows/macos-ci-build-and-test-workflow.yml
index 75002fdf12c00..76198c7f5c1ce 100644
--- a/.github/workflows/macos-ci-build-and-test-workflow.yml
+++ b/.github/workflows/macos-ci-build-and-test-workflow.yml
@@ -19,7 +19,7 @@ on:
       python_version:
         required: false
         type: string
-        default: "3.11"
+        default: "3.14"
       matrix_include:
         required: false
         type: string
diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml
index 89ae03981ecef..7b93086fbb77d 100644
--- a/.github/workflows/windows_cuda.yml
+++ b/.github/workflows/windows_cuda.yml
@@ -32,7 +32,7 @@ jobs:
 
       - uses: actions/setup-python@v6
         with:
-          python-version: '3.12'
+          python-version: '3.14'
           architecture: x64
 
       - name: Locate vcvarsall and Setup Env
@@ -173,7 +173,7 @@ jobs:
 
       - uses: actions/setup-python@v6
         with:
-          python-version: '3.12'
+          python-version: '3.14'
           architecture: x64
 
       - uses: actions/setup-node@v6
diff --git a/cmake/deps.txt b/cmake/deps.txt
index 578dd8fd23d09..65c74060e8deb 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -46,7 +46,7 @@ protoc_linux_aarch64;https://github.com/protocolbuffers/protobuf/releases/downlo
 protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-osx-universal_binary.zip;23710c3d1c2036d8d65a6a22234372fa2d7af9ef
 psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013
 pthreadpool;https://github.com/google/pthreadpool/archive/dcc9f28589066af0dbd4555579281230abbf74dd.zip;533a77943203ef15ca608bcd9dbe2c94da7451d2
-pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f780292da9db273c8ef06ccf5fd4b623624143e9
+pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v3.0.2.zip;a064e663b4d7a337ac291d1bef7337ef4e60a1ae
 pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/403d652dca4c1046e8145950b1c0997a9f748b57.zip;30b2a07fe4bae8574f89176e56274cacdd6d135b
 re2;https://github.com/google/re2/archive/refs/tags/2024-07-02.zip;646e1728269cde7fcef990bf4a8e87b047882e88
 safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac
diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake
index 79280c97a899e..ba14667bc3c88 100644
--- a/cmake/external/pybind11.cmake
+++ b/cmake/external/pybind11.cmake
@@ -6,7 +6,6 @@ onnxruntime_fetchcontent_declare(
     URL ${DEP_URL_pybind11}
     URL_HASH SHA1=${DEP_SHA1_pybind11}
     EXCLUDE_FROM_ALL
-    FIND_PACKAGE_ARGS 2.13 NAMES pybind11
+    FIND_PACKAGE_ARGS 3.0 NAMES pybind11
 )
 onnxruntime_fetchcontent_makeavailable(pybind11_project)
-
diff --git a/cmake/vcpkg-ports/pybind11/portfile.cmake b/cmake/vcpkg-ports/pybind11/portfile.cmake
index 2c63582d1ee15..4e4cd30a26df1 100644
--- a/cmake/vcpkg-ports/pybind11/portfile.cmake
+++ b/cmake/vcpkg-ports/pybind11/portfile.cmake
@@ -2,7 +2,8 @@ vcpkg_from_github(
     OUT_SOURCE_PATH SOURCE_PATH
     REPO pybind/pybind11
     REF "v${VERSION}"
-    SHA512 497c25b33b09a9c42f67131ab82e35d689e8ce089dd7639be997305ff9a6d502447b79c824508c455d559e61f0186335b54dd2771d903a7c1621833930622d1a
+    # SHA512 for the zip (not tar.gz) file.
+    SHA512 786b1bf534ac67a8d5669f8babf67bb13e48b3a3da1b6344e43ae10a84b80bbc8fea5f12a65fd18739c341fefef5622c5dc096db964dff33cc62ea4259b2e2c1
     HEAD_REF master
 )
 
diff --git a/cmake/vcpkg-ports/pybind11/vcpkg.json b/cmake/vcpkg-ports/pybind11/vcpkg.json
index a730d32017885..058e2235fea08 100644
--- a/cmake/vcpkg-ports/pybind11/vcpkg.json
+++ b/cmake/vcpkg-ports/pybind11/vcpkg.json
@@ -1,6 +1,6 @@
 {
   "name": "pybind11",
-  "version": "2.13.6",
+  "version": "3.0.2",
   "description": "pybind11 is a lightweight header-only library that exposes C++ types in Python and vice versa, mainly to create Python bindings of existing C++ code",
   "homepage": "https://github.com/pybind/pybind11",
   "license": "BSD-3-Clause",
diff --git a/onnxruntime/python/tools/pytorch_export_contrib_ops.py b/onnxruntime/python/tools/pytorch_export_contrib_ops.py
index 1c5e31af99d82..0bd75e5c92e4c 100644
--- a/onnxruntime/python/tools/pytorch_export_contrib_ops.py
+++ b/onnxruntime/python/tools/pytorch_export_contrib_ops.py
@@ -6,6 +6,7 @@
 PyTorch-ONNX exporter (torch.onnx.export).
 """
 
+import contextlib
 import typing
 
 try:
@@ -22,7 +23,7 @@
 _registered_ops: typing.AbstractSet[str] = set()
 
 
-def _reg(symbolic_fn: typing.Callable, namespace: str = ""):
+def _reg(symbolic_fn: typing.Callable, namespace: str = "aten"):
     name = f"{namespace}::{symbolic_fn.__name__}"
     torch.onnx.register_custom_op_symbolic(name, symbolic_fn, _OPSET_VERSION)
     _registered_ops.add(name)
@@ -49,13 +50,6 @@ def grid_sampler(g, input, grid, mode, padding_mode, align_corners):
         padding_mode_str = ["zeros", "border", "reflection"][padding_mode]
         align_corners = int(symbolic_helper._maybe_get_const(align_corners, "b"))
 
-        # From opset v13 onward, the output shape can be specified with
-        # (N, C, H, W) (N, H_out, W_out, 2) => (N, C, H_out, W_out)
-        # input_shape = input.type().sizes()
-        # gird_shape = grid.type().sizes()
-        # output_shape = input_shape[:2] + gird_shape[1:3]
-        # g.op(...).setType(input.type().with_sizes(output_shape))
-
         return g.op(
             "com.microsoft::GridSample",
             input,
@@ -71,15 +65,24 @@ def inverse(g, self):
         return g.op("com.microsoft::Inverse", self).setType(self.type())
 
     _reg(inverse)
+    torch.onnx.register_custom_op_symbolic("aten::linalg_inv", inverse, _OPSET_VERSION)
+    _registered_ops.add("aten::linalg_inv")
+
+    def gelu(g, self: torch._C.Value, approximate="none"):
+        # PyTorch can emit aten::gelu with or without the optional approximate arg.
+        if not isinstance(approximate, str):
+            approximate = symbolic_helper._maybe_get_const(approximate, "s")
 
-    @torch.onnx.symbolic_helper.parse_args("v", "s")
-    def gelu(g, self: torch._C.Value, approximate: str = "none"):
-        # Use microsoft::Gelu for performance if possible. It only supports approximate == "none"
+        # Use microsoft::Gelu for performance if possible. It only supports approximate == "none".
         if approximate == "none":
             return g.op("com.microsoft::Gelu", self).setType(self.type())
         return torch.onnx.symbolic_opset9.gelu(g, self, approximate)
 
     _reg(gelu)
+    # Some PyTorch versions dispatch GELU symbolic lookup by exporter opset.
+    # Registering across stable opsets keeps ORT Gelu fusion consistently enabled.
+    for opset in range(9, 21):
+        torch.onnx.register_custom_op_symbolic("aten::gelu", gelu, opset)
 
     def triu(g, self, diagonal):
         return g.op("com.microsoft::Trilu", self, diagonal, upper_i=1).setType(self.type())
@@ -127,3 +130,8 @@ def unregister():
             for version in symbolic_helper._onnx_stable_opsets:
                 if version >= _OPSET_VERSION and symbolic_registry.is_registered_op(kind, namespace, version):
                     del symbolic_registry._registry[(namespace, version)][kind]
+
+    # Also clean up gelu's multi-opset registrations (see register()).
+    for opset in range(9, 21):
+        with contextlib.suppress(Exception):
+            torch.onnx.unregister_custom_op_symbolic("aten::gelu", opset)
diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py
index 08f8691d8b2b5..de7f0a044c118 100644
--- a/onnxruntime/python/tools/transformers/fusion_attention.py
+++ b/onnxruntime/python/tools/transformers/fusion_attention.py
@@ -1112,11 +1112,11 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node):
             if (
                 (mul_val is None)
                 or not (isinstance(mul_val, np.ndarray) and mul_val.size == 1)
-                or (float(mul_val) >= 0)
+                or (mul_val.item() >= 0)
             ):
                 return
-            if float(mul_val) != -10000:
-                self.mask_filter_value = float(mul_val)
+            if mul_val.item() != -10000:
+                self.mask_filter_value = mul_val.item()
 
         if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_k.input[0] == root_input:
             mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) if not is_no_mask_attention else None
diff --git a/onnxruntime/python/tools/transformers/large_model_exporter.py b/onnxruntime/python/tools/transformers/large_model_exporter.py
index 29829a6c475d9..f4d9e28d4ecb2 100644
--- a/onnxruntime/python/tools/transformers/large_model_exporter.py
+++ b/onnxruntime/python/tools/transformers/large_model_exporter.py
@@ -290,6 +290,7 @@ def do_export_internal(model: nn.Module, onnx_io_tuple: tuple, onnx_inputs: tupl
             input_names=onnx_inp_names,
             output_names=onnx_out_names,
             dynamic_axes=onnx_dynamic_axes,
+            dynamo=False,
         )
 
         onnx_path.unlink(missing_ok=True)
diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py
index b405c19b04689..0b86d5f038cd8 100644
--- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py
+++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py
@@ -473,7 +473,7 @@ def export_onnx(
                     input_names=input_names,
                     output_names=output_names,
                     dynamic_axes=dynamic_axes,
-                    opset_version=11,
+                    opset_version=14,
                     do_constant_folding=True,
                     use_external_data_format=True,
                     verbose=verbose,
diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py
index 2cb6a733c5bc7..17a4ef58914d6 100644
--- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py
+++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py
@@ -235,6 +235,7 @@ def run_torchscript_separate_export(
         opset_version=torch_export_onnx_opset_version,
         do_constant_folding=True,
         verbose=args.verbose,
+        dynamo=False,
     )
 
     # Check decoder_model.onnx and save all external data to one file
@@ -294,6 +295,7 @@ def run_torchscript_separate_export(
         opset_version=torch_export_onnx_opset_version,
         do_constant_folding=True,
         verbose=args.verbose,
+        dynamo=False,
     )
 
     # Check decoder_with_past_model.onnx and save all external data to one file
diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py
index e10e616d35d38..31fb60f86faf1 100644
--- a/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py
+++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py
@@ -391,8 +391,9 @@ def export_onnx(
                 input_names=input_names,
                 output_names=output_names,
                 dynamic_axes=dynamic_axes,
-                opset_version=17,
+                opset_version=18,
                 do_constant_folding=True,
+                dynamo=False,
                 verbose=verbose,
             )
 
diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py
index 851f641442016..48d4e12a38a43 100644
--- a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py
+++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py
@@ -110,8 +110,9 @@ def export_onnx(
                 input_names=input_names,
                 output_names=output_names,
                 dynamic_axes=dynamic_axes,
-                opset_version=17,
+                opset_version=18,
                 do_constant_folding=True,
+                dynamo=False,
                 verbose=verbose,
             )
 
diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py
index cd81edc1001be..35ec59b2bca69 100644
--- a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py
+++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py
@@ -293,8 +293,9 @@ def export_onnx(
                 input_names=input_names,
                 output_names=output_names,
                 dynamic_axes=dynamic_axes,
-                opset_version=17,
+                opset_version=18,
                 do_constant_folding=True,
+                dynamo=False,
                 verbose=verbose,
             )
 
diff --git a/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py
index 66f24c47f6cdb..a8c2ad1967acb 100644
--- a/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py
+++ b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py
@@ -49,6 +49,7 @@ def torch_onnx_export(
             keep_initializers_as_inputs=keep_initializers_as_inputs,
             custom_opsets=custom_opsets,
             export_modules_as_functions=export_modules_as_functions,
+            dynamo=False,
         )
     else:
         torch.onnx.export(
diff --git a/onnxruntime/test/python/test_pytorch_export_contrib_ops.py b/onnxruntime/test/python/test_pytorch_export_contrib_ops.py
index e7ea83dd00297..afefc4e616a87 100644
--- a/onnxruntime/test/python/test_pytorch_export_contrib_ops.py
+++ b/onnxruntime/test/python/test_pytorch_export_contrib_ops.py
@@ -59,6 +59,9 @@ def setUp(self):
         torch.manual_seed(0)
         pytorch_export_contrib_ops.register()
 
+    def tearDown(self):
+        pytorch_export_contrib_ops.unregister()
+
     def run_test(
         self,
         model,
@@ -101,6 +104,7 @@ def run_test(
                 input_names=input_names,
                 output_names=output_names,
                 custom_opsets=custom_opsets,
+                dynamo=False,
             )
 
             # compute onnxruntime output prediction
@@ -143,12 +147,13 @@ def test_gelu_is_fused_by_default(self):
             f,
             opset_version=self.opset_version,
             custom_opsets={"com.microsoft": 1},
+            dynamo=False,
         )
         f.seek(0)
         onnx_model = onnx.load(f)
-        node = onnx_model.graph.node[0]
-        self.assertEqual(node.op_type, "Gelu")
-        self.assertEqual(node.domain, "com.microsoft")
+        # Default GELU should be mapped to ORT contrib Gelu for performance.
+        gelu_nodes = [n for n in onnx_model.graph.node if n.op_type == "Gelu" and n.domain == "com.microsoft"]
+        self.assertEqual(len(gelu_nodes), 1)
 
     @parameterized.parameterized.expand([("default_approximate", "none"), ("tanh_approximate", "tanh")])
     @unittest.skipIf(_torch_version_lower_than("1.12"), "Gelu's approximate parameter unsupported in PyTorch < 1.12")
@@ -230,8 +235,8 @@ def forward(self, input):
 # IR version 4 style export.
 ONNXExporterTest_opset9_IRv4 = type(
     "TestONNXRuntime_opset9_IRv4",
-    (unittest.TestCase,),
-    dict(ONNXExporterTest.__dict__, keep_initializers_as_inputs=False),
+    (ONNXExporterTest,),
+    dict(keep_initializers_as_inputs=False),
 )
 
 
diff --git a/onnxruntime/test/python/transformers/parity_utilities.py b/onnxruntime/test/python/transformers/parity_utilities.py
index fa16f0e67a523..04a1ed06773e7 100644
--- a/onnxruntime/test/python/transformers/parity_utilities.py
+++ b/onnxruntime/test/python/transformers/parity_utilities.py
@@ -92,6 +92,7 @@ def export_onnx(model, onnx_model_path, float16, hidden_size, device):
         dynamic_axes=dynamic_axes,
         opset_version=11,
         do_constant_folding=True,
+        dynamo=False,
     )
     print("exported:", onnx_model_path)
 
diff --git a/onnxruntime/test/python/transformers/test_gelu_fusions.py b/onnxruntime/test/python/transformers/test_gelu_fusions.py
index 11ae1401ff8ed..a63e2653f2fbc 100644
--- a/onnxruntime/test/python/transformers/test_gelu_fusions.py
+++ b/onnxruntime/test/python/transformers/test_gelu_fusions.py
@@ -75,17 +75,22 @@ def test_fusions(self, test_case, dynamo):
         dummy_input = torch.ones(3, dtype=torch.float32)
         test_name = f"{operator}_{source}"
         onnx_path = f"{test_name}.onnx"
+
+        # For Torch 2.10+, torch.nn.functional.gelu(approximate="tanh") exports as Gelu node.
+        # So we force opset_version=18 here.
         torch.onnx.export(
             model,
             (dummy_input,),
             onnx_path,
             input_names=["input"],
             output_names=["output"],
-            dynamo=dynamo,
+            opset_version=18,
+            dynamo=False,
             optimize=True,  # Only meaningful when dynamo is True
         )
         optimizer = optimize_model(onnx_path, "bert")
         # optimizer.save_model_to_file(f"{operator}_{source}_opt.onnx")
+
         os.remove(onnx_path)
         # Remove the associated .data file (dynamo)
         data_path = onnx_path + ".data"
diff --git a/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py b/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py
index 444d86da75ba6..c07eb39e6df75 100644
--- a/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py
+++ b/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py
@@ -253,6 +253,7 @@ def export_onnx(model, onnx_model_path, float16, hidden_size, num_attention_head
         dynamic_axes=dynamic_axes,
         opset_version=11,
         do_constant_folding=True,
+        dynamo=False,
     )
     print("exported:", onnx_model_path)
 
diff --git a/onnxruntime/test/python/transformers/test_phi_vision.py b/onnxruntime/test/python/transformers/test_phi_vision.py
index d276366706af9..5a5fa926eb255 100644
--- a/onnxruntime/test/python/transformers/test_phi_vision.py
+++ b/onnxruntime/test/python/transformers/test_phi_vision.py
@@ -208,6 +208,7 @@ def export(self, model, inputs):
                     "input": {0: "batch", 1: "seq"},
                     "attention_mask": {0: "batch", 2: "seq", 3: "seq"},
                 },
+                dynamo=False,
             )
         else:
             torch.onnx.export(
@@ -217,6 +218,7 @@ def export(self, model, inputs):
                 export_params=True,
                 opset_version=14,
                 do_constant_folding=True,
+                dynamo=False,
             )
 
     def tearDown(self):
diff --git a/onnxruntime/test/python/transformers/test_whisper.py b/onnxruntime/test/python/transformers/test_whisper.py
index e3ca8e6b6ac9c..e90a14f8d7d61 100644
--- a/onnxruntime/test/python/transformers/test_whisper.py
+++ b/onnxruntime/test/python/transformers/test_whisper.py
@@ -471,8 +471,9 @@ def export(self, model, inputs, input_names, output_names, dynamic_axes):
             input_names=input_names,
             output_names=output_names,
             dynamic_axes=dynamic_axes,
-            opset_version=17,
+            opset_version=18,
             do_constant_folding=True,
+            dynamo=False,
             verbose=False,
         )
 
@@ -530,9 +531,7 @@ def test_hf_whisper_encoder_self_attention(self, precision, ep):
             use_gpu=True,
             only_onnxruntime=False,
         )
-        name = f"hf_{precision}_encoder_self_attention.onnx"
-        # optimized_model.save_model_to_file(name)  # Uncomment for debugging purposes
-        self.verify_fusion(optimized_model, name)
+        self.verify_fusion(optimized_model, f"hf_{precision}_encoder_self_attention.onnx")
 
     @parameterized.expand(
         [
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index c8beeef8aa509..de64183e1bb18 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -1761,7 +1761,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs):
                 # Install cpu only version of torch when cuda is not enabled in Linux.
                 extra = [] if args.use_cuda and is_linux() else ["--index-url", "https://download.pytorch.org/whl/cpu"]
                 run_subprocess(
-                    [sys.executable, "-m", "pip", "install", "torch==2.8.0", "torchvision==0.23.0", *extra],
+                    [sys.executable, "-m", "pip", "install", "torch==2.10.0", "torchvision==0.25.0", *extra],
                     cwd=cwd,
                     dll_path=dll_path,
                     python_path=python_path,
@@ -1838,11 +1838,9 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs):
                         [sys.executable, "-m", "unittest", "discover", "-s", "quantization"], cwd=cwd, dll_path=dll_path
                     )
 
-                    # onnx package does not support python 3.14 yet so skip the transformers tests for python 3.14.
-                    # we can remove this check when onnx package supports python 3.14.
                     if args.enable_transformers_tool_test and (sys.version_info.major, sys.version_info.minor) < (
                         3,
-                        14,
+                        15,
                     ):
                         import google.protobuf  # noqa: PLC0415
                         import numpy  # noqa: PLC0415
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
index 7e47227c23d5b..385cee35eb95d 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
@@ -175,8 +175,6 @@ stages:
 
   - stage: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests
     dependsOn: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Build
-    # Skip this stage for Python 3.14 for now until onnx package support python 3.14.
-    condition: and(succeeded(), ne('${{ parameters.PYTHON_VERSION }}', '3.14'))
     jobs:
     - job: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests
       workspace:
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-webgpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-webgpu-stage.yml
index 8bd8521d80104..1897d94db76c7 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-win-webgpu-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-win-webgpu-stage.yml
@@ -131,8 +131,6 @@ stages:
 
   - stage: Win_py_webgpu_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests
     dependsOn: Win_py_webgpu_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Build
-    # Skip this stage for Python 3.14 for now until onnx package support python 3.14.
-    condition: and(succeeded(), ne('${{ parameters.PYTHON_VERSION }}', '3.14'))
     jobs:
     - job: Win_py_webgpu_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests
       workspace:
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml
index 09603f2350657..326cfd7829f2f 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml
@@ -149,20 +149,18 @@ jobs:
 
 
   - powershell: |
-      if ("$(PythonVersion)" -notcontains "3.14") {
-        python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq
-        Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate}
-        Remove-Item -Recurse -Force onnxruntime
-        if ("$(ExtraParam)".Split() -contains "--use_azure") {
-
-          if( "${{parameters.architecture}}" -eq 'arm64') {
-            $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\arm64-windows\bin;$env:path"
-          }  else {
-            $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x64-windows\bin;$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x86-windows\bin;$env:path"
-          }
-          python onnxruntime_test_python_azure.py
+      python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq
+      Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate}
+      Remove-Item -Recurse -Force onnxruntime
+      if ("$(ExtraParam)".Split() -contains "--use_azure") {
+
+        if( "${{parameters.architecture}}" -eq 'arm64') {
+          $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\arm64-windows\bin;$env:path"
+        }  else {
+          $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x64-windows\bin;$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x86-windows\bin;$env:path"
         }
-        python onnx_backend_test_series.py
+        python onnxruntime_test_python_azure.py
       }
+      python onnx_backend_test_series.py
     workingDirectory: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}'
     displayName: 'Run Python Tests'
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt
index 42bee7a892b11..7e2b6e74cfdde 100644
--- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt
@@ -1,5 +1,5 @@
-numpy==2.2.6; python_version < "3.14"
-numpy==2.3.2; python_version >= "3.14"
+numpy==2.2.6; python_version < "3.11"
+numpy==2.4.2; python_version >= "3.11"
 mypy
 pytest
 setuptools>=68.2.2
@@ -7,4 +7,4 @@ wheel
 protobuf==4.25.8
 sympy==1.14
 flatbuffers
-onnx==1.20.1; python_version < "3.14"
+onnx==1.20.1
diff --git a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt
index c5fc16837e093..63a8e96d8c128 100644
--- a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt
@@ -3,13 +3,13 @@ beartype==0.15.0
 flatbuffers
 cerberus
 h5py
-onnx==1.20.1; python_version < "3.14"
+onnx==1.20.1
 # Python dependencies required for pytorch development
 astunparse
 expecttest!=0.2.0
 hypothesis
-numpy==2.2.6; python_version < "3.14"
-numpy==2.3.2; python_version >= "3.14"
+numpy==2.2.6; python_version < "3.11"
+numpy==2.4.2; python_version >= "3.11"
 psutil
 pyyaml
 requests
diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
index 8f5d0776501c0..ffcad5ee67208 100644
--- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
@@ -1,5 +1,5 @@
-numpy==2.2.6; python_version < "3.14"
-numpy==2.3.2; python_version >= "3.14"
+numpy==2.2.6; python_version < "3.11"
+numpy==2.4.2; python_version >= "3.11"
 mypy
 pytest
 setuptools>=68.2.2
@@ -8,6 +8,5 @@ protobuf==6.33.0
 sympy==1.14
 flatbuffers
 neural-compressor>=2.2.1
-triton==3.2.0; python_version < "3.14"
-triton==3.5.0; python_version >= "3.14"
-onnx==1.20.1; python_version < "3.14"
+triton==3.5.0
+onnx==1.20.1
diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt
index 85a9c6391af80..ad57cc715589b 100644
--- a/tools/ci_build/github/linux/docker/scripts/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt
@@ -1,6 +1,6 @@
 cerberus
-numpy==2.2.6; python_version < "3.14"
-numpy==2.3.2; python_version >= "3.14"
+numpy==2.2.6; python_version < "3.11"
+numpy==2.4.2; python_version >= "3.11"
 mypy
 pytest
 setuptools==78.1.1
@@ -10,6 +10,6 @@ sympy==1.14
 flatbuffers
 protobuf==6.33.0
 packaging
-onnxscript==0.5.3; python_version < "3.14"
-onnx-ir==0.1.10; python_version < "3.14"
-onnx==1.20.1; python_version < "3.14"
+onnxscript==0.6.2
+onnx-ir==0.1.16
+onnx==1.20.1
diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt
index 6a474973d4f0c..d95e44bb3a280 100644
--- a/tools/ci_build/github/linux/python/requirements.txt
+++ b/tools/ci_build/github/linux/python/requirements.txt
@@ -1,5 +1,5 @@
-numpy==2.2.6; python_version < "3.14"
-numpy==2.3.2; python_version >= "3.14"
+numpy==2.2.6; python_version < "3.11"
+numpy==2.4.2; python_version >= "3.11"
 mypy
 pytest
 setuptools>=68.2.2
@@ -8,8 +8,8 @@ protobuf==6.33.0
 sympy==1.14
 flatbuffers
 psutil
-onnxscript==0.5.3; python_version < "3.14"
-onnx-ir==0.1.10; python_version < "3.14"
+onnxscript==0.6.2
+onnx-ir==0.1.16
 jinja2
 markupsafe
-onnx==1.20.1; python_version < "3.14"
+onnx==1.20.1
diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt
index a86eef170bc25..83593ff47e453 100644
--- a/tools/ci_build/github/windows/python/requirements.txt
+++ b/tools/ci_build/github/windows/python/requirements.txt
@@ -1,5 +1,5 @@
-numpy==2.2.6; python_version < "3.14"
-numpy==2.3.2; python_version >= "3.14"
+numpy==2.2.6; python_version < "3.11"
+numpy==2.4.2; python_version >= "3.11"
 mypy
 pytest
 setuptools>=68.2.2
@@ -8,10 +8,10 @@ protobuf==6.33.0
 sympy==1.14
 flatbuffers
 psutil
-onnxscript==0.5.3; python_version < "3.14"
-onnx-ir==0.1.10; python_version < "3.14"
+onnxscript==0.6.2
+onnx-ir==0.1.16
 jinja2
 markupsafe
 semver
 packaging
-onnx==1.20.1; python_version < "3.14"
+onnx==1.20.1
diff --git a/tools/ci_build/requirements/transformers-test/requirements.txt b/tools/ci_build/requirements/transformers-test/requirements.txt
index 1523b420bfdbd..c764225dbc98d 100644
--- a/tools/ci_build/requirements/transformers-test/requirements.txt
+++ b/tools/ci_build/requirements/transformers-test/requirements.txt
@@ -2,14 +2,14 @@
 packaging
 # protobuf and numpy is same as tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
 protobuf==6.33.0
-numpy==2.2.6; python_version < "3.14"
-numpy==2.3.2; python_version >= "3.14"
-torch==2.8.0
-torchvision==0.23.0
+numpy==2.2.6; python_version < "3.11"
+numpy==2.4.2; python_version >= "3.11"
+torch==2.10.0
+torchvision==0.25.0
 transformers==4.52.1
 parameterized>=0.8.1
 sentencepiece
 psutil
 einops
-onnxscript==0.5.3; python_version < "3.14"
-onnx-ir==0.1.10; python_version < "3.14"
+onnxscript==0.6.2
+onnx-ir==0.1.16

From bb0fe86babe345b55b5593c80b9cd98600f8103e Mon Sep 17 00:00:00 2001
From: Lukas Folle <126877803+lukas-folle-snkeos@users.noreply.github.com>
Date: Wed, 25 Feb 2026 01:45:33 +0100
Subject: [PATCH 10/21] fix: out of bounds access for resize operation (#27419)

### Description
This PR fixes:
* An out-of-bounds write in CUDA Resize for LINEAR mode when running
trilinear paths (3D/5D)
* A race condition for the reduction kernel

### Root cause
1. The temporary dims-mapping buffer for LINEAR mode was sized using
only H+W, while the trilinear coordinate mapping kernel writes D+H+W
entries.
2. shared-memory race in the block-level reduction loop inside
[reduction_functions.cu](vscode-file://vscode-app/c:/Users/lukas.folle/AppData/Local/Programs/Microsoft%20VS%20Code/072586267e/resources/app/out/vs/code/electron-browser/workbench/workbench.html).
The condition allowed threads outside the active lower half to update
shared memory in the same stride phase, creating overlapping read/write
hazards

My colleague @korbinian-mechlem-snkeos noticed this warning from
compute-sanitzer
> ========= Invalid __global__ write of size 4 bytes
========= at void
onnxruntime::cuda::_ResizeTrilinearCoordinateMapping<float,
onnxruntime::cuda::TransformCoordinate_HALF_PIXEL>(long long, long long,
long long, long long, long long, long long, float, float, float, float,
float, float, float, float, float, unsigned long long, bool, const T2 &,
onnxruntime::cuda::LinearMappingInfo *)+0x400
=========     by thread (17,0,0) in block (2,0,0)
=========     Address 0xb28fff7cc is out of bounds
========= and is 205 bytes after the nearest allocation at 0xb28fff400
of size 768 bytes
========= Saved host backtrace up to driver entry point at kernel launch
time

AND

> ========= Warning: Race reported between Read access at void
onnxruntime::cuda::detail::reduce_matrix_columns_kernel<float, float,
float, onnxruntime::cuda::Identity, onnxruntime::cuda::Identity,
(bool)0>(int, int, const T1 *, T2 *, T3 *, int *)+0xe80
========= and Write access at void
onnxruntime::cuda::detail::reduce_matrix_columns_kernel<float, float,
float, onnxruntime::cuda::Identity, onnxruntime::cuda::Identity,
(bool)0>(int, int, const T1 *, T2 *, T3 *, int *)+0xea0 [337920 hazards]

### Motivation and Context
Update LINEAR buffer size calculation to:
* use H+W for bilinear (2D/4D)
* use D+H+W for trilinear (3D/5D)

Prevents invalid global writes and intermittent CUDA memory errors in
trilinear resize workloads.

@johannes-rehm-snkeos
---
 .../cuda/reduction/reduction_functions.cu     |  2 +-
 .../core/providers/cuda/tensor/resize_impl.cu |  7 ++++
 .../providers/cpu/tensor/resize_op_test.cc    | 32 ++++++++++++++++++
 .../test_cases/reduction_functions_test.cc    | 33 +++++++++++++++++++
 4 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu
index 51c80d272bb96..62801c8da1e5f 100644
--- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu
+++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu
@@ -209,7 +209,7 @@ __device__ void reduce_all(
   // the size of shared_memory equals to the number of warps.
 #pragma unroll
   for (int stride = MAX_NUM_WARPS_PER_BLOCK / 2; stride > 0; stride /= 2) {
-    if (tid_in_block + stride < num_warps_in_block) {
+    if (tid_in_block < stride && tid_in_block + stride < num_warps_in_block) {
       shared_memory[tid_in_block] += shared_memory[tid_in_block + stride];
     }
     __syncthreads();
diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu
index a96d4c82a7fdc..963fa020d033a 100644
--- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu
+++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu
@@ -585,6 +585,13 @@ size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode,
                  static_cast<size_t>(std::accumulate(output_dims.begin(),
                                                      output_dims.end(), (int64_t)0));
     case UpsampleMode::LINEAR:
+      // For LINEAR mode:
+      // - bilinear (2-D/4-D) uses mapping for [H, W]
+      // - trilinear (3-D/5-D) uses mapping for [D, H, W]
+      if (output_dims.size() == 3 || output_dims.size() == 5) {
+        return sizeof(LinearMappingInfo) *
+               static_cast<size_t>(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 3, (int64_t)0));
+      }
       return sizeof(LinearMappingInfo) *
              static_cast<size_t>(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0));
     case UpsampleMode::CUBIC:
diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
index 200a1aded8204..8fd994baec713 100644
--- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
@@ -1015,6 +1015,38 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_5DTrilinear_pytorch_half_pixel) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: results mismatch
 }
 
+TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_5DTrilinear_CudaRegression) {
+  auto cuda_ep = DefaultCudaExecutionProvider();
+  if (!cuda_ep) {
+    GTEST_SKIP() << "CUDA EP not available";
+  }
+
+  OpTester test("Resize", 13);
+  std::vector<float> roi{};
+  std::vector<float> scales{1.0f, 1.0f, 2.0f, 2.0f, 2.0f};
+
+  test.AddAttribute("mode", "linear");
+  test.AddAttribute("coordinate_transformation_mode", "pytorch_half_pixel");
+
+  constexpr int64_t N = 1, C = 1, D = 3, H = 4, W = 5;
+  std::vector<float> X(static_cast<size_t>(N * C * D * H * W), 1.0f);
+
+  test.AddInput<float>("X", {N, C, D, H, W}, X);
+  test.AddInput<float>("roi", {0}, roi);
+  test.AddInput<float>("scales", {5}, scales);
+
+  constexpr int64_t out_D = D * 2;
+  constexpr int64_t out_H = H * 2;
+  constexpr int64_t out_W = W * 2;
+  std::vector<float> Y(static_cast<size_t>(N * C * out_D * out_H * out_W), 1.0f);
+
+  test.AddOutput<float>("Y", {N, C, out_D, out_H, out_W}, Y);
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(std::move(cuda_ep));
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
+
 TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest) {
   // To test NNAPI EP, we need the scales/sizes to be in initializers
   auto run_test = [](bool scales_in_initializer) {
diff --git a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
index ec7e98528504e..593255b9e9c23 100644
--- a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
@@ -177,6 +177,35 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e
 
   CheckDeviceValues(m, d_out.get(), expected_column.data(), relative_error_tolerance);
 }
+
+void TestReduceColumnsToColumnRepeated(int m, int n, int iterations, float relative_error_tolerance = 1e-4f) {
+  SCOPED_TRACE(MakeString("m: ", m, ", n:", n, ", iterations: ", iterations));
+
+  const TensorShape shape{m, n};
+  RandomValueGenerator random{};
+  const auto values = random.Uniform<float>(shape.GetDims(), 1.0f, 10.0f);
+  const auto expected_column = ExpectedReduceMatrixColumnsOutput(m, n, values);
+
+  auto d_in = AllocateDeviceMemory<float>(m * n);
+  auto d_out = AllocateDeviceMemory<float>(m);
+
+  cudaMemcpy(d_in.get(), values.data(), m * n * sizeof(float), cudaMemcpyHostToDevice);
+
+  size_t buffer_size_in_bytes =
+      compute_reduce_matrix_columns_buffer_size<float>(m, n);
+  auto d_buffer = AllocateDeviceMemory<char>(buffer_size_in_bytes);
+
+  for (int i = 0; i < iterations; ++i) {
+    ASSERT_STATUS_OK(reduce_matrix_columns(
+        0,
+        d_in.get(), d_out.get(),
+        m, n,
+        d_buffer.get(), buffer_size_in_bytes));
+
+    ASSERT_TRUE(CUDA_CALL(cudaDeviceSynchronize()).IsOK());
+    CheckDeviceValues(m, d_out.get(), expected_column.data(), relative_error_tolerance);
+  }
+}
 }  // namespace
 
 TEST(ReductionFunctionsTest, ReduceRowToScalar) {
@@ -205,6 +234,10 @@ TEST(ReductionFunctionsTest, ReduceColumnsToColumn) {
   }
 }
 
+TEST(ReductionFunctionsTest, ReduceColumnsToColumnRepeated) {
+  TestReduceColumnsToColumnRepeated(17, 8192, 100, 2e-4f);
+}
+
 TEST(ReductionFunctionsTest, BufferOffsets) {
   const int m = 2048;
   const int n = 1024;

From 19f9f773a5f553b7e536179dcdd321ffb8c35e1b Mon Sep 17 00:00:00 2001
From: Chi Lo <54722500+chilo-ms@users.noreply.github.com>
Date: Thu, 26 Feb 2026 08:46:06 -0800
Subject: [PATCH 11/21] Fix GatherCopyData Integer Truncation Leading to Heap
 Out-of-Bounds Read/Write  (#27444)

### Description
This pull request improves the robustness and correctness of the CPU
implementation of the Gather operator in ONNX Runtime. The key changes
focus on preventing integer overflow issues in parallel processing and
output shape calculations, as well as enhancing test coverage to verify
these safeguards.

Enhancements to overflow handling and parallel processing:

* Changed the lambda function in `GatherCopyData` to use `ptrdiff_t`
instead of `int64_t` for the index, and explicitly cast batch and i
variables, ensuring safer arithmetic for large tensor sizes.
* Updated the parallel loop in `GatherCopyData` to iterate using
`ptrdiff_t` indices, preventing potential overflow when processing large
tensors.

Testing improvements:

* Added a new unit test `Gather_overflow_check` in `gather_op_test.cc`
to verify that the Gather operator correctly handles very large output
shapes without overflowing, specifically testing dimensions that exceed
the 32-bit integer limit.


### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
---
 .../core/providers/cpu/tensor/gather.cc       | 20 ++++++-----
 .../providers/cpu/tensor/gather_op_test.cc    | 35 +++++++++++++++++++
 2 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/tensor/gather.cc b/onnxruntime/core/providers/cpu/tensor/gather.cc
index 38a16ee83c86b..e36f2c7f89183 100644
--- a/onnxruntime/core/providers/cpu/tensor/gather.cc
+++ b/onnxruntime/core/providers/cpu/tensor/gather.cc
@@ -102,9 +102,9 @@ Status GatherCopyData(const Tensor* indices_tensor, const uint8_t* src_base, uin
     }
   }
 
-  auto lambda = [&](int64_t index) {
-    int64_t batch = index / N;
-    int64_t i = index % N;
+  auto lambda = [&](ptrdiff_t index) {
+    const int64_t batch = static_cast<int64_t>(index / N);
+    const int64_t i = static_cast<int64_t>(index % N);
 
     const int64_t src_offset_batch = batch * data_batch_bytes;
     const int64_t dst_offset_batch = batch * gathered_batch_bytes;
@@ -120,12 +120,14 @@ Status GatherCopyData(const Tensor* indices_tensor, const uint8_t* src_base, uin
       memcpy(dst_base + dst_offset, src_base + src_offset, narrow<size_t>(block_size));
     }
   };
-  concurrency::ThreadPool::TryParallelFor(tp, SafeInt<ptrdiff_t>(M) * N, static_cast<double>(block_size),
-                                          [&lambda](ptrdiff_t first, ptrdiff_t last) {
-                                            for (int index = static_cast<int>(first), end = static_cast<int>(last); index < end; ++index) {
-                                              lambda(index);
-                                            }
-                                          });
+
+  concurrency::ThreadPool::TryParallelFor(
+      tp, SafeInt<ptrdiff_t>(M) * N, static_cast<double>(block_size),
+      [&lambda](ptrdiff_t first, ptrdiff_t last) {
+        for (ptrdiff_t index = first; index < last; ++index) {
+          lambda(index);
+        }
+      });
 
   return Status::OK();
 }
diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
index 35066bd68c65e..adbe86a14dabf 100644
--- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
@@ -341,6 +341,41 @@ TEST(GatherOpTest, Gather_axis1_indices2d_string) {
   test.Run();
 }
 
+TEST(GatherOpTest, Gather_overflow_check) {
+// Skip on 32-bit platforms where size_t overflow would truncate the large expected
+// output shape and where allocating the full reference tensor is infeasible.
+#if SIZE_MAX <= UINT32_MAX
+  GTEST_SKIP() << "Gather_overflow_check skipped on 32-bit platforms.";
+#endif
+
+  // The test uses dimensions (65537, 2) and indices of length 65537, which produce an output
+  // shape of (65537, 65537).
+  //
+  // 65537 x 65537 = 4,295,098,369 which is greater than the maximum value of a 32-bit integer (2,147,483,647).
+  //
+  // This test is to verify CPU implementation of the Gather operator doesn't overflow when calculating
+  // the output shape and generating the output tensor.
+
+  OpTester test("Gather");
+  test.AddAttribute<int64_t>("axis", 1LL);
+
+  // Inputs
+  const std::vector<int64_t> data_dims{65537, 2};
+  const std::vector<int64_t> indices_dims{65537};
+  std::vector<uint8_t> data_values(static_cast<size_t>(data_dims[0] * data_dims[1]), 1);
+  std::vector<int64_t> indices_values(static_cast<size_t>(indices_dims[0]), 1);
+  std::vector<uint8_t> expected_output_values(static_cast<size_t>(65537) * static_cast<size_t>(65537), 1);
+
+  test.AddInput<uint8_t>("data", {65537, 2}, data_values);
+  test.AddInput<int64_t>("indices", {65537}, indices_values);
+  test.AddOutput<uint8_t>("output", {65537, 65537}, expected_output_values);
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.emplace_back(DefaultCpuExecutionProvider());
+
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
+
 TEST(GatherOpTest, Gather_axis1_indices2d_bool) {
   OpTester test("Gather");
   test.AddAttribute<int64_t>("axis", 1LL);

From fbf081ca5b841f104ade54d746c32ad154f7f1af Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Thu, 26 Feb 2026 23:08:59 +0000
Subject: [PATCH 12/21] [web] fix usage of wasmBinary together with a blob URL
 for .mjs (#27411)

### Description

Fixes the issue 1 described in #27317


### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
---
 js/web/lib/wasm/wasm-factory.ts | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/js/web/lib/wasm/wasm-factory.ts b/js/web/lib/wasm/wasm-factory.ts
index a9ef6c72314dd..ba4b9578207f0 100644
--- a/js/web/lib/wasm/wasm-factory.ts
+++ b/js/web/lib/wasm/wasm-factory.ts
@@ -194,6 +194,13 @@ export const initializeWebAssembly = async (flags: Env.WebAssemblyFlags): Promis
       if (wasmBinaryOverride) {
         // Set a custom buffer which contains the WebAssembly binary. This will skip the wasm file fetching.
         config.wasmBinary = wasmBinaryOverride;
+
+        // Offer an implementation of locateFile() that returns the file name directly. This helps to avoid an error
+        // thrown later from the following code when `import.meta.url` is a blob URL:
+        // ```
+        //   return new URL("ort-wasm-simd-threaded.jsep.wasm", import.meta.url).href;
+        // ```
+        config.locateFile = (fileName) => fileName;
       } else if (wasmPathOverride || wasmPrefixOverride) {
         // A callback function to locate the WebAssembly file. The function should return the full path of the file.
         //

From 2c039f39e79b6fd9ab3155141e65fbbbe8c04063 Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Thu, 26 Feb 2026 23:09:16 +0000
Subject: [PATCH 13/21] [web] remove the unhelpful "Unknown CPU vendor"
 warning. (#27399)

### Description

remove the "Unknown CPU vendor" warning for webassembly.

CPU info is not supposed to expose in a browser environment, so it is
expected to have no CPU info at runtime. Disable the confusing warning
message for WebAssembly.

### Motivation and Context

fixes #27336
---
 onnxruntime/core/common/cpuid_info.cc | 4 ++++
 onnxruntime/core/common/cpuid_info.h  | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc
index afea9f62419fa..00711e416e4e3 100644
--- a/onnxruntime/core/common/cpuid_info.cc
+++ b/onnxruntime/core/common/cpuid_info.cc
@@ -366,7 +366,11 @@ CPUIDInfo::CPUIDInfo() {
 #endif  // defined(CPUINFO_SUPPORTED)
 
   // Note: This should be run after cpuinfo initialization if cpuinfo is enabled.
+  // On Wasm/Emscripten, cpuinfo cannot detect the CPU vendor so skip to avoid
+  // an unhelpful "Unknown CPU vendor" warning.
+#if !defined(__wasm__)
   VendorInfoInit();
+#endif
 
 #ifdef CPUIDINFO_ARCH_X86
   X86Init();
diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h
index ca9315c7ef95d..be301019df5c0 100644
--- a/onnxruntime/core/common/cpuid_info.h
+++ b/onnxruntime/core/common/cpuid_info.h
@@ -168,7 +168,7 @@ class CPUIDInfo {
   bool has_arm_sme2_{false};
 
   std::string vendor_;
-  uint32_t vendor_id_;
+  uint32_t vendor_id_{0};
 };
 
 }  // namespace onnxruntime

From c03997393d04a9cd02ec14e2f5708fc71818ca90 Mon Sep 17 00:00:00 2001
From: Chaya <cha182350@gmail.com>
Date: Wed, 18 Feb 2026 20:25:53 +0200
Subject: [PATCH 14/21] Fix GatherND division by zero when batch dimensions
 mismatch (#27090)

Fixes #23828

Added validation to check:
- num_batches is not zero
- num_slices is divisible by num_batches

Before this fix, mismatched batch dimensions caused a crash due to
division by zero.

### Description
<!-- Describe your changes. -->
This PR fixes a division by zero crash in the GatherND operator when
batch dimensions mismatch between input and indices tensors.

Changes made:

Added validation in gather_nd.cc to check that num_batches is not zero
before division
Added validation that num_slices is divisible by num_batches
Added a unit test to verify the fix


### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
Description
Fixes #23828

When batch_dims is set but the actual batch dimensions of the input
tensor and indices tensor don't align correctly, the code performs a
division that can result in division by zero, causing a crash.

For example, with:

Input shape: [2, 2, 2]
Indices shape: [2, 1]
batch_dims=1
The calculation num_slices / num_batches would crash if num_batches is
0, or produce unexpected results if they don't divide evenly.

This fix returns a clear error message instead of crashing.

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../core/providers/cpu/tensor/gather_nd.cc    | 12 ++++++
 .../providers/cpu/tensor/gather_nd_op_test.cc | 43 +++++++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/onnxruntime/core/providers/cpu/tensor/gather_nd.cc b/onnxruntime/core/providers/cpu/tensor/gather_nd.cc
index ad3faa70ed6af..a0a848eef0dff 100644
--- a/onnxruntime/core/providers/cpu/tensor/gather_nd.cc
+++ b/onnxruntime/core/providers/cpu/tensor/gather_nd.cc
@@ -66,6 +66,18 @@ Status GatherNDBase::PrepareForCompute(const TensorShape& input_shape, const Ten
   const auto num_slices = indices_shape.SizeToDimension(indices_shape.NumDimensions() - 1);
   const auto slice_size = input_shape.SizeFromDimension(SafeInt<size_t>(batch_dims_) + num_slice_dims);
   const auto num_batches = input_shape.SizeToDimension(SafeInt<size_t>(batch_dims_));
+
+  // Validate batch dimensions to prevent division by zero
+  if (num_batches == 0) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "GatherND: input tensor batch dimensions cannot be zero");
+  }
+  if (num_slices % num_batches != 0) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "GatherND: indices batch size (", num_slices,
+                           ") is not divisible by input batch size (", num_batches, ")");
+  }
+
   const auto input_batch_stride = input_shape.SizeFromDimension(SafeInt<size_t>(batch_dims_));
   const auto num_slices_per_batch = num_slices / num_batches;
   std::vector<int64_t> sizes_from_slice_dims(onnxruntime::narrow<size_t>(num_slice_dims));
diff --git a/onnxruntime/test/providers/cpu/tensor/gather_nd_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_nd_op_test.cc
index 081b4b484a73b..a8f3b99b2b3d3 100644
--- a/onnxruntime/test/providers/cpu/tensor/gather_nd_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/gather_nd_op_test.cc
@@ -329,5 +329,48 @@ TEST(GatherNDOpTest, GatherND_slice_int64_t) {
   test.Run();
 }
 
+// Test for issue #23828: GatherND should return error instead of crashing
+// when batch dimensions mismatch between input and indices
+TEST(GatherNDOpTest, GatherND_batch_dims_mismatch_error) {
+  OpTester test("GatherND", 12, kOnnxDomain);
+  test.AddAttribute<int64_t>("batch_dims", 1);
+
+  // Input has 3 batches, but indices has 2 slices (indices batch size 2), which is not divisible by 3 - mismatch!
+  test.AddInput<float>("data", {3, 3}, {0.f, 1.f, 2.f, 10.f, 11.f, 12.f, 20.f, 21.f, 22.f});
+  test.AddInput<int64_t>("indices", {2, 1}, {1, 2});
+  test.AddOutput<float>("output", {2}, {0.f, 0.f});  // dummy output, won't be used
+
+  // Force execution only on CPU
+  std::vector<std::unique_ptr<onnxruntime::IExecutionProvider>> cpu_only_ep;
+  cpu_only_ep.push_back(DefaultCpuExecutionProvider());
+
+  test.Run(OpTester::ExpectResult::kExpectFailure,
+           "GatherND: indices batch size (2) is not divisible by input batch size (3)",
+           {},             // no excluded providers needed
+           nullptr,        // no RunOptions
+           &cpu_only_ep);  // force CPU
+}
+
+// Test for issue #23828: GatherND should return error when input batch dimension is zero
+TEST(GatherNDOpTest, GatherND_zero_batch_dims_error) {
+  OpTester test("GatherND", 12, kOnnxDomain);
+  test.AddAttribute<int64_t>("batch_dims", 1);
+
+  // Input has 0 batches - should fail with clear error instead of division by zero
+  test.AddInput<float>("data", {0, 3}, {});
+  test.AddInput<int64_t>("indices", {2, 1}, {1, 2});
+  test.AddOutput<float>("output", {2}, {0.f, 0.f});  // dummy output, won't be used
+
+  // Force execution only on CPU
+  std::vector<std::unique_ptr<onnxruntime::IExecutionProvider>> cpu_only_ep;
+  cpu_only_ep.push_back(DefaultCpuExecutionProvider());
+
+  test.Run(OpTester::ExpectResult::kExpectFailure,
+           "GatherND: input tensor batch dimensions cannot be zero",
+           {},
+           nullptr,
+           &cpu_only_ep);  // force CPU
+}
+
 }  // namespace test
 }  // namespace onnxruntime

From 694bc394ce19fabf021c8a216906e123c958fb22 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Wed, 18 Feb 2026 10:40:32 -0800
Subject: [PATCH 15/21] Fix QMoE CPU Operator (#27360)

This PR addresses several issues in the QMoE CPU implementation,
improves MLAS documentation.

## Changes

### 1. QMoE CPU Operator Fixes
- **Corrected Bias Handling**: Renamed `fc2_bias_handled_by_q4_gemm` to
`fc2_bias_added_by_mlas` and updated the logic to consistently track
whether FC2 bias has been applied. This ensures that bias is not
double-counted or missed when using `DirectQ4Gemm`.
- **SwiGLU Attribute Update**: Switched from `swiglu_interleaved` to
`swiglu_fusion` in both the C++ operator and the Python test
infrastructure to align with the latest QMoE implementation standards.

### 2. MLAS Documentation
- **Clarified Buffer Shapes**: Added explicit documentation to
`MlasQ4GemmPackB` to specify that the input `FpData` buffer expects a
shape of `[K, N]`. This helps prevent layout-related errors in future
integrations.

### 3. Test Updates
- **PyTorch Parity Fixes**: Refactored
`onnxruntime/test/python/transformers/test_qmoe_cpu.py` to use
`swiglu_fusion` and improved the test structure for better parity checks
with PyTorch.

## Verification
- Verified by running `test_qmoe_cpu.py` to ensure all QMoE parity tests
pass on CPU.
---
 .../cpu/moe/moe_quantization_cpu.cc           |  29 +--
 onnxruntime/core/mlas/inc/mlas_q4.h           |   8 +-
 .../test/python/transformers/test_qmoe_cpu.py | 183 ++++++++++--------
 3 files changed, 126 insertions(+), 94 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc
index 6d1d191689466..14bddaf324ae7 100644
--- a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc
+++ b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc
@@ -118,13 +118,23 @@ Status ConvertToMlasQ4Format(const uint8_t* quantized_data,
 
   DequantizeBlockWithMlas(quantized_data, scales, zero_points, block_size, num_bits, rows, cols, temp_float, nullptr);
 
-  size_t packed_size = MlasQ4GemmPackBSize(qtype, static_cast<size_t>(cols), static_cast<size_t>(rows));
+  // Transpose from N x K (weights) to K x N.
+  // DirectQ4Gemm expects weights to be packed in a specific layout ([K, N] logically)
+  auto transposed_float_buffer = IAllocator::MakeUniquePtr<float>(allocator, static_cast<size_t>(rows * cols));
+  float* transposed_float = transposed_float_buffer.get();
+  for (int64_t r = 0; r < rows; ++r) {
+    for (int64_t c = 0; c < cols; ++c) {
+      transposed_float[c * rows + r] = temp_float[r * cols + c];
+    }
+  }
+
+  size_t packed_size = MlasQ4GemmPackBSize(qtype, static_cast<size_t>(rows), static_cast<size_t>(cols));
   if (packed_size == 0) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "MLAS Q4 packing not supported for this configuration");
   }
 
   mlas_packed_buffer = IAllocator::MakeUniquePtr<uint8_t>(allocator, packed_size);
-  MlasQ4GemmPackB(qtype, mlas_packed_buffer.get(), temp_float, static_cast<size_t>(cols), static_cast<size_t>(rows), static_cast<size_t>(cols));
+  MlasQ4GemmPackB(qtype, mlas_packed_buffer.get(), transposed_float, static_cast<size_t>(rows), static_cast<size_t>(cols), static_cast<size_t>(rows));
 
   return Status::OK();
 }
@@ -634,6 +644,7 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
     float* thread_bias2_buffer = thread_bias1_buffer + static_cast<size_t>(fc1_out_features);
 
     for (int64_t expert_idx : expert_batch) {
+      bool fc2_bias_added_by_mlas = false;
       const auto& routes = expert_token_map[static_cast<size_t>(expert_idx)];
       if (routes.empty()) {
         continue;
@@ -711,8 +722,6 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
       bool use_direct_q4_gemm = (fc1_zp_data == nullptr) &&
                                 CanUseMlasQ4Gemm(expert_weight_bits_, is_fc1_block_wise ? block_size_ : 0,
                                                  fc1_out_features, hidden_size, q_type);
-      bool fc1_used_direct_q4 = false;
-      bool fc1_bias_handled_by_q4_gemm = false;
 
       if (use_direct_q4_gemm) {
         IAllocatorUniquePtr<uint8_t> mlas_packed_fc1;
@@ -750,7 +759,6 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
                                             num_expert_tokens, fc1_out_features, hidden_size, q_type, tp);
 
           if (gemm_status.IsOK()) {
-            fc1_used_direct_q4 = true;
             goto fc1_gemm_done;
           }
         }
@@ -797,8 +805,7 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
                0.0f, C1, n,
                tp);
 
-      fc1_bias_handled_by_q4_gemm = fc1_used_direct_q4 && has_fc1_bias;
-      if (has_fc1_bias && !fc1_bias_handled_by_q4_gemm) {
+      if (has_fc1_bias) {
         const T* B1_bias = fc1_bias_data + expert_idx * fc1_out_features;
         if constexpr (std::is_same_v<T, MLFloat16>) {
           MlasConvertHalfToFloatBuffer(reinterpret_cast<const MLFloat16*>(B1_bias), thread_bias1_buffer, static_cast<size_t>(fc1_out_features));
@@ -891,7 +898,6 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
       bool use_direct_q4_gemm_fc2 = (fc2_zp_data == nullptr) &&
                                     CanUseMlasQ4Gemm(expert_weight_bits_, is_fc2_block_wise ? block_size_ : 0,
                                                      hidden_size, inter_size, q_type2);
-      bool fc2_used_direct_q4 = false;
 
       if (use_direct_q4_gemm_fc2) {
         IAllocatorUniquePtr<uint8_t> mlas_packed_fc2;
@@ -929,7 +935,7 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
                                             num_expert_tokens, hidden_size, inter_size, q_type2, tp);
 
           if (gemm_status.IsOK()) {
-            fc2_used_direct_q4 = true;
+            fc2_bias_added_by_mlas = true;
             goto fc2_gemm_done;
           }
         }
@@ -979,8 +985,7 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
 
     fc2_gemm_done:
 
-      bool fc2_bias_handled_by_q4_gemm = fc2_used_direct_q4 && has_fc2_bias;
-      if (has_fc2_bias && !fc2_bias_handled_by_q4_gemm) {
+      if (has_fc2_bias && !fc2_bias_added_by_mlas) {
         const T* B2_bias = fc2_bias_data + expert_idx * hidden_size;
         if constexpr (std::is_same_v<T, MLFloat16>) {
           MlasConvertHalfToFloatBuffer(reinterpret_cast<const MLFloat16*>(B2_bias), thread_bias2_buffer, static_cast<size_t>(hidden_size));
@@ -1015,7 +1020,7 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
         float* dest = thread_local_outputs + static_cast<size_t>(thread_id) * output_buffer_size + buffer_offset;
         const float* src = C2 + i * hidden_size;
 
-        if (has_fc2_bias && !fc2_bias_handled_by_q4_gemm) {
+        if (has_fc2_bias && !fc2_bias_added_by_mlas) {
           const size_t unroll_factor = narrow<size_t>(GetUnrollFactor(hidden_size));
           size_t j = 0;
           for (; j + unroll_factor <= narrow<size_t>(hidden_size); j += unroll_factor) {
diff --git a/onnxruntime/core/mlas/inc/mlas_q4.h b/onnxruntime/core/mlas/inc/mlas_q4.h
index 69f0435615079..d60e5b0164fe8 100644
--- a/onnxruntime/core/mlas/inc/mlas_q4.h
+++ b/onnxruntime/core/mlas/inc/mlas_q4.h
@@ -57,10 +57,10 @@ MlasQ4GemmPackBSize(
  *
  * @param QType      type of block quantization
  * @param PackedBuf  destination buffer
- * @param FpData     the pointer to fp32 matrix
- * @param N          the number of columns of matrix B.
- * @param K          the number of rows of matrix B.
- * @param ldb        leading dimension of B
+ * @param FpData     the pointer to fp32 matrix, with shape [K, N].
+ * @param N          the number of columns of matrix B (Output Channels).
+ * @param K          the number of rows of matrix B (Input Channels).
+ * @param ldb        leading dimension of FpData (usually N)
 */
 void
 MLASCALL
diff --git a/onnxruntime/test/python/transformers/test_qmoe_cpu.py b/onnxruntime/test/python/transformers/test_qmoe_cpu.py
index 90ebb148a26a5..238ac4d1f077d 100644
--- a/onnxruntime/test/python/transformers/test_qmoe_cpu.py
+++ b/onnxruntime/test/python/transformers/test_qmoe_cpu.py
@@ -364,7 +364,7 @@ def create_cpu_moe_onnx_graph(
     use_swiglu=False,
     use_quant=False,
     quant_bits=4,
-    swiglu_interleaved=False,
+    swiglu_fusion=0,
     block_size=0,
 ):
     if not has_onnx:
@@ -400,10 +400,10 @@ def create_cpu_moe_onnx_graph(
             "router_probs",  # 1
             "fc1_experts_weights",  # 2
             "fc1_scales",  # 3
-            "",  # 4: fc1_bias
+            "fc1_experts_bias" if fc1_bias is not None else "",  # 4
             "fc2_experts_weights",  # 5
             "fc2_scales",  # 6
-            "",  # 7: fc2_bias
+            "fc2_experts_bias" if fc2_bias is not None else "",  # 7
             "",  # 8: fc3_weights
             "",  # 9: fc3_scales
             "",  # 10: fc3_bias
@@ -442,11 +442,10 @@ def create_cpu_moe_onnx_graph(
             normalize_routing_weights=normalize_routing,
             activation_type=activation,
             # Add new attributes with backwards-compatible default values
-            swiglu_fusion=1 if use_swiglu else 0,  # 1 if using SwiGLU activation
+            swiglu_fusion=swiglu_fusion,
             swiglu_limit=7.0,
             activation_alpha=1.702,
             activation_beta=1.0,
-            swiglu_interleaved=1 if swiglu_interleaved else 0,  # Enable this attribute
             domain="com.microsoft",
         ),
     ]
@@ -559,6 +558,30 @@ def create_cpu_moe_onnx_graph(
             )
         )
 
+    if fc1_bias is not None:
+        fc1_bias_np = fc1_bias.detach().cpu().numpy().astype(ort_to_numpy_type_map[onnx_dtype])
+        initializers.append(
+            helper.make_tensor(
+                "fc1_experts_bias",
+                onnx_dtype,
+                list(fc1_bias.shape),
+                fc1_bias_np.flatten().tolist(),
+                raw=False,
+            )
+        )
+
+    if fc2_bias is not None:
+        fc2_bias_np = fc2_bias.detach().cpu().numpy().astype(ort_to_numpy_type_map[onnx_dtype])
+        initializers.append(
+            helper.make_tensor(
+                "fc2_experts_bias",
+                onnx_dtype,
+                list(fc2_bias.shape),
+                fc2_bias_np.flatten().tolist(),
+                raw=False,
+            )
+        )
+
     graph_inputs = [
         helper.make_tensor_value_info("input", onnx_dtype, [sequence_length, hidden_size]),
     ]
@@ -626,7 +649,7 @@ def __init__(
         self.num_experts_per_token = num_experts_per_token
 
 
-def swiglu(x: torch.Tensor, alpha: float = 1.702, limit: float = 7.0):
+def swiglu(x: torch.Tensor, alpha: float = 1.702, beta: float = 1.0, limit: float = 7.0):
     dim = x.shape[-1]
     x = x.view(-1, dim // 2, 2)
     x_glu, x_linear = x[..., 0], x[..., 1]
@@ -635,8 +658,8 @@ def swiglu(x: torch.Tensor, alpha: float = 1.702, limit: float = 7.0):
         x_glu = x_glu.clamp(max=limit)
         x_linear = x_linear.clamp(min=-limit, max=limit)
 
-    y = x_glu * torch.sigmoid(alpha * x_glu) * (x_linear + 1)
-    return y
+    y = x_glu * torch.sigmoid(alpha * x_glu) * (x_linear + beta)
+    return y.view(-1, dim // 2)
 
 
 class MoEBlockSparseTop2MLP(nn.Module):
@@ -855,7 +878,7 @@ def ort_forward(self, hidden_states: torch.Tensor, enable_performance_test=False
                 e = time.time()
                 time_ms = (e - s) / repeat * 1000
                 is_swiglu = hasattr(self, "use_swiglu") and self.use_swiglu
-                is_interleaved = hasattr(self, "swiglu_interleaved") and self.swiglu_interleaved
+                is_interleaved = hasattr(self, "swiglu_fusion") and self.swiglu_fusion == 1
                 act_type = f"SwiGLU(interleaved={is_interleaved})" if is_swiglu else "SiLU"
                 print(f"ORT Performance - {act_type} {self.quant_bits}-bit: {time_ms:.3f} ms/inference")
 
@@ -868,62 +891,80 @@ def recreate_onnx_model(self):
         """Recreate the ONNX model with the current weights to reflect any changes to the quantization code."""
 
         w1_list, w2_list = [], []
+        w1_bias_list, w2_bias_list = [], []
         w1_scale_list, w2_scale_list = [], []
         w1_zp_list, w2_zp_list = [], []
 
         is_4_bit = self.quant_bits == 4
         for i in range(self.num_experts):
-            if self.block_size > 0:
-                # Use block-wise quantization
-                w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant_blockwise(
-                    self.experts[i].w1.weight, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant
-                )
-                w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant_blockwise(
-                    self.experts[i].w2.weight, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant
-                )
+            if hasattr(self.experts[i], "w3"):
+                w1, w3 = self.experts[i].w1.weight, self.experts[i].w3.weight
+                w2 = self.experts[i].w2.weight
+                w1_bias = self.experts[i].w1.bias
+                w3_bias = getattr(self.experts[i].w3, "bias", None)
+
+                # Combine and interleave w1 and w3 for the fused kernel
+                w1_combined = torch.cat([w1, w3], dim=0)  # [2*inter, hidden]
+                if getattr(self, "swiglu_fusion", 0) == 1:
+                    w1_combined = w1_combined.view(2, -1, self.hidden_dim).transpose(0, 1).reshape(-1, self.hidden_dim)
+
+                if self.block_size > 0:
+                    w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant_blockwise(
+                        w1_combined, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant
+                    )
+                    w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant_blockwise(
+                        w2, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant
+                    )
+                else:
+                    w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant(
+                        w1_combined, is_4_bit, asymmetric=self.use_asymmetric_quant
+                    )
+                    w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant(
+                        w2, is_4_bit, asymmetric=self.use_asymmetric_quant
+                    )
+
+                if w1_bias is not None and w3_bias is not None:
+                    b1_combined = torch.cat([w1_bias, w3_bias], dim=0)
+                    if getattr(self, "swiglu_fusion", 0) == 1:
+                        b1_combined = b1_combined.view(2, -1).transpose(0, 1).reshape(-1)
+                    w1_bias_list.append(b1_combined.detach().cpu())
+                elif w1_bias is not None:
+                    w1_bias_list.append(w1_bias.detach().cpu())
             else:
-                # Use row-wise quantization
-                w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant(
-                    self.experts[i].w1.weight, is_4_bit, asymmetric=self.use_asymmetric_quant
-                )
-                w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant(
-                    self.experts[i].w2.weight, is_4_bit, asymmetric=self.use_asymmetric_quant
-                )
+                # PhiMoESwiGLUMLP already has interleaved weights in w1
+                w1 = self.experts[i].w1.weight
+                w2 = self.experts[i].w2.weight
+                w1_bias = self.experts[i].w1.bias
 
-            if self.use_swiglu:
-                if self.swiglu_interleaved:
-                    pass
+                if self.block_size > 0:
+                    w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant_blockwise(
+                        w1, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant
+                    )
+                    w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant_blockwise(
+                        w2, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant
+                    )
                 else:
-                    if self.block_size > 0:
-                        w3_scale, pre_qweight3, w3_qdq, w3_zp = quant_dequant_blockwise(
-                            self.experts[i].w3.weight, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant
-                        )
-                    else:
-                        w3_scale, pre_qweight3, w3_qdq, w3_zp = quant_dequant(
-                            self.experts[i].w3.weight, is_4_bit, asymmetric=self.use_asymmetric_quant
-                        )
-
-                    gate_weights = pre_qweight1
-                    value_weights = pre_qweight3
-                    gate_scales = w1_scale
-                    value_scales = w3_scale
-                    gate_zp = w1_zp
-                    value_zp = w3_zp
-
-                    pre_qweight1 = torch.cat([gate_weights, value_weights], dim=0)
-                    w1_scale = torch.cat([gate_scales, value_scales], dim=0)
-                    if w1_zp is not None and w3_zp is not None:
-                        w1_zp = torch.cat([gate_zp, value_zp], dim=0)
-
-                if self.swiglu_interleaved:
-                    self.experts[i].w1.weight = nn.Parameter(w1_qdq.contiguous().clone())
+                    w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant(
+                        w1, is_4_bit, asymmetric=self.use_asymmetric_quant
+                    )
+                    w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant(
+                        w2, is_4_bit, asymmetric=self.use_asymmetric_quant
+                    )
+                if w1_bias is not None:
+                    w1_bias_list.append(w1_bias.detach().cpu())
 
+            if self.use_swiglu:
+                if getattr(self, "swiglu_fusion", 0) == 1:
+                    self.experts[i].w1.weight = nn.Parameter(w1_qdq.contiguous().clone())
                 else:
                     intermediate_size = self.experts[i].w1.weight.shape[0]
                     gate_dequant = w1_qdq[:intermediate_size].contiguous().clone()
                     value_dequant = w1_qdq[intermediate_size:].contiguous().clone()
-                    self.experts[i].w1.weight.data = gate_dequant
-                    self.experts[i].w3.weight.data = value_dequant
+                    if hasattr(self.experts[i], "w3"):
+                        self.experts[i].w1.weight.data = gate_dequant
+                        self.experts[i].w3.weight.data = value_dequant
+                    else:
+                        self.experts[i].w1.weight.data = w1_qdq.contiguous().clone()
             else:
                 self.experts[i].w1.weight.data = w1_qdq.contiguous().clone()
 
@@ -931,6 +972,9 @@ def recreate_onnx_model(self):
 
             w1_list.append(pre_qweight1)
             w2_list.append(pre_qweight2)
+
+            if self.experts[i].w2.bias is not None:
+                w2_bias_list.append(self.experts[i].w2.bias)
             w1_scale_list.append(w1_scale)
             w2_scale_list.append(w2_scale)
             if w1_zp is not None:
@@ -963,9 +1007,9 @@ def recreate_onnx_model(self):
                 onnx_dtype=self.onnx_dtype,
                 fc1_experts_weights=self.moe_experts_weight1,
                 fc2_experts_weights=self.moe_experts_weight2,
-                # Biases are not used in QMoE
-                fc1_bias=None,
-                fc2_bias=None,
+                # Pass collected biases
+                fc1_bias=torch.stack(w1_bias_list, dim=0) if w1_bias_list else None,
+                fc2_bias=torch.stack(w2_bias_list, dim=0) if w2_bias_list else None,
                 # Scales are used for dequantization
                 fc1_scales=moe_experts_weight_scale1,
                 fc2_scales=moe_experts_weight_scale2,
@@ -975,7 +1019,7 @@ def recreate_onnx_model(self):
                 use_swiglu=self.use_swiglu,
                 use_quant=True,  # Always use QMoE
                 quant_bits=self.quant_bits,
-                swiglu_interleaved=self.swiglu_interleaved if hasattr(self, "swiglu_interleaved") else False,
+                swiglu_fusion=getattr(self, "swiglu_fusion", 0),
                 block_size=self.block_size,  # Add block_size for block-wise quantization
             )
         except Exception:
@@ -1020,7 +1064,7 @@ def parity_check(self):
             max_diff = (torch_output.cpu() - ort_output.cpu()).abs().max()
 
         is_swiglu = hasattr(self, "use_swiglu") and self.use_swiglu
-        is_interleaved = hasattr(self, "swiglu_interleaved") and self.swiglu_interleaved
+        is_interleaved = getattr(self, "swiglu_fusion", 0) == 1
         act_type = f"SwiGLU(interleaved={is_interleaved})" if is_swiglu else "SiLU"
         quant_type = "Asymmetric" if self.use_asymmetric_quant else "Symmetric"
         block_type = f"Block({self.block_size})" if self.block_size > 0 else "Row"
@@ -1047,24 +1091,6 @@ def parity_check(self):
             )
             print("Torch sample:", torch_output.cpu().reshape(-1, hidden_dim)[i, k].item())
             print("ORT  sample:", ort_output.cpu().reshape(-1, hidden_dim)[i, k].item())
-            # Print routing and per-expert contributions for this token from the PyTorch reference
-            try:
-                hidden_states_flat = hidden_state.view(-1, hidden_dim)
-                token_vec = hidden_states_flat[i : i + 1]
-                gate_logits = self.gate(token_vec)
-                topk_vals, topk_experts = torch.topk(gate_logits, self.top_k, dim=-1)
-                topk_soft = F.softmax(topk_vals, dim=1)
-                print("Gate logits:", gate_logits.detach().cpu().numpy())
-                print("Selected experts:", topk_experts.detach().cpu().numpy())
-                print("Routing weights:", topk_soft.detach().cpu().numpy())
-                # Compute per-expert contributions for selected experts
-                for idx_e, e in enumerate(topk_experts[0].tolist()):
-                    expert_layer = self.experts[e]
-                    expert_out = expert_layer(token_vec)
-                    contrib = expert_out[0, k].item() * topk_soft[0, idx_e].item()
-                    print(f"Expert {e} contrib at hidden {k}: {contrib}")
-            except Exception as _:
-                pass
 
         ort_dtype_quant_bits_tolerance_map = {
             "FP32:0": (5e-3, 1e-3),
@@ -1128,7 +1154,7 @@ def __init__(
         self.num_experts = config.num_local_experts
         self.top_k = config.num_experts_per_token
         self.use_swiglu = True
-        self.swiglu_interleaved = True
+        self.swiglu_fusion = 1
         self.block_size = block_size
         use_quant = self.quant_bits > 0
 
@@ -1232,7 +1258,7 @@ def __init__(
         self.top_k = config.num_experts_per_tok
         self.router_jitter_noise = config.router_jitter_noise
         self.use_swiglu = True
-        self.swiglu_interleaved = True
+        self.swiglu_fusion = 1
         self.block_size = block_size
         use_quant = self.quant_bits > 0
 
@@ -1314,7 +1340,8 @@ def __init__(
             use_swiglu=self.use_swiglu,
             use_quant=use_quant,
             quant_bits=self.quant_bits,
-            swiglu_interleaved=self.swiglu_interleaved,
+            # swiglu_fusion=1 means fused and interleaved, which is the standard for QMoE.
+            swiglu_fusion=getattr(self, "swiglu_fusion", 0),
             block_size=self.block_size,
         )
 

From 52bb39585914fc5ff78f711135e7e1fdbdd73fed Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Sat, 21 Feb 2026 00:46:48 -0800
Subject: [PATCH 16/21] QMoE CPU Performance Update (Up to 4x on 4-bit)
 (#27364)

## Summary
This change improves QMoE CPU performance by moving more work to prepack
time and enabling the DirectQ4 GEMM fast path where appropriate, while
preserving an env-var switch for performance/accuracy A/B testing.

This PR introduces:
- Prepack and cache infrastructure for QMoE expert weights.
- DirectQ4 packed-B cache built during prepack (instead of mutable
runtime cache in `Compute()`).
- Fast-path support for block-wise cases (including block size 32 where
supported by MLAS Q4 type).
- Runtime toggle via `ORT_USE_MLAS_Q4_GEMM_MOE`.
- Default fast-path policy refined to avoid known accuracy-loss
scenarios unless explicitly overridden by env var.
- Test and benchmark refinements for QMoE CPU validation.

## Key Implementation Changes

### 1. Prepack-time cache build
- Moves DirectQ4 packed-B cache construction to prepack stage.
- Removes mutable runtime cache maintenance from `Compute()`.
- Reduces per-inference overhead and avoids mutable shared cache
complexity.

### 2. Fast path vs fallback
- Keeps two execution modes:
- DirectQ4 GEMM fast path (`MlasQ4GemmPackB` + `DirectQ4Gemm` cache
usage).
  - Fallback path (`DequantizePrePacked` + `MlasGemm`).
- Allows controlled fallback for accuracy-sensitive configurations.

### 3. Environment variable behavior
- `ORT_USE_MLAS_Q4_GEMM_MOE=1`: force fast path when supported.
- `ORT_USE_MLAS_Q4_GEMM_MOE=0`: force fallback path.
- Unset: use default policy that enables fast path unless a known
accuracy-loss pattern is detected.

### 4. Test updates
- QMoE CPU tests were refined to validate env-var on/off behavior and
no-env behavior.
- Coverage includes parity checks for symmetric/asymmetric,
row-wise/block-wise settings.

## Benchmark Results (1000 inferences, `benchmark_qmoe.py`)

Note: PyTorch latency fluctuates across runs and is excluded from
conclusions below.

### ORT results comparison
| Config | Baseline ORT Time (ms) | Baseline ORT tok/s | New ORT Time
(env=0) (ms) | New ORT tok/s (env=0) | New ORT Time (env=1) (ms) | New
ORT tok/s (env=1) |
|---|---:|---:|---:|---:|---:|---:|
| Medium-4bit | 748.594 | 1.3 | 237.219 | 4.2 | 178.943 | 5.6 |
| Medium-8bit | 209.277 | 4.8 | 212.074 | 4.7 | 203.882 | 4.9 |

### ORT speedup vs baseline
| Config | env=0 speedup vs baseline (time) | env=1 speedup vs baseline
(time) |
|---|---:|---:|
| Medium-4bit | 3.16x faster | 4.18x faster |
| Medium-8bit | 0.99x (about flat) | 1.03x faster |

## Accuracy Notes
- `env=1` (forced fast path) provides the best 4-bit performance but may
show non-zero max diff in known cases.
- `env=0` (fallback) maintains parity behavior with zero observed max
diff in the reported benchmark table.
- Default no-env policy is designed to avoid known accuracy-loss cases
while still enabling fast path where safe.
---
 onnxruntime/contrib_ops/cpu/moe/moe_helper.h  | 136 +++--
 .../cpu/moe/moe_quantization_cpu.cc           | 485 ++++++++++++++++--
 .../cpu/moe/moe_quantization_cpu.h            |  24 +
 .../debug_node_inputs_outputs_utils.cc        |   4 +-
 .../python/transformers/benchmark_qmoe.py     | 191 +++++++
 .../test/python/transformers/test_qmoe_cpu.py | 106 ++--
 6 files changed, 839 insertions(+), 107 deletions(-)
 create mode 100644 onnxruntime/test/python/transformers/benchmark_qmoe.py

diff --git a/onnxruntime/contrib_ops/cpu/moe/moe_helper.h b/onnxruntime/contrib_ops/cpu/moe/moe_helper.h
index 257c5a189b3bd..bd30418030dc2 100644
--- a/onnxruntime/contrib_ops/cpu/moe/moe_helper.h
+++ b/onnxruntime/contrib_ops/cpu/moe/moe_helper.h
@@ -35,44 +35,86 @@ struct MoEParameters {
 };
 namespace moe_helper {
 
+// Helper to check shape dimensions
+#define ASSERT_SHAPE_DIMENSION(shape_ptr, dim, name)                             \
+  if (shape_ptr != nullptr) {                                                    \
+    if (shape_ptr->NumDimensions() != dim) {                                     \
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input '", name,     \
+                             "' is expected to have ", dim, " dimensions, got ", \
+                             shape_ptr->NumDimensions());                        \
+    }                                                                            \
+  }
+
+#define ASSERT_SHAPE_3D(shape_ptr, name) ASSERT_SHAPE_DIMENSION(shape_ptr, 3, name)
+
+#define CHECK_SHAPE(shape_ptr, name, ...)                                    \
+  if (shape_ptr != nullptr) {                                                \
+    const TensorShape& expected_shape = make_shape(__VA_ARGS__);             \
+    if (*shape_ptr != expected_shape) {                                      \
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input '", name, \
+                             "' is expected to have shape ", expected_shape, \
+                             ", got ", *shape_ptr);                          \
+    }                                                                        \
+  }
+
 template <typename Tensor>
 Status CheckInputs(MoEParameters& parameters,
-                   const Tensor* input,                // required
-                   const Tensor* router_probs,         // required
-                   const Tensor* fc1_experts_weights,  // required
-                   const Tensor* fc1_experts_bias,     // optional
-                   const Tensor* fc1_experts_scales,   // required for qMoE; NULL for MOE
-                   const Tensor* fc1_zero_points,      // optional, for qMoE
-                   const Tensor* fc2_experts_weights,  // required
-                   const Tensor* fc2_experts_bias,     // optional
-                   const Tensor* fc2_experts_scales,   // required for qMoE; NULL for MOE
-                   const Tensor* fc2_zero_points,      // optional, for qMoE
-                   const Tensor* fc3_experts_weights,  // optional
-                   const Tensor* fc3_experts_bias,     // optional
-                   const Tensor* fc3_experts_scales,   // required for qMoE; NULL for MOE
-                   const Tensor* fc3_zero_points,      // optional, for qMoE
-                   const int64_t pack_size,            // number of weights packed together (like 2 for uint4 packed to uint8)
+                   const Tensor* input,                           // required
+                   const Tensor* router_probs,                    // required
+                   const TensorShape* fc1_experts_weights_shape,  // required
+                   const Tensor* fc1_experts_bias,                // optional
+                   const Tensor* fc1_experts_scales,              // required for qMoE; NULL for MOE
+                   const Tensor* fc1_zero_points,                 // optional, for qMoE
+                   const TensorShape* fc2_experts_weights_shape,  // required
+                   const Tensor* fc2_experts_bias,                // optional
+                   const Tensor* fc2_experts_scales,              // required for qMoE; NULL for MOE
+                   const Tensor* fc2_zero_points,                 // optional, for qMoE
+                   const TensorShape* fc3_experts_weights_shape,  // optional
+                   const Tensor* fc3_experts_bias,                // optional
+                   const Tensor* fc3_experts_scales,              // required for qMoE; NULL for MOE
+                   const Tensor* fc3_zero_points,                 // optional, for qMoE
+                   const int64_t pack_size,                       // number of weights packed together (like 2 for uint4 packed to uint8)
                    const bool is_fused_swiglu,
                    const int64_t block_size = 0) {  // block size for block-wise quantization
-  // Check dimensions of input to avoid input_dims index out of range. CHECK_TENSOR_SHAPE will verify each tensor later.
+  // Required inputs
+  if (input == nullptr) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'input' is required.");
+  }
   ASSERT_TENSOR_2D_OR_3D(input);
-  ASSERT_TENSOR_3D(fc1_experts_weights);
-  ASSERT_TENSOR_3D(fc2_experts_weights);
+
+  if (router_probs == nullptr) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'router_probs' is required.");
+  }
   ASSERT_TENSOR_2D(router_probs);
 
+  if (fc1_experts_weights_shape == nullptr) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'fc1_experts_weights' is required.");
+  }
+  ASSERT_SHAPE_3D(fc1_experts_weights_shape, "fc1_experts_weights");
+
+  if (fc2_experts_weights_shape == nullptr) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'fc2_experts_weights' is required.");
+  }
+  ASSERT_SHAPE_3D(fc2_experts_weights_shape, "fc2_experts_weights");
+
   const auto& input_dims = input->Shape().GetDims();
   const auto& router_probs_dims = router_probs->Shape().GetDims();
-  const auto& fc1_experts_weights_dims = fc1_experts_weights->Shape().GetDims();
-  const auto& fc2_experts_weights_dims = fc2_experts_weights->Shape().GetDims();
 
   int64_t num_rows = input_dims.size() == 2 ? input_dims[0] : input_dims[0] * input_dims[1];
   int64_t hidden_size = input_dims[input_dims.size() - 1];
-  int64_t local_num_experts = fc1_experts_weights_dims[0];
   int64_t num_experts = router_probs_dims[1];
-  int64_t inter_size = (fc2_experts_weights_dims[1] * fc2_experts_weights_dims[2] * pack_size) / hidden_size;
 
-  const bool legacy_shape = (hidden_size != inter_size && fc2_experts_weights_dims[1] == inter_size) ||
-                            (hidden_size == inter_size && is_fused_swiglu && fc1_experts_weights_dims[1] == hidden_size);
+  int64_t local_num_experts = fc1_experts_weights_shape->GetDims()[0];
+
+  int64_t inter_size = (fc2_experts_weights_shape->GetDims()[1] *
+                        fc2_experts_weights_shape->GetDims()[2] * pack_size) /
+                       hidden_size;
+
+  bool legacy_shape = false;
+  const auto& fc2_experts_weights_dims = fc2_experts_weights_shape->GetDims();
+  const auto& fc1_experts_weights_dims = fc1_experts_weights_shape->GetDims();
+  legacy_shape = (hidden_size != inter_size && fc2_experts_weights_dims[1] == inter_size) ||
+                 (hidden_size == inter_size && is_fused_swiglu && fc1_experts_weights_dims[1] == hidden_size);
 
   // Fused swiglu doubles the output dimension of FC1 since it fused two GEMMs into one.
   const int64_t fc1_inter_size = is_fused_swiglu ? (inter_size + inter_size) : inter_size;
@@ -80,13 +122,13 @@ Status CheckInputs(MoEParameters& parameters,
 
   if (legacy_shape) {
     // legacy shape does not match column major memory layout. This is for backward compatibility.
-    CHECK_TENSOR_SHAPE(fc1_experts_weights, num_experts, hidden_size, fc1_inter_size / pack_size);
-    CHECK_TENSOR_SHAPE(fc2_experts_weights, num_experts, inter_size, hidden_size / pack_size);
-    CHECK_TENSOR_SHAPE(fc3_experts_weights, num_experts, hidden_size, inter_size / pack_size);
+    CHECK_SHAPE(fc1_experts_weights_shape, "fc1_experts_weights", num_experts, hidden_size, fc1_inter_size / pack_size);
+    CHECK_SHAPE(fc2_experts_weights_shape, "fc2_experts_weights", num_experts, inter_size, hidden_size / pack_size);
+    CHECK_SHAPE(fc3_experts_weights_shape, "fc3_experts_weights", num_experts, hidden_size, inter_size / pack_size);
   } else {
-    CHECK_TENSOR_SHAPE(fc1_experts_weights, num_experts, fc1_inter_size, hidden_size / pack_size);
-    CHECK_TENSOR_SHAPE(fc2_experts_weights, num_experts, hidden_size, inter_size / pack_size);
-    CHECK_TENSOR_SHAPE(fc3_experts_weights, num_experts, inter_size, hidden_size / pack_size);
+    CHECK_SHAPE(fc1_experts_weights_shape, "fc1_experts_weights", num_experts, fc1_inter_size, hidden_size / pack_size);
+    CHECK_SHAPE(fc2_experts_weights_shape, "fc2_experts_weights", num_experts, hidden_size, inter_size / pack_size);
+    CHECK_SHAPE(fc3_experts_weights_shape, "fc3_experts_weights", num_experts, inter_size, hidden_size / pack_size);
   }
 
   CHECK_TENSOR_SHAPE(router_probs, num_rows, num_experts);
@@ -168,9 +210,11 @@ Status CheckInputs(MoEParameters& parameters,
     }
   }
 
-  if (fc3_experts_weights == nullptr) {
+  if (fc3_experts_weights_shape == nullptr) {
+    // If fc3 weights are not provided, ensure no other fc3 parameters are provided
     ORT_ENFORCE(fc3_experts_bias == nullptr && fc3_experts_scales == nullptr && fc3_zero_points == nullptr);
   } else {
+    // If fc3 weights are provided, ensure scales logic is consistent
     ORT_ENFORCE(fc1_experts_scales == nullptr || fc3_experts_scales != nullptr);  // MOE no scale, or qMOE need scales
   }
 
@@ -200,6 +244,36 @@ Status CheckInputs(MoEParameters& parameters,
   return Status::OK();
 }
 
+template <typename Tensor>
+Status CheckInputs(MoEParameters& parameters,
+                   const Tensor* input,                // required
+                   const Tensor* router_probs,         // required
+                   const Tensor* fc1_experts_weights,  // required
+                   const Tensor* fc1_experts_bias,     // optional
+                   const Tensor* fc1_experts_scales,   // required for qMoE; NULL for MOE
+                   const Tensor* fc1_zero_points,      // optional, for qMoE
+                   const Tensor* fc2_experts_weights,  // required
+                   const Tensor* fc2_experts_bias,     // optional
+                   const Tensor* fc2_experts_scales,   // required for qMoE; NULL for MOE
+                   const Tensor* fc2_zero_points,      // optional, for qMoE
+                   const Tensor* fc3_experts_weights,  // optional
+                   const Tensor* fc3_experts_bias,     // optional
+                   const Tensor* fc3_experts_scales,   // required for qMoE; NULL for MOE
+                   const Tensor* fc3_zero_points,      // optional, for qMoE
+                   const int64_t pack_size,            // number of weights packed together (like 2 for uint4 packed to uint8)
+                   const bool is_fused_swiglu,
+                   const int64_t block_size = 0) {  // block size for block-wise quantization
+
+  const TensorShape* fc1_shape = (fc1_experts_weights != nullptr) ? &fc1_experts_weights->Shape() : nullptr;
+  const TensorShape* fc2_shape = (fc2_experts_weights != nullptr) ? &fc2_experts_weights->Shape() : nullptr;
+  const TensorShape* fc3_shape = (fc3_experts_weights != nullptr) ? &fc3_experts_weights->Shape() : nullptr;
+
+  return CheckInputs(parameters, input, router_probs, fc1_shape, fc1_experts_bias, fc1_experts_scales, fc1_zero_points,
+                     fc2_shape, fc2_experts_bias, fc2_experts_scales, fc2_zero_points,
+                     fc3_shape, fc3_experts_bias, fc3_experts_scales, fc3_zero_points,
+                     pack_size, is_fused_swiglu, block_size);
+}
+
 }  // namespace moe_helper
 }  // namespace contrib
 }  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc
index 14bddaf324ae7..81d2b0f8efdc6 100644
--- a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc
+++ b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc
@@ -13,6 +13,7 @@
 #include "core/common/narrow.h"
 #include "core/framework/tensor_type_and_shape.h"
 #include "core/util/math.h"
+#include "core/platform/env_var_utils.h"
 #include "contrib_ops/cpu/moe/moe_utils.h"
 #include "contrib_ops/cpu/moe/moe_helper.h"
 
@@ -69,13 +70,13 @@ bool CanUseMlasQ4Gemm(int64_t expert_weight_bits, int64_t block_size,
     out_qtype = BlkQ4Sym64;
   } else if (block_size == 128) {
     out_qtype = BlkQ4Sym128;
-  } else if (block_size == 0) {
+  } else if (block_size == 0 || block_size == 32) {
     out_qtype = BlkQ4Sym;
   } else {
     return false;
   }
 
-  size_t expected_size = MlasQ4GemmPackBSize(out_qtype, static_cast<size_t>(cols), static_cast<size_t>(rows));
+  size_t expected_size = MlasQ4GemmPackBSize(out_qtype, static_cast<size_t>(rows), static_cast<size_t>(cols));
   return expected_size > 0;
 }
 
@@ -84,6 +85,8 @@ bool CanUseMlasQ4Gemm(int64_t expert_weight_bits, int64_t block_size,
 namespace onnxruntime {
 namespace contrib {
 
+constexpr const char* kUseMlasQ4GemmMoe = "ORT_USE_MLAS_Q4_GEMM_MOE";
+
 template <typename TScale>
 void DequantizeBlockWithMlas(const uint8_t* quantized_data,
                              const TScale* scales,
@@ -364,6 +367,257 @@ void DequantizeBlock(const uint8_t* quantized_data,
   DequantizeBlockWithMlas(quantized_data, scales, zero_points, block_size, num_bits, rows, cols, dequantized_data, thread_pool);
 }
 
+template <typename TScale>
+void DequantizePrePacked(const uint8_t* prepacked_data,
+                         const TScale* scales,
+                         const uint8_t* zero_points,
+                         int64_t block_size,
+                         int64_t rows,
+                         int64_t cols,
+                         float* dequantized_data,
+                         const gsl::span<const int64_t>& scale_dims) {
+  // prepacked_data is [cols, rows] (transposed, unpacked)
+  // dequantized_data is [cols, rows] (transposed)
+  // scales, zero_points correspond to original [rows, cols] layout
+
+  const float default_zp_4bit = 8.0f;
+  const int64_t blocks_per_row = (block_size > 0) ? ((cols + block_size - 1) / block_size) : 1;
+  const int64_t zp_pack_size = 2;  // Always 2 for 4-bit
+
+  // Iterate over Columns (K) then Rows (N) because prepacked_data is [K, N]
+  for (int64_t c = 0; c < cols; ++c) {
+    for (int64_t r = 0; r < rows; ++r) {
+      uint8_t val = prepacked_data[c * rows + r];
+
+      int64_t block_idx = (block_size > 0) ? (c / block_size) : 0;
+      if (block_size > 0) block_idx = std::min(block_idx, blocks_per_row - 1);
+
+      int64_t scale_idx;
+      if (scale_dims.size() == 3 && scale_dims[2] > 1) {  // block-wise
+        scale_idx = r * blocks_per_row + block_idx;
+      } else {  // per-channel
+        scale_idx = r;
+      }
+
+      float scale = static_cast<float>(scales[scale_idx]);
+      float zp = default_zp_4bit;
+
+      if (zero_points != nullptr) {
+        int64_t zp_idx;
+        bool is_lower_nibble;
+
+        if (scale_dims.size() == 3 && scale_dims[2] > 1) {  // block-wise
+          int64_t zp_blocks_packed = (blocks_per_row + zp_pack_size - 1) / zp_pack_size;
+          zp_idx = r * zp_blocks_packed + block_idx / 2;
+          is_lower_nibble = (block_idx % 2 == 0);
+        } else {
+          zp_idx = r / 2;
+          is_lower_nibble = (r % 2 == 0);
+        }
+
+        uint8_t packed_zp = zero_points[zp_idx];
+        zp = is_lower_nibble ? static_cast<float>(packed_zp & 0x0F) : static_cast<float>(packed_zp >> 4);
+      }
+
+      dequantized_data[c * rows + r] = scale * (static_cast<float>(val) - zp);
+    }
+  }
+}
+
+template <typename TScale>
+Status BuildDirectQ4PackedBCache(const uint8_t* prepacked_weights,
+                                 const TScale* scales_data,
+                                 int64_t num_experts,
+                                 int64_t rows,
+                                 int64_t cols,
+                                 int64_t block_size,
+                                 const gsl::span<const int64_t>& scales_dims,
+                                 MLAS_BLK_QUANT_TYPE qtype,
+                                 AllocatorPtr allocator,
+                                 IAllocatorUniquePtr<void>& packed_b) {
+  const size_t packed_size = MlasQ4GemmPackBSize(qtype, static_cast<size_t>(rows), static_cast<size_t>(cols));
+  if (packed_size == 0) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Failed to compute MLAS Q4 packed size for cache");
+  }
+
+  const bool is_block_wise = (scales_dims.size() == 3 && scales_dims[2] > 1);
+  const int64_t scales_expert_stride = is_block_wise ? (rows * scales_dims[2]) : rows;
+  const size_t prepacked_expert_stride = static_cast<size_t>(rows * cols);
+  const size_t total_packed_size = packed_size * static_cast<size_t>(num_experts);
+
+  packed_b = IAllocator::MakeUniquePtr<void>(allocator, total_packed_size, true);
+  uint8_t* packed_b_ptr = static_cast<uint8_t*>(packed_b.get());
+
+  std::vector<float> dequantized_transposed(static_cast<size_t>(rows * cols));
+  for (int64_t expert_idx = 0; expert_idx < num_experts; ++expert_idx) {
+    const uint8_t* expert_prepacked = prepacked_weights + static_cast<size_t>(expert_idx) * prepacked_expert_stride;
+    const TScale* expert_scales = scales_data + expert_idx * scales_expert_stride;
+
+    DequantizePrePacked(expert_prepacked, expert_scales, nullptr, block_size, rows, cols,
+                        dequantized_transposed.data(), scales_dims);
+
+    MlasQ4GemmPackB(qtype, packed_b_ptr + expert_idx * packed_size, dequantized_transposed.data(),
+                    static_cast<size_t>(rows), static_cast<size_t>(cols), static_cast<size_t>(rows));
+  }
+
+  return Status::OK();
+}
+
+template <typename T>
+Status QMoECPU<T>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                           /*out*/ bool& is_packed,
+                           /*out*/ PrePackedWeights* prepacked_weights) {
+  is_packed = false;
+
+  // If scales are prepacked, they are constant initializers.
+  if (input_idx == 3) {
+    return Status::OK();
+  }
+  if (input_idx == 6) {
+    return Status::OK();
+  }
+
+  // Only support PrePack for FC1 (2) and FC2 (5) weights
+  // and only if expert_weight_bits_ == 4 (since we unpack to uint8)
+  if (expert_weight_bits_ != 4) {
+    return Status::OK();
+  }
+
+  if (input_idx == 2 || input_idx == 5) {
+    const auto& shape = tensor.Shape();
+    const int64_t num_experts = shape[0];
+    const int64_t rows = shape[1];
+    const int64_t cols_packed = shape[2];
+    const int64_t cols = cols_packed * 2;
+
+    size_t packed_size = static_cast<size_t>(num_experts * rows * cols);
+    auto packed_buffer = IAllocator::MakeUniquePtr<void>(alloc, packed_size, true);
+    uint8_t* dst_base = static_cast<uint8_t*>(packed_buffer.get());
+    const uint8_t* src_base = static_cast<const uint8_t*>(tensor.DataRaw());
+
+    for (int64_t i = 0; i < num_experts; ++i) {
+      const uint8_t* src = src_base + i * rows * cols_packed;
+      uint8_t* dst = dst_base + i * rows * cols;
+
+      for (int64_t r = 0; r < rows; ++r) {
+        for (int64_t c = 0; c < cols; ++c) {
+          uint8_t packed_val = src[r * cols_packed + (c / 2)];
+          uint8_t val = (c % 2 == 0) ? (packed_val & 0x0F) : (packed_val >> 4);
+
+          dst[c * rows + r] = val;
+        }
+      }
+    }
+
+    if (input_idx == 2) {
+      fc1_shape_ = shape;
+    } else if (input_idx == 5) {
+      fc2_shape_ = shape;
+    }
+
+    if (prepacked_weights) {
+      prepacked_weights->buffers_.push_back(std::move(packed_buffer));
+      prepacked_weights->buffer_sizes_.push_back(packed_size);
+      is_packed = true;
+
+      // Pack Shape (Buffer 1)
+      auto dims = shape.GetDims();
+      size_t rank_bytes = sizeof(int64_t);
+      size_t dims_bytes = dims.size() * sizeof(int64_t);
+      size_t shape_size = rank_bytes + dims_bytes;
+
+      auto shape_buffer = IAllocator::MakeUniquePtr<void>(alloc, shape_size);
+      int64_t* buffer_data = static_cast<int64_t*>(shape_buffer.get());
+      *buffer_data = static_cast<int64_t>(dims.size());
+      memcpy(buffer_data + 1, dims.data(), dims_bytes);
+
+      prepacked_weights->buffers_.push_back(std::move(shape_buffer));
+      prepacked_weights->buffer_sizes_.push_back(shape_size);
+
+      // Try build MLAS Q4 cache if scales are available
+      if (use_mlas_q4_gemm_) {
+        const Tensor* scales_tensor = nullptr;
+        MLAS_BLK_QUANT_TYPE qtype = BlkQ4Sym;
+        int scales_idx = -1;
+        int zp_idx = -1;
+
+        if (input_idx == 2) {  // FC1
+          scales_idx = 3;
+          zp_idx = 11;
+        } else if (input_idx == 5) {  // FC2
+          scales_idx = 6;
+          zp_idx = 12;
+        }
+
+        if (scales_idx != -1 &&
+            (zp_idx >= static_cast<int>(Info().node().InputDefs().size()) || !Info().node().InputDefs()[zp_idx]->Exists()) &&
+            Info().TryGetConstantInput(scales_idx, &scales_tensor) &&
+            scales_tensor != nullptr &&
+            CanUseMlasQ4Gemm(expert_weight_bits_, block_size_, rows, cols, qtype)) {
+          IAllocatorUniquePtr<void> cache_buffer;
+          const auto& scales_dims = scales_tensor->Shape().GetDims();
+          const T* scales_data = scales_tensor->Data<T>();
+          // Use the simple packed buffer we just created (buffer 0) as input
+          const uint8_t* simple_packed = dst_base;
+
+          if (BuildDirectQ4PackedBCache(simple_packed, scales_data, num_experts, rows, cols,
+                                        block_size_, scales_dims, qtype,
+                                        alloc, cache_buffer)
+                  .IsOK()) {
+            // Store the MLAS Q4 cache as buffer 2 (after unpacked weights and shape).
+            size_t cache_size = MlasQ4GemmPackBSize(qtype, static_cast<size_t>(rows), static_cast<size_t>(cols)) * static_cast<size_t>(num_experts);
+            prepacked_weights->buffers_.push_back(std::move(cache_buffer));
+            prepacked_weights->buffer_sizes_.push_back(cache_size);
+          }
+        }
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+template <typename T>
+Status QMoECPU<T>::UseSharedPrePackedBuffers_V2(std::vector<BufferUniquePtr>& prepacked_buffers,
+                                                gsl::span<const size_t> /*prepacked_buffer_sizes*/,
+                                                int input_idx,
+                                                /*out*/ bool& used_shared_buffers) {
+  used_shared_buffers = false;
+
+  if (expert_weight_bits_ != 4) {
+    return Status::OK();
+  }
+
+  if ((input_idx == 2 || input_idx == 5) && !prepacked_buffers.empty()) {
+    auto parse_shape = [&](TensorShape& shape) {
+      if (prepacked_buffers.size() > 1) {
+        int64_t* buffer_data = static_cast<int64_t*>(prepacked_buffers[1].get());
+        int64_t rank = buffer_data[0];
+        std::vector<int64_t> dims(static_cast<size_t>(rank));
+        memcpy(dims.data(), buffer_data + 1, static_cast<size_t>(rank) * sizeof(int64_t));
+        shape = TensorShape(dims);
+      }
+    };
+
+    if (input_idx == 2) {
+      packed_fc1_ = std::move(prepacked_buffers[0]);
+      parse_shape(fc1_shape_);
+      if (prepacked_buffers.size() > 2) {
+        packed_fc1_mlas_cache_ = std::move(prepacked_buffers[2]);
+      }
+    } else if (input_idx == 5) {
+      packed_fc2_ = std::move(prepacked_buffers[0]);
+      parse_shape(fc2_shape_);
+      if (prepacked_buffers.size() > 2) {
+        packed_fc2_mlas_cache_ = std::move(prepacked_buffers[2]);
+      }
+    }
+    used_shared_buffers = true;
+  }
+
+  return Status::OK();
+}
+
 template <typename T>
 QMoECPU<T>::QMoECPU(const OpKernelInfo& op_kernel_info)
     : OpKernel(op_kernel_info),
@@ -372,21 +626,32 @@ QMoECPU<T>::QMoECPU(const OpKernelInfo& op_kernel_info)
   ORT_ENFORCE(expert_weight_bits_ == 4 || expert_weight_bits_ == 8,
               "Attribute 'expert_weight_bits' must be 4 or 8.");
   block_size_ = op_kernel_info.GetAttrOrDefault<int64_t>("block_size", 0);
+  ORT_ENFORCE(block_size_ >= 0);
 
   if (block_size_ > 0) {
     ORT_ENFORCE(block_size_ >= 16, "block_size must be >= 16 when provided.");
     ORT_ENFORCE((block_size_ & (block_size_ - 1)) == 0, "block_size must be a power of 2.");
   }
+
+  const auto use_mlas_q4_gemm = ParseEnvironmentVariable<bool>(kUseMlasQ4GemmMoe);
+  if (use_mlas_q4_gemm.has_value()) {
+    use_mlas_q4_gemm_ = *use_mlas_q4_gemm;
+    use_mlas_q4_gemm_overridden_ = true;
+  } else {
+    // Default policy: enable fast path unless this run hits a known accuracy-loss configuration.
+    use_mlas_q4_gemm_ = true;
+    use_mlas_q4_gemm_overridden_ = false;
+  }
 }
 
 template <typename T>
 Status QMoECPU<T>::Compute(OpKernelContext* context) const {
   const auto* input = context->Input<Tensor>(0);
   const auto* router_probs = context->Input<Tensor>(1);
-  const auto* fc1_experts_weights = context->Input<Tensor>(2);
+  const auto* fc1_experts_weights = packed_fc1_ ? nullptr : context->Input<Tensor>(2);
   const auto* fc1_scales = context->Input<Tensor>(3);
   const auto* fc1_experts_bias = context->Input<Tensor>(4);
-  const auto* fc2_experts_weights = context->Input<Tensor>(5);
+  const auto* fc2_experts_weights = packed_fc2_ ? nullptr : context->Input<Tensor>(5);
   const auto* fc2_scales = context->Input<Tensor>(6);
   const auto* fc2_experts_bias = context->Input<Tensor>(7);
   const auto* fc3_experts_weights = context->Input<Tensor>(8);
@@ -396,17 +661,21 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
   const auto* fc2_zero_points = context->Input<Tensor>(12);
   const auto* fc3_zero_points = context->Input<Tensor>(13);
 
+  const TensorShape* fc1_shape_ptr = packed_fc1_ ? &fc1_shape_ : (fc1_experts_weights ? &fc1_experts_weights->Shape() : nullptr);
+  const TensorShape* fc2_shape_ptr = packed_fc2_ ? &fc2_shape_ : (fc2_experts_weights ? &fc2_experts_weights->Shape() : nullptr);
+  const TensorShape* fc3_shape_ptr = fc3_experts_weights ? &fc3_experts_weights->Shape() : nullptr;
+
   MoEParameters moe_params;
   ORT_RETURN_IF_ERROR(moe_helper::CheckInputs<Tensor>(
       moe_params, input, router_probs,
-      fc1_experts_weights, fc1_experts_bias, fc1_scales, fc1_zero_points,
-      fc2_experts_weights, fc2_experts_bias, fc2_scales, fc2_zero_points,
-      fc3_experts_weights, fc3_experts_bias, fc3_scales, fc3_zero_points,
+      fc1_shape_ptr, fc1_experts_bias, fc1_scales, fc1_zero_points,
+      fc2_shape_ptr, fc2_experts_bias, fc2_scales, fc2_zero_points,
+      fc3_shape_ptr, fc3_experts_bias, fc3_scales, fc3_zero_points,
       expert_weight_bits_ == 4 ? 2 : 1,
-      true,
+      activation_type_ == ActivationType::SwiGLU,
       block_size_));
 
-  if (fc3_experts_weights || fc3_experts_bias || fc3_scales || fc3_zero_points) {
+  if (fc3_shape_ptr || fc3_experts_bias || fc3_scales || fc3_zero_points) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "FC3 gating is not yet implemented on CPU for QMoE");
   }
 
@@ -569,8 +838,8 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
   const bool is_fc1_block_wise = (fc1_scales_dims.size() == 3 && fc1_scales_dims[2] > 1);
   const bool is_fc2_block_wise = (fc2_scales_dims.size() == 3 && fc2_scales_dims[2] > 1);
 
-  const uint8_t* fc1_weights_data = fc1_experts_weights->Data<uint8_t>();
-  const uint8_t* fc2_weights_data = fc2_experts_weights->Data<uint8_t>();
+  const uint8_t* fc1_weights_data = (packed_fc1_ != nullptr) ? nullptr : fc1_experts_weights->template Data<uint8_t>();
+  const uint8_t* fc2_weights_data = (packed_fc2_ != nullptr) ? nullptr : fc2_experts_weights->template Data<uint8_t>();
   const T* fc1_scales_data = fc1_scales->Data<T>();
   const T* fc2_scales_data = fc2_scales->Data<T>();
   const T* fc1_bias_data = fc1_experts_bias ? fc1_experts_bias->Data<T>() : nullptr;
@@ -578,6 +847,13 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
   const uint8_t* fc1_zp_data = fc1_zero_points ? fc1_zero_points->Data<uint8_t>() : nullptr;
   const uint8_t* fc2_zp_data = fc2_zero_points ? fc2_zero_points->Data<uint8_t>() : nullptr;
 
+  // Known loss-prone case from parity testing: 4-bit symmetric path (row-wise and block-wise).
+  const bool known_accuracy_loss_case = (expert_weight_bits_ == 4) &&
+                                        (fc1_zp_data == nullptr) && (fc2_zp_data == nullptr);
+  const bool use_mlas_q4_gemm_effective = use_mlas_q4_gemm_overridden_
+                                              ? use_mlas_q4_gemm_
+                                              : (use_mlas_q4_gemm_ && !known_accuracy_loss_case);
+
   const int64_t pack_unit = (8 / expert_weight_bits_);
   const int64_t fc1_packed_cols = (hidden_size + pack_unit - 1) / pack_unit;
   const int64_t fc2_packed_cols = (inter_size + pack_unit - 1) / pack_unit;
@@ -605,6 +881,22 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
     fc2_zp_expert_stride = (hidden_size + zp_pack_size - 1) / zp_pack_size;
   }
 
+  MLAS_BLK_QUANT_TYPE fc1_direct_qtype = BlkQ4Sym;
+  MLAS_BLK_QUANT_TYPE fc2_direct_qtype = BlkQ4Sym;
+
+  // Use pre-packed MLAS cache if available
+  const void* fc1_direct_q4_cache_ptr = nullptr;
+  if (use_mlas_q4_gemm_effective && packed_fc1_mlas_cache_ && fc1_zp_data == nullptr &&
+      CanUseMlasQ4Gemm(expert_weight_bits_, is_fc1_block_wise ? block_size_ : 0, fc1_out_features, hidden_size, fc1_direct_qtype)) {
+    fc1_direct_q4_cache_ptr = packed_fc1_mlas_cache_.get();
+  }
+
+  const void* fc2_direct_q4_cache_ptr = nullptr;
+  if (use_mlas_q4_gemm_effective && packed_fc2_mlas_cache_ && fc2_zp_data == nullptr &&
+      CanUseMlasQ4Gemm(expert_weight_bits_, is_fc2_block_wise ? block_size_ : 0, hidden_size, inter_size, fc2_direct_qtype)) {
+    fc2_direct_q4_cache_ptr = packed_fc2_mlas_cache_.get();
+  }
+
   std::vector<std::pair<int64_t, size_t>> expert_workload;
   size_t total_work = 0;
 
@@ -718,10 +1010,57 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
       const size_t k = static_cast<size_t>(hidden_size);
 
       MLAS_BLK_QUANT_TYPE q_type = BlkQ4Sym;  // Initialize to default
-      // Direct Q4 GEMM only supports symmetric quantization, so we disable it if zero_points are provided.
-      bool use_direct_q4_gemm = (fc1_zp_data == nullptr) &&
-                                CanUseMlasQ4Gemm(expert_weight_bits_, is_fc1_block_wise ? block_size_ : 0,
-                                                 fc1_out_features, hidden_size, q_type);
+      bool use_direct_q4_gemm = use_mlas_q4_gemm_effective &&
+                                ((fc1_direct_q4_cache_ptr != nullptr) ||
+                                 ((packed_fc1_ == nullptr) && (fc1_zp_data == nullptr) &&
+                                  CanUseMlasQ4Gemm(expert_weight_bits_, is_fc1_block_wise ? block_size_ : 0,
+                                                   fc1_out_features, hidden_size, q_type)));
+
+      if (packed_fc1_ != nullptr) {
+        if (use_mlas_q4_gemm_effective && fc1_zp_data == nullptr &&
+            CanUseMlasQ4Gemm(expert_weight_bits_, is_fc1_block_wise ? block_size_ : 0,
+                             fc1_out_features, hidden_size, q_type)) {
+          if (fc1_direct_q4_cache_ptr != nullptr) {
+            float* fc1_bias_float = nullptr;
+            if (has_fc1_bias) {
+              const T* B1_bias = fc1_bias_data + expert_idx * fc1_out_features;
+              if constexpr (std::is_same_v<T, MLFloat16>) {
+                MlasConvertHalfToFloatBuffer(reinterpret_cast<const MLFloat16*>(B1_bias), thread_bias1_buffer, static_cast<size_t>(fc1_out_features));
+              } else {
+                std::memcpy(thread_bias1_buffer, B1_bias, static_cast<size_t>(fc1_out_features) * sizeof(float));
+              }
+              fc1_bias_float = thread_bias1_buffer;
+            }
+
+            size_t packed_size = MlasQ4GemmPackBSize(q_type, static_cast<size_t>(fc1_out_features), static_cast<size_t>(hidden_size));
+            const uint8_t* packed_b = static_cast<const uint8_t*>(fc1_direct_q4_cache_ptr) + expert_idx * packed_size;
+
+            Status gemm_status = DirectQ4Gemm(A1, packed_b, fc1_bias_float, C1,
+                                              num_expert_tokens, fc1_out_features, hidden_size, fc1_direct_qtype, tp);
+            if (gemm_status.IsOK()) {
+              goto fc1_gemm_done;
+            }
+          }
+        }
+
+        // Fallback: Dequantize from PrePacked (transposed, unpacked) -> MlasGemm
+        const uint8_t* current_packed_ptr = static_cast<const uint8_t*>(packed_fc1_.get()) + expert_idx * fc1_out_features * hidden_size;
+
+        DequantizePrePacked(current_packed_ptr, fc1_scales_ptr, fc1_zp_ptr,
+                            is_fc1_block_wise ? block_size_ : 0,
+                            fc1_out_features, hidden_size,
+                            B1_dequant, fc1_scales_dims);
+
+        // Use MlasGemm with B1_dequant (which is already float transposed)
+        MlasGemm(CblasNoTrans, CblasNoTrans,
+                 m, n, k,
+                 1.0f, A1, k,
+                 B1_dequant, n,
+                 0.0f, C1, n,
+                 tp);
+
+        goto fc1_bias_handling;
+      }
 
       if (use_direct_q4_gemm) {
         IAllocatorUniquePtr<uint8_t> mlas_packed_fc1;
@@ -739,12 +1078,10 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
 
         if (convert_status.IsOK()) {
           float* fc1_bias_float = nullptr;
-          IAllocatorUniquePtr<float> fc1_bias_buffer;
 
           if (has_fc1_bias) {
             const T* B1_bias = fc1_bias_data + expert_idx * fc1_out_features;
-            fc1_bias_buffer = IAllocator::MakeUniquePtr<float>(allocator, static_cast<size_t>(fc1_out_features));
-            fc1_bias_float = fc1_bias_buffer.get();
+            fc1_bias_float = thread_bias1_buffer;
 
             if constexpr (std::is_same_v<T, MLFloat16>) {
               MlasConvertHalfToFloatBuffer(reinterpret_cast<const MLFloat16*>(B1_bias), fc1_bias_float, static_cast<size_t>(fc1_out_features));
@@ -805,6 +1142,8 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
                0.0f, C1, n,
                tp);
 
+    fc1_bias_handling:
+
       if (has_fc1_bias) {
         const T* B1_bias = fc1_bias_data + expert_idx * fc1_out_features;
         if constexpr (std::is_same_v<T, MLFloat16>) {
@@ -844,22 +1183,30 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
 
     fc1_gemm_done:
 
-      const int64_t activation_threshold = std::max(int64_t{4}, 256 / std::max(int64_t{1}, inter_size));
-      if (num_expert_tokens >= activation_threshold && tp != nullptr) {
-        const int64_t activation_block_size = std::max(int64_t{1}, std::min(int64_t{64}, activation_threshold));
-        const int64_t num_activation_blocks = (num_expert_tokens + activation_block_size - 1) / activation_block_size;
-
-        if (num_activation_blocks > 1) {
-          concurrency::ThreadPool::TrySimpleParallelFor(tp, narrow<int>(num_activation_blocks), [&](std::ptrdiff_t block_idx) {
-            const int64_t start_token = block_idx * activation_block_size;
-            const int64_t end_token = std::min(start_token + activation_block_size, num_expert_tokens);
-
-            for (int64_t i = start_token; i < end_token; ++i) {
+      if (activation_type_ == ActivationType::SwiGLU) {
+        const int64_t activation_threshold = std::max(int64_t{4}, 256 / std::max(int64_t{1}, inter_size));
+        if (num_expert_tokens >= activation_threshold && tp != nullptr) {
+          const int64_t activation_block_size = std::max(int64_t{1}, std::min(int64_t{64}, activation_threshold));
+          const int64_t num_activation_blocks = (num_expert_tokens + activation_block_size - 1) / activation_block_size;
+
+          if (num_activation_blocks > 1) {
+            concurrency::ThreadPool::TrySimpleParallelFor(tp, narrow<int>(num_activation_blocks), [&](std::ptrdiff_t block_idx) {
+              const int64_t start_token = block_idx * activation_block_size;
+              const int64_t end_token = std::min(start_token + activation_block_size, num_expert_tokens);
+
+              for (int64_t i = start_token; i < end_token; ++i) {
+                const float* C1_token = C1 + i * fc1_out_features;
+                float* A2_token = A2 + i * inter_size;
+                ApplySwiGLUActivation(C1_token, A2_token, inter_size, true, activation_alpha_, activation_beta_, swiglu_limit_);
+              }
+            });
+          } else {
+            for (int64_t i = 0; i < num_expert_tokens; ++i) {
               const float* C1_token = C1 + i * fc1_out_features;
               float* A2_token = A2 + i * inter_size;
               ApplySwiGLUActivation(C1_token, A2_token, inter_size, true, activation_alpha_, activation_beta_, swiglu_limit_);
             }
-          });
+          }
         } else {
           for (int64_t i = 0; i < num_expert_tokens; ++i) {
             const float* C1_token = C1 + i * fc1_out_features;
@@ -868,11 +1215,8 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
           }
         }
       } else {
-        for (int64_t i = 0; i < num_expert_tokens; ++i) {
-          const float* C1_token = C1 + i * fc1_out_features;
-          float* A2_token = A2 + i * inter_size;
-          ApplySwiGLUActivation(C1_token, A2_token, inter_size, true, activation_alpha_, activation_beta_, swiglu_limit_);
-        }
+        ApplyActivationVectorized(C1, num_expert_tokens * fc1_out_features);
+        std::copy(C1, C1 + (num_expert_tokens * fc1_out_features), A2);
       }
 
       const T* fc2_scales_ptr;
@@ -895,9 +1239,58 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
       const size_t k2 = static_cast<size_t>(inter_size);
 
       MLAS_BLK_QUANT_TYPE q_type2 = BlkQ4Sym;  // Initialize to default
-      bool use_direct_q4_gemm_fc2 = (fc2_zp_data == nullptr) &&
-                                    CanUseMlasQ4Gemm(expert_weight_bits_, is_fc2_block_wise ? block_size_ : 0,
-                                                     hidden_size, inter_size, q_type2);
+      bool use_direct_q4_gemm_fc2 = use_mlas_q4_gemm_effective &&
+                                    ((fc2_direct_q4_cache_ptr != nullptr) ||
+                                     ((packed_fc2_ == nullptr) && (fc2_zp_data == nullptr) &&
+                                      CanUseMlasQ4Gemm(expert_weight_bits_, is_fc2_block_wise ? block_size_ : 0,
+                                                       hidden_size, inter_size, q_type2)));
+
+      if (packed_fc2_ != nullptr) {
+        if (use_mlas_q4_gemm_effective && fc2_zp_data == nullptr &&
+            CanUseMlasQ4Gemm(expert_weight_bits_, is_fc2_block_wise ? block_size_ : 0,
+                             hidden_size, inter_size, q_type2)) {
+          if (fc2_direct_q4_cache_ptr != nullptr) {
+            float* fc2_bias_float = nullptr;
+            if (has_fc2_bias) {
+              const T* B2_bias = fc2_bias_data + expert_idx * hidden_size;
+              if constexpr (std::is_same_v<T, MLFloat16>) {
+                MlasConvertHalfToFloatBuffer(reinterpret_cast<const MLFloat16*>(B2_bias), thread_bias2_buffer, static_cast<size_t>(hidden_size));
+              } else {
+                std::memcpy(thread_bias2_buffer, B2_bias, static_cast<size_t>(hidden_size) * sizeof(float));
+              }
+              fc2_bias_float = thread_bias2_buffer;
+            }
+
+            size_t packed_size = MlasQ4GemmPackBSize(q_type2, static_cast<size_t>(hidden_size), static_cast<size_t>(inter_size));
+            const uint8_t* packed_b = static_cast<const uint8_t*>(fc2_direct_q4_cache_ptr) + expert_idx * packed_size;
+
+            Status gemm_status = DirectQ4Gemm(A2, packed_b, fc2_bias_float, C2,
+                                              num_expert_tokens, hidden_size, inter_size, fc2_direct_qtype, tp);
+            if (gemm_status.IsOK()) {
+              fc2_bias_added_by_mlas = true;
+              goto fc2_gemm_done;
+            }
+          }
+        }
+
+        // Dequantize from PrePacked (transposed, unpacked)
+        const uint8_t* current_packed_ptr = static_cast<const uint8_t*>(packed_fc2_.get()) + expert_idx * hidden_size * inter_size;
+
+        DequantizePrePacked(current_packed_ptr, fc2_scales_ptr, fc2_zp_ptr,
+                            is_fc2_block_wise ? block_size_ : 0,
+                            hidden_size, inter_size,
+                            B2_dequant, fc2_scales_dims);
+
+        // Fallback
+        MlasGemm(CblasNoTrans, CblasNoTrans,
+                 m2, n2, k2,
+                 1.0f, A2, k2,
+                 B2_dequant, n2,
+                 0.0f, C2, n2,
+                 tp);
+
+        goto fc2_gemm_done;
+      }
 
       if (use_direct_q4_gemm_fc2) {
         IAllocatorUniquePtr<uint8_t> mlas_packed_fc2;
@@ -915,12 +1308,10 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
 
         if (convert_status.IsOK()) {
           float* fc2_bias_float = nullptr;
-          IAllocatorUniquePtr<float> fc2_bias_buffer;
 
           if (has_fc2_bias) {
             const T* B2_bias = fc2_bias_data + expert_idx * hidden_size;
-            fc2_bias_buffer = IAllocator::MakeUniquePtr<float>(allocator, static_cast<size_t>(hidden_size));
-            fc2_bias_float = fc2_bias_buffer.get();
+            fc2_bias_float = thread_bias2_buffer;
 
             if constexpr (std::is_same_v<T, MLFloat16>) {
               MlasConvertHalfToFloatBuffer(reinterpret_cast<const MLFloat16*>(B2_bias), fc2_bias_float, static_cast<size_t>(hidden_size));
@@ -1114,10 +1505,22 @@ Status QMoECPU<T>::Compute(OpKernelContext* context) const {
   return Status::OK();
 }
 
+template <typename T>
+void QMoECPU<T>::ApplyActivationVectorized(float* data, int64_t size) const {
+  for (int64_t i = 0; i < size; ++i) {
+    data[i] = ApplyActivation(data[i], activation_type_);
+  }
+}
+
 template QMoECPU<float>::QMoECPU(const OpKernelInfo& op_kernel_info);
+
 template Status QMoECPU<float>::Compute(OpKernelContext* context) const;
+template Status QMoECPU<float>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed, PrePackedWeights* prepacked_weights);
+template Status QMoECPU<float>::UseSharedPrePackedBuffers_V2(std::vector<BufferUniquePtr>& prepacked_buffers, gsl::span<const size_t> prepacked_buffer_sizes, int input_idx, bool& used_shared_buffers);
 template QMoECPU<MLFloat16>::QMoECPU(const OpKernelInfo& op_kernel_info);
 template Status QMoECPU<MLFloat16>::Compute(OpKernelContext* context) const;
+template Status QMoECPU<MLFloat16>::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed, PrePackedWeights* prepacked_weights);
+template Status QMoECPU<MLFloat16>::UseSharedPrePackedBuffers_V2(std::vector<BufferUniquePtr>& prepacked_buffers, gsl::span<const size_t> prepacked_buffer_sizes, int input_idx, bool& used_shared_buffers);
 
 // Kernel Registration
 ONNX_OPERATOR_TYPED_KERNEL_EX(
diff --git a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.h b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.h
index 890580e051a8e..f678a27190c90 100644
--- a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.h
+++ b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.h
@@ -5,7 +5,9 @@
 
 #include "core/common/common.h"
 #include "core/framework/op_kernel.h"
+#include "core/mlas/inc/mlas_q4.h"
 #include "contrib_ops/cpu/moe/moe_base_cpu.h"
+#include <vector>
 
 namespace onnxruntime {
 namespace contrib {
@@ -26,8 +28,30 @@ class QMoECPU final : public OpKernel, public MoEBaseCPU {
   Status Compute(OpKernelContext* context) const override;
 
  private:
+  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                 /*out*/ bool& is_packed,
+                 /*out*/ PrePackedWeights* prepacked_weights) override;
+
+  Status UseSharedPrePackedBuffers_V2(std::vector<BufferUniquePtr>& prepacked_buffers,
+                                      gsl::span<const size_t> prepacked_buffer_sizes,
+                                      int input_idx,
+                                      /*out*/ bool& used_shared_buffers) override;
+
+  void ApplyActivationVectorized(float* data, int64_t size) const;
+
   int64_t expert_weight_bits_;
   int64_t block_size_;
+  bool use_mlas_q4_gemm_{false};
+  bool use_mlas_q4_gemm_overridden_{false};
+
+  IAllocatorUniquePtr<void> packed_fc1_;
+  IAllocatorUniquePtr<void> packed_fc2_;
+
+  TensorShape fc1_shape_;
+  TensorShape fc2_shape_;
+
+  IAllocatorUniquePtr<void> packed_fc1_mlas_cache_;
+  IAllocatorUniquePtr<void> packed_fc2_mlas_cache_;
 };
 
 }  // namespace contrib
diff --git a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc
index 38dd8de01147c..5137c22d6cf61 100644
--- a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc
+++ b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc
@@ -621,8 +621,8 @@ void DumpNodeInputs(
           std::cout << " is non-tensor type.\n";
         }
       } else {
-        // this could happen with an empty Optional input
-        std::cout << " was missing data type\n";
+        // this could happen with an empty Optional input or the tensor is removed after pre-packing.
+        std::cout << " was missing data type (maybe pre-packed).\n";
       }
     } else {
       std::cout << "Input " << i << " is optional and was not provided.\n";
diff --git a/onnxruntime/test/python/transformers/benchmark_qmoe.py b/onnxruntime/test/python/transformers/benchmark_qmoe.py
new file mode 100644
index 0000000000000..b96c9cdcf5c3a
--- /dev/null
+++ b/onnxruntime/test/python/transformers/benchmark_qmoe.py
@@ -0,0 +1,191 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+import os
+import sys
+import time
+import unittest
+
+import numpy
+import torch
+
+# Add current directory to path to allow importing from test_qmoe_cpu
+current_dir = os.path.dirname(os.path.abspath(__file__))
+if current_dir not in sys.path:
+    sys.path.append(current_dir)
+
+from test_qmoe_cpu import PhiMoEConfig, PhiMoESparseMoeBlock, TensorProto  # noqa: E402
+
+# Reduces number of tests to run for faster pipeline checks
+pipeline_mode = os.getenv("PIPELINE_MODE", "1") == "1"
+
+
+@unittest.skipIf(pipeline_mode, "Skip benchmark in CI pipeline.")
+class TestQMoESwiGLUBenchmark(unittest.TestCase):
+    """Benchmark tests for QMoE SwiGLU performance measurement."""
+
+    def test_qmoe_swiglu_throughput_benchmark(self):
+        """Comprehensive throughput benchmark for QMoE SwiGLU across different configurations."""
+        print("\n=== QMoE SwiGLU Throughput Benchmark ===")
+
+        # Test configurations: (name, hidden_size, intermediate_size, num_experts, top_k, quant_bits)
+        configs = [
+            ("Medium-4bit", 2880, 2880, 32, 4, 4),
+            ("Medium-8bit", 2880, 2880, 32, 4, 8),
+        ]
+
+        batch_size = 1
+        sequence_length = 512
+        num_runs = 1000
+
+        results = []
+
+        for config_name, hidden_size, intermediate_size, num_experts, top_k, quant_bits in configs:
+            torch.manual_seed(42)
+            numpy.random.seed(42)
+
+            torch_output = None
+            ort_output = None
+
+            print(f"\nTesting {config_name}:")
+            print(f"  Hidden: {hidden_size}, Intermediate: {intermediate_size}")
+            print(f"  Experts: {num_experts}, Top-K: {top_k}, Quant: {quant_bits}-bit")
+
+            try:
+                # Create config and model
+                config = PhiMoEConfig(
+                    hidden_size=hidden_size,
+                    intermediate_size=intermediate_size,
+                    num_local_experts=num_experts,
+                    num_experts_per_tok=top_k,
+                )
+
+                qmoe_swiglu = PhiMoESparseMoeBlock(
+                    config,
+                    batch_size=batch_size,
+                    sequence_length=sequence_length,
+                    quant_bits=quant_bits,
+                    onnx_dtype=TensorProto.FLOAT,
+                )
+
+                # Create test input with fixed sequence length to match ONNX model
+                full_hidden_states = torch.randn(batch_size, sequence_length, hidden_size).to(torch.float32)
+
+                # For TTFT simulation, we'll measure single forward pass time
+                # This represents the time to process one token in autoregressive generation
+
+                # Warm up with full context
+                for _ in range(3):
+                    _ = qmoe_swiglu.forward(full_hidden_states)
+
+                # Benchmark PyTorch TTFT (Time to First Token)
+                # Measure time for a single forward pass (represents token generation time)
+                torch.manual_seed(42)
+
+                start_time = time.time()
+                for _ in range(num_runs):
+                    torch_output = qmoe_swiglu.forward(full_hidden_states)
+                end_time = time.time()
+                torch_ttft_ms = (end_time - start_time) / num_runs * 1000
+
+                # Calculate tokens per second (throughput)
+                # For sequence generation, this represents the rate at which we can generate tokens
+                torch_tokens_per_sec = 1000.0 / torch_ttft_ms  # 1 token / (time_ms / 1000)
+
+                print(f"  PyTorch TTFT: {torch_ttft_ms:.3f} ms (per token generation time)")
+                print(f"  PyTorch Throughput: {torch_tokens_per_sec:.1f} tokens/sec")
+
+                # Benchmark ONNX Runtime
+                ort_ttft_ms = 0
+                ort_tokens_per_sec = 0
+                speedup = 0
+                throughput_ratio = 0
+                max_diff = 0
+
+                model_updated = qmoe_swiglu.recreate_onnx_model()
+                if model_updated and qmoe_swiglu.ort_sess is not None:
+                    # Warm up ORT with full context
+                    for _ in range(3):
+                        _ = qmoe_swiglu.ort_forward(full_hidden_states)
+
+                    torch.manual_seed(42)
+
+                    # Measure ONNX Runtime TTFT (Time to First Token)
+                    start_time = time.time()
+                    for _ in range(num_runs):
+                        ort_output = qmoe_swiglu.ort_forward(full_hidden_states)
+                    end_time = time.time()
+                    ort_ttft_ms = (end_time - start_time) / num_runs * 1000
+
+                    # Calculate tokens per second for ONNX Runtime
+                    ort_tokens_per_sec = 1000.0 / ort_ttft_ms  # 1 token / (time_ms / 1000)
+
+                    speedup = torch_ttft_ms / ort_ttft_ms if ort_ttft_ms > 0 else 0
+                    throughput_ratio = ort_tokens_per_sec / torch_tokens_per_sec if torch_tokens_per_sec > 0 else 0
+
+                    print(f"  ONNX RT TTFT: {ort_ttft_ms:.3f} ms (per token generation time)")
+                    print(f"  ONNX RT Throughput: {ort_tokens_per_sec:.1f} tokens/sec")
+                    print(f"  TTFT Speedup: {speedup:.2f}x")
+                    print(f"  Throughput Gain: {throughput_ratio:.2f}x")
+                else:
+                    print("  ONNX RT: Not available")
+
+                # Calculate max difference if both outputs available
+                if torch_output is not None and ort_output is not None:
+                    max_diff = (torch_output.cpu() - ort_output.cpu()).abs().max().item()
+                    print(f"  Max diff: {max_diff:.6f}")
+
+                results.append(
+                    {
+                        "config": config_name,
+                        "torch_ttft_ms": torch_ttft_ms,
+                        "torch_tokens_per_sec": torch_tokens_per_sec,
+                        "ort_ttft_ms": ort_ttft_ms,
+                        "ort_tokens_per_sec": ort_tokens_per_sec,
+                        "speedup": speedup,
+                        "throughput_ratio": throughput_ratio,
+                        "max_diff": max_diff,
+                    }
+                )
+
+            except Exception as e:
+                print(f"  Error: {e}")
+                continue
+
+        # Summary
+        print("\n=== Token Generation Time & Throughput Summary ===")
+        print(
+            f"{'Config':<15} {'PT Time':<10} {'PT tok/s':<10} {'ORT Time':<11} {'ORT tok/s':<11} {'Time Gain':<10} {'Throughput':<11} {'Max Diff':<10}"
+        )
+        print("-" * 105)
+        for result in results:
+            config = result["config"]
+            torch_ttft = result["torch_ttft_ms"]
+            torch_tps = result["torch_tokens_per_sec"]
+            ort_ttft = result["ort_ttft_ms"]
+            ort_tps = result["ort_tokens_per_sec"]
+            speedup = result["speedup"]
+            throughput_ratio = result["throughput_ratio"]
+            max_diff = result["max_diff"]
+
+            ort_ttft_str = f"{ort_ttft:.3f}" if ort_ttft > 0 else "N/A"
+            ort_tps_str = f"{ort_tps:.1f}" if ort_tps > 0 else "N/A"
+            speedup_str = f"{speedup:.2f}x" if speedup > 0 else "N/A"
+            throughput_str = f"{throughput_ratio:.2f}x" if throughput_ratio > 0 else "N/A"
+
+            print(
+                f"{config:<15} {torch_ttft:<10.3f} {torch_tps:<10.1f} {ort_ttft_str:<11} {ort_tps_str:<11} {speedup_str:<10} {throughput_str:<11} {max_diff:<10.6f}"
+            )
+
+        print("\nNotes:")
+        print("- Time: Token generation time in ms (lower is better)")
+        print("- tok/s: Tokens per second throughput (higher is better)")
+        print("- Time Gain: ORT speedup for latency (higher is better)")
+        print("- Throughput: ORT throughput improvement (higher is better)")
+
+
+if __name__ == "__main__":
+    benchmark = TestQMoESwiGLUBenchmark()
+    benchmark.test_qmoe_swiglu_throughput_benchmark()
diff --git a/onnxruntime/test/python/transformers/test_qmoe_cpu.py b/onnxruntime/test/python/transformers/test_qmoe_cpu.py
index 238ac4d1f077d..8415c7b08b77c 100644
--- a/onnxruntime/test/python/transformers/test_qmoe_cpu.py
+++ b/onnxruntime/test/python/transformers/test_qmoe_cpu.py
@@ -23,9 +23,11 @@
 # normalization on the selected experts. This provides proper weight distribution
 # while maintaining computational efficiency.
 # --------------------------------------------------------------------------
+import os
 import time
 import unittest
 from collections import OrderedDict
+from contextlib import contextmanager
 
 import numpy
 import torch
@@ -76,6 +78,8 @@ class TensorProtoPlaceholder:
 
 ort_provider = ["CPUExecutionProvider"]
 
+ORT_USE_MLAS_Q4_GEMM_MOE = "ORT_USE_MLAS_Q4_GEMM_MOE"
+
 torch.manual_seed(42)
 numpy.random.seed(42)
 
@@ -1137,6 +1141,43 @@ def small_test_cases():
             yield batch_size, sequence_length
 
 
+def with_mlas_q4_mode(test_cases):
+    expanded_cases = []
+    for case in test_cases:
+        quant_bits = case[2]
+        if quant_bits == 4:
+            expanded_cases.append((*case, None))
+            expanded_cases.append((*case, False))
+            expanded_cases.append((*case, True))
+        else:
+            expanded_cases.append((*case, None))
+    return expanded_cases
+
+
+@contextmanager
+def scoped_env_var(name: str, value: str):
+    previous = os.environ.get(name)
+    os.environ[name] = value
+    try:
+        yield
+    finally:
+        if previous is None:
+            os.environ.pop(name, None)
+        else:
+            os.environ[name] = previous
+
+
+def run_parity_with_mlas_q4_mode(test_runner, enable_mlas_q4_gemm: bool | None):
+    if enable_mlas_q4_gemm is None:  # No env var
+        test_runner()
+    else:
+        env_value = "1" if enable_mlas_q4_gemm else "0"
+        mode = "enabled" if enable_mlas_q4_gemm else "disabled"
+        print(f"DirectQ4 mode ({ORT_USE_MLAS_Q4_GEMM_MOE}) is {mode}")
+        with scoped_env_var(ORT_USE_MLAS_Q4_GEMM_MOE, env_value):
+            test_runner()
+
+
 class SwigluMoEBlock(SparseMoeBlockORTHelper):
     def __init__(
         self,
@@ -1381,8 +1422,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return final_hidden_states
 
 
-disable_cpu_qmoe_tests = False
-
 # Define test cases for different MoE types
 phi3_test_cases = [
     (1, 32, 4),
@@ -1400,10 +1439,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 ]
 
 
-@unittest.skipIf(disable_cpu_qmoe_tests, "Skipping qMoE cpu tests")
 class TestPhiQMoECPU(unittest.TestCase):
-    @parameterized.expand(phi3_test_cases)
-    def test_phi3_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits):
+    @parameterized.expand(with_mlas_q4_mode(phi3_test_cases))
+    def test_phi3_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits, enable_mlas_q4_gemm):
         # Create unique seed based on test parameters to ensure different inputs for each test
         base_seed = 2000  # Different base seed from other tests
         param_hash = hash((batch_size, sequence_length, quant_bits))
@@ -1438,10 +1476,10 @@ def test_phi3_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits):
         self.assertFalse(torch.isnan(torch_result).any())
         self.assertFalse(torch.isinf(torch_result).any())
 
-        phi3_moe.parity_check()
+        run_parity_with_mlas_q4_mode(phi3_moe.parity_check, enable_mlas_q4_gemm)
 
-    @parameterized.expand(phi3_test_cases)
-    def test_phi3_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits):
+    @parameterized.expand(with_mlas_q4_mode(phi3_test_cases))
+    def test_phi3_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits, enable_mlas_q4_gemm):
         base_seed = 3000
         param_hash = hash((batch_size, sequence_length, quant_bits))
         unique_seed = base_seed + abs(param_hash) % 1000
@@ -1463,10 +1501,12 @@ def test_phi3_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, quan
             onnx_dtype=TensorProto.FLOAT,
             use_asymmetric_quant=True,
         )
-        phi3_moe.parity_check()
+        run_parity_with_mlas_q4_mode(phi3_moe.parity_check, enable_mlas_q4_gemm)
 
-    @parameterized.expand(phi3_blockwise_test_cases)
-    def test_phi3_qmoe_blockwise_parity_cpu(self, batch_size, sequence_length, quant_bits, block_size):
+    @parameterized.expand(with_mlas_q4_mode(phi3_blockwise_test_cases))
+    def test_phi3_qmoe_blockwise_parity_cpu(
+        self, batch_size, sequence_length, quant_bits, block_size, enable_mlas_q4_gemm
+    ):
         torch.manual_seed(42)
         numpy.random.seed(42)
 
@@ -1495,10 +1535,12 @@ def test_phi3_qmoe_blockwise_parity_cpu(self, batch_size, sequence_length, quant
         self.assertFalse(torch.isnan(torch_result).any())
         self.assertFalse(torch.isinf(torch_result).any())
 
-        phi3_moe.parity_check()
+        run_parity_with_mlas_q4_mode(phi3_moe.parity_check, enable_mlas_q4_gemm)
 
-    @parameterized.expand(phi3_blockwise_test_cases)
-    def test_phi3_qmoe_blockwise_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits, block_size):
+    @parameterized.expand(with_mlas_q4_mode(phi3_blockwise_test_cases))
+    def test_phi3_qmoe_blockwise_asymmetric_parity_cpu(
+        self, batch_size, sequence_length, quant_bits, block_size, enable_mlas_q4_gemm
+    ):
         torch.manual_seed(43)
         numpy.random.seed(43)
 
@@ -1516,10 +1558,8 @@ def test_phi3_qmoe_blockwise_asymmetric_parity_cpu(self, batch_size, sequence_le
             block_size=block_size,
             use_asymmetric_quant=True,
         )
-        phi3_moe.parity_check()
-
+        run_parity_with_mlas_q4_mode(phi3_moe.parity_check, enable_mlas_q4_gemm)
 
-disable_cpu_qmoe_tests = False
 
 swiglu_test_cases = [
     (1, 32, 4),
@@ -1537,10 +1577,9 @@ def test_phi3_qmoe_blockwise_asymmetric_parity_cpu(self, batch_size, sequence_le
 ]
 
 
-@unittest.skipIf(disable_cpu_qmoe_tests, "Skipping qMoE cpu tests")
 class TestSwigluQMoECPU(unittest.TestCase):
-    @parameterized.expand(swiglu_test_cases)
-    def test_swiglu_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits):
+    @parameterized.expand(with_mlas_q4_mode(swiglu_test_cases))
+    def test_swiglu_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits, enable_mlas_q4_gemm):
         # Create unique seed based on test parameters to ensure different inputs for each test
         base_seed = 1000  # Different base seed from regular MoE tests
         param_hash = hash((batch_size, sequence_length, quant_bits))
@@ -1574,10 +1613,10 @@ def test_swiglu_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits):
         self.assertFalse(torch.isnan(torch_result).any())
         self.assertFalse(torch.isinf(torch_result).any())
 
-        swiglu_moe.parity_check()
+        run_parity_with_mlas_q4_mode(swiglu_moe.parity_check, enable_mlas_q4_gemm)
 
-    @parameterized.expand(swiglu_test_cases)
-    def test_swiglu_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits):
+    @parameterized.expand(with_mlas_q4_mode(swiglu_test_cases))
+    def test_swiglu_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits, enable_mlas_q4_gemm):
         base_seed = 1100
         param_hash = hash((batch_size, sequence_length, quant_bits))
         unique_seed = base_seed + abs(param_hash) % 1000
@@ -1599,10 +1638,12 @@ def test_swiglu_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, qu
             onnx_dtype=TensorProto.FLOAT,
             use_asymmetric_quant=True,
         )
-        swiglu_moe.parity_check()
+        run_parity_with_mlas_q4_mode(swiglu_moe.parity_check, enable_mlas_q4_gemm)
 
-    @parameterized.expand(swiglu_blockwise_test_cases)
-    def test_swiglu_qmoe_blockwise_parity_cpu(self, batch_size, sequence_length, quant_bits, block_size):
+    @parameterized.expand(with_mlas_q4_mode(swiglu_blockwise_test_cases))
+    def test_swiglu_qmoe_blockwise_parity_cpu(
+        self, batch_size, sequence_length, quant_bits, block_size, enable_mlas_q4_gemm
+    ):
         torch.manual_seed(42)
         numpy.random.seed(42)
 
@@ -1630,10 +1671,12 @@ def test_swiglu_qmoe_blockwise_parity_cpu(self, batch_size, sequence_length, qua
         self.assertFalse(torch.isnan(torch_result).any())
         self.assertFalse(torch.isinf(torch_result).any())
 
-        swiglu_moe.parity_check()
+        run_parity_with_mlas_q4_mode(swiglu_moe.parity_check, enable_mlas_q4_gemm)
 
-    @parameterized.expand(swiglu_blockwise_test_cases)
-    def test_swiglu_qmoe_blockwise_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits, block_size):
+    @parameterized.expand(with_mlas_q4_mode(swiglu_blockwise_test_cases))
+    def test_swiglu_qmoe_blockwise_asymmetric_parity_cpu(
+        self, batch_size, sequence_length, quant_bits, block_size, enable_mlas_q4_gemm
+    ):
         torch.manual_seed(43)
         numpy.random.seed(43)
 
@@ -1651,7 +1694,7 @@ def test_swiglu_qmoe_blockwise_asymmetric_parity_cpu(self, batch_size, sequence_
             block_size=block_size,
             use_asymmetric_quant=True,
         )
-        swiglu_moe.parity_check()
+        run_parity_with_mlas_q4_mode(swiglu_moe.parity_check, enable_mlas_q4_gemm)
 
 
 @unittest.skipIf(True, "Skipping QMoE CPU benchmark tests")
@@ -1660,9 +1703,6 @@ class TestQMoESwiGLUBenchmark(unittest.TestCase):
 
     def test_qmoe_swiglu_throughput_benchmark(self):
         """Comprehensive throughput benchmark for QMoE SwiGLU across different configurations."""
-        if disable_cpu_qmoe_tests:
-            self.skipTest("QMoE CPU tests disabled")
-
         print("\n=== QMoE SwiGLU Throughput Benchmark ===")
 
         # Test configurations: (name, hidden_size, intermediate_size, num_experts, top_k, quant_bits)

From 948e629076d6a5729eaa761ba06792341d34ad89 Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Thu, 26 Feb 2026 12:02:07 +0000
Subject: [PATCH 17/21] Fix SkipLayerNorm fusion incorrectly applied when
 gamma/beta are not 1D (#27459)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Description

The `SkipLayerNormFusion` optimizer skips fusion when the
`LayerNormalization` gamma or beta inputs are not 1D tensors (e.g. shape
`[1, 1, hidden_size]`). The `SkipLayerNormalization` kernel strictly
requires 1D gamma/beta, so fusing without this check caused a hard
runtime error.

- **`skip_layer_norm_fusion.cc`**: After matching the Add+LayerNorm
pattern, check that gamma (and beta if present) have exactly 1 dimension
before proceeding with fusion. If shape info is unavailable (dynamic),
fusion is allowed and runtime validation takes over.
- **`graph_transform_test_layernorm.cc`**: Added
`SkipLayerNormFusion_3DGamma_NoFusion` test — builds a graph with `Add +
LayerNormalization` where gamma/beta are `[1, 1, 4]` and asserts no
`SkipLayerNormalization` node is created.

### Motivation and Context

Models with residual connections followed by `LayerNormalization` where
the scale/bias tensors carry extra batch/sequence dimensions (e.g.
exported as `[1, 1, hidden_size]` rather than `[hidden_size]`) would
trigger fusion and then fail at runtime:

```
Non-zero status code returned while running SkipLayerNormalization node.
Status Message: gamma is expected to have 1 dimension, got 3
```

The error only appeared with 3D inputs and disappeared at
`ORT_ENABLE_BASIC` optimization level (which disables the fusion),
confirming the optimizer as the source of the regression.

<!-- START COPILOT CODING AGENT TIPS -->
---

💬 We'd love your input! Share your thoughts on Copilot coding agent in
our [2 minute survey](https://gh.io/copilot-coding-agent-survey).

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: tianleiwu <30328909+tianleiwu@users.noreply.github.com>
---
 .../core/optimizer/skip_layer_norm_fusion.cc  | 15 +++++++++
 .../graph_transform_test_layernorm.cc         | 31 +++++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc b/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc
index 655364357999a..3727ac0918115 100644
--- a/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc
+++ b/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc
@@ -254,6 +254,21 @@ Status SkipLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_le
       continue;
     }
 
+    // SkipLayerNormalization kernel requires gamma and beta to be 1D.
+    // Skip fusion if gamma or beta have more than 1 dimension.
+    const NodeArg* gamma_arg = ln_node.MutableInputDefs()[1];
+    const TensorShapeProto* gamma_shape = gamma_arg->Shape();
+    if (gamma_shape != nullptr && gamma_shape->dim_size() != 1) {
+      continue;
+    }
+    if (ln_node.MutableInputDefs().size() > 2) {
+      const NodeArg* beta_arg = ln_node.MutableInputDefs()[2];
+      const TensorShapeProto* beta_shape = beta_arg->Shape();
+      if (beta_shape != nullptr && beta_shape->dim_size() != 1) {
+        continue;
+      }
+    }
+
     NodeArg beta_place_holder("", nullptr);
 
     // Get the inputs for the new SkipLayerNormalization node.
diff --git a/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc b/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc
index 0afb836192b0a..4615b6a57b558 100644
--- a/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc
@@ -638,6 +638,37 @@ TEST_F(GraphTransformationTests, SkipLayerNormFusionTest) {
   TestSkipLayerNormFusion(MODEL_FOLDER "fusion/skip_layer_norm_format3_graph_output.onnx", 1, 1, 0, 0, logger_.get());
 }
 
+// SkipLayerNorm fusion should not be applied when gamma/beta have more than 1 dimension,
+// because the SkipLayerNormalization kernel requires 1D gamma/beta.
+TEST_F(GraphTransformationTests, SkipLayerNormFusion_3DGamma_NoFusion) {
+  auto build_test_case = [](ModelTestBuilder& builder) {
+    // Inputs: A and B are 3D [16, 32, 4]
+    auto* input_a = builder.MakeInput<float>({16, 32, 4}, -1.0f, 1.0f);
+    auto* input_b = builder.MakeInput<float>({16, 32, 4}, -1.0f, 1.0f);
+    // gamma and beta have 3D shape [1, 1, 4] (not 1D)
+    auto* gamma = builder.MakeInitializer<float>({1, 1, 4}, {1.0f, 2.0f, 3.0f, 4.0f});
+    auto* beta = builder.MakeInitializer<float>({1, 1, 4}, {0.1f, 0.2f, 0.3f, 0.4f});
+    auto* add_out = builder.MakeIntermediate();
+    auto* ln_out = builder.MakeOutput();
+
+    builder.AddNode("Add", {input_a, input_b}, {add_out});
+    builder.AddNode("LayerNormalization", {add_out, gamma, beta}, {ln_out})
+        .AddAttribute("axis", static_cast<int64_t>(-1));
+  };
+
+  auto post_graph_checker = [](Graph& graph) {
+    // SkipLayerNormalization should NOT have been created because gamma/beta are 3D.
+    TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Add"] == 1);
+    TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["LayerNormalization"] == 1);
+    TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["com.microsoft.SkipLayerNormalization"] == 0);
+    return Status::OK();
+  };
+
+  ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 17, *logger_,
+                                        std::make_unique<SkipLayerNormFusion>(),
+                                        TransformerLevel::Level2, 1, nullptr, post_graph_checker));
+}
+
 TEST_F(GraphTransformationTests, GroupQueryAttentionFusionTest) {
   TestGQAFusion(MODEL_FOLDER "fusion/gqa_fusion_quantized_simple.onnx", 1, 0, logger_.get());
   TestGQAFusion(MODEL_FOLDER "fusion/gqa_fusion_different_head_sizes.onnx", 0, 1, logger_.get());

From 8f3b84f7396aaeb3db28bc8ece17264cfb456464 Mon Sep 17 00:00:00 2001
From: Tianlei WU <tlwu@microsoft.com>
Date: Thu, 19 Feb 2026 14:08:10 -0800
Subject: [PATCH 18/21] Increase ios build timeout to 360 minutes

---
 .../templates/stages/mac-ios-packaging-build-stage.yml          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml
index 7d6e272533696..8303547a47566 100644
--- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml
@@ -22,7 +22,7 @@ stages:
       buildSettingsFile: "tools/ci_build/github/apple/default_full_apple_framework_build_settings.json"
       cPodName: onnxruntime-c
       objcPodName: onnxruntime-objc
-    timeoutInMinutes: 270
+    timeoutInMinutes: 360
     templateContext:
       outputs:
       - output: pipelineArtifact

From 6a2cd627e8b0e2226598284de2f30f3fa08b4eb8 Mon Sep 17 00:00:00 2001
From: Erik <erscor@microsoft.com>
Date: Thu, 19 Feb 2026 15:13:33 -0500
Subject: [PATCH 19/21] Propagate parameters correctly

---
 tools/ci_build/github/azure-pipelines/dml-nuget-packaging.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/ci_build/github/azure-pipelines/dml-nuget-packaging.yml b/tools/ci_build/github/azure-pipelines/dml-nuget-packaging.yml
index 3cf28655c36e7..0b63a4f5b83c1 100644
--- a/tools/ci_build/github/azure-pipelines/dml-nuget-packaging.yml
+++ b/tools/ci_build/github/azure-pipelines/dml-nuget-packaging.yml
@@ -111,3 +111,4 @@ extends:
     - template: stages/nuget_dml_packaging_stage.yml
       parameters:
         DoEsrp: ${{ parameters.DoEsrp }}
+        IsReleaseBuild: ${{ parameters.IsReleaseBuild }}

From f11112c3664a6336571fdfa646ed42aed3dbc972 Mon Sep 17 00:00:00 2001
From: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 26 Feb 2026 18:25:02 -0800
Subject: [PATCH 20/21] Build Windows ARM64X binaries as part of packaging
 pipeline (#27316) <!-- Describe your changes. -->

- Add ARM64X build to packaging pipeline. An additional zip archive
artifact with the ARM64X binaries will be produced.
- Add basic C++ sample program.
- Add binary archive tests using the C++ sample program to package test
pipeline.
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

Address request for ARM64X binaries.
Add testing of binary archives to package test pipeline.
---
 samples/cxx/CMakeLists.txt                    |  50 ++++++
 samples/cxx/README.md                         |  92 ++++++++++
 samples/cxx/add_model.onnx                    | Bin 0 -> 100 bytes
 samples/cxx/generate_model.py                 |  42 +++++
 samples/cxx/main.cc                           | 170 ++++++++++++++++++
 .../c-api-noopenmp-test-pipelines.yml         |  54 ++++++
 .../qnn-ep-nuget-packaging-pipeline.yml       |   1 -
 .../templates/build-win-arm64x-steps.yml      |  28 +++
 ...-package-and-publish-steps-windows-qnn.yml | 141 ---------------
 ...acts-package-and-publish-steps-windows.yml |  14 +-
 .../azure-pipelines/templates/c-api-cpu.yml   |  14 ++
 .../azure-pipelines/templates/qnn-ep-win.yml  |  20 +--
 .../templates/set-variable.yml                |  32 ++++
 .../templates/test-binary-archive-stage.yml   | 121 +++++++++++++
 .../azure-pipelines/templates/win-ci.yml      |  40 ++++-
 15 files changed, 657 insertions(+), 162 deletions(-)
 create mode 100644 samples/cxx/CMakeLists.txt
 create mode 100644 samples/cxx/README.md
 create mode 100644 samples/cxx/add_model.onnx
 create mode 100644 samples/cxx/generate_model.py
 create mode 100644 samples/cxx/main.cc
 create mode 100644 tools/ci_build/github/azure-pipelines/templates/build-win-arm64x-steps.yml
 delete mode 100644 tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows-qnn.yml
 create mode 100644 tools/ci_build/github/azure-pipelines/templates/set-variable.yml
 create mode 100644 tools/ci_build/github/azure-pipelines/templates/test-binary-archive-stage.yml

diff --git a/samples/cxx/CMakeLists.txt b/samples/cxx/CMakeLists.txt
new file mode 100644
index 0000000000000..875e37c64eda2
--- /dev/null
+++ b/samples/cxx/CMakeLists.txt
@@ -0,0 +1,50 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+cmake_minimum_required(VERSION 3.28)
+
+project(onnxruntime_sample CXX)
+
+set(CMAKE_CXX_STANDARD 20)
+
+foreach(VAR IN ITEMS ORT_LIBRARY_DIR ORT_HEADER_DIR)
+    if (NOT DEFINED ${VAR})
+        message(FATAL_ERROR "Required variable ${VAR} is not set. "
+            "Set ORT_LIBRARY_DIR to the ONNX Runtime lib directory and "
+            "ORT_HEADER_DIR to the ONNX Runtime include directory.")
+    endif()
+endforeach()
+
+# Resolve to absolute paths
+get_filename_component(ORT_LIBRARY_DIR "${ORT_LIBRARY_DIR}" ABSOLUTE)
+get_filename_component(ORT_HEADER_DIR "${ORT_HEADER_DIR}" ABSOLUTE)
+
+#
+# onnxruntime_sample_program
+#
+block()
+add_executable(onnxruntime_sample_program)
+
+target_sources(onnxruntime_sample_program PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/main.cc)
+
+target_include_directories(onnxruntime_sample_program PRIVATE ${ORT_HEADER_DIR})
+
+target_link_directories(onnxruntime_sample_program PRIVATE ${ORT_LIBRARY_DIR})
+target_link_libraries(onnxruntime_sample_program PRIVATE onnxruntime)
+
+# Copy ONNX Runtime shared libraries next to the executable.
+# Collect shared library files from the ORT library directory based on platform.
+if (WIN32)
+    file(GLOB ORT_SHARED_LIBS "${ORT_LIBRARY_DIR}/*.dll")
+elseif (APPLE)
+    file(GLOB ORT_SHARED_LIBS "${ORT_LIBRARY_DIR}/*.dylib")
+else()
+    file(GLOB ORT_SHARED_LIBS "${ORT_LIBRARY_DIR}/*.so" "${ORT_LIBRARY_DIR}/*.so.*")
+endif()
+
+add_custom_command(TARGET onnxruntime_sample_program POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different
+        ${ORT_SHARED_LIBS}
+        $<TARGET_FILE_DIR:onnxruntime_sample_program>
+)
+endblock()
diff --git a/samples/cxx/README.md b/samples/cxx/README.md
new file mode 100644
index 0000000000000..1904c082cef7a
--- /dev/null
+++ b/samples/cxx/README.md
@@ -0,0 +1,92 @@
+# ONNX Runtime C++ Sample
+
+A minimal C++ program demonstrating basic ONNX Runtime inference. It loads an ONNX model that adds two float tensors (`C = A + B`), runs inference, and verifies the result.
+
+## Prerequisites
+
+- CMake 3.28 or later
+- C++20 compatible compiler (e.g., Visual Studio 2022)
+- An ONNX Runtime release package (download from [GitHub releases](https://github.com/microsoft/onnxruntime/releases))
+- For model generation:
+  - Python with the `onnx` package
+
+## Directory Structure
+
+```
+samples/cxx/
+├── CMakeLists.txt         # Build configuration
+├── main.cc                # Sample program source
+├── add_model.onnx         # ONNX model (C = A + B)
+├── generate_model.py      # Script to generate the ONNX model
+└── README.md              # This file
+```
+
+## Steps
+
+### 1. Extract the ONNX Runtime package
+
+Download and extract an ONNX Runtime release archive. For example:
+
+```
+tar -xf onnxruntime-win-x64-1.25.0.zip
+```
+
+This creates a directory like `onnxruntime-win-x64-1.25.0/` containing `include/` and `lib/` subdirectories.
+
+### 2. [Optional] Generate the ONNX model
+
+```
+cd samples/cxx
+pip install onnx
+python generate_model.py
+```
+
+This creates `add_model.onnx` in the current directory.
+
+### 3. Configure and build
+
+From the `samples/cxx` directory:
+
+**Windows:**
+```
+cmake -S . -B build ^
+    -DORT_HEADER_DIR:PATH=path\to\onnxruntime-win-x64-1.25.0\include ^
+    -DORT_LIBRARY_DIR:PATH=path\to\onnxruntime-win-x64-1.25.0\lib
+cmake --build build --config Release
+```
+
+**Linux / macOS:**
+```
+cmake -S . -B build \
+    -DORT_HEADER_DIR:PATH=path/to/onnxruntime-linux-x64-1.25.0/include \
+    -DORT_LIBRARY_DIR:PATH=path/to/onnxruntime-linux-x64-1.25.0/lib
+cmake --build build --config Release
+```
+
+Adjust the paths to match your extracted package name and location.
+
+The build automatically copies the ONNX Runtime shared libraries next to the executable.
+
+#### CMake Variables
+
+| Variable | Description |
+|---|---|
+| `ORT_HEADER_DIR` | Path to the ONNX Runtime `include` directory |
+| `ORT_LIBRARY_DIR` | Path to the ONNX Runtime `lib` directory |
+
+### 4. Run
+
+**Windows:**
+```
+build\Release\onnxruntime_sample_program.exe
+```
+
+**Linux / macOS:**
+```
+./build/onnxruntime_sample_program
+```
+
+You can also pass a model path as an argument:
+```
+onnxruntime_sample_program path/to/add_model.onnx
+```
diff --git a/samples/cxx/add_model.onnx b/samples/cxx/add_model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..36308c1372a2209a204da40511e9fcfcf510921f
GIT binary patch
literal 100
zcmd<!wTj~6<6?B=VssK>bXH<^Oi2;qOiW3MPcKR=$cPdKN(u3C@o+E-ad0tlFaj|%
VNYDvgFbOE=j4tTJ!o?uK3jnf}3dR5c

literal 0
HcmV?d00001

diff --git a/samples/cxx/generate_model.py b/samples/cxx/generate_model.py
new file mode 100644
index 0000000000000..9ac70ab29deb4
--- /dev/null
+++ b/samples/cxx/generate_model.py
@@ -0,0 +1,42 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+"""Generate a simple ONNX model that computes C = A + B.
+
+Inputs:
+  A : float tensor of shape [1, 3]
+  B : float tensor of shape [1, 3]
+
+Output:
+  C : float tensor of shape [1, 3]
+
+Usage:
+  pip install onnx
+  python generate_model.py
+"""
+
+from onnx import TensorProto, helper, save_model
+from onnx.checker import check_model
+
+
+def main():
+    # Define inputs and output
+    a = helper.make_tensor_value_info("A", TensorProto.FLOAT, [1, 3])
+    b = helper.make_tensor_value_info("B", TensorProto.FLOAT, [1, 3])
+    c = helper.make_tensor_value_info("C", TensorProto.FLOAT, [1, 3])
+
+    # Create the Add node
+    add_node = helper.make_node("Add", inputs=["A", "B"], outputs=["C"])
+
+    # Build the graph and model
+    graph = helper.make_graph([add_node], "add_graph", [a, b], [c])
+    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+
+    # Validate and save
+    check_model(model)
+    save_model(model, "add_model.onnx")
+    print("Saved add_model.onnx")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/samples/cxx/main.cc b/samples/cxx/main.cc
new file mode 100644
index 0000000000000..4e31e033ab8c7
--- /dev/null
+++ b/samples/cxx/main.cc
@@ -0,0 +1,170 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// Sample program demonstrating basic ONNX Runtime C++ API usage.
+// Loads a simple ONNX model (C = A + B), runs inference, and prints the result.
+//
+// Generate the model first:  python generate_model.py
+
+#include <array>
+#include <cstdint>
+#include <cstdlib>
+#include <filesystem>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "onnxruntime_cxx_api.h"
+
+// Throw std::runtime_error if `condition` is false. Includes file and line info.
+#define THROW_IF_NOT(condition)                                  \
+  do {                                                           \
+    if (!(condition)) {                                          \
+      throw std::runtime_error(std::string(__FILE__) + ":" +     \
+                               std::to_string(__LINE__) + ": " + \
+                               "check failed: " #condition);     \
+    }                                                            \
+  } while (0)
+
+int main(int argc, char* argv[]) {
+  try {
+    // -----------------------------------------------------------------------
+    // 1. Initialize the ONNX Runtime environment
+    // -----------------------------------------------------------------------
+    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "onnxruntime_sample");
+    std::cout << "ONNX Runtime version: " << Ort::GetVersionString() << "\n\n";
+
+    // -----------------------------------------------------------------------
+    // 2. Create session options (could add execution providers here)
+    // -----------------------------------------------------------------------
+    Ort::SessionOptions session_options;
+    session_options.SetIntraOpNumThreads(1);
+    session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_BASIC);
+
+    // -----------------------------------------------------------------------
+    // 3. Load the ONNX model from a file
+    //    Generate with:  python generate_model.py
+    // -----------------------------------------------------------------------
+    const std::filesystem::path model_path = (argc > 1) ? argv[1] : "add_model.onnx";
+    std::cout << "Loading model: " << model_path.string() << "\n";
+
+    Ort::Session session(env, model_path.native().c_str(), session_options);
+
+    // -----------------------------------------------------------------------
+    // 4. Query model metadata: input/output names and shapes
+    // -----------------------------------------------------------------------
+    Ort::AllocatorWithDefaultOptions allocator;
+
+    const size_t num_inputs = session.GetInputCount();
+    const size_t num_outputs = session.GetOutputCount();
+    std::cout << "Model inputs:  " << num_inputs << "\n";
+    std::cout << "Model outputs: " << num_outputs << "\n";
+
+    // Collect input/output names
+    std::vector<std::string> input_names;
+    std::vector<std::string> output_names;
+
+    for (size_t i = 0; i < num_inputs; ++i) {
+      auto name = session.GetInputNameAllocated(i, allocator);
+      std::cout << "  Input  " << i << ": " << name.get() << "\n";
+      input_names.emplace_back(name.get());
+    }
+    for (size_t i = 0; i < num_outputs; ++i) {
+      auto name = session.GetOutputNameAllocated(i, allocator);
+      std::cout << "  Output " << i << ": " << name.get() << "\n";
+      output_names.emplace_back(name.get());
+    }
+    std::cout << "\n";
+
+    // -----------------------------------------------------------------------
+    // 5. Prepare input tensors
+    // -----------------------------------------------------------------------
+    // Our model expects two float tensors of shape [1, 3].
+    constexpr int64_t batch_size = 1;
+    constexpr int64_t num_elements = 3;
+    const std::array<int64_t, 2> input_shape = {batch_size, num_elements};
+
+    std::array<float, num_elements> input_a = {1.0f, 2.0f, 3.0f};
+    std::array<float, num_elements> input_b = {4.0f, 5.0f, 6.0f};
+
+    auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
+
+    auto tensor_a = Ort::Value::CreateTensor<float>(
+        memory_info, input_a.data(), input_a.size(),
+        input_shape.data(), input_shape.size());
+
+    auto tensor_b = Ort::Value::CreateTensor<float>(
+        memory_info, input_b.data(), input_b.size(),
+        input_shape.data(), input_shape.size());
+
+    THROW_IF_NOT(tensor_a.IsTensor());
+    THROW_IF_NOT(tensor_b.IsTensor());
+
+    // The Run() API expects arrays of C strings for input/output names.
+    std::vector<const char*> input_name_ptrs;
+    std::vector<const char*> output_name_ptrs;
+    for (const auto& n : input_names) input_name_ptrs.push_back(n.c_str());
+    for (const auto& n : output_names) output_name_ptrs.push_back(n.c_str());
+
+    std::array<Ort::Value, 2> input_tensors{std::move(tensor_a), std::move(tensor_b)};
+
+    // -----------------------------------------------------------------------
+    // 6. Run inference
+    // -----------------------------------------------------------------------
+    std::cout << "Running inference...\n";
+
+    Ort::RunOptions run_options;
+    auto output_tensors = session.Run(
+        run_options,
+        input_name_ptrs.data(), input_tensors.data(), input_tensors.size(),
+        output_name_ptrs.data(), output_name_ptrs.size());
+
+    // -----------------------------------------------------------------------
+    // 7. Process output
+    // -----------------------------------------------------------------------
+    THROW_IF_NOT(!output_tensors.empty() && output_tensors[0].IsTensor());
+
+    const float* output_data = output_tensors[0].GetTensorData<float>();
+    auto type_info = output_tensors[0].GetTensorTypeAndShapeInfo();
+    size_t output_count = type_info.GetElementCount();
+
+    std::cout << "\nInputs:\n";
+    std::cout << "  A = [";
+    for (size_t i = 0; i < input_a.size(); ++i) {
+      std::cout << (i ? ", " : "") << input_a[i];
+    }
+    std::cout << "]\n";
+
+    std::cout << "  B = [";
+    for (size_t i = 0; i < input_b.size(); ++i) {
+      std::cout << (i ? ", " : "") << input_b[i];
+    }
+    std::cout << "]\n";
+
+    std::cout << "\nOutput (A + B):\n";
+    std::cout << "  C = [";
+    for (size_t i = 0; i < output_count; ++i) {
+      std::cout << (i ? ", " : "") << output_data[i];
+    }
+    std::cout << "]\n";
+
+    // Verify correctness
+    bool correct = true;
+    for (size_t i = 0; i < num_elements; ++i) {
+      if (output_data[i] != input_a[i] + input_b[i]) {
+        correct = false;
+        break;
+      }
+    }
+    std::cout << "\nResult: " << (correct ? "PASS" : "FAIL") << "\n";
+
+    return correct ? EXIT_SUCCESS : EXIT_FAILURE;
+  } catch (const Ort::Exception& e) {
+    std::cerr << "ONNX Runtime error: " << e.what() << "\n";
+    return EXIT_FAILURE;
+  } catch (const std::exception& e) {
+    std::cerr << "Error: " << e.what() << "\n";
+    return EXIT_FAILURE;
+  }
+}
diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml
index 8d96c1ae99e0a..5ddac928b32d3 100644
--- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml
+++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml
@@ -160,7 +160,61 @@ stages:
     NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu.Linux'
     CudaVersion: 12.8
 
+- template: templates/test-binary-archive-stage.yml
+  parameters:
+    artifactName: onnxruntime-linux-aarch64
+    artifactPipelineResource: build
+    previousStageName: Setup
+    platform: linux-aarch64
+    agentPool: onnxruntime-linux-ARM64-CPU-2019
+
+- template: templates/test-binary-archive-stage.yml
+  parameters:
+    artifactName: onnxruntime-linux-x64
+    artifactPipelineResource: build
+    previousStageName: Setup
+    platform: linux-x64
+    agentPool: onnxruntime-Ubuntu2204-AMD-CPU
 
+- template: templates/test-binary-archive-stage.yml
+  parameters:
+    artifactName: onnxruntime-osx-arm64
+    artifactPipelineResource: build
+    previousStageName: Setup
+    platform: osx-arm64
+    agentPool:
+      name: AcesShared
+      os: macOS
+      demands:
+      - ImageOverride -equals ACES_VM_SharedPool_Sequoia
+    agentSetupSteps:
+    - template: templates/setup-build-tools.yml
+      parameters:
+        host_cpu_arch: arm64
+
+- template: templates/test-binary-archive-stage.yml
+  parameters:
+    artifactName: onnxruntime-win-arm64
+    artifactPipelineResource: build
+    previousStageName: Setup
+    platform: win-arm64
+    agentPool: onnxruntime-qnn-windows-vs-2022-arm64
+
+- template: templates/test-binary-archive-stage.yml
+  parameters:
+    artifactName: onnxruntime-win-arm64x
+    artifactPipelineResource: build
+    previousStageName: Setup
+    platform: win-arm64x
+    agentPool: onnxruntime-qnn-windows-vs-2022-arm64
+
+- template: templates/test-binary-archive-stage.yml
+  parameters:
+    artifactName: onnxruntime-win-x64
+    artifactPipelineResource: build
+    previousStageName: Setup
+    platform: win-x64
+    agentPool: onnxruntime-Win-CPU-VS2022-Latest
 
 # Run GPU tests.
 - stage: Windows_Packaging_cuda_Testing
diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml
index 0481a356cf9a1..2a8e222a9e192 100644
--- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml
@@ -96,6 +96,5 @@ extends:
         QnnSdk: ${{ parameters.QnnSdk }}
         IsReleaseBuild: ${{ parameters.IsReleaseBuild }}
         DoEsrp: ${{ parameters.DoEsrp }}
-        ArtifactName: 'drop-nuget-qnn-arm64x'
         StageName: 'OnnxRuntime_QNN_Nuget_Win_Arm64x'
         build_config: ${{ parameters.build_config }}
diff --git a/tools/ci_build/github/azure-pipelines/templates/build-win-arm64x-steps.yml b/tools/ci_build/github/azure-pipelines/templates/build-win-arm64x-steps.yml
new file mode 100644
index 0000000000000..50e7cbb13d6e1
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/templates/build-win-arm64x-steps.yml
@@ -0,0 +1,28 @@
+# Runs a Windows ARM64X build in `buildDirectory`.
+
+parameters:
+  buildDirectory: '$(Build.BinariesDirectory)'
+  additionalBuildPyArgs: ''
+
+steps:
+- task: PythonScript@0
+  displayName: 'Build arm64 project for arm64x - generate the def & lib file for next build'
+  inputs:
+    scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
+    arguments: >
+      ${{ parameters.additionalBuildPyArgs }}
+      --build_shared_lib
+      --arm64
+      --buildasx
+      --build_dir="${{ parameters.buildDirectory }}/arm64"
+
+- task: PythonScript@0
+  displayName: 'Build arm64ec project for arm64x - the real arm64x'
+  inputs:
+    scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
+    arguments: >
+      ${{ parameters.additionalBuildPyArgs }}
+      --build_shared_lib
+      --arm64ec
+      --buildasx
+      --build_dir="${{ parameters.buildDirectory }}"
diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows-qnn.yml
deleted file mode 100644
index ab3e0ebaab39a..0000000000000
--- a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows-qnn.yml
+++ /dev/null
@@ -1,141 +0,0 @@
-# sets up common build tools for the windows build machines before build
-
-parameters:
-- name: DoEsrp
-  displayName: Run code sign tasks? Must be true if you are doing an Onnx Runtime release.
-  type: boolean
-  default: true
-
-- name: buildConfig
-  displayName: buildConfig
-  type: string
-  default: 'RelWithDebInfo'
-
-- name: artifactName
-  displayName: artifactName，like 'onnxruntime-win-x64-1.6.0'
-  type: string
-  default: ''
-
-- name: artifactNameNoVersionString
-  type: string
-  default: 'onnxruntime-win-x64'
-
-- name: commitId
-  displayName: commitId
-  type: string
-  default: ''
-
-- name: trtEnabled
-  displayName: Include TRT EP libraries?
-  type: boolean
-  default: true
-
-steps:
-    - ${{if or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-'))}}:
-      - template: publish-symbolrequestprod-api.yml
-        parameters:
-          ${{if eq(variables['Build.SourceBranch'], 'refs/heads/main')}}:
-            symbolExpiryTime: 60
-          includePublicSymbolServer: true
-          symbolsArtifactName: ${{parameters.artifactNameNoVersionString}}
-          symbolsVersion: $(Build.BuildId)
-          symbolProject: 'ONNX Runtime'
-          subscription: 'OnnxrunTimeCodeSign_20240611'
-          searchPattern: |
-            $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime.pdb
-            $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_*.pdb
-
-
-    - task: CmdLine@2
-      displayName: 'Copy build artifacts for zipping'
-      inputs:
-        script: |
-          mkdir $(Build.BinariesDirectory)\${{parameters.artifactName}}
-          mkdir $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          mkdir $(Build.BinariesDirectory)\${{parameters.artifactName}}\include
-
-          if exist $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_cuda.dll (
-            echo "cuda context headers copied"
-            mkdir $(Build.BinariesDirectory)\${{parameters.artifactName}}\include\core\providers\cuda
-            copy $(Build.SourcesDirectory)\include\onnxruntime\core\providers\resource.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include\core\providers
-            copy $(Build.SourcesDirectory)\include\onnxruntime\core\providers\custom_op_context.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include\core\providers
-            copy $(Build.SourcesDirectory)\include\onnxruntime\core\providers\cuda\cuda_context.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include\core\providers\cuda
-            copy $(Build.SourcesDirectory)\include\onnxruntime\core\providers\cuda\cuda_resource.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include\core\providers\cuda
-          )
-
-          echo "Directories created"
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_shared.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_shared.lib $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_shared.pdb $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_cuda.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_cuda.pdb $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_cuda.lib $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-
-          # Copy WebGPU dependencies if required
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\dxcompiler.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\dxil.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-
-          # Copy QNN dependencies if required
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_qnn.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\libQnnHtp*.so $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib /Y
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\libqnnhtp*.cat $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib /Y
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnCpu.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnGpu.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtp.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpPrepare.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV68Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV73Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV81Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSaver.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSystem.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\Qualcomm_LICENSE.pdf $(Build.BinariesDirectory)\${{parameters.artifactName}}
-
-          # copy trt ep libraries only when trt ep is enabled
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_tensorrt.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_tensorrt.pdb $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_tensorrt.lib $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime.pdb $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime.lib $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
-          copy $(Build.SourcesDirectory)\include\onnxruntime\core\session\onnxruntime_*.h  $(Build.BinariesDirectory)\${{parameters.artifactName}}\include
-          copy $(Build.SourcesDirectory)\include\onnxruntime\core\framework\provider_options.h  $(Build.BinariesDirectory)\${{parameters.artifactName}}\include
-          copy $(Build.SourcesDirectory)\include\onnxruntime\core\providers\cpu\cpu_provider_factory.h  $(Build.BinariesDirectory)\${{parameters.artifactName}}\include
-          copy $(Build.SourcesDirectory)\orttraining\orttraining\training_api\include\onnxruntime_training*.h  $(Build.BinariesDirectory)\${{parameters.artifactName}}\include
-
-          REM copy the README, license and TPN
-          copy $(Build.SourcesDirectory)\README.md $(Build.BinariesDirectory)\${{parameters.artifactName}}\README.md
-          copy $(Build.SourcesDirectory)\docs\Privacy.md $(Build.BinariesDirectory)\${{parameters.artifactName}}\Privacy.md
-          copy $(Build.SourcesDirectory)\LICENSE $(Build.BinariesDirectory)\${{parameters.artifactName}}\LICENSE
-          copy $(Build.SourcesDirectory)\ThirdPartyNotices.txt $(Build.BinariesDirectory)\${{parameters.artifactName}}\ThirdPartyNotices.txt
-          copy $(Build.SourcesDirectory)\VERSION_NUMBER $(Build.BinariesDirectory)\${{parameters.artifactName}}\VERSION_NUMBER
-          @echo ${{parameters.commitId}} > $(Build.BinariesDirectory)\${{parameters.artifactName}}\GIT_COMMIT_ID
-
-        workingDirectory: '$(Build.BinariesDirectory)\${{parameters.buildConfig}}'
-
-    - ${{ if eq(parameters.DoEsrp, true) }}:
-      - template: win-esrp-dll.yml
-        parameters:
-          FolderPath: '$(Build.BinariesDirectory)\${{parameters.artifactName}}'
-          DisplayName: 'ESRP - Sign Native dlls'
-          DoEsrp: ${{parameters.DoEsrp}}
-          Pattern: '*.dll,*.exe'
-
-    - task: DeleteFiles@1
-      displayName: 'Delete CodeSignSummary*.md'
-      inputs:
-        SourceFolder: '$(Build.BinariesDirectory)\${{parameters.artifactName}}'
-        Contents: 'CodeSignSummary*.md'
-
-    - task: ArchiveFiles@2
-      inputs:
-        rootFolderOrFile: '$(Build.BinariesDirectory)\${{parameters.artifactName}}'
-        includeRootFolder: true
-        archiveType: 'zip' # Options: zip, 7z, tar, wim
-        archiveFile: '$(Build.ArtifactStagingDirectory)\${{parameters.artifactName}}.zip'
-        replaceExistingArchive: true
-
-    - task: 1ES.PublishPipelineArtifact@1
-      inputs:
-        targetPath: '$(Build.ArtifactStagingDirectory)'
-        artifactName: '${{parameters.artifactNameNoVersionString}}'
diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml
index 28a1960aac27b..5f9dd5677e7bc 100644
--- a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml
@@ -12,7 +12,7 @@ parameters:
   default: 'RelWithDebInfo'
 
 - name: artifactName
-  displayName: artifactName，like 'onnxruntime-win-x64-1.6.0'
+  displayName: artifactName, like 'onnxruntime-win-x64-1.6.0'
   type: string
   default: ''
 
@@ -30,6 +30,11 @@ parameters:
   type: boolean
   default: true
 
+- name: publishArtifactStagingDirectory
+  displayName: Whether to publish the artifact staging directory as an artifact named `artifactNameNoVersionString`.
+  type: boolean
+  default: false
+
 steps:
     - ${{if or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-'))}}:
       - template: publish-symbolrequestprod-api.yml
@@ -89,6 +94,7 @@ steps:
           copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV81Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
           copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSaver.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
           copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSystem.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
+          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\Qualcomm_LICENSE.pdf $(Build.BinariesDirectory)\${{parameters.artifactName}}
 
           # copy trt ep libraries only when trt ep is enabled
           copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_tensorrt.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
@@ -133,3 +139,9 @@ steps:
         archiveType: 'zip' # Options: zip, 7z, tar, wim
         archiveFile: '$(Build.ArtifactStagingDirectory)\${{parameters.artifactName}}.zip'
         replaceExistingArchive: true
+
+    - ${{ if parameters.publishArtifactStagingDirectory }}:
+      - task: 1ES.PublishPipelineArtifact@1
+        inputs:
+          targetPath: '$(Build.ArtifactStagingDirectory)'
+          artifactName: '${{parameters.artifactNameNoVersionString}}'
diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
index a0f023325be04..448dbafcaaaac 100644
--- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
@@ -163,6 +163,20 @@ stages:
     PreReleaseVersionSuffixString: ${{ parameters.PreReleaseVersionSuffixString }}
     PreReleaseVersionSuffixNumber: ${{ parameters.PreReleaseVersionSuffixNumber }}
 
+- template: win-ci.yml
+  parameters:
+    DoEsrp: true
+    stage_name_suffix: CPU_arm64x_${{ parameters.BuildVariant }}
+    buildArch: x64
+    msbuildPlatform: arm64x
+    packageName: arm64x
+    buildparameter: ${{ parameters.AdditionalBuildFlags }} ${{ parameters.AdditionalWinBuildFlags}}
+    runTests: false
+    buildJava: false
+    buildNodejs: false
+    PreReleaseVersionSuffixString: ${{ parameters.PreReleaseVersionSuffixString }}
+    PreReleaseVersionSuffixNumber: ${{ parameters.PreReleaseVersionSuffixNumber }}
+
 - template: win-ci.yml
   parameters:
     DoEsrp: true
diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml
index 8a1c4f8a39316..7e176b67f6685 100644
--- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml
@@ -48,7 +48,7 @@ stages:
     variables:
       OrtPackageId: ${{ parameters.OrtNugetPackageId }}
       ReleaseVersionSuffix: $[stageDependencies.Setup.Set_Variables.outputs['Set_Release_Version_Suffix.ReleaseVersionSuffix']]
-      commonBuildArgs: '--skip_submodule_sync --build_shared_lib --client_package_build --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags ${{ parameters.AdditionalBuildArgs}}'
+      commonBuildArgs: '--skip_submodule_sync --build_shared_lib --client_package_build --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags --config ${{ parameters.build_config }} ${{ parameters.AdditionalBuildArgs}}'
 
     steps:
     - template: set-version-number-variables-step.yml
@@ -61,17 +61,10 @@ stages:
       parameters:
         QnnSDKVersion: ${{ parameters.QnnSdk }}
 
-    - task: PythonScript@0
-      displayName: 'Build arm64x project - generate the def & lib file for next build'
-      inputs:
-        scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-        arguments: ' --arm64 --buildasx --build_dir $(Build.BinariesDirectory)\arm64x --use_qnn --qnn_home $(QnnSDKRootDir) $(commonBuildArgs)'
-
-    - task: PythonScript@0
-      displayName: 'Build arm64ecx project - the real arm64x'
-      inputs:
-        scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-        arguments: ' --arm64ec --buildasx --build_dir $(Build.BinariesDirectory) --use_qnn --qnn_home $(QnnSDKRootDir) $(commonBuildArgs)'
+    - template: build-win-arm64x-steps.yml
+      parameters:
+        buildDirectory: '$(Build.BinariesDirectory)'
+        additionalBuildPyArgs: '$(commonBuildArgs) --use_qnn --qnn_home $(QnnSDKRootDir)'
 
     - task: CmdLine@2
       displayName: 'Print contents of binaries directory'
@@ -87,12 +80,13 @@ stages:
         Pattern: 'onnxruntime*.dll'
 
     - ${{ if eq(parameters.PublishArchive, true) }}:
-      - template: c-api-artifacts-package-and-publish-steps-windows-qnn.yml
+      - template: c-api-artifacts-package-and-publish-steps-windows.yml
         parameters:
           buildConfig: ${{ parameters.build_config }}
           artifactName: 'onnxruntime-win-arm64x-qnn'
           artifactNameNoVersionString: 'onnxruntime-win-arm64x-qnn'
           DoEsrp: ${{ parameters.DoEsrp }}
+          publishArtifactStagingDirectory: true
 
     - task: MSBuild@1
       displayName: 'Restore NuGet Packages and create project.assets.json'
diff --git a/tools/ci_build/github/azure-pipelines/templates/set-variable.yml b/tools/ci_build/github/azure-pipelines/templates/set-variable.yml
new file mode 100644
index 0000000000000..2cf49f2f067c2
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/templates/set-variable.yml
@@ -0,0 +1,32 @@
+# Sets an ADO pipeline variable.
+# See https://learn.microsoft.com/en-us/azure/devops/pipelines/process/set-variables-scripts
+
+parameters:
+- name: name
+  type: string
+
+- name: value
+  type: string
+
+steps:
+- task: PythonScript@0
+  displayName: 'Set variable - ${{ parameters.name }}'
+  inputs:
+    scriptSource: inline
+    script: |
+      import os
+
+      variable_name = os.getenv("VARIABLE_NAME")
+      variable_value = os.getenv("VARIABLE_VALUE")
+
+      if not variable_name.isidentifier():
+          raise ValueError(f"Variable name is not a valid identifier: '{variable_name}'")
+
+      if "\n" in variable_value:
+          raise ValueError(f"Variable value should not contain any newlines: '{variable_value}'")
+
+      print(f"Setting variable: {variable_name} = '{variable_value}'")
+      print(f"##vso[task.setvariable variable={variable_name}]{variable_value}")
+  env:
+    VARIABLE_NAME: ${{ parameters.name }}
+    VARIABLE_VALUE: ${{ parameters.value }}
diff --git a/tools/ci_build/github/azure-pipelines/templates/test-binary-archive-stage.yml b/tools/ci_build/github/azure-pipelines/templates/test-binary-archive-stage.yml
new file mode 100644
index 0000000000000..b9b9cdc6b0eb3
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/templates/test-binary-archive-stage.yml
@@ -0,0 +1,121 @@
+# Tests an ONNX Runtime binary archive produced by the packaging pipeline.
+
+parameters:
+- name: artifactName
+  type: string
+- name: artifactPipelineResource
+  type: string
+- name: previousStageName
+  type: string
+  default: ''
+- name: platform
+  type: string
+- name: agentPool
+  type: object
+- name: agentSetupSteps
+  type: stepList
+  default: []
+
+stages:
+- stage: Binary_Archive_Testing_${{ replace(parameters.platform, '-', '_') }}
+  ${{ if ne(parameters.previousStageName, '') }}:
+    dependsOn: ${{ parameters.previousStageName }}
+
+  jobs:
+  - job: Binary_Archive_Testing_${{ replace(parameters.platform, '-', '_') }}
+    pool: ${{ parameters.agentPool }}
+
+    variables:
+    - name: buildConfig
+      value: Release
+    - name: relativePathFromBuildToOutputDir
+      ${{ if startsWith(parameters.platform, 'win') }}:
+        value: "${{ variables['buildConfig'] }}"
+      ${{ else }}:
+        value: "."
+
+    steps:
+    - checkout: self
+      clean: true
+      submodules: none
+
+    - ${{ each agentSetupStep in parameters.agentSetupSteps }}:
+      - ${{ agentSetupStep }}
+
+    - download: ${{ parameters.artifactPipelineResource }}
+      artifact: ${{ parameters.artifactName }}
+      patterns: |
+        *.zip
+        *.tgz
+      displayName: Download binary archive for ${{ parameters.platform }}
+
+    # Extract the binary archive.
+    # The archive contains a top-level directory like onnxruntime-<platform>-<version>/.
+    # After extraction, set ORT_PACKAGE_DIR to the extracted directory.
+    - ${{ if startsWith(parameters.platform, 'win') }}:
+      - task: PowerShell@2
+        displayName: 'Extract binary archive'
+        inputs:
+          targetType: 'inline'
+          script: |
+            $artifactDir = "$(Pipeline.Workspace)/${{ parameters.artifactPipelineResource }}/${{ parameters.artifactName }}"
+            $archive = (Get-ChildItem -Path $artifactDir -Filter *.zip)[0].FullName
+            Write-Host "Extracting $archive"
+            Expand-Archive -Path $archive -DestinationPath $(Build.BinariesDirectory)
+            $extractedDir = (Get-ChildItem -Path $(Build.BinariesDirectory) -Directory | Where-Object { $_.Name -like "onnxruntime-*" })[0].FullName
+            Write-Host "Extracted to $extractedDir"
+            Write-Host "##vso[task.setvariable variable=ORT_PACKAGE_DIR]$extractedDir"
+
+    - ${{ else }}:
+      - bash: |
+          set -ex
+          artifact_dir="$(Pipeline.Workspace)/${{ parameters.artifactPipelineResource }}/${{ parameters.artifactName }}"
+          archive=$(find "$artifact_dir" -name '*.tgz' | head -1)
+          echo "Extracting $archive"
+          tar -xzf "$archive" -C $(Build.BinariesDirectory)
+          extracted_dir=$(find $(Build.BinariesDirectory) -maxdepth 1 -type d -name 'onnxruntime-*' | head -1)
+          echo "Extracted to $extracted_dir"
+
+          # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote.
+          set +x
+          echo "##vso[task.setvariable variable=ORT_PACKAGE_DIR]$extracted_dir"
+        displayName: 'Extract binary archive'
+
+    # Build and run the C++ sample using the extracted ONNX Runtime package.
+
+    - script: >
+        cmake
+        -S $(Build.SourcesDirectory)/samples/cxx
+        -B $(Build.BinariesDirectory)/sample_build
+        -DORT_HEADER_DIR:PATH=$(ORT_PACKAGE_DIR)/include
+        -DORT_LIBRARY_DIR:PATH=$(ORT_PACKAGE_DIR)/lib
+      displayName: 'Generate C++ sample build system'
+
+    - script: |
+        cmake --build $(Build.BinariesDirectory)/sample_build --config $(buildConfig)
+      displayName: 'Build C++ sample'
+
+    - script: >
+        $(Build.BinariesDirectory)/sample_build/$(relativePathFromBuildToOutputDir)/onnxruntime_sample_program
+        $(Build.SourcesDirectory)/samples/cxx/add_model.onnx
+      displayName: 'Run C++ sample'
+
+    # For win-arm64x, also build and run for ARM64EC.
+    - ${{ if eq(parameters.platform, 'win-arm64x') }}:
+      - script: >
+          cmake
+          -S $(Build.SourcesDirectory)/samples/cxx
+          -B $(Build.BinariesDirectory)/sample_build_arm64ec
+          -DORT_HEADER_DIR:PATH=$(ORT_PACKAGE_DIR)/include
+          -DORT_LIBRARY_DIR:PATH=$(ORT_PACKAGE_DIR)/lib
+          -A ARM64EC
+        displayName: 'Generate C++ sample build system (ARM64EC)'
+
+      - script: |
+          cmake --build $(Build.BinariesDirectory)/sample_build_arm64ec --config $(buildConfig)
+        displayName: 'Build C++ sample (ARM64EC)'
+
+      - script: >
+          $(Build.BinariesDirectory)/sample_build_arm64ec/$(relativePathFromBuildToOutputDir)/onnxruntime_sample_program
+          $(Build.SourcesDirectory)/samples/cxx/add_model.onnx
+        displayName: 'Run C++ sample (ARM64EC)'
diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
index cfb752ddc2b58..8a5584c111525 100644
--- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
@@ -177,12 +177,40 @@ stages:
 
       - script: python -m pip install -r $(Build.SourcesDirectory)\tools\ci_build\github\windows\python\requirements.txt
 
-      - task: PythonScript@0
-        displayName: 'Generate cmake config'
-        inputs:
-          scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-          arguments: '--parallel 16 --use_vcpkg --use_vcpkg_ms_internal_asset_cache --config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --build --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }} $(timeoutParameter) $(buildJavaParameter)'
-          workingDirectory: '$(Build.BinariesDirectory)'
+      - template: set-variable.yml
+        parameters:
+          name: commonBuildPyArgs
+          value: >-
+            --config RelWithDebInfo
+            --parallel
+            --use_vcpkg
+            --use_vcpkg_ms_internal_asset_cache
+            --use_binskim_compliant_compile_flags
+            --enable_lto
+            --disable_rtti
+            --skip_submodule_sync
+            --build_shared_lib
+            --update --build
+            --cmake_generator "$(VSGenerator)"
+            --enable_onnx_tests
+            $(TelemetryOption)
+            ${{ parameters.buildparameter }}
+            $(timeoutParameter)
+            $(buildJavaParameter)
+
+      - ${{ if eq(parameters.msbuildPlatform, 'arm64x') }}:
+        - template: build-win-arm64x-steps.yml
+          parameters:
+            buildDirectory: '$(Build.BinariesDirectory)'
+            additionalBuildPyArgs: '$(commonBuildPyArgs)'
+
+      - ${{ else }}:
+        - task: PythonScript@0
+          displayName: 'Generate build system and build'
+          inputs:
+            scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
+            arguments: '$(commonBuildPyArgs) --build_dir $(Build.BinariesDirectory)'
+            workingDirectory: '$(Build.BinariesDirectory)'
 
       # For CPU job, tests are run in the same machine as building
       - ${{ if eq(parameters.buildJava, 'true') }}:

From f40854986949e558f27a3f0d4dc7a678201adec7 Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Thu, 26 Feb 2026 20:30:07 -0800
Subject: [PATCH 21/21] [Patch] allows new memory info name for WebGPU (#27475)

### Description

allows new memory info name for WebGPU.

### Motivation and Context

This allows at least 1.24.3 works with future (1.25.x) WebGPU plugin DLL
---
 onnxruntime/core/framework/allocator.cc | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc
index a656abb098911..7648aaf8f9d33 100644
--- a/onnxruntime/core/framework/allocator.cc
+++ b/onnxruntime/core/framework/allocator.cc
@@ -237,7 +237,19 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
         OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::AMD, device_id),
         mem_type1);
   } else if (strcmp(name1, onnxruntime::WEBGPU_BUFFER) == 0 ||
-             strcmp(name1, onnxruntime::WEBNN_TENSOR) == 0) {
+             strcmp(name1, onnxruntime::WEBNN_TENSOR) == 0 ||
+             // PR #27207 (merged to main/1.25.x, not in 1.24.x) shortened the WebGPU/WebNN
+             // memory info names from "WebGPU_Buffer"/"WebNN_Tensor" to "WebGPU_Buf"/"WebNN_Ten"
+             // to enable Small String Optimization (SSO) on wasm32 (emscripten), where strings
+             // must be <= 10 chars for SSO.
+             //
+             // A WebGPU/WebNN plugin EP built against 1.25.x will use the new short names.
+             // Accept both old and new names here so that plugin EPs targeting either 1.24.x
+             // or 1.25.x can work with this 1.24.x runtime.
+             //
+             // See: https://github.com/microsoft/onnxruntime/pull/27207
+             strcmp(name1, "WebGPU_Buf") == 0 ||
+             strcmp(name1, "WebNN_Ten") == 0) {
     *out = new OrtMemoryInfo(
         name1, type,
         OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NONE, device_id),