diff --git a/.github/actions/macos-ci-setup/action.yml b/.github/actions/macos-ci-setup/action.yml index 0d60eeae8aee3..054676d301820 100644 --- a/.github/actions/macos-ci-setup/action.yml +++ b/.github/actions/macos-ci-setup/action.yml @@ -8,7 +8,7 @@ inputs: python_version: required: false type: string - default: "3.11" + default: "3.14" node_version: required: false type: string diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index 9aa8418c55a40..dd8cbfdc71a9c 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -68,6 +68,21 @@ jobs: secrets: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + build-linux-x64-release-py314: + name: Build Linux x64 Release (Python 3.14) + uses: ./.github/workflows/reusable_linux_build.yml + with: + pool_name: "onnxruntime-github-Ubuntu2204-AMD-CPU" + build_config: Release + architecture: x64 + dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu + docker_image_repo: onnxruntimecpubuildpythonx64 + extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --build_nuget --enable_transformers_tool_test --cmake_extra_defines onnxruntime_BUILD_BENCHMARKS=ON' + python_path_prefix: 'PATH=/opt/python/cp314-cp314/bin:$PATH' # $ needs escaping in single quotes + job_identifier: build-linux-x64-release-py314 + secrets: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + orttraining-linux-ci-pipeline: name: Build Linux x64 Release with training uses: ./.github/workflows/reusable_linux_build.yml @@ -109,7 +124,7 @@ jobs: dockerfile_path: tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile docker_image_repo: onnxruntimecpubuildpythonaarch64 extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cmake_extra_defines onnxruntime_BUILD_BENCHMARKS=ON' - python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' # $ needs escaping in single quotes + python_path_prefix: 'PATH=/opt/python/cp314-cp314/bin:$PATH' # $ needs escaping 
in single quotes job_identifier: build-linux-arm64-release secrets: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 32e126c34ac39..0f8b4a42f48ae 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -16,7 +16,7 @@ concurrency: cancel-in-progress: true env: - python_version: 3.11 + python_version: "3.14" jobs: cpu: @@ -28,6 +28,7 @@ jobs: {"machine": "arm64", "target": "arm64", "build_config": "Debug"}, {"machine": "arm64", "target": "arm64", "build_config": "Release"} ] + python_version: "3.14" coreml: uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml @@ -39,6 +40,7 @@ jobs: {"machine": "arm64", "target": "arm64", "build_config": "Debug"}, {"machine": "arm64", "target": "arm64", "build_config": "Release"} ] + python_version: "3.14" xnnpack: uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml @@ -49,6 +51,7 @@ jobs: [ {"machine": "arm64", "target": "arm64", "build_config": "Debug"} ] + python_version: "3.14" webgpu: uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml @@ -60,6 +63,7 @@ jobs: {"machine": "arm64", "target": "arm64", "build_config": "Debug"}, {"machine": "arm64", "target": "arm64", "build_config": "Release"} ] + python_version: "3.14" iphone_simulator: runs-on: macos-15 diff --git a/.github/workflows/macos-ci-build-and-test-workflow.yml b/.github/workflows/macos-ci-build-and-test-workflow.yml index 75002fdf12c00..76198c7f5c1ce 100644 --- a/.github/workflows/macos-ci-build-and-test-workflow.yml +++ b/.github/workflows/macos-ci-build-and-test-workflow.yml @@ -19,7 +19,7 @@ on: python_version: required: false type: string - default: "3.11" + default: "3.14" matrix_include: required: false type: string diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml index 89ae03981ecef..7b93086fbb77d 100644 --- a/.github/workflows/windows_cuda.yml +++ b/.github/workflows/windows_cuda.yml @@ -32,7 +32,7 @@ jobs: - uses: 
actions/setup-python@v6 with: - python-version: '3.12' + python-version: '3.14' architecture: x64 - name: Locate vcvarsall and Setup Env @@ -173,7 +173,7 @@ jobs: - uses: actions/setup-python@v6 with: - python-version: '3.12' + python-version: '3.14' architecture: x64 - uses: actions/setup-node@v6 diff --git a/VERSION_NUMBER b/VERSION_NUMBER index e4a973f913f4f..ae96cc7310aaa 100644 --- a/VERSION_NUMBER +++ b/VERSION_NUMBER @@ -1 +1 @@ -1.24.2 +1.24.3 diff --git a/cmake/deps.txt b/cmake/deps.txt index 578dd8fd23d09..65c74060e8deb 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -46,7 +46,7 @@ protoc_linux_aarch64;https://github.com/protocolbuffers/protobuf/releases/downlo protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-osx-universal_binary.zip;23710c3d1c2036d8d65a6a22234372fa2d7af9ef psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013 pthreadpool;https://github.com/google/pthreadpool/archive/dcc9f28589066af0dbd4555579281230abbf74dd.zip;533a77943203ef15ca608bcd9dbe2c94da7451d2 -pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f780292da9db273c8ef06ccf5fd4b623624143e9 +pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v3.0.2.zip;a064e663b4d7a337ac291d1bef7337ef4e60a1ae pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/403d652dca4c1046e8145950b1c0997a9f748b57.zip;30b2a07fe4bae8574f89176e56274cacdd6d135b re2;https://github.com/google/re2/archive/refs/tags/2024-07-02.zip;646e1728269cde7fcef990bf4a8e87b047882e88 safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 79280c97a899e..ba14667bc3c88 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -6,7 +6,6 @@ onnxruntime_fetchcontent_declare( URL ${DEP_URL_pybind11} 
URL_HASH SHA1=${DEP_SHA1_pybind11} EXCLUDE_FROM_ALL - FIND_PACKAGE_ARGS 2.13 NAMES pybind11 + FIND_PACKAGE_ARGS 3.0 NAMES pybind11 ) onnxruntime_fetchcontent_makeavailable(pybind11_project) - diff --git a/cmake/vcpkg-ports/pybind11/portfile.cmake b/cmake/vcpkg-ports/pybind11/portfile.cmake index 2c63582d1ee15..4e4cd30a26df1 100644 --- a/cmake/vcpkg-ports/pybind11/portfile.cmake +++ b/cmake/vcpkg-ports/pybind11/portfile.cmake @@ -2,7 +2,8 @@ vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO pybind/pybind11 REF "v${VERSION}" - SHA512 497c25b33b09a9c42f67131ab82e35d689e8ce089dd7639be997305ff9a6d502447b79c824508c455d559e61f0186335b54dd2771d903a7c1621833930622d1a + # SHA512 for the zip (not tar.gz) file. + SHA512 786b1bf534ac67a8d5669f8babf67bb13e48b3a3da1b6344e43ae10a84b80bbc8fea5f12a65fd18739c341fefef5622c5dc096db964dff33cc62ea4259b2e2c1 HEAD_REF master ) diff --git a/cmake/vcpkg-ports/pybind11/vcpkg.json b/cmake/vcpkg-ports/pybind11/vcpkg.json index a730d32017885..058e2235fea08 100644 --- a/cmake/vcpkg-ports/pybind11/vcpkg.json +++ b/cmake/vcpkg-ports/pybind11/vcpkg.json @@ -1,6 +1,6 @@ { "name": "pybind11", - "version": "2.13.6", + "version": "3.0.2", "description": "pybind11 is a lightweight header-only library that exposes C++ types in Python and vice versa, mainly to create Python bindings of existing C++ code", "homepage": "https://github.com/pybind/pybind11", "license": "BSD-3-Clause", diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs index abe73b77f4071..a6b267c6802cf 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs @@ -892,99 +892,115 @@ internal class NativeLib /// On Windows, it explicitly loads the library with a lowercase .dll extension to handle /// case-sensitive filesystems. 
/// +#if NET5_0_OR_GREATER + [System.Diagnostics.CodeAnalysis.UnconditionalSuppressMessage("SingleFile", "IL3000:Avoid accessing Assembly file path when publishing as a single file", Justification = "We also check AppContext.BaseDirectory as a fallback")] +#endif private static IntPtr DllImportResolver(string libraryName, Assembly assembly, DllImportSearchPath? searchPath) { - if (libraryName == NativeLib.DllName || libraryName == OrtExtensionsNativeMethods.ExtensionsDllName) + try { - string mappedName = null; - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) - { - // Explicitly load with .dll extension to avoid issues where the OS might try .DLL - mappedName = libraryName + ".dll"; - } - else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) + if (libraryName == NativeLib.DllName || libraryName == OrtExtensionsNativeMethods.ExtensionsDllName) { - // Explicitly load with .so extension and lib prefix - mappedName = "lib" + libraryName + ".so"; - } - else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) - { - // Explicitly load with .dylib extension and lib prefix - mappedName = "lib" + libraryName + ".dylib"; - } - - if (mappedName != null) - { - // 1. Try default loading (name only) - if (NativeLibrary.TryLoad(mappedName, assembly, searchPath, out IntPtr handle)) + string mappedName = null; + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - return handle; + // Explicitly load with .dll extension to avoid issues where the OS might try .DLL + mappedName = libraryName + ".dll"; } - - // 2. 
Try relative to assembly location (look into runtimes subfolders) - string assemblyLocation = null; - try { assemblyLocation = assembly.Location; } catch { } - if (!string.IsNullOrEmpty(assemblyLocation)) + else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) + { + // Explicitly load with .so extension and lib prefix + mappedName = "lib" + libraryName + ".so"; + } + else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) { - string assemblyDir = System.IO.Path.GetDirectoryName(assemblyLocation); - string rid = RuntimeInformation.RuntimeIdentifier; + // Explicitly load with .dylib extension and lib prefix + mappedName = "lib" + libraryName + ".dylib"; + } - // Probe the specific RID first, then common fallbacks for the current OS - string[] ridsToTry; - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) - { - ridsToTry = new[] { rid, "win-x64", "win-arm64" }; - } - else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) - { - ridsToTry = new[] { rid, "linux-x64", "linux-arm64" }; - } - else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) - { - // We no longer provide osx-x64 in official package since 1.24. - // However, we keep it in the list for build-from-source users. - ridsToTry = new[] { rid, "osx-arm64", "osx-x64" }; - } - else + if (mappedName != null) + { + // 1. Try default loading (name only) + if (NativeLibrary.TryLoad(mappedName, assembly, searchPath, out IntPtr handle)) { - ridsToTry = new[] { rid }; + return handle; } - foreach (var tryRid in ridsToTry) + // 2. 
Try relative to assembly location (look into runtimes subfolders) + string assemblyLocation = null; + try { assemblyLocation = assembly.Location; } catch { } + if (!string.IsNullOrEmpty(assemblyLocation)) { - string probePath = System.IO.Path.Combine(assemblyDir, "runtimes", tryRid, "native", mappedName); - if (System.IO.File.Exists(probePath) && NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle)) + string assemblyDir = System.IO.Path.GetDirectoryName(assemblyLocation); + string rid = RuntimeInformation.RuntimeIdentifier; + + // Probe the specific RID first, then common fallbacks for the current OS + string[] ridsToTry; + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}"); - return handle; + ridsToTry = new[] { rid, "win-x64", "win-arm64" }; + } + else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) + { + ridsToTry = new[] { rid, "linux-x64", "linux-arm64" }; + } + else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) + { + // We no longer provide osx-x64 in official package since 1.24. + // However, we keep it in the list for build-from-source users. + ridsToTry = new[] { rid, "osx-arm64", "osx-x64" }; + } + else + { + ridsToTry = new[] { rid }; } - } - } - // 3. 
Try AppContext.BaseDirectory as a fallback - string baseDir = AppContext.BaseDirectory; - if (!string.IsNullOrEmpty(baseDir)) - { - string probePath = System.IO.Path.Combine(baseDir, mappedName); - if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle)) - { - LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}"); - return handle; + foreach (var tryRid in ridsToTry) + { + string probePath = System.IO.Path.Combine(assemblyDir, "runtimes", tryRid, "native", mappedName); + if (System.IO.File.Exists(probePath) && NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle)) + { + LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}"); + return handle; + } + } } - string rid = RuntimeInformation.RuntimeIdentifier; - probePath = System.IO.Path.Combine(baseDir, "runtimes", rid, "native", mappedName); - if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle)) + // 3. Try AppContext.BaseDirectory as a fallback + try { - LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}"); - return handle; + string baseDir = AppContext.BaseDirectory; + if (!string.IsNullOrEmpty(baseDir)) + { + string probePath = System.IO.Path.Combine(baseDir, mappedName); + if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle)) + { + LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}"); + return handle; + } + + string rid = RuntimeInformation.RuntimeIdentifier; + probePath = System.IO.Path.Combine(baseDir, "runtimes", rid, "native", mappedName); + if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle)) + { + LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}"); + return handle; + } + } } - } + catch { } // Ignore AppDomainUnloadedException or similar from AppContext.BaseDirectory - LogLibLoad($"[DllImportResolver] Failed loading {mappedName} (RID: {RuntimeInformation.RuntimeIdentifier}, Assembly: {assemblyLocation})"); + 
LogLibLoad($"[DllImportResolver] Failed loading {mappedName} (RID: {RuntimeInformation.RuntimeIdentifier}, Assembly: {assemblyLocation})"); + } } } + catch (Exception ex) + { + // Unhandled exceptions inside DllImportResolver can result in TypeInitializationException. + // Log and swallow the error, returning IntPtr.Zero to fall back to default CLR logic. + try { System.Diagnostics.Trace.WriteLine($"[DllImportResolver] Exception during resolution: {ex}"); } catch { } + } // Fall back to default resolution return IntPtr.Zero; diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtEnvTests.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtEnvTests.cs index 94f8e927c1331..aa1b683acd668 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtEnvTests.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtEnvTests.cs @@ -489,4 +489,47 @@ void TestCopyTensors() } } } + + [Collection("Ort Inference Tests")] + public class OrtEnvDllImportResolverTest + { + [Fact(DisplayName = "TestDllImportResolverDoesNotThrow")] + public void TestDllImportResolverDoesNotThrow() + { + // The DllImportResolver is a private static method in NativeMethods. + var nativeMethodsType = typeof(OrtEnv).Assembly.GetType("Microsoft.ML.OnnxRuntime.NativeMethods"); + Assert.NotNull(nativeMethodsType); + + // It might not be defined on all platforms (defined when !NETSTANDARD2_0 && !__ANDROID__ && !__IOS__). + var resolverMethod = nativeMethodsType.GetMethod("DllImportResolver", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static); + + if (resolverMethod != null) + { + try + { + // Invoke with null assembly to force it into edge cases where assembly.Location would throw NullReferenceException. + // It should catch the exception and return IntPtr.Zero gracefully rather than throwing. 
+ var result = resolverMethod.Invoke(null, new object[] { "onnxruntime", null, null }); + + // If it reaches here without throwing TargetInvocationException, the try-catch in DllImportResolver works. + Assert.True(result is IntPtr); + } + catch (System.Reflection.TargetInvocationException ex) + { + // If NativeMethods..cctor() threw because the native library is missing, + // we will get a TypeInitializationException wrapping a DllNotFoundException (or DllImportException). + // This is acceptable locally. What we want to avoid is NullReferenceException from DllImportResolver. + if (ex.InnerException is TypeInitializationException typeInitEx) + { + Assert.IsNotType(typeInitEx.InnerException); + } + else + { + Assert.IsNotType(ex.InnerException); + throw; + } + } + } + } + } } diff --git a/docs/python/README.rst b/docs/python/README.rst index 6124ffe8d27b4..06f5b0ebf3094 100644 --- a/docs/python/README.rst +++ b/docs/python/README.rst @@ -8,6 +8,11 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime fileName; } else if (wasmPathOverride || wasmPrefixOverride) { // A callback function to locate the WebAssembly file. The function should return the full path of the file. // diff --git a/js/web/lib/wasm/wasm-utils-import.ts b/js/web/lib/wasm/wasm-utils-import.ts index e2e46bb37dcfc..6c899d1ae9cf5 100644 --- a/js/web/lib/wasm/wasm-utils-import.ts +++ b/js/web/lib/wasm/wasm-utils-import.ts @@ -272,7 +272,9 @@ export const importWasmModule = async ( } } else { // if the script source is available, we can check if it is from the same origin. - useEmbeddedModule = isSameOrigin(scriptSrc); + // Also use the embedded module when wasmBinary is provided and single-threaded (eg. Blob URL workers + // where isSameOrigin fails but no file resolution or worker spawning is needed). 
+ useEmbeddedModule = isSameOrigin(scriptSrc) || (isWasmOverridden && !isMultiThreaded); } } if (useEmbeddedModule) { diff --git a/js/web/package-lock.json b/js/web/package-lock.json index 5782c9b2bfcc3..01a6e497a9a75 100644 --- a/js/web/package-lock.json +++ b/js/web/package-lock.json @@ -1,12 +1,12 @@ { "name": "onnxruntime-web", - "version": "1.24.2", + "version": "1.24.3", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "onnxruntime-web", - "version": "1.24.2", + "version": "1.24.3", "license": "MIT", "dependencies": { "flatbuffers": "^25.1.24", @@ -50,7 +50,7 @@ }, "../common": { "name": "onnxruntime-common", - "version": "1.24.2", + "version": "1.24.3", "license": "MIT", "devDependencies": { "globby": "^15.0.0", diff --git a/js/web/package.json b/js/web/package.json index f0a2dcb410a2a..23d89ca57199f 100644 --- a/js/web/package.json +++ b/js/web/package.json @@ -7,7 +7,7 @@ "type": "git" }, "author": "fs-eire", - "version": "1.24.2", + "version": "1.24.3", "jsdelivr": "dist/ort.min.js", "dependencies": { "flatbuffers": "^25.1.24", diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py index 98096104ed304..ace3e4dd4bf46 100644 --- a/onnxruntime/__init__.py +++ b/onnxruntime/__init__.py @@ -10,7 +10,7 @@ import contextlib -__version__ = "1.24.2" +__version__ = "1.24.3" __author__ = "Microsoft" # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package). 
diff --git a/onnxruntime/contrib_ops/cpu/moe/moe_helper.h b/onnxruntime/contrib_ops/cpu/moe/moe_helper.h index 257c5a189b3bd..bd30418030dc2 100644 --- a/onnxruntime/contrib_ops/cpu/moe/moe_helper.h +++ b/onnxruntime/contrib_ops/cpu/moe/moe_helper.h @@ -35,44 +35,86 @@ struct MoEParameters { }; namespace moe_helper { +// Helper to check shape dimensions +#define ASSERT_SHAPE_DIMENSION(shape_ptr, dim, name) \ + if (shape_ptr != nullptr) { \ + if (shape_ptr->NumDimensions() != dim) { \ + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input '", name, \ + "' is expected to have ", dim, " dimensions, got ", \ + shape_ptr->NumDimensions()); \ + } \ + } + +#define ASSERT_SHAPE_3D(shape_ptr, name) ASSERT_SHAPE_DIMENSION(shape_ptr, 3, name) + +#define CHECK_SHAPE(shape_ptr, name, ...) \ + if (shape_ptr != nullptr) { \ + const TensorShape& expected_shape = make_shape(__VA_ARGS__); \ + if (*shape_ptr != expected_shape) { \ + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input '", name, \ + "' is expected to have shape ", expected_shape, \ + ", got ", *shape_ptr); \ + } \ + } + template Status CheckInputs(MoEParameters& parameters, - const Tensor* input, // required - const Tensor* router_probs, // required - const Tensor* fc1_experts_weights, // required - const Tensor* fc1_experts_bias, // optional - const Tensor* fc1_experts_scales, // required for qMoE; NULL for MOE - const Tensor* fc1_zero_points, // optional, for qMoE - const Tensor* fc2_experts_weights, // required - const Tensor* fc2_experts_bias, // optional - const Tensor* fc2_experts_scales, // required for qMoE; NULL for MOE - const Tensor* fc2_zero_points, // optional, for qMoE - const Tensor* fc3_experts_weights, // optional - const Tensor* fc3_experts_bias, // optional - const Tensor* fc3_experts_scales, // required for qMoE; NULL for MOE - const Tensor* fc3_zero_points, // optional, for qMoE - const int64_t pack_size, // number of weights packed together (like 2 for uint4 packed to uint8) + 
const Tensor* input, // required + const Tensor* router_probs, // required + const TensorShape* fc1_experts_weights_shape, // required + const Tensor* fc1_experts_bias, // optional + const Tensor* fc1_experts_scales, // required for qMoE; NULL for MOE + const Tensor* fc1_zero_points, // optional, for qMoE + const TensorShape* fc2_experts_weights_shape, // required + const Tensor* fc2_experts_bias, // optional + const Tensor* fc2_experts_scales, // required for qMoE; NULL for MOE + const Tensor* fc2_zero_points, // optional, for qMoE + const TensorShape* fc3_experts_weights_shape, // optional + const Tensor* fc3_experts_bias, // optional + const Tensor* fc3_experts_scales, // required for qMoE; NULL for MOE + const Tensor* fc3_zero_points, // optional, for qMoE + const int64_t pack_size, // number of weights packed together (like 2 for uint4 packed to uint8) const bool is_fused_swiglu, const int64_t block_size = 0) { // block size for block-wise quantization - // Check dimensions of input to avoid input_dims index out of range. CHECK_TENSOR_SHAPE will verify each tensor later. 
+ // Required inputs + if (input == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'input' is required."); + } ASSERT_TENSOR_2D_OR_3D(input); - ASSERT_TENSOR_3D(fc1_experts_weights); - ASSERT_TENSOR_3D(fc2_experts_weights); + + if (router_probs == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'router_probs' is required."); + } ASSERT_TENSOR_2D(router_probs); + if (fc1_experts_weights_shape == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'fc1_experts_weights' is required."); + } + ASSERT_SHAPE_3D(fc1_experts_weights_shape, "fc1_experts_weights"); + + if (fc2_experts_weights_shape == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'fc2_experts_weights' is required."); + } + ASSERT_SHAPE_3D(fc2_experts_weights_shape, "fc2_experts_weights"); + const auto& input_dims = input->Shape().GetDims(); const auto& router_probs_dims = router_probs->Shape().GetDims(); - const auto& fc1_experts_weights_dims = fc1_experts_weights->Shape().GetDims(); - const auto& fc2_experts_weights_dims = fc2_experts_weights->Shape().GetDims(); int64_t num_rows = input_dims.size() == 2 ? 
input_dims[0] : input_dims[0] * input_dims[1]; int64_t hidden_size = input_dims[input_dims.size() - 1]; - int64_t local_num_experts = fc1_experts_weights_dims[0]; int64_t num_experts = router_probs_dims[1]; - int64_t inter_size = (fc2_experts_weights_dims[1] * fc2_experts_weights_dims[2] * pack_size) / hidden_size; - const bool legacy_shape = (hidden_size != inter_size && fc2_experts_weights_dims[1] == inter_size) || - (hidden_size == inter_size && is_fused_swiglu && fc1_experts_weights_dims[1] == hidden_size); + int64_t local_num_experts = fc1_experts_weights_shape->GetDims()[0]; + + int64_t inter_size = (fc2_experts_weights_shape->GetDims()[1] * + fc2_experts_weights_shape->GetDims()[2] * pack_size) / + hidden_size; + + bool legacy_shape = false; + const auto& fc2_experts_weights_dims = fc2_experts_weights_shape->GetDims(); + const auto& fc1_experts_weights_dims = fc1_experts_weights_shape->GetDims(); + legacy_shape = (hidden_size != inter_size && fc2_experts_weights_dims[1] == inter_size) || + (hidden_size == inter_size && is_fused_swiglu && fc1_experts_weights_dims[1] == hidden_size); // Fused swiglu doubles the output dimension of FC1 since it fused two GEMMs into one. const int64_t fc1_inter_size = is_fused_swiglu ? (inter_size + inter_size) : inter_size; @@ -80,13 +122,13 @@ Status CheckInputs(MoEParameters& parameters, if (legacy_shape) { // legacy shape does not match column major memory layout. This is for backward compatibility. 
- CHECK_TENSOR_SHAPE(fc1_experts_weights, num_experts, hidden_size, fc1_inter_size / pack_size); - CHECK_TENSOR_SHAPE(fc2_experts_weights, num_experts, inter_size, hidden_size / pack_size); - CHECK_TENSOR_SHAPE(fc3_experts_weights, num_experts, hidden_size, inter_size / pack_size); + CHECK_SHAPE(fc1_experts_weights_shape, "fc1_experts_weights", num_experts, hidden_size, fc1_inter_size / pack_size); + CHECK_SHAPE(fc2_experts_weights_shape, "fc2_experts_weights", num_experts, inter_size, hidden_size / pack_size); + CHECK_SHAPE(fc3_experts_weights_shape, "fc3_experts_weights", num_experts, hidden_size, inter_size / pack_size); } else { - CHECK_TENSOR_SHAPE(fc1_experts_weights, num_experts, fc1_inter_size, hidden_size / pack_size); - CHECK_TENSOR_SHAPE(fc2_experts_weights, num_experts, hidden_size, inter_size / pack_size); - CHECK_TENSOR_SHAPE(fc3_experts_weights, num_experts, inter_size, hidden_size / pack_size); + CHECK_SHAPE(fc1_experts_weights_shape, "fc1_experts_weights", num_experts, fc1_inter_size, hidden_size / pack_size); + CHECK_SHAPE(fc2_experts_weights_shape, "fc2_experts_weights", num_experts, hidden_size, inter_size / pack_size); + CHECK_SHAPE(fc3_experts_weights_shape, "fc3_experts_weights", num_experts, inter_size, hidden_size / pack_size); } CHECK_TENSOR_SHAPE(router_probs, num_rows, num_experts); @@ -168,9 +210,11 @@ Status CheckInputs(MoEParameters& parameters, } } - if (fc3_experts_weights == nullptr) { + if (fc3_experts_weights_shape == nullptr) { + // If fc3 weights are not provided, ensure no other fc3 parameters are provided ORT_ENFORCE(fc3_experts_bias == nullptr && fc3_experts_scales == nullptr && fc3_zero_points == nullptr); } else { + // If fc3 weights are provided, ensure scales logic is consistent ORT_ENFORCE(fc1_experts_scales == nullptr || fc3_experts_scales != nullptr); // MOE no scale, or qMOE need scales } @@ -200,6 +244,36 @@ Status CheckInputs(MoEParameters& parameters, return Status::OK(); } +template +Status 
CheckInputs(MoEParameters& parameters, + const Tensor* input, // required + const Tensor* router_probs, // required + const Tensor* fc1_experts_weights, // required + const Tensor* fc1_experts_bias, // optional + const Tensor* fc1_experts_scales, // required for qMoE; NULL for MOE + const Tensor* fc1_zero_points, // optional, for qMoE + const Tensor* fc2_experts_weights, // required + const Tensor* fc2_experts_bias, // optional + const Tensor* fc2_experts_scales, // required for qMoE; NULL for MOE + const Tensor* fc2_zero_points, // optional, for qMoE + const Tensor* fc3_experts_weights, // optional + const Tensor* fc3_experts_bias, // optional + const Tensor* fc3_experts_scales, // required for qMoE; NULL for MOE + const Tensor* fc3_zero_points, // optional, for qMoE + const int64_t pack_size, // number of weights packed together (like 2 for uint4 packed to uint8) + const bool is_fused_swiglu, + const int64_t block_size = 0) { // block size for block-wise quantization + + const TensorShape* fc1_shape = (fc1_experts_weights != nullptr) ? &fc1_experts_weights->Shape() : nullptr; + const TensorShape* fc2_shape = (fc2_experts_weights != nullptr) ? &fc2_experts_weights->Shape() : nullptr; + const TensorShape* fc3_shape = (fc3_experts_weights != nullptr) ? 
&fc3_experts_weights->Shape() : nullptr; + + return CheckInputs(parameters, input, router_probs, fc1_shape, fc1_experts_bias, fc1_experts_scales, fc1_zero_points, + fc2_shape, fc2_experts_bias, fc2_experts_scales, fc2_zero_points, + fc3_shape, fc3_experts_bias, fc3_experts_scales, fc3_zero_points, + pack_size, is_fused_swiglu, block_size); +} + } // namespace moe_helper } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc index 6d1d191689466..81d2b0f8efdc6 100644 --- a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc +++ b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc @@ -13,6 +13,7 @@ #include "core/common/narrow.h" #include "core/framework/tensor_type_and_shape.h" #include "core/util/math.h" +#include "core/platform/env_var_utils.h" #include "contrib_ops/cpu/moe/moe_utils.h" #include "contrib_ops/cpu/moe/moe_helper.h" @@ -69,13 +70,13 @@ bool CanUseMlasQ4Gemm(int64_t expert_weight_bits, int64_t block_size, out_qtype = BlkQ4Sym64; } else if (block_size == 128) { out_qtype = BlkQ4Sym128; - } else if (block_size == 0) { + } else if (block_size == 0 || block_size == 32) { out_qtype = BlkQ4Sym; } else { return false; } - size_t expected_size = MlasQ4GemmPackBSize(out_qtype, static_cast(cols), static_cast(rows)); + size_t expected_size = MlasQ4GemmPackBSize(out_qtype, static_cast(rows), static_cast(cols)); return expected_size > 0; } @@ -84,6 +85,8 @@ bool CanUseMlasQ4Gemm(int64_t expert_weight_bits, int64_t block_size, namespace onnxruntime { namespace contrib { +constexpr const char* kUseMlasQ4GemmMoe = "ORT_USE_MLAS_Q4_GEMM_MOE"; + template void DequantizeBlockWithMlas(const uint8_t* quantized_data, const TScale* scales, @@ -118,13 +121,23 @@ Status ConvertToMlasQ4Format(const uint8_t* quantized_data, DequantizeBlockWithMlas(quantized_data, scales, zero_points, block_size, num_bits, rows, cols, temp_float, nullptr); - size_t 
packed_size = MlasQ4GemmPackBSize(qtype, static_cast(cols), static_cast(rows)); + // Transpose from N x K (weights) to K x N. + // DirectQ4Gemm expects weights to be packed in a specific layout ([K, N] logically) + auto transposed_float_buffer = IAllocator::MakeUniquePtr(allocator, static_cast(rows * cols)); + float* transposed_float = transposed_float_buffer.get(); + for (int64_t r = 0; r < rows; ++r) { + for (int64_t c = 0; c < cols; ++c) { + transposed_float[c * rows + r] = temp_float[r * cols + c]; + } + } + + size_t packed_size = MlasQ4GemmPackBSize(qtype, static_cast(rows), static_cast(cols)); if (packed_size == 0) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "MLAS Q4 packing not supported for this configuration"); } mlas_packed_buffer = IAllocator::MakeUniquePtr(allocator, packed_size); - MlasQ4GemmPackB(qtype, mlas_packed_buffer.get(), temp_float, static_cast(cols), static_cast(rows), static_cast(cols)); + MlasQ4GemmPackB(qtype, mlas_packed_buffer.get(), transposed_float, static_cast(rows), static_cast(cols), static_cast(rows)); return Status::OK(); } @@ -354,6 +367,257 @@ void DequantizeBlock(const uint8_t* quantized_data, DequantizeBlockWithMlas(quantized_data, scales, zero_points, block_size, num_bits, rows, cols, dequantized_data, thread_pool); } +template +void DequantizePrePacked(const uint8_t* prepacked_data, + const TScale* scales, + const uint8_t* zero_points, + int64_t block_size, + int64_t rows, + int64_t cols, + float* dequantized_data, + const gsl::span& scale_dims) { + // prepacked_data is [cols, rows] (transposed, unpacked) + // dequantized_data is [cols, rows] (transposed) + // scales, zero_points correspond to original [rows, cols] layout + + const float default_zp_4bit = 8.0f; + const int64_t blocks_per_row = (block_size > 0) ? 
((cols + block_size - 1) / block_size) : 1; + const int64_t zp_pack_size = 2; // Always 2 for 4-bit + + // Iterate over Columns (K) then Rows (N) because prepacked_data is [K, N] + for (int64_t c = 0; c < cols; ++c) { + for (int64_t r = 0; r < rows; ++r) { + uint8_t val = prepacked_data[c * rows + r]; + + int64_t block_idx = (block_size > 0) ? (c / block_size) : 0; + if (block_size > 0) block_idx = std::min(block_idx, blocks_per_row - 1); + + int64_t scale_idx; + if (scale_dims.size() == 3 && scale_dims[2] > 1) { // block-wise + scale_idx = r * blocks_per_row + block_idx; + } else { // per-channel + scale_idx = r; + } + + float scale = static_cast(scales[scale_idx]); + float zp = default_zp_4bit; + + if (zero_points != nullptr) { + int64_t zp_idx; + bool is_lower_nibble; + + if (scale_dims.size() == 3 && scale_dims[2] > 1) { // block-wise + int64_t zp_blocks_packed = (blocks_per_row + zp_pack_size - 1) / zp_pack_size; + zp_idx = r * zp_blocks_packed + block_idx / 2; + is_lower_nibble = (block_idx % 2 == 0); + } else { + zp_idx = r / 2; + is_lower_nibble = (r % 2 == 0); + } + + uint8_t packed_zp = zero_points[zp_idx]; + zp = is_lower_nibble ? static_cast(packed_zp & 0x0F) : static_cast(packed_zp >> 4); + } + + dequantized_data[c * rows + r] = scale * (static_cast(val) - zp); + } + } +} + +template +Status BuildDirectQ4PackedBCache(const uint8_t* prepacked_weights, + const TScale* scales_data, + int64_t num_experts, + int64_t rows, + int64_t cols, + int64_t block_size, + const gsl::span& scales_dims, + MLAS_BLK_QUANT_TYPE qtype, + AllocatorPtr allocator, + IAllocatorUniquePtr& packed_b) { + const size_t packed_size = MlasQ4GemmPackBSize(qtype, static_cast(rows), static_cast(cols)); + if (packed_size == 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Failed to compute MLAS Q4 packed size for cache"); + } + + const bool is_block_wise = (scales_dims.size() == 3 && scales_dims[2] > 1); + const int64_t scales_expert_stride = is_block_wise ? 
(rows * scales_dims[2]) : rows; + const size_t prepacked_expert_stride = static_cast(rows * cols); + const size_t total_packed_size = packed_size * static_cast(num_experts); + + packed_b = IAllocator::MakeUniquePtr(allocator, total_packed_size, true); + uint8_t* packed_b_ptr = static_cast(packed_b.get()); + + std::vector dequantized_transposed(static_cast(rows * cols)); + for (int64_t expert_idx = 0; expert_idx < num_experts; ++expert_idx) { + const uint8_t* expert_prepacked = prepacked_weights + static_cast(expert_idx) * prepacked_expert_stride; + const TScale* expert_scales = scales_data + expert_idx * scales_expert_stride; + + DequantizePrePacked(expert_prepacked, expert_scales, nullptr, block_size, rows, cols, + dequantized_transposed.data(), scales_dims); + + MlasQ4GemmPackB(qtype, packed_b_ptr + expert_idx * packed_size, dequantized_transposed.data(), + static_cast(rows), static_cast(cols), static_cast(rows)); + } + + return Status::OK(); +} + +template +Status QMoECPU::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, + /*out*/ bool& is_packed, + /*out*/ PrePackedWeights* prepacked_weights) { + is_packed = false; + + // If scales are prepacked, they are constant initializers. 
+ if (input_idx == 3) { + return Status::OK(); + } + if (input_idx == 6) { + return Status::OK(); + } + + // Only support PrePack for FC1 (2) and FC2 (5) weights + // and only if expert_weight_bits_ == 4 (since we unpack to uint8) + if (expert_weight_bits_ != 4) { + return Status::OK(); + } + + if (input_idx == 2 || input_idx == 5) { + const auto& shape = tensor.Shape(); + const int64_t num_experts = shape[0]; + const int64_t rows = shape[1]; + const int64_t cols_packed = shape[2]; + const int64_t cols = cols_packed * 2; + + size_t packed_size = static_cast(num_experts * rows * cols); + auto packed_buffer = IAllocator::MakeUniquePtr(alloc, packed_size, true); + uint8_t* dst_base = static_cast(packed_buffer.get()); + const uint8_t* src_base = static_cast(tensor.DataRaw()); + + for (int64_t i = 0; i < num_experts; ++i) { + const uint8_t* src = src_base + i * rows * cols_packed; + uint8_t* dst = dst_base + i * rows * cols; + + for (int64_t r = 0; r < rows; ++r) { + for (int64_t c = 0; c < cols; ++c) { + uint8_t packed_val = src[r * cols_packed + (c / 2)]; + uint8_t val = (c % 2 == 0) ? 
(packed_val & 0x0F) : (packed_val >> 4); + + dst[c * rows + r] = val; + } + } + } + + if (input_idx == 2) { + fc1_shape_ = shape; + } else if (input_idx == 5) { + fc2_shape_ = shape; + } + + if (prepacked_weights) { + prepacked_weights->buffers_.push_back(std::move(packed_buffer)); + prepacked_weights->buffer_sizes_.push_back(packed_size); + is_packed = true; + + // Pack Shape (Buffer 1) + auto dims = shape.GetDims(); + size_t rank_bytes = sizeof(int64_t); + size_t dims_bytes = dims.size() * sizeof(int64_t); + size_t shape_size = rank_bytes + dims_bytes; + + auto shape_buffer = IAllocator::MakeUniquePtr(alloc, shape_size); + int64_t* buffer_data = static_cast(shape_buffer.get()); + *buffer_data = static_cast(dims.size()); + memcpy(buffer_data + 1, dims.data(), dims_bytes); + + prepacked_weights->buffers_.push_back(std::move(shape_buffer)); + prepacked_weights->buffer_sizes_.push_back(shape_size); + + // Try build MLAS Q4 cache if scales are available + if (use_mlas_q4_gemm_) { + const Tensor* scales_tensor = nullptr; + MLAS_BLK_QUANT_TYPE qtype = BlkQ4Sym; + int scales_idx = -1; + int zp_idx = -1; + + if (input_idx == 2) { // FC1 + scales_idx = 3; + zp_idx = 11; + } else if (input_idx == 5) { // FC2 + scales_idx = 6; + zp_idx = 12; + } + + if (scales_idx != -1 && + (zp_idx >= static_cast(Info().node().InputDefs().size()) || !Info().node().InputDefs()[zp_idx]->Exists()) && + Info().TryGetConstantInput(scales_idx, &scales_tensor) && + scales_tensor != nullptr && + CanUseMlasQ4Gemm(expert_weight_bits_, block_size_, rows, cols, qtype)) { + IAllocatorUniquePtr cache_buffer; + const auto& scales_dims = scales_tensor->Shape().GetDims(); + const T* scales_data = scales_tensor->Data(); + // Use the simple packed buffer we just created (buffer 0) as input + const uint8_t* simple_packed = dst_base; + + if (BuildDirectQ4PackedBCache(simple_packed, scales_data, num_experts, rows, cols, + block_size_, scales_dims, qtype, + alloc, cache_buffer) + .IsOK()) { + // Store the MLAS Q4 
cache as buffer 2 (after unpacked weights and shape). + size_t cache_size = MlasQ4GemmPackBSize(qtype, static_cast(rows), static_cast(cols)) * static_cast(num_experts); + prepacked_weights->buffers_.push_back(std::move(cache_buffer)); + prepacked_weights->buffer_sizes_.push_back(cache_size); + } + } + } + } + } + + return Status::OK(); +} + +template +Status QMoECPU::UseSharedPrePackedBuffers_V2(std::vector& prepacked_buffers, + gsl::span /*prepacked_buffer_sizes*/, + int input_idx, + /*out*/ bool& used_shared_buffers) { + used_shared_buffers = false; + + if (expert_weight_bits_ != 4) { + return Status::OK(); + } + + if ((input_idx == 2 || input_idx == 5) && !prepacked_buffers.empty()) { + auto parse_shape = [&](TensorShape& shape) { + if (prepacked_buffers.size() > 1) { + int64_t* buffer_data = static_cast(prepacked_buffers[1].get()); + int64_t rank = buffer_data[0]; + std::vector dims(static_cast(rank)); + memcpy(dims.data(), buffer_data + 1, static_cast(rank) * sizeof(int64_t)); + shape = TensorShape(dims); + } + }; + + if (input_idx == 2) { + packed_fc1_ = std::move(prepacked_buffers[0]); + parse_shape(fc1_shape_); + if (prepacked_buffers.size() > 2) { + packed_fc1_mlas_cache_ = std::move(prepacked_buffers[2]); + } + } else if (input_idx == 5) { + packed_fc2_ = std::move(prepacked_buffers[0]); + parse_shape(fc2_shape_); + if (prepacked_buffers.size() > 2) { + packed_fc2_mlas_cache_ = std::move(prepacked_buffers[2]); + } + } + used_shared_buffers = true; + } + + return Status::OK(); +} + template QMoECPU::QMoECPU(const OpKernelInfo& op_kernel_info) : OpKernel(op_kernel_info), @@ -362,21 +626,32 @@ QMoECPU::QMoECPU(const OpKernelInfo& op_kernel_info) ORT_ENFORCE(expert_weight_bits_ == 4 || expert_weight_bits_ == 8, "Attribute 'expert_weight_bits' must be 4 or 8."); block_size_ = op_kernel_info.GetAttrOrDefault("block_size", 0); + ORT_ENFORCE(block_size_ >= 0); if (block_size_ > 0) { ORT_ENFORCE(block_size_ >= 16, "block_size must be >= 16 when provided."); 
ORT_ENFORCE((block_size_ & (block_size_ - 1)) == 0, "block_size must be a power of 2."); } + + const auto use_mlas_q4_gemm = ParseEnvironmentVariable(kUseMlasQ4GemmMoe); + if (use_mlas_q4_gemm.has_value()) { + use_mlas_q4_gemm_ = *use_mlas_q4_gemm; + use_mlas_q4_gemm_overridden_ = true; + } else { + // Default policy: enable fast path unless this run hits a known accuracy-loss configuration. + use_mlas_q4_gemm_ = true; + use_mlas_q4_gemm_overridden_ = false; + } } template Status QMoECPU::Compute(OpKernelContext* context) const { const auto* input = context->Input(0); const auto* router_probs = context->Input(1); - const auto* fc1_experts_weights = context->Input(2); + const auto* fc1_experts_weights = packed_fc1_ ? nullptr : context->Input(2); const auto* fc1_scales = context->Input(3); const auto* fc1_experts_bias = context->Input(4); - const auto* fc2_experts_weights = context->Input(5); + const auto* fc2_experts_weights = packed_fc2_ ? nullptr : context->Input(5); const auto* fc2_scales = context->Input(6); const auto* fc2_experts_bias = context->Input(7); const auto* fc3_experts_weights = context->Input(8); @@ -386,17 +661,21 @@ Status QMoECPU::Compute(OpKernelContext* context) const { const auto* fc2_zero_points = context->Input(12); const auto* fc3_zero_points = context->Input(13); + const TensorShape* fc1_shape_ptr = packed_fc1_ ? &fc1_shape_ : (fc1_experts_weights ? &fc1_experts_weights->Shape() : nullptr); + const TensorShape* fc2_shape_ptr = packed_fc2_ ? &fc2_shape_ : (fc2_experts_weights ? &fc2_experts_weights->Shape() : nullptr); + const TensorShape* fc3_shape_ptr = fc3_experts_weights ? 
&fc3_experts_weights->Shape() : nullptr; + MoEParameters moe_params; ORT_RETURN_IF_ERROR(moe_helper::CheckInputs( moe_params, input, router_probs, - fc1_experts_weights, fc1_experts_bias, fc1_scales, fc1_zero_points, - fc2_experts_weights, fc2_experts_bias, fc2_scales, fc2_zero_points, - fc3_experts_weights, fc3_experts_bias, fc3_scales, fc3_zero_points, + fc1_shape_ptr, fc1_experts_bias, fc1_scales, fc1_zero_points, + fc2_shape_ptr, fc2_experts_bias, fc2_scales, fc2_zero_points, + fc3_shape_ptr, fc3_experts_bias, fc3_scales, fc3_zero_points, expert_weight_bits_ == 4 ? 2 : 1, - true, + activation_type_ == ActivationType::SwiGLU, block_size_)); - if (fc3_experts_weights || fc3_experts_bias || fc3_scales || fc3_zero_points) { + if (fc3_shape_ptr || fc3_experts_bias || fc3_scales || fc3_zero_points) { return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "FC3 gating is not yet implemented on CPU for QMoE"); } @@ -559,8 +838,8 @@ Status QMoECPU::Compute(OpKernelContext* context) const { const bool is_fc1_block_wise = (fc1_scales_dims.size() == 3 && fc1_scales_dims[2] > 1); const bool is_fc2_block_wise = (fc2_scales_dims.size() == 3 && fc2_scales_dims[2] > 1); - const uint8_t* fc1_weights_data = fc1_experts_weights->Data(); - const uint8_t* fc2_weights_data = fc2_experts_weights->Data(); + const uint8_t* fc1_weights_data = (packed_fc1_ != nullptr) ? nullptr : fc1_experts_weights->template Data(); + const uint8_t* fc2_weights_data = (packed_fc2_ != nullptr) ? nullptr : fc2_experts_weights->template Data(); const T* fc1_scales_data = fc1_scales->Data(); const T* fc2_scales_data = fc2_scales->Data(); const T* fc1_bias_data = fc1_experts_bias ? fc1_experts_bias->Data() : nullptr; @@ -568,6 +847,13 @@ Status QMoECPU::Compute(OpKernelContext* context) const { const uint8_t* fc1_zp_data = fc1_zero_points ? fc1_zero_points->Data() : nullptr; const uint8_t* fc2_zp_data = fc2_zero_points ? 
fc2_zero_points->Data() : nullptr; + // Known loss-prone case from parity testing: 4-bit symmetric path (row-wise and block-wise). + const bool known_accuracy_loss_case = (expert_weight_bits_ == 4) && + (fc1_zp_data == nullptr) && (fc2_zp_data == nullptr); + const bool use_mlas_q4_gemm_effective = use_mlas_q4_gemm_overridden_ + ? use_mlas_q4_gemm_ + : (use_mlas_q4_gemm_ && !known_accuracy_loss_case); + const int64_t pack_unit = (8 / expert_weight_bits_); const int64_t fc1_packed_cols = (hidden_size + pack_unit - 1) / pack_unit; const int64_t fc2_packed_cols = (inter_size + pack_unit - 1) / pack_unit; @@ -595,6 +881,22 @@ Status QMoECPU::Compute(OpKernelContext* context) const { fc2_zp_expert_stride = (hidden_size + zp_pack_size - 1) / zp_pack_size; } + MLAS_BLK_QUANT_TYPE fc1_direct_qtype = BlkQ4Sym; + MLAS_BLK_QUANT_TYPE fc2_direct_qtype = BlkQ4Sym; + + // Use pre-packed MLAS cache if available + const void* fc1_direct_q4_cache_ptr = nullptr; + if (use_mlas_q4_gemm_effective && packed_fc1_mlas_cache_ && fc1_zp_data == nullptr && + CanUseMlasQ4Gemm(expert_weight_bits_, is_fc1_block_wise ? block_size_ : 0, fc1_out_features, hidden_size, fc1_direct_qtype)) { + fc1_direct_q4_cache_ptr = packed_fc1_mlas_cache_.get(); + } + + const void* fc2_direct_q4_cache_ptr = nullptr; + if (use_mlas_q4_gemm_effective && packed_fc2_mlas_cache_ && fc2_zp_data == nullptr && + CanUseMlasQ4Gemm(expert_weight_bits_, is_fc2_block_wise ? 
block_size_ : 0, hidden_size, inter_size, fc2_direct_qtype)) { + fc2_direct_q4_cache_ptr = packed_fc2_mlas_cache_.get(); + } + std::vector> expert_workload; size_t total_work = 0; @@ -634,6 +936,7 @@ Status QMoECPU::Compute(OpKernelContext* context) const { float* thread_bias2_buffer = thread_bias1_buffer + static_cast(fc1_out_features); for (int64_t expert_idx : expert_batch) { + bool fc2_bias_added_by_mlas = false; const auto& routes = expert_token_map[static_cast(expert_idx)]; if (routes.empty()) { continue; @@ -707,12 +1010,57 @@ Status QMoECPU::Compute(OpKernelContext* context) const { const size_t k = static_cast(hidden_size); MLAS_BLK_QUANT_TYPE q_type = BlkQ4Sym; // Initialize to default - // Direct Q4 GEMM only supports symmetric quantization, so we disable it if zero_points are provided. - bool use_direct_q4_gemm = (fc1_zp_data == nullptr) && - CanUseMlasQ4Gemm(expert_weight_bits_, is_fc1_block_wise ? block_size_ : 0, - fc1_out_features, hidden_size, q_type); - bool fc1_used_direct_q4 = false; - bool fc1_bias_handled_by_q4_gemm = false; + bool use_direct_q4_gemm = use_mlas_q4_gemm_effective && + ((fc1_direct_q4_cache_ptr != nullptr) || + ((packed_fc1_ == nullptr) && (fc1_zp_data == nullptr) && + CanUseMlasQ4Gemm(expert_weight_bits_, is_fc1_block_wise ? block_size_ : 0, + fc1_out_features, hidden_size, q_type))); + + if (packed_fc1_ != nullptr) { + if (use_mlas_q4_gemm_effective && fc1_zp_data == nullptr && + CanUseMlasQ4Gemm(expert_weight_bits_, is_fc1_block_wise ? 
block_size_ : 0, + fc1_out_features, hidden_size, q_type)) { + if (fc1_direct_q4_cache_ptr != nullptr) { + float* fc1_bias_float = nullptr; + if (has_fc1_bias) { + const T* B1_bias = fc1_bias_data + expert_idx * fc1_out_features; + if constexpr (std::is_same_v) { + MlasConvertHalfToFloatBuffer(reinterpret_cast(B1_bias), thread_bias1_buffer, static_cast(fc1_out_features)); + } else { + std::memcpy(thread_bias1_buffer, B1_bias, static_cast(fc1_out_features) * sizeof(float)); + } + fc1_bias_float = thread_bias1_buffer; + } + + size_t packed_size = MlasQ4GemmPackBSize(q_type, static_cast(fc1_out_features), static_cast(hidden_size)); + const uint8_t* packed_b = static_cast(fc1_direct_q4_cache_ptr) + expert_idx * packed_size; + + Status gemm_status = DirectQ4Gemm(A1, packed_b, fc1_bias_float, C1, + num_expert_tokens, fc1_out_features, hidden_size, fc1_direct_qtype, tp); + if (gemm_status.IsOK()) { + goto fc1_gemm_done; + } + } + } + + // Fallback: Dequantize from PrePacked (transposed, unpacked) -> MlasGemm + const uint8_t* current_packed_ptr = static_cast(packed_fc1_.get()) + expert_idx * fc1_out_features * hidden_size; + + DequantizePrePacked(current_packed_ptr, fc1_scales_ptr, fc1_zp_ptr, + is_fc1_block_wise ? 
block_size_ : 0, + fc1_out_features, hidden_size, + B1_dequant, fc1_scales_dims); + + // Use MlasGemm with B1_dequant (which is already float transposed) + MlasGemm(CblasNoTrans, CblasNoTrans, + m, n, k, + 1.0f, A1, k, + B1_dequant, n, + 0.0f, C1, n, + tp); + + goto fc1_bias_handling; + } if (use_direct_q4_gemm) { IAllocatorUniquePtr mlas_packed_fc1; @@ -730,12 +1078,10 @@ Status QMoECPU::Compute(OpKernelContext* context) const { if (convert_status.IsOK()) { float* fc1_bias_float = nullptr; - IAllocatorUniquePtr fc1_bias_buffer; if (has_fc1_bias) { const T* B1_bias = fc1_bias_data + expert_idx * fc1_out_features; - fc1_bias_buffer = IAllocator::MakeUniquePtr(allocator, static_cast(fc1_out_features)); - fc1_bias_float = fc1_bias_buffer.get(); + fc1_bias_float = thread_bias1_buffer; if constexpr (std::is_same_v) { MlasConvertHalfToFloatBuffer(reinterpret_cast(B1_bias), fc1_bias_float, static_cast(fc1_out_features)); @@ -750,7 +1096,6 @@ Status QMoECPU::Compute(OpKernelContext* context) const { num_expert_tokens, fc1_out_features, hidden_size, q_type, tp); if (gemm_status.IsOK()) { - fc1_used_direct_q4 = true; goto fc1_gemm_done; } } @@ -797,8 +1142,9 @@ Status QMoECPU::Compute(OpKernelContext* context) const { 0.0f, C1, n, tp); - fc1_bias_handled_by_q4_gemm = fc1_used_direct_q4 && has_fc1_bias; - if (has_fc1_bias && !fc1_bias_handled_by_q4_gemm) { + fc1_bias_handling: + + if (has_fc1_bias) { const T* B1_bias = fc1_bias_data + expert_idx * fc1_out_features; if constexpr (std::is_same_v) { MlasConvertHalfToFloatBuffer(reinterpret_cast(B1_bias), thread_bias1_buffer, static_cast(fc1_out_features)); @@ -837,22 +1183,30 @@ Status QMoECPU::Compute(OpKernelContext* context) const { fc1_gemm_done: - const int64_t activation_threshold = std::max(int64_t{4}, 256 / std::max(int64_t{1}, inter_size)); - if (num_expert_tokens >= activation_threshold && tp != nullptr) { - const int64_t activation_block_size = std::max(int64_t{1}, std::min(int64_t{64}, activation_threshold)); - const 
int64_t num_activation_blocks = (num_expert_tokens + activation_block_size - 1) / activation_block_size; - - if (num_activation_blocks > 1) { - concurrency::ThreadPool::TrySimpleParallelFor(tp, narrow(num_activation_blocks), [&](std::ptrdiff_t block_idx) { - const int64_t start_token = block_idx * activation_block_size; - const int64_t end_token = std::min(start_token + activation_block_size, num_expert_tokens); - - for (int64_t i = start_token; i < end_token; ++i) { + if (activation_type_ == ActivationType::SwiGLU) { + const int64_t activation_threshold = std::max(int64_t{4}, 256 / std::max(int64_t{1}, inter_size)); + if (num_expert_tokens >= activation_threshold && tp != nullptr) { + const int64_t activation_block_size = std::max(int64_t{1}, std::min(int64_t{64}, activation_threshold)); + const int64_t num_activation_blocks = (num_expert_tokens + activation_block_size - 1) / activation_block_size; + + if (num_activation_blocks > 1) { + concurrency::ThreadPool::TrySimpleParallelFor(tp, narrow(num_activation_blocks), [&](std::ptrdiff_t block_idx) { + const int64_t start_token = block_idx * activation_block_size; + const int64_t end_token = std::min(start_token + activation_block_size, num_expert_tokens); + + for (int64_t i = start_token; i < end_token; ++i) { + const float* C1_token = C1 + i * fc1_out_features; + float* A2_token = A2 + i * inter_size; + ApplySwiGLUActivation(C1_token, A2_token, inter_size, true, activation_alpha_, activation_beta_, swiglu_limit_); + } + }); + } else { + for (int64_t i = 0; i < num_expert_tokens; ++i) { const float* C1_token = C1 + i * fc1_out_features; float* A2_token = A2 + i * inter_size; ApplySwiGLUActivation(C1_token, A2_token, inter_size, true, activation_alpha_, activation_beta_, swiglu_limit_); } - }); + } } else { for (int64_t i = 0; i < num_expert_tokens; ++i) { const float* C1_token = C1 + i * fc1_out_features; @@ -861,11 +1215,8 @@ Status QMoECPU::Compute(OpKernelContext* context) const { } } } else { - for (int64_t i = 
0; i < num_expert_tokens; ++i) { - const float* C1_token = C1 + i * fc1_out_features; - float* A2_token = A2 + i * inter_size; - ApplySwiGLUActivation(C1_token, A2_token, inter_size, true, activation_alpha_, activation_beta_, swiglu_limit_); - } + ApplyActivationVectorized(C1, num_expert_tokens * fc1_out_features); + std::copy(C1, C1 + (num_expert_tokens * fc1_out_features), A2); } const T* fc2_scales_ptr; @@ -888,10 +1239,58 @@ Status QMoECPU::Compute(OpKernelContext* context) const { const size_t k2 = static_cast(inter_size); MLAS_BLK_QUANT_TYPE q_type2 = BlkQ4Sym; // Initialize to default - bool use_direct_q4_gemm_fc2 = (fc2_zp_data == nullptr) && - CanUseMlasQ4Gemm(expert_weight_bits_, is_fc2_block_wise ? block_size_ : 0, - hidden_size, inter_size, q_type2); - bool fc2_used_direct_q4 = false; + bool use_direct_q4_gemm_fc2 = use_mlas_q4_gemm_effective && + ((fc2_direct_q4_cache_ptr != nullptr) || + ((packed_fc2_ == nullptr) && (fc2_zp_data == nullptr) && + CanUseMlasQ4Gemm(expert_weight_bits_, is_fc2_block_wise ? block_size_ : 0, + hidden_size, inter_size, q_type2))); + + if (packed_fc2_ != nullptr) { + if (use_mlas_q4_gemm_effective && fc2_zp_data == nullptr && + CanUseMlasQ4Gemm(expert_weight_bits_, is_fc2_block_wise ? 
block_size_ : 0, + hidden_size, inter_size, q_type2)) { + if (fc2_direct_q4_cache_ptr != nullptr) { + float* fc2_bias_float = nullptr; + if (has_fc2_bias) { + const T* B2_bias = fc2_bias_data + expert_idx * hidden_size; + if constexpr (std::is_same_v) { + MlasConvertHalfToFloatBuffer(reinterpret_cast(B2_bias), thread_bias2_buffer, static_cast(hidden_size)); + } else { + std::memcpy(thread_bias2_buffer, B2_bias, static_cast(hidden_size) * sizeof(float)); + } + fc2_bias_float = thread_bias2_buffer; + } + + size_t packed_size = MlasQ4GemmPackBSize(q_type2, static_cast(hidden_size), static_cast(inter_size)); + const uint8_t* packed_b = static_cast(fc2_direct_q4_cache_ptr) + expert_idx * packed_size; + + Status gemm_status = DirectQ4Gemm(A2, packed_b, fc2_bias_float, C2, + num_expert_tokens, hidden_size, inter_size, fc2_direct_qtype, tp); + if (gemm_status.IsOK()) { + fc2_bias_added_by_mlas = true; + goto fc2_gemm_done; + } + } + } + + // Dequantize from PrePacked (transposed, unpacked) + const uint8_t* current_packed_ptr = static_cast(packed_fc2_.get()) + expert_idx * hidden_size * inter_size; + + DequantizePrePacked(current_packed_ptr, fc2_scales_ptr, fc2_zp_ptr, + is_fc2_block_wise ? 
block_size_ : 0, + hidden_size, inter_size, + B2_dequant, fc2_scales_dims); + + // Fallback + MlasGemm(CblasNoTrans, CblasNoTrans, + m2, n2, k2, + 1.0f, A2, k2, + B2_dequant, n2, + 0.0f, C2, n2, + tp); + + goto fc2_gemm_done; + } if (use_direct_q4_gemm_fc2) { IAllocatorUniquePtr mlas_packed_fc2; @@ -909,12 +1308,10 @@ Status QMoECPU::Compute(OpKernelContext* context) const { if (convert_status.IsOK()) { float* fc2_bias_float = nullptr; - IAllocatorUniquePtr fc2_bias_buffer; if (has_fc2_bias) { const T* B2_bias = fc2_bias_data + expert_idx * hidden_size; - fc2_bias_buffer = IAllocator::MakeUniquePtr(allocator, static_cast(hidden_size)); - fc2_bias_float = fc2_bias_buffer.get(); + fc2_bias_float = thread_bias2_buffer; if constexpr (std::is_same_v) { MlasConvertHalfToFloatBuffer(reinterpret_cast(B2_bias), fc2_bias_float, static_cast(hidden_size)); @@ -929,7 +1326,7 @@ Status QMoECPU::Compute(OpKernelContext* context) const { num_expert_tokens, hidden_size, inter_size, q_type2, tp); if (gemm_status.IsOK()) { - fc2_used_direct_q4 = true; + fc2_bias_added_by_mlas = true; goto fc2_gemm_done; } } @@ -979,8 +1376,7 @@ Status QMoECPU::Compute(OpKernelContext* context) const { fc2_gemm_done: - bool fc2_bias_handled_by_q4_gemm = fc2_used_direct_q4 && has_fc2_bias; - if (has_fc2_bias && !fc2_bias_handled_by_q4_gemm) { + if (has_fc2_bias && !fc2_bias_added_by_mlas) { const T* B2_bias = fc2_bias_data + expert_idx * hidden_size; if constexpr (std::is_same_v) { MlasConvertHalfToFloatBuffer(reinterpret_cast(B2_bias), thread_bias2_buffer, static_cast(hidden_size)); @@ -1015,7 +1411,7 @@ Status QMoECPU::Compute(OpKernelContext* context) const { float* dest = thread_local_outputs + static_cast(thread_id) * output_buffer_size + buffer_offset; const float* src = C2 + i * hidden_size; - if (has_fc2_bias && !fc2_bias_handled_by_q4_gemm) { + if (has_fc2_bias && !fc2_bias_added_by_mlas) { const size_t unroll_factor = narrow(GetUnrollFactor(hidden_size)); size_t j = 0; for (; j + 
unroll_factor <= narrow(hidden_size); j += unroll_factor) { @@ -1109,10 +1505,22 @@ Status QMoECPU::Compute(OpKernelContext* context) const { return Status::OK(); } +template +void QMoECPU::ApplyActivationVectorized(float* data, int64_t size) const { + for (int64_t i = 0; i < size; ++i) { + data[i] = ApplyActivation(data[i], activation_type_); + } +} + template QMoECPU::QMoECPU(const OpKernelInfo& op_kernel_info); + template Status QMoECPU::Compute(OpKernelContext* context) const; +template Status QMoECPU::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed, PrePackedWeights* prepacked_weights); +template Status QMoECPU::UseSharedPrePackedBuffers_V2(std::vector& prepacked_buffers, gsl::span prepacked_buffer_sizes, int input_idx, bool& used_shared_buffers); template QMoECPU::QMoECPU(const OpKernelInfo& op_kernel_info); template Status QMoECPU::Compute(OpKernelContext* context) const; +template Status QMoECPU::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed, PrePackedWeights* prepacked_weights); +template Status QMoECPU::UseSharedPrePackedBuffers_V2(std::vector& prepacked_buffers, gsl::span prepacked_buffer_sizes, int input_idx, bool& used_shared_buffers); // Kernel Registration ONNX_OPERATOR_TYPED_KERNEL_EX( diff --git a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.h b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.h index 890580e051a8e..f678a27190c90 100644 --- a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.h +++ b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.h @@ -5,7 +5,9 @@ #include "core/common/common.h" #include "core/framework/op_kernel.h" +#include "core/mlas/inc/mlas_q4.h" #include "contrib_ops/cpu/moe/moe_base_cpu.h" +#include namespace onnxruntime { namespace contrib { @@ -26,8 +28,30 @@ class QMoECPU final : public OpKernel, public MoEBaseCPU { Status Compute(OpKernelContext* context) const override; private: + Status PrePack(const Tensor& tensor, int input_idx, 
AllocatorPtr alloc, + /*out*/ bool& is_packed, + /*out*/ PrePackedWeights* prepacked_weights) override; + + Status UseSharedPrePackedBuffers_V2(std::vector& prepacked_buffers, + gsl::span prepacked_buffer_sizes, + int input_idx, + /*out*/ bool& used_shared_buffers) override; + + void ApplyActivationVectorized(float* data, int64_t size) const; + int64_t expert_weight_bits_; int64_t block_size_; + bool use_mlas_q4_gemm_{false}; + bool use_mlas_q4_gemm_overridden_{false}; + + IAllocatorUniquePtr packed_fc1_; + IAllocatorUniquePtr packed_fc2_; + + TensorShape fc1_shape_; + TensorShape fc2_shape_; + + IAllocatorUniquePtr packed_fc1_mlas_cache_; + IAllocatorUniquePtr packed_fc2_mlas_cache_; }; } // namespace contrib diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index 53e95bd8c5627..cc93799059f43 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -359,9 +359,9 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou OpKernel::Info().TryGetConstantInput(InputIndex::scales, &scales); if (scales && MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, compute_type_, has_zp_input_)) { auto sptr = scales->Data(); - auto tensor_size = static_cast(tensor.Shape().Size()); - auto ptr = IAllocator::MakeUniquePtr(alloc, tensor_size, true); - MlasConvertHalfToFloatBuffer(sptr, ptr.get(), tensor_size); + auto scales_size = static_cast(scales->Shape().Size()); + auto ptr = IAllocator::MakeUniquePtr(alloc, scales_size, true); + MlasConvertHalfToFloatBuffer(sptr, ptr.get(), scales_size); scales_fp32_ = std::move(ptr); } diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc index afea9f62419fa..00711e416e4e3 100644 --- a/onnxruntime/core/common/cpuid_info.cc +++ b/onnxruntime/core/common/cpuid_info.cc @@ -366,7 +366,11 @@ CPUIDInfo::CPUIDInfo() { #endif // 
defined(CPUINFO_SUPPORTED) // Note: This should be run after cpuinfo initialization if cpuinfo is enabled. + // On Wasm/Emscripten, cpuinfo cannot detect the CPU vendor so skip to avoid + // an unhelpful "Unknown CPU vendor" warning. +#if !defined(__wasm__) VendorInfoInit(); +#endif #ifdef CPUIDINFO_ARCH_X86 X86Init(); diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h index ca9315c7ef95d..be301019df5c0 100644 --- a/onnxruntime/core/common/cpuid_info.h +++ b/onnxruntime/core/common/cpuid_info.h @@ -168,7 +168,7 @@ class CPUIDInfo { bool has_arm_sme2_{false}; std::string vendor_; - uint32_t vendor_id_; + uint32_t vendor_id_{0}; }; } // namespace onnxruntime diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc index a656abb098911..7648aaf8f9d33 100644 --- a/onnxruntime/core/framework/allocator.cc +++ b/onnxruntime/core/framework/allocator.cc @@ -237,7 +237,19 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::AMD, device_id), mem_type1); } else if (strcmp(name1, onnxruntime::WEBGPU_BUFFER) == 0 || - strcmp(name1, onnxruntime::WEBNN_TENSOR) == 0) { + strcmp(name1, onnxruntime::WEBNN_TENSOR) == 0 || + // PR #27207 (merged to main/1.25.x, not in 1.24.x) shortened the WebGPU/WebNN + // memory info names from "WebGPU_Buffer"/"WebNN_Tensor" to "WebGPU_Buf"/"WebNN_Ten" + // to enable Small String Optimization (SSO) on wasm32 (emscripten), where strings + // must be <= 10 chars for SSO. + // + // A WebGPU/WebNN plugin EP built against 1.25.x will use the new short names. + // Accept both old and new names here so that plugin EPs targeting either 1.24.x + // or 1.25.x can work with this 1.24.x runtime. 
+ // + // See: https://github.com/microsoft/onnxruntime/pull/27207 + strcmp(name1, "WebGPU_Buf") == 0 || + strcmp(name1, "WebNN_Ten") == 0) { *out = new OrtMemoryInfo( name1, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NONE, device_id), diff --git a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc index 38dd8de01147c..5137c22d6cf61 100644 --- a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc +++ b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc @@ -621,8 +621,8 @@ void DumpNodeInputs( std::cout << " is non-tensor type.\n"; } } else { - // this could happen with an empty Optional input - std::cout << " was missing data type\n"; + // this could happen with an empty Optional input or the tensor is removed after pre-packing. + std::cout << " was missing data type (maybe pre-packed).\n"; } } else { std::cout << "Input " << i << " is optional and was not provided.\n"; diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 961012536126b..e4c7830ffbb55 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -371,6 +371,21 @@ Status ValidateExternalDataPath(const std::filesystem::path& base_dir, "External data path: ", location, " (resolved path: ", resolved, ") escapes model directory: ", base_dir); } + } else { + // The basedir is empty, which occurs when 1) the session loads a model from bytes and 2) the application does not + // set an external file folder path via the session config option + // `kOrtSessionOptionsModelExternalInitializersFileFolderPath`. + + // We conservatively check that the normalized relative path does not contain ".." path components that would allow + // access to arbitrary files outside of the current working directory. Based on ONNX checker validation. 
+ auto norm_location = location.lexically_normal(); + + for (const auto& path_component : norm_location) { + if (path_component == ORT_TSTR("..")) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "External data path: ", location, + " (model loaded from bytes) escapes working directory"); + } + } } return Status::OK(); } diff --git a/onnxruntime/core/mlas/inc/mlas_q4.h b/onnxruntime/core/mlas/inc/mlas_q4.h index 69f0435615079..d60e5b0164fe8 100644 --- a/onnxruntime/core/mlas/inc/mlas_q4.h +++ b/onnxruntime/core/mlas/inc/mlas_q4.h @@ -57,10 +57,10 @@ MlasQ4GemmPackBSize( * * @param QType type of block quantization * @param PackedBuf destination buffer - * @param FpData the pointer to fp32 matrix - * @param N the number of columns of matrix B. - * @param K the number of rows of matrix B. - * @param ldb leading dimension of B + * @param FpData the pointer to fp32 matrix, with shape [K, N]. + * @param N the number of columns of matrix B (Output Channels). + * @param K the number of rows of matrix B (Input Channels). 
+ * @param ldb leading dimension of FpData (usually N) */ void MLASCALL diff --git a/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp b/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp index a406f371a3bd2..06f9d97b872c7 100644 --- a/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp +++ b/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp @@ -19,8 +19,12 @@ #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h" #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa.h" #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa.h" + +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa.h" +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h" #if defined(ENABLE_QMX_KERNELS) #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa.h" +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa.h" #endif // ENABLE_QMX_KERNELS const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod = @@ -125,6 +129,32 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_sme2 = kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa, kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa}; +const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel qgemm_gemm_sme = + {kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + 
kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_dst_size_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa}; + +const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel qgemm_gemm_sme2 = + {kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_dst_size_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa}; + #if defined(ENABLE_QMX_KERNELS) const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_qmx = {kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa, @@ -138,6 +168,19 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_qmx = kai_get_dst_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa, kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa, 
kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa}; + +const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel qgemm_gemm_qmx = + {kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_dst_size_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa}; #endif // ENABLE_QMX_KERNELS const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemmUKernel() { @@ -181,3 +224,21 @@ const kai_matmul_clamp_f32_f32_f32p_ukernel& GetKleidiAISGemvUKernel() { return sgemm_gemv_sme; } } + +const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel& GetKleidiAIQGemmUKernel() { + if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2()) { + return qgemm_gemm_sme2; + } else { +#if defined(ENABLE_QMX_KERNELS) + if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0) + { + KLEIDIAI_KERNEL_LOG("QGEMM: Using QMX Kernel"); + return qgemm_gemm_qmx; + } else { + return qgemm_gemm_sme; + } +#else + return qgemm_gemm_sme; +#endif // ENABLE_QMX_KERNELS + } +} diff --git a/onnxruntime/core/mlas/lib/kai_ukernel_interface.h b/onnxruntime/core/mlas/lib/kai_ukernel_interface.h index e69c72329d64b..7bd8959b0b5bd 100644 --- a/onnxruntime/core/mlas/lib/kai_ukernel_interface.h +++ b/onnxruntime/core/mlas/lib/kai_ukernel_interface.h @@ -12,8 +12,12 @@ #include 
"kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p_interface.h" +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp_qsi8cxp_interface.h" + const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemmUKernel(); const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemvUKernel(); const kai_matmul_clamp_f32_f32p_f32p_ukernel& GetKleidiAISGemmUKernel(); const kai_matmul_clamp_f32_f32_f32p_ukernel& GetKleidiAISGemvUKernel(); + +const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel& GetKleidiAIQGemmUKernel(); \ No newline at end of file diff --git a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h index d4df09bb94a93..4c088e8660874 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h +++ b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h @@ -107,14 +107,14 @@ MlasGemmBatch( size_t MLASCALL -MlasDynamicQgemmPackBSize( +MlasDynamicQGemmPackBSize( size_t N, size_t K ); void MLASCALL -MlasDynamicQgemmPackB( +MlasDynamicQGemmPackB( size_t N, size_t K, const int8_t* B, diff --git a/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp index 9b2deac69ff63..b6a23735bd131 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp @@ -10,9 +10,7 @@ #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon.h" -#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h" -#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa.h" -#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.h" +#include "kai_ukernel_interface.h" 
#if defined(ENABLE_QMX_KERNELS) #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa.h" #endif // ENABLE_QMX_KERNELS @@ -26,11 +24,13 @@ struct KaiTlsBuffersQgemm { }; static thread_local KaiTlsBuffersQgemm g_kai_tls_qgemm; +const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel qgemm_gemm = GetKleidiAIQGemmUKernel(); + // Matmul with float output of dynamic-quantized A and symmetric-quantized B. size_t MLASCALL -ArmKleidiAI::MlasDynamicQgemmPackBSize( +ArmKleidiAI::MlasDynamicQGemmPackBSize( size_t N, size_t K ) { @@ -39,10 +39,9 @@ ArmKleidiAI::MlasDynamicQgemmPackBSize( return 0; } - // Default to sme2_mopa, but this may not always be the most optimal kernel variant to use. - auto nr = kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(); - auto kr = kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(); - auto sr = kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(); + auto nr = qgemm_gemm.get_nr(); + auto kr = qgemm_gemm.get_kr(); + auto sr = qgemm_gemm.get_sr(); // Regardless of kernel variant, use the NEON packing variant. KLEIDIAI_KERNEL_LOG("kai_run_rhs_pack_kxn_qsi8cxp_qsi8cx_neon Groups=1" @@ -52,7 +51,7 @@ ArmKleidiAI::MlasDynamicQgemmPackBSize( void MLASCALL -ArmKleidiAI::MlasDynamicQgemmPackB( +ArmKleidiAI::MlasDynamicQGemmPackB( size_t N, size_t K, const int8_t* B, @@ -65,10 +64,9 @@ ArmKleidiAI::MlasDynamicQgemmPackB( return; } - // Default to sme2_mopa, but this may not always be the most optimal kernel variant to use. 
- auto nr = kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(); - auto kr = kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(); - auto sr = kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(); + auto nr = qgemm_gemm.get_nr(); + auto kr = qgemm_gemm.get_kr(); + auto sr = qgemm_gemm.get_sr(); // y - float output // scale_factor_lhs - lhs scaling factor @@ -105,17 +103,12 @@ ArmKleidiAI::MlasDynamicQGemmBatch( MLAS_THREADPOOL* ThreadPool ) { - const size_t mr = UseSME2 ? kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa() - : kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(); - const size_t kr = UseSME2 ? kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa() - : kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(); - const size_t sr = UseSME2 ? kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa() - : kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(); + const size_t mr = qgemm_gemm.get_mr(); + const size_t kr = qgemm_gemm.get_kr(); + const size_t sr = qgemm_gemm.get_sr(); - size_t m_step = UseSME2 ? kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa() - : kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(); - size_t n_step = UseSME2 ? kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa() - : kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(); + size_t m_step = qgemm_gemm.get_m_step(); + size_t n_step = qgemm_gemm.get_n_step(); if (BatchSize == 0 || Shape.M == 0 || Shape.N == 0 || Shape.K == 0) { return; @@ -216,17 +209,13 @@ ArmKleidiAI::MlasDynamicQGemmBatch( ptrdiff_t NIdx = (tid % (dim[1] * dim[2])) % dim[2]; // Get rhs tile, B - const size_t rhs_packed_offset = - UseSME2 ? 
kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(NIdx * n_step, Shape.K) - : kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(NIdx * n_step, Shape.K); + const size_t rhs_packed_offset = qgemm_gemm.get_rhs_packed_offset(NIdx * n_step, Shape.K); const std::byte* B_base = reinterpret_cast(DataParams[BIdx].PackedB); auto BTile = reinterpret_cast(B_base + rhs_packed_offset); // Get lhs tile, A - const size_t lhs_packed_offset = - UseSME2 ? kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(MIdx * m_step, Shape.K) - : kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(MIdx * m_step, Shape.K); + const size_t lhs_packed_offset =qgemm_gemm.get_lhs_packed_offset(MIdx * m_step, Shape.K); const std::byte* A_base = tls_lhs_base[BIdx]; // LhsPackedData + LhsPackedStride * BIdx; OR DataParams[batch_idx].Workspace; auto ATile = reinterpret_cast(A_base + lhs_packed_offset); @@ -240,46 +229,12 @@ ArmKleidiAI::MlasDynamicQGemmBatch( NIdx * n_step * sizeof(float) ); - if (UseSME2) { - kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa( + qgemm_gemm.run_matmul( TileSizeM, TileSizeN, Shape.K, ATile, BTile, dst_tile, DataParams[BIdx].ldc * sizeof(float), sizeof(float), -std::numeric_limits::max(), std::numeric_limits::max() ); - } - else { - #if defined(ENABLE_QMX_KERNELS) - if(ArmKleidiAI::vendor_name.compare("Qualcomm") == 0) - { - kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa( - TileSizeM, TileSizeN, Shape.K, ATile, BTile, - dst_tile, - DataParams[BIdx].ldc * sizeof(float), - sizeof(float), - -std::numeric_limits::max(), std::numeric_limits::max() - ); - } - else - { - kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa( - TileSizeM, TileSizeN, Shape.K, ATile, BTile, - dst_tile, - DataParams[BIdx].ldc * sizeof(float), - sizeof(float), - -std::numeric_limits::max(), 
std::numeric_limits::max() - ); - } - #else - kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa( - TileSizeM, TileSizeN, Shape.K, ATile, BTile, - dst_tile, - DataParams[BIdx].ldc * sizeof(float), - sizeof(float), - -std::numeric_limits::max(), std::numeric_limits::max() - ); - #endif // ENABLE_QMX_KERNELS - } }); } diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index e75ca3dc90e60..ac7528853c596 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -851,17 +851,9 @@ bool MLAS_THREADPOOL* ThreadPool ); -typedef void (MLASCALL MLAS_GEMM_BATCH)( - CBLAS_TRANSPOSE TransA, - CBLAS_TRANSPOSE TransB, - size_t M, - size_t N, - size_t K, - const MLAS_SGEMM_DATA_PARAMS* Data, - size_t BatchSize, - MLAS_THREADPOOL* ThreadPool); - -typedef bool (MLASCALL MLAS_GEMM_BATCH_OVERRIDE)( +typedef +bool +(MLASCALL MLAS_SGEMM_BATCH_OVERRIDE)( CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, size_t M, @@ -871,19 +863,17 @@ typedef bool (MLASCALL MLAS_GEMM_BATCH_OVERRIDE)( size_t BatchSize, MLAS_THREADPOOL* ThreadPool); -typedef size_t (MLASCALL MLAS_GEMM_PACK_B_SIZE)( - CBLAS_TRANSPOSE TransA, - CBLAS_TRANSPOSE TransB, - size_t N, - size_t K); - -typedef size_t (MLASCALL MLAS_GEMM_PACK_B_SIZE_OVERRIDE)( +typedef +size_t +(MLASCALL MLAS_SGEMM_PACK_B_SIZE_OVERRIDE)( CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, size_t N, size_t K); -typedef void (MLASCALL MLAS_GEMM_PACK_B)( +typedef +bool +(MLASCALL MLAS_SGEMM_PACK_B_OVERRIDE)( CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, size_t N, @@ -892,13 +882,28 @@ typedef void (MLASCALL MLAS_GEMM_PACK_B)( size_t ldb, void* PackedB); -typedef bool (MLASCALL MLAS_GEMM_PACK_B_OVERRIDE)( - CBLAS_TRANSPOSE TransA, - CBLAS_TRANSPOSE TransB, +typedef +void +(MLASCALL MLAS_DYNAMIC_QGEMM_BATCH_OVERRIDE)( + const MLAS_GEMM_DYN_QUANT_SHAPE_PARAMS& Shape, + const MLAS_GEMM_DYN_QUANT_DATA_PARAMS* DataParams, + const size_t BatchN, + MLAS_THREADPOOL* ThreadPool); + 
+typedef +size_t +(MLASCALL MLAS_DYNAMIC_QGEMM_PACK_B_SIZE_OVERRIDE)( + size_t N, + size_t K); + +typedef +void +(MLASCALL MLAS_DYNAMIC_QGEMM_PACK_B_OVERRIDE)( size_t N, size_t K, - const float* B, - size_t ldb, + const int8_t* B, + const float* Scales, + const float* Bias, void* PackedB); extern "C" { @@ -1333,10 +1338,15 @@ struct MLAS_PLATFORM { bool Avx512Supported_ = false; bool ArmNeonIsQuantActivationsUnsigned = false; - // Mlas overrides initialisation - MLAS_GEMM_BATCH_OVERRIDE* MlasGemmBatchOverride = nullptr; - MLAS_GEMM_PACK_B_SIZE_OVERRIDE* MlasGemmPackBSizeOverride = nullptr; - MLAS_GEMM_PACK_B_OVERRIDE* MlasGemmPackBOverride = nullptr; + // MLAS SGemm overrides + MLAS_SGEMM_BATCH_OVERRIDE* MlasSGemmBatchOverride = nullptr; + MLAS_SGEMM_PACK_B_SIZE_OVERRIDE* MlasSGemmPackBSizeOverride = nullptr; + MLAS_SGEMM_PACK_B_OVERRIDE* MlasSGemmPackBOverride = nullptr; + // MLAS Dynamic QGemm overrides + MLAS_DYNAMIC_QGEMM_BATCH_OVERRIDE* MlasDynamicQGemmBatchOverride = nullptr; + MLAS_DYNAMIC_QGEMM_PACK_B_SIZE_OVERRIDE* MlasDynamicQGemmPackBSizeOverride = nullptr; + MLAS_DYNAMIC_QGEMM_PACK_B_OVERRIDE* MlasDynamicQGemmPackBOverride = nullptr; + // MLAS Conv overrides MLAS_CONV_PREPARE_FLOAT_OVERRIDE* MlasConvPrepareOverride = nullptr; MLAS_CONV_FLOAT_OVERRIDE* MlasConvOverride = nullptr; diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index b913b1c3b8c26..6ebd6be068b12 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -610,9 +610,12 @@ Return Value: #if defined(USE_KLEIDIAI) if(MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()){ - this->MlasGemmBatchOverride = ArmKleidiAI::MlasGemmBatch; - this->MlasGemmPackBSizeOverride = ArmKleidiAI::MlasGemmPackBSize; - this->MlasGemmPackBOverride = ArmKleidiAI::MlasGemmPackB; + this->MlasSGemmBatchOverride = ArmKleidiAI::MlasGemmBatch; + this->MlasSGemmPackBSizeOverride = ArmKleidiAI::MlasGemmPackBSize; + this->MlasSGemmPackBOverride = 
ArmKleidiAI::MlasGemmPackB; + this->MlasDynamicQGemmBatchOverride = ArmKleidiAI::MlasDynamicQGemmBatch; + this->MlasDynamicQGemmPackBSizeOverride = ArmKleidiAI::MlasDynamicQGemmPackBSize; + this->MlasDynamicQGemmPackBOverride = ArmKleidiAI::MlasDynamicQGemmPackB; this->MlasConvPrepareOverride = ArmKleidiAI::MlasConvPrepare; this->MlasConvOverride = ArmKleidiAI::MlasConv; } diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp index 3fc69a607de3f..5f6a8f8394470 100644 --- a/onnxruntime/core/mlas/lib/qgemm.cpp +++ b/onnxruntime/core/mlas/lib/qgemm.cpp @@ -224,7 +224,9 @@ MlasDynamicQGemmBatch ( #if defined(USE_KLEIDIAI) //No fallback - ArmKleidiAI::MlasDynamicQGemmBatch(Shape, DataParams, BatchN, ThreadPool); + if (GetMlasPlatform().MlasDynamicQGemmBatchOverride != nullptr) { + GetMlasPlatform().MlasDynamicQGemmBatchOverride(Shape, DataParams, BatchN, ThreadPool); + } #endif MLAS_UNREFERENCED_PARAMETER(Shape); @@ -348,8 +350,9 @@ MlasDynamicQgemmPackBSize( size_t bytes = 0; #if defined(USE_KLEIDIAI) //No fallback available - //TODO: Insert Override - bytes = ArmKleidiAI::MlasDynamicQgemmPackBSize(N, K); + if (GetMlasPlatform().MlasDynamicQGemmPackBSizeOverride != nullptr) { + bytes = GetMlasPlatform().MlasDynamicQGemmPackBSizeOverride(N, K); + } #endif MLAS_UNREFERENCED_PARAMETER(N); @@ -442,7 +445,9 @@ MlasDynamicQgemmPackB( #if defined(USE_KLEIDIAI) //No fallback - ArmKleidiAI::MlasDynamicQgemmPackB(N, K, B, Scales, Bias, PackedB); + if (GetMlasPlatform().MlasDynamicQGemmPackBOverride != nullptr) { + GetMlasPlatform().MlasDynamicQGemmPackBOverride(N, K, B, Scales, Bias, PackedB); + } #endif MLAS_UNREFERENCED_PARAMETER(N); diff --git a/onnxruntime/core/mlas/lib/sgemm.cpp b/onnxruntime/core/mlas/lib/sgemm.cpp index 02e38b6ef432e..7117f20b82ce5 100644 --- a/onnxruntime/core/mlas/lib/sgemm.cpp +++ b/onnxruntime/core/mlas/lib/sgemm.cpp @@ -1573,10 +1573,10 @@ MlasGemmBatch( ) { // Override - if(GetMlasPlatform().MlasGemmBatchOverride != 
nullptr && + if(GetMlasPlatform().MlasSGemmBatchOverride != nullptr && // TODO: Remove once KAI supports transposing for A TransA != CBLAS_TRANSPOSE::CblasTrans && - GetMlasPlatform().MlasGemmBatchOverride(TransA, TransB, M, N, K, Data, BatchSize, ThreadPool)){ + GetMlasPlatform().MlasSGemmBatchOverride(TransA, TransB, M, N, K, Data, BatchSize, ThreadPool)){ return; } // @@ -1671,12 +1671,12 @@ Return Value: // // KleidiAI or other override #if defined(USE_KLEIDIAI) - if (GetMlasPlatform().MlasGemmPackBSizeOverride != nullptr && + if (GetMlasPlatform().MlasSGemmPackBSizeOverride != nullptr && // TODO: Remove once KAI supports transposing for A TransA != CBLAS_TRANSPOSE::CblasTrans) { size_t bytes_required; //TODO pass status by reference to indicate success/fail - bytes_required = GetMlasPlatform().MlasGemmPackBSizeOverride(TransA, TransB, N, K); + bytes_required = GetMlasPlatform().MlasSGemmPackBSizeOverride(TransA, TransB, N, K); if (bytes_required != 0){// If ArmKleidiAI::MlasGemmPackBSize ran to completion return bytes_required; } @@ -1738,10 +1738,10 @@ Return Value: --*/ { #if defined(USE_KLEIDIAI) - if (GetMlasPlatform().MlasGemmPackBOverride != nullptr && + if (GetMlasPlatform().MlasSGemmPackBOverride != nullptr && // TODO: Remove once KAI supports transposing for A TransA != CBLAS_TRANSPOSE::CblasTrans && - GetMlasPlatform().MlasGemmPackBOverride(TransA, TransB, N, K, B, ldb, PackedB)){ + GetMlasPlatform().MlasSGemmPackBOverride(TransA, TransB, N, K, B, ldb, PackedB)){ return; } #endif diff --git a/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc b/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc index 655364357999a..3727ac0918115 100644 --- a/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc @@ -254,6 +254,21 @@ Status SkipLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_le continue; } + // SkipLayerNormalization kernel requires gamma and beta to be 1D. 
+ // Skip fusion if gamma or beta have more than 1 dimension. + const NodeArg* gamma_arg = ln_node.MutableInputDefs()[1]; + const TensorShapeProto* gamma_shape = gamma_arg->Shape(); + if (gamma_shape != nullptr && gamma_shape->dim_size() != 1) { + continue; + } + if (ln_node.MutableInputDefs().size() > 2) { + const NodeArg* beta_arg = ln_node.MutableInputDefs()[2]; + const TensorShapeProto* beta_shape = beta_arg->Shape(); + if (beta_shape != nullptr && beta_shape->dim_size() != 1) { + continue; + } + } + NodeArg beta_place_holder("", nullptr); // Get the inputs for the new SkipLayerNormalization node. diff --git a/onnxruntime/core/providers/cpu/tensor/gather.cc b/onnxruntime/core/providers/cpu/tensor/gather.cc index 38a16ee83c86b..e36f2c7f89183 100644 --- a/onnxruntime/core/providers/cpu/tensor/gather.cc +++ b/onnxruntime/core/providers/cpu/tensor/gather.cc @@ -102,9 +102,9 @@ Status GatherCopyData(const Tensor* indices_tensor, const uint8_t* src_base, uin } } - auto lambda = [&](int64_t index) { - int64_t batch = index / N; - int64_t i = index % N; + auto lambda = [&](ptrdiff_t index) { + const int64_t batch = static_cast(index / N); + const int64_t i = static_cast(index % N); const int64_t src_offset_batch = batch * data_batch_bytes; const int64_t dst_offset_batch = batch * gathered_batch_bytes; @@ -120,12 +120,14 @@ Status GatherCopyData(const Tensor* indices_tensor, const uint8_t* src_base, uin memcpy(dst_base + dst_offset, src_base + src_offset, narrow(block_size)); } }; - concurrency::ThreadPool::TryParallelFor(tp, SafeInt(M) * N, static_cast(block_size), - [&lambda](ptrdiff_t first, ptrdiff_t last) { - for (int index = static_cast(first), end = static_cast(last); index < end; ++index) { - lambda(index); - } - }); + + concurrency::ThreadPool::TryParallelFor( + tp, SafeInt(M) * N, static_cast(block_size), + [&lambda](ptrdiff_t first, ptrdiff_t last) { + for (ptrdiff_t index = first; index < last; ++index) { + lambda(index); + } + }); return Status::OK(); } 
diff --git a/onnxruntime/core/providers/cpu/tensor/gather_nd.cc b/onnxruntime/core/providers/cpu/tensor/gather_nd.cc index ad3faa70ed6af..a0a848eef0dff 100644 --- a/onnxruntime/core/providers/cpu/tensor/gather_nd.cc +++ b/onnxruntime/core/providers/cpu/tensor/gather_nd.cc @@ -66,6 +66,18 @@ Status GatherNDBase::PrepareForCompute(const TensorShape& input_shape, const Ten const auto num_slices = indices_shape.SizeToDimension(indices_shape.NumDimensions() - 1); const auto slice_size = input_shape.SizeFromDimension(SafeInt(batch_dims_) + num_slice_dims); const auto num_batches = input_shape.SizeToDimension(SafeInt(batch_dims_)); + + // Validate batch dimensions to prevent division by zero + if (num_batches == 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "GatherND: input tensor batch dimensions cannot be zero"); + } + if (num_slices % num_batches != 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "GatherND: indices batch size (", num_slices, + ") is not divisible by input batch size (", num_batches, ")"); + } + const auto input_batch_stride = input_shape.SizeFromDimension(SafeInt(batch_dims_)); const auto num_slices_per_batch = num_slices / num_batches; std::vector sizes_from_slice_dims(onnxruntime::narrow(num_slice_dims)); diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu index 51c80d272bb96..62801c8da1e5f 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu +++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu @@ -209,7 +209,7 @@ __device__ void reduce_all( // the size of shared_memory equals to the number of warps. 
#pragma unroll for (int stride = MAX_NUM_WARPS_PER_BLOCK / 2; stride > 0; stride /= 2) { - if (tid_in_block + stride < num_warps_in_block) { + if (tid_in_block < stride && tid_in_block + stride < num_warps_in_block) { shared_memory[tid_in_block] += shared_memory[tid_in_block + stride]; } __syncthreads(); diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu index a96d4c82a7fdc..963fa020d033a 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu @@ -585,6 +585,13 @@ size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode, static_cast(std::accumulate(output_dims.begin(), output_dims.end(), (int64_t)0)); case UpsampleMode::LINEAR: + // For LINEAR mode: + // - bilinear (2-D/4-D) uses mapping for [H, W] + // - trilinear (3-D/5-D) uses mapping for [D, H, W] + if (output_dims.size() == 3 || output_dims.size() == 5) { + return sizeof(LinearMappingInfo) * + static_cast(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 3, (int64_t)0)); + } return sizeof(LinearMappingInfo) * static_cast(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0)); case UpsampleMode::CUBIC: diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 2806eb7a7a8d8..aa6a69adb549c 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -4843,7 +4843,7 @@ static_assert(offsetof(OrtApi, CreateExternalInitializerInfo) / sizeof(void*) == static_assert(offsetof(OrtApi, GetTensorElementTypeAndShapeDataReference) / sizeof(void*) == 414, "Size of version 24 API cannot change"); // So that nobody forgets to finish an API version, this check will serve as a reminder: -static_assert(std::string_view(ORT_VERSION) == "1.24.2", +static_assert(std::string_view(ORT_VERSION) == "1.24.3", "ORT_Version change detected, 
please follow below steps to ensure OrtApi is updated properly"); // 1. Update the hardcoded version string in above static_assert to silence it // 2. If there were any APIs added to ort_api_1_to_24 above: diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc index d96d229c942cb..89651c2d955de 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc @@ -794,7 +794,7 @@ std::string _get_type_name(std::string&) { #if !defined(DISABLE_ML_OPS) template static void CreateMapMLValue_LoopIntoMap(Py_ssize_t& pos, PyObject*& key, const std::string& name_input, PyObject*& value, - PyObject* item, std::map& current, + PyObject* item, bool owns_item_ref, std::map& current, KeyGetterType keyGetter, ValueGetterType valueGetter) { KeyType ckey; ValueType cvalue; @@ -806,7 +806,9 @@ static void CreateMapMLValue_LoopIntoMap(Py_ssize_t& pos, PyObject*& key, const std::string sType = spyType; Py_XDECREF(pStr); Py_XDECREF(pType); - Py_XDECREF(item); + if (owns_item_ref) { + Py_XDECREF(item); + } throw std::runtime_error(std::string("Unexpected key type ") + sType + std::string(", it cannot be linked to C type ") + _get_type_name(ckey) + std::string(" for input '") + @@ -820,7 +822,9 @@ static void CreateMapMLValue_LoopIntoMap(Py_ssize_t& pos, PyObject*& key, const std::string sType = spyType; Py_XDECREF(pStr); Py_XDECREF(pType); - Py_XDECREF(item); + if (owns_item_ref) { + Py_XDECREF(item); + } throw std::runtime_error(std::string("Unexpected value type ") + sType + std::string(", it cannot be linked to C type ") + _get_type_name(ckey) + std::string(" for input '") + @@ -836,7 +840,7 @@ static void CreateMapMLValue_Map(Py_ssize_t& pos, PyObject*& key, const std::str ValueGetterType valueGetter) { std::unique_ptr> dst; dst = std::make_unique>(); - CreateMapMLValue_LoopIntoMap(pos, key, name_input, value, item, *dst, keyGetter, valueGetter); + 
CreateMapMLValue_LoopIntoMap(pos, key, name_input, value, item, false, *dst, keyGetter, valueGetter); p_mlvalue->Init(dst.release(), DataTypeImpl::GetType>(), DataTypeImpl::GetType>()->GetDeleteFunc()); } @@ -850,7 +854,7 @@ void CreateMapMLValue_VectorMap(Py_ssize_t& pos, PyObject*& key, const std::stri int index = 0; do { dstVector->push_back(std::map()); - CreateMapMLValue_LoopIntoMap(pos, key, name_input, value, item, (*dstVector)[index], keyGetter, valueGetter); + CreateMapMLValue_LoopIntoMap(pos, key, name_input, value, item, true, (*dstVector)[index], keyGetter, valueGetter); Py_DECREF(item); ++index; item = iterator == NULL ? NULL : PyIter_Next(iterator); diff --git a/onnxruntime/python/tools/pytorch_export_contrib_ops.py b/onnxruntime/python/tools/pytorch_export_contrib_ops.py index 1c5e31af99d82..0bd75e5c92e4c 100644 --- a/onnxruntime/python/tools/pytorch_export_contrib_ops.py +++ b/onnxruntime/python/tools/pytorch_export_contrib_ops.py @@ -6,6 +6,7 @@ PyTorch-ONNX exporter (torch.onnx.export). 
""" +import contextlib import typing try: @@ -22,7 +23,7 @@ _registered_ops: typing.AbstractSet[str] = set() -def _reg(symbolic_fn: typing.Callable, namespace: str = ""): +def _reg(symbolic_fn: typing.Callable, namespace: str = "aten"): name = f"{namespace}::{symbolic_fn.__name__}" torch.onnx.register_custom_op_symbolic(name, symbolic_fn, _OPSET_VERSION) _registered_ops.add(name) @@ -49,13 +50,6 @@ def grid_sampler(g, input, grid, mode, padding_mode, align_corners): padding_mode_str = ["zeros", "border", "reflection"][padding_mode] align_corners = int(symbolic_helper._maybe_get_const(align_corners, "b")) - # From opset v13 onward, the output shape can be specified with - # (N, C, H, W) (N, H_out, W_out, 2) => (N, C, H_out, W_out) - # input_shape = input.type().sizes() - # gird_shape = grid.type().sizes() - # output_shape = input_shape[:2] + gird_shape[1:3] - # g.op(...).setType(input.type().with_sizes(output_shape)) - return g.op( "com.microsoft::GridSample", input, @@ -71,15 +65,24 @@ def inverse(g, self): return g.op("com.microsoft::Inverse", self).setType(self.type()) _reg(inverse) + torch.onnx.register_custom_op_symbolic("aten::linalg_inv", inverse, _OPSET_VERSION) + _registered_ops.add("aten::linalg_inv") + + def gelu(g, self: torch._C.Value, approximate="none"): + # PyTorch can emit aten::gelu with or without the optional approximate arg. + if not isinstance(approximate, str): + approximate = symbolic_helper._maybe_get_const(approximate, "s") - @torch.onnx.symbolic_helper.parse_args("v", "s") - def gelu(g, self: torch._C.Value, approximate: str = "none"): - # Use microsoft::Gelu for performance if possible. It only supports approximate == "none" + # Use microsoft::Gelu for performance if possible. It only supports approximate == "none". 
if approximate == "none": return g.op("com.microsoft::Gelu", self).setType(self.type()) return torch.onnx.symbolic_opset9.gelu(g, self, approximate) _reg(gelu) + # Some PyTorch versions dispatch GELU symbolic lookup by exporter opset. + # Registering across stable opsets keeps ORT Gelu fusion consistently enabled. + for opset in range(9, 21): + torch.onnx.register_custom_op_symbolic("aten::gelu", gelu, opset) def triu(g, self, diagonal): return g.op("com.microsoft::Trilu", self, diagonal, upper_i=1).setType(self.type()) @@ -127,3 +130,8 @@ def unregister(): for version in symbolic_helper._onnx_stable_opsets: if version >= _OPSET_VERSION and symbolic_registry.is_registered_op(kind, namespace, version): del symbolic_registry._registry[(namespace, version)][kind] + + # Also clean up gelu's multi-opset registrations (see register()). + for opset in range(9, 21): + with contextlib.suppress(Exception): + torch.onnx.unregister_custom_op_symbolic("aten::gelu", opset) diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index 08f8691d8b2b5..de7f0a044c118 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -1112,11 +1112,11 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): if ( (mul_val is None) or not (isinstance(mul_val, np.ndarray) and mul_val.size == 1) - or (float(mul_val) >= 0) + or (mul_val.item() >= 0) ): return - if float(mul_val) != -10000: - self.mask_filter_value = float(mul_val) + if mul_val.item() != -10000: + self.mask_filter_value = mul_val.item() if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_k.input[0] == root_input: mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) if not is_no_mask_attention else None diff --git a/onnxruntime/python/tools/transformers/large_model_exporter.py 
b/onnxruntime/python/tools/transformers/large_model_exporter.py index 29829a6c475d9..f4d9e28d4ecb2 100644 --- a/onnxruntime/python/tools/transformers/large_model_exporter.py +++ b/onnxruntime/python/tools/transformers/large_model_exporter.py @@ -290,6 +290,7 @@ def do_export_internal(model: nn.Module, onnx_io_tuple: tuple, onnx_inputs: tupl input_names=onnx_inp_names, output_names=onnx_out_names, dynamic_axes=onnx_dynamic_axes, + dynamo=False, ) onnx_path.unlink(missing_ok=True) diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py index b405c19b04689..0b86d5f038cd8 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py @@ -473,7 +473,7 @@ def export_onnx( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=11, + opset_version=14, do_constant_folding=True, use_external_data_format=True, verbose=verbose, diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py index 2cb6a733c5bc7..17a4ef58914d6 100644 --- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py @@ -235,6 +235,7 @@ def run_torchscript_separate_export( opset_version=torch_export_onnx_opset_version, do_constant_folding=True, verbose=args.verbose, + dynamo=False, ) # Check decoder_model.onnx and save all external data to one file @@ -294,6 +295,7 @@ def run_torchscript_separate_export( opset_version=torch_export_onnx_opset_version, do_constant_folding=True, verbose=args.verbose, + dynamo=False, ) # Check decoder_with_past_model.onnx and save all external data to one file diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py 
b/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py index e10e616d35d38..31fb60f86faf1 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py @@ -391,8 +391,9 @@ def export_onnx( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=17, + opset_version=18, do_constant_folding=True, + dynamo=False, verbose=verbose, ) diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py index 851f641442016..48d4e12a38a43 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py @@ -110,8 +110,9 @@ def export_onnx( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=17, + opset_version=18, do_constant_folding=True, + dynamo=False, verbose=verbose, ) diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py index cd81edc1001be..35ec59b2bca69 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py @@ -293,8 +293,9 @@ def export_onnx( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=17, + opset_version=18, do_constant_folding=True, + dynamo=False, verbose=verbose, ) diff --git a/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py index 66f24c47f6cdb..a8c2ad1967acb 100644 --- a/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py +++ 
b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py @@ -49,6 +49,7 @@ def torch_onnx_export( keep_initializers_as_inputs=keep_initializers_as_inputs, custom_opsets=custom_opsets, export_modules_as_functions=export_modules_as_functions, + dynamo=False, ) else: torch.onnx.export( diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 8cbdc381a1a70..66f87142d3a34 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -495,6 +495,16 @@ TEST(MatMulNBits, Float16_4b_Accuracy4) { TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); + + // See PR #27412 for details on the following test case, + // which is added to cover a specific failure case in the past. + // 6144, 2048 + + // Since K is larger (more chance of larger error), + // and N is larger (more chance of having a value with larger error), + // we set a higher tolerance for this case to avoid false positives + // and flaky failures. + TestMatMulNBitsTyped(0.2f, 0.03f); } TEST(MatMulNBits, LegacyShape_4b) { diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc index 0d7b583faf27b..c9b61a7a39632 100644 --- a/onnxruntime/test/framework/tensorutils_test.cc +++ b/onnxruntime/test/framework/tensorutils_test.cc @@ -530,9 +530,6 @@ TEST_F(PathValidationTest, ValidateExternalDataPath) { // Valid relative path. ASSERT_STATUS_OK(utils::ValidateExternalDataPath(base_dir_, "data.bin")); - // Empty base directory. - ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "data.bin")); - // Empty location. // Only validate it is not an absolute path. ASSERT_TRUE(utils::ValidateExternalDataPath(base_dir_, "").IsOK()); @@ -555,6 +552,29 @@ TEST_F(PathValidationTest, ValidateExternalDataPath) { // Base directory does not exist.
ASSERT_STATUS_OK(utils::ValidateExternalDataPath("non_existent_dir", "data.bin")); + + // + // Tests for an empty base directory. + // The base directory would be empty when 1) the session loads a model from bytes and 2) the application does not + // set an external file folder path via the session config option + // kOrtSessionOptionsModelExternalInitializersFileFolderPath. + // + + // A simple filename is ok (would not escape current working directory). + ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "data.bin")); + ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "./data.bin")); + + // A ".." that is not a path component (part of the filename) is ok + ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "data..bin")); + + // A path that would escape the current working directory is invalid. + ASSERT_FALSE(utils::ValidateExternalDataPath("", "../data.bin").IsOK()); + + // A path that uses ".." but would not escape the current working directory should be fine. + ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "a/../data.bin")); + + // A path with multiple internal ".." that would escape the current working directory should fail. + ASSERT_FALSE(utils::ValidateExternalDataPath("", "a/../../data.bin").IsOK()); } TEST_F(PathValidationTest, ValidateExternalDataPathWithSymlinkInside) { diff --git a/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc b/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc index 0afb836192b0a..4615b6a57b558 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc +++ b/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc @@ -638,6 +638,37 @@ TEST_F(GraphTransformationTests, SkipLayerNormFusionTest) { TestSkipLayerNormFusion(MODEL_FOLDER "fusion/skip_layer_norm_format3_graph_output.onnx", 1, 1, 0, 0, logger_.get()); } +// SkipLayerNorm fusion should not be applied when gamma/beta have more than 1 dimension, +// because the SkipLayerNormalization kernel requires 1D gamma/beta.
+TEST_F(GraphTransformationTests, SkipLayerNormFusion_3DGamma_NoFusion) { + auto build_test_case = [](ModelTestBuilder& builder) { + // Inputs: A and B are 3D [16, 32, 4] + auto* input_a = builder.MakeInput({16, 32, 4}, -1.0f, 1.0f); + auto* input_b = builder.MakeInput({16, 32, 4}, -1.0f, 1.0f); + // gamma and beta have 3D shape [1, 1, 4] (not 1D) + auto* gamma = builder.MakeInitializer({1, 1, 4}, {1.0f, 2.0f, 3.0f, 4.0f}); + auto* beta = builder.MakeInitializer({1, 1, 4}, {0.1f, 0.2f, 0.3f, 0.4f}); + auto* add_out = builder.MakeIntermediate(); + auto* ln_out = builder.MakeOutput(); + + builder.AddNode("Add", {input_a, input_b}, {add_out}); + builder.AddNode("LayerNormalization", {add_out, gamma, beta}, {ln_out}) + .AddAttribute("axis", static_cast(-1)); + }; + + auto post_graph_checker = [](Graph& graph) { + // SkipLayerNormalization should NOT have been created because gamma/beta are 3D. + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Add"] == 1); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["LayerNormalization"] == 1); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["com.microsoft.SkipLayerNormalization"] == 0); + return Status::OK(); + }; + + ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 17, *logger_, + std::make_unique(), + TransformerLevel::Level2, 1, nullptr, post_graph_checker)); +} + TEST_F(GraphTransformationTests, GroupQueryAttentionFusionTest) { TestGQAFusion(MODEL_FOLDER "fusion/gqa_fusion_quantized_simple.onnx", 1, 0, logger_.get()); TestGQAFusion(MODEL_FOLDER "fusion/gqa_fusion_different_head_sizes.onnx", 0, 1, logger_.get()); diff --git a/onnxruntime/test/providers/cpu/tensor/gather_nd_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_nd_op_test.cc index 081b4b484a73b..a8f3b99b2b3d3 100644 --- a/onnxruntime/test/providers/cpu/tensor/gather_nd_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/gather_nd_op_test.cc @@ -329,5 +329,48 @@ TEST(GatherNDOpTest, GatherND_slice_int64_t) { test.Run(); } +// Test for issue #23828: GatherND should 
return error instead of crashing +// when batch dimensions mismatch between input and indices +TEST(GatherNDOpTest, GatherND_batch_dims_mismatch_error) { + OpTester test("GatherND", 12, kOnnxDomain); + test.AddAttribute("batch_dims", 1); + + // Input has 3 batches, but indices has 2 slices (indices batch size 2), which is not divisible by 3 - mismatch! + test.AddInput("data", {3, 3}, {0.f, 1.f, 2.f, 10.f, 11.f, 12.f, 20.f, 21.f, 22.f}); + test.AddInput("indices", {2, 1}, {1, 2}); + test.AddOutput("output", {2}, {0.f, 0.f}); // dummy output, won't be used + + // Force execution only on CPU + std::vector> cpu_only_ep; + cpu_only_ep.push_back(DefaultCpuExecutionProvider()); + + test.Run(OpTester::ExpectResult::kExpectFailure, + "GatherND: indices batch size (2) is not divisible by input batch size (3)", + {}, // no excluded providers needed + nullptr, // no RunOptions + &cpu_only_ep); // force CPU +} + +// Test for issue #23828: GatherND should return error when input batch dimension is zero +TEST(GatherNDOpTest, GatherND_zero_batch_dims_error) { + OpTester test("GatherND", 12, kOnnxDomain); + test.AddAttribute("batch_dims", 1); + + // Input has 0 batches - should fail with clear error instead of division by zero + test.AddInput("data", {0, 3}, {}); + test.AddInput("indices", {2, 1}, {1, 2}); + test.AddOutput("output", {2}, {0.f, 0.f}); // dummy output, won't be used + + // Force execution only on CPU + std::vector> cpu_only_ep; + cpu_only_ep.push_back(DefaultCpuExecutionProvider()); + + test.Run(OpTester::ExpectResult::kExpectFailure, + "GatherND: input tensor batch dimensions cannot be zero", + {}, + nullptr, + &cpu_only_ep); // force CPU +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc index 35066bd68c65e..adbe86a14dabf 100644 --- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc +++ 
b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc @@ -341,6 +341,41 @@ TEST(GatherOpTest, Gather_axis1_indices2d_string) { test.Run(); } +TEST(GatherOpTest, Gather_overflow_check) { +// Skip on 32-bit platforms where size_t overflow would truncate the large expected +// output shape and where allocating the full reference tensor is infeasible. +#if SIZE_MAX <= UINT32_MAX + GTEST_SKIP() << "Gather_overflow_check skipped on 32-bit platforms."; +#endif + + // The test uses dimensions (65537, 2) and indices of length 65537, which produce an output + // shape of (65537, 65537). + // + // 65537 x 65537 = 4,295,098,369 which is greater than the maximum value of a 32-bit integer (2,147,483,647). + // + // This test is to verify CPU implementation of the Gather operator doesn't overflow when calculating + // the output shape and generating the output tensor. + + OpTester test("Gather"); + test.AddAttribute("axis", 1LL); + + // Inputs + const std::vector data_dims{65537, 2}; + const std::vector indices_dims{65537}; + std::vector data_values(static_cast(data_dims[0] * data_dims[1]), 1); + std::vector indices_values(static_cast(indices_dims[0]), 1); + std::vector expected_output_values(static_cast(65537) * static_cast(65537), 1); + + test.AddInput("data", {65537, 2}, data_values); + test.AddInput("indices", {65537}, indices_values); + test.AddOutput("output", {65537, 65537}, expected_output_values); + + std::vector> execution_providers; + execution_providers.emplace_back(DefaultCpuExecutionProvider()); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + TEST(GatherOpTest, Gather_axis1_indices2d_bool) { OpTester test("Gather"); test.AddAttribute("axis", 1LL); diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc index 200a1aded8204..8fd994baec713 100644 --- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc +++ 
b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc @@ -1015,6 +1015,38 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_5DTrilinear_pytorch_half_pixel) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: results mismatch } +TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_5DTrilinear_CudaRegression) { + auto cuda_ep = DefaultCudaExecutionProvider(); + if (!cuda_ep) { + GTEST_SKIP() << "CUDA EP not available"; + } + + OpTester test("Resize", 13); + std::vector roi{}; + std::vector scales{1.0f, 1.0f, 2.0f, 2.0f, 2.0f}; + + test.AddAttribute("mode", "linear"); + test.AddAttribute("coordinate_transformation_mode", "pytorch_half_pixel"); + + constexpr int64_t N = 1, C = 1, D = 3, H = 4, W = 5; + std::vector X(static_cast(N * C * D * H * W), 1.0f); + + test.AddInput("X", {N, C, D, H, W}, X); + test.AddInput("roi", {0}, roi); + test.AddInput("scales", {5}, scales); + + constexpr int64_t out_D = D * 2; + constexpr int64_t out_H = H * 2; + constexpr int64_t out_W = W * 2; + std::vector Y(static_cast(N * C * out_D * out_H * out_W), 1.0f); + + test.AddOutput("Y", {N, C, out_D, out_H, out_W}, Y); + + std::vector> execution_providers; + execution_providers.push_back(std::move(cuda_ep)); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest) { // To test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_in_initializer) { diff --git a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc index ec7e98528504e..593255b9e9c23 100644 --- a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc @@ -177,6 +177,35 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e CheckDeviceValues(m, d_out.get(), 
expected_column.data(), relative_error_tolerance); } + +void TestReduceColumnsToColumnRepeated(int m, int n, int iterations, float relative_error_tolerance = 1e-4f) { + SCOPED_TRACE(MakeString("m: ", m, ", n:", n, ", iterations: ", iterations)); + + const TensorShape shape{m, n}; + RandomValueGenerator random{}; + const auto values = random.Uniform(shape.GetDims(), 1.0f, 10.0f); + const auto expected_column = ExpectedReduceMatrixColumnsOutput(m, n, values); + + auto d_in = AllocateDeviceMemory(m * n); + auto d_out = AllocateDeviceMemory(m); + + cudaMemcpy(d_in.get(), values.data(), m * n * sizeof(float), cudaMemcpyHostToDevice); + + size_t buffer_size_in_bytes = + compute_reduce_matrix_columns_buffer_size(m, n); + auto d_buffer = AllocateDeviceMemory(buffer_size_in_bytes); + + for (int i = 0; i < iterations; ++i) { + ASSERT_STATUS_OK(reduce_matrix_columns( + 0, + d_in.get(), d_out.get(), + m, n, + d_buffer.get(), buffer_size_in_bytes)); + + ASSERT_TRUE(CUDA_CALL(cudaDeviceSynchronize()).IsOK()); + CheckDeviceValues(m, d_out.get(), expected_column.data(), relative_error_tolerance); + } +} } // namespace TEST(ReductionFunctionsTest, ReduceRowToScalar) { @@ -205,6 +234,10 @@ TEST(ReductionFunctionsTest, ReduceColumnsToColumn) { } } +TEST(ReductionFunctionsTest, ReduceColumnsToColumnRepeated) { + TestReduceColumnsToColumnRepeated(17, 8192, 100, 2e-4f); +} + TEST(ReductionFunctionsTest, BufferOffsets) { const int m = 2048; const int n = 1024; diff --git a/onnxruntime/test/python/test_pytorch_export_contrib_ops.py b/onnxruntime/test/python/test_pytorch_export_contrib_ops.py index e7ea83dd00297..afefc4e616a87 100644 --- a/onnxruntime/test/python/test_pytorch_export_contrib_ops.py +++ b/onnxruntime/test/python/test_pytorch_export_contrib_ops.py @@ -59,6 +59,9 @@ def setUp(self): torch.manual_seed(0) pytorch_export_contrib_ops.register() + def tearDown(self): + pytorch_export_contrib_ops.unregister() + def run_test( self, model, @@ -101,6 +104,7 @@ def run_test( 
input_names=input_names, output_names=output_names, custom_opsets=custom_opsets, + dynamo=False, ) # compute onnxruntime output prediction @@ -143,12 +147,13 @@ def test_gelu_is_fused_by_default(self): f, opset_version=self.opset_version, custom_opsets={"com.microsoft": 1}, + dynamo=False, ) f.seek(0) onnx_model = onnx.load(f) - node = onnx_model.graph.node[0] - self.assertEqual(node.op_type, "Gelu") - self.assertEqual(node.domain, "com.microsoft") + # Default GELU should be mapped to ORT contrib Gelu for performance. + gelu_nodes = [n for n in onnx_model.graph.node if n.op_type == "Gelu" and n.domain == "com.microsoft"] + self.assertEqual(len(gelu_nodes), 1) @parameterized.parameterized.expand([("default_approximate", "none"), ("tanh_approximate", "tanh")]) @unittest.skipIf(_torch_version_lower_than("1.12"), "Gelu's approximate parameter unsupported in PyTorch < 1.12") @@ -230,8 +235,8 @@ def forward(self, input): # IR version 4 style export. ONNXExporterTest_opset9_IRv4 = type( "TestONNXRuntime_opset9_IRv4", - (unittest.TestCase,), - dict(ONNXExporterTest.__dict__, keep_initializers_as_inputs=False), + (ONNXExporterTest,), + dict(keep_initializers_as_inputs=False), ) diff --git a/onnxruntime/test/python/transformers/benchmark_qmoe.py b/onnxruntime/test/python/transformers/benchmark_qmoe.py new file mode 100644 index 0000000000000..b96c9cdcf5c3a --- /dev/null +++ b/onnxruntime/test/python/transformers/benchmark_qmoe.py @@ -0,0 +1,191 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +import os +import sys +import time +import unittest + +import numpy +import torch + +# Add current directory to path to allow importing from test_qmoe_cpu +current_dir = os.path.dirname(os.path.abspath(__file__)) +if current_dir not in sys.path: + sys.path.append(current_dir) + +from test_qmoe_cpu import PhiMoEConfig, PhiMoESparseMoeBlock, TensorProto # noqa: E402 + +# Reduces number of tests to run for faster pipeline checks +pipeline_mode = os.getenv("PIPELINE_MODE", "1") == "1" + + +@unittest.skipIf(pipeline_mode, "Skip benchmark in CI pipeline.") +class TestQMoESwiGLUBenchmark(unittest.TestCase): + """Benchmark tests for QMoE SwiGLU performance measurement.""" + + def test_qmoe_swiglu_throughput_benchmark(self): + """Comprehensive throughput benchmark for QMoE SwiGLU across different configurations.""" + print("\n=== QMoE SwiGLU Throughput Benchmark ===") + + # Test configurations: (name, hidden_size, intermediate_size, num_experts, top_k, quant_bits) + configs = [ + ("Medium-4bit", 2880, 2880, 32, 4, 4), + ("Medium-8bit", 2880, 2880, 32, 4, 8), + ] + + batch_size = 1 + sequence_length = 512 + num_runs = 1000 + + results = [] + + for config_name, hidden_size, intermediate_size, num_experts, top_k, quant_bits in configs: + torch.manual_seed(42) + numpy.random.seed(42) + + torch_output = None + ort_output = None + + print(f"\nTesting {config_name}:") + print(f" Hidden: {hidden_size}, Intermediate: {intermediate_size}") + print(f" Experts: {num_experts}, Top-K: {top_k}, Quant: {quant_bits}-bit") + + try: + # Create config and model + config = PhiMoEConfig( + hidden_size=hidden_size, + intermediate_size=intermediate_size, + num_local_experts=num_experts, + num_experts_per_tok=top_k, + ) + + qmoe_swiglu = PhiMoESparseMoeBlock( + config, + batch_size=batch_size, + sequence_length=sequence_length, + quant_bits=quant_bits, + onnx_dtype=TensorProto.FLOAT, + ) + + # Create test input with 
fixed sequence length to match ONNX model + full_hidden_states = torch.randn(batch_size, sequence_length, hidden_size).to(torch.float32) + + # For TTFT simulation, we'll measure single forward pass time + # This represents the time to process one token in autoregressive generation + + # Warm up with full context + for _ in range(3): + _ = qmoe_swiglu.forward(full_hidden_states) + + # Benchmark PyTorch TTFT (Time to First Token) + # Measure time for a single forward pass (represents token generation time) + torch.manual_seed(42) + + start_time = time.time() + for _ in range(num_runs): + torch_output = qmoe_swiglu.forward(full_hidden_states) + end_time = time.time() + torch_ttft_ms = (end_time - start_time) / num_runs * 1000 + + # Calculate tokens per second (throughput) + # For sequence generation, this represents the rate at which we can generate tokens + torch_tokens_per_sec = 1000.0 / torch_ttft_ms # 1 token / (time_ms / 1000) + + print(f" PyTorch TTFT: {torch_ttft_ms:.3f} ms (per token generation time)") + print(f" PyTorch Throughput: {torch_tokens_per_sec:.1f} tokens/sec") + + # Benchmark ONNX Runtime + ort_ttft_ms = 0 + ort_tokens_per_sec = 0 + speedup = 0 + throughput_ratio = 0 + max_diff = 0 + + model_updated = qmoe_swiglu.recreate_onnx_model() + if model_updated and qmoe_swiglu.ort_sess is not None: + # Warm up ORT with full context + for _ in range(3): + _ = qmoe_swiglu.ort_forward(full_hidden_states) + + torch.manual_seed(42) + + # Measure ONNX Runtime TTFT (Time to First Token) + start_time = time.time() + for _ in range(num_runs): + ort_output = qmoe_swiglu.ort_forward(full_hidden_states) + end_time = time.time() + ort_ttft_ms = (end_time - start_time) / num_runs * 1000 + + # Calculate tokens per second for ONNX Runtime + ort_tokens_per_sec = 1000.0 / ort_ttft_ms # 1 token / (time_ms / 1000) + + speedup = torch_ttft_ms / ort_ttft_ms if ort_ttft_ms > 0 else 0 + throughput_ratio = ort_tokens_per_sec / torch_tokens_per_sec if torch_tokens_per_sec > 0 else 
0 + + print(f" ONNX RT TTFT: {ort_ttft_ms:.3f} ms (per token generation time)") + print(f" ONNX RT Throughput: {ort_tokens_per_sec:.1f} tokens/sec") + print(f" TTFT Speedup: {speedup:.2f}x") + print(f" Throughput Gain: {throughput_ratio:.2f}x") + else: + print(" ONNX RT: Not available") + + # Calculate max difference if both outputs available + if torch_output is not None and ort_output is not None: + max_diff = (torch_output.cpu() - ort_output.cpu()).abs().max().item() + print(f" Max diff: {max_diff:.6f}") + + results.append( + { + "config": config_name, + "torch_ttft_ms": torch_ttft_ms, + "torch_tokens_per_sec": torch_tokens_per_sec, + "ort_ttft_ms": ort_ttft_ms, + "ort_tokens_per_sec": ort_tokens_per_sec, + "speedup": speedup, + "throughput_ratio": throughput_ratio, + "max_diff": max_diff, + } + ) + + except Exception as e: + print(f" Error: {e}") + continue + + # Summary + print("\n=== Token Generation Time & Throughput Summary ===") + print( + f"{'Config':<15} {'PT Time':<10} {'PT tok/s':<10} {'ORT Time':<11} {'ORT tok/s':<11} {'Time Gain':<10} {'Throughput':<11} {'Max Diff':<10}" + ) + print("-" * 105) + for result in results: + config = result["config"] + torch_ttft = result["torch_ttft_ms"] + torch_tps = result["torch_tokens_per_sec"] + ort_ttft = result["ort_ttft_ms"] + ort_tps = result["ort_tokens_per_sec"] + speedup = result["speedup"] + throughput_ratio = result["throughput_ratio"] + max_diff = result["max_diff"] + + ort_ttft_str = f"{ort_ttft:.3f}" if ort_ttft > 0 else "N/A" + ort_tps_str = f"{ort_tps:.1f}" if ort_tps > 0 else "N/A" + speedup_str = f"{speedup:.2f}x" if speedup > 0 else "N/A" + throughput_str = f"{throughput_ratio:.2f}x" if throughput_ratio > 0 else "N/A" + + print( + f"{config:<15} {torch_ttft:<10.3f} {torch_tps:<10.1f} {ort_ttft_str:<11} {ort_tps_str:<11} {speedup_str:<10} {throughput_str:<11} {max_diff:<10.6f}" + ) + + print("\nNotes:") + print("- Time: Token generation time in ms (lower is better)") + print("- tok/s: Tokens per 
second throughput (higher is better)") + print("- Time Gain: ORT speedup for latency (higher is better)") + print("- Throughput: ORT throughput improvement (higher is better)") + + +if __name__ == "__main__": + benchmark = TestQMoESwiGLUBenchmark() + benchmark.test_qmoe_swiglu_throughput_benchmark() diff --git a/onnxruntime/test/python/transformers/parity_utilities.py b/onnxruntime/test/python/transformers/parity_utilities.py index fa16f0e67a523..04a1ed06773e7 100644 --- a/onnxruntime/test/python/transformers/parity_utilities.py +++ b/onnxruntime/test/python/transformers/parity_utilities.py @@ -92,6 +92,7 @@ def export_onnx(model, onnx_model_path, float16, hidden_size, device): dynamic_axes=dynamic_axes, opset_version=11, do_constant_folding=True, + dynamo=False, ) print("exported:", onnx_model_path) diff --git a/onnxruntime/test/python/transformers/test_gelu_fusions.py b/onnxruntime/test/python/transformers/test_gelu_fusions.py index 11ae1401ff8ed..a63e2653f2fbc 100644 --- a/onnxruntime/test/python/transformers/test_gelu_fusions.py +++ b/onnxruntime/test/python/transformers/test_gelu_fusions.py @@ -75,17 +75,22 @@ def test_fusions(self, test_case, dynamo): dummy_input = torch.ones(3, dtype=torch.float32) test_name = f"{operator}_{source}" onnx_path = f"{test_name}.onnx" + + # For Torch 2.10+, torch.nn.functional.gelu(approximate="tanh") exports as Gelu node. + # So we force opset_version=18 here. 
torch.onnx.export( model, (dummy_input,), onnx_path, input_names=["input"], output_names=["output"], - dynamo=dynamo, + opset_version=18, + dynamo=False, optimize=True, # Only meaningful when dynamo is True ) optimizer = optimize_model(onnx_path, "bert") # optimizer.save_model_to_file(f"{operator}_{source}_opt.onnx") + os.remove(onnx_path) # Remove the associated .data file (dynamo) data_path = onnx_path + ".data" diff --git a/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py b/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py index 444d86da75ba6..c07eb39e6df75 100644 --- a/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py +++ b/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py @@ -253,6 +253,7 @@ def export_onnx(model, onnx_model_path, float16, hidden_size, num_attention_head dynamic_axes=dynamic_axes, opset_version=11, do_constant_folding=True, + dynamo=False, ) print("exported:", onnx_model_path) diff --git a/onnxruntime/test/python/transformers/test_phi_vision.py b/onnxruntime/test/python/transformers/test_phi_vision.py index d276366706af9..5a5fa926eb255 100644 --- a/onnxruntime/test/python/transformers/test_phi_vision.py +++ b/onnxruntime/test/python/transformers/test_phi_vision.py @@ -208,6 +208,7 @@ def export(self, model, inputs): "input": {0: "batch", 1: "seq"}, "attention_mask": {0: "batch", 2: "seq", 3: "seq"}, }, + dynamo=False, ) else: torch.onnx.export( @@ -217,6 +218,7 @@ def export(self, model, inputs): export_params=True, opset_version=14, do_constant_folding=True, + dynamo=False, ) def tearDown(self): diff --git a/onnxruntime/test/python/transformers/test_qmoe_cpu.py b/onnxruntime/test/python/transformers/test_qmoe_cpu.py index 90ebb148a26a5..8415c7b08b77c 100644 --- a/onnxruntime/test/python/transformers/test_qmoe_cpu.py +++ b/onnxruntime/test/python/transformers/test_qmoe_cpu.py @@ -23,9 +23,11 @@ # normalization on the selected 
experts. This provides proper weight distribution # while maintaining computational efficiency. # -------------------------------------------------------------------------- +import os import time import unittest from collections import OrderedDict +from contextlib import contextmanager import numpy import torch @@ -76,6 +78,8 @@ class TensorProtoPlaceholder: ort_provider = ["CPUExecutionProvider"] +ORT_USE_MLAS_Q4_GEMM_MOE = "ORT_USE_MLAS_Q4_GEMM_MOE" + torch.manual_seed(42) numpy.random.seed(42) @@ -364,7 +368,7 @@ def create_cpu_moe_onnx_graph( use_swiglu=False, use_quant=False, quant_bits=4, - swiglu_interleaved=False, + swiglu_fusion=0, block_size=0, ): if not has_onnx: @@ -400,10 +404,10 @@ def create_cpu_moe_onnx_graph( "router_probs", # 1 "fc1_experts_weights", # 2 "fc1_scales", # 3 - "", # 4: fc1_bias + "fc1_experts_bias" if fc1_bias is not None else "", # 4 "fc2_experts_weights", # 5 "fc2_scales", # 6 - "", # 7: fc2_bias + "fc2_experts_bias" if fc2_bias is not None else "", # 7 "", # 8: fc3_weights "", # 9: fc3_scales "", # 10: fc3_bias @@ -442,11 +446,10 @@ def create_cpu_moe_onnx_graph( normalize_routing_weights=normalize_routing, activation_type=activation, # Add new attributes with backwards-compatible default values - swiglu_fusion=1 if use_swiglu else 0, # 1 if using SwiGLU activation + swiglu_fusion=swiglu_fusion, swiglu_limit=7.0, activation_alpha=1.702, activation_beta=1.0, - swiglu_interleaved=1 if swiglu_interleaved else 0, # Enable this attribute domain="com.microsoft", ), ] @@ -559,6 +562,30 @@ def create_cpu_moe_onnx_graph( ) ) + if fc1_bias is not None: + fc1_bias_np = fc1_bias.detach().cpu().numpy().astype(ort_to_numpy_type_map[onnx_dtype]) + initializers.append( + helper.make_tensor( + "fc1_experts_bias", + onnx_dtype, + list(fc1_bias.shape), + fc1_bias_np.flatten().tolist(), + raw=False, + ) + ) + + if fc2_bias is not None: + fc2_bias_np = fc2_bias.detach().cpu().numpy().astype(ort_to_numpy_type_map[onnx_dtype]) + initializers.append( + 
helper.make_tensor( + "fc2_experts_bias", + onnx_dtype, + list(fc2_bias.shape), + fc2_bias_np.flatten().tolist(), + raw=False, + ) + ) + graph_inputs = [ helper.make_tensor_value_info("input", onnx_dtype, [sequence_length, hidden_size]), ] @@ -626,7 +653,7 @@ def __init__( self.num_experts_per_token = num_experts_per_token -def swiglu(x: torch.Tensor, alpha: float = 1.702, limit: float = 7.0): +def swiglu(x: torch.Tensor, alpha: float = 1.702, beta: float = 1.0, limit: float = 7.0): dim = x.shape[-1] x = x.view(-1, dim // 2, 2) x_glu, x_linear = x[..., 0], x[..., 1] @@ -635,8 +662,8 @@ def swiglu(x: torch.Tensor, alpha: float = 1.702, limit: float = 7.0): x_glu = x_glu.clamp(max=limit) x_linear = x_linear.clamp(min=-limit, max=limit) - y = x_glu * torch.sigmoid(alpha * x_glu) * (x_linear + 1) - return y + y = x_glu * torch.sigmoid(alpha * x_glu) * (x_linear + beta) + return y.view(-1, dim // 2) class MoEBlockSparseTop2MLP(nn.Module): @@ -855,7 +882,7 @@ def ort_forward(self, hidden_states: torch.Tensor, enable_performance_test=False e = time.time() time_ms = (e - s) / repeat * 1000 is_swiglu = hasattr(self, "use_swiglu") and self.use_swiglu - is_interleaved = hasattr(self, "swiglu_interleaved") and self.swiglu_interleaved + is_interleaved = hasattr(self, "swiglu_fusion") and self.swiglu_fusion == 1 act_type = f"SwiGLU(interleaved={is_interleaved})" if is_swiglu else "SiLU" print(f"ORT Performance - {act_type} {self.quant_bits}-bit: {time_ms:.3f} ms/inference") @@ -868,62 +895,80 @@ def recreate_onnx_model(self): """Recreate the ONNX model with the current weights to reflect any changes to the quantization code.""" w1_list, w2_list = [], [] + w1_bias_list, w2_bias_list = [], [] w1_scale_list, w2_scale_list = [], [] w1_zp_list, w2_zp_list = [], [] is_4_bit = self.quant_bits == 4 for i in range(self.num_experts): - if self.block_size > 0: - # Use block-wise quantization - w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant_blockwise( - self.experts[i].w1.weight, 
self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant - ) - w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant_blockwise( - self.experts[i].w2.weight, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant - ) + if hasattr(self.experts[i], "w3"): + w1, w3 = self.experts[i].w1.weight, self.experts[i].w3.weight + w2 = self.experts[i].w2.weight + w1_bias = self.experts[i].w1.bias + w3_bias = getattr(self.experts[i].w3, "bias", None) + + # Combine and interleave w1 and w3 for the fused kernel + w1_combined = torch.cat([w1, w3], dim=0) # [2*inter, hidden] + if getattr(self, "swiglu_fusion", 0) == 1: + w1_combined = w1_combined.view(2, -1, self.hidden_dim).transpose(0, 1).reshape(-1, self.hidden_dim) + + if self.block_size > 0: + w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant_blockwise( + w1_combined, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant + ) + w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant_blockwise( + w2, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant + ) + else: + w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant( + w1_combined, is_4_bit, asymmetric=self.use_asymmetric_quant + ) + w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant( + w2, is_4_bit, asymmetric=self.use_asymmetric_quant + ) + + if w1_bias is not None and w3_bias is not None: + b1_combined = torch.cat([w1_bias, w3_bias], dim=0) + if getattr(self, "swiglu_fusion", 0) == 1: + b1_combined = b1_combined.view(2, -1).transpose(0, 1).reshape(-1) + w1_bias_list.append(b1_combined.detach().cpu()) + elif w1_bias is not None: + w1_bias_list.append(w1_bias.detach().cpu()) else: - # Use row-wise quantization - w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant( - self.experts[i].w1.weight, is_4_bit, asymmetric=self.use_asymmetric_quant - ) - w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant( - self.experts[i].w2.weight, is_4_bit, asymmetric=self.use_asymmetric_quant - ) + # PhiMoESwiGLUMLP already has interleaved weights in w1 
+ w1 = self.experts[i].w1.weight + w2 = self.experts[i].w2.weight + w1_bias = self.experts[i].w1.bias - if self.use_swiglu: - if self.swiglu_interleaved: - pass + if self.block_size > 0: + w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant_blockwise( + w1, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant + ) + w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant_blockwise( + w2, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant + ) else: - if self.block_size > 0: - w3_scale, pre_qweight3, w3_qdq, w3_zp = quant_dequant_blockwise( - self.experts[i].w3.weight, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant - ) - else: - w3_scale, pre_qweight3, w3_qdq, w3_zp = quant_dequant( - self.experts[i].w3.weight, is_4_bit, asymmetric=self.use_asymmetric_quant - ) - - gate_weights = pre_qweight1 - value_weights = pre_qweight3 - gate_scales = w1_scale - value_scales = w3_scale - gate_zp = w1_zp - value_zp = w3_zp - - pre_qweight1 = torch.cat([gate_weights, value_weights], dim=0) - w1_scale = torch.cat([gate_scales, value_scales], dim=0) - if w1_zp is not None and w3_zp is not None: - w1_zp = torch.cat([gate_zp, value_zp], dim=0) - - if self.swiglu_interleaved: - self.experts[i].w1.weight = nn.Parameter(w1_qdq.contiguous().clone()) + w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant( + w1, is_4_bit, asymmetric=self.use_asymmetric_quant + ) + w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant( + w2, is_4_bit, asymmetric=self.use_asymmetric_quant + ) + if w1_bias is not None: + w1_bias_list.append(w1_bias.detach().cpu()) + if self.use_swiglu: + if getattr(self, "swiglu_fusion", 0) == 1: + self.experts[i].w1.weight = nn.Parameter(w1_qdq.contiguous().clone()) else: intermediate_size = self.experts[i].w1.weight.shape[0] gate_dequant = w1_qdq[:intermediate_size].contiguous().clone() value_dequant = w1_qdq[intermediate_size:].contiguous().clone() - self.experts[i].w1.weight.data = gate_dequant - self.experts[i].w3.weight.data = 
value_dequant + if hasattr(self.experts[i], "w3"): + self.experts[i].w1.weight.data = gate_dequant + self.experts[i].w3.weight.data = value_dequant + else: + self.experts[i].w1.weight.data = w1_qdq.contiguous().clone() else: self.experts[i].w1.weight.data = w1_qdq.contiguous().clone() @@ -931,6 +976,9 @@ def recreate_onnx_model(self): w1_list.append(pre_qweight1) w2_list.append(pre_qweight2) + + if self.experts[i].w2.bias is not None: + w2_bias_list.append(self.experts[i].w2.bias) w1_scale_list.append(w1_scale) w2_scale_list.append(w2_scale) if w1_zp is not None: @@ -963,9 +1011,9 @@ def recreate_onnx_model(self): onnx_dtype=self.onnx_dtype, fc1_experts_weights=self.moe_experts_weight1, fc2_experts_weights=self.moe_experts_weight2, - # Biases are not used in QMoE - fc1_bias=None, - fc2_bias=None, + # Pass collected biases + fc1_bias=torch.stack(w1_bias_list, dim=0) if w1_bias_list else None, + fc2_bias=torch.stack(w2_bias_list, dim=0) if w2_bias_list else None, # Scales are used for dequantization fc1_scales=moe_experts_weight_scale1, fc2_scales=moe_experts_weight_scale2, @@ -975,7 +1023,7 @@ def recreate_onnx_model(self): use_swiglu=self.use_swiglu, use_quant=True, # Always use QMoE quant_bits=self.quant_bits, - swiglu_interleaved=self.swiglu_interleaved if hasattr(self, "swiglu_interleaved") else False, + swiglu_fusion=getattr(self, "swiglu_fusion", 0), block_size=self.block_size, # Add block_size for block-wise quantization ) except Exception: @@ -1020,7 +1068,7 @@ def parity_check(self): max_diff = (torch_output.cpu() - ort_output.cpu()).abs().max() is_swiglu = hasattr(self, "use_swiglu") and self.use_swiglu - is_interleaved = hasattr(self, "swiglu_interleaved") and self.swiglu_interleaved + is_interleaved = getattr(self, "swiglu_fusion", 0) == 1 act_type = f"SwiGLU(interleaved={is_interleaved})" if is_swiglu else "SiLU" quant_type = "Asymmetric" if self.use_asymmetric_quant else "Symmetric" block_type = f"Block({self.block_size})" if self.block_size > 0 else 
"Row" @@ -1047,24 +1095,6 @@ def parity_check(self): ) print("Torch sample:", torch_output.cpu().reshape(-1, hidden_dim)[i, k].item()) print("ORT sample:", ort_output.cpu().reshape(-1, hidden_dim)[i, k].item()) - # Print routing and per-expert contributions for this token from the PyTorch reference - try: - hidden_states_flat = hidden_state.view(-1, hidden_dim) - token_vec = hidden_states_flat[i : i + 1] - gate_logits = self.gate(token_vec) - topk_vals, topk_experts = torch.topk(gate_logits, self.top_k, dim=-1) - topk_soft = F.softmax(topk_vals, dim=1) - print("Gate logits:", gate_logits.detach().cpu().numpy()) - print("Selected experts:", topk_experts.detach().cpu().numpy()) - print("Routing weights:", topk_soft.detach().cpu().numpy()) - # Compute per-expert contributions for selected experts - for idx_e, e in enumerate(topk_experts[0].tolist()): - expert_layer = self.experts[e] - expert_out = expert_layer(token_vec) - contrib = expert_out[0, k].item() * topk_soft[0, idx_e].item() - print(f"Expert {e} contrib at hidden {k}: {contrib}") - except Exception as _: - pass ort_dtype_quant_bits_tolerance_map = { "FP32:0": (5e-3, 1e-3), @@ -1111,6 +1141,43 @@ def small_test_cases(): yield batch_size, sequence_length +def with_mlas_q4_mode(test_cases): + expanded_cases = [] + for case in test_cases: + quant_bits = case[2] + if quant_bits == 4: + expanded_cases.append((*case, None)) + expanded_cases.append((*case, False)) + expanded_cases.append((*case, True)) + else: + expanded_cases.append((*case, None)) + return expanded_cases + + +@contextmanager +def scoped_env_var(name: str, value: str): + previous = os.environ.get(name) + os.environ[name] = value + try: + yield + finally: + if previous is None: + os.environ.pop(name, None) + else: + os.environ[name] = previous + + +def run_parity_with_mlas_q4_mode(test_runner, enable_mlas_q4_gemm: bool | None): + if enable_mlas_q4_gemm is None: # No env var + test_runner() + else: + env_value = "1" if enable_mlas_q4_gemm else "0" + 
mode = "enabled" if enable_mlas_q4_gemm else "disabled" + print(f"DirectQ4 mode ({ORT_USE_MLAS_Q4_GEMM_MOE}) is {mode}") + with scoped_env_var(ORT_USE_MLAS_Q4_GEMM_MOE, env_value): + test_runner() + + class SwigluMoEBlock(SparseMoeBlockORTHelper): def __init__( self, @@ -1128,7 +1195,7 @@ def __init__( self.num_experts = config.num_local_experts self.top_k = config.num_experts_per_token self.use_swiglu = True - self.swiglu_interleaved = True + self.swiglu_fusion = 1 self.block_size = block_size use_quant = self.quant_bits > 0 @@ -1232,7 +1299,7 @@ def __init__( self.top_k = config.num_experts_per_tok self.router_jitter_noise = config.router_jitter_noise self.use_swiglu = True - self.swiglu_interleaved = True + self.swiglu_fusion = 1 self.block_size = block_size use_quant = self.quant_bits > 0 @@ -1314,7 +1381,8 @@ def __init__( use_swiglu=self.use_swiglu, use_quant=use_quant, quant_bits=self.quant_bits, - swiglu_interleaved=self.swiglu_interleaved, + # swiglu_fusion=1 means fused and interleaved, which is the standard for QMoE. 
+ swiglu_fusion=getattr(self, "swiglu_fusion", 0), block_size=self.block_size, ) @@ -1354,8 +1422,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return final_hidden_states -disable_cpu_qmoe_tests = False - # Define test cases for different MoE types phi3_test_cases = [ (1, 32, 4), @@ -1373,10 +1439,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: ] -@unittest.skipIf(disable_cpu_qmoe_tests, "Skipping qMoE cpu tests") class TestPhiQMoECPU(unittest.TestCase): - @parameterized.expand(phi3_test_cases) - def test_phi3_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits): + @parameterized.expand(with_mlas_q4_mode(phi3_test_cases)) + def test_phi3_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits, enable_mlas_q4_gemm): # Create unique seed based on test parameters to ensure different inputs for each test base_seed = 2000 # Different base seed from other tests param_hash = hash((batch_size, sequence_length, quant_bits)) @@ -1411,10 +1476,10 @@ def test_phi3_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits): self.assertFalse(torch.isnan(torch_result).any()) self.assertFalse(torch.isinf(torch_result).any()) - phi3_moe.parity_check() + run_parity_with_mlas_q4_mode(phi3_moe.parity_check, enable_mlas_q4_gemm) - @parameterized.expand(phi3_test_cases) - def test_phi3_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits): + @parameterized.expand(with_mlas_q4_mode(phi3_test_cases)) + def test_phi3_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits, enable_mlas_q4_gemm): base_seed = 3000 param_hash = hash((batch_size, sequence_length, quant_bits)) unique_seed = base_seed + abs(param_hash) % 1000 @@ -1436,10 +1501,12 @@ def test_phi3_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, quan onnx_dtype=TensorProto.FLOAT, use_asymmetric_quant=True, ) - phi3_moe.parity_check() + run_parity_with_mlas_q4_mode(phi3_moe.parity_check, enable_mlas_q4_gemm) - 
@parameterized.expand(phi3_blockwise_test_cases) - def test_phi3_qmoe_blockwise_parity_cpu(self, batch_size, sequence_length, quant_bits, block_size): + @parameterized.expand(with_mlas_q4_mode(phi3_blockwise_test_cases)) + def test_phi3_qmoe_blockwise_parity_cpu( + self, batch_size, sequence_length, quant_bits, block_size, enable_mlas_q4_gemm + ): torch.manual_seed(42) numpy.random.seed(42) @@ -1468,10 +1535,12 @@ def test_phi3_qmoe_blockwise_parity_cpu(self, batch_size, sequence_length, quant self.assertFalse(torch.isnan(torch_result).any()) self.assertFalse(torch.isinf(torch_result).any()) - phi3_moe.parity_check() + run_parity_with_mlas_q4_mode(phi3_moe.parity_check, enable_mlas_q4_gemm) - @parameterized.expand(phi3_blockwise_test_cases) - def test_phi3_qmoe_blockwise_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits, block_size): + @parameterized.expand(with_mlas_q4_mode(phi3_blockwise_test_cases)) + def test_phi3_qmoe_blockwise_asymmetric_parity_cpu( + self, batch_size, sequence_length, quant_bits, block_size, enable_mlas_q4_gemm + ): torch.manual_seed(43) numpy.random.seed(43) @@ -1489,10 +1558,8 @@ def test_phi3_qmoe_blockwise_asymmetric_parity_cpu(self, batch_size, sequence_le block_size=block_size, use_asymmetric_quant=True, ) - phi3_moe.parity_check() - + run_parity_with_mlas_q4_mode(phi3_moe.parity_check, enable_mlas_q4_gemm) -disable_cpu_qmoe_tests = False swiglu_test_cases = [ (1, 32, 4), @@ -1510,10 +1577,9 @@ def test_phi3_qmoe_blockwise_asymmetric_parity_cpu(self, batch_size, sequence_le ] -@unittest.skipIf(disable_cpu_qmoe_tests, "Skipping qMoE cpu tests") class TestSwigluQMoECPU(unittest.TestCase): - @parameterized.expand(swiglu_test_cases) - def test_swiglu_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits): + @parameterized.expand(with_mlas_q4_mode(swiglu_test_cases)) + def test_swiglu_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits, enable_mlas_q4_gemm): # Create unique seed based on test parameters 
to ensure different inputs for each test base_seed = 1000 # Different base seed from regular MoE tests param_hash = hash((batch_size, sequence_length, quant_bits)) @@ -1547,10 +1613,10 @@ def test_swiglu_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits): self.assertFalse(torch.isnan(torch_result).any()) self.assertFalse(torch.isinf(torch_result).any()) - swiglu_moe.parity_check() + run_parity_with_mlas_q4_mode(swiglu_moe.parity_check, enable_mlas_q4_gemm) - @parameterized.expand(swiglu_test_cases) - def test_swiglu_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits): + @parameterized.expand(with_mlas_q4_mode(swiglu_test_cases)) + def test_swiglu_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits, enable_mlas_q4_gemm): base_seed = 1100 param_hash = hash((batch_size, sequence_length, quant_bits)) unique_seed = base_seed + abs(param_hash) % 1000 @@ -1572,10 +1638,12 @@ def test_swiglu_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, qu onnx_dtype=TensorProto.FLOAT, use_asymmetric_quant=True, ) - swiglu_moe.parity_check() + run_parity_with_mlas_q4_mode(swiglu_moe.parity_check, enable_mlas_q4_gemm) - @parameterized.expand(swiglu_blockwise_test_cases) - def test_swiglu_qmoe_blockwise_parity_cpu(self, batch_size, sequence_length, quant_bits, block_size): + @parameterized.expand(with_mlas_q4_mode(swiglu_blockwise_test_cases)) + def test_swiglu_qmoe_blockwise_parity_cpu( + self, batch_size, sequence_length, quant_bits, block_size, enable_mlas_q4_gemm + ): torch.manual_seed(42) numpy.random.seed(42) @@ -1603,10 +1671,12 @@ def test_swiglu_qmoe_blockwise_parity_cpu(self, batch_size, sequence_length, qua self.assertFalse(torch.isnan(torch_result).any()) self.assertFalse(torch.isinf(torch_result).any()) - swiglu_moe.parity_check() + run_parity_with_mlas_q4_mode(swiglu_moe.parity_check, enable_mlas_q4_gemm) - @parameterized.expand(swiglu_blockwise_test_cases) - def 
test_swiglu_qmoe_blockwise_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits, block_size): + @parameterized.expand(with_mlas_q4_mode(swiglu_blockwise_test_cases)) + def test_swiglu_qmoe_blockwise_asymmetric_parity_cpu( + self, batch_size, sequence_length, quant_bits, block_size, enable_mlas_q4_gemm + ): torch.manual_seed(43) numpy.random.seed(43) @@ -1624,7 +1694,7 @@ def test_swiglu_qmoe_blockwise_asymmetric_parity_cpu(self, batch_size, sequence_ block_size=block_size, use_asymmetric_quant=True, ) - swiglu_moe.parity_check() + run_parity_with_mlas_q4_mode(swiglu_moe.parity_check, enable_mlas_q4_gemm) @unittest.skipIf(True, "Skipping QMoE CPU benchmark tests") @@ -1633,9 +1703,6 @@ class TestQMoESwiGLUBenchmark(unittest.TestCase): def test_qmoe_swiglu_throughput_benchmark(self): """Comprehensive throughput benchmark for QMoE SwiGLU across different configurations.""" - if disable_cpu_qmoe_tests: - self.skipTest("QMoE CPU tests disabled") - print("\n=== QMoE SwiGLU Throughput Benchmark ===") # Test configurations: (name, hidden_size, intermediate_size, num_experts, top_k, quant_bits) diff --git a/onnxruntime/test/python/transformers/test_whisper.py b/onnxruntime/test/python/transformers/test_whisper.py index e3ca8e6b6ac9c..e90a14f8d7d61 100644 --- a/onnxruntime/test/python/transformers/test_whisper.py +++ b/onnxruntime/test/python/transformers/test_whisper.py @@ -471,8 +471,9 @@ def export(self, model, inputs, input_names, output_names, dynamic_axes): input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=17, + opset_version=18, do_constant_folding=True, + dynamo=False, verbose=False, ) @@ -530,9 +531,7 @@ def test_hf_whisper_encoder_self_attention(self, precision, ep): use_gpu=True, only_onnxruntime=False, ) - name = f"hf_{precision}_encoder_self_attention.onnx" - # optimized_model.save_model_to_file(name) # Uncomment for debugging purposes - self.verify_fusion(optimized_model, name) + 
self.verify_fusion(optimized_model, f"hf_{precision}_encoder_self_attention.onnx") @parameterized.expand( [ diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 4e991716dd108..e472cbcee12d6 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -4918,6 +4918,93 @@ TEST(CApiTest, ModelWithExternalDataOutsideModelDirectoryShouldFailToLoad) { << "Exception message should indicate external data or security issue. Got: " << exception_message; } +TEST(CApiTest, InMemoryModel_ExternalDataOutsideWorkingDirectory_FailToLoad) { + // Attempt to create an ORT session with the malicious model (loaded from bytes). + // This should fail due to the use of an external file path that is not under current working directory. + // i.e. ../../../../etc/passwd + constexpr const ORTCHAR_T* model_path = TSTR("testdata/test_arbitrary_external_file.onnx"); + + Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test"); + Ort::SessionOptions session_options; + + // Load model contents into array + std::ifstream model_file_stream(model_path, std::ios::in | std::ios::binary); + ASSERT_TRUE(model_file_stream.good()); + model_file_stream.seekg(0, std::ios::end); + const auto file_contents_size = onnxruntime::narrow(model_file_stream.tellg()); + model_file_stream.seekg(0, std::ios::beg); + std::vector file_contents(file_contents_size, 0); + model_file_stream.read(&file_contents[0], file_contents_size); + model_file_stream.close(); + + bool exception_thrown = false; + std::string exception_message; + + try { + // This should throw an exception due to malicious external data + Ort::Session session(env, file_contents.data(), file_contents_size, session_options); + } catch (const Ort::Exception& e) { + exception_thrown = true; + exception_message = e.what(); + } catch (const std::exception& e) { + exception_thrown = true; + exception_message = e.what(); + } + + // Verify that loading the model 
failed + EXPECT_TRUE(exception_thrown) << "Expected model loading to fail due to malicious external data path"; + + // Verify that the exception message indicates security or external data issues + EXPECT_TRUE(exception_message.find("External data path") != std::string::npos && + exception_message.find("escapes working directory") != std::string::npos) + << "Exception message should indicate external data or security issue. Got: " << exception_message; +} + +TEST(CApiTest, InMemoryModel_SessionConfigExternalFileFolder_ExternalDataOutsideModelDirectory_FailToLoad) { + // Attempt to create an ORT session with the malicious model (loaded from bytes). + // A valid external file folder path is explicitly set via session options. + // However, this should still fail due to the use of an external file path that escapes the set directory. + // i.e. ../../../../etc/passwd + constexpr const ORTCHAR_T* model_path = TSTR("testdata/test_arbitrary_external_file.onnx"); + + Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test"); + Ort::SessionOptions session_options; + session_options.AddConfigEntry(kOrtSessionOptionsModelExternalInitializersFileFolderPath, "testdata"); + + // Load model contents into array + std::ifstream model_file_stream(model_path, std::ios::in | std::ios::binary); + ASSERT_TRUE(model_file_stream.good()); + model_file_stream.seekg(0, std::ios::end); + const auto file_contents_size = onnxruntime::narrow(model_file_stream.tellg()); + model_file_stream.seekg(0, std::ios::beg); + std::vector file_contents(file_contents_size, 0); + model_file_stream.read(&file_contents[0], file_contents_size); + model_file_stream.close(); + + bool exception_thrown = false; + std::string exception_message; + + try { + // This should throw an exception due to malicious external data + Ort::Session session(env, file_contents.data(), file_contents_size, session_options); + } catch (const Ort::Exception& e) { + exception_thrown = true; + exception_message = e.what(); + } catch (const 
std::exception& e) { + exception_thrown = true; + exception_message = e.what(); + } + + // Verify that loading the model failed + EXPECT_TRUE(exception_thrown) << "Expected model loading to fail due to malicious external data path"; + + // Verify that the exception message indicates security or external data issues + EXPECT_TRUE(exception_message.find("External data path") != std::string::npos && + exception_message.find("escapes both model directory") != std::string::npos && + exception_message.find("and real model directory") != std::string::npos) + << "Exception message should indicate external data or security issue. Got: " << exception_message; +} + #ifdef ORT_ENABLE_STREAM #if USE_CUDA diff --git a/samples/cxx/CMakeLists.txt b/samples/cxx/CMakeLists.txt new file mode 100644 index 0000000000000..875e37c64eda2 --- /dev/null +++ b/samples/cxx/CMakeLists.txt @@ -0,0 +1,50 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +cmake_minimum_required(VERSION 3.28) + +project(onnxruntime_sample CXX) + +set(CMAKE_CXX_STANDARD 20) + +foreach(VAR IN ITEMS ORT_LIBRARY_DIR ORT_HEADER_DIR) + if (NOT DEFINED ${VAR}) + message(FATAL_ERROR "Required variable ${VAR} is not set. 
" + "Set ORT_LIBRARY_DIR to the ONNX Runtime lib directory and " + "ORT_HEADER_DIR to the ONNX Runtime include directory.") + endif() +endforeach() + +# Resolve to absolute paths +get_filename_component(ORT_LIBRARY_DIR "${ORT_LIBRARY_DIR}" ABSOLUTE) +get_filename_component(ORT_HEADER_DIR "${ORT_HEADER_DIR}" ABSOLUTE) + +# +# onnxruntime_sample_program +# +block() +add_executable(onnxruntime_sample_program) + +target_sources(onnxruntime_sample_program PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/main.cc) + +target_include_directories(onnxruntime_sample_program PRIVATE ${ORT_HEADER_DIR}) + +target_link_directories(onnxruntime_sample_program PRIVATE ${ORT_LIBRARY_DIR}) +target_link_libraries(onnxruntime_sample_program PRIVATE onnxruntime) + +# Copy ONNX Runtime shared libraries next to the executable. +# Collect shared library files from the ORT library directory based on platform. +if (WIN32) + file(GLOB ORT_SHARED_LIBS "${ORT_LIBRARY_DIR}/*.dll") +elseif (APPLE) + file(GLOB ORT_SHARED_LIBS "${ORT_LIBRARY_DIR}/*.dylib") +else() + file(GLOB ORT_SHARED_LIBS "${ORT_LIBRARY_DIR}/*.so" "${ORT_LIBRARY_DIR}/*.so.*") +endif() + +add_custom_command(TARGET onnxruntime_sample_program POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${ORT_SHARED_LIBS} + $ +) +endblock() diff --git a/samples/cxx/README.md b/samples/cxx/README.md new file mode 100644 index 0000000000000..1904c082cef7a --- /dev/null +++ b/samples/cxx/README.md @@ -0,0 +1,92 @@ +# ONNX Runtime C++ Sample + +A minimal C++ program demonstrating basic ONNX Runtime inference. It loads an ONNX model that adds two float tensors (`C = A + B`), runs inference, and verifies the result. 
+ +## Prerequisites + +- CMake 3.28 or later +- C++20 compatible compiler (e.g., Visual Studio 2022) +- An ONNX Runtime release package (download from [GitHub releases](https://github.com/microsoft/onnxruntime/releases)) +- For model generation: + - Python with the `onnx` package + +## Directory Structure + +``` +samples/cxx/ +├── CMakeLists.txt # Build configuration +├── main.cc # Sample program source +├── add_model.onnx # ONNX model (C = A + B) +├── generate_model.py # Script to generate the ONNX model +└── README.md # This file +``` + +## Steps + +### 1. Extract the ONNX Runtime package + +Download and extract an ONNX Runtime release archive. For example: + +``` +tar -xf onnxruntime-win-x64-1.25.0.zip +``` + +This creates a directory like `onnxruntime-win-x64-1.25.0/` containing `include/` and `lib/` subdirectories. + +### 2. [Optional] Generate the ONNX model + +``` +cd samples/cxx +pip install onnx +python generate_model.py +``` + +This creates `add_model.onnx` in the current directory. + +### 3. Configure and build + +From the `samples/cxx` directory: + +**Windows:** +``` +cmake -S . -B build ^ + -DORT_HEADER_DIR:PATH=path\to\onnxruntime-win-x64-1.25.0\include ^ + -DORT_LIBRARY_DIR:PATH=path\to\onnxruntime-win-x64-1.25.0\lib +cmake --build build --config Release +``` + +**Linux / macOS:** +``` +cmake -S . -B build \ + -DORT_HEADER_DIR:PATH=path/to/onnxruntime-linux-x64-1.25.0/include \ + -DORT_LIBRARY_DIR:PATH=path/to/onnxruntime-linux-x64-1.25.0/lib +cmake --build build --config Release +``` + +Adjust the paths to match your extracted package name and location. + +The build automatically copies the ONNX Runtime shared libraries next to the executable. + +#### CMake Variables + +| Variable | Description | +|---|---| +| `ORT_HEADER_DIR` | Path to the ONNX Runtime `include` directory | +| `ORT_LIBRARY_DIR` | Path to the ONNX Runtime `lib` directory | + +### 4. 
Run + +**Windows:** +``` +build\Release\onnxruntime_sample_program.exe +``` + +**Linux / macOS:** +``` +./build/onnxruntime_sample_program +``` + +You can also pass a model path as an argument: +``` +onnxruntime_sample_program path/to/add_model.onnx +``` diff --git a/samples/cxx/add_model.onnx b/samples/cxx/add_model.onnx new file mode 100644 index 0000000000000..36308c1372a22 Binary files /dev/null and b/samples/cxx/add_model.onnx differ diff --git a/samples/cxx/generate_model.py b/samples/cxx/generate_model.py new file mode 100644 index 0000000000000..9ac70ab29deb4 --- /dev/null +++ b/samples/cxx/generate_model.py @@ -0,0 +1,42 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +"""Generate a simple ONNX model that computes C = A + B. + +Inputs: + A : float tensor of shape [1, 3] + B : float tensor of shape [1, 3] + +Output: + C : float tensor of shape [1, 3] + +Usage: + pip install onnx + python generate_model.py +""" + +from onnx import TensorProto, helper, save_model +from onnx.checker import check_model + + +def main(): + # Define inputs and output + a = helper.make_tensor_value_info("A", TensorProto.FLOAT, [1, 3]) + b = helper.make_tensor_value_info("B", TensorProto.FLOAT, [1, 3]) + c = helper.make_tensor_value_info("C", TensorProto.FLOAT, [1, 3]) + + # Create the Add node + add_node = helper.make_node("Add", inputs=["A", "B"], outputs=["C"]) + + # Build the graph and model + graph = helper.make_graph([add_node], "add_graph", [a, b], [c]) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) + + # Validate and save + check_model(model) + save_model(model, "add_model.onnx") + print("Saved add_model.onnx") + + +if __name__ == "__main__": + main() diff --git a/samples/cxx/main.cc b/samples/cxx/main.cc new file mode 100644 index 0000000000000..4e31e033ab8c7 --- /dev/null +++ b/samples/cxx/main.cc @@ -0,0 +1,170 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT License. + +// Sample program demonstrating basic ONNX Runtime C++ API usage. +// Loads a simple ONNX model (C = A + B), runs inference, and prints the result. +// +// Generate the model first: python generate_model.py + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "onnxruntime_cxx_api.h" + +// Throw std::runtime_error if `condition` is false. Includes file and line info. +#define THROW_IF_NOT(condition) \ + do { \ + if (!(condition)) { \ + throw std::runtime_error(std::string(__FILE__) + ":" + \ + std::to_string(__LINE__) + ": " + \ + "check failed: " #condition); \ + } \ + } while (0) + +int main(int argc, char* argv[]) { + try { + // ----------------------------------------------------------------------- + // 1. Initialize the ONNX Runtime environment + // ----------------------------------------------------------------------- + Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "onnxruntime_sample"); + std::cout << "ONNX Runtime version: " << Ort::GetVersionString() << "\n\n"; + + // ----------------------------------------------------------------------- + // 2. Create session options (could add execution providers here) + // ----------------------------------------------------------------------- + Ort::SessionOptions session_options; + session_options.SetIntraOpNumThreads(1); + session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_BASIC); + + // ----------------------------------------------------------------------- + // 3. Load the ONNX model from a file + // Generate with: python generate_model.py + // ----------------------------------------------------------------------- + const std::filesystem::path model_path = (argc > 1) ? 
argv[1] : "add_model.onnx"; + std::cout << "Loading model: " << model_path.string() << "\n"; + + Ort::Session session(env, model_path.native().c_str(), session_options); + + // ----------------------------------------------------------------------- + // 4. Query model metadata: input/output names and shapes + // ----------------------------------------------------------------------- + Ort::AllocatorWithDefaultOptions allocator; + + const size_t num_inputs = session.GetInputCount(); + const size_t num_outputs = session.GetOutputCount(); + std::cout << "Model inputs: " << num_inputs << "\n"; + std::cout << "Model outputs: " << num_outputs << "\n"; + + // Collect input/output names + std::vector input_names; + std::vector output_names; + + for (size_t i = 0; i < num_inputs; ++i) { + auto name = session.GetInputNameAllocated(i, allocator); + std::cout << " Input " << i << ": " << name.get() << "\n"; + input_names.emplace_back(name.get()); + } + for (size_t i = 0; i < num_outputs; ++i) { + auto name = session.GetOutputNameAllocated(i, allocator); + std::cout << " Output " << i << ": " << name.get() << "\n"; + output_names.emplace_back(name.get()); + } + std::cout << "\n"; + + // ----------------------------------------------------------------------- + // 5. Prepare input tensors + // ----------------------------------------------------------------------- + // Our model expects two float tensors of shape [1, 3]. 
+ constexpr int64_t batch_size = 1; + constexpr int64_t num_elements = 3; + const std::array<int64_t, 2> input_shape = {batch_size, num_elements}; + + std::array<float, 3> input_a = {1.0f, 2.0f, 3.0f}; + std::array<float, 3> input_b = {4.0f, 5.0f, 6.0f}; + + auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + + auto tensor_a = Ort::Value::CreateTensor<float>( + memory_info, input_a.data(), input_a.size(), + input_shape.data(), input_shape.size()); + + auto tensor_b = Ort::Value::CreateTensor<float>( + memory_info, input_b.data(), input_b.size(), + input_shape.data(), input_shape.size()); + + THROW_IF_NOT(tensor_a.IsTensor()); + THROW_IF_NOT(tensor_b.IsTensor()); + + // The Run() API expects arrays of C strings for input/output names. + std::vector<const char*> input_name_ptrs; + std::vector<const char*> output_name_ptrs; + for (const auto& n : input_names) input_name_ptrs.push_back(n.c_str()); + for (const auto& n : output_names) output_name_ptrs.push_back(n.c_str()); + + std::array<Ort::Value, 2> input_tensors{std::move(tensor_a), std::move(tensor_b)}; + + // ----------------------------------------------------------------------- + // 6. Run inference + // ----------------------------------------------------------------------- + std::cout << "Running inference...\n"; + + Ort::RunOptions run_options; + auto output_tensors = session.Run( + run_options, + input_name_ptrs.data(), input_tensors.data(), input_tensors.size(), + output_name_ptrs.data(), output_name_ptrs.size()); + + // ----------------------------------------------------------------------- + // 7.
Process output + // ----------------------------------------------------------------------- + THROW_IF_NOT(!output_tensors.empty() && output_tensors[0].IsTensor()); + + const float* output_data = output_tensors[0].GetTensorData<float>(); + auto type_info = output_tensors[0].GetTensorTypeAndShapeInfo(); + size_t output_count = type_info.GetElementCount(); + + std::cout << "\nInputs:\n"; + std::cout << " A = ["; + for (size_t i = 0; i < input_a.size(); ++i) { + std::cout << (i ? ", " : "") << input_a[i]; + } + std::cout << "]\n"; + + std::cout << " B = ["; + for (size_t i = 0; i < input_b.size(); ++i) { + std::cout << (i ? ", " : "") << input_b[i]; + } + std::cout << "]\n"; + + std::cout << "\nOutput (A + B):\n"; + std::cout << " C = ["; + for (size_t i = 0; i < output_count; ++i) { + std::cout << (i ? ", " : "") << output_data[i]; + } + std::cout << "]\n"; + + // Verify correctness + bool correct = true; + for (size_t i = 0; i < num_elements; ++i) { + if (output_data[i] != input_a[i] + input_b[i]) { + correct = false; + break; + } + } + std::cout << "\nResult: " << (correct ? "PASS" : "FAIL") << "\n"; + + return correct ? EXIT_SUCCESS : EXIT_FAILURE; + } catch (const Ort::Exception& e) { + std::cerr << "ONNX Runtime error: " << e.what() << "\n"; + return EXIT_FAILURE; + } catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << "\n"; + return EXIT_FAILURE; + } +} diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index c8beeef8aa509..de64183e1bb18 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1761,7 +1761,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): # Install cpu only version of torch when cuda is not enabled in Linux.
extra = [] if args.use_cuda and is_linux() else ["--index-url", "https://download.pytorch.org/whl/cpu"] run_subprocess( - [sys.executable, "-m", "pip", "install", "torch==2.8.0", "torchvision==0.23.0", *extra], + [sys.executable, "-m", "pip", "install", "torch==2.10.0", "torchvision==0.25.0", *extra], cwd=cwd, dll_path=dll_path, python_path=python_path, @@ -1838,11 +1838,9 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): [sys.executable, "-m", "unittest", "discover", "-s", "quantization"], cwd=cwd, dll_path=dll_path ) - # onnx package does not support python 3.14 yet so skip the transformers tests for python 3.14. - # we can remove this check when onnx package supports python 3.14. if args.enable_transformers_tool_test and (sys.version_info.major, sys.version_info.minor) < ( 3, - 14, + 15, ): import google.protobuf # noqa: PLC0415 import numpy # noqa: PLC0415 diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml index 8d96c1ae99e0a..5ddac928b32d3 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml @@ -160,7 +160,61 @@ stages: NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu.Linux' CudaVersion: 12.8 +- template: templates/test-binary-archive-stage.yml + parameters: + artifactName: onnxruntime-linux-aarch64 + artifactPipelineResource: build + previousStageName: Setup + platform: linux-aarch64 + agentPool: onnxruntime-linux-ARM64-CPU-2019 + +- template: templates/test-binary-archive-stage.yml + parameters: + artifactName: onnxruntime-linux-x64 + artifactPipelineResource: build + previousStageName: Setup + platform: linux-x64 + agentPool: onnxruntime-Ubuntu2204-AMD-CPU +- template: templates/test-binary-archive-stage.yml + parameters: + artifactName: onnxruntime-osx-arm64 + artifactPipelineResource: build + previousStageName: 
Setup + platform: osx-arm64 + agentPool: + name: AcesShared + os: macOS + demands: + - ImageOverride -equals ACES_VM_SharedPool_Sequoia + agentSetupSteps: + - template: templates/setup-build-tools.yml + parameters: + host_cpu_arch: arm64 + +- template: templates/test-binary-archive-stage.yml + parameters: + artifactName: onnxruntime-win-arm64 + artifactPipelineResource: build + previousStageName: Setup + platform: win-arm64 + agentPool: onnxruntime-qnn-windows-vs-2022-arm64 + +- template: templates/test-binary-archive-stage.yml + parameters: + artifactName: onnxruntime-win-arm64x + artifactPipelineResource: build + previousStageName: Setup + platform: win-arm64x + agentPool: onnxruntime-qnn-windows-vs-2022-arm64 + +- template: templates/test-binary-archive-stage.yml + parameters: + artifactName: onnxruntime-win-x64 + artifactPipelineResource: build + previousStageName: Setup + platform: win-x64 + agentPool: onnxruntime-Win-CPU-VS2022-Latest # Run GPU tests. - stage: Windows_Packaging_cuda_Testing diff --git a/tools/ci_build/github/azure-pipelines/dml-nuget-packaging.yml b/tools/ci_build/github/azure-pipelines/dml-nuget-packaging.yml index 3cf28655c36e7..0b63a4f5b83c1 100644 --- a/tools/ci_build/github/azure-pipelines/dml-nuget-packaging.yml +++ b/tools/ci_build/github/azure-pipelines/dml-nuget-packaging.yml @@ -111,3 +111,4 @@ extends: - template: stages/nuget_dml_packaging_stage.yml parameters: DoEsrp: ${{ parameters.DoEsrp }} + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index 0481a356cf9a1..2a8e222a9e192 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -96,6 +96,5 @@ extends: QnnSdk: ${{ parameters.QnnSdk }} IsReleaseBuild: ${{ parameters.IsReleaseBuild }} DoEsrp: ${{ 
parameters.DoEsrp }} - ArtifactName: 'drop-nuget-qnn-arm64x' StageName: 'OnnxRuntime_QNN_Nuget_Win_Arm64x' build_config: ${{ parameters.build_config }} diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml index 7e47227c23d5b..385cee35eb95d 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml @@ -175,8 +175,6 @@ stages: - stage: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests dependsOn: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Build - # Skip this stage for Python 3.14 for now until onnx package support python 3.14. - condition: and(succeeded(), ne('${{ parameters.PYTHON_VERSION }}', '3.14')) jobs: - job: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests workspace: diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-webgpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-webgpu-stage.yml index 8bd8521d80104..1897d94db76c7 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-win-webgpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-win-webgpu-stage.yml @@ -131,8 +131,6 @@ stages: - stage: Win_py_webgpu_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests dependsOn: Win_py_webgpu_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Build - # Skip this stage for Python 3.14 for now until onnx package support python 3.14. 
- condition: and(succeeded(), ne('${{ parameters.PYTHON_VERSION }}', '3.14')) jobs: - job: Win_py_webgpu_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests workspace: diff --git a/tools/ci_build/github/azure-pipelines/templates/build-win-arm64x-steps.yml b/tools/ci_build/github/azure-pipelines/templates/build-win-arm64x-steps.yml new file mode 100644 index 0000000000000..50e7cbb13d6e1 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/build-win-arm64x-steps.yml @@ -0,0 +1,28 @@ +# Runs a Windows ARM64X build in `buildDirectory`. + +parameters: + buildDirectory: '$(Build.BinariesDirectory)' + additionalBuildPyArgs: '' + +steps: +- task: PythonScript@0 + displayName: 'Build arm64 project for arm64x - generate the def & lib file for next build' + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: > + ${{ parameters.additionalBuildPyArgs }} + --build_shared_lib + --arm64 + --buildasx + --build_dir="${{ parameters.buildDirectory }}/arm64" + +- task: PythonScript@0 + displayName: 'Build arm64ec project for arm64x - the real arm64x' + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: > + ${{ parameters.additionalBuildPyArgs }} + --build_shared_lib + --arm64ec + --buildasx + --build_dir="${{ parameters.buildDirectory }}" diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows-qnn.yml deleted file mode 100644 index ab3e0ebaab39a..0000000000000 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows-qnn.yml +++ /dev/null @@ -1,141 +0,0 @@ -# sets up common build tools for the windows build machines before build - -parameters: -- name: DoEsrp - displayName: Run code sign tasks? Must be true if you are doing an Onnx Runtime release. 
- type: boolean - default: true - -- name: buildConfig - displayName: buildConfig - type: string - default: 'RelWithDebInfo' - -- name: artifactName - displayName: artifactName,like 'onnxruntime-win-x64-1.6.0' - type: string - default: '' - -- name: artifactNameNoVersionString - type: string - default: 'onnxruntime-win-x64' - -- name: commitId - displayName: commitId - type: string - default: '' - -- name: trtEnabled - displayName: Include TRT EP libraries? - type: boolean - default: true - -steps: - - ${{if or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-'))}}: - - template: publish-symbolrequestprod-api.yml - parameters: - ${{if eq(variables['Build.SourceBranch'], 'refs/heads/main')}}: - symbolExpiryTime: 60 - includePublicSymbolServer: true - symbolsArtifactName: ${{parameters.artifactNameNoVersionString}} - symbolsVersion: $(Build.BuildId) - symbolProject: 'ONNX Runtime' - subscription: 'OnnxrunTimeCodeSign_20240611' - searchPattern: | - $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime.pdb - $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_*.pdb - - - - task: CmdLine@2 - displayName: 'Copy build artifacts for zipping' - inputs: - script: | - mkdir $(Build.BinariesDirectory)\${{parameters.artifactName}} - mkdir $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - mkdir $(Build.BinariesDirectory)\${{parameters.artifactName}}\include - - if exist $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_cuda.dll ( - echo "cuda context headers copied" - mkdir $(Build.BinariesDirectory)\${{parameters.artifactName}}\include\core\providers\cuda - copy $(Build.SourcesDirectory)\include\onnxruntime\core\providers\resource.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include\core\providers - copy 
$(Build.SourcesDirectory)\include\onnxruntime\core\providers\custom_op_context.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include\core\providers - copy $(Build.SourcesDirectory)\include\onnxruntime\core\providers\cuda\cuda_context.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include\core\providers\cuda - copy $(Build.SourcesDirectory)\include\onnxruntime\core\providers\cuda\cuda_resource.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include\core\providers\cuda - ) - - echo "Directories created" - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_shared.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_shared.lib $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_shared.pdb $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_cuda.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_cuda.pdb $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_cuda.lib $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - - # Copy WebGPU dependencies if required - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\dxcompiler.dll 
$(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\dxil.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - - # Copy QNN dependencies if required - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_qnn.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\libQnnHtp*.so $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib /Y - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\libqnnhtp*.cat $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib /Y - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnCpu.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnGpu.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtp.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpPrepare.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV68Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV73Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV81Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy 
$(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSaver.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSystem.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\Qualcomm_LICENSE.pdf $(Build.BinariesDirectory)\${{parameters.artifactName}} - - # copy trt ep libraries only when trt ep is enabled - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_tensorrt.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_tensorrt.pdb $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_tensorrt.lib $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime.pdb $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime.lib $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.SourcesDirectory)\include\onnxruntime\core\session\onnxruntime_*.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include - copy $(Build.SourcesDirectory)\include\onnxruntime\core\framework\provider_options.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include - copy $(Build.SourcesDirectory)\include\onnxruntime\core\providers\cpu\cpu_provider_factory.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include - copy $(Build.SourcesDirectory)\orttraining\orttraining\training_api\include\onnxruntime_training*.h 
$(Build.BinariesDirectory)\${{parameters.artifactName}}\include - - REM copy the README, license and TPN - copy $(Build.SourcesDirectory)\README.md $(Build.BinariesDirectory)\${{parameters.artifactName}}\README.md - copy $(Build.SourcesDirectory)\docs\Privacy.md $(Build.BinariesDirectory)\${{parameters.artifactName}}\Privacy.md - copy $(Build.SourcesDirectory)\LICENSE $(Build.BinariesDirectory)\${{parameters.artifactName}}\LICENSE - copy $(Build.SourcesDirectory)\ThirdPartyNotices.txt $(Build.BinariesDirectory)\${{parameters.artifactName}}\ThirdPartyNotices.txt - copy $(Build.SourcesDirectory)\VERSION_NUMBER $(Build.BinariesDirectory)\${{parameters.artifactName}}\VERSION_NUMBER - @echo ${{parameters.commitId}} > $(Build.BinariesDirectory)\${{parameters.artifactName}}\GIT_COMMIT_ID - - workingDirectory: '$(Build.BinariesDirectory)\${{parameters.buildConfig}}' - - - ${{ if eq(parameters.DoEsrp, true) }}: - - template: win-esrp-dll.yml - parameters: - FolderPath: '$(Build.BinariesDirectory)\${{parameters.artifactName}}' - DisplayName: 'ESRP - Sign Native dlls' - DoEsrp: ${{parameters.DoEsrp}} - Pattern: '*.dll,*.exe' - - - task: DeleteFiles@1 - displayName: 'Delete CodeSignSummary*.md' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\${{parameters.artifactName}}' - Contents: 'CodeSignSummary*.md' - - - task: ArchiveFiles@2 - inputs: - rootFolderOrFile: '$(Build.BinariesDirectory)\${{parameters.artifactName}}' - includeRootFolder: true - archiveType: 'zip' # Options: zip, 7z, tar, wim - archiveFile: '$(Build.ArtifactStagingDirectory)\${{parameters.artifactName}}.zip' - replaceExistingArchive: true - - - task: 1ES.PublishPipelineArtifact@1 - inputs: - targetPath: '$(Build.ArtifactStagingDirectory)' - artifactName: '${{parameters.artifactNameNoVersionString}}' diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml 
b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml index 28a1960aac27b..5f9dd5677e7bc 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml @@ -12,7 +12,7 @@ parameters: default: 'RelWithDebInfo' - name: artifactName - displayName: artifactName,like 'onnxruntime-win-x64-1.6.0' + displayName: artifactName, like 'onnxruntime-win-x64-1.6.0' type: string default: '' @@ -30,6 +30,11 @@ parameters: type: boolean default: true +- name: publishArtifactStagingDirectory + displayName: Whether to publish the artifact staging directory as an artifact named `artifactNameNoVersionString`. + type: boolean + default: false + steps: - ${{if or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-'))}}: - template: publish-symbolrequestprod-api.yml @@ -89,6 +94,7 @@ steps: copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV81Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSaver.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSystem.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib + copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\Qualcomm_LICENSE.pdf $(Build.BinariesDirectory)\${{parameters.artifactName}} # copy trt ep libraries only when trt ep is enabled copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_tensorrt.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib @@ -133,3 +139,9 @@ steps: archiveType: 'zip' # Options: 
zip, 7z, tar, wim archiveFile: '$(Build.ArtifactStagingDirectory)\${{parameters.artifactName}}.zip' replaceExistingArchive: true + + - ${{ if parameters.publishArtifactStagingDirectory }}: + - task: 1ES.PublishPipelineArtifact@1 + inputs: + targetPath: '$(Build.ArtifactStagingDirectory)' + artifactName: '${{parameters.artifactNameNoVersionString}}' diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index a0f023325be04..448dbafcaaaac 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -163,6 +163,20 @@ stages: PreReleaseVersionSuffixString: ${{ parameters.PreReleaseVersionSuffixString }} PreReleaseVersionSuffixNumber: ${{ parameters.PreReleaseVersionSuffixNumber }} +- template: win-ci.yml + parameters: + DoEsrp: true + stage_name_suffix: CPU_arm64x_${{ parameters.BuildVariant }} + buildArch: x64 + msbuildPlatform: arm64x + packageName: arm64x + buildparameter: ${{ parameters.AdditionalBuildFlags }} ${{ parameters.AdditionalWinBuildFlags}} + runTests: false + buildJava: false + buildNodejs: false + PreReleaseVersionSuffixString: ${{ parameters.PreReleaseVersionSuffixString }} + PreReleaseVersionSuffixNumber: ${{ parameters.PreReleaseVersionSuffixNumber }} + - template: win-ci.yml parameters: DoEsrp: true diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml index 09603f2350657..326cfd7829f2f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml @@ -149,20 +149,18 @@ jobs: - powershell: | - if ("$(PythonVersion)" -notcontains "3.14") { - python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq - Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*.whl | foreach {pip --disable-pip-version-check install 
--upgrade $_.fullname tabulate} - Remove-Item -Recurse -Force onnxruntime - if ("$(ExtraParam)".Split() -contains "--use_azure") { - - if( "${{parameters.architecture}}" -eq 'arm64') { - $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\arm64-windows\bin;$env:path" - } else { - $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x64-windows\bin;$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x86-windows\bin;$env:path" - } - python onnxruntime_test_python_azure.py + python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq + Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} + Remove-Item -Recurse -Force onnxruntime + if ("$(ExtraParam)".Split() -contains "--use_azure") { + + if( "${{parameters.architecture}}" -eq 'arm64') { + $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\arm64-windows\bin;$env:path" + } else { + $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x64-windows\bin;$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x86-windows\bin;$env:path" } - python onnx_backend_test_series.py + python onnxruntime_test_python_azure.py } + python onnx_backend_test_series.py workingDirectory: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}' displayName: 'Run Python Tests' diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index 8a1c4f8a39316..7e176b67f6685 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -48,7 +48,7 @@ stages: variables: 
OrtPackageId: ${{ parameters.OrtNugetPackageId }} ReleaseVersionSuffix: $[stageDependencies.Setup.Set_Variables.outputs['Set_Release_Version_Suffix.ReleaseVersionSuffix']] - commonBuildArgs: '--skip_submodule_sync --build_shared_lib --client_package_build --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags ${{ parameters.AdditionalBuildArgs}}' + commonBuildArgs: '--skip_submodule_sync --build_shared_lib --client_package_build --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags ${{ parameters.AdditionalBuildArgs}}' steps: - template: set-version-number-variables-step.yml @@ -61,17 +61,10 @@ stages: parameters: QnnSDKVersion: ${{ parameters.QnnSdk }} - - task: PythonScript@0 - displayName: 'Build arm64x project - generate the def & lib file for next build' - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: ' --arm64 --buildasx --build_dir $(Build.BinariesDirectory)\arm64x --use_qnn --qnn_home $(QnnSDKRootDir) $(commonBuildArgs)' - - - task: PythonScript@0 - displayName: 'Build arm64ecx project - the real arm64x' - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: ' --arm64ec --buildasx --build_dir $(Build.BinariesDirectory) --use_qnn --qnn_home $(QnnSDKRootDir) $(commonBuildArgs)' + - template: build-win-arm64x-steps.yml + parameters: + buildDirectory: '$(Build.BinariesDirectory)' + additionalBuildPyArgs: '$(commonBuildArgs) --use_qnn --qnn_home $(QnnSDKRootDir)' - task: CmdLine@2 displayName: 'Print contents of binaries directory' @@ -87,12 +80,13 @@ stages: Pattern: 'onnxruntime*.dll' - ${{ if eq(parameters.PublishArchive, true) }}: - - template: c-api-artifacts-package-and-publish-steps-windows-qnn.yml
+ - template: c-api-artifacts-package-and-publish-steps-windows.yml parameters: buildConfig: ${{ parameters.build_config }} artifactName: 'onnxruntime-win-arm64x-qnn' artifactNameNoVersionString: 'onnxruntime-win-arm64x-qnn' DoEsrp: ${{ parameters.DoEsrp }} + publishArtifactStagingDirectory: true - task: MSBuild@1 displayName: 'Restore NuGet Packages and create project.assets.json' diff --git a/tools/ci_build/github/azure-pipelines/templates/set-variable.yml b/tools/ci_build/github/azure-pipelines/templates/set-variable.yml new file mode 100644 index 0000000000000..2cf49f2f067c2 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/set-variable.yml @@ -0,0 +1,32 @@ +# Sets an ADO pipeline variable. +# See https://learn.microsoft.com/en-us/azure/devops/pipelines/process/set-variables-scripts + +parameters: +- name: name + type: string + +- name: value + type: string + +steps: +- task: PythonScript@0 + displayName: 'Set variable - ${{ parameters.name }}' + inputs: + scriptSource: inline + script: | + import os + + variable_name = os.getenv("VARIABLE_NAME") + variable_value = os.getenv("VARIABLE_VALUE") + + if not variable_name.isidentifier(): + raise ValueError(f"Variable name is not a valid identifier: '{variable_name}'") + + if "\n" in variable_value: + raise ValueError(f"Variable value should not contain any newlines: '{variable_value}'") + + print(f"Setting variable: {variable_name} = '{variable_value}'") + print(f"##vso[task.setvariable variable={variable_name}]{variable_value}") + env: + VARIABLE_NAME: ${{ parameters.name }} + VARIABLE_VALUE: ${{ parameters.value }} diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml index 7d6e272533696..8303547a47566 100644 --- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml +++ 
b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -22,7 +22,7 @@ stages: buildSettingsFile: "tools/ci_build/github/apple/default_full_apple_framework_build_settings.json" cPodName: onnxruntime-c objcPodName: onnxruntime-objc - timeoutInMinutes: 270 + timeoutInMinutes: 360 templateContext: outputs: - output: pipelineArtifact diff --git a/tools/ci_build/github/azure-pipelines/templates/test-binary-archive-stage.yml b/tools/ci_build/github/azure-pipelines/templates/test-binary-archive-stage.yml new file mode 100644 index 0000000000000..b9b9cdc6b0eb3 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/test-binary-archive-stage.yml @@ -0,0 +1,121 @@ +# Tests an ONNX Runtime binary archive produced by the packaging pipeline. + +parameters: +- name: artifactName + type: string +- name: artifactPipelineResource + type: string +- name: previousStageName + type: string + default: '' +- name: platform + type: string +- name: agentPool + type: object +- name: agentSetupSteps + type: stepList + default: [] + +stages: +- stage: Binary_Archive_Testing_${{ replace(parameters.platform, '-', '_') }} + ${{ if ne(parameters.previousStageName, '') }}: + dependsOn: ${{ parameters.previousStageName }} + + jobs: + - job: Binary_Archive_Testing_${{ replace(parameters.platform, '-', '_') }} + pool: ${{ parameters.agentPool }} + + variables: + - name: buildConfig + value: Release + - name: relativePathFromBuildToOutputDir + ${{ if startsWith(parameters.platform, 'win') }}: + value: "${{ variables['buildConfig'] }}" + ${{ else }}: + value: "." + + steps: + - checkout: self + clean: true + submodules: none + + - ${{ each agentSetupStep in parameters.agentSetupSteps }}: + - ${{ agentSetupStep }} + + - download: ${{ parameters.artifactPipelineResource }} + artifact: ${{ parameters.artifactName }} + patterns: | + *.zip + *.tgz + displayName: Download binary archive for ${{ parameters.platform }} + + # Extract the binary archive. 
+ # The archive contains a top-level directory like onnxruntime-&lt;platform&gt;-&lt;version&gt;/. + # After extraction, set ORT_PACKAGE_DIR to the extracted directory. + - ${{ if startsWith(parameters.platform, 'win') }}: + - task: PowerShell@2 + displayName: 'Extract binary archive' + inputs: + targetType: 'inline' + script: | + $artifactDir = "$(Pipeline.Workspace)/${{ parameters.artifactPipelineResource }}/${{ parameters.artifactName }}" + $archive = (Get-ChildItem -Path $artifactDir -Filter *.zip)[0].FullName + Write-Host "Extracting $archive" + Expand-Archive -Path $archive -DestinationPath $(Build.BinariesDirectory) + $extractedDir = (Get-ChildItem -Path $(Build.BinariesDirectory) -Directory | Where-Object { $_.Name -like "onnxruntime-*" })[0].FullName + Write-Host "Extracted to $extractedDir" + Write-Host "##vso[task.setvariable variable=ORT_PACKAGE_DIR]$extractedDir" + + - ${{ else }}: + - bash: | + set -ex + artifact_dir="$(Pipeline.Workspace)/${{ parameters.artifactPipelineResource }}/${{ parameters.artifactName }}" + archive=$(find "$artifact_dir" -name '*.tgz' | head -1) + echo "Extracting $archive" + tar -xzf "$archive" -C $(Build.BinariesDirectory) + extracted_dir=$(find $(Build.BinariesDirectory) -maxdepth 1 -type d -name 'onnxruntime-*' | head -1) + echo "Extracted to $extracted_dir" + + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. + set +x + echo "##vso[task.setvariable variable=ORT_PACKAGE_DIR]$extracted_dir" + displayName: 'Extract binary archive' + + # Build and run the C++ sample using the extracted ONNX Runtime package. 
+ + - script: > + cmake + -S $(Build.SourcesDirectory)/samples/cxx + -B $(Build.BinariesDirectory)/sample_build + -DORT_HEADER_DIR:PATH=$(ORT_PACKAGE_DIR)/include + -DORT_LIBRARY_DIR:PATH=$(ORT_PACKAGE_DIR)/lib + displayName: 'Generate C++ sample build system' + + - script: | + cmake --build $(Build.BinariesDirectory)/sample_build --config $(buildConfig) + displayName: 'Build C++ sample' + + - script: > + $(Build.BinariesDirectory)/sample_build/$(relativePathFromBuildToOutputDir)/onnxruntime_sample_program + $(Build.SourcesDirectory)/samples/cxx/add_model.onnx + displayName: 'Run C++ sample' + + # For win-arm64x, also build and run for ARM64EC. + - ${{ if eq(parameters.platform, 'win-arm64x') }}: + - script: > + cmake + -S $(Build.SourcesDirectory)/samples/cxx + -B $(Build.BinariesDirectory)/sample_build_arm64ec + -DORT_HEADER_DIR:PATH=$(ORT_PACKAGE_DIR)/include + -DORT_LIBRARY_DIR:PATH=$(ORT_PACKAGE_DIR)/lib + -A ARM64EC + displayName: 'Generate C++ sample build system (ARM64EC)' + + - script: | + cmake --build $(Build.BinariesDirectory)/sample_build_arm64ec --config $(buildConfig) + displayName: 'Build C++ sample (ARM64EC)' + + - script: > + $(Build.BinariesDirectory)/sample_build_arm64ec/$(relativePathFromBuildToOutputDir)/onnxruntime_sample_program + $(Build.SourcesDirectory)/samples/cxx/add_model.onnx + displayName: 'Run C++ sample (ARM64EC)' diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index cfb752ddc2b58..8a5584c111525 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -177,12 +177,40 @@ stages: - script: python -m pip install -r $(Build.SourcesDirectory)\tools\ci_build\github\windows\python\requirements.txt - - task: PythonScript@0 - displayName: 'Generate cmake config' - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--parallel 16 --use_vcpkg 
--use_vcpkg_ms_internal_asset_cache --config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --build --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }} $(timeoutParameter) $(buildJavaParameter)' - workingDirectory: '$(Build.BinariesDirectory)' + - template: set-variable.yml + parameters: + name: commonBuildPyArgs + value: >- + --config RelWithDebInfo + --parallel + --use_vcpkg + --use_vcpkg_ms_internal_asset_cache + --use_binskim_compliant_compile_flags + --enable_lto + --disable_rtti + --skip_submodule_sync + --build_shared_lib + --update --build + --cmake_generator "$(VSGenerator)" + --enable_onnx_tests + $(TelemetryOption) + ${{ parameters.buildparameter }} + $(timeoutParameter) + $(buildJavaParameter) + + - ${{ if eq(parameters.msbuildPlatform, 'arm64x') }}: + - template: build-win-arm64x-steps.yml + parameters: + buildDirectory: '$(Build.BinariesDirectory)' + additionalBuildPyArgs: '$(commonBuildPyArgs)' + + - ${{ else }}: + - task: PythonScript@0 + displayName: 'Generate build system and build' + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: '$(commonBuildPyArgs) --build_dir $(Build.BinariesDirectory)' + workingDirectory: '$(Build.BinariesDirectory)' # For CPU job, tests are run in the same machine as building - ${{ if eq(parameters.buildJava, 'true') }}: diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt index 42bee7a892b11..7e2b6e74cfdde 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt @@ -1,5 +1,5 @@ -numpy==2.2.6; python_version < "3.14" -numpy==2.3.2; 
python_version >= "3.14" +numpy==2.2.6; python_version < "3.11" +numpy==2.4.2; python_version >= "3.11" mypy pytest setuptools>=68.2.2 @@ -7,4 +7,4 @@ wheel protobuf==4.25.8 sympy==1.14 flatbuffers -onnx==1.20.1; python_version < "3.14" +onnx==1.20.1 diff --git a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt index c5fc16837e093..63a8e96d8c128 100644 --- a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt @@ -3,13 +3,13 @@ beartype==0.15.0 flatbuffers cerberus h5py -onnx==1.20.1; python_version < "3.14" +onnx==1.20.1 # Python dependencies required for pytorch development astunparse expecttest!=0.2.0 hypothesis -numpy==2.2.6; python_version < "3.14" -numpy==2.3.2; python_version >= "3.14" +numpy==2.2.6; python_version < "3.11" +numpy==2.4.2; python_version >= "3.11" psutil pyyaml requests diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index 8f5d0776501c0..ffcad5ee67208 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -1,5 +1,5 @@ -numpy==2.2.6; python_version < "3.14" -numpy==2.3.2; python_version >= "3.14" +numpy==2.2.6; python_version < "3.11" +numpy==2.4.2; python_version >= "3.11" mypy pytest setuptools>=68.2.2 @@ -8,6 +8,5 @@ protobuf==6.33.0 sympy==1.14 flatbuffers neural-compressor>=2.2.1 -triton==3.2.0; python_version < "3.14" -triton==3.5.0; python_version >= "3.14" -onnx==1.20.1; python_version < "3.14" +triton==3.5.0 +onnx==1.20.1 diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index 85a9c6391af80..ad57cc715589b 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt 
+++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -1,6 +1,6 @@ cerberus -numpy==2.2.6; python_version < "3.14" -numpy==2.3.2; python_version >= "3.14" +numpy==2.2.6; python_version < "3.11" +numpy==2.4.2; python_version >= "3.11" mypy pytest setuptools==78.1.1 @@ -10,6 +10,6 @@ sympy==1.14 flatbuffers protobuf==6.33.0 packaging -onnxscript==0.5.3; python_version < "3.14" -onnx-ir==0.1.10; python_version < "3.14" -onnx==1.20.1; python_version < "3.14" +onnxscript==0.6.2 +onnx-ir==0.1.16 +onnx==1.20.1 diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt index 6a474973d4f0c..d95e44bb3a280 100644 --- a/tools/ci_build/github/linux/python/requirements.txt +++ b/tools/ci_build/github/linux/python/requirements.txt @@ -1,5 +1,5 @@ -numpy==2.2.6; python_version < "3.14" -numpy==2.3.2; python_version >= "3.14" +numpy==2.2.6; python_version < "3.11" +numpy==2.4.2; python_version >= "3.11" mypy pytest setuptools>=68.2.2 @@ -8,8 +8,8 @@ protobuf==6.33.0 sympy==1.14 flatbuffers psutil -onnxscript==0.5.3; python_version < "3.14" -onnx-ir==0.1.10; python_version < "3.14" +onnxscript==0.6.2 +onnx-ir==0.1.16 jinja2 markupsafe -onnx==1.20.1; python_version < "3.14" +onnx==1.20.1 diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index a86eef170bc25..83593ff47e453 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ b/tools/ci_build/github/windows/python/requirements.txt @@ -1,5 +1,5 @@ -numpy==2.2.6; python_version < "3.14" -numpy==2.3.2; python_version >= "3.14" +numpy==2.2.6; python_version < "3.11" +numpy==2.4.2; python_version >= "3.11" mypy pytest setuptools>=68.2.2 @@ -8,10 +8,10 @@ protobuf==6.33.0 sympy==1.14 flatbuffers psutil -onnxscript==0.5.3; python_version < "3.14" -onnx-ir==0.1.10; python_version < "3.14" +onnxscript==0.6.2 +onnx-ir==0.1.16 jinja2 markupsafe semver packaging -onnx==1.20.1; 
python_version < "3.14" +onnx==1.20.1 diff --git a/tools/ci_build/requirements/transformers-test/requirements.txt b/tools/ci_build/requirements/transformers-test/requirements.txt index 1523b420bfdbd..c764225dbc98d 100644 --- a/tools/ci_build/requirements/transformers-test/requirements.txt +++ b/tools/ci_build/requirements/transformers-test/requirements.txt @@ -2,14 +2,14 @@ packaging # protobuf and numpy is same as tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt protobuf==6.33.0 -numpy==2.2.6; python_version < "3.14" -numpy==2.3.2; python_version >= "3.14" -torch==2.8.0 -torchvision==0.23.0 +numpy==2.2.6; python_version < "3.11" +numpy==2.4.2; python_version >= "3.11" +torch==2.10.0 +torchvision==0.25.0 transformers==4.52.1 parameterized>=0.8.1 sentencepiece psutil einops -onnxscript==0.5.3; python_version < "3.14" -onnx-ir==0.1.10; python_version < "3.14" +onnxscript==0.6.2 +onnx-ir==0.1.16