diff --git a/.github/actions/macos-ci-setup/action.yml b/.github/actions/macos-ci-setup/action.yml index 0d60eeae8aee3..054676d301820 100644 --- a/.github/actions/macos-ci-setup/action.yml +++ b/.github/actions/macos-ci-setup/action.yml @@ -8,7 +8,7 @@ inputs: python_version: required: false type: string - default: "3.11" + default: "3.14" node_version: required: false type: string diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index 9aa8418c55a40..dd8cbfdc71a9c 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -68,6 +68,21 @@ jobs: secrets: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + build-linux-x64-release-py314: + name: Build Linux x64 Release (Python 3.14) + uses: ./.github/workflows/reusable_linux_build.yml + with: + pool_name: "onnxruntime-github-Ubuntu2204-AMD-CPU" + build_config: Release + architecture: x64 + dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu + docker_image_repo: onnxruntimecpubuildpythonx64 + extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --build_nuget --enable_transformers_tool_test --cmake_extra_defines onnxruntime_BUILD_BENCHMARKS=ON' + python_path_prefix: 'PATH=/opt/python/cp314-cp314/bin:$PATH' # $ needs escaping in single quotes + job_identifier: build-linux-x64-release-py314 + secrets: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + orttraining-linux-ci-pipeline: name: Build Linux x64 Release with training uses: ./.github/workflows/reusable_linux_build.yml @@ -109,7 +124,7 @@ jobs: dockerfile_path: tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile docker_image_repo: onnxruntimecpubuildpythonaarch64 extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cmake_extra_defines onnxruntime_BUILD_BENCHMARKS=ON' - python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' # $ needs escaping in single quotes + python_path_prefix: 'PATH=/opt/python/cp314-cp314/bin:$PATH' # $ needs escaping 
in single quotes job_identifier: build-linux-arm64-release secrets: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 32e126c34ac39..0f8b4a42f48ae 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -16,7 +16,7 @@ concurrency: cancel-in-progress: true env: - python_version: 3.11 + python_version: "3.14" jobs: cpu: @@ -28,6 +28,7 @@ jobs: {"machine": "arm64", "target": "arm64", "build_config": "Debug"}, {"machine": "arm64", "target": "arm64", "build_config": "Release"} ] + python_version: "3.14" coreml: uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml @@ -39,6 +40,7 @@ jobs: {"machine": "arm64", "target": "arm64", "build_config": "Debug"}, {"machine": "arm64", "target": "arm64", "build_config": "Release"} ] + python_version: "3.14" xnnpack: uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml @@ -49,6 +51,7 @@ jobs: [ {"machine": "arm64", "target": "arm64", "build_config": "Debug"} ] + python_version: "3.14" webgpu: uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml @@ -60,6 +63,7 @@ jobs: {"machine": "arm64", "target": "arm64", "build_config": "Debug"}, {"machine": "arm64", "target": "arm64", "build_config": "Release"} ] + python_version: "3.14" iphone_simulator: runs-on: macos-15 diff --git a/.github/workflows/macos-ci-build-and-test-workflow.yml b/.github/workflows/macos-ci-build-and-test-workflow.yml index 75002fdf12c00..76198c7f5c1ce 100644 --- a/.github/workflows/macos-ci-build-and-test-workflow.yml +++ b/.github/workflows/macos-ci-build-and-test-workflow.yml @@ -19,7 +19,7 @@ on: python_version: required: false type: string - default: "3.11" + default: "3.14" matrix_include: required: false type: string diff --git a/.github/workflows/windows_cuda.yml b/.github/workflows/windows_cuda.yml index 89ae03981ecef..7b93086fbb77d 100644 --- a/.github/workflows/windows_cuda.yml +++ b/.github/workflows/windows_cuda.yml @@ -32,7 +32,7 @@ jobs: - uses: 
actions/setup-python@v6 with: - python-version: '3.12' + python-version: '3.14' architecture: x64 - name: Locate vcvarsall and Setup Env @@ -173,7 +173,7 @@ jobs: - uses: actions/setup-python@v6 with: - python-version: '3.12' + python-version: '3.14' architecture: x64 - uses: actions/setup-node@v6 diff --git a/VERSION_NUMBER b/VERSION_NUMBER index e4a973f913f4f..ae96cc7310aaa 100644 --- a/VERSION_NUMBER +++ b/VERSION_NUMBER @@ -1 +1 @@ -1.24.2 +1.24.3 diff --git a/cmake/deps.txt b/cmake/deps.txt index 578dd8fd23d09..65c74060e8deb 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -46,7 +46,7 @@ protoc_linux_aarch64;https://github.com/protocolbuffers/protobuf/releases/downlo protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-osx-universal_binary.zip;23710c3d1c2036d8d65a6a22234372fa2d7af9ef psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013 pthreadpool;https://github.com/google/pthreadpool/archive/dcc9f28589066af0dbd4555579281230abbf74dd.zip;533a77943203ef15ca608bcd9dbe2c94da7451d2 -pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f780292da9db273c8ef06ccf5fd4b623624143e9 +pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v3.0.2.zip;a064e663b4d7a337ac291d1bef7337ef4e60a1ae pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/403d652dca4c1046e8145950b1c0997a9f748b57.zip;30b2a07fe4bae8574f89176e56274cacdd6d135b re2;https://github.com/google/re2/archive/refs/tags/2024-07-02.zip;646e1728269cde7fcef990bf4a8e87b047882e88 safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 79280c97a899e..ba14667bc3c88 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -6,7 +6,6 @@ onnxruntime_fetchcontent_declare( URL ${DEP_URL_pybind11} 
URL_HASH SHA1=${DEP_SHA1_pybind11} EXCLUDE_FROM_ALL - FIND_PACKAGE_ARGS 2.13 NAMES pybind11 + FIND_PACKAGE_ARGS 3.0 NAMES pybind11 ) onnxruntime_fetchcontent_makeavailable(pybind11_project) - diff --git a/cmake/vcpkg-ports/pybind11/portfile.cmake b/cmake/vcpkg-ports/pybind11/portfile.cmake index 2c63582d1ee15..4e4cd30a26df1 100644 --- a/cmake/vcpkg-ports/pybind11/portfile.cmake +++ b/cmake/vcpkg-ports/pybind11/portfile.cmake @@ -2,7 +2,8 @@ vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO pybind/pybind11 REF "v${VERSION}" - SHA512 497c25b33b09a9c42f67131ab82e35d689e8ce089dd7639be997305ff9a6d502447b79c824508c455d559e61f0186335b54dd2771d903a7c1621833930622d1a + # SHA512 for the zip (not tar.gz) file. + SHA512 786b1bf534ac67a8d5669f8babf67bb13e48b3a3da1b6344e43ae10a84b80bbc8fea5f12a65fd18739c341fefef5622c5dc096db964dff33cc62ea4259b2e2c1 HEAD_REF master ) diff --git a/cmake/vcpkg-ports/pybind11/vcpkg.json b/cmake/vcpkg-ports/pybind11/vcpkg.json index a730d32017885..058e2235fea08 100644 --- a/cmake/vcpkg-ports/pybind11/vcpkg.json +++ b/cmake/vcpkg-ports/pybind11/vcpkg.json @@ -1,6 +1,6 @@ { "name": "pybind11", - "version": "2.13.6", + "version": "3.0.2", "description": "pybind11 is a lightweight header-only library that exposes C++ types in Python and vice versa, mainly to create Python bindings of existing C++ code", "homepage": "https://github.com/pybind/pybind11", "license": "BSD-3-Clause", diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs index abe73b77f4071..a6b267c6802cf 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs @@ -892,99 +892,115 @@ internal class NativeLib /// On Windows, it explicitly loads the library with a lowercase .dll extension to handle /// case-sensitive filesystems. 
/// +#if NET5_0_OR_GREATER + [System.Diagnostics.CodeAnalysis.UnconditionalSuppressMessage("SingleFile", "IL3000:Avoid accessing Assembly file path when publishing as a single file", Justification = "We also check AppContext.BaseDirectory as a fallback")] +#endif private static IntPtr DllImportResolver(string libraryName, Assembly assembly, DllImportSearchPath? searchPath) { - if (libraryName == NativeLib.DllName || libraryName == OrtExtensionsNativeMethods.ExtensionsDllName) + try { - string mappedName = null; - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) - { - // Explicitly load with .dll extension to avoid issues where the OS might try .DLL - mappedName = libraryName + ".dll"; - } - else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) + if (libraryName == NativeLib.DllName || libraryName == OrtExtensionsNativeMethods.ExtensionsDllName) { - // Explicitly load with .so extension and lib prefix - mappedName = "lib" + libraryName + ".so"; - } - else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) - { - // Explicitly load with .dylib extension and lib prefix - mappedName = "lib" + libraryName + ".dylib"; - } - - if (mappedName != null) - { - // 1. Try default loading (name only) - if (NativeLibrary.TryLoad(mappedName, assembly, searchPath, out IntPtr handle)) + string mappedName = null; + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - return handle; + // Explicitly load with .dll extension to avoid issues where the OS might try .DLL + mappedName = libraryName + ".dll"; } - - // 2. 
Try relative to assembly location (look into runtimes subfolders) - string assemblyLocation = null; - try { assemblyLocation = assembly.Location; } catch { } - if (!string.IsNullOrEmpty(assemblyLocation)) + else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) + { + // Explicitly load with .so extension and lib prefix + mappedName = "lib" + libraryName + ".so"; + } + else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) { - string assemblyDir = System.IO.Path.GetDirectoryName(assemblyLocation); - string rid = RuntimeInformation.RuntimeIdentifier; + // Explicitly load with .dylib extension and lib prefix + mappedName = "lib" + libraryName + ".dylib"; + } - // Probe the specific RID first, then common fallbacks for the current OS - string[] ridsToTry; - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) - { - ridsToTry = new[] { rid, "win-x64", "win-arm64" }; - } - else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) - { - ridsToTry = new[] { rid, "linux-x64", "linux-arm64" }; - } - else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) - { - // We no longer provide osx-x64 in official package since 1.24. - // However, we keep it in the list for build-from-source users. - ridsToTry = new[] { rid, "osx-arm64", "osx-x64" }; - } - else + if (mappedName != null) + { + // 1. Try default loading (name only) + if (NativeLibrary.TryLoad(mappedName, assembly, searchPath, out IntPtr handle)) { - ridsToTry = new[] { rid }; + return handle; } - foreach (var tryRid in ridsToTry) + // 2. 
Try relative to assembly location (look into runtimes subfolders) + string assemblyLocation = null; + try { assemblyLocation = assembly.Location; } catch { } + if (!string.IsNullOrEmpty(assemblyLocation)) { - string probePath = System.IO.Path.Combine(assemblyDir, "runtimes", tryRid, "native", mappedName); - if (System.IO.File.Exists(probePath) && NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle)) + string assemblyDir = System.IO.Path.GetDirectoryName(assemblyLocation); + string rid = RuntimeInformation.RuntimeIdentifier; + + // Probe the specific RID first, then common fallbacks for the current OS + string[] ridsToTry; + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}"); - return handle; + ridsToTry = new[] { rid, "win-x64", "win-arm64" }; + } + else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) + { + ridsToTry = new[] { rid, "linux-x64", "linux-arm64" }; + } + else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) + { + // We no longer provide osx-x64 in official package since 1.24. + // However, we keep it in the list for build-from-source users. + ridsToTry = new[] { rid, "osx-arm64", "osx-x64" }; + } + else + { + ridsToTry = new[] { rid }; } - } - } - // 3. 
Try AppContext.BaseDirectory as a fallback - string baseDir = AppContext.BaseDirectory; - if (!string.IsNullOrEmpty(baseDir)) - { - string probePath = System.IO.Path.Combine(baseDir, mappedName); - if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle)) - { - LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}"); - return handle; + foreach (var tryRid in ridsToTry) + { + string probePath = System.IO.Path.Combine(assemblyDir, "runtimes", tryRid, "native", mappedName); + if (System.IO.File.Exists(probePath) && NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle)) + { + LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}"); + return handle; + } + } } - string rid = RuntimeInformation.RuntimeIdentifier; - probePath = System.IO.Path.Combine(baseDir, "runtimes", rid, "native", mappedName); - if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle)) + // 3. Try AppContext.BaseDirectory as a fallback + try { - LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}"); - return handle; + string baseDir = AppContext.BaseDirectory; + if (!string.IsNullOrEmpty(baseDir)) + { + string probePath = System.IO.Path.Combine(baseDir, mappedName); + if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle)) + { + LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}"); + return handle; + } + + string rid = RuntimeInformation.RuntimeIdentifier; + probePath = System.IO.Path.Combine(baseDir, "runtimes", rid, "native", mappedName); + if (NativeLibrary.TryLoad(probePath, assembly, searchPath, out handle)) + { + LogLibLoad($"[DllImportResolver] Loaded {mappedName} from: {probePath}"); + return handle; + } + } } - } + catch { } // Ignore AppDomainUnloadedException or similar from AppContext.BaseDirectory - LogLibLoad($"[DllImportResolver] Failed loading {mappedName} (RID: {RuntimeInformation.RuntimeIdentifier}, Assembly: {assemblyLocation})"); + 
LogLibLoad($"[DllImportResolver] Failed loading {mappedName} (RID: {RuntimeInformation.RuntimeIdentifier}, Assembly: {assemblyLocation})"); + } } } + catch (Exception ex) + { + // Unhandled exceptions inside DllImportResolver can result in TypeInitializationException. + // Log and swallow the error, returning IntPtr.Zero to fall back to default CLR logic. + try { System.Diagnostics.Trace.WriteLine($"[DllImportResolver] Exception during resolution: {ex}"); } catch { } + } // Fall back to default resolution return IntPtr.Zero; diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtEnvTests.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtEnvTests.cs index 94f8e927c1331..aa1b683acd668 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtEnvTests.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtEnvTests.cs @@ -489,4 +489,47 @@ void TestCopyTensors() } } } + + [Collection("Ort Inference Tests")] + public class OrtEnvDllImportResolverTest + { + [Fact(DisplayName = "TestDllImportResolverDoesNotThrow")] + public void TestDllImportResolverDoesNotThrow() + { + // The DllImportResolver is a private static method in NativeMethods. + var nativeMethodsType = typeof(OrtEnv).Assembly.GetType("Microsoft.ML.OnnxRuntime.NativeMethods"); + Assert.NotNull(nativeMethodsType); + + // It might not be defined on all platforms (defined when !NETSTANDARD2_0 && !__ANDROID__ && !__IOS__). + var resolverMethod = nativeMethodsType.GetMethod("DllImportResolver", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static); + + if (resolverMethod != null) + { + try + { + // Invoke with null assembly to force it into edge cases where assembly.Location would throw NullReferenceException. + // It should catch the exception and return IntPtr.Zero gracefully rather than throwing. 
+ var result = resolverMethod.Invoke(null, new object[] { "onnxruntime", null, null }); + + // If it reaches here without throwing TargetInvocationException, the try-catch in DllImportResolver works. + Assert.True(result is IntPtr); + } + catch (System.Reflection.TargetInvocationException ex) + { + // If NativeMethods..cctor() threw because the native library is missing, + // we will get a TypeInitializationException wrapping a DllNotFoundException (or DllImportException). + // This is acceptable locally. What we want to avoid is NullReferenceException from DllImportResolver. + if (ex.InnerException is TypeInitializationException typeInitEx) + { + Assert.IsNotType(typeInitEx.InnerException); + } + else + { + Assert.IsNotType(ex.InnerException); + throw; + } + } + } + } + } } diff --git a/docs/python/README.rst b/docs/python/README.rst index 6124ffe8d27b4..06f5b0ebf3094 100644 --- a/docs/python/README.rst +++ b/docs/python/README.rst @@ -8,6 +8,11 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime fileName; } else if (wasmPathOverride || wasmPrefixOverride) { // A callback function to locate the WebAssembly file. The function should return the full path of the file. // diff --git a/js/web/lib/wasm/wasm-utils-import.ts b/js/web/lib/wasm/wasm-utils-import.ts index e2e46bb37dcfc..6c899d1ae9cf5 100644 --- a/js/web/lib/wasm/wasm-utils-import.ts +++ b/js/web/lib/wasm/wasm-utils-import.ts @@ -272,7 +272,9 @@ export const importWasmModule = async ( } } else { // if the script source is available, we can check if it is from the same origin. - useEmbeddedModule = isSameOrigin(scriptSrc); + // Also use the embedded module when wasmBinary is provided and single-threaded (eg. Blob URL workers + // where isSameOrigin fails but no file resolution or worker spawning is needed). 
+ useEmbeddedModule = isSameOrigin(scriptSrc) || (isWasmOverridden && !isMultiThreaded); } } if (useEmbeddedModule) { diff --git a/js/web/package-lock.json b/js/web/package-lock.json index 5782c9b2bfcc3..01a6e497a9a75 100644 --- a/js/web/package-lock.json +++ b/js/web/package-lock.json @@ -1,12 +1,12 @@ { "name": "onnxruntime-web", - "version": "1.24.2", + "version": "1.24.3", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "onnxruntime-web", - "version": "1.24.2", + "version": "1.24.3", "license": "MIT", "dependencies": { "flatbuffers": "^25.1.24", @@ -50,7 +50,7 @@ }, "../common": { "name": "onnxruntime-common", - "version": "1.24.2", + "version": "1.24.3", "license": "MIT", "devDependencies": { "globby": "^15.0.0", diff --git a/js/web/package.json b/js/web/package.json index f0a2dcb410a2a..23d89ca57199f 100644 --- a/js/web/package.json +++ b/js/web/package.json @@ -7,7 +7,7 @@ "type": "git" }, "author": "fs-eire", - "version": "1.24.2", + "version": "1.24.3", "jsdelivr": "dist/ort.min.js", "dependencies": { "flatbuffers": "^25.1.24", diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py index 98096104ed304..ace3e4dd4bf46 100644 --- a/onnxruntime/__init__.py +++ b/onnxruntime/__init__.py @@ -10,7 +10,7 @@ import contextlib -__version__ = "1.24.2" +__version__ = "1.24.3" __author__ = "Microsoft" # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package). 
diff --git a/onnxruntime/contrib_ops/cpu/moe/moe_helper.h b/onnxruntime/contrib_ops/cpu/moe/moe_helper.h index 257c5a189b3bd..bd30418030dc2 100644 --- a/onnxruntime/contrib_ops/cpu/moe/moe_helper.h +++ b/onnxruntime/contrib_ops/cpu/moe/moe_helper.h @@ -35,44 +35,86 @@ struct MoEParameters { }; namespace moe_helper { +// Helper to check shape dimensions +#define ASSERT_SHAPE_DIMENSION(shape_ptr, dim, name) \ + if (shape_ptr != nullptr) { \ + if (shape_ptr->NumDimensions() != dim) { \ + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input '", name, \ + "' is expected to have ", dim, " dimensions, got ", \ + shape_ptr->NumDimensions()); \ + } \ + } + +#define ASSERT_SHAPE_3D(shape_ptr, name) ASSERT_SHAPE_DIMENSION(shape_ptr, 3, name) + +#define CHECK_SHAPE(shape_ptr, name, ...) \ + if (shape_ptr != nullptr) { \ + const TensorShape& expected_shape = make_shape(__VA_ARGS__); \ + if (*shape_ptr != expected_shape) { \ + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input '", name, \ + "' is expected to have shape ", expected_shape, \ + ", got ", *shape_ptr); \ + } \ + } + template Status CheckInputs(MoEParameters& parameters, - const Tensor* input, // required - const Tensor* router_probs, // required - const Tensor* fc1_experts_weights, // required - const Tensor* fc1_experts_bias, // optional - const Tensor* fc1_experts_scales, // required for qMoE; NULL for MOE - const Tensor* fc1_zero_points, // optional, for qMoE - const Tensor* fc2_experts_weights, // required - const Tensor* fc2_experts_bias, // optional - const Tensor* fc2_experts_scales, // required for qMoE; NULL for MOE - const Tensor* fc2_zero_points, // optional, for qMoE - const Tensor* fc3_experts_weights, // optional - const Tensor* fc3_experts_bias, // optional - const Tensor* fc3_experts_scales, // required for qMoE; NULL for MOE - const Tensor* fc3_zero_points, // optional, for qMoE - const int64_t pack_size, // number of weights packed together (like 2 for uint4 packed to uint8) + 
const Tensor* input, // required + const Tensor* router_probs, // required + const TensorShape* fc1_experts_weights_shape, // required + const Tensor* fc1_experts_bias, // optional + const Tensor* fc1_experts_scales, // required for qMoE; NULL for MOE + const Tensor* fc1_zero_points, // optional, for qMoE + const TensorShape* fc2_experts_weights_shape, // required + const Tensor* fc2_experts_bias, // optional + const Tensor* fc2_experts_scales, // required for qMoE; NULL for MOE + const Tensor* fc2_zero_points, // optional, for qMoE + const TensorShape* fc3_experts_weights_shape, // optional + const Tensor* fc3_experts_bias, // optional + const Tensor* fc3_experts_scales, // required for qMoE; NULL for MOE + const Tensor* fc3_zero_points, // optional, for qMoE + const int64_t pack_size, // number of weights packed together (like 2 for uint4 packed to uint8) const bool is_fused_swiglu, const int64_t block_size = 0) { // block size for block-wise quantization - // Check dimensions of input to avoid input_dims index out of range. CHECK_TENSOR_SHAPE will verify each tensor later. 
+ // Required inputs + if (input == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'input' is required."); + } ASSERT_TENSOR_2D_OR_3D(input); - ASSERT_TENSOR_3D(fc1_experts_weights); - ASSERT_TENSOR_3D(fc2_experts_weights); + + if (router_probs == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'router_probs' is required."); + } ASSERT_TENSOR_2D(router_probs); + if (fc1_experts_weights_shape == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'fc1_experts_weights' is required."); + } + ASSERT_SHAPE_3D(fc1_experts_weights_shape, "fc1_experts_weights"); + + if (fc2_experts_weights_shape == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'fc2_experts_weights' is required."); + } + ASSERT_SHAPE_3D(fc2_experts_weights_shape, "fc2_experts_weights"); + const auto& input_dims = input->Shape().GetDims(); const auto& router_probs_dims = router_probs->Shape().GetDims(); - const auto& fc1_experts_weights_dims = fc1_experts_weights->Shape().GetDims(); - const auto& fc2_experts_weights_dims = fc2_experts_weights->Shape().GetDims(); int64_t num_rows = input_dims.size() == 2 ? 
input_dims[0] : input_dims[0] * input_dims[1]; int64_t hidden_size = input_dims[input_dims.size() - 1]; - int64_t local_num_experts = fc1_experts_weights_dims[0]; int64_t num_experts = router_probs_dims[1]; - int64_t inter_size = (fc2_experts_weights_dims[1] * fc2_experts_weights_dims[2] * pack_size) / hidden_size; - const bool legacy_shape = (hidden_size != inter_size && fc2_experts_weights_dims[1] == inter_size) || - (hidden_size == inter_size && is_fused_swiglu && fc1_experts_weights_dims[1] == hidden_size); + int64_t local_num_experts = fc1_experts_weights_shape->GetDims()[0]; + + int64_t inter_size = (fc2_experts_weights_shape->GetDims()[1] * + fc2_experts_weights_shape->GetDims()[2] * pack_size) / + hidden_size; + + bool legacy_shape = false; + const auto& fc2_experts_weights_dims = fc2_experts_weights_shape->GetDims(); + const auto& fc1_experts_weights_dims = fc1_experts_weights_shape->GetDims(); + legacy_shape = (hidden_size != inter_size && fc2_experts_weights_dims[1] == inter_size) || + (hidden_size == inter_size && is_fused_swiglu && fc1_experts_weights_dims[1] == hidden_size); // Fused swiglu doubles the output dimension of FC1 since it fused two GEMMs into one. const int64_t fc1_inter_size = is_fused_swiglu ? (inter_size + inter_size) : inter_size; @@ -80,13 +122,13 @@ Status CheckInputs(MoEParameters& parameters, if (legacy_shape) { // legacy shape does not match column major memory layout. This is for backward compatibility. 
- CHECK_TENSOR_SHAPE(fc1_experts_weights, num_experts, hidden_size, fc1_inter_size / pack_size); - CHECK_TENSOR_SHAPE(fc2_experts_weights, num_experts, inter_size, hidden_size / pack_size); - CHECK_TENSOR_SHAPE(fc3_experts_weights, num_experts, hidden_size, inter_size / pack_size); + CHECK_SHAPE(fc1_experts_weights_shape, "fc1_experts_weights", num_experts, hidden_size, fc1_inter_size / pack_size); + CHECK_SHAPE(fc2_experts_weights_shape, "fc2_experts_weights", num_experts, inter_size, hidden_size / pack_size); + CHECK_SHAPE(fc3_experts_weights_shape, "fc3_experts_weights", num_experts, hidden_size, inter_size / pack_size); } else { - CHECK_TENSOR_SHAPE(fc1_experts_weights, num_experts, fc1_inter_size, hidden_size / pack_size); - CHECK_TENSOR_SHAPE(fc2_experts_weights, num_experts, hidden_size, inter_size / pack_size); - CHECK_TENSOR_SHAPE(fc3_experts_weights, num_experts, inter_size, hidden_size / pack_size); + CHECK_SHAPE(fc1_experts_weights_shape, "fc1_experts_weights", num_experts, fc1_inter_size, hidden_size / pack_size); + CHECK_SHAPE(fc2_experts_weights_shape, "fc2_experts_weights", num_experts, hidden_size, inter_size / pack_size); + CHECK_SHAPE(fc3_experts_weights_shape, "fc3_experts_weights", num_experts, inter_size, hidden_size / pack_size); } CHECK_TENSOR_SHAPE(router_probs, num_rows, num_experts); @@ -168,9 +210,11 @@ Status CheckInputs(MoEParameters& parameters, } } - if (fc3_experts_weights == nullptr) { + if (fc3_experts_weights_shape == nullptr) { + // If fc3 weights are not provided, ensure no other fc3 parameters are provided ORT_ENFORCE(fc3_experts_bias == nullptr && fc3_experts_scales == nullptr && fc3_zero_points == nullptr); } else { + // If fc3 weights are provided, ensure scales logic is consistent ORT_ENFORCE(fc1_experts_scales == nullptr || fc3_experts_scales != nullptr); // MOE no scale, or qMOE need scales } @@ -200,6 +244,36 @@ Status CheckInputs(MoEParameters& parameters, return Status::OK(); } +template +Status 
CheckInputs(MoEParameters& parameters, + const Tensor* input, // required + const Tensor* router_probs, // required + const Tensor* fc1_experts_weights, // required + const Tensor* fc1_experts_bias, // optional + const Tensor* fc1_experts_scales, // required for qMoE; NULL for MOE + const Tensor* fc1_zero_points, // optional, for qMoE + const Tensor* fc2_experts_weights, // required + const Tensor* fc2_experts_bias, // optional + const Tensor* fc2_experts_scales, // required for qMoE; NULL for MOE + const Tensor* fc2_zero_points, // optional, for qMoE + const Tensor* fc3_experts_weights, // optional + const Tensor* fc3_experts_bias, // optional + const Tensor* fc3_experts_scales, // required for qMoE; NULL for MOE + const Tensor* fc3_zero_points, // optional, for qMoE + const int64_t pack_size, // number of weights packed together (like 2 for uint4 packed to uint8) + const bool is_fused_swiglu, + const int64_t block_size = 0) { // block size for block-wise quantization + + const TensorShape* fc1_shape = (fc1_experts_weights != nullptr) ? &fc1_experts_weights->Shape() : nullptr; + const TensorShape* fc2_shape = (fc2_experts_weights != nullptr) ? &fc2_experts_weights->Shape() : nullptr; + const TensorShape* fc3_shape = (fc3_experts_weights != nullptr) ? 
&fc3_experts_weights->Shape() : nullptr; + + return CheckInputs(parameters, input, router_probs, fc1_shape, fc1_experts_bias, fc1_experts_scales, fc1_zero_points, + fc2_shape, fc2_experts_bias, fc2_experts_scales, fc2_zero_points, + fc3_shape, fc3_experts_bias, fc3_experts_scales, fc3_zero_points, + pack_size, is_fused_swiglu, block_size); +} + } // namespace moe_helper } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc index 6d1d191689466..81d2b0f8efdc6 100644 --- a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc +++ b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.cc @@ -13,6 +13,7 @@ #include "core/common/narrow.h" #include "core/framework/tensor_type_and_shape.h" #include "core/util/math.h" +#include "core/platform/env_var_utils.h" #include "contrib_ops/cpu/moe/moe_utils.h" #include "contrib_ops/cpu/moe/moe_helper.h" @@ -69,13 +70,13 @@ bool CanUseMlasQ4Gemm(int64_t expert_weight_bits, int64_t block_size, out_qtype = BlkQ4Sym64; } else if (block_size == 128) { out_qtype = BlkQ4Sym128; - } else if (block_size == 0) { + } else if (block_size == 0 || block_size == 32) { out_qtype = BlkQ4Sym; } else { return false; } - size_t expected_size = MlasQ4GemmPackBSize(out_qtype, static_cast(cols), static_cast(rows)); + size_t expected_size = MlasQ4GemmPackBSize(out_qtype, static_cast(rows), static_cast(cols)); return expected_size > 0; } @@ -84,6 +85,8 @@ bool CanUseMlasQ4Gemm(int64_t expert_weight_bits, int64_t block_size, namespace onnxruntime { namespace contrib { +constexpr const char* kUseMlasQ4GemmMoe = "ORT_USE_MLAS_Q4_GEMM_MOE"; + template void DequantizeBlockWithMlas(const uint8_t* quantized_data, const TScale* scales, @@ -118,13 +121,23 @@ Status ConvertToMlasQ4Format(const uint8_t* quantized_data, DequantizeBlockWithMlas(quantized_data, scales, zero_points, block_size, num_bits, rows, cols, temp_float, nullptr); - size_t 
packed_size = MlasQ4GemmPackBSize(qtype, static_cast(cols), static_cast(rows)); + // Transpose from N x K (weights) to K x N. + // DirectQ4Gemm expects weights to be packed in a specific layout ([K, N] logically) + auto transposed_float_buffer = IAllocator::MakeUniquePtr(allocator, static_cast(rows * cols)); + float* transposed_float = transposed_float_buffer.get(); + for (int64_t r = 0; r < rows; ++r) { + for (int64_t c = 0; c < cols; ++c) { + transposed_float[c * rows + r] = temp_float[r * cols + c]; + } + } + + size_t packed_size = MlasQ4GemmPackBSize(qtype, static_cast(rows), static_cast(cols)); if (packed_size == 0) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "MLAS Q4 packing not supported for this configuration"); } mlas_packed_buffer = IAllocator::MakeUniquePtr(allocator, packed_size); - MlasQ4GemmPackB(qtype, mlas_packed_buffer.get(), temp_float, static_cast(cols), static_cast(rows), static_cast(cols)); + MlasQ4GemmPackB(qtype, mlas_packed_buffer.get(), transposed_float, static_cast(rows), static_cast(cols), static_cast(rows)); return Status::OK(); } @@ -354,6 +367,257 @@ void DequantizeBlock(const uint8_t* quantized_data, DequantizeBlockWithMlas(quantized_data, scales, zero_points, block_size, num_bits, rows, cols, dequantized_data, thread_pool); } +template +void DequantizePrePacked(const uint8_t* prepacked_data, + const TScale* scales, + const uint8_t* zero_points, + int64_t block_size, + int64_t rows, + int64_t cols, + float* dequantized_data, + const gsl::span& scale_dims) { + // prepacked_data is [cols, rows] (transposed, unpacked) + // dequantized_data is [cols, rows] (transposed) + // scales, zero_points correspond to original [rows, cols] layout + + const float default_zp_4bit = 8.0f; + const int64_t blocks_per_row = (block_size > 0) ? 
((cols + block_size - 1) / block_size) : 1; + const int64_t zp_pack_size = 2; // Always 2 for 4-bit + + // Iterate over Columns (K) then Rows (N) because prepacked_data is [K, N] + for (int64_t c = 0; c < cols; ++c) { + for (int64_t r = 0; r < rows; ++r) { + uint8_t val = prepacked_data[c * rows + r]; + + int64_t block_idx = (block_size > 0) ? (c / block_size) : 0; + if (block_size > 0) block_idx = std::min(block_idx, blocks_per_row - 1); + + int64_t scale_idx; + if (scale_dims.size() == 3 && scale_dims[2] > 1) { // block-wise + scale_idx = r * blocks_per_row + block_idx; + } else { // per-channel + scale_idx = r; + } + + float scale = static_cast(scales[scale_idx]); + float zp = default_zp_4bit; + + if (zero_points != nullptr) { + int64_t zp_idx; + bool is_lower_nibble; + + if (scale_dims.size() == 3 && scale_dims[2] > 1) { // block-wise + int64_t zp_blocks_packed = (blocks_per_row + zp_pack_size - 1) / zp_pack_size; + zp_idx = r * zp_blocks_packed + block_idx / 2; + is_lower_nibble = (block_idx % 2 == 0); + } else { + zp_idx = r / 2; + is_lower_nibble = (r % 2 == 0); + } + + uint8_t packed_zp = zero_points[zp_idx]; + zp = is_lower_nibble ? static_cast(packed_zp & 0x0F) : static_cast(packed_zp >> 4); + } + + dequantized_data[c * rows + r] = scale * (static_cast(val) - zp); + } + } +} + +template +Status BuildDirectQ4PackedBCache(const uint8_t* prepacked_weights, + const TScale* scales_data, + int64_t num_experts, + int64_t rows, + int64_t cols, + int64_t block_size, + const gsl::span& scales_dims, + MLAS_BLK_QUANT_TYPE qtype, + AllocatorPtr allocator, + IAllocatorUniquePtr& packed_b) { + const size_t packed_size = MlasQ4GemmPackBSize(qtype, static_cast(rows), static_cast(cols)); + if (packed_size == 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Failed to compute MLAS Q4 packed size for cache"); + } + + const bool is_block_wise = (scales_dims.size() == 3 && scales_dims[2] > 1); + const int64_t scales_expert_stride = is_block_wise ? 
(rows * scales_dims[2]) : rows; + const size_t prepacked_expert_stride = static_cast(rows * cols); + const size_t total_packed_size = packed_size * static_cast(num_experts); + + packed_b = IAllocator::MakeUniquePtr(allocator, total_packed_size, true); + uint8_t* packed_b_ptr = static_cast(packed_b.get()); + + std::vector dequantized_transposed(static_cast(rows * cols)); + for (int64_t expert_idx = 0; expert_idx < num_experts; ++expert_idx) { + const uint8_t* expert_prepacked = prepacked_weights + static_cast(expert_idx) * prepacked_expert_stride; + const TScale* expert_scales = scales_data + expert_idx * scales_expert_stride; + + DequantizePrePacked(expert_prepacked, expert_scales, nullptr, block_size, rows, cols, + dequantized_transposed.data(), scales_dims); + + MlasQ4GemmPackB(qtype, packed_b_ptr + expert_idx * packed_size, dequantized_transposed.data(), + static_cast(rows), static_cast(cols), static_cast(rows)); + } + + return Status::OK(); +} + +template +Status QMoECPU::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, + /*out*/ bool& is_packed, + /*out*/ PrePackedWeights* prepacked_weights) { + is_packed = false; + + // If scales are prepacked, they are constant initializers. 
+ if (input_idx == 3) { + return Status::OK(); + } + if (input_idx == 6) { + return Status::OK(); + } + + // Only support PrePack for FC1 (2) and FC2 (5) weights + // and only if expert_weight_bits_ == 4 (since we unpack to uint8) + if (expert_weight_bits_ != 4) { + return Status::OK(); + } + + if (input_idx == 2 || input_idx == 5) { + const auto& shape = tensor.Shape(); + const int64_t num_experts = shape[0]; + const int64_t rows = shape[1]; + const int64_t cols_packed = shape[2]; + const int64_t cols = cols_packed * 2; + + size_t packed_size = static_cast(num_experts * rows * cols); + auto packed_buffer = IAllocator::MakeUniquePtr(alloc, packed_size, true); + uint8_t* dst_base = static_cast(packed_buffer.get()); + const uint8_t* src_base = static_cast(tensor.DataRaw()); + + for (int64_t i = 0; i < num_experts; ++i) { + const uint8_t* src = src_base + i * rows * cols_packed; + uint8_t* dst = dst_base + i * rows * cols; + + for (int64_t r = 0; r < rows; ++r) { + for (int64_t c = 0; c < cols; ++c) { + uint8_t packed_val = src[r * cols_packed + (c / 2)]; + uint8_t val = (c % 2 == 0) ? 
(packed_val & 0x0F) : (packed_val >> 4); + + dst[c * rows + r] = val; + } + } + } + + if (input_idx == 2) { + fc1_shape_ = shape; + } else if (input_idx == 5) { + fc2_shape_ = shape; + } + + if (prepacked_weights) { + prepacked_weights->buffers_.push_back(std::move(packed_buffer)); + prepacked_weights->buffer_sizes_.push_back(packed_size); + is_packed = true; + + // Pack Shape (Buffer 1) + auto dims = shape.GetDims(); + size_t rank_bytes = sizeof(int64_t); + size_t dims_bytes = dims.size() * sizeof(int64_t); + size_t shape_size = rank_bytes + dims_bytes; + + auto shape_buffer = IAllocator::MakeUniquePtr(alloc, shape_size); + int64_t* buffer_data = static_cast(shape_buffer.get()); + *buffer_data = static_cast(dims.size()); + memcpy(buffer_data + 1, dims.data(), dims_bytes); + + prepacked_weights->buffers_.push_back(std::move(shape_buffer)); + prepacked_weights->buffer_sizes_.push_back(shape_size); + + // Try build MLAS Q4 cache if scales are available + if (use_mlas_q4_gemm_) { + const Tensor* scales_tensor = nullptr; + MLAS_BLK_QUANT_TYPE qtype = BlkQ4Sym; + int scales_idx = -1; + int zp_idx = -1; + + if (input_idx == 2) { // FC1 + scales_idx = 3; + zp_idx = 11; + } else if (input_idx == 5) { // FC2 + scales_idx = 6; + zp_idx = 12; + } + + if (scales_idx != -1 && + (zp_idx >= static_cast(Info().node().InputDefs().size()) || !Info().node().InputDefs()[zp_idx]->Exists()) && + Info().TryGetConstantInput(scales_idx, &scales_tensor) && + scales_tensor != nullptr && + CanUseMlasQ4Gemm(expert_weight_bits_, block_size_, rows, cols, qtype)) { + IAllocatorUniquePtr cache_buffer; + const auto& scales_dims = scales_tensor->Shape().GetDims(); + const T* scales_data = scales_tensor->Data(); + // Use the simple packed buffer we just created (buffer 0) as input + const uint8_t* simple_packed = dst_base; + + if (BuildDirectQ4PackedBCache(simple_packed, scales_data, num_experts, rows, cols, + block_size_, scales_dims, qtype, + alloc, cache_buffer) + .IsOK()) { + // Store the MLAS Q4 
cache as buffer 2 (after unpacked weights and shape). + size_t cache_size = MlasQ4GemmPackBSize(qtype, static_cast(rows), static_cast(cols)) * static_cast(num_experts); + prepacked_weights->buffers_.push_back(std::move(cache_buffer)); + prepacked_weights->buffer_sizes_.push_back(cache_size); + } + } + } + } + } + + return Status::OK(); +} + +template +Status QMoECPU::UseSharedPrePackedBuffers_V2(std::vector& prepacked_buffers, + gsl::span /*prepacked_buffer_sizes*/, + int input_idx, + /*out*/ bool& used_shared_buffers) { + used_shared_buffers = false; + + if (expert_weight_bits_ != 4) { + return Status::OK(); + } + + if ((input_idx == 2 || input_idx == 5) && !prepacked_buffers.empty()) { + auto parse_shape = [&](TensorShape& shape) { + if (prepacked_buffers.size() > 1) { + int64_t* buffer_data = static_cast(prepacked_buffers[1].get()); + int64_t rank = buffer_data[0]; + std::vector dims(static_cast(rank)); + memcpy(dims.data(), buffer_data + 1, static_cast(rank) * sizeof(int64_t)); + shape = TensorShape(dims); + } + }; + + if (input_idx == 2) { + packed_fc1_ = std::move(prepacked_buffers[0]); + parse_shape(fc1_shape_); + if (prepacked_buffers.size() > 2) { + packed_fc1_mlas_cache_ = std::move(prepacked_buffers[2]); + } + } else if (input_idx == 5) { + packed_fc2_ = std::move(prepacked_buffers[0]); + parse_shape(fc2_shape_); + if (prepacked_buffers.size() > 2) { + packed_fc2_mlas_cache_ = std::move(prepacked_buffers[2]); + } + } + used_shared_buffers = true; + } + + return Status::OK(); +} + template QMoECPU::QMoECPU(const OpKernelInfo& op_kernel_info) : OpKernel(op_kernel_info), @@ -362,21 +626,32 @@ QMoECPU::QMoECPU(const OpKernelInfo& op_kernel_info) ORT_ENFORCE(expert_weight_bits_ == 4 || expert_weight_bits_ == 8, "Attribute 'expert_weight_bits' must be 4 or 8."); block_size_ = op_kernel_info.GetAttrOrDefault("block_size", 0); + ORT_ENFORCE(block_size_ >= 0); if (block_size_ > 0) { ORT_ENFORCE(block_size_ >= 16, "block_size must be >= 16 when provided."); 
ORT_ENFORCE((block_size_ & (block_size_ - 1)) == 0, "block_size must be a power of 2."); } + + const auto use_mlas_q4_gemm = ParseEnvironmentVariable(kUseMlasQ4GemmMoe); + if (use_mlas_q4_gemm.has_value()) { + use_mlas_q4_gemm_ = *use_mlas_q4_gemm; + use_mlas_q4_gemm_overridden_ = true; + } else { + // Default policy: enable fast path unless this run hits a known accuracy-loss configuration. + use_mlas_q4_gemm_ = true; + use_mlas_q4_gemm_overridden_ = false; + } } template Status QMoECPU::Compute(OpKernelContext* context) const { const auto* input = context->Input(0); const auto* router_probs = context->Input(1); - const auto* fc1_experts_weights = context->Input(2); + const auto* fc1_experts_weights = packed_fc1_ ? nullptr : context->Input(2); const auto* fc1_scales = context->Input(3); const auto* fc1_experts_bias = context->Input(4); - const auto* fc2_experts_weights = context->Input(5); + const auto* fc2_experts_weights = packed_fc2_ ? nullptr : context->Input(5); const auto* fc2_scales = context->Input(6); const auto* fc2_experts_bias = context->Input(7); const auto* fc3_experts_weights = context->Input(8); @@ -386,17 +661,21 @@ Status QMoECPU::Compute(OpKernelContext* context) const { const auto* fc2_zero_points = context->Input(12); const auto* fc3_zero_points = context->Input(13); + const TensorShape* fc1_shape_ptr = packed_fc1_ ? &fc1_shape_ : (fc1_experts_weights ? &fc1_experts_weights->Shape() : nullptr); + const TensorShape* fc2_shape_ptr = packed_fc2_ ? &fc2_shape_ : (fc2_experts_weights ? &fc2_experts_weights->Shape() : nullptr); + const TensorShape* fc3_shape_ptr = fc3_experts_weights ? 
&fc3_experts_weights->Shape() : nullptr; + MoEParameters moe_params; ORT_RETURN_IF_ERROR(moe_helper::CheckInputs( moe_params, input, router_probs, - fc1_experts_weights, fc1_experts_bias, fc1_scales, fc1_zero_points, - fc2_experts_weights, fc2_experts_bias, fc2_scales, fc2_zero_points, - fc3_experts_weights, fc3_experts_bias, fc3_scales, fc3_zero_points, + fc1_shape_ptr, fc1_experts_bias, fc1_scales, fc1_zero_points, + fc2_shape_ptr, fc2_experts_bias, fc2_scales, fc2_zero_points, + fc3_shape_ptr, fc3_experts_bias, fc3_scales, fc3_zero_points, expert_weight_bits_ == 4 ? 2 : 1, - true, + activation_type_ == ActivationType::SwiGLU, block_size_)); - if (fc3_experts_weights || fc3_experts_bias || fc3_scales || fc3_zero_points) { + if (fc3_shape_ptr || fc3_experts_bias || fc3_scales || fc3_zero_points) { return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "FC3 gating is not yet implemented on CPU for QMoE"); } @@ -559,8 +838,8 @@ Status QMoECPU::Compute(OpKernelContext* context) const { const bool is_fc1_block_wise = (fc1_scales_dims.size() == 3 && fc1_scales_dims[2] > 1); const bool is_fc2_block_wise = (fc2_scales_dims.size() == 3 && fc2_scales_dims[2] > 1); - const uint8_t* fc1_weights_data = fc1_experts_weights->Data(); - const uint8_t* fc2_weights_data = fc2_experts_weights->Data(); + const uint8_t* fc1_weights_data = (packed_fc1_ != nullptr) ? nullptr : fc1_experts_weights->template Data(); + const uint8_t* fc2_weights_data = (packed_fc2_ != nullptr) ? nullptr : fc2_experts_weights->template Data(); const T* fc1_scales_data = fc1_scales->Data(); const T* fc2_scales_data = fc2_scales->Data(); const T* fc1_bias_data = fc1_experts_bias ? fc1_experts_bias->Data() : nullptr; @@ -568,6 +847,13 @@ Status QMoECPU::Compute(OpKernelContext* context) const { const uint8_t* fc1_zp_data = fc1_zero_points ? fc1_zero_points->Data() : nullptr; const uint8_t* fc2_zp_data = fc2_zero_points ? 
fc2_zero_points->Data() : nullptr; + // Known loss-prone case from parity testing: 4-bit symmetric path (row-wise and block-wise). + const bool known_accuracy_loss_case = (expert_weight_bits_ == 4) && + (fc1_zp_data == nullptr) && (fc2_zp_data == nullptr); + const bool use_mlas_q4_gemm_effective = use_mlas_q4_gemm_overridden_ + ? use_mlas_q4_gemm_ + : (use_mlas_q4_gemm_ && !known_accuracy_loss_case); + const int64_t pack_unit = (8 / expert_weight_bits_); const int64_t fc1_packed_cols = (hidden_size + pack_unit - 1) / pack_unit; const int64_t fc2_packed_cols = (inter_size + pack_unit - 1) / pack_unit; @@ -595,6 +881,22 @@ Status QMoECPU::Compute(OpKernelContext* context) const { fc2_zp_expert_stride = (hidden_size + zp_pack_size - 1) / zp_pack_size; } + MLAS_BLK_QUANT_TYPE fc1_direct_qtype = BlkQ4Sym; + MLAS_BLK_QUANT_TYPE fc2_direct_qtype = BlkQ4Sym; + + // Use pre-packed MLAS cache if available + const void* fc1_direct_q4_cache_ptr = nullptr; + if (use_mlas_q4_gemm_effective && packed_fc1_mlas_cache_ && fc1_zp_data == nullptr && + CanUseMlasQ4Gemm(expert_weight_bits_, is_fc1_block_wise ? block_size_ : 0, fc1_out_features, hidden_size, fc1_direct_qtype)) { + fc1_direct_q4_cache_ptr = packed_fc1_mlas_cache_.get(); + } + + const void* fc2_direct_q4_cache_ptr = nullptr; + if (use_mlas_q4_gemm_effective && packed_fc2_mlas_cache_ && fc2_zp_data == nullptr && + CanUseMlasQ4Gemm(expert_weight_bits_, is_fc2_block_wise ? 
block_size_ : 0, hidden_size, inter_size, fc2_direct_qtype)) { + fc2_direct_q4_cache_ptr = packed_fc2_mlas_cache_.get(); + } + std::vector> expert_workload; size_t total_work = 0; @@ -634,6 +936,7 @@ Status QMoECPU::Compute(OpKernelContext* context) const { float* thread_bias2_buffer = thread_bias1_buffer + static_cast(fc1_out_features); for (int64_t expert_idx : expert_batch) { + bool fc2_bias_added_by_mlas = false; const auto& routes = expert_token_map[static_cast(expert_idx)]; if (routes.empty()) { continue; @@ -707,12 +1010,57 @@ Status QMoECPU::Compute(OpKernelContext* context) const { const size_t k = static_cast(hidden_size); MLAS_BLK_QUANT_TYPE q_type = BlkQ4Sym; // Initialize to default - // Direct Q4 GEMM only supports symmetric quantization, so we disable it if zero_points are provided. - bool use_direct_q4_gemm = (fc1_zp_data == nullptr) && - CanUseMlasQ4Gemm(expert_weight_bits_, is_fc1_block_wise ? block_size_ : 0, - fc1_out_features, hidden_size, q_type); - bool fc1_used_direct_q4 = false; - bool fc1_bias_handled_by_q4_gemm = false; + bool use_direct_q4_gemm = use_mlas_q4_gemm_effective && + ((fc1_direct_q4_cache_ptr != nullptr) || + ((packed_fc1_ == nullptr) && (fc1_zp_data == nullptr) && + CanUseMlasQ4Gemm(expert_weight_bits_, is_fc1_block_wise ? block_size_ : 0, + fc1_out_features, hidden_size, q_type))); + + if (packed_fc1_ != nullptr) { + if (use_mlas_q4_gemm_effective && fc1_zp_data == nullptr && + CanUseMlasQ4Gemm(expert_weight_bits_, is_fc1_block_wise ? 
block_size_ : 0, + fc1_out_features, hidden_size, q_type)) { + if (fc1_direct_q4_cache_ptr != nullptr) { + float* fc1_bias_float = nullptr; + if (has_fc1_bias) { + const T* B1_bias = fc1_bias_data + expert_idx * fc1_out_features; + if constexpr (std::is_same_v) { + MlasConvertHalfToFloatBuffer(reinterpret_cast(B1_bias), thread_bias1_buffer, static_cast(fc1_out_features)); + } else { + std::memcpy(thread_bias1_buffer, B1_bias, static_cast(fc1_out_features) * sizeof(float)); + } + fc1_bias_float = thread_bias1_buffer; + } + + size_t packed_size = MlasQ4GemmPackBSize(q_type, static_cast(fc1_out_features), static_cast(hidden_size)); + const uint8_t* packed_b = static_cast(fc1_direct_q4_cache_ptr) + expert_idx * packed_size; + + Status gemm_status = DirectQ4Gemm(A1, packed_b, fc1_bias_float, C1, + num_expert_tokens, fc1_out_features, hidden_size, fc1_direct_qtype, tp); + if (gemm_status.IsOK()) { + goto fc1_gemm_done; + } + } + } + + // Fallback: Dequantize from PrePacked (transposed, unpacked) -> MlasGemm + const uint8_t* current_packed_ptr = static_cast(packed_fc1_.get()) + expert_idx * fc1_out_features * hidden_size; + + DequantizePrePacked(current_packed_ptr, fc1_scales_ptr, fc1_zp_ptr, + is_fc1_block_wise ? 
block_size_ : 0, + fc1_out_features, hidden_size, + B1_dequant, fc1_scales_dims); + + // Use MlasGemm with B1_dequant (which is already float transposed) + MlasGemm(CblasNoTrans, CblasNoTrans, + m, n, k, + 1.0f, A1, k, + B1_dequant, n, + 0.0f, C1, n, + tp); + + goto fc1_bias_handling; + } if (use_direct_q4_gemm) { IAllocatorUniquePtr mlas_packed_fc1; @@ -730,12 +1078,10 @@ Status QMoECPU::Compute(OpKernelContext* context) const { if (convert_status.IsOK()) { float* fc1_bias_float = nullptr; - IAllocatorUniquePtr fc1_bias_buffer; if (has_fc1_bias) { const T* B1_bias = fc1_bias_data + expert_idx * fc1_out_features; - fc1_bias_buffer = IAllocator::MakeUniquePtr(allocator, static_cast(fc1_out_features)); - fc1_bias_float = fc1_bias_buffer.get(); + fc1_bias_float = thread_bias1_buffer; if constexpr (std::is_same_v) { MlasConvertHalfToFloatBuffer(reinterpret_cast(B1_bias), fc1_bias_float, static_cast(fc1_out_features)); @@ -750,7 +1096,6 @@ Status QMoECPU::Compute(OpKernelContext* context) const { num_expert_tokens, fc1_out_features, hidden_size, q_type, tp); if (gemm_status.IsOK()) { - fc1_used_direct_q4 = true; goto fc1_gemm_done; } } @@ -797,8 +1142,9 @@ Status QMoECPU::Compute(OpKernelContext* context) const { 0.0f, C1, n, tp); - fc1_bias_handled_by_q4_gemm = fc1_used_direct_q4 && has_fc1_bias; - if (has_fc1_bias && !fc1_bias_handled_by_q4_gemm) { + fc1_bias_handling: + + if (has_fc1_bias) { const T* B1_bias = fc1_bias_data + expert_idx * fc1_out_features; if constexpr (std::is_same_v) { MlasConvertHalfToFloatBuffer(reinterpret_cast(B1_bias), thread_bias1_buffer, static_cast(fc1_out_features)); @@ -837,22 +1183,30 @@ Status QMoECPU::Compute(OpKernelContext* context) const { fc1_gemm_done: - const int64_t activation_threshold = std::max(int64_t{4}, 256 / std::max(int64_t{1}, inter_size)); - if (num_expert_tokens >= activation_threshold && tp != nullptr) { - const int64_t activation_block_size = std::max(int64_t{1}, std::min(int64_t{64}, activation_threshold)); - const 
int64_t num_activation_blocks = (num_expert_tokens + activation_block_size - 1) / activation_block_size; - - if (num_activation_blocks > 1) { - concurrency::ThreadPool::TrySimpleParallelFor(tp, narrow(num_activation_blocks), [&](std::ptrdiff_t block_idx) { - const int64_t start_token = block_idx * activation_block_size; - const int64_t end_token = std::min(start_token + activation_block_size, num_expert_tokens); - - for (int64_t i = start_token; i < end_token; ++i) { + if (activation_type_ == ActivationType::SwiGLU) { + const int64_t activation_threshold = std::max(int64_t{4}, 256 / std::max(int64_t{1}, inter_size)); + if (num_expert_tokens >= activation_threshold && tp != nullptr) { + const int64_t activation_block_size = std::max(int64_t{1}, std::min(int64_t{64}, activation_threshold)); + const int64_t num_activation_blocks = (num_expert_tokens + activation_block_size - 1) / activation_block_size; + + if (num_activation_blocks > 1) { + concurrency::ThreadPool::TrySimpleParallelFor(tp, narrow(num_activation_blocks), [&](std::ptrdiff_t block_idx) { + const int64_t start_token = block_idx * activation_block_size; + const int64_t end_token = std::min(start_token + activation_block_size, num_expert_tokens); + + for (int64_t i = start_token; i < end_token; ++i) { + const float* C1_token = C1 + i * fc1_out_features; + float* A2_token = A2 + i * inter_size; + ApplySwiGLUActivation(C1_token, A2_token, inter_size, true, activation_alpha_, activation_beta_, swiglu_limit_); + } + }); + } else { + for (int64_t i = 0; i < num_expert_tokens; ++i) { const float* C1_token = C1 + i * fc1_out_features; float* A2_token = A2 + i * inter_size; ApplySwiGLUActivation(C1_token, A2_token, inter_size, true, activation_alpha_, activation_beta_, swiglu_limit_); } - }); + } } else { for (int64_t i = 0; i < num_expert_tokens; ++i) { const float* C1_token = C1 + i * fc1_out_features; @@ -861,11 +1215,8 @@ Status QMoECPU::Compute(OpKernelContext* context) const { } } } else { - for (int64_t i = 
0; i < num_expert_tokens; ++i) { - const float* C1_token = C1 + i * fc1_out_features; - float* A2_token = A2 + i * inter_size; - ApplySwiGLUActivation(C1_token, A2_token, inter_size, true, activation_alpha_, activation_beta_, swiglu_limit_); - } + ApplyActivationVectorized(C1, num_expert_tokens * fc1_out_features); + std::copy(C1, C1 + (num_expert_tokens * fc1_out_features), A2); } const T* fc2_scales_ptr; @@ -888,10 +1239,58 @@ Status QMoECPU::Compute(OpKernelContext* context) const { const size_t k2 = static_cast(inter_size); MLAS_BLK_QUANT_TYPE q_type2 = BlkQ4Sym; // Initialize to default - bool use_direct_q4_gemm_fc2 = (fc2_zp_data == nullptr) && - CanUseMlasQ4Gemm(expert_weight_bits_, is_fc2_block_wise ? block_size_ : 0, - hidden_size, inter_size, q_type2); - bool fc2_used_direct_q4 = false; + bool use_direct_q4_gemm_fc2 = use_mlas_q4_gemm_effective && + ((fc2_direct_q4_cache_ptr != nullptr) || + ((packed_fc2_ == nullptr) && (fc2_zp_data == nullptr) && + CanUseMlasQ4Gemm(expert_weight_bits_, is_fc2_block_wise ? block_size_ : 0, + hidden_size, inter_size, q_type2))); + + if (packed_fc2_ != nullptr) { + if (use_mlas_q4_gemm_effective && fc2_zp_data == nullptr && + CanUseMlasQ4Gemm(expert_weight_bits_, is_fc2_block_wise ? 
block_size_ : 0, + hidden_size, inter_size, q_type2)) { + if (fc2_direct_q4_cache_ptr != nullptr) { + float* fc2_bias_float = nullptr; + if (has_fc2_bias) { + const T* B2_bias = fc2_bias_data + expert_idx * hidden_size; + if constexpr (std::is_same_v) { + MlasConvertHalfToFloatBuffer(reinterpret_cast(B2_bias), thread_bias2_buffer, static_cast(hidden_size)); + } else { + std::memcpy(thread_bias2_buffer, B2_bias, static_cast(hidden_size) * sizeof(float)); + } + fc2_bias_float = thread_bias2_buffer; + } + + size_t packed_size = MlasQ4GemmPackBSize(q_type2, static_cast(hidden_size), static_cast(inter_size)); + const uint8_t* packed_b = static_cast(fc2_direct_q4_cache_ptr) + expert_idx * packed_size; + + Status gemm_status = DirectQ4Gemm(A2, packed_b, fc2_bias_float, C2, + num_expert_tokens, hidden_size, inter_size, fc2_direct_qtype, tp); + if (gemm_status.IsOK()) { + fc2_bias_added_by_mlas = true; + goto fc2_gemm_done; + } + } + } + + // Dequantize from PrePacked (transposed, unpacked) + const uint8_t* current_packed_ptr = static_cast(packed_fc2_.get()) + expert_idx * hidden_size * inter_size; + + DequantizePrePacked(current_packed_ptr, fc2_scales_ptr, fc2_zp_ptr, + is_fc2_block_wise ? 
block_size_ : 0, + hidden_size, inter_size, + B2_dequant, fc2_scales_dims); + + // Fallback + MlasGemm(CblasNoTrans, CblasNoTrans, + m2, n2, k2, + 1.0f, A2, k2, + B2_dequant, n2, + 0.0f, C2, n2, + tp); + + goto fc2_gemm_done; + } if (use_direct_q4_gemm_fc2) { IAllocatorUniquePtr mlas_packed_fc2; @@ -909,12 +1308,10 @@ Status QMoECPU::Compute(OpKernelContext* context) const { if (convert_status.IsOK()) { float* fc2_bias_float = nullptr; - IAllocatorUniquePtr fc2_bias_buffer; if (has_fc2_bias) { const T* B2_bias = fc2_bias_data + expert_idx * hidden_size; - fc2_bias_buffer = IAllocator::MakeUniquePtr(allocator, static_cast(hidden_size)); - fc2_bias_float = fc2_bias_buffer.get(); + fc2_bias_float = thread_bias2_buffer; if constexpr (std::is_same_v) { MlasConvertHalfToFloatBuffer(reinterpret_cast(B2_bias), fc2_bias_float, static_cast(hidden_size)); @@ -929,7 +1326,7 @@ Status QMoECPU::Compute(OpKernelContext* context) const { num_expert_tokens, hidden_size, inter_size, q_type2, tp); if (gemm_status.IsOK()) { - fc2_used_direct_q4 = true; + fc2_bias_added_by_mlas = true; goto fc2_gemm_done; } } @@ -979,8 +1376,7 @@ Status QMoECPU::Compute(OpKernelContext* context) const { fc2_gemm_done: - bool fc2_bias_handled_by_q4_gemm = fc2_used_direct_q4 && has_fc2_bias; - if (has_fc2_bias && !fc2_bias_handled_by_q4_gemm) { + if (has_fc2_bias && !fc2_bias_added_by_mlas) { const T* B2_bias = fc2_bias_data + expert_idx * hidden_size; if constexpr (std::is_same_v) { MlasConvertHalfToFloatBuffer(reinterpret_cast(B2_bias), thread_bias2_buffer, static_cast(hidden_size)); @@ -1015,7 +1411,7 @@ Status QMoECPU::Compute(OpKernelContext* context) const { float* dest = thread_local_outputs + static_cast(thread_id) * output_buffer_size + buffer_offset; const float* src = C2 + i * hidden_size; - if (has_fc2_bias && !fc2_bias_handled_by_q4_gemm) { + if (has_fc2_bias && !fc2_bias_added_by_mlas) { const size_t unroll_factor = narrow(GetUnrollFactor(hidden_size)); size_t j = 0; for (; j + 
unroll_factor <= narrow(hidden_size); j += unroll_factor) { @@ -1109,10 +1505,22 @@ Status QMoECPU::Compute(OpKernelContext* context) const { return Status::OK(); } +template +void QMoECPU::ApplyActivationVectorized(float* data, int64_t size) const { + for (int64_t i = 0; i < size; ++i) { + data[i] = ApplyActivation(data[i], activation_type_); + } +} + template QMoECPU::QMoECPU(const OpKernelInfo& op_kernel_info); + template Status QMoECPU::Compute(OpKernelContext* context) const; +template Status QMoECPU::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed, PrePackedWeights* prepacked_weights); +template Status QMoECPU::UseSharedPrePackedBuffers_V2(std::vector& prepacked_buffers, gsl::span prepacked_buffer_sizes, int input_idx, bool& used_shared_buffers); template QMoECPU::QMoECPU(const OpKernelInfo& op_kernel_info); template Status QMoECPU::Compute(OpKernelContext* context) const; +template Status QMoECPU::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed, PrePackedWeights* prepacked_weights); +template Status QMoECPU::UseSharedPrePackedBuffers_V2(std::vector& prepacked_buffers, gsl::span prepacked_buffer_sizes, int input_idx, bool& used_shared_buffers); // Kernel Registration ONNX_OPERATOR_TYPED_KERNEL_EX( diff --git a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.h b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.h index 890580e051a8e..f678a27190c90 100644 --- a/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.h +++ b/onnxruntime/contrib_ops/cpu/moe/moe_quantization_cpu.h @@ -5,7 +5,9 @@ #include "core/common/common.h" #include "core/framework/op_kernel.h" +#include "core/mlas/inc/mlas_q4.h" #include "contrib_ops/cpu/moe/moe_base_cpu.h" +#include namespace onnxruntime { namespace contrib { @@ -26,8 +28,30 @@ class QMoECPU final : public OpKernel, public MoEBaseCPU { Status Compute(OpKernelContext* context) const override; private: + Status PrePack(const Tensor& tensor, int input_idx, 
AllocatorPtr alloc, + /*out*/ bool& is_packed, + /*out*/ PrePackedWeights* prepacked_weights) override; + + Status UseSharedPrePackedBuffers_V2(std::vector& prepacked_buffers, + gsl::span prepacked_buffer_sizes, + int input_idx, + /*out*/ bool& used_shared_buffers) override; + + void ApplyActivationVectorized(float* data, int64_t size) const; + int64_t expert_weight_bits_; int64_t block_size_; + bool use_mlas_q4_gemm_{false}; + bool use_mlas_q4_gemm_overridden_{false}; + + IAllocatorUniquePtr packed_fc1_; + IAllocatorUniquePtr packed_fc2_; + + TensorShape fc1_shape_; + TensorShape fc2_shape_; + + IAllocatorUniquePtr packed_fc1_mlas_cache_; + IAllocatorUniquePtr packed_fc2_mlas_cache_; }; } // namespace contrib diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index 53e95bd8c5627..cc93799059f43 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -359,9 +359,9 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou OpKernel::Info().TryGetConstantInput(InputIndex::scales, &scales); if (scales && MlasQNBitGemmScalesPacked(K_, nbits_, block_size_, compute_type_, has_zp_input_)) { auto sptr = scales->Data(); - auto tensor_size = static_cast(tensor.Shape().Size()); - auto ptr = IAllocator::MakeUniquePtr(alloc, tensor_size, true); - MlasConvertHalfToFloatBuffer(sptr, ptr.get(), tensor_size); + auto scales_size = static_cast(scales->Shape().Size()); + auto ptr = IAllocator::MakeUniquePtr(alloc, scales_size, true); + MlasConvertHalfToFloatBuffer(sptr, ptr.get(), scales_size); scales_fp32_ = std::move(ptr); } diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc index afea9f62419fa..00711e416e4e3 100644 --- a/onnxruntime/core/common/cpuid_info.cc +++ b/onnxruntime/core/common/cpuid_info.cc @@ -366,7 +366,11 @@ CPUIDInfo::CPUIDInfo() { #endif // 
defined(CPUINFO_SUPPORTED) // Note: This should be run after cpuinfo initialization if cpuinfo is enabled. + // On Wasm/Emscripten, cpuinfo cannot detect the CPU vendor so skip to avoid + // an unhelpful "Unknown CPU vendor" warning. +#if !defined(__wasm__) VendorInfoInit(); +#endif #ifdef CPUIDINFO_ARCH_X86 X86Init(); diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h index ca9315c7ef95d..be301019df5c0 100644 --- a/onnxruntime/core/common/cpuid_info.h +++ b/onnxruntime/core/common/cpuid_info.h @@ -168,7 +168,7 @@ class CPUIDInfo { bool has_arm_sme2_{false}; std::string vendor_; - uint32_t vendor_id_; + uint32_t vendor_id_{0}; }; } // namespace onnxruntime diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc index a656abb098911..7648aaf8f9d33 100644 --- a/onnxruntime/core/framework/allocator.cc +++ b/onnxruntime/core/framework/allocator.cc @@ -237,7 +237,19 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::AMD, device_id), mem_type1); } else if (strcmp(name1, onnxruntime::WEBGPU_BUFFER) == 0 || - strcmp(name1, onnxruntime::WEBNN_TENSOR) == 0) { + strcmp(name1, onnxruntime::WEBNN_TENSOR) == 0 || + // PR #27207 (merged to main/1.25.x, not in 1.24.x) shortened the WebGPU/WebNN + // memory info names from "WebGPU_Buffer"/"WebNN_Tensor" to "WebGPU_Buf"/"WebNN_Ten" + // to enable Small String Optimization (SSO) on wasm32 (emscripten), where strings + // must be <= 10 chars for SSO. + // + // A WebGPU/WebNN plugin EP built against 1.25.x will use the new short names. + // Accept both old and new names here so that plugin EPs targeting either 1.24.x + // or 1.25.x can work with this 1.24.x runtime. 
+ // + // See: https://github.com/microsoft/onnxruntime/pull/27207 + strcmp(name1, "WebGPU_Buf") == 0 || + strcmp(name1, "WebNN_Ten") == 0) { *out = new OrtMemoryInfo( name1, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NONE, device_id), diff --git a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc index 38dd8de01147c..5137c22d6cf61 100644 --- a/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc +++ b/onnxruntime/core/framework/debug_node_inputs_outputs_utils.cc @@ -621,8 +621,8 @@ void DumpNodeInputs( std::cout << " is non-tensor type.\n"; } } else { - // this could happen with an empty Optional input - std::cout << " was missing data type\n"; + // this could happen with an empty Optional input or the tensor is removed after pre-packing. + std::cout << " was missing data type (maybe pre-packed).\n"; } } else { std::cout << "Input " << i << " is optional and was not provided.\n"; diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 961012536126b..e4c7830ffbb55 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -371,6 +371,21 @@ Status ValidateExternalDataPath(const std::filesystem::path& base_dir, "External data path: ", location, " (resolved path: ", resolved, ") escapes model directory: ", base_dir); } + } else { + // The basedir is empty, which occurs when 1) the session loads a model from bytes and 2) the application does not + // set an external file folder path via the session config option + // `kOrtSessionOptionsModelExternalInitializersFileFolderPath`. + + // We conservatively check that the normalized relative path does not contain ".." path components that would allow + // access to arbitrary files outside of the current working directory. Based on ONNX checker validation. 
+ auto norm_location = location.lexically_normal(); + + for (const auto& path_component : norm_location) { + if (path_component == ORT_TSTR("..")) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "External data path: ", location, + " (model loaded from bytes) escapes working directory"); + } + } } return Status::OK(); } diff --git a/onnxruntime/core/mlas/inc/mlas_q4.h b/onnxruntime/core/mlas/inc/mlas_q4.h index 69f0435615079..d60e5b0164fe8 100644 --- a/onnxruntime/core/mlas/inc/mlas_q4.h +++ b/onnxruntime/core/mlas/inc/mlas_q4.h @@ -57,10 +57,10 @@ MlasQ4GemmPackBSize( * * @param QType type of block quantization * @param PackedBuf destination buffer - * @param FpData the pointer to fp32 matrix - * @param N the number of columns of matrix B. - * @param K the number of rows of matrix B. - * @param ldb leading dimension of B + * @param FpData the pointer to fp32 matrix, with shape [K, N]. + * @param N the number of columns of matrix B (Output Channels). + * @param K the number of rows of matrix B (Input Channels). 
+ * @param ldb leading dimension of FpData (usually N) */ void MLASCALL diff --git a/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp b/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp index a406f371a3bd2..06f9d97b872c7 100644 --- a/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp +++ b/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp @@ -19,8 +19,12 @@ #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h" #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa.h" #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa.h" + +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa.h" +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h" #if defined(ENABLE_QMX_KERNELS) #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa.h" +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa.h" #endif // ENABLE_QMX_KERNELS const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod = @@ -125,6 +129,32 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_sme2 = kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa, kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa}; +const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel qgemm_gemm_sme = + {kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + 
kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_get_dst_size_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa, + kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa}; + +const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel qgemm_gemm_sme2 = + {kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_get_dst_size_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa, + kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa}; + #if defined(ENABLE_QMX_KERNELS) const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_qmx = {kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa, @@ -138,6 +168,19 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_qmx = kai_get_dst_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa, kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa, 
kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa}; + +const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel qgemm_gemm_qmx = + {kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_dst_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_get_dst_size_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa, + kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa}; #endif // ENABLE_QMX_KERNELS const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemmUKernel() { @@ -181,3 +224,21 @@ const kai_matmul_clamp_f32_f32_f32p_ukernel& GetKleidiAISGemvUKernel() { return sgemm_gemv_sme; } } + +const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel& GetKleidiAIQGemmUKernel() { + if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2()) { + return qgemm_gemm_sme2; + } else { +#if defined(ENABLE_QMX_KERNELS) + if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0) + { + KLEIDIAI_KERNEL_LOG("QGEMM: Using QMX Kernel"); + return qgemm_gemm_qmx; + } else { + return qgemm_gemm_sme; + } +#else + return qgemm_gemm_sme; +#endif // ENABLE_QMX_KERNELS + } +} diff --git a/onnxruntime/core/mlas/lib/kai_ukernel_interface.h b/onnxruntime/core/mlas/lib/kai_ukernel_interface.h index e69c72329d64b..7bd8959b0b5bd 100644 --- a/onnxruntime/core/mlas/lib/kai_ukernel_interface.h +++ b/onnxruntime/core/mlas/lib/kai_ukernel_interface.h @@ -12,8 +12,12 @@ #include 
"kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p_interface.h" +#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp_qsi8cxp_interface.h" + const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemmUKernel(); const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemvUKernel(); const kai_matmul_clamp_f32_f32p_f32p_ukernel& GetKleidiAISGemmUKernel(); const kai_matmul_clamp_f32_f32_f32p_ukernel& GetKleidiAISGemvUKernel(); + +const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel& GetKleidiAIQGemmUKernel(); \ No newline at end of file diff --git a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h index d4df09bb94a93..4c088e8660874 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h +++ b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h @@ -107,14 +107,14 @@ MlasGemmBatch( size_t MLASCALL -MlasDynamicQgemmPackBSize( +MlasDynamicQGemmPackBSize( size_t N, size_t K ); void MLASCALL -MlasDynamicQgemmPackB( +MlasDynamicQGemmPackB( size_t N, size_t K, const int8_t* B, diff --git a/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp index 9b2deac69ff63..b6a23735bd131 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp @@ -10,9 +10,7 @@ #include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h" #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon.h" -#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h" -#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa.h" -#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.h" +#include "kai_ukernel_interface.h" 
#if defined(ENABLE_QMX_KERNELS) #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa.h" #endif // ENABLE_QMX_KERNELS @@ -26,11 +24,13 @@ struct KaiTlsBuffersQgemm { }; static thread_local KaiTlsBuffersQgemm g_kai_tls_qgemm; +const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel qgemm_gemm = GetKleidiAIQGemmUKernel(); + // Matmul with float output of dynamic-quantized A and symmetric-quantized B. size_t MLASCALL -ArmKleidiAI::MlasDynamicQgemmPackBSize( +ArmKleidiAI::MlasDynamicQGemmPackBSize( size_t N, size_t K ) { @@ -39,10 +39,9 @@ ArmKleidiAI::MlasDynamicQgemmPackBSize( return 0; } - // Default to sme2_mopa, but this may not always be the most optimal kernel variant to use. - auto nr = kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(); - auto kr = kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(); - auto sr = kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(); + auto nr = qgemm_gemm.get_nr(); + auto kr = qgemm_gemm.get_kr(); + auto sr = qgemm_gemm.get_sr(); // Regardless of kernel variant, use the NEON packing variant. KLEIDIAI_KERNEL_LOG("kai_run_rhs_pack_kxn_qsi8cxp_qsi8cx_neon Groups=1" @@ -52,7 +51,7 @@ ArmKleidiAI::MlasDynamicQgemmPackBSize( void MLASCALL -ArmKleidiAI::MlasDynamicQgemmPackB( +ArmKleidiAI::MlasDynamicQGemmPackB( size_t N, size_t K, const int8_t* B, @@ -65,10 +64,9 @@ ArmKleidiAI::MlasDynamicQgemmPackB( return; } - // Default to sme2_mopa, but this may not always be the most optimal kernel variant to use. 
- auto nr = kai_get_nr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(); - auto kr = kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(); - auto sr = kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(); + auto nr = qgemm_gemm.get_nr(); + auto kr = qgemm_gemm.get_kr(); + auto sr = qgemm_gemm.get_sr(); // y - float output // scale_factor_lhs - lhs scaling factor @@ -105,17 +103,12 @@ ArmKleidiAI::MlasDynamicQGemmBatch( MLAS_THREADPOOL* ThreadPool ) { - const size_t mr = UseSME2 ? kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa() - : kai_get_mr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(); - const size_t kr = UseSME2 ? kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa() - : kai_get_kr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(); - const size_t sr = UseSME2 ? kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa() - : kai_get_sr_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(); + const size_t mr = qgemm_gemm.get_mr(); + const size_t kr = qgemm_gemm.get_kr(); + const size_t sr = qgemm_gemm.get_sr(); - size_t m_step = UseSME2 ? kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa() - : kai_get_m_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(); - size_t n_step = UseSME2 ? kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa() - : kai_get_n_step_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(); + size_t m_step = qgemm_gemm.get_m_step(); + size_t n_step = qgemm_gemm.get_n_step(); if (BatchSize == 0 || Shape.M == 0 || Shape.N == 0 || Shape.K == 0) { return; @@ -216,17 +209,13 @@ ArmKleidiAI::MlasDynamicQGemmBatch( ptrdiff_t NIdx = (tid % (dim[1] * dim[2])) % dim[2]; // Get rhs tile, B - const size_t rhs_packed_offset = - UseSME2 ? 
kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(NIdx * n_step, Shape.K) - : kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(NIdx * n_step, Shape.K); + const size_t rhs_packed_offset = qgemm_gemm.get_rhs_packed_offset(NIdx * n_step, Shape.K); const std::byte* B_base = reinterpret_cast(DataParams[BIdx].PackedB); auto BTile = reinterpret_cast(B_base + rhs_packed_offset); // Get lhs tile, A - const size_t lhs_packed_offset = - UseSME2 ? kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(MIdx * m_step, Shape.K) - : kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(MIdx * m_step, Shape.K); + const size_t lhs_packed_offset =qgemm_gemm.get_lhs_packed_offset(MIdx * m_step, Shape.K); const std::byte* A_base = tls_lhs_base[BIdx]; // LhsPackedData + LhsPackedStride * BIdx; OR DataParams[batch_idx].Workspace; auto ATile = reinterpret_cast(A_base + lhs_packed_offset); @@ -240,46 +229,12 @@ ArmKleidiAI::MlasDynamicQGemmBatch( NIdx * n_step * sizeof(float) ); - if (UseSME2) { - kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa( + qgemm_gemm.run_matmul( TileSizeM, TileSizeN, Shape.K, ATile, BTile, dst_tile, DataParams[BIdx].ldc * sizeof(float), sizeof(float), -std::numeric_limits::max(), std::numeric_limits::max() ); - } - else { - #if defined(ENABLE_QMX_KERNELS) - if(ArmKleidiAI::vendor_name.compare("Qualcomm") == 0) - { - kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa( - TileSizeM, TileSizeN, Shape.K, ATile, BTile, - dst_tile, - DataParams[BIdx].ldc * sizeof(float), - sizeof(float), - -std::numeric_limits::max(), std::numeric_limits::max() - ); - } - else - { - kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa( - TileSizeM, TileSizeN, Shape.K, ATile, BTile, - dst_tile, - DataParams[BIdx].ldc * sizeof(float), - sizeof(float), - -std::numeric_limits::max(), 
std::numeric_limits::max() - ); - } - #else - kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa( - TileSizeM, TileSizeN, Shape.K, ATile, BTile, - dst_tile, - DataParams[BIdx].ldc * sizeof(float), - sizeof(float), - -std::numeric_limits::max(), std::numeric_limits::max() - ); - #endif // ENABLE_QMX_KERNELS - } }); } diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index e75ca3dc90e60..ac7528853c596 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -851,17 +851,9 @@ bool MLAS_THREADPOOL* ThreadPool ); -typedef void (MLASCALL MLAS_GEMM_BATCH)( - CBLAS_TRANSPOSE TransA, - CBLAS_TRANSPOSE TransB, - size_t M, - size_t N, - size_t K, - const MLAS_SGEMM_DATA_PARAMS* Data, - size_t BatchSize, - MLAS_THREADPOOL* ThreadPool); - -typedef bool (MLASCALL MLAS_GEMM_BATCH_OVERRIDE)( +typedef +bool +(MLASCALL MLAS_SGEMM_BATCH_OVERRIDE)( CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, size_t M, @@ -871,19 +863,17 @@ typedef bool (MLASCALL MLAS_GEMM_BATCH_OVERRIDE)( size_t BatchSize, MLAS_THREADPOOL* ThreadPool); -typedef size_t (MLASCALL MLAS_GEMM_PACK_B_SIZE)( - CBLAS_TRANSPOSE TransA, - CBLAS_TRANSPOSE TransB, - size_t N, - size_t K); - -typedef size_t (MLASCALL MLAS_GEMM_PACK_B_SIZE_OVERRIDE)( +typedef +size_t +(MLASCALL MLAS_SGEMM_PACK_B_SIZE_OVERRIDE)( CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, size_t N, size_t K); -typedef void (MLASCALL MLAS_GEMM_PACK_B)( +typedef +bool +(MLASCALL MLAS_SGEMM_PACK_B_OVERRIDE)( CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, size_t N, @@ -892,13 +882,28 @@ typedef void (MLASCALL MLAS_GEMM_PACK_B)( size_t ldb, void* PackedB); -typedef bool (MLASCALL MLAS_GEMM_PACK_B_OVERRIDE)( - CBLAS_TRANSPOSE TransA, - CBLAS_TRANSPOSE TransB, +typedef +void +(MLASCALL MLAS_DYNAMIC_QGEMM_BATCH_OVERRIDE)( + const MLAS_GEMM_DYN_QUANT_SHAPE_PARAMS& Shape, + const MLAS_GEMM_DYN_QUANT_DATA_PARAMS* DataParams, + const size_t BatchN, + MLAS_THREADPOOL* ThreadPool); + 
+typedef +size_t +(MLASCALL MLAS_DYNAMIC_QGEMM_PACK_B_SIZE_OVERRIDE)( + size_t N, + size_t K); + +typedef +void +(MLASCALL MLAS_DYNAMIC_QGEMM_PACK_B_OVERRIDE)( size_t N, size_t K, - const float* B, - size_t ldb, + const int8_t* B, + const float* Scales, + const float* Bias, void* PackedB); extern "C" { @@ -1333,10 +1338,15 @@ struct MLAS_PLATFORM { bool Avx512Supported_ = false; bool ArmNeonIsQuantActivationsUnsigned = false; - // Mlas overrides initialisation - MLAS_GEMM_BATCH_OVERRIDE* MlasGemmBatchOverride = nullptr; - MLAS_GEMM_PACK_B_SIZE_OVERRIDE* MlasGemmPackBSizeOverride = nullptr; - MLAS_GEMM_PACK_B_OVERRIDE* MlasGemmPackBOverride = nullptr; + // MLAS SGemm overrides + MLAS_SGEMM_BATCH_OVERRIDE* MlasSGemmBatchOverride = nullptr; + MLAS_SGEMM_PACK_B_SIZE_OVERRIDE* MlasSGemmPackBSizeOverride = nullptr; + MLAS_SGEMM_PACK_B_OVERRIDE* MlasSGemmPackBOverride = nullptr; + // MLAS Dynamic QGemm overrides + MLAS_DYNAMIC_QGEMM_BATCH_OVERRIDE* MlasDynamicQGemmBatchOverride = nullptr; + MLAS_DYNAMIC_QGEMM_PACK_B_SIZE_OVERRIDE* MlasDynamicQGemmPackBSizeOverride = nullptr; + MLAS_DYNAMIC_QGEMM_PACK_B_OVERRIDE* MlasDynamicQGemmPackBOverride = nullptr; + // MLAS Conv overrides MLAS_CONV_PREPARE_FLOAT_OVERRIDE* MlasConvPrepareOverride = nullptr; MLAS_CONV_FLOAT_OVERRIDE* MlasConvOverride = nullptr; diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index b913b1c3b8c26..6ebd6be068b12 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -610,9 +610,12 @@ Return Value: #if defined(USE_KLEIDIAI) if(MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()){ - this->MlasGemmBatchOverride = ArmKleidiAI::MlasGemmBatch; - this->MlasGemmPackBSizeOverride = ArmKleidiAI::MlasGemmPackBSize; - this->MlasGemmPackBOverride = ArmKleidiAI::MlasGemmPackB; + this->MlasSGemmBatchOverride = ArmKleidiAI::MlasGemmBatch; + this->MlasSGemmPackBSizeOverride = ArmKleidiAI::MlasGemmPackBSize; + this->MlasSGemmPackBOverride = 
ArmKleidiAI::MlasGemmPackB; + this->MlasDynamicQGemmBatchOverride = ArmKleidiAI::MlasDynamicQGemmBatch; + this->MlasDynamicQGemmPackBSizeOverride = ArmKleidiAI::MlasDynamicQGemmPackBSize; + this->MlasDynamicQGemmPackBOverride = ArmKleidiAI::MlasDynamicQGemmPackB; this->MlasConvPrepareOverride = ArmKleidiAI::MlasConvPrepare; this->MlasConvOverride = ArmKleidiAI::MlasConv; } diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp index 3fc69a607de3f..5f6a8f8394470 100644 --- a/onnxruntime/core/mlas/lib/qgemm.cpp +++ b/onnxruntime/core/mlas/lib/qgemm.cpp @@ -224,7 +224,9 @@ MlasDynamicQGemmBatch ( #if defined(USE_KLEIDIAI) //No fallback - ArmKleidiAI::MlasDynamicQGemmBatch(Shape, DataParams, BatchN, ThreadPool); + if (GetMlasPlatform().MlasDynamicQGemmBatchOverride != nullptr) { + GetMlasPlatform().MlasDynamicQGemmBatchOverride(Shape, DataParams, BatchN, ThreadPool); + } #endif MLAS_UNREFERENCED_PARAMETER(Shape); @@ -348,8 +350,9 @@ MlasDynamicQgemmPackBSize( size_t bytes = 0; #if defined(USE_KLEIDIAI) //No fallback available - //TODO: Insert Override - bytes = ArmKleidiAI::MlasDynamicQgemmPackBSize(N, K); + if (GetMlasPlatform().MlasDynamicQGemmPackBSizeOverride != nullptr) { + bytes = GetMlasPlatform().MlasDynamicQGemmPackBSizeOverride(N, K); + } #endif MLAS_UNREFERENCED_PARAMETER(N); @@ -442,7 +445,9 @@ MlasDynamicQgemmPackB( #if defined(USE_KLEIDIAI) //No fallback - ArmKleidiAI::MlasDynamicQgemmPackB(N, K, B, Scales, Bias, PackedB); + if (GetMlasPlatform().MlasDynamicQGemmPackBOverride != nullptr) { + GetMlasPlatform().MlasDynamicQGemmPackBOverride(N, K, B, Scales, Bias, PackedB); + } #endif MLAS_UNREFERENCED_PARAMETER(N); diff --git a/onnxruntime/core/mlas/lib/sgemm.cpp b/onnxruntime/core/mlas/lib/sgemm.cpp index 02e38b6ef432e..7117f20b82ce5 100644 --- a/onnxruntime/core/mlas/lib/sgemm.cpp +++ b/onnxruntime/core/mlas/lib/sgemm.cpp @@ -1573,10 +1573,10 @@ MlasGemmBatch( ) { // Override - if(GetMlasPlatform().MlasGemmBatchOverride != 
nullptr && + if(GetMlasPlatform().MlasSGemmBatchOverride != nullptr && // TODO: Remove once KAI supports transposing for A TransA != CBLAS_TRANSPOSE::CblasTrans && - GetMlasPlatform().MlasGemmBatchOverride(TransA, TransB, M, N, K, Data, BatchSize, ThreadPool)){ + GetMlasPlatform().MlasSGemmBatchOverride(TransA, TransB, M, N, K, Data, BatchSize, ThreadPool)){ return; } // @@ -1671,12 +1671,12 @@ Return Value: // // KleidiAI or other override #if defined(USE_KLEIDIAI) - if (GetMlasPlatform().MlasGemmPackBSizeOverride != nullptr && + if (GetMlasPlatform().MlasSGemmPackBSizeOverride != nullptr && // TODO: Remove once KAI supports transposing for A TransA != CBLAS_TRANSPOSE::CblasTrans) { size_t bytes_required; //TODO pass status by reference to indicate success/fail - bytes_required = GetMlasPlatform().MlasGemmPackBSizeOverride(TransA, TransB, N, K); + bytes_required = GetMlasPlatform().MlasSGemmPackBSizeOverride(TransA, TransB, N, K); if (bytes_required != 0){// If ArmKleidiAI::MlasGemmPackBSize ran to completion return bytes_required; } @@ -1738,10 +1738,10 @@ Return Value: --*/ { #if defined(USE_KLEIDIAI) - if (GetMlasPlatform().MlasGemmPackBOverride != nullptr && + if (GetMlasPlatform().MlasSGemmPackBOverride != nullptr && // TODO: Remove once KAI supports transposing for A TransA != CBLAS_TRANSPOSE::CblasTrans && - GetMlasPlatform().MlasGemmPackBOverride(TransA, TransB, N, K, B, ldb, PackedB)){ + GetMlasPlatform().MlasSGemmPackBOverride(TransA, TransB, N, K, B, ldb, PackedB)){ return; } #endif diff --git a/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc b/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc index 655364357999a..3727ac0918115 100644 --- a/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc @@ -254,6 +254,21 @@ Status SkipLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_le continue; } + // SkipLayerNormalization kernel requires gamma and beta to be 1D. 
+ // Skip fusion if gamma or beta have more than 1 dimension. + const NodeArg* gamma_arg = ln_node.MutableInputDefs()[1]; + const TensorShapeProto* gamma_shape = gamma_arg->Shape(); + if (gamma_shape != nullptr && gamma_shape->dim_size() != 1) { + continue; + } + if (ln_node.MutableInputDefs().size() > 2) { + const NodeArg* beta_arg = ln_node.MutableInputDefs()[2]; + const TensorShapeProto* beta_shape = beta_arg->Shape(); + if (beta_shape != nullptr && beta_shape->dim_size() != 1) { + continue; + } + } + NodeArg beta_place_holder("", nullptr); // Get the inputs for the new SkipLayerNormalization node. diff --git a/onnxruntime/core/providers/cpu/tensor/gather.cc b/onnxruntime/core/providers/cpu/tensor/gather.cc index 38a16ee83c86b..e36f2c7f89183 100644 --- a/onnxruntime/core/providers/cpu/tensor/gather.cc +++ b/onnxruntime/core/providers/cpu/tensor/gather.cc @@ -102,9 +102,9 @@ Status GatherCopyData(const Tensor* indices_tensor, const uint8_t* src_base, uin } } - auto lambda = [&](int64_t index) { - int64_t batch = index / N; - int64_t i = index % N; + auto lambda = [&](ptrdiff_t index) { + const int64_t batch = static_cast(index / N); + const int64_t i = static_cast(index % N); const int64_t src_offset_batch = batch * data_batch_bytes; const int64_t dst_offset_batch = batch * gathered_batch_bytes; @@ -120,12 +120,14 @@ Status GatherCopyData(const Tensor* indices_tensor, const uint8_t* src_base, uin memcpy(dst_base + dst_offset, src_base + src_offset, narrow(block_size)); } }; - concurrency::ThreadPool::TryParallelFor(tp, SafeInt(M) * N, static_cast(block_size), - [&lambda](ptrdiff_t first, ptrdiff_t last) { - for (int index = static_cast(first), end = static_cast(last); index < end; ++index) { - lambda(index); - } - }); + + concurrency::ThreadPool::TryParallelFor( + tp, SafeInt(M) * N, static_cast(block_size), + [&lambda](ptrdiff_t first, ptrdiff_t last) { + for (ptrdiff_t index = first; index < last; ++index) { + lambda(index); + } + }); return Status::OK(); } 
diff --git a/onnxruntime/core/providers/cpu/tensor/gather_nd.cc b/onnxruntime/core/providers/cpu/tensor/gather_nd.cc index ad3faa70ed6af..a0a848eef0dff 100644 --- a/onnxruntime/core/providers/cpu/tensor/gather_nd.cc +++ b/onnxruntime/core/providers/cpu/tensor/gather_nd.cc @@ -66,6 +66,18 @@ Status GatherNDBase::PrepareForCompute(const TensorShape& input_shape, const Ten const auto num_slices = indices_shape.SizeToDimension(indices_shape.NumDimensions() - 1); const auto slice_size = input_shape.SizeFromDimension(SafeInt(batch_dims_) + num_slice_dims); const auto num_batches = input_shape.SizeToDimension(SafeInt(batch_dims_)); + + // Validate batch dimensions to prevent division by zero + if (num_batches == 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "GatherND: input tensor batch dimensions cannot be zero"); + } + if (num_slices % num_batches != 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "GatherND: indices batch size (", num_slices, + ") is not divisible by input batch size (", num_batches, ")"); + } + const auto input_batch_stride = input_shape.SizeFromDimension(SafeInt(batch_dims_)); const auto num_slices_per_batch = num_slices / num_batches; std::vector sizes_from_slice_dims(onnxruntime::narrow(num_slice_dims)); diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu index 51c80d272bb96..62801c8da1e5f 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu +++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu @@ -209,7 +209,7 @@ __device__ void reduce_all( // the size of shared_memory equals to the number of warps. 
#pragma unroll for (int stride = MAX_NUM_WARPS_PER_BLOCK / 2; stride > 0; stride /= 2) { - if (tid_in_block + stride < num_warps_in_block) { + if (tid_in_block < stride && tid_in_block + stride < num_warps_in_block) { shared_memory[tid_in_block] += shared_memory[tid_in_block + stride]; } __syncthreads(); diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu index a96d4c82a7fdc..963fa020d033a 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu @@ -585,6 +585,13 @@ size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode, static_cast(std::accumulate(output_dims.begin(), output_dims.end(), (int64_t)0)); case UpsampleMode::LINEAR: + // For LINEAR mode: + // - bilinear (2-D/4-D) uses mapping for [H, W] + // - trilinear (3-D/5-D) uses mapping for [D, H, W] + if (output_dims.size() == 3 || output_dims.size() == 5) { + return sizeof(LinearMappingInfo) * + static_cast(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 3, (int64_t)0)); + } return sizeof(LinearMappingInfo) * static_cast(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0)); case UpsampleMode::CUBIC: diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 2806eb7a7a8d8..aa6a69adb549c 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -4843,7 +4843,7 @@ static_assert(offsetof(OrtApi, CreateExternalInitializerInfo) / sizeof(void*) == static_assert(offsetof(OrtApi, GetTensorElementTypeAndShapeDataReference) / sizeof(void*) == 414, "Size of version 24 API cannot change"); // So that nobody forgets to finish an API version, this check will serve as a reminder: -static_assert(std::string_view(ORT_VERSION) == "1.24.2", +static_assert(std::string_view(ORT_VERSION) == "1.24.3", "ORT_Version change detected, 
please follow below steps to ensure OrtApi is updated properly"); // 1. Update the hardcoded version string in above static_assert to silence it // 2. If there were any APIs added to ort_api_1_to_24 above: diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc index d96d229c942cb..89651c2d955de 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc @@ -794,7 +794,7 @@ std::string _get_type_name(std::string&) { #if !defined(DISABLE_ML_OPS) template static void CreateMapMLValue_LoopIntoMap(Py_ssize_t& pos, PyObject*& key, const std::string& name_input, PyObject*& value, - PyObject* item, std::map& current, + PyObject* item, bool owns_item_ref, std::map& current, KeyGetterType keyGetter, ValueGetterType valueGetter) { KeyType ckey; ValueType cvalue; @@ -806,7 +806,9 @@ static void CreateMapMLValue_LoopIntoMap(Py_ssize_t& pos, PyObject*& key, const std::string sType = spyType; Py_XDECREF(pStr); Py_XDECREF(pType); - Py_XDECREF(item); + if (owns_item_ref) { + Py_XDECREF(item); + } throw std::runtime_error(std::string("Unexpected key type ") + sType + std::string(", it cannot be linked to C type ") + _get_type_name(ckey) + std::string(" for input '") + @@ -820,7 +822,9 @@ static void CreateMapMLValue_LoopIntoMap(Py_ssize_t& pos, PyObject*& key, const std::string sType = spyType; Py_XDECREF(pStr); Py_XDECREF(pType); - Py_XDECREF(item); + if (owns_item_ref) { + Py_XDECREF(item); + } throw std::runtime_error(std::string("Unexpected value type ") + sType + std::string(", it cannot be linked to C type ") + _get_type_name(ckey) + std::string(" for input '") + @@ -836,7 +840,7 @@ static void CreateMapMLValue_Map(Py_ssize_t& pos, PyObject*& key, const std::str ValueGetterType valueGetter) { std::unique_ptr> dst; dst = std::make_unique>(); - CreateMapMLValue_LoopIntoMap(pos, key, name_input, value, item, *dst, keyGetter, valueGetter); + 
CreateMapMLValue_LoopIntoMap(pos, key, name_input, value, item, false, *dst, keyGetter, valueGetter); p_mlvalue->Init(dst.release(), DataTypeImpl::GetType>(), DataTypeImpl::GetType>()->GetDeleteFunc()); } @@ -850,7 +854,7 @@ void CreateMapMLValue_VectorMap(Py_ssize_t& pos, PyObject*& key, const std::stri int index = 0; do { dstVector->push_back(std::map()); - CreateMapMLValue_LoopIntoMap(pos, key, name_input, value, item, (*dstVector)[index], keyGetter, valueGetter); + CreateMapMLValue_LoopIntoMap(pos, key, name_input, value, item, true, (*dstVector)[index], keyGetter, valueGetter); Py_DECREF(item); ++index; item = iterator == NULL ? NULL : PyIter_Next(iterator); diff --git a/onnxruntime/python/tools/pytorch_export_contrib_ops.py b/onnxruntime/python/tools/pytorch_export_contrib_ops.py index 1c5e31af99d82..0bd75e5c92e4c 100644 --- a/onnxruntime/python/tools/pytorch_export_contrib_ops.py +++ b/onnxruntime/python/tools/pytorch_export_contrib_ops.py @@ -6,6 +6,7 @@ PyTorch-ONNX exporter (torch.onnx.export). 
""" +import contextlib import typing try: @@ -22,7 +23,7 @@ _registered_ops: typing.AbstractSet[str] = set() -def _reg(symbolic_fn: typing.Callable, namespace: str = ""): +def _reg(symbolic_fn: typing.Callable, namespace: str = "aten"): name = f"{namespace}::{symbolic_fn.__name__}" torch.onnx.register_custom_op_symbolic(name, symbolic_fn, _OPSET_VERSION) _registered_ops.add(name) @@ -49,13 +50,6 @@ def grid_sampler(g, input, grid, mode, padding_mode, align_corners): padding_mode_str = ["zeros", "border", "reflection"][padding_mode] align_corners = int(symbolic_helper._maybe_get_const(align_corners, "b")) - # From opset v13 onward, the output shape can be specified with - # (N, C, H, W) (N, H_out, W_out, 2) => (N, C, H_out, W_out) - # input_shape = input.type().sizes() - # gird_shape = grid.type().sizes() - # output_shape = input_shape[:2] + gird_shape[1:3] - # g.op(...).setType(input.type().with_sizes(output_shape)) - return g.op( "com.microsoft::GridSample", input, @@ -71,15 +65,24 @@ def inverse(g, self): return g.op("com.microsoft::Inverse", self).setType(self.type()) _reg(inverse) + torch.onnx.register_custom_op_symbolic("aten::linalg_inv", inverse, _OPSET_VERSION) + _registered_ops.add("aten::linalg_inv") + + def gelu(g, self: torch._C.Value, approximate="none"): + # PyTorch can emit aten::gelu with or without the optional approximate arg. + if not isinstance(approximate, str): + approximate = symbolic_helper._maybe_get_const(approximate, "s") - @torch.onnx.symbolic_helper.parse_args("v", "s") - def gelu(g, self: torch._C.Value, approximate: str = "none"): - # Use microsoft::Gelu for performance if possible. It only supports approximate == "none" + # Use microsoft::Gelu for performance if possible. It only supports approximate == "none". 
if approximate == "none": return g.op("com.microsoft::Gelu", self).setType(self.type()) return torch.onnx.symbolic_opset9.gelu(g, self, approximate) _reg(gelu) + # Some PyTorch versions dispatch GELU symbolic lookup by exporter opset. + # Registering across stable opsets keeps ORT Gelu fusion consistently enabled. + for opset in range(9, 21): + torch.onnx.register_custom_op_symbolic("aten::gelu", gelu, opset) def triu(g, self, diagonal): return g.op("com.microsoft::Trilu", self, diagonal, upper_i=1).setType(self.type()) @@ -127,3 +130,8 @@ def unregister(): for version in symbolic_helper._onnx_stable_opsets: if version >= _OPSET_VERSION and symbolic_registry.is_registered_op(kind, namespace, version): del symbolic_registry._registry[(namespace, version)][kind] + + # Also clean up gelu's multi-opset registrations (see register()). + for opset in range(9, 21): + with contextlib.suppress(Exception): + torch.onnx.unregister_custom_op_symbolic("aten::gelu", opset) diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index 08f8691d8b2b5..de7f0a044c118 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -1112,11 +1112,11 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): if ( (mul_val is None) or not (isinstance(mul_val, np.ndarray) and mul_val.size == 1) - or (float(mul_val) >= 0) + or (mul_val.item() >= 0) ): return - if float(mul_val) != -10000: - self.mask_filter_value = float(mul_val) + if mul_val.item() != -10000: + self.mask_filter_value = mul_val.item() if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_k.input[0] == root_input: mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) if not is_no_mask_attention else None diff --git a/onnxruntime/python/tools/transformers/large_model_exporter.py 
b/onnxruntime/python/tools/transformers/large_model_exporter.py index 29829a6c475d9..f4d9e28d4ecb2 100644 --- a/onnxruntime/python/tools/transformers/large_model_exporter.py +++ b/onnxruntime/python/tools/transformers/large_model_exporter.py @@ -290,6 +290,7 @@ def do_export_internal(model: nn.Module, onnx_io_tuple: tuple, onnx_inputs: tupl input_names=onnx_inp_names, output_names=onnx_out_names, dynamic_axes=onnx_dynamic_axes, + dynamo=False, ) onnx_path.unlink(missing_ok=True) diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py index b405c19b04689..0b86d5f038cd8 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py @@ -473,7 +473,7 @@ def export_onnx( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=11, + opset_version=14, do_constant_folding=True, use_external_data_format=True, verbose=verbose, diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py index 2cb6a733c5bc7..17a4ef58914d6 100644 --- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py @@ -235,6 +235,7 @@ def run_torchscript_separate_export( opset_version=torch_export_onnx_opset_version, do_constant_folding=True, verbose=args.verbose, + dynamo=False, ) # Check decoder_model.onnx and save all external data to one file @@ -294,6 +295,7 @@ def run_torchscript_separate_export( opset_version=torch_export_onnx_opset_version, do_constant_folding=True, verbose=args.verbose, + dynamo=False, ) # Check decoder_with_past_model.onnx and save all external data to one file diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py 
b/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py index e10e616d35d38..31fb60f86faf1 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py @@ -391,8 +391,9 @@ def export_onnx( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=17, + opset_version=18, do_constant_folding=True, + dynamo=False, verbose=verbose, ) diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py index 851f641442016..48d4e12a38a43 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py @@ -110,8 +110,9 @@ def export_onnx( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=17, + opset_version=18, do_constant_folding=True, + dynamo=False, verbose=verbose, ) diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py index cd81edc1001be..35ec59b2bca69 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py @@ -293,8 +293,9 @@ def export_onnx( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=17, + opset_version=18, do_constant_folding=True, + dynamo=False, verbose=verbose, ) diff --git a/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py index 66f24c47f6cdb..a8c2ad1967acb 100644 --- a/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py +++ 
b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py @@ -49,6 +49,7 @@ def torch_onnx_export( keep_initializers_as_inputs=keep_initializers_as_inputs, custom_opsets=custom_opsets, export_modules_as_functions=export_modules_as_functions, + dynamo=False, ) else: torch.onnx.export( diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 8cbdc381a1a70..66f87142d3a34 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -495,6 +495,16 @@ TEST(MatMulNBits, Float16_4b_Accuracy4) { TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); + + // See PR #27412 for details on the following test case, + // which is added to cover a specific failure case in the past. + // 6144, 2048 + + // Since K is larger (more chance of larger error), + // and N is larger (more chance of having a value with larger error), + // we set a higher tolerance for this case to avoid false positives + // and flaky failures. + TestMatMulNBitsTyped(0.2f, 0.03f); } TEST(MatMulNBits, LegacyShape_4b) { diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc index 0d7b583faf27b..c9b61a7a39632 100644 --- a/onnxruntime/test/framework/tensorutils_test.cc +++ b/onnxruntime/test/framework/tensorutils_test.cc @@ -530,9 +530,6 @@ TEST_F(PathValidationTest, ValidateExternalDataPath) { // Valid relative path. ASSERT_STATUS_OK(utils::ValidateExternalDataPath(base_dir_, "data.bin")); - // Empty base directory. - ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "data.bin")); - // Empty location. // Only validate it is not an absolute path. ASSERT_TRUE(utils::ValidateExternalDataPath(base_dir_, "").IsOK()); @@ -555,6 +552,29 @@ TEST_F(PathValidationTest, ValidateExternalDataPath) { // Base directory does not exist.
ASSERT_STATUS_OK(utils::ValidateExternalDataPath("non_existent_dir", "data.bin")); + + // + // Tests for an empty base directory. + // The base directory would be empty when 1) the session loads a model from bytes and 2) the application does not + // set an external file folder path via the session config option + // kOrtSessionOptionsModelExternalInitializersFileFolderPath. + // + + // A simple filename is ok (would not escape current working directory). + ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "data.bin")); + ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "./data.bin")); + + // A ".." that is not a path component (part of the filename) is ok + ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "data..bin")); + + // A path that would escape the current working directory is invalid. + ASSERT_FALSE(utils::ValidateExternalDataPath("", "../data.bin").IsOK()); + + // A path that uses ".." but would not escape the current working directory should be fine. + ASSERT_STATUS_OK(utils::ValidateExternalDataPath("", "a/../data.bin")); + + // A path with multiple internal ".." that would escape the current working directory should fail. + ASSERT_FALSE(utils::ValidateExternalDataPath("", "a/../../data.bin").IsOK()); } TEST_F(PathValidationTest, ValidateExternalDataPathWithSymlinkInside) { diff --git a/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc b/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc index 0afb836192b0a..4615b6a57b558 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc +++ b/onnxruntime/test/optimizer/graph_transform_test_layernorm.cc @@ -638,6 +638,37 @@ TEST_F(GraphTransformationTests, SkipLayerNormFusionTest) { TestSkipLayerNormFusion(MODEL_FOLDER "fusion/skip_layer_norm_format3_graph_output.onnx", 1, 1, 0, 0, logger_.get()); } +// SkipLayerNorm fusion should not be applied when gamma/beta have more than 1 dimension, +// because the SkipLayerNormalization kernel requires 1D gamma/beta.
+TEST_F(GraphTransformationTests, SkipLayerNormFusion_3DGamma_NoFusion) { + auto build_test_case = [](ModelTestBuilder& builder) { + // Inputs: A and B are 3D [16, 32, 4] + auto* input_a = builder.MakeInput({16, 32, 4}, -1.0f, 1.0f); + auto* input_b = builder.MakeInput({16, 32, 4}, -1.0f, 1.0f); + // gamma and beta have 3D shape [1, 1, 4] (not 1D) + auto* gamma = builder.MakeInitializer({1, 1, 4}, {1.0f, 2.0f, 3.0f, 4.0f}); + auto* beta = builder.MakeInitializer({1, 1, 4}, {0.1f, 0.2f, 0.3f, 0.4f}); + auto* add_out = builder.MakeIntermediate(); + auto* ln_out = builder.MakeOutput(); + + builder.AddNode("Add", {input_a, input_b}, {add_out}); + builder.AddNode("LayerNormalization", {add_out, gamma, beta}, {ln_out}) + .AddAttribute("axis", static_cast(-1)); + }; + + auto post_graph_checker = [](Graph& graph) { + // SkipLayerNormalization should NOT have been created because gamma/beta are 3D. + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["Add"] == 1); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["LayerNormalization"] == 1); + TEST_RETURN_IF_NOT(CountOpsInGraph(graph)["com.microsoft.SkipLayerNormalization"] == 0); + return Status::OK(); + }; + + ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 17, *logger_, + std::make_unique(), + TransformerLevel::Level2, 1, nullptr, post_graph_checker)); +} + TEST_F(GraphTransformationTests, GroupQueryAttentionFusionTest) { TestGQAFusion(MODEL_FOLDER "fusion/gqa_fusion_quantized_simple.onnx", 1, 0, logger_.get()); TestGQAFusion(MODEL_FOLDER "fusion/gqa_fusion_different_head_sizes.onnx", 0, 1, logger_.get()); diff --git a/onnxruntime/test/providers/cpu/tensor/gather_nd_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_nd_op_test.cc index 081b4b484a73b..a8f3b99b2b3d3 100644 --- a/onnxruntime/test/providers/cpu/tensor/gather_nd_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/gather_nd_op_test.cc @@ -329,5 +329,48 @@ TEST(GatherNDOpTest, GatherND_slice_int64_t) { test.Run(); } +// Test for issue #23828: GatherND should 
return error instead of crashing +// when batch dimensions mismatch between input and indices +TEST(GatherNDOpTest, GatherND_batch_dims_mismatch_error) { + OpTester test("GatherND", 12, kOnnxDomain); + test.AddAttribute("batch_dims", 1); + + // Input has 3 batches, but indices has 2 slices (indices batch size 2), which is not divisible by 3 - mismatch! + test.AddInput("data", {3, 3}, {0.f, 1.f, 2.f, 10.f, 11.f, 12.f, 20.f, 21.f, 22.f}); + test.AddInput("indices", {2, 1}, {1, 2}); + test.AddOutput("output", {2}, {0.f, 0.f}); // dummy output, won't be used + + // Force execution only on CPU + std::vector> cpu_only_ep; + cpu_only_ep.push_back(DefaultCpuExecutionProvider()); + + test.Run(OpTester::ExpectResult::kExpectFailure, + "GatherND: indices batch size (2) is not divisible by input batch size (3)", + {}, // no excluded providers needed + nullptr, // no RunOptions + &cpu_only_ep); // force CPU +} + +// Test for issue #23828: GatherND should return error when input batch dimension is zero +TEST(GatherNDOpTest, GatherND_zero_batch_dims_error) { + OpTester test("GatherND", 12, kOnnxDomain); + test.AddAttribute("batch_dims", 1); + + // Input has 0 batches - should fail with clear error instead of division by zero + test.AddInput("data", {0, 3}, {}); + test.AddInput("indices", {2, 1}, {1, 2}); + test.AddOutput("output", {2}, {0.f, 0.f}); // dummy output, won't be used + + // Force execution only on CPU + std::vector> cpu_only_ep; + cpu_only_ep.push_back(DefaultCpuExecutionProvider()); + + test.Run(OpTester::ExpectResult::kExpectFailure, + "GatherND: input tensor batch dimensions cannot be zero", + {}, + nullptr, + &cpu_only_ep); // force CPU +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc index 35066bd68c65e..adbe86a14dabf 100644 --- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc +++ 
b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc @@ -341,6 +341,41 @@ TEST(GatherOpTest, Gather_axis1_indices2d_string) { test.Run(); } +TEST(GatherOpTest, Gather_overflow_check) { +// Skip on 32-bit platforms where size_t overflow would truncate the large expected +// output shape and where allocating the full reference tensor is infeasible. +#if SIZE_MAX <= UINT32_MAX + GTEST_SKIP() << "Gather_overflow_check skipped on 32-bit platforms."; +#endif + + // The test uses dimensions (65537, 2) and indices of length 65537, which produce an output + // shape of (65537, 65537). + // + // 65537 x 65537 = 4,295,098,369 which is greater than the maximum value of a 32-bit integer (2,147,483,647). + // + // This test is to verify CPU implementation of the Gather operator doesn't overflow when calculating + // the output shape and generating the output tensor. + + OpTester test("Gather"); + test.AddAttribute("axis", 1LL); + + // Inputs + const std::vector data_dims{65537, 2}; + const std::vector indices_dims{65537}; + std::vector data_values(static_cast(data_dims[0] * data_dims[1]), 1); + std::vector indices_values(static_cast(indices_dims[0]), 1); + std::vector expected_output_values(static_cast(65537) * static_cast(65537), 1); + + test.AddInput("data", {65537, 2}, data_values); + test.AddInput("indices", {65537}, indices_values); + test.AddOutput("output", {65537, 65537}, expected_output_values); + + std::vector> execution_providers; + execution_providers.emplace_back(DefaultCpuExecutionProvider()); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + TEST(GatherOpTest, Gather_axis1_indices2d_bool) { OpTester test("Gather"); test.AddAttribute("axis", 1LL); diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc index 200a1aded8204..8fd994baec713 100644 --- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc +++ 
b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc @@ -1015,6 +1015,38 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_5DTrilinear_pytorch_half_pixel) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: results mismatch } +TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_5DTrilinear_CudaRegression) { + auto cuda_ep = DefaultCudaExecutionProvider(); + if (!cuda_ep) { + GTEST_SKIP() << "CUDA EP not available"; + } + + OpTester test("Resize", 13); + std::vector roi{}; + std::vector scales{1.0f, 1.0f, 2.0f, 2.0f, 2.0f}; + + test.AddAttribute("mode", "linear"); + test.AddAttribute("coordinate_transformation_mode", "pytorch_half_pixel"); + + constexpr int64_t N = 1, C = 1, D = 3, H = 4, W = 5; + std::vector X(static_cast(N * C * D * H * W), 1.0f); + + test.AddInput("X", {N, C, D, H, W}, X); + test.AddInput("roi", {0}, roi); + test.AddInput("scales", {5}, scales); + + constexpr int64_t out_D = D * 2; + constexpr int64_t out_H = H * 2; + constexpr int64_t out_W = W * 2; + std::vector Y(static_cast(N * C * out_D * out_H * out_W), 1.0f); + + test.AddOutput("Y", {N, C, out_D, out_H, out_W}, Y); + + std::vector> execution_providers; + execution_providers.push_back(std::move(cuda_ep)); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest) { // To test NNAPI EP, we need the scales/sizes to be in initializers auto run_test = [](bool scales_in_initializer) { diff --git a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc index ec7e98528504e..593255b9e9c23 100644 --- a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc @@ -177,6 +177,35 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e CheckDeviceValues(m, d_out.get(), 
expected_column.data(), relative_error_tolerance); } + +void TestReduceColumnsToColumnRepeated(int m, int n, int iterations, float relative_error_tolerance = 1e-4f) { + SCOPED_TRACE(MakeString("m: ", m, ", n:", n, ", iterations: ", iterations)); + + const TensorShape shape{m, n}; + RandomValueGenerator random{}; + const auto values = random.Uniform(shape.GetDims(), 1.0f, 10.0f); + const auto expected_column = ExpectedReduceMatrixColumnsOutput(m, n, values); + + auto d_in = AllocateDeviceMemory(m * n); + auto d_out = AllocateDeviceMemory(m); + + cudaMemcpy(d_in.get(), values.data(), m * n * sizeof(float), cudaMemcpyHostToDevice); + + size_t buffer_size_in_bytes = + compute_reduce_matrix_columns_buffer_size(m, n); + auto d_buffer = AllocateDeviceMemory(buffer_size_in_bytes); + + for (int i = 0; i < iterations; ++i) { + ASSERT_STATUS_OK(reduce_matrix_columns( + 0, + d_in.get(), d_out.get(), + m, n, + d_buffer.get(), buffer_size_in_bytes)); + + ASSERT_TRUE(CUDA_CALL(cudaDeviceSynchronize()).IsOK()); + CheckDeviceValues(m, d_out.get(), expected_column.data(), relative_error_tolerance); + } +} } // namespace TEST(ReductionFunctionsTest, ReduceRowToScalar) { @@ -205,6 +234,10 @@ TEST(ReductionFunctionsTest, ReduceColumnsToColumn) { } } +TEST(ReductionFunctionsTest, ReduceColumnsToColumnRepeated) { + TestReduceColumnsToColumnRepeated(17, 8192, 100, 2e-4f); +} + TEST(ReductionFunctionsTest, BufferOffsets) { const int m = 2048; const int n = 1024; diff --git a/onnxruntime/test/python/test_pytorch_export_contrib_ops.py b/onnxruntime/test/python/test_pytorch_export_contrib_ops.py index e7ea83dd00297..afefc4e616a87 100644 --- a/onnxruntime/test/python/test_pytorch_export_contrib_ops.py +++ b/onnxruntime/test/python/test_pytorch_export_contrib_ops.py @@ -59,6 +59,9 @@ def setUp(self): torch.manual_seed(0) pytorch_export_contrib_ops.register() + def tearDown(self): + pytorch_export_contrib_ops.unregister() + def run_test( self, model, @@ -101,6 +104,7 @@ def run_test( 
input_names=input_names, output_names=output_names, custom_opsets=custom_opsets, + dynamo=False, ) # compute onnxruntime output prediction @@ -143,12 +147,13 @@ def test_gelu_is_fused_by_default(self): f, opset_version=self.opset_version, custom_opsets={"com.microsoft": 1}, + dynamo=False, ) f.seek(0) onnx_model = onnx.load(f) - node = onnx_model.graph.node[0] - self.assertEqual(node.op_type, "Gelu") - self.assertEqual(node.domain, "com.microsoft") + # Default GELU should be mapped to ORT contrib Gelu for performance. + gelu_nodes = [n for n in onnx_model.graph.node if n.op_type == "Gelu" and n.domain == "com.microsoft"] + self.assertEqual(len(gelu_nodes), 1) @parameterized.parameterized.expand([("default_approximate", "none"), ("tanh_approximate", "tanh")]) @unittest.skipIf(_torch_version_lower_than("1.12"), "Gelu's approximate parameter unsupported in PyTorch < 1.12") @@ -230,8 +235,8 @@ def forward(self, input): # IR version 4 style export. ONNXExporterTest_opset9_IRv4 = type( "TestONNXRuntime_opset9_IRv4", - (unittest.TestCase,), - dict(ONNXExporterTest.__dict__, keep_initializers_as_inputs=False), + (ONNXExporterTest,), + dict(keep_initializers_as_inputs=False), ) diff --git a/onnxruntime/test/python/transformers/benchmark_qmoe.py b/onnxruntime/test/python/transformers/benchmark_qmoe.py new file mode 100644 index 0000000000000..b96c9cdcf5c3a --- /dev/null +++ b/onnxruntime/test/python/transformers/benchmark_qmoe.py @@ -0,0 +1,191 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +import os +import sys +import time +import unittest + +import numpy +import torch + +# Add current directory to path to allow importing from test_qmoe_cpu +current_dir = os.path.dirname(os.path.abspath(__file__)) +if current_dir not in sys.path: + sys.path.append(current_dir) + +from test_qmoe_cpu import PhiMoEConfig, PhiMoESparseMoeBlock, TensorProto # noqa: E402 + +# Reduces number of tests to run for faster pipeline checks +pipeline_mode = os.getenv("PIPELINE_MODE", "1") == "1" + + +@unittest.skipIf(pipeline_mode, "Skip benchmark in CI pipeline.") +class TestQMoESwiGLUBenchmark(unittest.TestCase): + """Benchmark tests for QMoE SwiGLU performance measurement.""" + + def test_qmoe_swiglu_throughput_benchmark(self): + """Comprehensive throughput benchmark for QMoE SwiGLU across different configurations.""" + print("\n=== QMoE SwiGLU Throughput Benchmark ===") + + # Test configurations: (name, hidden_size, intermediate_size, num_experts, top_k, quant_bits) + configs = [ + ("Medium-4bit", 2880, 2880, 32, 4, 4), + ("Medium-8bit", 2880, 2880, 32, 4, 8), + ] + + batch_size = 1 + sequence_length = 512 + num_runs = 1000 + + results = [] + + for config_name, hidden_size, intermediate_size, num_experts, top_k, quant_bits in configs: + torch.manual_seed(42) + numpy.random.seed(42) + + torch_output = None + ort_output = None + + print(f"\nTesting {config_name}:") + print(f" Hidden: {hidden_size}, Intermediate: {intermediate_size}") + print(f" Experts: {num_experts}, Top-K: {top_k}, Quant: {quant_bits}-bit") + + try: + # Create config and model + config = PhiMoEConfig( + hidden_size=hidden_size, + intermediate_size=intermediate_size, + num_local_experts=num_experts, + num_experts_per_tok=top_k, + ) + + qmoe_swiglu = PhiMoESparseMoeBlock( + config, + batch_size=batch_size, + sequence_length=sequence_length, + quant_bits=quant_bits, + onnx_dtype=TensorProto.FLOAT, + ) + + # Create test input with 
fixed sequence length to match ONNX model + full_hidden_states = torch.randn(batch_size, sequence_length, hidden_size).to(torch.float32) + + # For TTFT simulation, we'll measure single forward pass time + # This represents the time to process one token in autoregressive generation + + # Warm up with full context + for _ in range(3): + _ = qmoe_swiglu.forward(full_hidden_states) + + # Benchmark PyTorch TTFT (Time to First Token) + # Measure time for a single forward pass (represents token generation time) + torch.manual_seed(42) + + start_time = time.time() + for _ in range(num_runs): + torch_output = qmoe_swiglu.forward(full_hidden_states) + end_time = time.time() + torch_ttft_ms = (end_time - start_time) / num_runs * 1000 + + # Calculate tokens per second (throughput) + # For sequence generation, this represents the rate at which we can generate tokens + torch_tokens_per_sec = 1000.0 / torch_ttft_ms # 1 token / (time_ms / 1000) + + print(f" PyTorch TTFT: {torch_ttft_ms:.3f} ms (per token generation time)") + print(f" PyTorch Throughput: {torch_tokens_per_sec:.1f} tokens/sec") + + # Benchmark ONNX Runtime + ort_ttft_ms = 0 + ort_tokens_per_sec = 0 + speedup = 0 + throughput_ratio = 0 + max_diff = 0 + + model_updated = qmoe_swiglu.recreate_onnx_model() + if model_updated and qmoe_swiglu.ort_sess is not None: + # Warm up ORT with full context + for _ in range(3): + _ = qmoe_swiglu.ort_forward(full_hidden_states) + + torch.manual_seed(42) + + # Measure ONNX Runtime TTFT (Time to First Token) + start_time = time.time() + for _ in range(num_runs): + ort_output = qmoe_swiglu.ort_forward(full_hidden_states) + end_time = time.time() + ort_ttft_ms = (end_time - start_time) / num_runs * 1000 + + # Calculate tokens per second for ONNX Runtime + ort_tokens_per_sec = 1000.0 / ort_ttft_ms # 1 token / (time_ms / 1000) + + speedup = torch_ttft_ms / ort_ttft_ms if ort_ttft_ms > 0 else 0 + throughput_ratio = ort_tokens_per_sec / torch_tokens_per_sec if torch_tokens_per_sec > 0 else 
0 + + print(f" ONNX RT TTFT: {ort_ttft_ms:.3f} ms (per token generation time)") + print(f" ONNX RT Throughput: {ort_tokens_per_sec:.1f} tokens/sec") + print(f" TTFT Speedup: {speedup:.2f}x") + print(f" Throughput Gain: {throughput_ratio:.2f}x") + else: + print(" ONNX RT: Not available") + + # Calculate max difference if both outputs available + if torch_output is not None and ort_output is not None: + max_diff = (torch_output.cpu() - ort_output.cpu()).abs().max().item() + print(f" Max diff: {max_diff:.6f}") + + results.append( + { + "config": config_name, + "torch_ttft_ms": torch_ttft_ms, + "torch_tokens_per_sec": torch_tokens_per_sec, + "ort_ttft_ms": ort_ttft_ms, + "ort_tokens_per_sec": ort_tokens_per_sec, + "speedup": speedup, + "throughput_ratio": throughput_ratio, + "max_diff": max_diff, + } + ) + + except Exception as e: + print(f" Error: {e}") + continue + + # Summary + print("\n=== Token Generation Time & Throughput Summary ===") + print( + f"{'Config':<15} {'PT Time':<10} {'PT tok/s':<10} {'ORT Time':<11} {'ORT tok/s':<11} {'Time Gain':<10} {'Throughput':<11} {'Max Diff':<10}" + ) + print("-" * 105) + for result in results: + config = result["config"] + torch_ttft = result["torch_ttft_ms"] + torch_tps = result["torch_tokens_per_sec"] + ort_ttft = result["ort_ttft_ms"] + ort_tps = result["ort_tokens_per_sec"] + speedup = result["speedup"] + throughput_ratio = result["throughput_ratio"] + max_diff = result["max_diff"] + + ort_ttft_str = f"{ort_ttft:.3f}" if ort_ttft > 0 else "N/A" + ort_tps_str = f"{ort_tps:.1f}" if ort_tps > 0 else "N/A" + speedup_str = f"{speedup:.2f}x" if speedup > 0 else "N/A" + throughput_str = f"{throughput_ratio:.2f}x" if throughput_ratio > 0 else "N/A" + + print( + f"{config:<15} {torch_ttft:<10.3f} {torch_tps:<10.1f} {ort_ttft_str:<11} {ort_tps_str:<11} {speedup_str:<10} {throughput_str:<11} {max_diff:<10.6f}" + ) + + print("\nNotes:") + print("- Time: Token generation time in ms (lower is better)") + print("- tok/s: Tokens per 
second throughput (higher is better)") + print("- Time Gain: ORT speedup for latency (higher is better)") + print("- Throughput: ORT throughput improvement (higher is better)") + + +if __name__ == "__main__": + benchmark = TestQMoESwiGLUBenchmark() + benchmark.test_qmoe_swiglu_throughput_benchmark() diff --git a/onnxruntime/test/python/transformers/parity_utilities.py b/onnxruntime/test/python/transformers/parity_utilities.py index fa16f0e67a523..04a1ed06773e7 100644 --- a/onnxruntime/test/python/transformers/parity_utilities.py +++ b/onnxruntime/test/python/transformers/parity_utilities.py @@ -92,6 +92,7 @@ def export_onnx(model, onnx_model_path, float16, hidden_size, device): dynamic_axes=dynamic_axes, opset_version=11, do_constant_folding=True, + dynamo=False, ) print("exported:", onnx_model_path) diff --git a/onnxruntime/test/python/transformers/test_gelu_fusions.py b/onnxruntime/test/python/transformers/test_gelu_fusions.py index 11ae1401ff8ed..a63e2653f2fbc 100644 --- a/onnxruntime/test/python/transformers/test_gelu_fusions.py +++ b/onnxruntime/test/python/transformers/test_gelu_fusions.py @@ -75,17 +75,22 @@ def test_fusions(self, test_case, dynamo): dummy_input = torch.ones(3, dtype=torch.float32) test_name = f"{operator}_{source}" onnx_path = f"{test_name}.onnx" + + # For Torch 2.10+, torch.nn.functional.gelu(approximate="tanh") exports as Gelu node. + # So we force opset_version=18 here. 
torch.onnx.export( model, (dummy_input,), onnx_path, input_names=["input"], output_names=["output"], - dynamo=dynamo, + opset_version=18, + dynamo=False, optimize=True, # Only meaningful when dynamo is True ) optimizer = optimize_model(onnx_path, "bert") # optimizer.save_model_to_file(f"{operator}_{source}_opt.onnx") + os.remove(onnx_path) # Remove the associated .data file (dynamo) data_path = onnx_path + ".data" diff --git a/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py b/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py index 444d86da75ba6..c07eb39e6df75 100644 --- a/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py +++ b/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py @@ -253,6 +253,7 @@ def export_onnx(model, onnx_model_path, float16, hidden_size, num_attention_head dynamic_axes=dynamic_axes, opset_version=11, do_constant_folding=True, + dynamo=False, ) print("exported:", onnx_model_path) diff --git a/onnxruntime/test/python/transformers/test_phi_vision.py b/onnxruntime/test/python/transformers/test_phi_vision.py index d276366706af9..5a5fa926eb255 100644 --- a/onnxruntime/test/python/transformers/test_phi_vision.py +++ b/onnxruntime/test/python/transformers/test_phi_vision.py @@ -208,6 +208,7 @@ def export(self, model, inputs): "input": {0: "batch", 1: "seq"}, "attention_mask": {0: "batch", 2: "seq", 3: "seq"}, }, + dynamo=False, ) else: torch.onnx.export( @@ -217,6 +218,7 @@ def export(self, model, inputs): export_params=True, opset_version=14, do_constant_folding=True, + dynamo=False, ) def tearDown(self): diff --git a/onnxruntime/test/python/transformers/test_qmoe_cpu.py b/onnxruntime/test/python/transformers/test_qmoe_cpu.py index 90ebb148a26a5..8415c7b08b77c 100644 --- a/onnxruntime/test/python/transformers/test_qmoe_cpu.py +++ b/onnxruntime/test/python/transformers/test_qmoe_cpu.py @@ -23,9 +23,11 @@ # normalization on the selected 
experts. This provides proper weight distribution # while maintaining computational efficiency. # -------------------------------------------------------------------------- +import os import time import unittest from collections import OrderedDict +from contextlib import contextmanager import numpy import torch @@ -76,6 +78,8 @@ class TensorProtoPlaceholder: ort_provider = ["CPUExecutionProvider"] +ORT_USE_MLAS_Q4_GEMM_MOE = "ORT_USE_MLAS_Q4_GEMM_MOE" + torch.manual_seed(42) numpy.random.seed(42) @@ -364,7 +368,7 @@ def create_cpu_moe_onnx_graph( use_swiglu=False, use_quant=False, quant_bits=4, - swiglu_interleaved=False, + swiglu_fusion=0, block_size=0, ): if not has_onnx: @@ -400,10 +404,10 @@ def create_cpu_moe_onnx_graph( "router_probs", # 1 "fc1_experts_weights", # 2 "fc1_scales", # 3 - "", # 4: fc1_bias + "fc1_experts_bias" if fc1_bias is not None else "", # 4 "fc2_experts_weights", # 5 "fc2_scales", # 6 - "", # 7: fc2_bias + "fc2_experts_bias" if fc2_bias is not None else "", # 7 "", # 8: fc3_weights "", # 9: fc3_scales "", # 10: fc3_bias @@ -442,11 +446,10 @@ def create_cpu_moe_onnx_graph( normalize_routing_weights=normalize_routing, activation_type=activation, # Add new attributes with backwards-compatible default values - swiglu_fusion=1 if use_swiglu else 0, # 1 if using SwiGLU activation + swiglu_fusion=swiglu_fusion, swiglu_limit=7.0, activation_alpha=1.702, activation_beta=1.0, - swiglu_interleaved=1 if swiglu_interleaved else 0, # Enable this attribute domain="com.microsoft", ), ] @@ -559,6 +562,30 @@ def create_cpu_moe_onnx_graph( ) ) + if fc1_bias is not None: + fc1_bias_np = fc1_bias.detach().cpu().numpy().astype(ort_to_numpy_type_map[onnx_dtype]) + initializers.append( + helper.make_tensor( + "fc1_experts_bias", + onnx_dtype, + list(fc1_bias.shape), + fc1_bias_np.flatten().tolist(), + raw=False, + ) + ) + + if fc2_bias is not None: + fc2_bias_np = fc2_bias.detach().cpu().numpy().astype(ort_to_numpy_type_map[onnx_dtype]) + initializers.append( + 
helper.make_tensor( + "fc2_experts_bias", + onnx_dtype, + list(fc2_bias.shape), + fc2_bias_np.flatten().tolist(), + raw=False, + ) + ) + graph_inputs = [ helper.make_tensor_value_info("input", onnx_dtype, [sequence_length, hidden_size]), ] @@ -626,7 +653,7 @@ def __init__( self.num_experts_per_token = num_experts_per_token -def swiglu(x: torch.Tensor, alpha: float = 1.702, limit: float = 7.0): +def swiglu(x: torch.Tensor, alpha: float = 1.702, beta: float = 1.0, limit: float = 7.0): dim = x.shape[-1] x = x.view(-1, dim // 2, 2) x_glu, x_linear = x[..., 0], x[..., 1] @@ -635,8 +662,8 @@ def swiglu(x: torch.Tensor, alpha: float = 1.702, limit: float = 7.0): x_glu = x_glu.clamp(max=limit) x_linear = x_linear.clamp(min=-limit, max=limit) - y = x_glu * torch.sigmoid(alpha * x_glu) * (x_linear + 1) - return y + y = x_glu * torch.sigmoid(alpha * x_glu) * (x_linear + beta) + return y.view(-1, dim // 2) class MoEBlockSparseTop2MLP(nn.Module): @@ -855,7 +882,7 @@ def ort_forward(self, hidden_states: torch.Tensor, enable_performance_test=False e = time.time() time_ms = (e - s) / repeat * 1000 is_swiglu = hasattr(self, "use_swiglu") and self.use_swiglu - is_interleaved = hasattr(self, "swiglu_interleaved") and self.swiglu_interleaved + is_interleaved = hasattr(self, "swiglu_fusion") and self.swiglu_fusion == 1 act_type = f"SwiGLU(interleaved={is_interleaved})" if is_swiglu else "SiLU" print(f"ORT Performance - {act_type} {self.quant_bits}-bit: {time_ms:.3f} ms/inference") @@ -868,62 +895,80 @@ def recreate_onnx_model(self): """Recreate the ONNX model with the current weights to reflect any changes to the quantization code.""" w1_list, w2_list = [], [] + w1_bias_list, w2_bias_list = [], [] w1_scale_list, w2_scale_list = [], [] w1_zp_list, w2_zp_list = [], [] is_4_bit = self.quant_bits == 4 for i in range(self.num_experts): - if self.block_size > 0: - # Use block-wise quantization - w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant_blockwise( - self.experts[i].w1.weight, 
self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant - ) - w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant_blockwise( - self.experts[i].w2.weight, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant - ) + if hasattr(self.experts[i], "w3"): + w1, w3 = self.experts[i].w1.weight, self.experts[i].w3.weight + w2 = self.experts[i].w2.weight + w1_bias = self.experts[i].w1.bias + w3_bias = getattr(self.experts[i].w3, "bias", None) + + # Combine and interleave w1 and w3 for the fused kernel + w1_combined = torch.cat([w1, w3], dim=0) # [2*inter, hidden] + if getattr(self, "swiglu_fusion", 0) == 1: + w1_combined = w1_combined.view(2, -1, self.hidden_dim).transpose(0, 1).reshape(-1, self.hidden_dim) + + if self.block_size > 0: + w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant_blockwise( + w1_combined, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant + ) + w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant_blockwise( + w2, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant + ) + else: + w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant( + w1_combined, is_4_bit, asymmetric=self.use_asymmetric_quant + ) + w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant( + w2, is_4_bit, asymmetric=self.use_asymmetric_quant + ) + + if w1_bias is not None and w3_bias is not None: + b1_combined = torch.cat([w1_bias, w3_bias], dim=0) + if getattr(self, "swiglu_fusion", 0) == 1: + b1_combined = b1_combined.view(2, -1).transpose(0, 1).reshape(-1) + w1_bias_list.append(b1_combined.detach().cpu()) + elif w1_bias is not None: + w1_bias_list.append(w1_bias.detach().cpu()) else: - # Use row-wise quantization - w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant( - self.experts[i].w1.weight, is_4_bit, asymmetric=self.use_asymmetric_quant - ) - w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant( - self.experts[i].w2.weight, is_4_bit, asymmetric=self.use_asymmetric_quant - ) + # PhiMoESwiGLUMLP already has interleaved weights in w1 
+ w1 = self.experts[i].w1.weight + w2 = self.experts[i].w2.weight + w1_bias = self.experts[i].w1.bias - if self.use_swiglu: - if self.swiglu_interleaved: - pass + if self.block_size > 0: + w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant_blockwise( + w1, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant + ) + w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant_blockwise( + w2, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant + ) else: - if self.block_size > 0: - w3_scale, pre_qweight3, w3_qdq, w3_zp = quant_dequant_blockwise( - self.experts[i].w3.weight, self.block_size, is_4_bit, asymmetric=self.use_asymmetric_quant - ) - else: - w3_scale, pre_qweight3, w3_qdq, w3_zp = quant_dequant( - self.experts[i].w3.weight, is_4_bit, asymmetric=self.use_asymmetric_quant - ) - - gate_weights = pre_qweight1 - value_weights = pre_qweight3 - gate_scales = w1_scale - value_scales = w3_scale - gate_zp = w1_zp - value_zp = w3_zp - - pre_qweight1 = torch.cat([gate_weights, value_weights], dim=0) - w1_scale = torch.cat([gate_scales, value_scales], dim=0) - if w1_zp is not None and w3_zp is not None: - w1_zp = torch.cat([gate_zp, value_zp], dim=0) - - if self.swiglu_interleaved: - self.experts[i].w1.weight = nn.Parameter(w1_qdq.contiguous().clone()) + w1_scale, pre_qweight1, w1_qdq, w1_zp = quant_dequant( + w1, is_4_bit, asymmetric=self.use_asymmetric_quant + ) + w2_scale, pre_qweight2, w2_qdq, w2_zp = quant_dequant( + w2, is_4_bit, asymmetric=self.use_asymmetric_quant + ) + if w1_bias is not None: + w1_bias_list.append(w1_bias.detach().cpu()) + if self.use_swiglu: + if getattr(self, "swiglu_fusion", 0) == 1: + self.experts[i].w1.weight = nn.Parameter(w1_qdq.contiguous().clone()) else: intermediate_size = self.experts[i].w1.weight.shape[0] gate_dequant = w1_qdq[:intermediate_size].contiguous().clone() value_dequant = w1_qdq[intermediate_size:].contiguous().clone() - self.experts[i].w1.weight.data = gate_dequant - self.experts[i].w3.weight.data = 
value_dequant + if hasattr(self.experts[i], "w3"): + self.experts[i].w1.weight.data = gate_dequant + self.experts[i].w3.weight.data = value_dequant + else: + self.experts[i].w1.weight.data = w1_qdq.contiguous().clone() else: self.experts[i].w1.weight.data = w1_qdq.contiguous().clone() @@ -931,6 +976,9 @@ def recreate_onnx_model(self): w1_list.append(pre_qweight1) w2_list.append(pre_qweight2) + + if self.experts[i].w2.bias is not None: + w2_bias_list.append(self.experts[i].w2.bias) w1_scale_list.append(w1_scale) w2_scale_list.append(w2_scale) if w1_zp is not None: @@ -963,9 +1011,9 @@ def recreate_onnx_model(self): onnx_dtype=self.onnx_dtype, fc1_experts_weights=self.moe_experts_weight1, fc2_experts_weights=self.moe_experts_weight2, - # Biases are not used in QMoE - fc1_bias=None, - fc2_bias=None, + # Pass collected biases + fc1_bias=torch.stack(w1_bias_list, dim=0) if w1_bias_list else None, + fc2_bias=torch.stack(w2_bias_list, dim=0) if w2_bias_list else None, # Scales are used for dequantization fc1_scales=moe_experts_weight_scale1, fc2_scales=moe_experts_weight_scale2, @@ -975,7 +1023,7 @@ def recreate_onnx_model(self): use_swiglu=self.use_swiglu, use_quant=True, # Always use QMoE quant_bits=self.quant_bits, - swiglu_interleaved=self.swiglu_interleaved if hasattr(self, "swiglu_interleaved") else False, + swiglu_fusion=getattr(self, "swiglu_fusion", 0), block_size=self.block_size, # Add block_size for block-wise quantization ) except Exception: @@ -1020,7 +1068,7 @@ def parity_check(self): max_diff = (torch_output.cpu() - ort_output.cpu()).abs().max() is_swiglu = hasattr(self, "use_swiglu") and self.use_swiglu - is_interleaved = hasattr(self, "swiglu_interleaved") and self.swiglu_interleaved + is_interleaved = getattr(self, "swiglu_fusion", 0) == 1 act_type = f"SwiGLU(interleaved={is_interleaved})" if is_swiglu else "SiLU" quant_type = "Asymmetric" if self.use_asymmetric_quant else "Symmetric" block_type = f"Block({self.block_size})" if self.block_size > 0 else 
"Row" @@ -1047,24 +1095,6 @@ def parity_check(self): ) print("Torch sample:", torch_output.cpu().reshape(-1, hidden_dim)[i, k].item()) print("ORT sample:", ort_output.cpu().reshape(-1, hidden_dim)[i, k].item()) - # Print routing and per-expert contributions for this token from the PyTorch reference - try: - hidden_states_flat = hidden_state.view(-1, hidden_dim) - token_vec = hidden_states_flat[i : i + 1] - gate_logits = self.gate(token_vec) - topk_vals, topk_experts = torch.topk(gate_logits, self.top_k, dim=-1) - topk_soft = F.softmax(topk_vals, dim=1) - print("Gate logits:", gate_logits.detach().cpu().numpy()) - print("Selected experts:", topk_experts.detach().cpu().numpy()) - print("Routing weights:", topk_soft.detach().cpu().numpy()) - # Compute per-expert contributions for selected experts - for idx_e, e in enumerate(topk_experts[0].tolist()): - expert_layer = self.experts[e] - expert_out = expert_layer(token_vec) - contrib = expert_out[0, k].item() * topk_soft[0, idx_e].item() - print(f"Expert {e} contrib at hidden {k}: {contrib}") - except Exception as _: - pass ort_dtype_quant_bits_tolerance_map = { "FP32:0": (5e-3, 1e-3), @@ -1111,6 +1141,43 @@ def small_test_cases(): yield batch_size, sequence_length +def with_mlas_q4_mode(test_cases): + expanded_cases = [] + for case in test_cases: + quant_bits = case[2] + if quant_bits == 4: + expanded_cases.append((*case, None)) + expanded_cases.append((*case, False)) + expanded_cases.append((*case, True)) + else: + expanded_cases.append((*case, None)) + return expanded_cases + + +@contextmanager +def scoped_env_var(name: str, value: str): + previous = os.environ.get(name) + os.environ[name] = value + try: + yield + finally: + if previous is None: + os.environ.pop(name, None) + else: + os.environ[name] = previous + + +def run_parity_with_mlas_q4_mode(test_runner, enable_mlas_q4_gemm: bool | None): + if enable_mlas_q4_gemm is None: # No env var + test_runner() + else: + env_value = "1" if enable_mlas_q4_gemm else "0" + 
mode = "enabled" if enable_mlas_q4_gemm else "disabled" + print(f"DirectQ4 mode ({ORT_USE_MLAS_Q4_GEMM_MOE}) is {mode}") + with scoped_env_var(ORT_USE_MLAS_Q4_GEMM_MOE, env_value): + test_runner() + + class SwigluMoEBlock(SparseMoeBlockORTHelper): def __init__( self, @@ -1128,7 +1195,7 @@ def __init__( self.num_experts = config.num_local_experts self.top_k = config.num_experts_per_token self.use_swiglu = True - self.swiglu_interleaved = True + self.swiglu_fusion = 1 self.block_size = block_size use_quant = self.quant_bits > 0 @@ -1232,7 +1299,7 @@ def __init__( self.top_k = config.num_experts_per_tok self.router_jitter_noise = config.router_jitter_noise self.use_swiglu = True - self.swiglu_interleaved = True + self.swiglu_fusion = 1 self.block_size = block_size use_quant = self.quant_bits > 0 @@ -1314,7 +1381,8 @@ def __init__( use_swiglu=self.use_swiglu, use_quant=use_quant, quant_bits=self.quant_bits, - swiglu_interleaved=self.swiglu_interleaved, + # swiglu_fusion=1 means fused and interleaved, which is the standard for QMoE. 
+ swiglu_fusion=getattr(self, "swiglu_fusion", 0), block_size=self.block_size, ) @@ -1354,8 +1422,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return final_hidden_states -disable_cpu_qmoe_tests = False - # Define test cases for different MoE types phi3_test_cases = [ (1, 32, 4), @@ -1373,10 +1439,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: ] -@unittest.skipIf(disable_cpu_qmoe_tests, "Skipping qMoE cpu tests") class TestPhiQMoECPU(unittest.TestCase): - @parameterized.expand(phi3_test_cases) - def test_phi3_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits): + @parameterized.expand(with_mlas_q4_mode(phi3_test_cases)) + def test_phi3_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits, enable_mlas_q4_gemm): # Create unique seed based on test parameters to ensure different inputs for each test base_seed = 2000 # Different base seed from other tests param_hash = hash((batch_size, sequence_length, quant_bits)) @@ -1411,10 +1476,10 @@ def test_phi3_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits): self.assertFalse(torch.isnan(torch_result).any()) self.assertFalse(torch.isinf(torch_result).any()) - phi3_moe.parity_check() + run_parity_with_mlas_q4_mode(phi3_moe.parity_check, enable_mlas_q4_gemm) - @parameterized.expand(phi3_test_cases) - def test_phi3_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits): + @parameterized.expand(with_mlas_q4_mode(phi3_test_cases)) + def test_phi3_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits, enable_mlas_q4_gemm): base_seed = 3000 param_hash = hash((batch_size, sequence_length, quant_bits)) unique_seed = base_seed + abs(param_hash) % 1000 @@ -1436,10 +1501,12 @@ def test_phi3_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, quan onnx_dtype=TensorProto.FLOAT, use_asymmetric_quant=True, ) - phi3_moe.parity_check() + run_parity_with_mlas_q4_mode(phi3_moe.parity_check, enable_mlas_q4_gemm) - 
@parameterized.expand(phi3_blockwise_test_cases) - def test_phi3_qmoe_blockwise_parity_cpu(self, batch_size, sequence_length, quant_bits, block_size): + @parameterized.expand(with_mlas_q4_mode(phi3_blockwise_test_cases)) + def test_phi3_qmoe_blockwise_parity_cpu( + self, batch_size, sequence_length, quant_bits, block_size, enable_mlas_q4_gemm + ): torch.manual_seed(42) numpy.random.seed(42) @@ -1468,10 +1535,12 @@ def test_phi3_qmoe_blockwise_parity_cpu(self, batch_size, sequence_length, quant self.assertFalse(torch.isnan(torch_result).any()) self.assertFalse(torch.isinf(torch_result).any()) - phi3_moe.parity_check() + run_parity_with_mlas_q4_mode(phi3_moe.parity_check, enable_mlas_q4_gemm) - @parameterized.expand(phi3_blockwise_test_cases) - def test_phi3_qmoe_blockwise_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits, block_size): + @parameterized.expand(with_mlas_q4_mode(phi3_blockwise_test_cases)) + def test_phi3_qmoe_blockwise_asymmetric_parity_cpu( + self, batch_size, sequence_length, quant_bits, block_size, enable_mlas_q4_gemm + ): torch.manual_seed(43) numpy.random.seed(43) @@ -1489,10 +1558,8 @@ def test_phi3_qmoe_blockwise_asymmetric_parity_cpu(self, batch_size, sequence_le block_size=block_size, use_asymmetric_quant=True, ) - phi3_moe.parity_check() - + run_parity_with_mlas_q4_mode(phi3_moe.parity_check, enable_mlas_q4_gemm) -disable_cpu_qmoe_tests = False swiglu_test_cases = [ (1, 32, 4), @@ -1510,10 +1577,9 @@ def test_phi3_qmoe_blockwise_asymmetric_parity_cpu(self, batch_size, sequence_le ] -@unittest.skipIf(disable_cpu_qmoe_tests, "Skipping qMoE cpu tests") class TestSwigluQMoECPU(unittest.TestCase): - @parameterized.expand(swiglu_test_cases) - def test_swiglu_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits): + @parameterized.expand(with_mlas_q4_mode(swiglu_test_cases)) + def test_swiglu_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits, enable_mlas_q4_gemm): # Create unique seed based on test parameters 
to ensure different inputs for each test base_seed = 1000 # Different base seed from regular MoE tests param_hash = hash((batch_size, sequence_length, quant_bits)) @@ -1547,10 +1613,10 @@ def test_swiglu_qmoe_parity_cpu(self, batch_size, sequence_length, quant_bits): self.assertFalse(torch.isnan(torch_result).any()) self.assertFalse(torch.isinf(torch_result).any()) - swiglu_moe.parity_check() + run_parity_with_mlas_q4_mode(swiglu_moe.parity_check, enable_mlas_q4_gemm) - @parameterized.expand(swiglu_test_cases) - def test_swiglu_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits): + @parameterized.expand(with_mlas_q4_mode(swiglu_test_cases)) + def test_swiglu_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits, enable_mlas_q4_gemm): base_seed = 1100 param_hash = hash((batch_size, sequence_length, quant_bits)) unique_seed = base_seed + abs(param_hash) % 1000 @@ -1572,10 +1638,12 @@ def test_swiglu_qmoe_asymmetric_parity_cpu(self, batch_size, sequence_length, qu onnx_dtype=TensorProto.FLOAT, use_asymmetric_quant=True, ) - swiglu_moe.parity_check() + run_parity_with_mlas_q4_mode(swiglu_moe.parity_check, enable_mlas_q4_gemm) - @parameterized.expand(swiglu_blockwise_test_cases) - def test_swiglu_qmoe_blockwise_parity_cpu(self, batch_size, sequence_length, quant_bits, block_size): + @parameterized.expand(with_mlas_q4_mode(swiglu_blockwise_test_cases)) + def test_swiglu_qmoe_blockwise_parity_cpu( + self, batch_size, sequence_length, quant_bits, block_size, enable_mlas_q4_gemm + ): torch.manual_seed(42) numpy.random.seed(42) @@ -1603,10 +1671,12 @@ def test_swiglu_qmoe_blockwise_parity_cpu(self, batch_size, sequence_length, qua self.assertFalse(torch.isnan(torch_result).any()) self.assertFalse(torch.isinf(torch_result).any()) - swiglu_moe.parity_check() + run_parity_with_mlas_q4_mode(swiglu_moe.parity_check, enable_mlas_q4_gemm) - @parameterized.expand(swiglu_blockwise_test_cases) - def 
test_swiglu_qmoe_blockwise_asymmetric_parity_cpu(self, batch_size, sequence_length, quant_bits, block_size): + @parameterized.expand(with_mlas_q4_mode(swiglu_blockwise_test_cases)) + def test_swiglu_qmoe_blockwise_asymmetric_parity_cpu( + self, batch_size, sequence_length, quant_bits, block_size, enable_mlas_q4_gemm + ): torch.manual_seed(43) numpy.random.seed(43) @@ -1624,7 +1694,7 @@ def test_swiglu_qmoe_blockwise_asymmetric_parity_cpu(self, batch_size, sequence_ block_size=block_size, use_asymmetric_quant=True, ) - swiglu_moe.parity_check() + run_parity_with_mlas_q4_mode(swiglu_moe.parity_check, enable_mlas_q4_gemm) @unittest.skipIf(True, "Skipping QMoE CPU benchmark tests") @@ -1633,9 +1703,6 @@ class TestQMoESwiGLUBenchmark(unittest.TestCase): def test_qmoe_swiglu_throughput_benchmark(self): """Comprehensive throughput benchmark for QMoE SwiGLU across different configurations.""" - if disable_cpu_qmoe_tests: - self.skipTest("QMoE CPU tests disabled") - print("\n=== QMoE SwiGLU Throughput Benchmark ===") # Test configurations: (name, hidden_size, intermediate_size, num_experts, top_k, quant_bits) diff --git a/onnxruntime/test/python/transformers/test_whisper.py b/onnxruntime/test/python/transformers/test_whisper.py index e3ca8e6b6ac9c..e90a14f8d7d61 100644 --- a/onnxruntime/test/python/transformers/test_whisper.py +++ b/onnxruntime/test/python/transformers/test_whisper.py @@ -471,8 +471,9 @@ def export(self, model, inputs, input_names, output_names, dynamic_axes): input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=17, + opset_version=18, do_constant_folding=True, + dynamo=False, verbose=False, ) @@ -530,9 +531,7 @@ def test_hf_whisper_encoder_self_attention(self, precision, ep): use_gpu=True, only_onnxruntime=False, ) - name = f"hf_{precision}_encoder_self_attention.onnx" - # optimized_model.save_model_to_file(name) # Uncomment for debugging purposes - self.verify_fusion(optimized_model, name) + 
self.verify_fusion(optimized_model, f"hf_{precision}_encoder_self_attention.onnx") @parameterized.expand( [ diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 4e991716dd108..e472cbcee12d6 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -4918,6 +4918,93 @@ TEST(CApiTest, ModelWithExternalDataOutsideModelDirectoryShouldFailToLoad) { << "Exception message should indicate external data or security issue. Got: " << exception_message; } +TEST(CApiTest, InMemoryModel_ExternalDataOutsideWorkingDirectory_FailToLoad) { + // Attempt to create an ORT session with the malicious model (loaded from bytes). + // This should fail due to the use of an external file path that is not under current working directory. + // i.e. ../../../../etc/passwd + constexpr const ORTCHAR_T* model_path = TSTR("testdata/test_arbitrary_external_file.onnx"); + + Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test"); + Ort::SessionOptions session_options; + + // Load model contents into array + std::ifstream model_file_stream(model_path, std::ios::in | std::ios::binary); + ASSERT_TRUE(model_file_stream.good()); + model_file_stream.seekg(0, std::ios::end); + const auto file_contents_size = onnxruntime::narrow(model_file_stream.tellg()); + model_file_stream.seekg(0, std::ios::beg); + std::vector file_contents(file_contents_size, 0); + model_file_stream.read(&file_contents[0], file_contents_size); + model_file_stream.close(); + + bool exception_thrown = false; + std::string exception_message; + + try { + // This should throw an exception due to malicious external data + Ort::Session session(env, file_contents.data(), file_contents_size, session_options); + } catch (const Ort::Exception& e) { + exception_thrown = true; + exception_message = e.what(); + } catch (const std::exception& e) { + exception_thrown = true; + exception_message = e.what(); + } + + // Verify that loading the model 
failed + EXPECT_TRUE(exception_thrown) << "Expected model loading to fail due to malicious external data path"; + + // Verify that the exception message indicates security or external data issues + EXPECT_TRUE(exception_message.find("External data path") != std::string::npos && + exception_message.find("escapes working directory") != std::string::npos) + << "Exception message should indicate external data or security issue. Got: " << exception_message; +} + +TEST(CApiTest, InMemoryModel_SessionConfigExternalFileFolder_ExternalDataOutsideModelDirectory_FailToLoad) { + // Attempt to create an ORT session with the malicious model (loaded from bytes). + // A valid external file folder path is explicitly set via session options. + // However, this should still fail due to the use of an external file path that escapes the set directory. + // i.e. ../../../../etc/passwd + constexpr const ORTCHAR_T* model_path = TSTR("testdata/test_arbitrary_external_file.onnx"); + + Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "test"); + Ort::SessionOptions session_options; + session_options.AddConfigEntry(kOrtSessionOptionsModelExternalInitializersFileFolderPath, "testdata"); + + // Load model contents into array + std::ifstream model_file_stream(model_path, std::ios::in | std::ios::binary); + ASSERT_TRUE(model_file_stream.good()); + model_file_stream.seekg(0, std::ios::end); + const auto file_contents_size = onnxruntime::narrow(model_file_stream.tellg()); + model_file_stream.seekg(0, std::ios::beg); + std::vector file_contents(file_contents_size, 0); + model_file_stream.read(&file_contents[0], file_contents_size); + model_file_stream.close(); + + bool exception_thrown = false; + std::string exception_message; + + try { + // This should throw an exception due to malicious external data + Ort::Session session(env, file_contents.data(), file_contents_size, session_options); + } catch (const Ort::Exception& e) { + exception_thrown = true; + exception_message = e.what(); + } catch (const 
std::exception& e) { + exception_thrown = true; + exception_message = e.what(); + } + + // Verify that loading the model failed + EXPECT_TRUE(exception_thrown) << "Expected model loading to fail due to malicious external data path"; + + // Verify that the exception message indicates security or external data issues + EXPECT_TRUE(exception_message.find("External data path") != std::string::npos && + exception_message.find("escapes both model directory") != std::string::npos && + exception_message.find("and real model directory") != std::string::npos) + << "Exception message should indicate external data or security issue. Got: " << exception_message; +} + #ifdef ORT_ENABLE_STREAM #if USE_CUDA diff --git a/samples/cxx/CMakeLists.txt b/samples/cxx/CMakeLists.txt new file mode 100644 index 0000000000000..875e37c64eda2 --- /dev/null +++ b/samples/cxx/CMakeLists.txt @@ -0,0 +1,50 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +cmake_minimum_required(VERSION 3.28) + +project(onnxruntime_sample CXX) + +set(CMAKE_CXX_STANDARD 20) + +foreach(VAR IN ITEMS ORT_LIBRARY_DIR ORT_HEADER_DIR) + if (NOT DEFINED ${VAR}) + message(FATAL_ERROR "Required variable ${VAR} is not set. 
" + "Set ORT_LIBRARY_DIR to the ONNX Runtime lib directory and " + "ORT_HEADER_DIR to the ONNX Runtime include directory.") + endif() +endforeach() + +# Resolve to absolute paths +get_filename_component(ORT_LIBRARY_DIR "${ORT_LIBRARY_DIR}" ABSOLUTE) +get_filename_component(ORT_HEADER_DIR "${ORT_HEADER_DIR}" ABSOLUTE) + +# +# onnxruntime_sample_program +# +block() +add_executable(onnxruntime_sample_program) + +target_sources(onnxruntime_sample_program PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/main.cc) + +target_include_directories(onnxruntime_sample_program PRIVATE ${ORT_HEADER_DIR}) + +target_link_directories(onnxruntime_sample_program PRIVATE ${ORT_LIBRARY_DIR}) +target_link_libraries(onnxruntime_sample_program PRIVATE onnxruntime) + +# Copy ONNX Runtime shared libraries next to the executable. +# Collect shared library files from the ORT library directory based on platform. +if (WIN32) + file(GLOB ORT_SHARED_LIBS "${ORT_LIBRARY_DIR}/*.dll") +elseif (APPLE) + file(GLOB ORT_SHARED_LIBS "${ORT_LIBRARY_DIR}/*.dylib") +else() + file(GLOB ORT_SHARED_LIBS "${ORT_LIBRARY_DIR}/*.so" "${ORT_LIBRARY_DIR}/*.so.*") +endif() + +add_custom_command(TARGET onnxruntime_sample_program POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + ${ORT_SHARED_LIBS} + $ +) +endblock() diff --git a/samples/cxx/README.md b/samples/cxx/README.md new file mode 100644 index 0000000000000..1904c082cef7a --- /dev/null +++ b/samples/cxx/README.md @@ -0,0 +1,92 @@ +# ONNX Runtime C++ Sample + +A minimal C++ program demonstrating basic ONNX Runtime inference. It loads an ONNX model that adds two float tensors (`C = A + B`), runs inference, and verifies the result. 
+ +## Prerequisites + +- CMake 3.28 or later +- C++20 compatible compiler (e.g., Visual Studio 2022) +- An ONNX Runtime release package (download from [GitHub releases](https://github.com/microsoft/onnxruntime/releases)) +- For model generation: + - Python with the `onnx` package + +## Directory Structure + +``` +samples/cxx/ +├── CMakeLists.txt # Build configuration +├── main.cc # Sample program source +├── add_model.onnx # ONNX model (C = A + B) +├── generate_model.py # Script to generate the ONNX model +└── README.md # This file +``` + +## Steps + +### 1. Extract the ONNX Runtime package + +Download and extract an ONNX Runtime release archive. For example: + +``` +tar -xf onnxruntime-win-x64-1.25.0.zip +``` + +This creates a directory like `onnxruntime-win-x64-1.25.0/` containing `include/` and `lib/` subdirectories. + +### 2. [Optional] Generate the ONNX model + +``` +cd samples/cxx +pip install onnx +python generate_model.py +``` + +This creates `add_model.onnx` in the current directory. + +### 3. Configure and build + +From the `samples/cxx` directory: + +**Windows:** +``` +cmake -S . -B build ^ + -DORT_HEADER_DIR:PATH=path\to\onnxruntime-win-x64-1.25.0\include ^ + -DORT_LIBRARY_DIR:PATH=path\to\onnxruntime-win-x64-1.25.0\lib +cmake --build build --config Release +``` + +**Linux / macOS:** +``` +cmake -S . -B build \ + -DORT_HEADER_DIR:PATH=path/to/onnxruntime-linux-x64-1.25.0/include \ + -DORT_LIBRARY_DIR:PATH=path/to/onnxruntime-linux-x64-1.25.0/lib +cmake --build build --config Release +``` + +Adjust the paths to match your extracted package name and location. + +The build automatically copies the ONNX Runtime shared libraries next to the executable. + +#### CMake Variables + +| Variable | Description | +|---|---| +| `ORT_HEADER_DIR` | Path to the ONNX Runtime `include` directory | +| `ORT_LIBRARY_DIR` | Path to the ONNX Runtime `lib` directory | + +### 4. 
Run + +**Windows:** +``` +build\Release\onnxruntime_sample_program.exe +``` + +**Linux / macOS:** +``` +./build/onnxruntime_sample_program +``` + +You can also pass a model path as an argument: +``` +onnxruntime_sample_program path/to/add_model.onnx +``` diff --git a/samples/cxx/add_model.onnx b/samples/cxx/add_model.onnx new file mode 100644 index 0000000000000..36308c1372a22 Binary files /dev/null and b/samples/cxx/add_model.onnx differ diff --git a/samples/cxx/generate_model.py b/samples/cxx/generate_model.py new file mode 100644 index 0000000000000..9ac70ab29deb4 --- /dev/null +++ b/samples/cxx/generate_model.py @@ -0,0 +1,42 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +"""Generate a simple ONNX model that computes C = A + B. + +Inputs: + A : float tensor of shape [1, 3] + B : float tensor of shape [1, 3] + +Output: + C : float tensor of shape [1, 3] + +Usage: + pip install onnx + python generate_model.py +""" + +from onnx import TensorProto, helper, save_model +from onnx.checker import check_model + + +def main(): + # Define inputs and output + a = helper.make_tensor_value_info("A", TensorProto.FLOAT, [1, 3]) + b = helper.make_tensor_value_info("B", TensorProto.FLOAT, [1, 3]) + c = helper.make_tensor_value_info("C", TensorProto.FLOAT, [1, 3]) + + # Create the Add node + add_node = helper.make_node("Add", inputs=["A", "B"], outputs=["C"]) + + # Build the graph and model + graph = helper.make_graph([add_node], "add_graph", [a, b], [c]) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) + + # Validate and save + check_model(model) + save_model(model, "add_model.onnx") + print("Saved add_model.onnx") + + +if __name__ == "__main__": + main() diff --git a/samples/cxx/main.cc b/samples/cxx/main.cc new file mode 100644 index 0000000000000..4e31e033ab8c7 --- /dev/null +++ b/samples/cxx/main.cc @@ -0,0 +1,170 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT License. + +// Sample program demonstrating basic ONNX Runtime C++ API usage. +// Loads a simple ONNX model (C = A + B), runs inference, and prints the result. +// +// Generate the model first: python generate_model.py + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "onnxruntime_cxx_api.h" + +// Throw std::runtime_error if `condition` is false. Includes file and line info. +#define THROW_IF_NOT(condition) \ + do { \ + if (!(condition)) { \ + throw std::runtime_error(std::string(__FILE__) + ":" + \ + std::to_string(__LINE__) + ": " + \ + "check failed: " #condition); \ + } \ + } while (0) + +int main(int argc, char* argv[]) { + try { + // ----------------------------------------------------------------------- + // 1. Initialize the ONNX Runtime environment + // ----------------------------------------------------------------------- + Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "onnxruntime_sample"); + std::cout << "ONNX Runtime version: " << Ort::GetVersionString() << "\n\n"; + + // ----------------------------------------------------------------------- + // 2. Create session options (could add execution providers here) + // ----------------------------------------------------------------------- + Ort::SessionOptions session_options; + session_options.SetIntraOpNumThreads(1); + session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_BASIC); + + // ----------------------------------------------------------------------- + // 3. Load the ONNX model from a file + // Generate with: python generate_model.py + // ----------------------------------------------------------------------- + const std::filesystem::path model_path = (argc > 1) ? 
argv[1] : "add_model.onnx"; + std::cout << "Loading model: " << model_path.string() << "\n"; + + Ort::Session session(env, model_path.native().c_str(), session_options); + + // ----------------------------------------------------------------------- + // 4. Query model metadata: input/output names and shapes + // ----------------------------------------------------------------------- + Ort::AllocatorWithDefaultOptions allocator; + + const size_t num_inputs = session.GetInputCount(); + const size_t num_outputs = session.GetOutputCount(); + std::cout << "Model inputs: " << num_inputs << "\n"; + std::cout << "Model outputs: " << num_outputs << "\n"; + + // Collect input/output names + std::vector input_names; + std::vector output_names; + + for (size_t i = 0; i < num_inputs; ++i) { + auto name = session.GetInputNameAllocated(i, allocator); + std::cout << " Input " << i << ": " << name.get() << "\n"; + input_names.emplace_back(name.get()); + } + for (size_t i = 0; i < num_outputs; ++i) { + auto name = session.GetOutputNameAllocated(i, allocator); + std::cout << " Output " << i << ": " << name.get() << "\n"; + output_names.emplace_back(name.get()); + } + std::cout << "\n"; + + // ----------------------------------------------------------------------- + // 5. Prepare input tensors + // ----------------------------------------------------------------------- + // Our model expects two float tensors of shape [1, 3]. 
+ constexpr int64_t batch_size = 1; + constexpr int64_t num_elements = 3; + const std::array<int64_t, 2> input_shape = {batch_size, num_elements}; + + std::array<float, 3> input_a = {1.0f, 2.0f, 3.0f}; + std::array<float, 3> input_b = {4.0f, 5.0f, 6.0f}; + + auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + + auto tensor_a = Ort::Value::CreateTensor<float>( + memory_info, input_a.data(), input_a.size(), + input_shape.data(), input_shape.size()); + + auto tensor_b = Ort::Value::CreateTensor<float>( + memory_info, input_b.data(), input_b.size(), + input_shape.data(), input_shape.size()); + + THROW_IF_NOT(tensor_a.IsTensor()); + THROW_IF_NOT(tensor_b.IsTensor()); + + // The Run() API expects arrays of C strings for input/output names. + std::vector<const char*> input_name_ptrs; + std::vector<const char*> output_name_ptrs; + for (const auto& n : input_names) input_name_ptrs.push_back(n.c_str()); + for (const auto& n : output_names) output_name_ptrs.push_back(n.c_str()); + + std::array<Ort::Value, 2> input_tensors{std::move(tensor_a), std::move(tensor_b)}; + + // ----------------------------------------------------------------------- + // 6. Run inference + // ----------------------------------------------------------------------- + std::cout << "Running inference...\n"; + + Ort::RunOptions run_options; + auto output_tensors = session.Run( + run_options, + input_name_ptrs.data(), input_tensors.data(), input_tensors.size(), + output_name_ptrs.data(), output_name_ptrs.size()); + + // ----------------------------------------------------------------------- + // 7.
Process output + // ----------------------------------------------------------------------- + THROW_IF_NOT(!output_tensors.empty() && output_tensors[0].IsTensor()); + + const float* output_data = output_tensors[0].GetTensorData<float>(); + auto type_info = output_tensors[0].GetTensorTypeAndShapeInfo(); + size_t output_count = type_info.GetElementCount(); + + std::cout << "\nInputs:\n"; + std::cout << " A = ["; + for (size_t i = 0; i < input_a.size(); ++i) { + std::cout << (i ? ", " : "") << input_a[i]; + } + std::cout << "]\n"; + + std::cout << " B = ["; + for (size_t i = 0; i < input_b.size(); ++i) { + std::cout << (i ? ", " : "") << input_b[i]; + } + std::cout << "]\n"; + + std::cout << "\nOutput (A + B):\n"; + std::cout << " C = ["; + for (size_t i = 0; i < output_count; ++i) { + std::cout << (i ? ", " : "") << output_data[i]; + } + std::cout << "]\n"; + + // Verify correctness + bool correct = true; + for (size_t i = 0; i < num_elements; ++i) { + if (output_data[i] != input_a[i] + input_b[i]) { + correct = false; + break; + } + } + std::cout << "\nResult: " << (correct ? "PASS" : "FAIL") << "\n"; + + return correct ? EXIT_SUCCESS : EXIT_FAILURE; + } catch (const Ort::Exception& e) { + std::cerr << "ONNX Runtime error: " << e.what() << "\n"; + return EXIT_FAILURE; + } catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << "\n"; + return EXIT_FAILURE; + } +} diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index c8beeef8aa509..de64183e1bb18 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1761,7 +1761,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): # Install cpu only version of torch when cuda is not enabled in Linux.
extra = [] if args.use_cuda and is_linux() else ["--index-url", "https://download.pytorch.org/whl/cpu"] run_subprocess( - [sys.executable, "-m", "pip", "install", "torch==2.8.0", "torchvision==0.23.0", *extra], + [sys.executable, "-m", "pip", "install", "torch==2.10.0", "torchvision==0.25.0", *extra], cwd=cwd, dll_path=dll_path, python_path=python_path, @@ -1838,11 +1838,9 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): [sys.executable, "-m", "unittest", "discover", "-s", "quantization"], cwd=cwd, dll_path=dll_path ) - # onnx package does not support python 3.14 yet so skip the transformers tests for python 3.14. - # we can remove this check when onnx package supports python 3.14. if args.enable_transformers_tool_test and (sys.version_info.major, sys.version_info.minor) < ( 3, - 14, + 15, ): import google.protobuf # noqa: PLC0415 import numpy # noqa: PLC0415 diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml index 8d96c1ae99e0a..5ddac928b32d3 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-test-pipelines.yml @@ -160,7 +160,61 @@ stages: NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu.Linux' CudaVersion: 12.8 +- template: templates/test-binary-archive-stage.yml + parameters: + artifactName: onnxruntime-linux-aarch64 + artifactPipelineResource: build + previousStageName: Setup + platform: linux-aarch64 + agentPool: onnxruntime-linux-ARM64-CPU-2019 + +- template: templates/test-binary-archive-stage.yml + parameters: + artifactName: onnxruntime-linux-x64 + artifactPipelineResource: build + previousStageName: Setup + platform: linux-x64 + agentPool: onnxruntime-Ubuntu2204-AMD-CPU +- template: templates/test-binary-archive-stage.yml + parameters: + artifactName: onnxruntime-osx-arm64 + artifactPipelineResource: build + previousStageName: 
Setup + platform: osx-arm64 + agentPool: + name: AcesShared + os: macOS + demands: + - ImageOverride -equals ACES_VM_SharedPool_Sequoia + agentSetupSteps: + - template: templates/setup-build-tools.yml + parameters: + host_cpu_arch: arm64 + +- template: templates/test-binary-archive-stage.yml + parameters: + artifactName: onnxruntime-win-arm64 + artifactPipelineResource: build + previousStageName: Setup + platform: win-arm64 + agentPool: onnxruntime-qnn-windows-vs-2022-arm64 + +- template: templates/test-binary-archive-stage.yml + parameters: + artifactName: onnxruntime-win-arm64x + artifactPipelineResource: build + previousStageName: Setup + platform: win-arm64x + agentPool: onnxruntime-qnn-windows-vs-2022-arm64 + +- template: templates/test-binary-archive-stage.yml + parameters: + artifactName: onnxruntime-win-x64 + artifactPipelineResource: build + previousStageName: Setup + platform: win-x64 + agentPool: onnxruntime-Win-CPU-VS2022-Latest # Run GPU tests. - stage: Windows_Packaging_cuda_Testing diff --git a/tools/ci_build/github/azure-pipelines/dml-nuget-packaging.yml b/tools/ci_build/github/azure-pipelines/dml-nuget-packaging.yml index 3cf28655c36e7..0b63a4f5b83c1 100644 --- a/tools/ci_build/github/azure-pipelines/dml-nuget-packaging.yml +++ b/tools/ci_build/github/azure-pipelines/dml-nuget-packaging.yml @@ -111,3 +111,4 @@ extends: - template: stages/nuget_dml_packaging_stage.yml parameters: DoEsrp: ${{ parameters.DoEsrp }} + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index 0481a356cf9a1..2a8e222a9e192 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -96,6 +96,5 @@ extends: QnnSdk: ${{ parameters.QnnSdk }} IsReleaseBuild: ${{ parameters.IsReleaseBuild }} DoEsrp: ${{ 
parameters.DoEsrp }} - ArtifactName: 'drop-nuget-qnn-arm64x' StageName: 'OnnxRuntime_QNN_Nuget_Win_Arm64x' build_config: ${{ parameters.build_config }} diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml index 7e47227c23d5b..385cee35eb95d 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml @@ -175,8 +175,6 @@ stages: - stage: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests dependsOn: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Build - # Skip this stage for Python 3.14 for now until onnx package support python 3.14. - condition: and(succeeded(), ne('${{ parameters.PYTHON_VERSION }}', '3.14')) jobs: - job: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests workspace: diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-webgpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-webgpu-stage.yml index 8bd8521d80104..1897d94db76c7 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-win-webgpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-win-webgpu-stage.yml @@ -131,8 +131,6 @@ stages: - stage: Win_py_webgpu_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests dependsOn: Win_py_webgpu_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Build - # Skip this stage for Python 3.14 for now until onnx package support python 3.14. 
- condition: and(succeeded(), ne('${{ parameters.PYTHON_VERSION }}', '3.14')) jobs: - job: Win_py_webgpu_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests workspace: diff --git a/tools/ci_build/github/azure-pipelines/templates/build-win-arm64x-steps.yml b/tools/ci_build/github/azure-pipelines/templates/build-win-arm64x-steps.yml new file mode 100644 index 0000000000000..50e7cbb13d6e1 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/build-win-arm64x-steps.yml @@ -0,0 +1,28 @@ +# Runs a Windows ARM64X build in `buildDirectory`. + +parameters: + buildDirectory: '$(Build.BinariesDirectory)' + additionalBuildPyArgs: '' + +steps: +- task: PythonScript@0 + displayName: 'Build arm64 project for arm64x - generate the def & lib file for next build' + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: > + ${{ parameters.additionalBuildPyArgs }} + --build_shared_lib + --arm64 + --buildasx + --build_dir="${{ parameters.buildDirectory }}/arm64" + +- task: PythonScript@0 + displayName: 'Build arm64ec project for arm64x - the real arm64x' + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: > + ${{ parameters.additionalBuildPyArgs }} + --build_shared_lib + --arm64ec + --buildasx + --build_dir="${{ parameters.buildDirectory }}" diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows-qnn.yml deleted file mode 100644 index ab3e0ebaab39a..0000000000000 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows-qnn.yml +++ /dev/null @@ -1,141 +0,0 @@ -# sets up common build tools for the windows build machines before build - -parameters: -- name: DoEsrp - displayName: Run code sign tasks? Must be true if you are doing an Onnx Runtime release. 
- type: boolean - default: true - -- name: buildConfig - displayName: buildConfig - type: string - default: 'RelWithDebInfo' - -- name: artifactName - displayName: artifactName,like 'onnxruntime-win-x64-1.6.0' - type: string - default: '' - -- name: artifactNameNoVersionString - type: string - default: 'onnxruntime-win-x64' - -- name: commitId - displayName: commitId - type: string - default: '' - -- name: trtEnabled - displayName: Include TRT EP libraries? - type: boolean - default: true - -steps: - - ${{if or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-'))}}: - - template: publish-symbolrequestprod-api.yml - parameters: - ${{if eq(variables['Build.SourceBranch'], 'refs/heads/main')}}: - symbolExpiryTime: 60 - includePublicSymbolServer: true - symbolsArtifactName: ${{parameters.artifactNameNoVersionString}} - symbolsVersion: $(Build.BuildId) - symbolProject: 'ONNX Runtime' - subscription: 'OnnxrunTimeCodeSign_20240611' - searchPattern: | - $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime.pdb - $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_*.pdb - - - - task: CmdLine@2 - displayName: 'Copy build artifacts for zipping' - inputs: - script: | - mkdir $(Build.BinariesDirectory)\${{parameters.artifactName}} - mkdir $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - mkdir $(Build.BinariesDirectory)\${{parameters.artifactName}}\include - - if exist $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_cuda.dll ( - echo "cuda context headers copied" - mkdir $(Build.BinariesDirectory)\${{parameters.artifactName}}\include\core\providers\cuda - copy $(Build.SourcesDirectory)\include\onnxruntime\core\providers\resource.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include\core\providers - copy 
$(Build.SourcesDirectory)\include\onnxruntime\core\providers\custom_op_context.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include\core\providers - copy $(Build.SourcesDirectory)\include\onnxruntime\core\providers\cuda\cuda_context.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include\core\providers\cuda - copy $(Build.SourcesDirectory)\include\onnxruntime\core\providers\cuda\cuda_resource.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include\core\providers\cuda - ) - - echo "Directories created" - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_shared.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_shared.lib $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_shared.pdb $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_cuda.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_cuda.pdb $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_cuda.lib $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - - # Copy WebGPU dependencies if required - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\dxcompiler.dll 
$(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\dxil.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - - # Copy QNN dependencies if required - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_qnn.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\libQnnHtp*.so $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib /Y - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\libqnnhtp*.cat $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib /Y - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnCpu.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnGpu.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtp.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpPrepare.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV68Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV73Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV81Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy 
$(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSaver.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSystem.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\Qualcomm_LICENSE.pdf $(Build.BinariesDirectory)\${{parameters.artifactName}} - - # copy trt ep libraries only when trt ep is enabled - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_tensorrt.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_tensorrt.pdb $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_tensorrt.lib $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime.pdb $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime.lib $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib - copy $(Build.SourcesDirectory)\include\onnxruntime\core\session\onnxruntime_*.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include - copy $(Build.SourcesDirectory)\include\onnxruntime\core\framework\provider_options.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include - copy $(Build.SourcesDirectory)\include\onnxruntime\core\providers\cpu\cpu_provider_factory.h $(Build.BinariesDirectory)\${{parameters.artifactName}}\include - copy $(Build.SourcesDirectory)\orttraining\orttraining\training_api\include\onnxruntime_training*.h 
$(Build.BinariesDirectory)\${{parameters.artifactName}}\include - - REM copy the README, license and TPN - copy $(Build.SourcesDirectory)\README.md $(Build.BinariesDirectory)\${{parameters.artifactName}}\README.md - copy $(Build.SourcesDirectory)\docs\Privacy.md $(Build.BinariesDirectory)\${{parameters.artifactName}}\Privacy.md - copy $(Build.SourcesDirectory)\LICENSE $(Build.BinariesDirectory)\${{parameters.artifactName}}\LICENSE - copy $(Build.SourcesDirectory)\ThirdPartyNotices.txt $(Build.BinariesDirectory)\${{parameters.artifactName}}\ThirdPartyNotices.txt - copy $(Build.SourcesDirectory)\VERSION_NUMBER $(Build.BinariesDirectory)\${{parameters.artifactName}}\VERSION_NUMBER - @echo ${{parameters.commitId}} > $(Build.BinariesDirectory)\${{parameters.artifactName}}\GIT_COMMIT_ID - - workingDirectory: '$(Build.BinariesDirectory)\${{parameters.buildConfig}}' - - - ${{ if eq(parameters.DoEsrp, true) }}: - - template: win-esrp-dll.yml - parameters: - FolderPath: '$(Build.BinariesDirectory)\${{parameters.artifactName}}' - DisplayName: 'ESRP - Sign Native dlls' - DoEsrp: ${{parameters.DoEsrp}} - Pattern: '*.dll,*.exe' - - - task: DeleteFiles@1 - displayName: 'Delete CodeSignSummary*.md' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\${{parameters.artifactName}}' - Contents: 'CodeSignSummary*.md' - - - task: ArchiveFiles@2 - inputs: - rootFolderOrFile: '$(Build.BinariesDirectory)\${{parameters.artifactName}}' - includeRootFolder: true - archiveType: 'zip' # Options: zip, 7z, tar, wim - archiveFile: '$(Build.ArtifactStagingDirectory)\${{parameters.artifactName}}.zip' - replaceExistingArchive: true - - - task: 1ES.PublishPipelineArtifact@1 - inputs: - targetPath: '$(Build.ArtifactStagingDirectory)' - artifactName: '${{parameters.artifactNameNoVersionString}}' diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml 
b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml index 28a1960aac27b..5f9dd5677e7bc 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml @@ -12,7 +12,7 @@ parameters: default: 'RelWithDebInfo' - name: artifactName - displayName: artifactName,like 'onnxruntime-win-x64-1.6.0' + displayName: artifactName, like 'onnxruntime-win-x64-1.6.0' type: string default: '' @@ -30,6 +30,11 @@ parameters: type: boolean default: true +- name: publishArtifactStagingDirectory + displayName: Whether to publish the artifact staging directory as an artifact named `artifactNameNoVersionString`. + type: boolean + default: false + steps: - ${{if or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-'))}}: - template: publish-symbolrequestprod-api.yml @@ -89,6 +94,7 @@ steps: copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV81Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSaver.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSystem.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib + copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\Qualcomm_LICENSE.pdf $(Build.BinariesDirectory)\${{parameters.artifactName}} # copy trt ep libraries only when trt ep is enabled copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\onnxruntime_providers_tensorrt.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib @@ -133,3 +139,9 @@ steps: archiveType: 'zip' # Options: 
zip, 7z, tar, wim archiveFile: '$(Build.ArtifactStagingDirectory)\${{parameters.artifactName}}.zip' replaceExistingArchive: true + + - ${{ if parameters.publishArtifactStagingDirectory }}: + - task: 1ES.PublishPipelineArtifact@1 + inputs: + targetPath: '$(Build.ArtifactStagingDirectory)' + artifactName: '${{parameters.artifactNameNoVersionString}}' diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index a0f023325be04..448dbafcaaaac 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -163,6 +163,20 @@ stages: PreReleaseVersionSuffixString: ${{ parameters.PreReleaseVersionSuffixString }} PreReleaseVersionSuffixNumber: ${{ parameters.PreReleaseVersionSuffixNumber }} +- template: win-ci.yml + parameters: + DoEsrp: true + stage_name_suffix: CPU_arm64x_${{ parameters.BuildVariant }} + buildArch: x64 + msbuildPlatform: arm64x + packageName: arm64x + buildparameter: ${{ parameters.AdditionalBuildFlags }} ${{ parameters.AdditionalWinBuildFlags}} + runTests: false + buildJava: false + buildNodejs: false + PreReleaseVersionSuffixString: ${{ parameters.PreReleaseVersionSuffixString }} + PreReleaseVersionSuffixNumber: ${{ parameters.PreReleaseVersionSuffixNumber }} + - template: win-ci.yml parameters: DoEsrp: true diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml index 09603f2350657..326cfd7829f2f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-cpu.yml @@ -149,20 +149,18 @@ jobs: - powershell: | - if ("$(PythonVersion)" -notcontains "3.14") { - python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq - Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*.whl | foreach {pip --disable-pip-version-check install 
--upgrade $_.fullname tabulate} - Remove-Item -Recurse -Force onnxruntime - if ("$(ExtraParam)".Split() -contains "--use_azure") { - - if( "${{parameters.architecture}}" -eq 'arm64') { - $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\arm64-windows\bin;$env:path" - } else { - $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x64-windows\bin;$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x86-windows\bin;$env:path" - } - python onnxruntime_test_python_azure.py + python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq + Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} + Remove-Item -Recurse -Force onnxruntime + if ("$(ExtraParam)".Split() -contains "--use_azure") { + + if( "${{parameters.architecture}}" -eq 'arm64') { + $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\arm64-windows\bin;$env:path" + } else { + $env:path="$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x64-windows\bin;$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\_deps\vcpkg-src\installed\x86-windows\bin;$env:path" } - python onnx_backend_test_series.py + python onnxruntime_test_python_azure.py } + python onnx_backend_test_series.py workingDirectory: '$(Build.SourcesDirectory)\build\${{ parameters.cmake_build_type }}\${{ parameters.cmake_build_type }}' displayName: 'Run Python Tests' diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index 8a1c4f8a39316..7e176b67f6685 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -48,7 +48,7 @@ stages: variables: 
OrtPackageId: ${{ parameters.OrtNugetPackageId }} ReleaseVersionSuffix: $[stageDependencies.Setup.Set_Variables.outputs['Set_Release_Version_Suffix.ReleaseVersionSuffix']] - commonBuildArgs: '--skip_submodule_sync --build_shared_lib --client_package_build --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags ${{ parameters.AdditionalBuildArgs}}' + commonBuildArgs: '--skip_submodule_sync --build_shared_lib --client_package_build --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags ${{ parameters.AdditionalBuildArgs}}' steps: - template: set-version-number-variables-step.yml @@ -61,17 +61,10 @@ stages: parameters: QnnSDKVersion: ${{ parameters.QnnSdk }} - - task: PythonScript@0 - displayName: 'Build arm64x project - generate the def & lib file for next build' - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: ' --arm64 --buildasx --build_dir $(Build.BinariesDirectory)\arm64x --use_qnn --qnn_home $(QnnSDKRootDir) $(commonBuildArgs)' - - - task: PythonScript@0 - displayName: 'Build arm64ecx project - the real arm64x' - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: ' --arm64ec --buildasx --build_dir $(Build.BinariesDirectory) --use_qnn --qnn_home $(QnnSDKRootDir) $(commonBuildArgs)' + - template: build-win-arm64x-steps.yml + parameters: + buildDirectory: '$(Build.BinariesDirectory)' + additionalBuildPyArgs: '$(commonBuildArgs) --use_qnn --qnn_home $(QnnSDKRootDir)' - task: CmdLine@2 displayName: 'Print contents of binaries directory' @@ -87,12 +80,13 @@ stages: Pattern: 'onnxruntime*.dll' - ${{ if eq(parameters.PublishArchive, true) }}: - - template: c-api-artifacts-package-and-publish-steps-windows-qnn.yml
+ - template: c-api-artifacts-package-and-publish-steps-windows.yml parameters: buildConfig: ${{ parameters.build_config }} artifactName: 'onnxruntime-win-arm64x-qnn' artifactNameNoVersionString: 'onnxruntime-win-arm64x-qnn' DoEsrp: ${{ parameters.DoEsrp }} + publishArtifactStagingDirectory: true - task: MSBuild@1 displayName: 'Restore NuGet Packages and create project.assets.json' diff --git a/tools/ci_build/github/azure-pipelines/templates/set-variable.yml b/tools/ci_build/github/azure-pipelines/templates/set-variable.yml new file mode 100644 index 0000000000000..2cf49f2f067c2 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/set-variable.yml @@ -0,0 +1,32 @@ +# Sets an ADO pipeline variable. +# See https://learn.microsoft.com/en-us/azure/devops/pipelines/process/set-variables-scripts + +parameters: +- name: name + type: string + +- name: value + type: string + +steps: +- task: PythonScript@0 + displayName: 'Set variable - ${{ parameters.name }}' + inputs: + scriptSource: inline + script: | + import os + + variable_name = os.getenv("VARIABLE_NAME") + variable_value = os.getenv("VARIABLE_VALUE") + + if not variable_name.isidentifier(): + raise ValueError(f"Variable name is not a valid identifier: '{variable_name}'") + + if "\n" in variable_value: + raise ValueError(f"Variable value should not contain any newlines: '{variable_value}'") + + print(f"Setting variable: {variable_name} = '{variable_value}'") + print(f"##vso[task.setvariable variable={variable_name}]{variable_value}") + env: + VARIABLE_NAME: ${{ parameters.name }} + VARIABLE_VALUE: ${{ parameters.value }} diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml index 7d6e272533696..8303547a47566 100644 --- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml +++ 
b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -22,7 +22,7 @@ stages: buildSettingsFile: "tools/ci_build/github/apple/default_full_apple_framework_build_settings.json" cPodName: onnxruntime-c objcPodName: onnxruntime-objc - timeoutInMinutes: 270 + timeoutInMinutes: 360 templateContext: outputs: - output: pipelineArtifact diff --git a/tools/ci_build/github/azure-pipelines/templates/test-binary-archive-stage.yml b/tools/ci_build/github/azure-pipelines/templates/test-binary-archive-stage.yml new file mode 100644 index 0000000000000..b9b9cdc6b0eb3 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/test-binary-archive-stage.yml @@ -0,0 +1,121 @@ +# Tests an ONNX Runtime binary archive produced by the packaging pipeline. + +parameters: +- name: artifactName + type: string +- name: artifactPipelineResource + type: string +- name: previousStageName + type: string + default: '' +- name: platform + type: string +- name: agentPool + type: object +- name: agentSetupSteps + type: stepList + default: [] + +stages: +- stage: Binary_Archive_Testing_${{ replace(parameters.platform, '-', '_') }} + ${{ if ne(parameters.previousStageName, '') }}: + dependsOn: ${{ parameters.previousStageName }} + + jobs: + - job: Binary_Archive_Testing_${{ replace(parameters.platform, '-', '_') }} + pool: ${{ parameters.agentPool }} + + variables: + - name: buildConfig + value: Release + - name: relativePathFromBuildToOutputDir + ${{ if startsWith(parameters.platform, 'win') }}: + value: "${{ variables['buildConfig'] }}" + ${{ else }}: + value: "." + + steps: + - checkout: self + clean: true + submodules: none + + - ${{ each agentSetupStep in parameters.agentSetupSteps }}: + - ${{ agentSetupStep }} + + - download: ${{ parameters.artifactPipelineResource }} + artifact: ${{ parameters.artifactName }} + patterns: | + *.zip + *.tgz + displayName: Download binary archive for ${{ parameters.platform }} + + # Extract the binary archive. 
+ # The archive contains a top-level directory like onnxruntime-&lt;platform&gt;-&lt;version&gt;/. + # After extraction, set ORT_PACKAGE_DIR to the extracted directory. + - ${{ if startsWith(parameters.platform, 'win') }}: + - task: PowerShell@2 + displayName: 'Extract binary archive' + inputs: + targetType: 'inline' + script: | + $artifactDir = "$(Pipeline.Workspace)/${{ parameters.artifactPipelineResource }}/${{ parameters.artifactName }}" + $archive = (Get-ChildItem -Path $artifactDir -Filter *.zip)[0].FullName + Write-Host "Extracting $archive" + Expand-Archive -Path $archive -DestinationPath $(Build.BinariesDirectory) + $extractedDir = (Get-ChildItem -Path $(Build.BinariesDirectory) -Directory | Where-Object { $_.Name -like "onnxruntime-*" })[0].FullName + Write-Host "Extracted to $extractedDir" + Write-Host "##vso[task.setvariable variable=ORT_PACKAGE_DIR]$extractedDir" + + - ${{ else }}: + - bash: | + set -ex + artifact_dir="$(Pipeline.Workspace)/${{ parameters.artifactPipelineResource }}/${{ parameters.artifactName }}" + archive=$(find "$artifact_dir" -name '*.tgz' | head -1) + echo "Extracting $archive" + tar -xzf "$archive" -C $(Build.BinariesDirectory) + extracted_dir=$(find $(Build.BinariesDirectory) -maxdepth 1 -type d -name 'onnxruntime-*' | head -1) + echo "Extracted to $extracted_dir" + + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. + set +x + echo "##vso[task.setvariable variable=ORT_PACKAGE_DIR]$extracted_dir" + displayName: 'Extract binary archive' + + # Build and run the C++ sample using the extracted ONNX Runtime package. 
+ + - script: > + cmake + -S $(Build.SourcesDirectory)/samples/cxx + -B $(Build.BinariesDirectory)/sample_build + -DORT_HEADER_DIR:PATH=$(ORT_PACKAGE_DIR)/include + -DORT_LIBRARY_DIR:PATH=$(ORT_PACKAGE_DIR)/lib + displayName: 'Generate C++ sample build system' + + - script: | + cmake --build $(Build.BinariesDirectory)/sample_build --config $(buildConfig) + displayName: 'Build C++ sample' + + - script: > + $(Build.BinariesDirectory)/sample_build/$(relativePathFromBuildToOutputDir)/onnxruntime_sample_program + $(Build.SourcesDirectory)/samples/cxx/add_model.onnx + displayName: 'Run C++ sample' + + # For win-arm64x, also build and run for ARM64EC. + - ${{ if eq(parameters.platform, 'win-arm64x') }}: + - script: > + cmake + -S $(Build.SourcesDirectory)/samples/cxx + -B $(Build.BinariesDirectory)/sample_build_arm64ec + -DORT_HEADER_DIR:PATH=$(ORT_PACKAGE_DIR)/include + -DORT_LIBRARY_DIR:PATH=$(ORT_PACKAGE_DIR)/lib + -A ARM64EC + displayName: 'Generate C++ sample build system (ARM64EC)' + + - script: | + cmake --build $(Build.BinariesDirectory)/sample_build_arm64ec --config $(buildConfig) + displayName: 'Build C++ sample (ARM64EC)' + + - script: > + $(Build.BinariesDirectory)/sample_build_arm64ec/$(relativePathFromBuildToOutputDir)/onnxruntime_sample_program + $(Build.SourcesDirectory)/samples/cxx/add_model.onnx + displayName: 'Run C++ sample (ARM64EC)' diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index cfb752ddc2b58..8a5584c111525 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -177,12 +177,40 @@ stages: - script: python -m pip install -r $(Build.SourcesDirectory)\tools\ci_build\github\windows\python\requirements.txt - - task: PythonScript@0 - displayName: 'Generate cmake config' - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--parallel 16 --use_vcpkg 
--use_vcpkg_ms_internal_asset_cache --config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --build --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }} $(timeoutParameter) $(buildJavaParameter)' - workingDirectory: '$(Build.BinariesDirectory)' + - template: set-variable.yml + parameters: + name: commonBuildPyArgs + value: >- + --config RelWithDebInfo + --parallel + --use_vcpkg + --use_vcpkg_ms_internal_asset_cache + --use_binskim_compliant_compile_flags + --enable_lto + --disable_rtti + --skip_submodule_sync + --build_shared_lib + --update --build + --cmake_generator "$(VSGenerator)" + --enable_onnx_tests + $(TelemetryOption) + ${{ parameters.buildparameter }} + $(timeoutParameter) + $(buildJavaParameter) + + - ${{ if eq(parameters.msbuildPlatform, 'arm64x') }}: + - template: build-win-arm64x-steps.yml + parameters: + buildDirectory: '$(Build.BinariesDirectory)' + additionalBuildPyArgs: '$(commonBuildPyArgs)' + + - ${{ else }}: + - task: PythonScript@0 + displayName: 'Generate build system and build' + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: '$(commonBuildPyArgs) --build_dir $(Build.BinariesDirectory)' + workingDirectory: '$(Build.BinariesDirectory)' # For CPU job, tests are run in the same machine as building - ${{ if eq(parameters.buildJava, 'true') }}: diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt index 42bee7a892b11..7e2b6e74cfdde 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt @@ -1,5 +1,5 @@ -numpy==2.2.6; python_version < "3.14" -numpy==2.3.2; 
python_version >= "3.14" +numpy==2.2.6; python_version < "3.11" +numpy==2.4.2; python_version >= "3.11" mypy pytest setuptools>=68.2.2 @@ -7,4 +7,4 @@ wheel protobuf==4.25.8 sympy==1.14 flatbuffers -onnx==1.20.1; python_version < "3.14" +onnx==1.20.1 diff --git a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt index c5fc16837e093..63a8e96d8c128 100644 --- a/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/lort/requirements.txt @@ -3,13 +3,13 @@ beartype==0.15.0 flatbuffers cerberus h5py -onnx==1.20.1; python_version < "3.14" +onnx==1.20.1 # Python dependencies required for pytorch development astunparse expecttest!=0.2.0 hypothesis -numpy==2.2.6; python_version < "3.14" -numpy==2.3.2; python_version >= "3.14" +numpy==2.2.6; python_version < "3.11" +numpy==2.4.2; python_version >= "3.11" psutil pyyaml requests diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index 8f5d0776501c0..ffcad5ee67208 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -1,5 +1,5 @@ -numpy==2.2.6; python_version < "3.14" -numpy==2.3.2; python_version >= "3.14" +numpy==2.2.6; python_version < "3.11" +numpy==2.4.2; python_version >= "3.11" mypy pytest setuptools>=68.2.2 @@ -8,6 +8,5 @@ protobuf==6.33.0 sympy==1.14 flatbuffers neural-compressor>=2.2.1 -triton==3.2.0; python_version < "3.14" -triton==3.5.0; python_version >= "3.14" -onnx==1.20.1; python_version < "3.14" +triton==3.5.0 +onnx==1.20.1 diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index 85a9c6391af80..ad57cc715589b 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt 
+++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -1,6 +1,6 @@ cerberus -numpy==2.2.6; python_version < "3.14" -numpy==2.3.2; python_version >= "3.14" +numpy==2.2.6; python_version < "3.11" +numpy==2.4.2; python_version >= "3.11" mypy pytest setuptools==78.1.1 @@ -10,6 +10,6 @@ sympy==1.14 flatbuffers protobuf==6.33.0 packaging -onnxscript==0.5.3; python_version < "3.14" -onnx-ir==0.1.10; python_version < "3.14" -onnx==1.20.1; python_version < "3.14" +onnxscript==0.6.2 +onnx-ir==0.1.16 +onnx==1.20.1 diff --git a/tools/ci_build/github/linux/python/requirements.txt b/tools/ci_build/github/linux/python/requirements.txt index 6a474973d4f0c..d95e44bb3a280 100644 --- a/tools/ci_build/github/linux/python/requirements.txt +++ b/tools/ci_build/github/linux/python/requirements.txt @@ -1,5 +1,5 @@ -numpy==2.2.6; python_version < "3.14" -numpy==2.3.2; python_version >= "3.14" +numpy==2.2.6; python_version < "3.11" +numpy==2.4.2; python_version >= "3.11" mypy pytest setuptools>=68.2.2 @@ -8,8 +8,8 @@ protobuf==6.33.0 sympy==1.14 flatbuffers psutil -onnxscript==0.5.3; python_version < "3.14" -onnx-ir==0.1.10; python_version < "3.14" +onnxscript==0.6.2 +onnx-ir==0.1.16 jinja2 markupsafe -onnx==1.20.1; python_version < "3.14" +onnx==1.20.1 diff --git a/tools/ci_build/github/windows/python/requirements.txt b/tools/ci_build/github/windows/python/requirements.txt index a86eef170bc25..83593ff47e453 100644 --- a/tools/ci_build/github/windows/python/requirements.txt +++ b/tools/ci_build/github/windows/python/requirements.txt @@ -1,5 +1,5 @@ -numpy==2.2.6; python_version < "3.14" -numpy==2.3.2; python_version >= "3.14" +numpy==2.2.6; python_version < "3.11" +numpy==2.4.2; python_version >= "3.11" mypy pytest setuptools>=68.2.2 @@ -8,10 +8,10 @@ protobuf==6.33.0 sympy==1.14 flatbuffers psutil -onnxscript==0.5.3; python_version < "3.14" -onnx-ir==0.1.10; python_version < "3.14" +onnxscript==0.6.2 +onnx-ir==0.1.16 jinja2 markupsafe semver packaging -onnx==1.20.1; 
python_version < "3.14" +onnx==1.20.1 diff --git a/tools/ci_build/requirements/transformers-test/requirements.txt b/tools/ci_build/requirements/transformers-test/requirements.txt index 1523b420bfdbd..c764225dbc98d 100644 --- a/tools/ci_build/requirements/transformers-test/requirements.txt +++ b/tools/ci_build/requirements/transformers-test/requirements.txt @@ -2,14 +2,14 @@ packaging # protobuf and numpy is same as tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt protobuf==6.33.0 -numpy==2.2.6; python_version < "3.14" -numpy==2.3.2; python_version >= "3.14" -torch==2.8.0 -torchvision==0.23.0 +numpy==2.2.6; python_version < "3.11" +numpy==2.4.2; python_version >= "3.11" +torch==2.10.0 +torchvision==0.25.0 transformers==4.52.1 parameterized>=0.8.1 sentencepiece psutil einops -onnxscript==0.5.3; python_version < "3.14" -onnx-ir==0.1.10; python_version < "3.14" +onnxscript==0.6.2 +onnx-ir==0.1.16