From ab320744e6dd0b78a37b5be0bb1ee8a9726a9871 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 18 Nov 2025 13:10:50 -0800 Subject: [PATCH 1/9] Upgrade cpuinfo version. Update ARM64 Windows feature detection to use cpuinfo functions which are now implemented. --- cmake/deps.txt | 2 +- cmake/vcpkg-ports/cpuinfo/portfile.cmake | 4 +-- onnxruntime/core/common/cpuid_info.cc | 33 ++++++------------------ onnxruntime/core/common/cpuid_info.h | 2 +- 4 files changed, 12 insertions(+), 29 deletions(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index e1870bf2df0cf..69da93ff233f6 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -47,7 +47,7 @@ protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/downlo psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013 pthreadpool;https://github.com/google/pthreadpool/archive/dcc9f28589066af0dbd4555579281230abbf74dd.zip;533a77943203ef15ca608bcd9dbe2c94da7451d2 pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f780292da9db273c8ef06ccf5fd4b623624143e9 -pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/877328f188a3c7d1fa855871a278eb48d530c4c0.zip;9152d4bf6b8bde9f19b116de3bd8a745097ed9df +pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/f858c30bcb16f8effd5ff46996f0514539e17abc.zip;66a964eda7de60c925e2e26f71f9bbe31698997b re2;https://github.com/google/re2/archive/refs/tags/2024-07-02.zip;646e1728269cde7fcef990bf4a8e87b047882e88 safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381 diff --git a/cmake/vcpkg-ports/cpuinfo/portfile.cmake b/cmake/vcpkg-ports/cpuinfo/portfile.cmake index 80192840ee9b0..ff2bfc2f137cf 100644 --- a/cmake/vcpkg-ports/cpuinfo/portfile.cmake +++ b/cmake/vcpkg-ports/cpuinfo/portfile.cmake @@ -6,8 +6,8 @@ endif() vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO pytorch/cpuinfo - REF 877328f188a3c7d1fa855871a278eb48d530c4c0 - SHA512 b6d5a9ce9996eee3b2f09f39115f7ae178fe4d4814cc35b049a59d04a82228e268aa52d073c307ccb56a427428622940e1c77f004c99851dfca0d3a5d803658b + REF f858c30bcb16f8effd5ff46996f0514539e17abc + SHA512 cd7c0c1ea59fac69f2746f65f59656798eeb87410c304ac9d3b3d26ebea4f4124d1426c10fb4b87ff5f93f367ea10d63337f519ee3c3f8fefbb4b7ebf6438130 HEAD_REF master PATCHES patch_cpuinfo_h_for_arm64ec.patch diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc index ab8ab0b326292..afea9f62419fa 100644 --- a/onnxruntime/core/common/cpuid_info.cc +++ b/onnxruntime/core/common/cpuid_info.cc @@ -237,9 +237,9 @@ void CPUIDInfo::ArmLinuxInit() { #elif defined(_WIN32) // ^ defined(__linux__) void CPUIDInfo::ArmWindowsInit() { - // Read MIDR and ID_AA64ISAR1_EL1 register values from Windows registry + // Read MIDR register values from Windows registry // There should be one per CPU - std::vector midr_values{}, id_aa64isar1_el1_values{}; + std::vector midr_values{}; // TODO!! Don't support multiple processor group yet!! constexpr int MAX_CORES = 64; @@ -272,17 +272,7 @@ void CPUIDInfo::ArmWindowsInit() { break; } - uint64_t id_aa64isar1_el1_value; - data_size = sizeof(id_aa64isar1_el1_value); - - // CP 4031 corresponds to ID_AA64ISAR1_EL1 register - if (::RegGetValueA(HKEY_LOCAL_MACHINE, processor_subkey, "CP 4031", RRF_RT_REG_QWORD, - nullptr, &id_aa64isar1_el1_value, &data_size) != ERROR_SUCCESS) { - break; - } - midr_values.push_back(midr_value); - id_aa64isar1_el1_values.push_back(id_aa64isar1_el1_value); } // process midr_values @@ -308,22 +298,15 @@ void CPUIDInfo::ArmWindowsInit() { } } - has_arm_neon_i8mm_ = std::all_of( - id_aa64isar1_el1_values.begin(), id_aa64isar1_el1_values.end(), - [](uint64_t id_aa64isar1_el1_value) { - // I8MM, bits [55:52] - return ((id_aa64isar1_el1_value >> 52) & 0xF) != 0; - }); - - has_arm_neon_dot_ = (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) != 0); - #if defined(CPUINFO_SUPPORTED) if (pytorch_cpuinfo_init_) { + has_arm_neon_dot_ = cpuinfo_has_arm_neon_dot(); has_fp16_ = cpuinfo_has_arm_neon_fp16_arith(); - // cpuinfo_has_arm_i8mm() doesn't work on Windows yet. See https://github.com/pytorch/cpuinfo/issues/279. - // has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm(); - has_arm_sve_i8mm_ = cpuinfo_has_arm_sve() && has_arm_neon_i8mm_; + has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm(); + has_arm_sve_i8mm_ = cpuinfo_has_arm_sve() && cpuinfo_has_arm_i8mm(); has_arm_neon_bf16_ = cpuinfo_has_arm_neon_bf16(); + has_arm_sme_ = cpuinfo_has_arm_sme(); + has_arm_sme2_ = cpuinfo_has_arm_sme2(); } #endif // defined(CPUINFO_SUPPORTED) } @@ -397,4 +380,4 @@ CPUIDInfo::CPUIDInfo() { #endif #endif // defined(CPUIDINFO_ARCH_ARM) } -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h index 9c40627b5cd1b..ca9315c7ef95d 100644 --- a/onnxruntime/core/common/cpuid_info.h +++ b/onnxruntime/core/common/cpuid_info.h @@ -171,4 +171,4 @@ class CPUIDInfo { uint32_t vendor_id_; }; -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime From 6a9cfc750221cec52f98fa55a0a1ed727214f0b8 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 18 Nov 2025 16:20:23 -0800 Subject: [PATCH 2/9] add MlasIsDynamicQGemmAvailable() function, use that instead of checking CPUIDInfo::HasArm_SME2(). --- .../quantization/dynamic_quantize_matmul.cc | 5 +---- onnxruntime/core/mlas/inc/mlas.h | 10 ++++++++++ onnxruntime/core/mlas/lib/qgemm.cpp | 19 +++++++++++++++---- .../test/mlas/unittest/test_dynamic_qgemm.cpp | 7 +++---- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc index ceb498372a6fc..2bba0adcd987c 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/cpuid_info.h" // for CPUIDInfo::GetCPUIDInfo().HasArm_SME2() #include "core/common/narrow.h" #include "core/common/safeint.h" #include "core/mlas/inc/mlas.h" @@ -213,9 +212,7 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase { } } - // Currently, MlasDynamicQGemmBatch() and associated functions require SME2 or else they are no-ops. - // We check that here too before attempting to use them. - if (!CPUIDInfo::GetCPUIDInfo().HasArm_SME2()) { + if (!MlasIsDynamicQGemmAvailable()) { can_use_dynamic_quant_mlas_ = false; } diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 9d98a15d8457a..248c6d74e6cbd 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -634,6 +634,7 @@ MlasGemm( { MlasGemmBatch(Shape, &DataParams, 1, ThreadPool); } + /** * @brief Parameters that define the shape of a dynamically quantized GEMM operation. * @@ -646,6 +647,7 @@ struct MLAS_GEMM_DYN_QUANT_SHAPE_PARAMS { size_t N = 0; /**< Column size of matrix B */ size_t K = 0; /**< Column size of matrix A and Row size of matrix B */ }; + /** * @brief Parameters that define the data buffers and layout for a dynamic quant GEMM. * @@ -680,6 +682,14 @@ MlasDynamicQGemm ( MlasDynamicQGemmBatch(Shape, DataParams, 1, ThreadPool); } +/** + * @brief Determines whether a dynamic quantized GEMM implementation is available on the current platform. + * + * MlasDynamicQGemm() and MlasDynamicQGemmBatch() should only be called if this function returns true. + */ +bool +MLASCALL +MlasIsDynamicQGemmAvailable(); // // Symmetric QGEMM has limited buffer overrun. diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp index a1c2e467188f7..9d9f741b89617 100644 --- a/onnxruntime/core/mlas/lib/qgemm.cpp +++ b/onnxruntime/core/mlas/lib/qgemm.cpp @@ -201,6 +201,17 @@ MlasGemmBatch( }); } +bool +MLASCALL +MlasIsDynamicQGemmAvailable() +{ +#if defined(USE_KLEIDIAI) && !defined(_MSC_VER) + return ArmKleidiAI::UseSME2; +#else + return false; +#endif +} + void MLASCALL MlasDynamicQGemmBatch ( @@ -211,7 +222,7 @@ MlasDynamicQGemmBatch ( ) { #if defined(USE_KLEIDIAI) && !defined(_MSC_VER) //No fallback and putting in guards. This implementation is SME2 specific. - if(ArmKleidiAI::UseSME2){ + if (ArmKleidiAI::UseSME2) { ArmKleidiAI::MlasDynamicQGemmBatch(Shape, DataParams, BatchN, ThreadPool); } #endif @@ -336,7 +347,7 @@ MlasDynamicQgemmPackBSize( #if defined(USE_KLEIDIAI) && !defined(_MSC_VER) //No fallback available //TODO: Insert Override - if(MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()){//Still require this since no override + if (ArmKleidiAI::UseSME2) { //Still require this since no override bytes = ArmKleidiAI::MlasDynamicQgemmPackBSize(N, K); } #endif @@ -407,7 +418,7 @@ Return Value: ~(BufferAlignment - 1); // If this gemm B argument is used in a dynamically quantization gemm operation we can optimize for // this use case. Concat both packed representations for later decision. This allows for cases later - // where we still have the prepack at the cost of some memory otherwise we can use the qgemm quantization + // where we still have the prepack at the cost of some memory otherwise we can use the qgemm quantization // for better performance return AlignedBytesRequired + MlasDynamicQgemmPackBSize(N, K); } @@ -425,7 +436,7 @@ MlasDynamicQgemmPackB( { #if defined(USE_KLEIDIAI) && !defined(_MSC_VER) //No fallback - if(MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()){//Still require this since no override + if (ArmKleidiAI::UseSME2) { //Still require this since no override ArmKleidiAI::MlasDynamicQgemmPackB(N, K, B, Scales, Bias, PackedB); } #endif diff --git a/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp b/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp index bebff37ad8460..8a9de636c835f 100644 --- a/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp @@ -7,8 +7,8 @@ // Currently this test only applies to KleidiAI Guard against it running in any other situation #if defined(USE_KLEIDIAI) && !defined(_MSC_VER) +#include "mlas.h" #include "test_util.h" -#include "core/mlas/lib/mlasi.h" // for MLAS_CPUIDINFO class MlasDynamicQgemmTest { private: @@ -20,9 +20,8 @@ class MlasDynamicQgemmTest { public: void Test(size_t M, size_t N, size_t K, size_t BatchSize) { - // Currently, MlasDynamicQGemmBatch() and associated functions require SME2 or else they are no-ops. - if (!MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2()) { - GTEST_SKIP() << "MlasDynamicQGemmBatch() requires ARM64 SME2 but it was not detected. Skipping test."; + if (!MlasIsDynamicQGemmAvailable()) { + GTEST_SKIP() << "MlasDynamicQGemmBatch() is not supported on this platform. Skipping test."; } // Setup buffers for holding various data From a18cdcac9cbbd6182e8328a97aafaf02dba1cf6f Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 18 Nov 2025 16:58:34 -0800 Subject: [PATCH 3/9] bump cpuinfo to 403d652dca4c1046e8145950b1c0997a9f748b57 to get asan issue fix --- cmake/deps.txt | 2 +- cmake/vcpkg-ports/cpuinfo/portfile.cmake | 4 ++-- cmake/vcpkg-ports/cpuinfo/vcpkg.json | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index 69da93ff233f6..f8e5fb7f8ede0 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -47,7 +47,7 @@ protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/downlo psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013 pthreadpool;https://github.com/google/pthreadpool/archive/dcc9f28589066af0dbd4555579281230abbf74dd.zip;533a77943203ef15ca608bcd9dbe2c94da7451d2 pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f780292da9db273c8ef06ccf5fd4b623624143e9 -pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/f858c30bcb16f8effd5ff46996f0514539e17abc.zip;66a964eda7de60c925e2e26f71f9bbe31698997b +pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/403d652dca4c1046e8145950b1c0997a9f748b57.zip;30b2a07fe4bae8574f89176e56274cacdd6d135b re2;https://github.com/google/re2/archive/refs/tags/2024-07-02.zip;646e1728269cde7fcef990bf4a8e87b047882e88 safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381 diff --git a/cmake/vcpkg-ports/cpuinfo/portfile.cmake b/cmake/vcpkg-ports/cpuinfo/portfile.cmake index ff2bfc2f137cf..5aa5aebe50f3d 100644 --- a/cmake/vcpkg-ports/cpuinfo/portfile.cmake +++ b/cmake/vcpkg-ports/cpuinfo/portfile.cmake @@ -6,8 +6,8 @@ endif() vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO pytorch/cpuinfo - REF f858c30bcb16f8effd5ff46996f0514539e17abc - SHA512 cd7c0c1ea59fac69f2746f65f59656798eeb87410c304ac9d3b3d26ebea4f4124d1426c10fb4b87ff5f93f367ea10d63337f519ee3c3f8fefbb4b7ebf6438130 + REF 403d652dca4c1046e8145950b1c0997a9f748b57 + SHA512 f7cd6dc44bd1120af610cae1337ed4c0f557ba78d2de9c73fed350fa3dfe9512643a1619ae55f5a540c6316a87d641856cca27297bb8766e48f39b7b7a59da1f HEAD_REF master PATCHES patch_cpuinfo_h_for_arm64ec.patch diff --git a/cmake/vcpkg-ports/cpuinfo/vcpkg.json b/cmake/vcpkg-ports/cpuinfo/vcpkg.json index f1ccda72679b1..76486eceecf12 100644 --- a/cmake/vcpkg-ports/cpuinfo/vcpkg.json +++ b/cmake/vcpkg-ports/cpuinfo/vcpkg.json @@ -1,7 +1,7 @@ { "name": "cpuinfo", - "version-date": "2025-10-23", - "port-version": 4, + "version-date": "2025-11-18", + "port-version": 5, "description": "CPU INFOrmation library (x86/x86-64/ARM/ARM64, Linux/Windows/Android/macOS/iOS)", "homepage": "https://github.com/pytorch/cpuinfo", "license": "BSD-2-Clause", From e2ddd856b5319eb3caf5d6556534ebdc3ca6e1e0 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 18 Nov 2025 19:32:49 -0800 Subject: [PATCH 4/9] fix indent --- onnxruntime/core/mlas/lib/qgemm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp index 9d9f741b89617..b3eb5893ba7e7 100644 --- a/onnxruntime/core/mlas/lib/qgemm.cpp +++ b/onnxruntime/core/mlas/lib/qgemm.cpp @@ -206,9 +206,9 @@ MLASCALL MlasIsDynamicQGemmAvailable() { #if defined(USE_KLEIDIAI) && !defined(_MSC_VER) - return ArmKleidiAI::UseSME2; + return ArmKleidiAI::UseSME2; #else - return false; + return false; #endif } From cb7bbd6b0811b8c8138994f124018feca34067c1 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Wed, 19 Nov 2025 15:42:50 -0800 Subject: [PATCH 5/9] debug - print has_fp16_ value --- onnxruntime/core/common/cpuid_info.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc index afea9f62419fa..42f35697e629f 100644 --- a/onnxruntime/core/common/cpuid_info.cc +++ b/onnxruntime/core/common/cpuid_info.cc @@ -309,6 +309,8 @@ void CPUIDInfo::ArmWindowsInit() { has_arm_sme2_ = cpuinfo_has_arm_sme2(); } #endif // defined(CPUINFO_SUPPORTED) + + LogEarlyWarning(std::string{"CPUIDInfo::has_fp16_ = "} + (has_fp16_ ? "true" : "false")); } #elif defined(__APPLE__) // ^ defined(_WIN32) From 9b186b21f96e47346db74a0c69d91ae1ca9200c5 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Thu, 20 Nov 2025 09:41:31 -0800 Subject: [PATCH 6/9] add note about has_fp16_ workaround, remove debug print --- onnxruntime/core/common/cpuid_info.cc | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc index 42f35697e629f..f5d3b93b395e2 100644 --- a/onnxruntime/core/common/cpuid_info.cc +++ b/onnxruntime/core/common/cpuid_info.cc @@ -302,6 +302,22 @@ void CPUIDInfo::ArmWindowsInit() { if (pytorch_cpuinfo_init_) { has_arm_neon_dot_ = cpuinfo_has_arm_neon_dot(); has_fp16_ = cpuinfo_has_arm_neon_fp16_arith(); + + // Note: + // cpuinfo is using IsProcessorFeaturePresent(PF_ARM_V82_FP16_INSTRUCTIONS_AVAILABLE): + // https://github.com/pytorch/cpuinfo/blob/403d652dca4c1046e8145950b1c0997a9f748b57/src/arm/windows/init.c#L224-L225 + // However, on some systems (notably, a Windows ARM64 CI build agent), cpuinfo_has_arm_neon_fp16_arith() started to + // return false in the newer cpuinfo version that uses IsProcessorFeaturePresent(). Perhaps the newer + // PF_ARM_V82_FP16_INSTRUCTIONS_AVAILABLE constant is not supported yet in the Windows version on those systems. + // To avoid regressing in fp16 instructions detection, we fall back to what cpuinfo used to do, i.e., use the + // detection of dot product instructions: + // https://github.com/pytorch/cpuinfo/blob/877328f188a3c7d1fa855871a278eb48d530c4c0/src/arm/windows/init.c#L206-L209 + // This workaround can be removed when cpuinfo_has_arm_neon_fp16_arith() works correctly on all the Windows + // versions that we want to support. + if (!has_fp16_) { + has_fp16_ = has_arm_neon_dot_; + } + has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm(); has_arm_sve_i8mm_ = cpuinfo_has_arm_sve() && cpuinfo_has_arm_i8mm(); has_arm_neon_bf16_ = cpuinfo_has_arm_neon_bf16(); @@ -309,8 +325,6 @@ void CPUIDInfo::ArmWindowsInit() { has_arm_sme2_ = cpuinfo_has_arm_sme2(); } #endif // defined(CPUINFO_SUPPORTED) - - LogEarlyWarning(std::string{"CPUIDInfo::has_fp16_ = "} + (has_fp16_ ? "true" : "false")); } #elif defined(__APPLE__) // ^ defined(_WIN32) From b124ff432eee8eac19c4ac6ceeb3eada97579794 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Wed, 26 Nov 2025 12:12:42 -0800 Subject: [PATCH 7/9] Revert "add MlasIsDynamicQGemmAvailable() function, use that instead of checking CPUIDInfo::HasArm_SME2()." This reverts commit 6a9cfc750221cec52f98fa55a0a1ed727214f0b8. --- .../cpu/quantization/dynamic_quantize_matmul.cc | 5 ++++- onnxruntime/core/mlas/inc/mlas.h | 10 ---------- onnxruntime/core/mlas/lib/qgemm.cpp | 17 +++-------------- .../test/mlas/unittest/test_dynamic_qgemm.cpp | 7 ++++--- 4 files changed, 11 insertions(+), 28 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc index 2bba0adcd987c..ceb498372a6fc 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "core/common/cpuid_info.h" // for CPUIDInfo::GetCPUIDInfo().HasArm_SME2() #include "core/common/narrow.h" #include "core/common/safeint.h" #include "core/mlas/inc/mlas.h" @@ -212,7 +213,9 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase { } } - if (!MlasIsDynamicQGemmAvailable()) { + // Currently, MlasDynamicQGemmBatch() and associated functions require SME2 or else they are no-ops. + // We check that here too before attempting to use them. + if (!CPUIDInfo::GetCPUIDInfo().HasArm_SME2()) { can_use_dynamic_quant_mlas_ = false; } diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 248c6d74e6cbd..9d98a15d8457a 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -634,7 +634,6 @@ MlasGemm( { MlasGemmBatch(Shape, &DataParams, 1, ThreadPool); } - /** * @brief Parameters that define the shape of a dynamically quantized GEMM operation. * @@ -647,7 +646,6 @@ struct MLAS_GEMM_DYN_QUANT_SHAPE_PARAMS { size_t N = 0; /**< Column size of matrix B */ size_t K = 0; /**< Column size of matrix A and Row size of matrix B */ }; - /** * @brief Parameters that define the data buffers and layout for a dynamic quant GEMM. * @@ -682,14 +680,6 @@ MlasDynamicQGemm ( MlasDynamicQGemmBatch(Shape, DataParams, 1, ThreadPool); } -/** - * @brief Determines whether a dynamic quantized GEMM implementation is available on the current platform. - * - * MlasDynamicQGemm() and MlasDynamicQGemmBatch() should only be called if this function returns true. - */ -bool -MLASCALL -MlasIsDynamicQGemmAvailable(); // // Symmetric QGEMM has limited buffer overrun. diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp index b3eb5893ba7e7..4c675f104c52b 100644 --- a/onnxruntime/core/mlas/lib/qgemm.cpp +++ b/onnxruntime/core/mlas/lib/qgemm.cpp @@ -201,17 +201,6 @@ MlasGemmBatch( }); } -bool -MLASCALL -MlasIsDynamicQGemmAvailable() -{ -#if defined(USE_KLEIDIAI) && !defined(_MSC_VER) - return ArmKleidiAI::UseSME2; -#else - return false; -#endif -} - void MLASCALL MlasDynamicQGemmBatch ( @@ -222,7 +211,7 @@ MlasDynamicQGemmBatch ( ) { #if defined(USE_KLEIDIAI) && !defined(_MSC_VER) //No fallback and putting in guards. This implementation is SME2 specific. - if (ArmKleidiAI::UseSME2) { + if(ArmKleidiAI::UseSME2){ ArmKleidiAI::MlasDynamicQGemmBatch(Shape, DataParams, BatchN, ThreadPool); } #endif @@ -347,7 +336,7 @@ MlasDynamicQgemmPackBSize( #if defined(USE_KLEIDIAI) && !defined(_MSC_VER) //No fallback available //TODO: Insert Override - if (ArmKleidiAI::UseSME2) { //Still require this since no override + if(MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()){//Still require this since no override bytes = ArmKleidiAI::MlasDynamicQgemmPackBSize(N, K); } #endif @@ -436,7 +425,7 @@ MlasDynamicQgemmPackB( { #if defined(USE_KLEIDIAI) && !defined(_MSC_VER) //No fallback - if (ArmKleidiAI::UseSME2) { //Still require this since no override + if(MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()){//Still require this since no override ArmKleidiAI::MlasDynamicQgemmPackB(N, K, B, Scales, Bias, PackedB); } #endif diff --git a/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp b/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp index 8a9de636c835f..bebff37ad8460 100644 --- a/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp @@ -7,8 +7,8 @@ // Currently this test only applies to KleidiAI Guard against it running in any other situation #if defined(USE_KLEIDIAI) && !defined(_MSC_VER) -#include "mlas.h" #include "test_util.h" +#include "core/mlas/lib/mlasi.h" // for MLAS_CPUIDINFO class MlasDynamicQgemmTest { private: @@ -20,8 +20,9 @@ class MlasDynamicQgemmTest { public: void Test(size_t M, size_t N, size_t K, size_t BatchSize) { - if (!MlasIsDynamicQGemmAvailable()) { - GTEST_SKIP() << "MlasDynamicQGemmBatch() is not supported on this platform. Skipping test."; + // Currently, MlasDynamicQGemmBatch() and associated functions require SME2 or else they are no-ops. + if (!MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2()) { + GTEST_SKIP() << "MlasDynamicQGemmBatch() requires ARM64 SME2 but it was not detected. Skipping test."; } // Setup buffers for holding various data From 5d8b20a4c90c8102fec9a83b5b9780e01a11f7f5 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Wed, 26 Nov 2025 14:04:18 -0800 Subject: [PATCH 8/9] patch cpuinfo with fp16 detection fallback --- cmake/external/onnxruntime_external_deps.cmake | 4 +++- .../win_arm_fp16_detection_fallback.patch | Bin 0 -> 1666 bytes cmake/vcpkg-ports/cpuinfo/portfile.cmake | 3 ++- .../win_arm_fp16_detection_fallback.patch | Bin 0 -> 1666 bytes onnxruntime/core/common/cpuid_info.cc | 16 ---------------- 5 files changed, 5 insertions(+), 18 deletions(-) create mode 100644 cmake/patches/cpuinfo/win_arm_fp16_detection_fallback.patch create mode 100644 cmake/vcpkg-ports/cpuinfo/win_arm_fp16_detection_fallback.patch diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 1c73dcfb7f332..3c616684fb296 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -369,7 +369,9 @@ if (CPUINFO_SUPPORTED) PATCH_COMMAND ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/patch_cpuinfo_h_for_arm64ec.patch && # https://github.com/pytorch/cpuinfo/pull/324 - ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/patch_vcpkg_arm64ec_support.patch + ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/patch_vcpkg_arm64ec_support.patch && + # https://github.com/pytorch/cpuinfo/pull/348 + ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/win_arm_fp16_detection_fallback.patch FIND_PACKAGE_ARGS NAMES cpuinfo ) else() diff --git a/cmake/patches/cpuinfo/win_arm_fp16_detection_fallback.patch b/cmake/patches/cpuinfo/win_arm_fp16_detection_fallback.patch new file mode 100644 index 0000000000000000000000000000000000000000..d731edd28acca321a411f27077a9e38ef87676d2 GIT binary patch literal 1666 zcmb`HTTc@~6vxl2CVq$EO)kBlfEr^08c8%|MYBReLiOmz5!eXomtCDn_14Du^+Rap_^FG3aAoZoUEHzIqTlj z0Tn2`Kc@rBEgIDK(F<7-L)7 zbF2MU@w^C?J9`4!E%*Wc>Q6>5DttV$%7)?84d^a<^)q&L$SEiu7=au6dEj8B# xpU%A*IoA0&Q%|MYBReLiOmz5!eXomtCDn_14Du^+Rap_^FG3aAoZoUEHzIqTlj z0Tn2`Kc@rBEgIDK(F<7-L)7 zbF2MU@w^C?J9`4!E%*Wc>Q6>5DttV$%7)?84d^a<^)q&L$SEiu7=au6dEj8B# xpU%A*IoA0&Q Date: Wed, 26 Nov 2025 14:19:15 -0800 Subject: [PATCH 9/9] fix patch files --- .../win_arm_fp16_detection_fallback.patch | Bin 1666 -> 813 bytes .../win_arm_fp16_detection_fallback.patch | Bin 1666 -> 813 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/cmake/patches/cpuinfo/win_arm_fp16_detection_fallback.patch b/cmake/patches/cpuinfo/win_arm_fp16_detection_fallback.patch index d731edd28acca321a411f27077a9e38ef87676d2..44ac0f13f5466e59b39c5a576111e58f6cf323ea 100644 GIT binary patch literal 813 zcma)3T~FIE6n$=f#o?_bP13T`Z9+(7Xq6%{h)Tz_r|2fRZmgwoWIOH9#DB-31C@tL zJlXO+KKFc-QV0aWT3S$KbPL_67(YG)$ZNE;T&j|@Bq60$+7sc?iGI`{!ssLKs5Q7YSNRbF`T{GKgc zWbyJx_VDE_Tcp!@mfYU2mOrmo$=&UJ7C*$vP5kZVJ2v!)Mp@(x+LEj(ueo35ma^V6 z{o03%Eo&oHV?rbXqMgo3OIu+_=7phT%Z23y$GUwV e$R@j9{!UkSN2(j)%|MYBReLiOmz5!eXomtCDn_14Du^+Rap_^FG3aAoZoUEHzIqTlj z0Tn2`Kc@rBEgIDK(F<7-L)7 zbF2MU@w^C?J9`4!E%*Wc>Q6>5DttV$%7)?84d^a<^)q&L$SEiu7=au6dEj8B# xpU%A*IoA0&QL_67(YG)$ZNE;T&j|@Bq60$+7sc?iGI`{!ssLKs5Q7YSNRbF`T{GKgc zWbyJx_VDE_Tcp!@mfYU2mOrmo$=&UJ7C*$vP5kZVJ2v!)Mp@(x+LEj(ueo35ma^V6 z{o03%Eo&oHV?rbXqMgo3OIu+_=7phT%Z23y$GUwV e$R@j9{!UkSN2(j)%|MYBReLiOmz5!eXomtCDn_14Du^+Rap_^FG3aAoZoUEHzIqTlj z0Tn2`Kc@rBEgIDK(F<7-L)7 zbF2MU@w^C?J9`4!E%*Wc>Q6>5DttV$%7)?84d^a<^)q&L$SEiu7=au6dEj8B# xpU%A*IoA0&Q