diff --git a/cmake/deps.txt b/cmake/deps.txt index e1870bf2df0cf..f8e5fb7f8ede0 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -47,7 +47,7 @@ protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/downlo psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013 pthreadpool;https://github.com/google/pthreadpool/archive/dcc9f28589066af0dbd4555579281230abbf74dd.zip;533a77943203ef15ca608bcd9dbe2c94da7451d2 pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f780292da9db273c8ef06ccf5fd4b623624143e9 -pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/877328f188a3c7d1fa855871a278eb48d530c4c0.zip;9152d4bf6b8bde9f19b116de3bd8a745097ed9df +pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/403d652dca4c1046e8145950b1c0997a9f748b57.zip;30b2a07fe4bae8574f89176e56274cacdd6d135b re2;https://github.com/google/re2/archive/refs/tags/2024-07-02.zip;646e1728269cde7fcef990bf4a8e87b047882e88 safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381 diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 1c73dcfb7f332..3c616684fb296 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -369,7 +369,9 @@ if (CPUINFO_SUPPORTED) PATCH_COMMAND ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/patch_cpuinfo_h_for_arm64ec.patch && # https://github.com/pytorch/cpuinfo/pull/324 - ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/patch_vcpkg_arm64ec_support.patch + ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/patch_vcpkg_arm64ec_support.patch && + # https://github.com/pytorch/cpuinfo/pull/348 + ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/win_arm_fp16_detection_fallback.patch FIND_PACKAGE_ARGS NAMES cpuinfo ) else() diff --git a/cmake/patches/cpuinfo/win_arm_fp16_detection_fallback.patch b/cmake/patches/cpuinfo/win_arm_fp16_detection_fallback.patch new file mode 100644 index 0000000000000..44ac0f13f5466 --- /dev/null +++ b/cmake/patches/cpuinfo/win_arm_fp16_detection_fallback.patch @@ -0,0 +1,19 @@ +diff --git a/src/arm/windows/init.c b/src/arm/windows/init.c +index 5c0a5f3..a07fbe4 100644 +--- a/src/arm/windows/init.c ++++ b/src/arm/windows/init.c +@@ -249,6 +249,14 @@ static void set_cpuinfo_isa_fields(void) { + // guarantee that, but it holds in practice. + cpuinfo_isa.rdm = dotprod; + ++ // PF_ARM_V82_FP16_INSTRUCTIONS_AVAILABLE may not be available in older ++ // Windows versions. If fp16arith was not detected with ++ // IsProcessorFeaturePresent(PF_ARM_V82_FP16_INSTRUCTIONS_AVAILABLE), fall ++ // back to using the value of dotprod. ++ if (!cpuinfo_isa.fp16arith) { ++ cpuinfo_isa.fp16arith = dotprod; ++ } ++ + /* Windows API reports all or nothing for cryptographic instructions. */ + const bool crypto = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != 0; + cpuinfo_isa.aes = crypto; diff --git a/cmake/vcpkg-ports/cpuinfo/portfile.cmake b/cmake/vcpkg-ports/cpuinfo/portfile.cmake index 80192840ee9b0..67bd18e61cc28 100644 --- a/cmake/vcpkg-ports/cpuinfo/portfile.cmake +++ b/cmake/vcpkg-ports/cpuinfo/portfile.cmake @@ -6,12 +6,13 @@ endif() vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO pytorch/cpuinfo - REF 877328f188a3c7d1fa855871a278eb48d530c4c0 - SHA512 b6d5a9ce9996eee3b2f09f39115f7ae178fe4d4814cc35b049a59d04a82228e268aa52d073c307ccb56a427428622940e1c77f004c99851dfca0d3a5d803658b + REF 403d652dca4c1046e8145950b1c0997a9f748b57 + SHA512 f7cd6dc44bd1120af610cae1337ed4c0f557ba78d2de9c73fed350fa3dfe9512643a1619ae55f5a540c6316a87d641856cca27297bb8766e48f39b7b7a59da1f HEAD_REF master PATCHES patch_cpuinfo_h_for_arm64ec.patch - patch_vcpkg_arm64ec_support.patch # https://github.com/pytorch/cpuinfo/pull/324 + patch_vcpkg_arm64ec_support.patch # https://github.com/pytorch/cpuinfo/pull/324 + win_arm_fp16_detection_fallback.patch # https://github.com/pytorch/cpuinfo/pull/348 ) vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS diff --git a/cmake/vcpkg-ports/cpuinfo/vcpkg.json b/cmake/vcpkg-ports/cpuinfo/vcpkg.json index f1ccda72679b1..76486eceecf12 100644 --- a/cmake/vcpkg-ports/cpuinfo/vcpkg.json +++ b/cmake/vcpkg-ports/cpuinfo/vcpkg.json @@ -1,7 +1,7 @@ { "name": "cpuinfo", - "version-date": "2025-10-23", - "port-version": 4, + "version-date": "2025-11-18", + "port-version": 5, "description": "CPU INFOrmation library (x86/x86-64/ARM/ARM64, Linux/Windows/Android/macOS/iOS)", "homepage": "https://github.com/pytorch/cpuinfo", "license": "BSD-2-Clause", diff --git a/cmake/vcpkg-ports/cpuinfo/win_arm_fp16_detection_fallback.patch b/cmake/vcpkg-ports/cpuinfo/win_arm_fp16_detection_fallback.patch new file mode 100644 index 0000000000000..44ac0f13f5466 --- /dev/null +++ b/cmake/vcpkg-ports/cpuinfo/win_arm_fp16_detection_fallback.patch @@ -0,0 +1,19 @@ +diff --git a/src/arm/windows/init.c b/src/arm/windows/init.c +index 5c0a5f3..a07fbe4 100644 +--- a/src/arm/windows/init.c ++++ b/src/arm/windows/init.c +@@ -249,6 +249,14 @@ static void set_cpuinfo_isa_fields(void) { + // guarantee that, but it holds in practice. + cpuinfo_isa.rdm = dotprod; + ++ // PF_ARM_V82_FP16_INSTRUCTIONS_AVAILABLE may not be available in older ++ // Windows versions. If fp16arith was not detected with ++ // IsProcessorFeaturePresent(PF_ARM_V82_FP16_INSTRUCTIONS_AVAILABLE), fall ++ // back to using the value of dotprod. ++ if (!cpuinfo_isa.fp16arith) { ++ cpuinfo_isa.fp16arith = dotprod; ++ } ++ + /* Windows API reports all or nothing for cryptographic instructions. */ + const bool crypto = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != 0; + cpuinfo_isa.aes = crypto; diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc index ab8ab0b326292..afea9f62419fa 100644 --- a/onnxruntime/core/common/cpuid_info.cc +++ b/onnxruntime/core/common/cpuid_info.cc @@ -237,9 +237,9 @@ void CPUIDInfo::ArmLinuxInit() { #elif defined(_WIN32) // ^ defined(__linux__) void CPUIDInfo::ArmWindowsInit() { - // Read MIDR and ID_AA64ISAR1_EL1 register values from Windows registry + // Read MIDR register values from Windows registry // There should be one per CPU - std::vector midr_values{}, id_aa64isar1_el1_values{}; + std::vector midr_values{}; // TODO!! Don't support multiple processor group yet!! constexpr int MAX_CORES = 64; @@ -272,17 +272,7 @@ void CPUIDInfo::ArmWindowsInit() { break; } - uint64_t id_aa64isar1_el1_value; - data_size = sizeof(id_aa64isar1_el1_value); - - // CP 4031 corresponds to ID_AA64ISAR1_EL1 register - if (::RegGetValueA(HKEY_LOCAL_MACHINE, processor_subkey, "CP 4031", RRF_RT_REG_QWORD, - nullptr, &id_aa64isar1_el1_value, &data_size) != ERROR_SUCCESS) { - break; - } - midr_values.push_back(midr_value); - id_aa64isar1_el1_values.push_back(id_aa64isar1_el1_value); } // process midr_values @@ -308,22 +298,15 @@ void CPUIDInfo::ArmWindowsInit() { } } - has_arm_neon_i8mm_ = std::all_of( - id_aa64isar1_el1_values.begin(), id_aa64isar1_el1_values.end(), - [](uint64_t id_aa64isar1_el1_value) { - // I8MM, bits [55:52] - return ((id_aa64isar1_el1_value >> 52) & 0xF) != 0; - }); - - has_arm_neon_dot_ = (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) != 0); - #if defined(CPUINFO_SUPPORTED) if (pytorch_cpuinfo_init_) { + has_arm_neon_dot_ = cpuinfo_has_arm_neon_dot(); has_fp16_ = cpuinfo_has_arm_neon_fp16_arith(); - // cpuinfo_has_arm_i8mm() doesn't work on Windows yet. See https://github.com/pytorch/cpuinfo/issues/279. - // has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm(); - has_arm_sve_i8mm_ = cpuinfo_has_arm_sve() && has_arm_neon_i8mm_; + has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm(); + has_arm_sve_i8mm_ = cpuinfo_has_arm_sve() && cpuinfo_has_arm_i8mm(); has_arm_neon_bf16_ = cpuinfo_has_arm_neon_bf16(); + has_arm_sme_ = cpuinfo_has_arm_sme(); + has_arm_sme2_ = cpuinfo_has_arm_sme2(); } #endif // defined(CPUINFO_SUPPORTED) } @@ -397,4 +380,4 @@ CPUIDInfo::CPUIDInfo() { #endif #endif // defined(CPUIDINFO_ARCH_ARM) } -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h index 9c40627b5cd1b..ca9315c7ef95d 100644 --- a/onnxruntime/core/common/cpuid_info.h +++ b/onnxruntime/core/common/cpuid_info.h @@ -171,4 +171,4 @@ class CPUIDInfo { uint32_t vendor_id_; }; -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp index a1c2e467188f7..4c675f104c52b 100644 --- a/onnxruntime/core/mlas/lib/qgemm.cpp +++ b/onnxruntime/core/mlas/lib/qgemm.cpp @@ -407,7 +407,7 @@ Return Value: ~(BufferAlignment - 1); // If this gemm B argument is used in a dynamically quantization gemm operation we can optimize for // this use case. Concat both packed representations for later decision. This allows for cases later - // where we still have the prepack at the cost of some memory otherwise we can use the qgemm quantization + // where we still have the prepack at the cost of some memory otherwise we can use the qgemm quantization // for better performance return AlignedBytesRequired + MlasDynamicQgemmPackBSize(N, K); }