From c9f71907bbaa89318ffba362301e419ca009d868 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 23 Mar 2026 18:04:48 -0700
Subject: [PATCH 01/13] add GetTotalPhysicalMemoryBytes helper

---
 onnxruntime/test/util/include/system_info.h | 16 ++++++++++
 onnxruntime/test/util/system_info.cc        | 33 +++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 onnxruntime/test/util/include/system_info.h
 create mode 100644 onnxruntime/test/util/system_info.cc

diff --git a/onnxruntime/test/util/include/system_info.h b/onnxruntime/test/util/include/system_info.h
new file mode 100644
index 0000000000000..7f0e925c6ccbc
--- /dev/null
+++ b/onnxruntime/test/util/include/system_info.h
@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cstdint>
+#include <optional>
+
+namespace onnxruntime {
+namespace test {
+
+// Returns the total physical memory (RAM) in bytes, or std::nullopt if detection fails.
+std::optional<uint64_t> GetTotalPhysicalMemoryBytes();
+
+}  // namespace test
+}  // namespace onnxruntime
diff --git a/onnxruntime/test/util/system_info.cc b/onnxruntime/test/util/system_info.cc
new file mode 100644
index 0000000000000..653be56cd98c2
--- /dev/null
+++ b/onnxruntime/test/util/system_info.cc
@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "test/util/include/system_info.h"
+
+#ifdef _WIN32
+#include <Windows.h>
+#else
+#include <unistd.h>
+#endif
+
+namespace onnxruntime {
+namespace test {
+
+std::optional<uint64_t> GetTotalPhysicalMemoryBytes() {
+#ifdef _WIN32
+  MEMORYSTATUSEX mem_info = {};
+  mem_info.dwLength = sizeof(mem_info);
+  if (GlobalMemoryStatusEx(&mem_info)) {
+    return static_cast<uint64_t>(mem_info.ullTotalPhys);
+  }
+#else
+  long pages = sysconf(_SC_PHYS_PAGES);
+  long page_size = sysconf(_SC_PAGESIZE);
+  if (pages > 0 && page_size > 0) {
+    return static_cast<uint64_t>(pages) * static_cast<uint64_t>(page_size);
+  }
+#endif
+  return std::nullopt;
+}
+
+}  // namespace test
+}  // namespace onnxruntime

From b72abd5e574fccde132256106230c4006c61d6f0 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 23 Mar 2026 18:10:20 -0700
Subject: [PATCH 02/13] add test

---
 .../providers/cpu/llm/attention_op_test.cc | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
index 45c5220f42564..6d83c73acfe85 100644
--- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
+++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
@@ -2,11 +2,13 @@
 // Licensed under the MIT License.
 
 #include
+#include
 #include "gtest/gtest.h"
 #include "core/session/onnxruntime_cxx_api.h"
 #include "test/common/tensor_op_test_utils.h"
 #include "test/common/cuda_op_test_utils.h"
 #include "test/providers/provider_test_utils.h"
+#include "test/util/include/system_info.h"
 
 namespace onnxruntime {
 namespace test {
@@ -2405,5 +2407,63 @@ TEST(AttentionTest, Attention_NonPadKVSeqLen_WithFloatAttnMask_MultiBatch) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 
+// Regression test for integer overflow in FP16 softmax allocation.
+// ComputeAttentionSoftmaxInplace previously used int for N and D.
+// For large enough values of N and D, N * D * sizeof(float) could overflow int32.
+TEST(AttentionTest, Attention_FP16_SoftmaxLargeDimensions) {
+  // Skip if the machine has less than 16GB of physical RAM.
+  constexpr uint64_t required_ram_bytes = 16ULL * 1024 * 1024 * 1024;
+  if (const auto total_ram_bytes = GetTotalPhysicalMemoryBytes();
+      total_ram_bytes.has_value() && *total_ram_bytes < required_ram_bytes) {
+    GTEST_SKIP() << "Skipping: test requires >= 16GB RAM, machine has "
+                 << (*total_ram_bytes / (1024 * 1024)) << "MB";
+  }
+
+  constexpr int batch_size = 1;
+  constexpr int num_heads = 1;
+  constexpr int q_sequence_length = 46341;
+  constexpr int kv_sequence_length = 46341;
+  constexpr int head_size = 1;
+
+  // Verify at compile time that these dimensions trigger the overflow scenario.
+  static_assert(static_cast<int64_t>(q_sequence_length) * kv_sequence_length >
+                    static_cast<int64_t>(std::numeric_limits<int32_t>::max()),
+                "Test dimensions must cause int32 overflow in N*D");
+
+  OpTester test("Attention", 23, onnxruntime::kOnnxDomain);
+
+  // 4D BNSH inputs
+  std::vector<int64_t> q_shape = {batch_size, num_heads, q_sequence_length, head_size};
+  std::vector<int64_t> k_shape = {batch_size, num_heads, kv_sequence_length, head_size};
+  std::vector<int64_t> v_shape = {batch_size, num_heads, kv_sequence_length, head_size};
+
+  constexpr int q_elements = batch_size * num_heads * q_sequence_length * head_size;
+  constexpr int kv_elements = batch_size * num_heads * kv_sequence_length * head_size;
+
+  // All-zero Q and K → all attention scores are 0, softmax produces uniform 1/kv_seq.
+  // All-one V → output is also all 1.0 (weighted average of 1s).
+  std::vector<float> q_data(q_elements, 0.0f);
+  std::vector<float> k_data(kv_elements, 0.0f);
+  std::vector<float> v_data(kv_elements, 1.0f);
+
+  test.AddInput<MLFloat16>("Q", q_shape, ToFloat16(q_data));
+  test.AddInput<MLFloat16>("K", k_shape, ToFloat16(k_data));
+  test.AddInput<MLFloat16>("V", v_shape, ToFloat16(v_data));
+  test.AddOptionalInputEdge<MLFloat16>(); // attn_mask
+  test.AddOptionalInputEdge<MLFloat16>(); // past_key
+  test.AddOptionalInputEdge<MLFloat16>(); // past_value
+
+  // Expected output: all 1.0 (uniform attention over all-ones V).
+  std::vector<int64_t> y_shape = {batch_size, num_heads, q_sequence_length, head_size};
+  std::vector<float> expected_y(q_elements, 1.0f);
+  test.AddOutput<MLFloat16>("Y", y_shape, ToFloat16(expected_y), false, 0, 3e-3f);
+  test.AddOptionalOutputEdge<MLFloat16>(); // present_key
+  test.AddOptionalOutputEdge<MLFloat16>(); // present_value
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
+
 }  // namespace test
 }  // namespace onnxruntime

From 1769061a869cd208efcad4d472e8cd7d7e2bc56c Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 23 Mar 2026 18:11:35 -0700
Subject: [PATCH 03/13] fix overflow issue

---
 onnxruntime/core/providers/cpu/llm/attention.cc | 14 ++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/llm/attention.cc b/onnxruntime/core/providers/cpu/llm/attention.cc
index 3d505af487712..648fb483ba2dd 100644
--- a/onnxruntime/core/providers/cpu/llm/attention.cc
+++ b/onnxruntime/core/providers/cpu/llm/attention.cc
@@ -78,20 +78,21 @@ void make_copy(MLFloat16* mask_data, const bool* mask_index, si
 }
 
 template <typename T>
-inline void ComputeAttentionSoftmaxInplace(T* score, int N, int D, ThreadPool* tp, AllocatorPtr) {
+inline void ComputeAttentionSoftmaxInplace(T* score, size_t N, size_t D, ThreadPool* tp, AllocatorPtr) {
   MlasComputeSoftmax(score, score, N, D, false, false, 0.0f, tp);
 }
 
 template <>
-inline void ComputeAttentionSoftmaxInplace(MLFloat16* score, int N, int D, ThreadPool* tp, AllocatorPtr allocator) {
+inline void ComputeAttentionSoftmaxInplace(MLFloat16* score, size_t N, size_t D, ThreadPool* tp, AllocatorPtr allocator) {
   ORT_ENFORCE(tp == nullptr, "No parallelized version of softmax for float16.");
   // Mlas Lacks kernels for fp16 softmax, we convert into float32 and call the float32 version.
-  void* allocated_ptr = allocator->Alloc(static_cast<size_t>(N * D * sizeof(float)));
+  const auto num_elements = SafeInt<size_t>(N) * D;
+  void* allocated_ptr = allocator->Alloc(num_elements * sizeof(float));
   BufferUniquePtr float_buffer(allocated_ptr, BufferDeleter(allocator));
   float* ptr = reinterpret_cast<float*>(allocated_ptr);
-  MlasConvertHalfToFloatBuffer(score, ptr, N * D);
+  MlasConvertHalfToFloatBuffer(score, ptr, num_elements);
   MlasComputeSoftmax(ptr, ptr, N, D, false, false, 0.0f, tp);
-  MlasConvertFloatToHalfBuffer(ptr, score, N * D);
+  MlasConvertFloatToHalfBuffer(ptr, score, num_elements);
 }
 
 template <typename T>
@@ -500,7 +501,8 @@ void AttentionBase<T>::ComputeAttentionProbs(T* attention_probs,
     if (out_qk != nullptr && parameters.qk_matmul_output_mode == attention_helper::QKMatMulOutputMode::kQKSoftCap) {
       memcpy(out_qk, output, SafeInt<size_t>(probs_matrix_size) * sizeof(T));
     }
-    ComputeAttentionSoftmaxInplace(output, parameters.q_sequence_length, parameters.total_sequence_length, nullptr, allocator);
+    ComputeAttentionSoftmaxInplace(output, SafeInt<size_t>(parameters.q_sequence_length),
+                                   SafeInt<size_t>(parameters.total_sequence_length), nullptr, allocator);
     if (output_qk != nullptr && parameters.qk_matmul_output_mode == attention_helper::QKMatMulOutputMode::kQKSoftMax) {
       memcpy(output_qk + output_offset, output,

From 20332e891fa68b2c033e490154fffe95717323a4 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 23 Mar 2026 18:22:01 -0700
Subject: [PATCH 04/13] lint

---
 .../test/providers/cpu/llm/attention_op_test.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
index 6d83c73acfe85..e9a4ec27d3a78 100644
--- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
+++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
@@ -2449,16 +2449,16 @@ TEST(AttentionTest, Attention_FP16_SoftmaxLargeDimensions) {
   test.AddInput<MLFloat16>("Q", q_shape, ToFloat16(q_data));
   test.AddInput<MLFloat16>("K", k_shape, ToFloat16(k_data));
   test.AddInput<MLFloat16>("V", v_shape, ToFloat16(v_data));
-  test.AddOptionalInputEdge<MLFloat16>(); // attn_mask
-  test.AddOptionalInputEdge<MLFloat16>(); // past_key
-  test.AddOptionalInputEdge<MLFloat16>(); // past_value
+  test.AddOptionalInputEdge<MLFloat16>();  // attn_mask
+  test.AddOptionalInputEdge<MLFloat16>();  // past_key
+  test.AddOptionalInputEdge<MLFloat16>();  // past_value
 
   // Expected output: all 1.0 (uniform attention over all-ones V).
std::vector y_shape = {batch_size, num_heads, q_sequence_length, head_size}; std::vector expected_y(q_elements, 1.0f); test.AddOutput("Y", y_shape, ToFloat16(expected_y), false, 0, 3e-3f); - test.AddOptionalOutputEdge(); // present_key - test.AddOptionalOutputEdge(); // present_value + test.AddOptionalOutputEdge(); // present_key + test.AddOptionalOutputEdge(); // present_value std::vector> execution_providers; execution_providers.push_back(DefaultCpuExecutionProvider()); From d30ee590c7286860b75938b778e55aa9871ae819 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Mon, 23 Mar 2026 18:52:18 -0700 Subject: [PATCH 05/13] update comment --- onnxruntime/test/providers/cpu/llm/attention_op_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc index e9a4ec27d3a78..f2b6f536548af 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc @@ -2409,7 +2409,7 @@ TEST(AttentionTest, Attention_NonPadKVSeqLen_WithFloatAttnMask_MultiBatch) { // Regression test for integer overflow in FP16 softmax allocation. // ComputeAttentionSoftmaxInplace previously used int for N and D. -// For large enough values of N and D, N * D * sizeof(float) could overflow int32. +// For large enough values of N and D, N * D could overflow int32. TEST(AttentionTest, Attention_FP16_SoftmaxLargeDimensions) { // Skip if the machine has less than 16GB of physical RAM. constexpr uint64_t required_ram_bytes = 16ULL * 1024 * 1024 * 1024; From 70cd5b14003ec19cd4f68fbf6a1567878cfa2376 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 10:16:03 -0700 Subject: [PATCH 06/13] increase tolerance for test output --- onnxruntime/test/providers/cpu/llm/attention_op_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc index f2b6f536548af..d4be253f914a8 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc @@ -2456,7 +2456,7 @@ TEST(AttentionTest, Attention_FP16_SoftmaxLargeDimensions) { // Expected output: all 1.0 (uniform attention over all-ones V). 
std::vector y_shape = {batch_size, num_heads, q_sequence_length, head_size}; std::vector expected_y(q_elements, 1.0f); - test.AddOutput("Y", y_shape, ToFloat16(expected_y), false, 0, 3e-3f); + test.AddOutput("Y", y_shape, ToFloat16(expected_y), false, 0, 3e-2f); test.AddOptionalOutputEdge(); // present_key test.AddOptionalOutputEdge(); // present_value From 605328f3bb280b8afdb0cb4ae73d3de341f72bdc Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 11:31:42 -0700 Subject: [PATCH 07/13] expect test overflow in 32-bit builds, update test name and comment --- .../test/providers/cpu/llm/attention_op_test.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc index d4be253f914a8..77a6ae92b95c1 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc @@ -2407,10 +2407,10 @@ TEST(AttentionTest, Attention_NonPadKVSeqLen_WithFloatAttnMask_MultiBatch) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } -// Regression test for integer overflow in FP16 softmax allocation. +// Regression test for CPU kernel integer overflow in FP16 softmax allocation. // ComputeAttentionSoftmaxInplace previously used int for N and D. // For large enough values of N and D, N * D could overflow int32. -TEST(AttentionTest, Attention_FP16_SoftmaxLargeDimensions) { +TEST(AttentionTest, AttentionCpuFp16SoftmaxLargeDimensions) { // Skip if the machine has less than 16GB of physical RAM. constexpr uint64_t required_ram_bytes = 16ULL * 1024 * 1024 * 1024; if (const auto total_ram_bytes = GetTotalPhysicalMemoryBytes(); @@ -2462,7 +2462,13 @@ TEST(AttentionTest, Attention_FP16_SoftmaxLargeDimensions) { std::vector> execution_providers; execution_providers.push_back(DefaultCpuExecutionProvider()); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + + if constexpr (sizeof(void*) == 4) { + // Expect overflow for 32-bit builds. + test.Run(OpTester::ExpectResult::kExpectFailure, "Integer overflow", {}, nullptr, &execution_providers); + } else { + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } } } // namespace test From c3ba15bc4e26497d65efdfdd66ea221c8b28de1c Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 13:52:01 -0700 Subject: [PATCH 08/13] Fix integer overflow in FP16 softmax allocation Extract ComputeAttentionSoftmaxInplace into attention_softmax.h, changing parameters from int to size_t and using SafeInt for the N*D multiplication. Previously, N*D could overflow int32 when q_sequence_length * total_sequence_length > INT_MAX, causing an undersized buffer allocation. Replace the old Attention_FP16_SoftmaxLargeDimensions operator-level test in attention_op_test.cc with a direct regression test in the new attention_softmax_test.cc. The new test uses a custom allocator to verify the correct allocation size without needing the ~8GB buffer, and handles both 64-bit (verifies size) and 32-bit (SafeInt overflow) builds. Remove system_info.h/.cc (no longer needed). 
--- .../core/providers/cpu/llm/attention.cc | 22 +---- .../providers/cpu/llm/attention_softmax.h | 35 ++++++++ .../providers/cpu/llm/attention_op_test.cc | 65 -------------- .../cpu/llm/attention_softmax_test.cc | 85 +++++++++++++++++++ onnxruntime/test/util/include/system_info.h | 16 ---- onnxruntime/test/util/system_info.cc | 33 ------- 6 files changed, 122 insertions(+), 134 deletions(-) create mode 100644 onnxruntime/core/providers/cpu/llm/attention_softmax.h create mode 100644 onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc delete mode 100644 onnxruntime/test/util/include/system_info.h delete mode 100644 onnxruntime/test/util/system_info.cc diff --git a/onnxruntime/core/providers/cpu/llm/attention.cc b/onnxruntime/core/providers/cpu/llm/attention.cc index 648fb483ba2dd..7a94d9a39182c 100644 --- a/onnxruntime/core/providers/cpu/llm/attention.cc +++ b/onnxruntime/core/providers/cpu/llm/attention.cc @@ -3,6 +3,7 @@ #include "core/providers/cpu/llm/attention.h" #include "core/providers/cpu/llm/attention_helper.h" +#include "core/providers/cpu/llm/attention_softmax.h" #include "core/common/common.h" #include "core/common/safeint.h" @@ -77,24 +78,6 @@ void make_copy(MLFloat16* mask_data, const bool* mask_index, si } } -template -inline void ComputeAttentionSoftmaxInplace(T* score, size_t N, size_t D, ThreadPool* tp, AllocatorPtr) { - MlasComputeSoftmax(score, score, N, D, false, false, 0.0f, tp); -} - -template <> -inline void ComputeAttentionSoftmaxInplace(MLFloat16* score, size_t N, size_t D, ThreadPool* tp, AllocatorPtr allocator) { - ORT_ENFORCE(tp == nullptr, "No parallelized version of softmax for float16."); - // Mlas Lacks kernels for fp16 softmax, we convert into float32 and call the float32 version. - const auto num_elements = SafeInt(N) * D; - void* allocated_ptr = allocator->Alloc(num_elements * sizeof(float)); - BufferUniquePtr float_buffer(allocated_ptr, BufferDeleter(allocator)); - float* ptr = reinterpret_cast(allocated_ptr); - MlasConvertHalfToFloatBuffer(score, ptr, num_elements); - MlasComputeSoftmax(ptr, ptr, N, D, false, false, 0.0f, tp); - MlasConvertFloatToHalfBuffer(ptr, score, num_elements); -} - template inline void ComputeAttentionSoftcapInplace(T* scores, int sequence_length, T softcap) { MlasComputeSoftcap(scores, scores, sequence_length, softcap); @@ -501,8 +484,7 @@ void AttentionBase::ComputeAttentionProbs(T* attention_probs, if (out_qk != nullptr && parameters.qk_matmul_output_mode == attention_helper::QKMatMulOutputMode::kQKSoftCap) { memcpy(out_qk, output, SafeInt(probs_matrix_size) * sizeof(T)); } - ComputeAttentionSoftmaxInplace(output, SafeInt(parameters.q_sequence_length), - SafeInt(parameters.total_sequence_length), nullptr, allocator); + ComputeAttentionSoftmaxInplace(output, parameters.q_sequence_length, parameters.total_sequence_length, nullptr, allocator); if (output_qk != nullptr && parameters.qk_matmul_output_mode == attention_helper::QKMatMulOutputMode::kQKSoftMax) { memcpy(output_qk + output_offset, output, diff --git a/onnxruntime/core/providers/cpu/llm/attention_softmax.h b/onnxruntime/core/providers/cpu/llm/attention_softmax.h new file mode 100644 index 0000000000000..7176e7b6a7d58 --- /dev/null +++ b/onnxruntime/core/providers/cpu/llm/attention_softmax.h @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "core/common/common.h" +#include "core/common/safeint.h" +#include "core/framework/allocator.h" +#include "core/framework/buffer_deleter.h" +#include "core/mlas/inc/mlas.h" +#include "core/platform/threadpool.h" + +namespace onnxruntime { + +template +inline void ComputeAttentionSoftmaxInplace(T* score, size_t N, size_t D, + concurrency::ThreadPool* tp, AllocatorPtr) { + MlasComputeSoftmax(score, score, N, D, false, false, 0.0f, tp); +} + +template <> +inline void ComputeAttentionSoftmaxInplace(MLFloat16* score, size_t N, size_t D, + concurrency::ThreadPool* tp, AllocatorPtr allocator) { + ORT_ENFORCE(tp == nullptr, "No parallelized version of softmax for float16."); + // MLAS lacks kernels for fp16 softmax, so we convert to float32 and use the float32 version. + size_t num_elements = SafeInt(N) * D; + void* allocated_ptr = allocator->Alloc(num_elements * sizeof(float)); + BufferUniquePtr float_buffer(allocated_ptr, BufferDeleter(allocator)); + float* ptr = reinterpret_cast(allocated_ptr); + MlasConvertHalfToFloatBuffer(score, ptr, num_elements); + MlasComputeSoftmax(ptr, ptr, N, D, false, false, 0.0f, tp); + MlasConvertFloatToHalfBuffer(ptr, score, num_elements); +} + +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc index 77a6ae92b95c1..b651a47b582ac 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc @@ -8,7 +8,6 @@ #include "test/common/tensor_op_test_utils.h" #include "test/common/cuda_op_test_utils.h" #include "test/providers/provider_test_utils.h" -#include "test/util/include/system_info.h" namespace onnxruntime { namespace test { @@ -2407,69 +2406,5 @@ TEST(AttentionTest, Attention_NonPadKVSeqLen_WithFloatAttnMask_MultiBatch) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } -// Regression test for CPU kernel integer overflow in FP16 softmax allocation. -// ComputeAttentionSoftmaxInplace previously used int for N and D. -// For large enough values of N and D, N * D could overflow int32. -TEST(AttentionTest, AttentionCpuFp16SoftmaxLargeDimensions) { - // Skip if the machine has less than 16GB of physical RAM. - constexpr uint64_t required_ram_bytes = 16ULL * 1024 * 1024 * 1024; - if (const auto total_ram_bytes = GetTotalPhysicalMemoryBytes(); - total_ram_bytes.has_value() && *total_ram_bytes < required_ram_bytes) { - GTEST_SKIP() << "Skipping: test requires >= 16GB RAM, machine has " - << (*total_ram_bytes / (1024 * 1024)) << "MB"; - } - - constexpr int batch_size = 1; - constexpr int num_heads = 1; - constexpr int q_sequence_length = 46341; - constexpr int kv_sequence_length = 46341; - constexpr int head_size = 1; - - // Verify at compile time that these dimensions trigger the overflow scenario. 
- static_assert(static_cast(q_sequence_length) * kv_sequence_length > - static_cast(std::numeric_limits::max()), - "Test dimensions must cause int32 overflow in N*D"); - - OpTester test("Attention", 23, onnxruntime::kOnnxDomain); - - // 4D BNSH inputs - std::vector q_shape = {batch_size, num_heads, q_sequence_length, head_size}; - std::vector k_shape = {batch_size, num_heads, kv_sequence_length, head_size}; - std::vector v_shape = {batch_size, num_heads, kv_sequence_length, head_size}; - - constexpr int q_elements = batch_size * num_heads * q_sequence_length * head_size; - constexpr int kv_elements = batch_size * num_heads * kv_sequence_length * head_size; - - // All-zero Q and K → all attention scores are 0, softmax produces uniform 1/kv_seq. - // All-one V → output is also all 1.0 (weighted average of 1s). - std::vector q_data(q_elements, 0.0f); - std::vector k_data(kv_elements, 0.0f); - std::vector v_data(kv_elements, 1.0f); - - test.AddInput("Q", q_shape, ToFloat16(q_data)); - test.AddInput("K", k_shape, ToFloat16(k_data)); - test.AddInput("V", v_shape, ToFloat16(v_data)); - test.AddOptionalInputEdge(); // attn_mask - test.AddOptionalInputEdge(); // past_key - test.AddOptionalInputEdge(); // past_value - - // Expected output: all 1.0 (uniform attention over all-ones V). - std::vector y_shape = {batch_size, num_heads, q_sequence_length, head_size}; - std::vector expected_y(q_elements, 1.0f); - test.AddOutput("Y", y_shape, ToFloat16(expected_y), false, 0, 3e-2f); - test.AddOptionalOutputEdge(); // present_key - test.AddOptionalOutputEdge(); // present_value - - std::vector> execution_providers; - execution_providers.push_back(DefaultCpuExecutionProvider()); - - if constexpr (sizeof(void*) == 4) { - // Expect overflow for 32-bit builds. - test.Run(OpTester::ExpectResult::kExpectFailure, "Integer overflow", {}, nullptr, &execution_providers); - } else { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); - } -} - } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc new file mode 100644 index 0000000000000..b4a39bdcdcc99 --- /dev/null +++ b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_NO_EXCEPTIONS) + +#include +#include + +#include "gtest/gtest.h" + +#include "core/framework/allocator.h" +#include "core/providers/cpu/llm/attention_softmax.h" + +namespace onnxruntime { +namespace test { + +// Regression test for integer overflow in FP16 softmax allocation. +// ComputeAttentionSoftmaxInplace previously used int for N and D, so N*D could overflow int32. +// The fix changed parameters to size_t and uses SafeInt for the multiplication. +// +// This test calls ComputeAttentionSoftmaxInplace directly with overflow-triggering dimensions +// (N=46341, D=46341, where N*D > INT_MAX). +// A custom allocator intercepts the Alloc call to verify the requested size is computed correctly with size_t +// arithmetic, without actually allocating the ~8GB buffer. +// +// On 32-bit builds, SafeInt(N) * D itself overflows. +TEST(AttentionSoftmaxTest, Fp16OverflowAllocation) { + // Custom exception thrown by the allocator to distinguish it from SafeInt overflow. 
+ struct AllocationIntercepted : std::exception { + const char* what() const noexcept override { return "allocation intercepted"; } + }; + + // Custom allocator that records the requested allocation size and throws to avoid actually allocating the + // (very large) buffer. + class OverflowCheckAllocator : public IAllocator { + public: + OverflowCheckAllocator() + : IAllocator(OrtMemoryInfo(CPU, OrtDeviceAllocator)) {} + void* Alloc(size_t size) override { + last_alloc_size_ = size; + throw AllocationIntercepted(); + } + void Free(void*) override {} + size_t LastAllocSize() const { return last_alloc_size_; } + + private: + size_t last_alloc_size_ = 0; + }; + + constexpr size_t N = 46341; + constexpr size_t D = 46341; + + // Verify at compile time that these dimensions would overflow int32. + static_assert(static_cast(N) * D > static_cast(std::numeric_limits::max()), + "Test dimensions must cause int32 overflow in N*D"); + + auto alloc = std::make_shared(); + MLFloat16 dummy_score{0.0f}; + + if constexpr (static_cast(N) * D <= static_cast(std::numeric_limits::max())) { + // N * D fits in size_t. The function reaches Alloc, which records the requested size and throws + // AllocationIntercepted. + EXPECT_THROW(ComputeAttentionSoftmaxInplace(&dummy_score, N, D, nullptr, alloc), + AllocationIntercepted); + + // The allocation size must reflect correct size_t arithmetic: N * D * sizeof(float). + // With the old int parameters, N * D would overflow to a small/negative value, producing a wrong (much smaller) + // allocation size. + constexpr size_t expected_bytes = N * D * sizeof(float); + EXPECT_EQ(alloc->LastAllocSize(), expected_bytes); + } else { + // N * D overflows size_t (i.e., in a 32-bit build), so SafeInt will throw an exception. + try { + ComputeAttentionSoftmaxInplace(&dummy_score, N, D, nullptr, alloc); + FAIL() << "Expected OnnxRuntimeException to be thrown"; + } catch (const OnnxRuntimeException& e) { + EXPECT_THAT(e.what(), testing::HasSubstr("Integer overflow")); + } + } +} + +} // namespace test +} // namespace onnxruntime + +#endif // !defined(ORT_NO_EXCEPTIONS) diff --git a/onnxruntime/test/util/include/system_info.h b/onnxruntime/test/util/include/system_info.h deleted file mode 100644 index 7f0e925c6ccbc..0000000000000 --- a/onnxruntime/test/util/include/system_info.h +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include -#include - -namespace onnxruntime { -namespace test { - -// Returns the total physical memory (RAM) in bytes, or std::nullopt if detection fails. -std::optional GetTotalPhysicalMemoryBytes(); - -} // namespace test -} // namespace onnxruntime diff --git a/onnxruntime/test/util/system_info.cc b/onnxruntime/test/util/system_info.cc deleted file mode 100644 index 653be56cd98c2..0000000000000 --- a/onnxruntime/test/util/system_info.cc +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include "test/util/include/system_info.h" - -#ifdef _WIN32 -#include -#else -#include -#endif - -namespace onnxruntime { -namespace test { - -std::optional GetTotalPhysicalMemoryBytes() { -#ifdef _WIN32 - MEMORYSTATUSEX mem_info = {}; - mem_info.dwLength = sizeof(mem_info); - if (GlobalMemoryStatusEx(&mem_info)) { - return static_cast(mem_info.ullTotalPhys); - } -#else - long pages = sysconf(_SC_PHYS_PAGES); - long page_size = sysconf(_SC_PAGESIZE); - if (pages > 0 && page_size > 0) { - return static_cast(pages) * static_cast(page_size); - } -#endif - return std::nullopt; -} - -} // namespace test -} // namespace onnxruntime From 8c69b10a2a611cd5b9e164ea377f2776d156fada Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:07:41 -0700 Subject: [PATCH 09/13] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc index b4a39bdcdcc99..fad847aef4602 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc @@ -7,6 +7,7 @@ #include #include "gtest/gtest.h" +#include "gmock/gmock.h" #include "core/framework/allocator.h" #include "core/providers/cpu/llm/attention_softmax.h" From a168897c7730f1e8cda155d7b24d5de1ef26b150 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:10:46 -0700 Subject: [PATCH 10/13] propagate safeint --- onnxruntime/core/providers/cpu/llm/attention_softmax.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/cpu/llm/attention_softmax.h b/onnxruntime/core/providers/cpu/llm/attention_softmax.h index 7176e7b6a7d58..77c390418be4b 100644 --- a/onnxruntime/core/providers/cpu/llm/attention_softmax.h +++ b/onnxruntime/core/providers/cpu/llm/attention_softmax.h @@ -23,7 +23,7 @@ inline void ComputeAttentionSoftmaxInplace(MLFloat16* score, size_t N concurrency::ThreadPool* tp, AllocatorPtr allocator) { ORT_ENFORCE(tp == nullptr, "No parallelized version of softmax for float16."); // MLAS lacks kernels for fp16 softmax, so we convert to float32 and use the float32 version. - size_t num_elements = SafeInt(N) * D; + auto num_elements = SafeInt(N) * D; void* allocated_ptr = allocator->Alloc(num_elements * sizeof(float)); BufferUniquePtr float_buffer(allocated_ptr, BufferDeleter(allocator)); float* ptr = reinterpret_cast(allocated_ptr); From 33cc1ea734561c2df5fec173a2c33f82edd94c08 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:58:05 -0700 Subject: [PATCH 11/13] Add float16 header. 
--- onnxruntime/core/providers/cpu/llm/attention_softmax.h | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/core/providers/cpu/llm/attention_softmax.h b/onnxruntime/core/providers/cpu/llm/attention_softmax.h index 77c390418be4b..46beed384f739 100644 --- a/onnxruntime/core/providers/cpu/llm/attention_softmax.h +++ b/onnxruntime/core/providers/cpu/llm/attention_softmax.h @@ -4,6 +4,7 @@ #pragma once #include "core/common/common.h" +#include "core/common/float16.h" #include "core/common/safeint.h" #include "core/framework/allocator.h" #include "core/framework/buffer_deleter.h" From 136a6e22b0ba2d90122c968e451bf9d03f866478 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 15:33:37 -0700 Subject: [PATCH 12/13] improve size_t overflow check --- .../cpu/llm/attention_softmax_test.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc index fad847aef4602..2cab626fe363e 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc @@ -52,25 +52,26 @@ TEST(AttentionSoftmaxTest, Fp16OverflowAllocation) { constexpr size_t D = 46341; // Verify at compile time that these dimensions would overflow int32. - static_assert(static_cast(N) * D > static_cast(std::numeric_limits::max()), + static_assert(int64_t{N} * int64_t{D} > int64_t{std::numeric_limits::max()}, "Test dimensions must cause int32 overflow in N*D"); auto alloc = std::make_shared(); MLFloat16 dummy_score{0.0f}; - if constexpr (static_cast(N) * D <= static_cast(std::numeric_limits::max())) { - // N * D fits in size_t. The function reaches Alloc, which records the requested size and throws + // The allocation size must reflect correct size_t arithmetic: N * D * sizeof(float). + // With the old int parameters, N * D would overflow to a small/negative value, producing a wrong (much smaller) + // allocation size. + constexpr uintmax_t expected_allocation_size = uintmax_t{N} * D * sizeof(float); + + if constexpr (expected_allocation_size <= uintmax_t{std::numeric_limits::max()}) { + // Allocation size fits in size_t. The function reaches Alloc, which records the requested size and throws // AllocationIntercepted. EXPECT_THROW(ComputeAttentionSoftmaxInplace(&dummy_score, N, D, nullptr, alloc), AllocationIntercepted); - // The allocation size must reflect correct size_t arithmetic: N * D * sizeof(float). - // With the old int parameters, N * D would overflow to a small/negative value, producing a wrong (much smaller) - // allocation size. - constexpr size_t expected_bytes = N * D * sizeof(float); - EXPECT_EQ(alloc->LastAllocSize(), expected_bytes); + EXPECT_EQ(alloc->LastAllocSize(), static_cast(expected_allocation_size)); } else { - // N * D overflows size_t (i.e., in a 32-bit build), so SafeInt will throw an exception. + // Allocation size overflows size_t (i.e., in a 32-bit build), so SafeInt will throw an exception. 
try { ComputeAttentionSoftmaxInplace(&dummy_score, N, D, nullptr, alloc); FAIL() << "Expected OnnxRuntimeException to be thrown"; From 9923ec0892e6c3e84042065d0c2526359007db7a Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 17:29:41 -0700 Subject: [PATCH 13/13] update comments --- onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc index 2cab626fe363e..cb98d248b2196 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc @@ -24,7 +24,7 @@ namespace test { // A custom allocator intercepts the Alloc call to verify the requested size is computed correctly with size_t // arithmetic, without actually allocating the ~8GB buffer. // -// On 32-bit builds, SafeInt(N) * D itself overflows. +// On 32-bit builds, SafeInt will signal an overflow for the requested size. TEST(AttentionSoftmaxTest, Fp16OverflowAllocation) { // Custom exception thrown by the allocator to distinguish it from SafeInt overflow. struct AllocationIntercepted : std::exception { @@ -59,8 +59,7 @@ TEST(AttentionSoftmaxTest, Fp16OverflowAllocation) { MLFloat16 dummy_score{0.0f}; // The allocation size must reflect correct size_t arithmetic: N * D * sizeof(float). - // With the old int parameters, N * D would overflow to a small/negative value, producing a wrong (much smaller) - // allocation size. + // With the old int parameters, N * D would overflow to a small/negative value, producing a wrong allocation size. constexpr uintmax_t expected_allocation_size = uintmax_t{N} * D * sizeof(float); if constexpr (expected_allocation_size <= uintmax_t{std::numeric_limits::max()}) {
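
For reference, the arithmetic that this series guards against can be reproduced in isolation. The sketch below is not part of any patch above and does not use onnxruntime's SafeInt; a hand-rolled checked_mul stands in for SafeInt<size_t>(N) * D so the example is self-contained. It reuses the 46341 x 46341 dimensions from the regression test: their product, 2,147,488,281, exceeds INT32_MAX, so the old int-typed N and D could not represent the element count, while size_t arithmetic on a 64-bit target can.

// Standalone sketch (not part of the patches above) of the overflow being fixed.
// checked_mul is a hypothetical stand-in for onnxruntime's SafeInt<size_t>.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <stdexcept>

std::size_t checked_mul(std::size_t a, std::size_t b) {
  // Reject products that would wrap around size_t, the failure SafeInt turns into an exception.
  if (b != 0 && a > std::numeric_limits<std::size_t>::max() / b) {
    throw std::overflow_error("size_t multiplication overflow");
  }
  return a * b;
}

int main() {
  constexpr std::size_t N = 46341;  // q_sequence_length in the regression test
  constexpr std::size_t D = 46341;  // total_sequence_length in the regression test

  // 46341 * 46341 = 2,147,488,281 > INT32_MAX (2,147,483,647), so an int-typed N * D cannot hold it.
  static_assert(std::uint64_t{N} * D > std::uint64_t{std::numeric_limits<std::int32_t>::max()},
                "dimensions must exceed INT32_MAX to exercise the overflow");

  const std::size_t num_elements = checked_mul(N, D);                  // fine with a 64-bit size_t
  const std::size_t bytes = checked_mul(num_elements, sizeof(float));  // ~8.6 GB float scratch buffer
  std::printf("softmax scratch buffer: %zu elements, %zu bytes\n", num_elements, bytes);
  return 0;
}

Propagating the SafeInt result (as patch 10 does with `auto num_elements = SafeInt<size_t>(N) * D;`) rather than collapsing it to a plain size_t keeps the subsequent num_elements * sizeof(float) multiplication checked as well.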