From c9f71907bbaa89318ffba362301e419ca009d868 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 23 Mar 2026 18:04:48 -0700
Subject: [PATCH 01/13] add GetTotalPhysicalMemoryBytes helper

---
 onnxruntime/test/util/include/system_info.h | 16 ++++++++++
 onnxruntime/test/util/system_info.cc        | 33 +++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 onnxruntime/test/util/include/system_info.h
 create mode 100644 onnxruntime/test/util/system_info.cc

diff --git a/onnxruntime/test/util/include/system_info.h b/onnxruntime/test/util/include/system_info.h
new file mode 100644
index 0000000000000..7f0e925c6ccbc
--- /dev/null
+++ b/onnxruntime/test/util/include/system_info.h
@@ -0,0 +1,16 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cstdint>
+#include <optional>
+
+namespace onnxruntime {
+namespace test {
+
+// Returns the total physical memory (RAM) in bytes, or std::nullopt if detection fails.
+std::optional<uint64_t> GetTotalPhysicalMemoryBytes();
+
+}  // namespace test
+}  // namespace onnxruntime
diff --git a/onnxruntime/test/util/system_info.cc b/onnxruntime/test/util/system_info.cc
new file mode 100644
index 0000000000000..653be56cd98c2
--- /dev/null
+++ b/onnxruntime/test/util/system_info.cc
@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "test/util/include/system_info.h"
+
+#ifdef _WIN32
+#include <Windows.h>
+#else
+#include <unistd.h>
+#endif
+
+namespace onnxruntime {
+namespace test {
+
+std::optional<uint64_t> GetTotalPhysicalMemoryBytes() {
+#ifdef _WIN32
+  MEMORYSTATUSEX mem_info = {};
+  mem_info.dwLength = sizeof(mem_info);
+  if (GlobalMemoryStatusEx(&mem_info)) {
+    return static_cast<uint64_t>(mem_info.ullTotalPhys);
+  }
+#else
+  long pages = sysconf(_SC_PHYS_PAGES);
+  long page_size = sysconf(_SC_PAGESIZE);
+  if (pages > 0 && page_size > 0) {
+    return static_cast<uint64_t>(pages) * static_cast<uint64_t>(page_size);
+  }
+#endif
+  return std::nullopt;
+}
+
+}  // namespace test
+}  // namespace onnxruntime

From b72abd5e574fccde132256106230c4006c61d6f0 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 23 Mar 2026 18:10:20 -0700
Subject: [PATCH 02/13] add test

---
 .../providers/cpu/llm/attention_op_test.cc | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
index 45c5220f42564..6d83c73acfe85 100644
--- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
+++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
@@ -2,11 +2,13 @@
 // Licensed under the MIT License.
 
 #include
+#include
 #include "gtest/gtest.h"
 #include "core/session/onnxruntime_cxx_api.h"
 #include "test/common/tensor_op_test_utils.h"
 #include "test/common/cuda_op_test_utils.h"
 #include "test/providers/provider_test_utils.h"
+#include "test/util/include/system_info.h"
 
 namespace onnxruntime {
 namespace test {
@@ -2405,5 +2407,63 @@ TEST(AttentionTest, Attention_NonPadKVSeqLen_WithFloatAttnMask_MultiBatch) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
 }
 
+// Regression test for integer overflow in FP16 softmax allocation.
+// ComputeAttentionSoftmaxInplace previously used int for N and D.
+// For large enough values of N and D, N * D * sizeof(float) could overflow int32.
+TEST(AttentionTest, Attention_FP16_SoftmaxLargeDimensions) {
+  // Skip if the machine has less than 16GB of physical RAM.
+  constexpr uint64_t required_ram_bytes = 16ULL * 1024 * 1024 * 1024;
+  if (const auto total_ram_bytes = GetTotalPhysicalMemoryBytes();
+      total_ram_bytes.has_value() && *total_ram_bytes < required_ram_bytes) {
+    GTEST_SKIP() << "Skipping: test requires >= 16GB RAM, machine has "
+                 << (*total_ram_bytes / (1024 * 1024)) << "MB";
+  }
+
+  constexpr int batch_size = 1;
+  constexpr int num_heads = 1;
+  constexpr int q_sequence_length = 46341;
+  constexpr int kv_sequence_length = 46341;
+  constexpr int head_size = 1;
+
+  // Verify at compile time that these dimensions trigger the overflow scenario.
+  static_assert(static_cast<int64_t>(q_sequence_length) * kv_sequence_length >
+                    static_cast<int64_t>(std::numeric_limits<int32_t>::max()),
+                "Test dimensions must cause int32 overflow in N*D");
+
+  OpTester test("Attention", 23, onnxruntime::kOnnxDomain);
+
+  // 4D BNSH inputs
+  std::vector<int64_t> q_shape = {batch_size, num_heads, q_sequence_length, head_size};
+  std::vector<int64_t> k_shape = {batch_size, num_heads, kv_sequence_length, head_size};
+  std::vector<int64_t> v_shape = {batch_size, num_heads, kv_sequence_length, head_size};
+
+  constexpr int q_elements = batch_size * num_heads * q_sequence_length * head_size;
+  constexpr int kv_elements = batch_size * num_heads * kv_sequence_length * head_size;
+
+  // All-zero Q and K → all attention scores are 0, softmax produces uniform 1/kv_seq.
+  // All-one V → output is also all 1.0 (weighted average of 1s).
+  std::vector<float> q_data(q_elements, 0.0f);
+  std::vector<float> k_data(kv_elements, 0.0f);
+  std::vector<float> v_data(kv_elements, 1.0f);
+
+  test.AddInput<MLFloat16>("Q", q_shape, ToFloat16(q_data));
+  test.AddInput<MLFloat16>("K", k_shape, ToFloat16(k_data));
+  test.AddInput<MLFloat16>("V", v_shape, ToFloat16(v_data));
+  test.AddOptionalInputEdge<MLFloat16>(); // attn_mask
+  test.AddOptionalInputEdge<MLFloat16>(); // past_key
+  test.AddOptionalInputEdge<MLFloat16>(); // past_value
+
+  // Expected output: all 1.0 (uniform attention over all-ones V).
+  std::vector<int64_t> y_shape = {batch_size, num_heads, q_sequence_length, head_size};
+  std::vector<float> expected_y(q_elements, 1.0f);
+  test.AddOutput<MLFloat16>("Y", y_shape, ToFloat16(expected_y), false, 0, 3e-3f);
+  test.AddOptionalOutputEdge<MLFloat16>(); // present_key
+  test.AddOptionalOutputEdge<MLFloat16>(); // present_value
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
+
 }  // namespace test
 }  // namespace onnxruntime

From 1769061a869cd208efcad4d472e8cd7d7e2bc56c Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 23 Mar 2026 18:11:35 -0700
Subject: [PATCH 03/13] fix overflow issue

---
 onnxruntime/core/providers/cpu/llm/attention.cc | 14 ++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/llm/attention.cc b/onnxruntime/core/providers/cpu/llm/attention.cc
index 3d505af487712..648fb483ba2dd 100644
--- a/onnxruntime/core/providers/cpu/llm/attention.cc
+++ b/onnxruntime/core/providers/cpu/llm/attention.cc
@@ -78,20 +78,21 @@ void make_copy(MLFloat16* mask_data, const bool* mask_index, si
 }
 
 template <typename T>
-inline void ComputeAttentionSoftmaxInplace(T* score, int N, int D, ThreadPool* tp, AllocatorPtr) {
+inline void ComputeAttentionSoftmaxInplace(T* score, size_t N, size_t D, ThreadPool* tp, AllocatorPtr) {
   MlasComputeSoftmax(score, score, N, D, false, false, 0.0f, tp);
 }
 
 template <>
-inline void ComputeAttentionSoftmaxInplace(MLFloat16* score, int N, int D, ThreadPool* tp, AllocatorPtr allocator) {
+inline void ComputeAttentionSoftmaxInplace(MLFloat16* score, size_t N, size_t D, ThreadPool* tp, AllocatorPtr allocator) {
   ORT_ENFORCE(tp == nullptr, "No parallelized version of softmax for float16.");
   // Mlas Lacks kernels for fp16 softmax, we convert into float32 and call the float32 version.
-  void* allocated_ptr = allocator->Alloc(static_cast<size_t>(N * D * sizeof(float)));
+  const auto num_elements = SafeInt<size_t>(N) * D;
+  void* allocated_ptr = allocator->Alloc(num_elements * sizeof(float));
   BufferUniquePtr float_buffer(allocated_ptr, BufferDeleter(allocator));
   float* ptr = reinterpret_cast<float*>(allocated_ptr);
-  MlasConvertHalfToFloatBuffer(score, ptr, N * D);
+  MlasConvertHalfToFloatBuffer(score, ptr, num_elements);
   MlasComputeSoftmax(ptr, ptr, N, D, false, false, 0.0f, tp);
-  MlasConvertFloatToHalfBuffer(ptr, score, N * D);
+  MlasConvertFloatToHalfBuffer(ptr, score, num_elements);
 }
 
 template <typename T>
@@ -500,7 +501,8 @@ void AttentionBase<T>::ComputeAttentionProbs(T* attention_probs,
     if (out_qk != nullptr && parameters.qk_matmul_output_mode == attention_helper::QKMatMulOutputMode::kQKSoftCap) {
       memcpy(out_qk, output, SafeInt<size_t>(probs_matrix_size) * sizeof(T));
     }
-    ComputeAttentionSoftmaxInplace(output, parameters.q_sequence_length, parameters.total_sequence_length, nullptr, allocator);
+    ComputeAttentionSoftmaxInplace(output, SafeInt<size_t>(parameters.q_sequence_length),
+                                   SafeInt<size_t>(parameters.total_sequence_length), nullptr, allocator);
     if (output_qk != nullptr && parameters.qk_matmul_output_mode == attention_helper::QKMatMulOutputMode::kQKSoftMax) {
       memcpy(output_qk + output_offset, output,

From 20332e891fa68b2c033e490154fffe95717323a4 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Mon, 23 Mar 2026 18:22:01 -0700
Subject: [PATCH 04/13] lint

---
 .../test/providers/cpu/llm/attention_op_test.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
index 6d83c73acfe85..e9a4ec27d3a78 100644
--- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
+++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
@@ -2449,16 +2449,16 @@ TEST(AttentionTest, Attention_FP16_SoftmaxLargeDimensions) {
   test.AddInput<MLFloat16>("Q", q_shape, ToFloat16(q_data));
   test.AddInput<MLFloat16>("K", k_shape, ToFloat16(k_data));
   test.AddInput<MLFloat16>("V", v_shape, ToFloat16(v_data));
-  test.AddOptionalInputEdge<MLFloat16>(); // attn_mask
-  test.AddOptionalInputEdge<MLFloat16>(); // past_key
-  test.AddOptionalInputEdge<MLFloat16>(); // past_value
+  test.AddOptionalInputEdge<MLFloat16>();  // attn_mask
+  test.AddOptionalInputEdge<MLFloat16>();  // past_key
+  test.AddOptionalInputEdge<MLFloat16>();  // past_value
 
   // Expected output: all 1.0 (uniform attention over all-ones V).
std::vector y_shape = {batch_size, num_heads, q_sequence_length, head_size}; std::vector expected_y(q_elements, 1.0f); test.AddOutput("Y", y_shape, ToFloat16(expected_y), false, 0, 3e-3f); - test.AddOptionalOutputEdge(); // present_key - test.AddOptionalOutputEdge(); // present_value + test.AddOptionalOutputEdge(); // present_key + test.AddOptionalOutputEdge(); // present_value std::vector> execution_providers; execution_providers.push_back(DefaultCpuExecutionProvider()); From d30ee590c7286860b75938b778e55aa9871ae819 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Mon, 23 Mar 2026 18:52:18 -0700 Subject: [PATCH 05/13] update comment --- onnxruntime/test/providers/cpu/llm/attention_op_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc index e9a4ec27d3a78..f2b6f536548af 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc @@ -2409,7 +2409,7 @@ TEST(AttentionTest, Attention_NonPadKVSeqLen_WithFloatAttnMask_MultiBatch) { // Regression test for integer overflow in FP16 softmax allocation. // ComputeAttentionSoftmaxInplace previously used int for N and D. -// For large enough values of N and D, N * D * sizeof(float) could overflow int32. +// For large enough values of N and D, N * D could overflow int32. TEST(AttentionTest, Attention_FP16_SoftmaxLargeDimensions) { // Skip if the machine has less than 16GB of physical RAM. constexpr uint64_t required_ram_bytes = 16ULL * 1024 * 1024 * 1024; From 70cd5b14003ec19cd4f68fbf6a1567878cfa2376 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 10:16:03 -0700 Subject: [PATCH 06/13] increase tolerance for test output --- onnxruntime/test/providers/cpu/llm/attention_op_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc index f2b6f536548af..d4be253f914a8 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc @@ -2456,7 +2456,7 @@ TEST(AttentionTest, Attention_FP16_SoftmaxLargeDimensions) { // Expected output: all 1.0 (uniform attention over all-ones V). 
std::vector y_shape = {batch_size, num_heads, q_sequence_length, head_size}; std::vector expected_y(q_elements, 1.0f); - test.AddOutput("Y", y_shape, ToFloat16(expected_y), false, 0, 3e-3f); + test.AddOutput("Y", y_shape, ToFloat16(expected_y), false, 0, 3e-2f); test.AddOptionalOutputEdge(); // present_key test.AddOptionalOutputEdge(); // present_value From 605328f3bb280b8afdb0cb4ae73d3de341f72bdc Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 11:31:42 -0700 Subject: [PATCH 07/13] expect test overflow in 32-bit builds, update test name and comment --- .../test/providers/cpu/llm/attention_op_test.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc index d4be253f914a8..77a6ae92b95c1 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc @@ -2407,10 +2407,10 @@ TEST(AttentionTest, Attention_NonPadKVSeqLen_WithFloatAttnMask_MultiBatch) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } -// Regression test for integer overflow in FP16 softmax allocation. +// Regression test for CPU kernel integer overflow in FP16 softmax allocation. // ComputeAttentionSoftmaxInplace previously used int for N and D. // For large enough values of N and D, N * D could overflow int32. -TEST(AttentionTest, Attention_FP16_SoftmaxLargeDimensions) { +TEST(AttentionTest, AttentionCpuFp16SoftmaxLargeDimensions) { // Skip if the machine has less than 16GB of physical RAM. constexpr uint64_t required_ram_bytes = 16ULL * 1024 * 1024 * 1024; if (const auto total_ram_bytes = GetTotalPhysicalMemoryBytes(); @@ -2462,7 +2462,13 @@ TEST(AttentionTest, Attention_FP16_SoftmaxLargeDimensions) { std::vector> execution_providers; execution_providers.push_back(DefaultCpuExecutionProvider()); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + + if constexpr (sizeof(void*) == 4) { + // Expect overflow for 32-bit builds. + test.Run(OpTester::ExpectResult::kExpectFailure, "Integer overflow", {}, nullptr, &execution_providers); + } else { + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } } } // namespace test From c3ba15bc4e26497d65efdfdd66ea221c8b28de1c Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 13:52:01 -0700 Subject: [PATCH 08/13] Fix integer overflow in FP16 softmax allocation Extract ComputeAttentionSoftmaxInplace into attention_softmax.h, changing parameters from int to size_t and using SafeInt for the N*D multiplication. Previously, N*D could overflow int32 when q_sequence_length * total_sequence_length > INT_MAX, causing an undersized buffer allocation. Replace the old Attention_FP16_SoftmaxLargeDimensions operator-level test in attention_op_test.cc with a direct regression test in the new attention_softmax_test.cc. The new test uses a custom allocator to verify the correct allocation size without needing the ~8GB buffer, and handles both 64-bit (verifies size) and 32-bit (SafeInt overflow) builds. Remove system_info.h/.cc (no longer needed). 
--- .../core/providers/cpu/llm/attention.cc | 22 +---- .../providers/cpu/llm/attention_softmax.h | 35 ++++++++ .../providers/cpu/llm/attention_op_test.cc | 65 -------------- .../cpu/llm/attention_softmax_test.cc | 85 +++++++++++++++++++ onnxruntime/test/util/include/system_info.h | 16 ---- onnxruntime/test/util/system_info.cc | 33 ------- 6 files changed, 122 insertions(+), 134 deletions(-) create mode 100644 onnxruntime/core/providers/cpu/llm/attention_softmax.h create mode 100644 onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc delete mode 100644 onnxruntime/test/util/include/system_info.h delete mode 100644 onnxruntime/test/util/system_info.cc diff --git a/onnxruntime/core/providers/cpu/llm/attention.cc b/onnxruntime/core/providers/cpu/llm/attention.cc index 648fb483ba2dd..7a94d9a39182c 100644 --- a/onnxruntime/core/providers/cpu/llm/attention.cc +++ b/onnxruntime/core/providers/cpu/llm/attention.cc @@ -3,6 +3,7 @@ #include "core/providers/cpu/llm/attention.h" #include "core/providers/cpu/llm/attention_helper.h" +#include "core/providers/cpu/llm/attention_softmax.h" #include "core/common/common.h" #include "core/common/safeint.h" @@ -77,24 +78,6 @@ void make_copy(MLFloat16* mask_data, const bool* mask_index, si } } -template -inline void ComputeAttentionSoftmaxInplace(T* score, size_t N, size_t D, ThreadPool* tp, AllocatorPtr) { - MlasComputeSoftmax(score, score, N, D, false, false, 0.0f, tp); -} - -template <> -inline void ComputeAttentionSoftmaxInplace(MLFloat16* score, size_t N, size_t D, ThreadPool* tp, AllocatorPtr allocator) { - ORT_ENFORCE(tp == nullptr, "No parallelized version of softmax for float16."); - // Mlas Lacks kernels for fp16 softmax, we convert into float32 and call the float32 version. - const auto num_elements = SafeInt(N) * D; - void* allocated_ptr = allocator->Alloc(num_elements * sizeof(float)); - BufferUniquePtr float_buffer(allocated_ptr, BufferDeleter(allocator)); - float* ptr = reinterpret_cast(allocated_ptr); - MlasConvertHalfToFloatBuffer(score, ptr, num_elements); - MlasComputeSoftmax(ptr, ptr, N, D, false, false, 0.0f, tp); - MlasConvertFloatToHalfBuffer(ptr, score, num_elements); -} - template inline void ComputeAttentionSoftcapInplace(T* scores, int sequence_length, T softcap) { MlasComputeSoftcap(scores, scores, sequence_length, softcap); @@ -501,8 +484,7 @@ void AttentionBase::ComputeAttentionProbs(T* attention_probs, if (out_qk != nullptr && parameters.qk_matmul_output_mode == attention_helper::QKMatMulOutputMode::kQKSoftCap) { memcpy(out_qk, output, SafeInt(probs_matrix_size) * sizeof(T)); } - ComputeAttentionSoftmaxInplace(output, SafeInt(parameters.q_sequence_length), - SafeInt(parameters.total_sequence_length), nullptr, allocator); + ComputeAttentionSoftmaxInplace(output, parameters.q_sequence_length, parameters.total_sequence_length, nullptr, allocator); if (output_qk != nullptr && parameters.qk_matmul_output_mode == attention_helper::QKMatMulOutputMode::kQKSoftMax) { memcpy(output_qk + output_offset, output, diff --git a/onnxruntime/core/providers/cpu/llm/attention_softmax.h b/onnxruntime/core/providers/cpu/llm/attention_softmax.h new file mode 100644 index 0000000000000..7176e7b6a7d58 --- /dev/null +++ b/onnxruntime/core/providers/cpu/llm/attention_softmax.h @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "core/common/common.h" +#include "core/common/safeint.h" +#include "core/framework/allocator.h" +#include "core/framework/buffer_deleter.h" +#include "core/mlas/inc/mlas.h" +#include "core/platform/threadpool.h" + +namespace onnxruntime { + +template +inline void ComputeAttentionSoftmaxInplace(T* score, size_t N, size_t D, + concurrency::ThreadPool* tp, AllocatorPtr) { + MlasComputeSoftmax(score, score, N, D, false, false, 0.0f, tp); +} + +template <> +inline void ComputeAttentionSoftmaxInplace(MLFloat16* score, size_t N, size_t D, + concurrency::ThreadPool* tp, AllocatorPtr allocator) { + ORT_ENFORCE(tp == nullptr, "No parallelized version of softmax for float16."); + // MLAS lacks kernels for fp16 softmax, so we convert to float32 and use the float32 version. + size_t num_elements = SafeInt(N) * D; + void* allocated_ptr = allocator->Alloc(num_elements * sizeof(float)); + BufferUniquePtr float_buffer(allocated_ptr, BufferDeleter(allocator)); + float* ptr = reinterpret_cast(allocated_ptr); + MlasConvertHalfToFloatBuffer(score, ptr, num_elements); + MlasComputeSoftmax(ptr, ptr, N, D, false, false, 0.0f, tp); + MlasConvertFloatToHalfBuffer(ptr, score, num_elements); +} + +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc index 77a6ae92b95c1..b651a47b582ac 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc @@ -8,7 +8,6 @@ #include "test/common/tensor_op_test_utils.h" #include "test/common/cuda_op_test_utils.h" #include "test/providers/provider_test_utils.h" -#include "test/util/include/system_info.h" namespace onnxruntime { namespace test { @@ -2407,69 +2406,5 @@ TEST(AttentionTest, Attention_NonPadKVSeqLen_WithFloatAttnMask_MultiBatch) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } -// Regression test for CPU kernel integer overflow in FP16 softmax allocation. -// ComputeAttentionSoftmaxInplace previously used int for N and D. -// For large enough values of N and D, N * D could overflow int32. -TEST(AttentionTest, AttentionCpuFp16SoftmaxLargeDimensions) { - // Skip if the machine has less than 16GB of physical RAM. - constexpr uint64_t required_ram_bytes = 16ULL * 1024 * 1024 * 1024; - if (const auto total_ram_bytes = GetTotalPhysicalMemoryBytes(); - total_ram_bytes.has_value() && *total_ram_bytes < required_ram_bytes) { - GTEST_SKIP() << "Skipping: test requires >= 16GB RAM, machine has " - << (*total_ram_bytes / (1024 * 1024)) << "MB"; - } - - constexpr int batch_size = 1; - constexpr int num_heads = 1; - constexpr int q_sequence_length = 46341; - constexpr int kv_sequence_length = 46341; - constexpr int head_size = 1; - - // Verify at compile time that these dimensions trigger the overflow scenario. 
- static_assert(static_cast(q_sequence_length) * kv_sequence_length > - static_cast(std::numeric_limits::max()), - "Test dimensions must cause int32 overflow in N*D"); - - OpTester test("Attention", 23, onnxruntime::kOnnxDomain); - - // 4D BNSH inputs - std::vector q_shape = {batch_size, num_heads, q_sequence_length, head_size}; - std::vector k_shape = {batch_size, num_heads, kv_sequence_length, head_size}; - std::vector v_shape = {batch_size, num_heads, kv_sequence_length, head_size}; - - constexpr int q_elements = batch_size * num_heads * q_sequence_length * head_size; - constexpr int kv_elements = batch_size * num_heads * kv_sequence_length * head_size; - - // All-zero Q and K → all attention scores are 0, softmax produces uniform 1/kv_seq. - // All-one V → output is also all 1.0 (weighted average of 1s). - std::vector q_data(q_elements, 0.0f); - std::vector k_data(kv_elements, 0.0f); - std::vector v_data(kv_elements, 1.0f); - - test.AddInput("Q", q_shape, ToFloat16(q_data)); - test.AddInput("K", k_shape, ToFloat16(k_data)); - test.AddInput("V", v_shape, ToFloat16(v_data)); - test.AddOptionalInputEdge(); // attn_mask - test.AddOptionalInputEdge(); // past_key - test.AddOptionalInputEdge(); // past_value - - // Expected output: all 1.0 (uniform attention over all-ones V). - std::vector y_shape = {batch_size, num_heads, q_sequence_length, head_size}; - std::vector expected_y(q_elements, 1.0f); - test.AddOutput("Y", y_shape, ToFloat16(expected_y), false, 0, 3e-2f); - test.AddOptionalOutputEdge(); // present_key - test.AddOptionalOutputEdge(); // present_value - - std::vector> execution_providers; - execution_providers.push_back(DefaultCpuExecutionProvider()); - - if constexpr (sizeof(void*) == 4) { - // Expect overflow for 32-bit builds. - test.Run(OpTester::ExpectResult::kExpectFailure, "Integer overflow", {}, nullptr, &execution_providers); - } else { - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); - } -} - } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc new file mode 100644 index 0000000000000..b4a39bdcdcc99 --- /dev/null +++ b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_NO_EXCEPTIONS) + +#include +#include + +#include "gtest/gtest.h" + +#include "core/framework/allocator.h" +#include "core/providers/cpu/llm/attention_softmax.h" + +namespace onnxruntime { +namespace test { + +// Regression test for integer overflow in FP16 softmax allocation. +// ComputeAttentionSoftmaxInplace previously used int for N and D, so N*D could overflow int32. +// The fix changed parameters to size_t and uses SafeInt for the multiplication. +// +// This test calls ComputeAttentionSoftmaxInplace directly with overflow-triggering dimensions +// (N=46341, D=46341, where N*D > INT_MAX). +// A custom allocator intercepts the Alloc call to verify the requested size is computed correctly with size_t +// arithmetic, without actually allocating the ~8GB buffer. +// +// On 32-bit builds, SafeInt(N) * D itself overflows. +TEST(AttentionSoftmaxTest, Fp16OverflowAllocation) { + // Custom exception thrown by the allocator to distinguish it from SafeInt overflow. 
+ struct AllocationIntercepted : std::exception { + const char* what() const noexcept override { return "allocation intercepted"; } + }; + + // Custom allocator that records the requested allocation size and throws to avoid actually allocating the + // (very large) buffer. + class OverflowCheckAllocator : public IAllocator { + public: + OverflowCheckAllocator() + : IAllocator(OrtMemoryInfo(CPU, OrtDeviceAllocator)) {} + void* Alloc(size_t size) override { + last_alloc_size_ = size; + throw AllocationIntercepted(); + } + void Free(void*) override {} + size_t LastAllocSize() const { return last_alloc_size_; } + + private: + size_t last_alloc_size_ = 0; + }; + + constexpr size_t N = 46341; + constexpr size_t D = 46341; + + // Verify at compile time that these dimensions would overflow int32. + static_assert(static_cast(N) * D > static_cast(std::numeric_limits::max()), + "Test dimensions must cause int32 overflow in N*D"); + + auto alloc = std::make_shared(); + MLFloat16 dummy_score{0.0f}; + + if constexpr (static_cast(N) * D <= static_cast(std::numeric_limits::max())) { + // N * D fits in size_t. The function reaches Alloc, which records the requested size and throws + // AllocationIntercepted. + EXPECT_THROW(ComputeAttentionSoftmaxInplace(&dummy_score, N, D, nullptr, alloc), + AllocationIntercepted); + + // The allocation size must reflect correct size_t arithmetic: N * D * sizeof(float). + // With the old int parameters, N * D would overflow to a small/negative value, producing a wrong (much smaller) + // allocation size. + constexpr size_t expected_bytes = N * D * sizeof(float); + EXPECT_EQ(alloc->LastAllocSize(), expected_bytes); + } else { + // N * D overflows size_t (i.e., in a 32-bit build), so SafeInt will throw an exception. + try { + ComputeAttentionSoftmaxInplace(&dummy_score, N, D, nullptr, alloc); + FAIL() << "Expected OnnxRuntimeException to be thrown"; + } catch (const OnnxRuntimeException& e) { + EXPECT_THAT(e.what(), testing::HasSubstr("Integer overflow")); + } + } +} + +} // namespace test +} // namespace onnxruntime + +#endif // !defined(ORT_NO_EXCEPTIONS) diff --git a/onnxruntime/test/util/include/system_info.h b/onnxruntime/test/util/include/system_info.h deleted file mode 100644 index 7f0e925c6ccbc..0000000000000 --- a/onnxruntime/test/util/include/system_info.h +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include -#include - -namespace onnxruntime { -namespace test { - -// Returns the total physical memory (RAM) in bytes, or std::nullopt if detection fails. -std::optional GetTotalPhysicalMemoryBytes(); - -} // namespace test -} // namespace onnxruntime diff --git a/onnxruntime/test/util/system_info.cc b/onnxruntime/test/util/system_info.cc deleted file mode 100644 index 653be56cd98c2..0000000000000 --- a/onnxruntime/test/util/system_info.cc +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include "test/util/include/system_info.h" - -#ifdef _WIN32 -#include -#else -#include -#endif - -namespace onnxruntime { -namespace test { - -std::optional GetTotalPhysicalMemoryBytes() { -#ifdef _WIN32 - MEMORYSTATUSEX mem_info = {}; - mem_info.dwLength = sizeof(mem_info); - if (GlobalMemoryStatusEx(&mem_info)) { - return static_cast(mem_info.ullTotalPhys); - } -#else - long pages = sysconf(_SC_PHYS_PAGES); - long page_size = sysconf(_SC_PAGESIZE); - if (pages > 0 && page_size > 0) { - return static_cast(pages) * static_cast(page_size); - } -#endif - return std::nullopt; -} - -} // namespace test -} // namespace onnxruntime From 8c69b10a2a611cd5b9e164ea377f2776d156fada Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:07:41 -0700 Subject: [PATCH 09/13] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc index b4a39bdcdcc99..fad847aef4602 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc @@ -7,6 +7,7 @@ #include #include "gtest/gtest.h" +#include "gmock/gmock.h" #include "core/framework/allocator.h" #include "core/providers/cpu/llm/attention_softmax.h" From a168897c7730f1e8cda155d7b24d5de1ef26b150 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:10:46 -0700 Subject: [PATCH 10/13] propagate safeint --- onnxruntime/core/providers/cpu/llm/attention_softmax.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/cpu/llm/attention_softmax.h b/onnxruntime/core/providers/cpu/llm/attention_softmax.h index 7176e7b6a7d58..77c390418be4b 100644 --- a/onnxruntime/core/providers/cpu/llm/attention_softmax.h +++ b/onnxruntime/core/providers/cpu/llm/attention_softmax.h @@ -23,7 +23,7 @@ inline void ComputeAttentionSoftmaxInplace(MLFloat16* score, size_t N concurrency::ThreadPool* tp, AllocatorPtr allocator) { ORT_ENFORCE(tp == nullptr, "No parallelized version of softmax for float16."); // MLAS lacks kernels for fp16 softmax, so we convert to float32 and use the float32 version. - size_t num_elements = SafeInt(N) * D; + auto num_elements = SafeInt(N) * D; void* allocated_ptr = allocator->Alloc(num_elements * sizeof(float)); BufferUniquePtr float_buffer(allocated_ptr, BufferDeleter(allocator)); float* ptr = reinterpret_cast(allocated_ptr); From 33cc1ea734561c2df5fec173a2c33f82edd94c08 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:58:05 -0700 Subject: [PATCH 11/13] Add float16 header. 
--- onnxruntime/core/providers/cpu/llm/attention_softmax.h | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/core/providers/cpu/llm/attention_softmax.h b/onnxruntime/core/providers/cpu/llm/attention_softmax.h index 77c390418be4b..46beed384f739 100644 --- a/onnxruntime/core/providers/cpu/llm/attention_softmax.h +++ b/onnxruntime/core/providers/cpu/llm/attention_softmax.h @@ -4,6 +4,7 @@ #pragma once #include "core/common/common.h" +#include "core/common/float16.h" #include "core/common/safeint.h" #include "core/framework/allocator.h" #include "core/framework/buffer_deleter.h" From 136a6e22b0ba2d90122c968e451bf9d03f866478 Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 15:33:37 -0700 Subject: [PATCH 12/13] improve size_t overflow check --- .../cpu/llm/attention_softmax_test.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc index fad847aef4602..2cab626fe363e 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc @@ -52,25 +52,26 @@ TEST(AttentionSoftmaxTest, Fp16OverflowAllocation) { constexpr size_t D = 46341; // Verify at compile time that these dimensions would overflow int32. - static_assert(static_cast(N) * D > static_cast(std::numeric_limits::max()), + static_assert(int64_t{N} * int64_t{D} > int64_t{std::numeric_limits::max()}, "Test dimensions must cause int32 overflow in N*D"); auto alloc = std::make_shared(); MLFloat16 dummy_score{0.0f}; - if constexpr (static_cast(N) * D <= static_cast(std::numeric_limits::max())) { - // N * D fits in size_t. The function reaches Alloc, which records the requested size and throws + // The allocation size must reflect correct size_t arithmetic: N * D * sizeof(float). + // With the old int parameters, N * D would overflow to a small/negative value, producing a wrong (much smaller) + // allocation size. + constexpr uintmax_t expected_allocation_size = uintmax_t{N} * D * sizeof(float); + + if constexpr (expected_allocation_size <= uintmax_t{std::numeric_limits::max()}) { + // Allocation size fits in size_t. The function reaches Alloc, which records the requested size and throws // AllocationIntercepted. EXPECT_THROW(ComputeAttentionSoftmaxInplace(&dummy_score, N, D, nullptr, alloc), AllocationIntercepted); - // The allocation size must reflect correct size_t arithmetic: N * D * sizeof(float). - // With the old int parameters, N * D would overflow to a small/negative value, producing a wrong (much smaller) - // allocation size. - constexpr size_t expected_bytes = N * D * sizeof(float); - EXPECT_EQ(alloc->LastAllocSize(), expected_bytes); + EXPECT_EQ(alloc->LastAllocSize(), static_cast(expected_allocation_size)); } else { - // N * D overflows size_t (i.e., in a 32-bit build), so SafeInt will throw an exception. + // Allocation size overflows size_t (i.e., in a 32-bit build), so SafeInt will throw an exception. 
try { ComputeAttentionSoftmaxInplace(&dummy_score, N, D, nullptr, alloc); FAIL() << "Expected OnnxRuntimeException to be thrown"; From 9923ec0892e6c3e84042065d0c2526359007db7a Mon Sep 17 00:00:00 2001 From: edgchen1 <18449977+edgchen1@users.noreply.github.com> Date: Tue, 24 Mar 2026 17:29:41 -0700 Subject: [PATCH 13/13] update comments --- onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc index 2cab626fe363e..cb98d248b2196 100644 --- a/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc +++ b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc @@ -24,7 +24,7 @@ namespace test { // A custom allocator intercepts the Alloc call to verify the requested size is computed correctly with size_t // arithmetic, without actually allocating the ~8GB buffer. // -// On 32-bit builds, SafeInt(N) * D itself overflows. +// On 32-bit builds, SafeInt will signal an overflow for the requested size. TEST(AttentionSoftmaxTest, Fp16OverflowAllocation) { // Custom exception thrown by the allocator to distinguish it from SafeInt overflow. struct AllocationIntercepted : std::exception { @@ -59,8 +59,7 @@ TEST(AttentionSoftmaxTest, Fp16OverflowAllocation) { MLFloat16 dummy_score{0.0f}; // The allocation size must reflect correct size_t arithmetic: N * D * sizeof(float). - // With the old int parameters, N * D would overflow to a small/negative value, producing a wrong (much smaller) - // allocation size. + // With the old int parameters, N * D would overflow to a small/negative value, producing a wrong allocation size. constexpr uintmax_t expected_allocation_size = uintmax_t{N} * D * sizeof(float); if constexpr (expected_allocation_size <= uintmax_t{std::numeric_limits::max()}) {
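
For reference, the arithmetic that this series guards against can be reproduced in isolation. The sketch below is not part of any patch above and does not use onnxruntime's SafeInt; a hand-rolled checked_mul stands in for SafeInt<size_t>(N) * D so the example is self-contained. It reuses the 46341 x 46341 dimensions from the regression test: their product, 2,147,488,281, exceeds INT32_MAX, so the old int-typed N and D could not represent the element count, while size_t arithmetic on a 64-bit target can.

// Standalone sketch (not part of the patches above) of the overflow being fixed.
// checked_mul is a hypothetical stand-in for onnxruntime's SafeInt<size_t>.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <stdexcept>

std::size_t checked_mul(std::size_t a, std::size_t b) {
  // Reject products that would wrap around size_t, the failure SafeInt turns into an exception.
  if (b != 0 && a > std::numeric_limits<std::size_t>::max() / b) {
    throw std::overflow_error("size_t multiplication overflow");
  }
  return a * b;
}

int main() {
  constexpr std::size_t N = 46341;  // q_sequence_length in the regression test
  constexpr std::size_t D = 46341;  // total_sequence_length in the regression test

  // 46341 * 46341 = 2,147,488,281 > INT32_MAX (2,147,483,647), so an int-typed N * D cannot hold it.
  static_assert(std::uint64_t{N} * D > std::uint64_t{std::numeric_limits<std::int32_t>::max()},
                "dimensions must exceed INT32_MAX to exercise the overflow");

  const std::size_t num_elements = checked_mul(N, D);                  // fine with a 64-bit size_t
  const std::size_t bytes = checked_mul(num_elements, sizeof(float));  // ~8.6 GB float scratch buffer
  std::printf("softmax scratch buffer: %zu elements, %zu bytes\n", num_elements, bytes);
  return 0;
}

Propagating the SafeInt result (as patch 10 does with `auto num_elements = SafeInt<size_t>(N) * D;`) rather than collapsing it to a plain size_t keeps the subsequent num_elements * sizeof(float) multiplication checked as well.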