18 changes: 1 addition & 17 deletions onnxruntime/core/providers/cpu/llm/attention.cc
@@ -3,6 +3,7 @@

#include "core/providers/cpu/llm/attention.h"
#include "core/providers/cpu/llm/attention_helper.h"
#include "core/providers/cpu/llm/attention_softmax.h"

#include "core/common/common.h"
#include "core/common/safeint.h"
@@ -77,23 +78,6 @@ void make_copy<MLFloat16, bool>(MLFloat16* mask_data, const bool* mask_index, si
}
}

template <typename T>
inline void ComputeAttentionSoftmaxInplace(T* score, int N, int D, ThreadPool* tp, AllocatorPtr) {
MlasComputeSoftmax(score, score, N, D, false, false, 0.0f, tp);
}

template <>
inline void ComputeAttentionSoftmaxInplace<MLFloat16>(MLFloat16* score, int N, int D, ThreadPool* tp, AllocatorPtr allocator) {
ORT_ENFORCE(tp == nullptr, "No parallelized version of softmax for float16.");
// Mlas Lacks kernels for fp16 softmax, we convert into float32 and call the float32 version.
void* allocated_ptr = allocator->Alloc(static_cast<size_t>(N * D * sizeof(float)));
BufferUniquePtr float_buffer(allocated_ptr, BufferDeleter(allocator));
float* ptr = reinterpret_cast<float*>(allocated_ptr);
MlasConvertHalfToFloatBuffer(score, ptr, N * D);
MlasComputeSoftmax(ptr, ptr, N, D, false, false, 0.0f, tp);
MlasConvertFloatToHalfBuffer(ptr, score, N * D);
}

template <typename T>
inline void ComputeAttentionSoftcapInplace(T* scores, int sequence_length, T softcap) {
MlasComputeSoftcap(scores, scores, sequence_length, softcap);
36 changes: 36 additions & 0 deletions onnxruntime/core/providers/cpu/llm/attention_softmax.h
@@ -0,0 +1,36 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/common/common.h"
#include "core/common/float16.h"
#include "core/common/safeint.h"
#include "core/framework/allocator.h"
#include "core/framework/buffer_deleter.h"
#include "core/mlas/inc/mlas.h"
#include "core/platform/threadpool.h"

namespace onnxruntime {

template <typename T>
inline void ComputeAttentionSoftmaxInplace(T* score, size_t N, size_t D,
concurrency::ThreadPool* tp, AllocatorPtr) {
MlasComputeSoftmax(score, score, N, D, false, false, 0.0f, tp);
}

template <>
inline void ComputeAttentionSoftmaxInplace<MLFloat16>(MLFloat16* score, size_t N, size_t D,
concurrency::ThreadPool* tp, AllocatorPtr allocator) {
ORT_ENFORCE(tp == nullptr, "No parallelized version of softmax for float16.");
// MLAS lacks kernels for fp16 softmax, so we convert to float32 and use the float32 version.
auto num_elements = SafeInt<size_t>(N) * D;
void* allocated_ptr = allocator->Alloc(num_elements * sizeof(float));
BufferUniquePtr float_buffer(allocated_ptr, BufferDeleter(allocator));
float* ptr = reinterpret_cast<float*>(allocated_ptr);
MlasConvertHalfToFloatBuffer(score, ptr, num_elements);
MlasComputeSoftmax(ptr, ptr, N, D, false, false, 0.0f, tp);
MlasConvertFloatToHalfBuffer(ptr, score, num_elements);
}

} // namespace onnxruntime
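As context for the review, here is a standalone sketch (not part of the diff) of the arithmetic this fix guards against: with the old int parameters, N = D = 46341 makes N * D exceed INT_MAX and wrap, while the size_t/SafeInt path computes the element and byte counts without truncation. It assumes 32-bit int and 64-bit size_t.

// Standalone illustration of the overflow fixed above; not part of the PR.
// The wrap is emulated with explicit truncation, since a real signed int
// overflow is undefined behavior (which is exactly the bug being fixed).
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t n = 46341, d = 46341;

  // Old signature (int N, int D): the product 2147488281 exceeds
  // INT_MAX (2147483647) and wraps to a negative element count.
  const auto wrapped = static_cast<int32_t>(static_cast<uint32_t>(n * d));
  std::printf("int32 product:  %d\n", wrapped);  // -2147479015

  // New signature (size_t N, size_t D) with SafeInt: the element count and
  // the float32 scratch-buffer byte count are computed without truncation.
  const uint64_t num_elements = n * d;                  // 2147488281
  const uint64_t bytes = num_elements * sizeof(float);  // 8589953124, ~8 GB
  std::printf("size_t product: %llu elements, %llu bytes\n",
              static_cast<unsigned long long>(num_elements),
              static_cast<unsigned long long>(bytes));
  return 0;
}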
1 change: 1 addition & 0 deletions onnxruntime/test/providers/cpu/llm/attention_op_test.cc
@@ -2,6 +2,7 @@
// Licensed under the MIT License.

#include <cassert>
#include <limits>
#include "gtest/gtest.h"
#include "core/session/onnxruntime_cxx_api.h"
#include "test/common/tensor_op_test_utils.h"
86 changes: 86 additions & 0 deletions onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc
@@ -0,0 +1,86 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#if !defined(ORT_NO_EXCEPTIONS)

#include <exception>
#include <limits>

#include "gtest/gtest.h"
#include "gmock/gmock.h"

#include "core/framework/allocator.h"
#include "core/providers/cpu/llm/attention_softmax.h"

namespace onnxruntime {
namespace test {

// Regression test for integer overflow in FP16 softmax allocation.
// ComputeAttentionSoftmaxInplace<MLFloat16> previously used int for N and D, so N*D could overflow int32.
// The fix changed parameters to size_t and uses SafeInt for the multiplication.
//
// This test calls ComputeAttentionSoftmaxInplace<MLFloat16> directly with overflow-triggering dimensions
// (N=46341, D=46341, where N*D > INT_MAX).
// A custom allocator intercepts the Alloc call to verify the requested size is computed correctly with size_t
// arithmetic, without actually allocating the ~8GB buffer.
//
// On 32-bit builds the byte count does not fit in size_t, so SafeInt<size_t> throws before Alloc is reached.
TEST(AttentionSoftmaxTest, Fp16OverflowAllocation) {
// Custom exception thrown by the allocator to distinguish it from SafeInt overflow.
struct AllocationIntercepted : std::exception {
const char* what() const noexcept override { return "allocation intercepted"; }
};

// Custom allocator that records the requested allocation size and throws to avoid actually allocating the
// (very large) buffer.
class OverflowCheckAllocator : public IAllocator {
public:
OverflowCheckAllocator()
: IAllocator(OrtMemoryInfo(CPU, OrtDeviceAllocator)) {}
void* Alloc(size_t size) override {
last_alloc_size_ = size;
throw AllocationIntercepted();
}
void Free(void*) override {}
size_t LastAllocSize() const { return last_alloc_size_; }

private:
size_t last_alloc_size_ = 0;
};

constexpr size_t N = 46341;
constexpr size_t D = 46341;

// Verify at compile time that these dimensions would overflow int32.
static_assert(int64_t{N} * int64_t{D} > int64_t{std::numeric_limits<int>::max()},
"Test dimensions must cause int32 overflow in N*D");

auto alloc = std::make_shared<OverflowCheckAllocator>();
MLFloat16 dummy_score{0.0f};

// The allocation size must reflect correct size_t arithmetic: N * D * sizeof(float).
// With the old int parameters, N * D would overflow to a small/negative value, producing a wrong allocation size.
constexpr uintmax_t expected_allocation_size = uintmax_t{N} * D * sizeof(float);

if constexpr (expected_allocation_size <= uintmax_t{std::numeric_limits<size_t>::max()}) {
// Allocation size fits in size_t. The function reaches Alloc, which records the requested size and throws
// AllocationIntercepted.
EXPECT_THROW(ComputeAttentionSoftmaxInplace<MLFloat16>(&dummy_score, N, D, nullptr, alloc),
             AllocationIntercepted);

EXPECT_EQ(alloc->LastAllocSize(), static_cast<size_t>(expected_allocation_size));
} else {
// Allocation size overflows size_t (i.e., in a 32-bit build), so SafeInt<size_t> will throw an exception.
try {
ComputeAttentionSoftmaxInplace<MLFloat16>(&dummy_score, N, D, nullptr, alloc);
FAIL() << "Expected OnnxRuntimeException to be thrown";
} catch (const OnnxRuntimeException& e) {
EXPECT_THAT(e.what(), testing::HasSubstr("Integer overflow"));
}
}
}

} // namespace test
} // namespace onnxruntime

#endif // !defined(ORT_NO_EXCEPTIONS)
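To round out the picture, a hypothetical caller sketch (not from this PR) showing the fp16 overload's contract on the happy path: the allocator supplies the float32 scratch buffer, and the thread pool must be null. The use of CPUAllocator here is an assumption about a typical ORT build, not code from the change.

// Hypothetical usage sketch; not part of the PR.
#include "core/framework/allocator.h"
#include "core/providers/cpu/llm/attention_softmax.h"

void RunSmallFp16Softmax() {
  using namespace onnxruntime;
  AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>();

  // Two rows of three attention scores; softmax is applied per row, in place.
  MLFloat16 scores[6] = {MLFloat16(0.f), MLFloat16(1.f), MLFloat16(2.f),
                         MLFloat16(0.f), MLFloat16(0.f), MLFloat16(0.f)};

  // The fp16 specialization enforces tp == nullptr (no parallel fp16 path).
  ComputeAttentionSoftmaxInplace<MLFloat16>(scores, /*N=*/2, /*D=*/3,
                                            /*tp=*/nullptr, cpu_allocator);
}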