diff --git a/onnxruntime/core/providers/cpu/llm/attention.cc b/onnxruntime/core/providers/cpu/llm/attention.cc
index 3d505af487712..7a94d9a39182c 100644
--- a/onnxruntime/core/providers/cpu/llm/attention.cc
+++ b/onnxruntime/core/providers/cpu/llm/attention.cc
@@ -3,6 +3,7 @@
 
 #include "core/providers/cpu/llm/attention.h"
 #include "core/providers/cpu/llm/attention_helper.h"
+#include "core/providers/cpu/llm/attention_softmax.h"
 #include "core/common/common.h"
 #include "core/common/safeint.h"
@@ -77,23 +78,6 @@ void make_copy(MLFloat16* mask_data, const bool* mask_index, si
   }
 }
 
-template <typename T>
-inline void ComputeAttentionSoftmaxInplace(T* score, int N, int D, ThreadPool* tp, AllocatorPtr) {
-  MlasComputeSoftmax(score, score, N, D, false, false, 0.0f, tp);
-}
-
-template <>
-inline void ComputeAttentionSoftmaxInplace(MLFloat16* score, int N, int D, ThreadPool* tp, AllocatorPtr allocator) {
-  ORT_ENFORCE(tp == nullptr, "No parallelized version of softmax for float16.");
-  // Mlas Lacks kernels for fp16 softmax, we convert into float32 and call the float32 version.
-  void* allocated_ptr = allocator->Alloc(static_cast<size_t>(N * D * sizeof(float)));
-  BufferUniquePtr float_buffer(allocated_ptr, BufferDeleter(allocator));
-  float* ptr = reinterpret_cast<float*>(allocated_ptr);
-  MlasConvertHalfToFloatBuffer(score, ptr, N * D);
-  MlasComputeSoftmax(ptr, ptr, N, D, false, false, 0.0f, tp);
-  MlasConvertFloatToHalfBuffer(ptr, score, N * D);
-}
-
 template <typename T>
 inline void ComputeAttentionSoftcapInplace(T* scores, int sequence_length, T softcap) {
   MlasComputeSoftcap(scores, scores, sequence_length, softcap);
diff --git a/onnxruntime/core/providers/cpu/llm/attention_softmax.h b/onnxruntime/core/providers/cpu/llm/attention_softmax.h
new file mode 100644
index 0000000000000..46beed384f739
--- /dev/null
+++ b/onnxruntime/core/providers/cpu/llm/attention_softmax.h
@@ -0,0 +1,36 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/common/common.h"
+#include "core/common/float16.h"
+#include "core/common/safeint.h"
+#include "core/framework/allocator.h"
+#include "core/framework/buffer_deleter.h"
+#include "core/mlas/inc/mlas.h"
+#include "core/platform/threadpool.h"
+
+namespace onnxruntime {
+
+template <typename T>
+inline void ComputeAttentionSoftmaxInplace(T* score, size_t N, size_t D,
+                                           concurrency::ThreadPool* tp, AllocatorPtr) {
+  MlasComputeSoftmax(score, score, N, D, false, false, 0.0f, tp);
+}
+
+template <>
+inline void ComputeAttentionSoftmaxInplace(MLFloat16* score, size_t N, size_t D,
+                                           concurrency::ThreadPool* tp, AllocatorPtr allocator) {
+  ORT_ENFORCE(tp == nullptr, "No parallelized version of softmax for float16.");
+  // MLAS lacks kernels for fp16 softmax, so we convert to float32 and use the float32 version.
+  auto num_elements = SafeInt<size_t>(N) * D;
+  void* allocated_ptr = allocator->Alloc(num_elements * sizeof(float));
+  BufferUniquePtr float_buffer(allocated_ptr, BufferDeleter(allocator));
+  float* ptr = reinterpret_cast<float*>(allocated_ptr);
+  MlasConvertHalfToFloatBuffer(score, ptr, num_elements);
+  MlasComputeSoftmax(ptr, ptr, N, D, false, false, 0.0f, tp);
+  MlasConvertFloatToHalfBuffer(ptr, score, num_elements);
+}
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
index 45c5220f42564..b651a47b582ac 100644
--- a/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
+++ b/onnxruntime/test/providers/cpu/llm/attention_op_test.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include
+#include
 #include "gtest/gtest.h"
 #include "core/session/onnxruntime_cxx_api.h"
 #include "test/common/tensor_op_test_utils.h"
diff --git a/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc
new file mode 100644
index 0000000000000..cb98d248b2196
--- /dev/null
+++ b/onnxruntime/test/providers/cpu/llm/attention_softmax_test.cc
@@ -0,0 +1,86 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#if !defined(ORT_NO_EXCEPTIONS)
+
+#include <limits>
+#include <memory>
+
+#include "gtest/gtest.h"
+#include "gmock/gmock.h"
+
+#include "core/framework/allocator.h"
+#include "core/providers/cpu/llm/attention_softmax.h"
+
+namespace onnxruntime {
+namespace test {
+
+// Regression test for integer overflow in FP16 softmax allocation.
+// ComputeAttentionSoftmaxInplace previously used int for N and D, so N*D could overflow int32.
+// The fix changed parameters to size_t and uses SafeInt for the multiplication.
+//
+// This test calls ComputeAttentionSoftmaxInplace directly with overflow-triggering dimensions
+// (N=46341, D=46341, where N*D > INT_MAX).
+// A custom allocator intercepts the Alloc call to verify the requested size is computed correctly with size_t
+// arithmetic, without actually allocating the ~8GB buffer.
+//
+// On 32-bit builds, SafeInt will signal an overflow for the requested size.
+TEST(AttentionSoftmaxTest, Fp16OverflowAllocation) {
+  // Custom exception thrown by the allocator to distinguish it from SafeInt overflow.
+  struct AllocationIntercepted : std::exception {
+    const char* what() const noexcept override { return "allocation intercepted"; }
+  };
+
+  // Custom allocator that records the requested allocation size and throws to avoid actually allocating the
+  // (very large) buffer.
+  class OverflowCheckAllocator : public IAllocator {
+   public:
+    OverflowCheckAllocator()
+        : IAllocator(OrtMemoryInfo(CPU, OrtDeviceAllocator)) {}
+    void* Alloc(size_t size) override {
+      last_alloc_size_ = size;
+      throw AllocationIntercepted();
+    }
+    void Free(void*) override {}
+    size_t LastAllocSize() const { return last_alloc_size_; }
+
+   private:
+    size_t last_alloc_size_ = 0;
+  };
+
+  constexpr size_t N = 46341;
+  constexpr size_t D = 46341;
+
+  // Verify at compile time that these dimensions would overflow int32.
+  static_assert(int64_t{N} * int64_t{D} > int64_t{std::numeric_limits<int32_t>::max()},
+                "Test dimensions must cause int32 overflow in N*D");
+
+  auto alloc = std::make_shared<OverflowCheckAllocator>();
+  MLFloat16 dummy_score{0.0f};
+
+  // The allocation size must reflect correct size_t arithmetic: N * D * sizeof(float).
+  // With the old int parameters, N * D would overflow to a small/negative value, producing a wrong allocation size.
+  constexpr uintmax_t expected_allocation_size = uintmax_t{N} * D * sizeof(float);
+
+  if constexpr (expected_allocation_size <= uintmax_t{std::numeric_limits<size_t>::max()}) {
+    // Allocation size fits in size_t. The function reaches Alloc, which records the requested size and throws
+    // AllocationIntercepted.
+    EXPECT_THROW(ComputeAttentionSoftmaxInplace(&dummy_score, N, D, nullptr, alloc),
+                 AllocationIntercepted);
+
+    EXPECT_EQ(alloc->LastAllocSize(), static_cast<size_t>(expected_allocation_size));
+  } else {
+    // Allocation size overflows size_t (i.e., in a 32-bit build), so SafeInt will throw an exception.
+    try {
+      ComputeAttentionSoftmaxInplace(&dummy_score, N, D, nullptr, alloc);
+      FAIL() << "Expected OnnxRuntimeException to be thrown";
+    } catch (const OnnxRuntimeException& e) {
+      EXPECT_THAT(e.what(), testing::HasSubstr("Integer overflow"));
+    }
+  }
+}
+
+}  // namespace test
+}  // namespace onnxruntime
+
+#endif  // !defined(ORT_NO_EXCEPTIONS)
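
Reviewer note (not part of the patch): the failure mode being fixed is that the old
ComputeAttentionSoftmaxInplace took `int N, int D`, so `N * D` was a 32-bit signed multiply
that wraps for large score matrices before the byte count ever reaches Alloc. The standalone
sketch below reproduces the wrap and the checked arithmetic; `checked_mul` is a hypothetical
stand-in for the `SafeInt<size_t>` multiply used in the new header, included only so the
snippet compiles without any onnxruntime dependencies.

    #include <cstdint>
    #include <cstdio>
    #include <limits>
    #include <stdexcept>

    // Hypothetical stand-in for SafeInt<size_t>: multiply with an explicit
    // overflow check instead of wrapping silently.
    size_t checked_mul(size_t a, size_t b) {
      if (b != 0 && a > std::numeric_limits<size_t>::max() / b) {
        throw std::runtime_error("Integer overflow");
      }
      return a * b;
    }

    int main() {
      constexpr uint32_t n = 46341, d = 46341;  // same dimensions as the test

      // Old behavior, reproduced without signed-overflow UB: 46341 * 46341 =
      // 2'147'488'281 does not fit in int32, so a 32-bit multiply wraps to a
      // negative value before it can be widened for the allocation size.
      const auto wrapped = static_cast<int32_t>(n * d);
      std::printf("int32 n*d wraps to %d\n", wrapped);  // -2147479015

      // New behavior: widen to size_t first, then multiply with a check,
      // mirroring SafeInt<size_t>(N) * D * sizeof(float) in the patched header.
      // On 32-bit targets the byte count exceeds SIZE_MAX, so this throws,
      // which is the branch the regression test exercises there.
      const size_t bytes = checked_mul(checked_mul(n, d), sizeof(float));
      std::printf("checked size: %zu bytes (~8 GiB)\n", bytes);
    }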