diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_bnb4.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_bnb4.cc
index 3ce683eb712c2..31eea611ea972 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/matmul_bnb4.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_bnb4.cc
@@ -19,6 +19,9 @@ class MatMulBnb4 final : public OpKernel {
     ORT_ENFORCE(Status::OK() == info.GetAttr("N", &N_));
     ORT_ENFORCE(Status::OK() == info.GetAttr("block_size", &block_size_));
     ORT_ENFORCE(Status::OK() == info.GetAttr("quant_type", &quant_type_));
+    ORT_ENFORCE(K_ > 0, "K must be positive, got ", K_);
+    ORT_ENFORCE(N_ > 0, "N must be positive, got ", N_);
+    ORT_ENFORCE(block_size_ > 0, "block_size must be positive, got ", block_size_);
     ORT_ENFORCE(
         quant_type_ == FP4 || quant_type_ == NF4,
         "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported.");
@@ -50,6 +53,32 @@ Status MatMulBnb4::Compute(OpKernelContext* ctx) const {
   const uint8_t* b_quant_data = b_quant->Data<uint8_t>();
   const float* absmax_data = absmax->Data<float>();
 
+  // Overflow-safe computation of expected tensor sizes.
+  // K_, N_, block_size_ are validated > 0 in the constructor.
+  if (K_ > std::numeric_limits<int64_t>::max() / N_) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "Overflow computing K * N for K=", K_, ", N=", N_, ".");
+  }
+  const int64_t numel = K_ * N_;
+  // Overflow-safe ceiling division: rewrite (a + b - 1) / b as ((a - 1) / b) + 1.
+  // Safe because numel > 0 (K_ > 0 and N_ > 0 validated in constructor).
+  const int64_t expected_b_quant_size = ((numel - 1) / 2) + 1;
+  const int64_t expected_absmax_size = ((numel - 1) / block_size_) + 1;
+
+  if (b_quant->Shape().Size() < expected_b_quant_size) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "b_quant tensor size (", b_quant->Shape().Size(),
+                           ") is too small for K=", K_, " and N=", N_,
+                           ". Expected at least ", expected_b_quant_size, " elements.");
+  }
+  if (absmax->Shape().Size() < expected_absmax_size) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "absmax tensor size (", absmax->Shape().Size(),
+                           ") is too small for K=", K_, ", N=", N_,
+                           ", block_size=", block_size_,
+                           ". Expected at least ", expected_absmax_size, " elements.");
+  }
+
   AllocatorPtr allocator;
   auto status = ctx->GetTempSpaceAllocator(&allocator);
   ORT_RETURN_IF_ERROR(status);
diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cc b/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cc
index d5d7153d0c8b9..01c399924aaf1 100644
--- a/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cc
+++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cc
@@ -22,6 +22,9 @@ class MatMulBnb4 final : public CudaKernel {
     ORT_ENFORCE(Status::OK() == info.GetAttr("N", &N_));
     ORT_ENFORCE(Status::OK() == info.GetAttr("block_size", &block_size_));
     ORT_ENFORCE(Status::OK() == info.GetAttr("quant_type", &quant_type_));
+    ORT_ENFORCE(K_ > 0, "K must be positive, got ", K_);
+    ORT_ENFORCE(N_ > 0, "N must be positive, got ", N_);
+    ORT_ENFORCE(block_size_ > 0, "block_size must be positive, got ", block_size_);
     ORT_ENFORCE(
         quant_type_ == FP4 || quant_type_ == NF4,
         "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported.");
@@ -51,6 +54,32 @@ Status MatMulBnb4<T>::ComputeInternal(OpKernelContext* ctx) const {
   const uint8_t* b_quant_data = b_quant->Data<uint8_t>();
   const auto* absmax_data = absmax->Data<T>();
 
+  // Overflow-safe computation of expected tensor sizes.
+  // K_, N_, block_size_ are validated > 0 in the constructor.
+  if (K_ > std::numeric_limits<int64_t>::max() / N_) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "Overflow computing K * N for K=", K_, ", N=", N_, ".");
+  }
+  const int64_t numel = K_ * N_;
+  // Overflow-safe ceiling division: rewrite (a + b - 1) / b as ((a - 1) / b) + 1.
+  // Safe because numel > 0 (K_ > 0 and N_ > 0 validated in constructor).
+  const int64_t expected_b_quant_size = ((numel - 1) / 2) + 1;
+  const int64_t expected_absmax_size = ((numel - 1) / block_size_) + 1;
+
+  if (b_quant->Shape().Size() < expected_b_quant_size) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "b_quant tensor size (", b_quant->Shape().Size(),
+                           ") is too small for K=", K_, " and N=", N_,
+                           ". Expected at least ", expected_b_quant_size, " elements.");
+  }
+  if (absmax->Shape().Size() < expected_absmax_size) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "absmax tensor size (", absmax->Shape().Size(),
+                           ") is too small for K=", K_, ", N=", N_,
+                           ", block_size=", block_size_,
+                           ". Expected at least ", expected_absmax_size, " elements.");
+  }
+
   typedef typename ToCudaType<T>::MappedType CudaT;
 
   // TODO: find a better way to create the quant_map without using a buffer
diff --git a/onnxruntime/test/contrib_ops/matmul_bnb4_test.cc b/onnxruntime/test/contrib_ops/matmul_bnb4_test.cc
index 47422eac216b7..6dfb8c87c14ec 100644
--- a/onnxruntime/test/contrib_ops/matmul_bnb4_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_bnb4_test.cc
@@ -115,6 +115,88 @@ void RunTest(int64_t quant_type, int64_t M, int64_t N, int64_t K, int64_t block_size,
   }
 }
 
+TEST(MatMulBnb4, RejectsUndersizedBQuantTensor) {
+  // K=32, N=2 → numel=64, expected b_quant size = (64+1)/2 = 32
+  // Provide only 4 bytes (valid for K=4, N=2) but claim K=32, N=2
+  OpTester test("MatMulBnb4", 1, kMSDomain);
+  test.AddAttribute("K", 32LL);
+  test.AddAttribute("N", 2LL);
+  test.AddAttribute("block_size", 32LL);
+  test.AddAttribute("quant_type", 1LL);  // NF4
+
+  test.AddInput<float>("A", {1, 32}, std::vector<float>(32, 0.0f));
+  test.AddInput<uint8_t>("B", {4}, std::vector<uint8_t>(4, 0));  // too small
+  test.AddInput<float>("absmax", {2}, std::vector<float>(2, 1.0f));
+  test.AddOutput<float>("Y", {1, 2}, std::vector<float>(2, 0.0f));
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectFailure, "b_quant tensor size", {}, nullptr, &execution_providers);
+}
+
+TEST(MatMulBnb4, RejectsUndersizedAbsmaxTensor) {
+  // K=32, N=2, block_size=32 → numel=64, expected absmax size = (64+32-1)/32 = 2
+  // Provide only 1 absmax element
+  int64_t K = 32, N = 2, block_size = 32;
+  int64_t numel = K * N;
+  int64_t quantized_numel = (numel + 1) / 2;
+
+  OpTester test("MatMulBnb4", 1, kMSDomain);
+  test.AddAttribute("K", K);
+  test.AddAttribute("N", N);
+  test.AddAttribute("block_size", block_size);
+  test.AddAttribute("quant_type", 1LL);  // NF4
+
+  test.AddInput<float>("A", {1, K}, std::vector<float>(K, 0.0f));
+  test.AddInput<uint8_t>("B", {quantized_numel}, std::vector<uint8_t>(quantized_numel, 0));
+  test.AddInput<float>("absmax", {1}, std::vector<float>(1, 1.0f));  // too small
+  test.AddOutput<float>("Y", {1, N}, std::vector<float>(N, 0.0f));
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCpuExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectFailure, "absmax tensor size", {}, nullptr, &execution_providers);
+}
+
+#if defined(USE_CUDA)
+TEST(MatMulBnb4, RejectsUndersizedBQuantTensorCuda) {
+  OpTester test("MatMulBnb4", 1, kMSDomain);
+  test.AddAttribute("K", 32LL);
+  test.AddAttribute("N", 2LL);
+  test.AddAttribute("block_size", 32LL);
+  test.AddAttribute("quant_type", 1LL);  // NF4
+
+  test.AddInput<float>("A", {1, 32}, std::vector<float>(32, 0.0f));
+  test.AddInput<uint8_t>("B", {4}, std::vector<uint8_t>(4, 0));  // too small
+  test.AddInput<float>("absmax", {2}, std::vector<float>(2, 1.0f));
+  test.AddOutput<float>("Y", {1, 2}, std::vector<float>(2, 0.0f));
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCudaExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectFailure, "b_quant tensor size", {}, nullptr, &execution_providers);
+}
+
+TEST(MatMulBnb4, RejectsUndersizedAbsmaxTensorCuda) {
+  int64_t K = 32, N = 2, block_size = 32;
+  int64_t numel = K * N;
+  int64_t quantized_numel = (numel + 1) / 2;
+
+  OpTester test("MatMulBnb4", 1, kMSDomain);
+  test.AddAttribute("K", K);
+  test.AddAttribute("N", N);
+  test.AddAttribute("block_size", block_size);
+  test.AddAttribute("quant_type", 1LL);  // NF4
+
+  test.AddInput<float>("A", {1, K}, std::vector<float>(K, 0.0f));
+  test.AddInput<uint8_t>("B", {quantized_numel}, std::vector<uint8_t>(quantized_numel, 0));
+  test.AddInput<float>("absmax", {1}, std::vector<float>(1, 1.0f));  // too small
+  test.AddOutput<float>("Y", {1, N}, std::vector<float>(N, 0.0f));
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCudaExecutionProvider());
+  test.Run(OpTester::ExpectResult::kExpectFailure, "absmax tensor size", {}, nullptr, &execution_providers);
+}
+#endif
+
 TEST(MatMulBnb4, DISABLED_Float32) {
   for (auto qt : {0, 1}) {
     for (auto M : {1, 2, 100}) {
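
Note on the validation pattern: the checks above avoid the usual (a + b - 1) / b ceiling division because a + b - 1 can itself overflow when a is near INT64_MAX, and they guard the K * N product by comparing against INT64_MAX / N before multiplying. Below is a minimal standalone sketch of those two guards, not part of the patch; CeilDiv and MulWouldOverflow are illustrative names invented for this sketch, not helpers in the ONNX Runtime codebase.

// Standalone sketch of the overflow-safe guards used in the patch.
// Both assume a > 0 and b > 0, matching the constructor-validated
// K_, N_, and block_size_ attributes.
#include <cassert>
#include <cstdint>
#include <limits>

// ceil(a / b) without the overflow risk of (a + b - 1) / b: since a > 0,
// a - 1 cannot overflow, and ((a - 1) / b) + 1 equals the ceiling.
int64_t CeilDiv(int64_t a, int64_t b) {
  return ((a - 1) / b) + 1;
}

// True if a * b would exceed INT64_MAX; the division itself cannot overflow.
bool MulWouldOverflow(int64_t a, int64_t b) {
  return a > std::numeric_limits<int64_t>::max() / b;
}

int main() {
  assert(CeilDiv(64, 2) == 32);   // b_quant: two 4-bit weights per byte
  assert(CeilDiv(64, 32) == 2);   // absmax: one scale per 32-element block
  assert(CeilDiv(65, 32) == 3);   // a partial final block still needs a scale
  assert(!MulWouldOverflow(32, 2));
  assert(MulWouldOverflow(int64_t{1} << 40, int64_t{1} << 40));  // 2^80 > INT64_MAX
  return 0;
}

The kernels compare Shape().Size() < expected rather than != expected, so oversized buffers are still accepted; only undersized tensors, which would otherwise be read out of bounds, are rejected, as the "Expected at least" wording of the error messages indicates.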