microsoft · vraspar · Apr 15, 2026 · Apr 6, 2026 · Apr 7, 2026 · Apr 8, 2026
diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_bnb4.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_bnb4.cc
@@ -19,6 +19,9 @@
     ORT_ENFORCE(Status::OK() == info.GetAttr<int64_t>("N", &N_));
     ORT_ENFORCE(Status::OK() == info.GetAttr<int64_t>("block_size", &block_size_));
     ORT_ENFORCE(Status::OK() == info.GetAttr<int64_t>("quant_type", &quant_type_));
+    ORT_ENFORCE(K_ > 0, "K must be positive, got ", K_);  // the size checks later in this file need K_ * N_ > 0
+    ORT_ENFORCE(N_ > 0, "N must be positive, got ", N_);  // N_ is used as a divisor in the K * N overflow guard
+    ORT_ENFORCE(block_size_ > 0, "block_size must be positive, got ", block_size_);  // divisor for the absmax count
     ORT_ENFORCE(
         quant_type_ == FP4 || quant_type_ == NF4,
         "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported.");
@@ -50,6 +53,32 @@
   const uint8_t* b_quant_data = b_quant->Data<uint8_t>();
   const float* absmax_data = absmax->Data<float>();
 
+  // Validate that b_quant and absmax hold at least as many elements as K_ * N_ quantized values require.
+  // K_, N_ and block_size_ are enforced > 0 in the constructor, so no division below can divide by zero.
+  if (K_ > std::numeric_limits<int64_t>::max() / N_) {  // true exactly when K_ * N_ would not fit in int64_t
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "Overflow computing K * N for K=", K_, ", N=", N_, ".");
+  }
+  const int64_t numel = K_ * N_;
+  // Ceiling division written as ((a - 1) / b) + 1 rather than (a + b - 1) / b so the numerator cannot overflow.
+  // That form needs a > 0, which holds here: numel = K_ * N_ with K_ > 0 and N_ > 0 (enforced in the constructor).
+  const int64_t expected_b_quant_size = ((numel - 1) / 2) + 1;  // ceil(numel / 2); presumably two 4-bit values per uint8
+  const int64_t expected_absmax_size = ((numel - 1) / block_size_) + 1;  // ceil(numel / block_size_)
+
+  if (b_quant->Shape().Size() < expected_b_quant_size) {  // only a shortfall is rejected; extra elements are allowed
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "b_quant tensor size (", b_quant->Shape().Size(),
+                           ") is too small for K=", K_, " and N=", N_,
+                           ". Expected at least ", expected_b_quant_size, " elements.");
+  }
+  if (absmax->Shape().Size() < expected_absmax_size) {  // same rule for absmax: at least the minimum count
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "absmax tensor size (", absmax->Shape().Size(),
+                           ") is too small for K=", K_, ", N=", N_,
+                           ", block_size=", block_size_,
+                           ". Expected at least ", expected_absmax_size, " elements.");
+  }
+
   AllocatorPtr allocator;
   auto status = ctx->GetTempSpaceAllocator(&allocator);
   ORT_RETURN_IF_ERROR(status);

diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cc b/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cc
@@ -22,6 +22,9 @@
     ORT_ENFORCE(Status::OK() == info.GetAttr<int64_t>("N", &N_));
     ORT_ENFORCE(Status::OK() == info.GetAttr<int64_t>("block_size", &block_size_));
     ORT_ENFORCE(Status::OK() == info.GetAttr<int64_t>("quant_type", &quant_type_));
+    ORT_ENFORCE(K_ > 0, "K must be positive, got ", K_);  // the size checks later in this file need K_ * N_ > 0
+    ORT_ENFORCE(N_ > 0, "N must be positive, got ", N_);  // N_ is used as a divisor in the K * N overflow guard
+    ORT_ENFORCE(block_size_ > 0, "block_size must be positive, got ", block_size_);  // divisor for the absmax count
     ORT_ENFORCE(
         quant_type_ == FP4 || quant_type_ == NF4,
         "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported.");
@@ -51,6 +54,32 @@
   const uint8_t* b_quant_data = b_quant->Data<uint8_t>();
   const auto* absmax_data = absmax->Data<T>();
 
+  // Validate that b_quant and absmax hold at least as many elements as K_ * N_ quantized values require.
+  // K_, N_ and block_size_ are enforced > 0 in the constructor, so no division below can divide by zero.
+  if (K_ > std::numeric_limits<int64_t>::max() / N_) {  // true exactly when K_ * N_ would not fit in int64_t
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "Overflow computing K * N for K=", K_, ", N=", N_, ".");
+  }
+  const int64_t numel = K_ * N_;
+  // Ceiling division written as ((a - 1) / b) + 1 rather than (a + b - 1) / b so the numerator cannot overflow.
+  // That form needs a > 0, which holds here: numel = K_ * N_ with K_ > 0 and N_ > 0 (enforced in the constructor).
+  const int64_t expected_b_quant_size = ((numel - 1) / 2) + 1;  // ceil(numel / 2); presumably two 4-bit values per uint8
+  const int64_t expected_absmax_size = ((numel - 1) / block_size_) + 1;  // ceil(numel / block_size_)
+
+  if (b_quant->Shape().Size() < expected_b_quant_size) {  // only a shortfall is rejected; extra elements are allowed
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "b_quant tensor size (", b_quant->Shape().Size(),
+                           ") is too small for K=", K_, " and N=", N_,
+                           ". Expected at least ", expected_b_quant_size, " elements.");
+  }
+  if (absmax->Shape().Size() < expected_absmax_size) {  // same rule for absmax: at least the minimum count
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "absmax tensor size (", absmax->Shape().Size(),
+                           ") is too small for K=", K_, ", N=", N_,
+                           ", block_size=", block_size_,
+                           ". Expected at least ", expected_absmax_size, " elements.");
+  }
+
   typedef typename ToCudaType<T>::MappedType CudaT;
 
   // TODO: find a better way to create the quant_map without using a buffer

diff --git a/onnxruntime/test/contrib_ops/matmul_bnb4_test.cc b/onnxruntime/test/contrib_ops/matmul_bnb4_test.cc
@@ -115,6 +115,88 @@ void RunTest(int64_t quant_type, int64_t M, int64_t N, int64_t K, int64_t block_
   }
 }
 
+TEST(MatMulBnb4, RejectsUndersizedBQuantTensor) {
+  // K=32, N=2 gives numel = 64, so the kernel requires at least ceil(64 / 2) = 32 elements in B.
+  // B supplies only 4 (enough for K=4, N=2), so Run is expected to fail with the "b_quant tensor size" error.
+  OpTester test("MatMulBnb4", 1, kMSDomain);
+  test.AddAttribute<int64_t>("K", 32LL);
+  test.AddAttribute<int64_t>("N", 2LL);
+  test.AddAttribute<int64_t>("block_size", 32LL);
+  test.AddAttribute<int64_t>("quant_type", 1LL);  // NF4
+
+  test.AddInput<float>("A", {1, 32}, std::vector<float>(32, 0.0f));
+  test.AddInput<uint8_t>("B", {4}, std::vector<uint8_t>(4, 0));  // too small: 4 elements where 32 are required
+  test.AddInput<float>("absmax", {2}, std::vector<float>(2, 1.0f));  // 2 = ceil(64 / 32), so absmax is not the failure cause
+  test.AddOutput<float>("Y", {1, 2}, std::vector<float>(2, 0.0f));
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCpuExecutionProvider());  // run on the CPU provider only
+  test.Run(OpTester::ExpectResult::kExpectFailure, "b_quant tensor size", {}, nullptr, &execution_providers);
+}
+
+TEST(MatMulBnb4, RejectsUndersizedAbsmaxTensor) {
+  // K=32, N=2, block_size=32 gives numel = 64, so at least ceil(64 / 32) = 2 absmax elements are required.
+  // B is sized correctly and absmax has only 1 element, so the failure must come from the absmax check.
+  int64_t K = 32, N = 2, block_size = 32;
+  int64_t numel = K * N;
+  int64_t quantized_numel = (numel + 1) / 2;  // 32: the minimum B size for this K and N
+
+  OpTester test("MatMulBnb4", 1, kMSDomain);
+  test.AddAttribute<int64_t>("K", K);
+  test.AddAttribute<int64_t>("N", N);
+  test.AddAttribute<int64_t>("block_size", block_size);
+  test.AddAttribute<int64_t>("quant_type", 1LL);  // NF4
+
+  test.AddInput<float>("A", {1, K}, std::vector<float>(K, 0.0f));
+  test.AddInput<uint8_t>("B", {quantized_numel}, std::vector<uint8_t>(quantized_numel, 0));
+  test.AddInput<float>("absmax", {1}, std::vector<float>(1, 1.0f));  // too small: 1 element where 2 are required
+  test.AddOutput<float>("Y", {1, N}, std::vector<float>(N, 0.0f));
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCpuExecutionProvider());  // run on the CPU provider only
+  test.Run(OpTester::ExpectResult::kExpectFailure, "absmax tensor size", {}, nullptr, &execution_providers);
+}
+
+#if defined(USE_CUDA)
+TEST(MatMulBnb4, RejectsUndersizedBQuantTensorCuda) {  // same inputs as RejectsUndersizedBQuantTensor, on the CUDA provider
+  OpTester test("MatMulBnb4", 1, kMSDomain);
+  test.AddAttribute<int64_t>("K", 32LL);
+  test.AddAttribute<int64_t>("N", 2LL);
+  test.AddAttribute<int64_t>("block_size", 32LL);
+  test.AddAttribute<int64_t>("quant_type", 1LL);  // NF4
+
+  test.AddInput<float>("A", {1, 32}, std::vector<float>(32, 0.0f));
+  test.AddInput<uint8_t>("B", {4}, std::vector<uint8_t>(4, 0));  // too small: K=32, N=2 requires ceil(64 / 2) = 32 elements
+  test.AddInput<float>("absmax", {2}, std::vector<float>(2, 1.0f));  // 2 = ceil(64 / 32), so absmax is not the failure cause
+  test.AddOutput<float>("Y", {1, 2}, std::vector<float>(2, 0.0f));
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCudaExecutionProvider());  // run on the CUDA provider only
+  test.Run(OpTester::ExpectResult::kExpectFailure, "b_quant tensor size", {}, nullptr, &execution_providers);
+}
+
+TEST(MatMulBnb4, RejectsUndersizedAbsmaxTensorCuda) {  // same inputs as RejectsUndersizedAbsmaxTensor, on the CUDA provider
+  int64_t K = 32, N = 2, block_size = 32;
+  int64_t numel = K * N;
+  int64_t quantized_numel = (numel + 1) / 2;  // 32: the minimum B size for this K and N
+
+  OpTester test("MatMulBnb4", 1, kMSDomain);
+  test.AddAttribute<int64_t>("K", K);
+  test.AddAttribute<int64_t>("N", N);
+  test.AddAttribute<int64_t>("block_size", block_size);
+  test.AddAttribute<int64_t>("quant_type", 1LL);  // NF4
+
+  test.AddInput<float>("A", {1, K}, std::vector<float>(K, 0.0f));
+  test.AddInput<uint8_t>("B", {quantized_numel}, std::vector<uint8_t>(quantized_numel, 0));
+  test.AddInput<float>("absmax", {1}, std::vector<float>(1, 1.0f));  // too small: ceil(64 / 32) = 2 elements are required
+  test.AddOutput<float>("Y", {1, N}, std::vector<float>(N, 0.0f));
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCudaExecutionProvider());  // run on the CUDA provider only
+  test.Run(OpTester::ExpectResult::kExpectFailure, "absmax tensor size", {}, nullptr, &execution_providers);
+}
+#endif
+
 TEST(MatMulBnb4, DISABLED_Float32) {
   for (auto qt : {0, 1}) {
     for (auto M : {1, 2, 100}) {