microsoft · adrastogi · Mar 30, 2026 · Mar 23, 2026 · Mar 27, 2026
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
@@ -232,8 +232,6 @@ namespace DmlGraphFusionHelper
                     }
                 }
 
-                // Tensor sizes in DML must be a multiple of 4 bytes large.
-                tensorByteSize = AlignToPow2<size_t>(tensorByteSize, 4);
                 if(graphSerializationEnabled)
                 {
                     WriteToFile(modelName, ConvertToWString(iter->first) + L".bin", reinterpret_cast<uint8_t*>(tensorPtr), tensorByteSize);
@@ -264,9 +262,10 @@ namespace DmlGraphFusionHelper
                         initializeInputBuffer = CreateCpuResource(providerImpl, tensorPtr, tensorByteSize);
                     }
 
-                    // Set the binding for operator initialization to the buffer
+                    // Set the binding for operator initialization to the buffer.
+                    // DML requires buffer binding sizes to be a multiple of 4 bytes.
                     initInputBindings[i].Buffer = initializeInputBuffer.Get();
-                    initInputBindings[i].SizeInBytes = tensorByteSize;
+                    initInputBindings[i].SizeInBytes = AlignToPow2<size_t>(tensorByteSize, 4);
                     initializeResourceRefs.push_back(std::move(initializeInputBuffer));
                 }
 

diff --git a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc
@@ -74,6 +74,34 @@ TEST(DequantizeLinearOpTest, Int4_LargeInitializerInput) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
+// Regression test: int8 tensor whose byte size is not a multiple of 4.
+// DML graph fusion rounds tensor sizes to a multiple of 4 via AlignToPow2.
+// If the original buffer is not padded, the subsequent memcpy reads past the
+// allocation boundary (heap-buffer-overflow detectable with ASan).
+// Mirrors the WebNN PoC: dequantizeLinear with int8[135] (135 % 4 != 0).
+TEST(DequantizeLinearOpTest, Int8_NonAlignedSize_Initializer) {
+  OpTester test("DequantizeLinear", 10);
+  constexpr int64_t kNumElements = 135;  // 135 bytes, AlignToPow2(135,4)=136
+
+  std::vector<int8_t> x_data(kNumElements);
+  std::vector<float> y_expected(kNumElements);
+  const float scale = 0.5f;
+  const int8_t zero_point = 0;
+  for (int64_t i = 0; i < kNumElements; ++i) {
+    x_data[i] = static_cast<int8_t>(i % 127);
+    y_expected[i] = (x_data[i] - zero_point) * scale;
+  }
+
+  // Mark all inputs as initializers so they go through DML's ProcessInputData
+  // → UnpackInitializer → AlignToPow2 code path during graph fusion.
+  test.AddInput<int8_t>("x", {kNumElements}, x_data, /*is_initializer=*/true);
+  test.AddInput<float>("x_scale", {1}, {scale}, /*is_initializer=*/true);
+  test.AddInput<int8_t>("x_zero_point", {1}, {zero_point}, /*is_initializer=*/true);
+  test.AddOutput<float>("y", {kNumElements}, y_expected);
+
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+}
+
 // scalar zero & scale with int4
 TEST(DequantizeLinearOpTest, Int4) {
   OpTester test("DequantizeLinear", 21);