microsoft · adrastogi · Mar 30, 2026 · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp
@@ -907,4 +907,71 @@
         bufferTensorDesc->TotalTensorSizeInBytes = (elementSize + 3) & ~3;
     }
 
+    // Rebuilds the tensor desc of every valid input after index 0 (scale and optional zero point) so
+    // that it broadcasts to outputShape; a 1D tensor is projected along the Axis attribute when present.
+    void DmlOperator::BroadcastQuantizationParameters(
+        const MLOperatorKernelCreationContext& kernelInfo,
+        gsl::span<const uint32_t> outputShape
+        )
+    {
+        const uint32_t outputShapeDimCount = gsl::narrow_cast<uint32_t>(outputShape.size());
+        uint32_t axis = 0;
+
+        // If an axis was explicitly passed (or the default value 1 is set from the schema),
+        // then other inputs are broadcasting to the shape of the input data tensor.
+        if (kernelInfo.HasAttribute(AttrName::Axis, MLOperatorAttributeType::Int))
+        {
+            // Avoid validating the axis until later because the axis parameter is ignorable unless
+            // broadcasting is actually needed. ONNX opset 13 returns a default value of 1 for the
+            // "axis" attribute even when the attribute doesn't actually exist in the model, which
+            // would cause a validation failure here.
+            const int32_t signedAxis = gsl::narrow_cast<int32_t>(kernelInfo.GetAttribute<int64_t>(AttrName::Axis));
+            axis = Dml::HandleNegativeAxis(signedAxis, outputShapeDimCount, /*validateAxis*/ false);
+        }
+
+        // Explicitly reshape each of the inputs after the first input (scale tensor and optional zero point tensor).
+        for (uint32_t index = 1, inputCount = gsl::narrow_cast<uint32_t>(m_inputTensorDescs.size()); index < inputCount; ++index)
+        {
+            if (!kernelInfo.IsInputValid(index))
+            {
+                continue;
+            }
+
+            auto edgeDesc = kernelInfo.GetInputEdgeDescription(index);
+            assert(edgeDesc.edgeType == MLOperatorEdgeType::Tensor);
+
+            // Fix up the tensor shape by filling with trailing ones. So input[2,3] with axis=0 and scale[2]
+            // becomes scale[2,1], so that broadcasting works correctly.
+            std::vector<uint32_t> inputTensorShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(index);
+
+            // A 1D tensor whose shape differs from the output shape is projected back to the full rank for
+            // broadcasting along the given axis. Compared element-wise to avoid building a temporary vector.
+            if (inputTensorShape.size() == 1 && (outputShapeDimCount != 1 || inputTensorShape[0] != outputShape[0]))
+            {
+                ML_CHECK_VALID_ARGUMENT(axis < outputShapeDimCount);
+                const uint32_t broadcastAxisLength = outputShape[axis];
+                ML_CHECK_VALID_ARGUMENT(
+                    (inputTensorShape[0] == broadcastAxisLength) ||
+                    // Treat as broadcast dimension to match CPU behavior.
+                    (inputTensorShape[0] == 1)
+                );
+                inputTensorShape.insert(inputTensorShape.begin(), axis, 1);
+                inputTensorShape.insert(inputTensorShape.end(), outputShapeDimCount - 1 - axis, 1);
+            }
+            // For any other shape (scalar/ND), leave it alone, and the TensorDesc constructor
+            // will apply broadcasting with standard elementwise alignment.
+
+            m_inputTensorDescs[index] = TensorDesc(
+                edgeDesc.tensorDataType,
+                outputShape,
+                gsl::make_span(inputTensorShape),
+                TensorAxis::DoNotCoerce,
+                TensorAxis::W,
+                TensorAxis::RightAligned,
+                NchwDimensionCount, // minDimensionCount
+                0 // guaranteedBaseOffsetAlignment
+            );
+        }
+    }
+
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h
@@ -149,6 +149,15 @@ namespace Dml
             uint32_t minDimensionCount = NchwDimensionCount
             ) const;
 
+        // Rebuilds the tensor desc of each valid input after index 0 (scale and optional zero
+        // point) so that it broadcasts to outputShape. A 1D tensor whose shape differs from
+        // outputShape is padded with 1s around the Axis attribute (e.g. scale[6] with axis=0 on a
+        // 5D output becomes [6,1,1,1,1]); its length must be 1 or outputShape[axis].
+        void BroadcastQuantizationParameters(
+            const MLOperatorKernelCreationContext& kernelInfo,
+            gsl::span<const uint32_t> outputShape
+            );
+
         static void TryConvertTensorToBroadcastScalar(
             const MLOperatorKernelCreationContext& kernelInfo,
             const DML_TENSOR_DESC* tensor,

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorElementWise.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorElementWise.cpp
@@ -542,64 +542,7 @@ class DmlOperatorElementwiseQLinear : public DmlOperator
         const DML_TENSOR_DATA_TYPE outputDataType = m_outputTensorDescs[0].GetDmlDataType();
         bool hasZeroPointTensor = kernelInfo.IsInputValid(2);
 
-        uint32_t axis = 0;
-
-        // If an axis was given explicitly passed (or the default value 1 is set from the schema),
-        // then other inputs are broadcasting to the shape of the input data tensor.
-        if (kernelInfo.HasAttribute(AttrName::Axis, MLOperatorAttributeType::Int))
-        {
-            // Avoid validating the axis until later because the axis parameter is ignorable unless
-            // broadcasting is actually needed. ONNX opset 13 returns a default value of 1 for the
-            // "axis" attribute even when the attribute doesn't actually exist in the model, which
-            // would cause a validation failure here.
-            const int32_t signedAxis = gsl::narrow_cast<int32_t>(kernelInfo.GetAttribute<int64_t>(AttrName::Axis));
-            axis = Dml::HandleNegativeAxis(signedAxis, outputShapeDimCount, /*validateAxis*/ false);
-        }
-
-        // Explicitly reshape each of the inputs after the first input (scale tensor and optional zero point tensor).
-        for (uint32_t index = 1, inputCount = gsl::narrow_cast<uint32_t>(m_inputTensorDescs.size()); index < inputCount; ++index)
-        {
-            if (!kernelInfo.IsInputValid(index))
-            {
-                continue;
-            }
-
-            auto edgeDesc = kernelInfo.GetInputEdgeDescription(index);
-            assert(edgeDesc.edgeType == MLOperatorEdgeType::Tensor);
-
-            // Fix up the the tensor shape by filling with trailing ones. So input[2,3] with axis=0 and scale[2]
-            // becomes scale[2,1], so that broadcasting works correctly.
-            std::vector<uint32_t> inputTensorShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(index);
-
-            // If the input tensor is a 1D vector, then extra massaging is needed to project their
-            // 1D vectors back to the full shape for broadcasting along the given axis.
-            // The 1D vector should have a length equal to the output tensor's dimension on that axis.
-            if (inputTensorShape.size() == 1 && inputTensorShape != outputShape)
-            {
-                ML_CHECK_VALID_ARGUMENT(axis < outputShapeDimCount);
-                uint32_t broadcastAxisLength = outputShape[axis];
-                ML_CHECK_VALID_ARGUMENT(
-                    (inputTensorShape[0] == broadcastAxisLength) ||
-                    // Treat as broadcast dimension to match CPU behavior.
-                    (inputTensorShape[0] == 1)
-                );
-                inputTensorShape.insert(inputTensorShape.begin(), axis, 1);
-                inputTensorShape.insert(inputTensorShape.end(), outputShapeDimCount - 1 - axis, 1);
-            }
-            // For any other shape (scalar/ND), leave it alone, and the TensorDesc constructor
-            // will apply broadcasting with standard elementwise alignment.
-
-            m_inputTensorDescs[index] = TensorDesc(
-                edgeDesc.tensorDataType,
-                gsl::make_span(outputShape),
-                gsl::make_span(inputTensorShape),
-                TensorAxis::DoNotCoerce,
-                TensorAxis::W,
-                TensorAxis::RightAligned,
-                NchwDimensionCount, // minDimensionCount
-                0 // guaranteedBaseOffsetAlignment
-            );
-        }
+        BroadcastQuantizationParameters(kernelInfo, gsl::make_span(outputShape));
 
         std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
         std::vector<DML_TENSOR_DESC> outputDescs = GetDmlOutputDescs();
@@ -630,6 +573,8 @@ class DmlOperatorQuantization21 : public DmlOperator
         const DML_TENSOR_DATA_TYPE outputDataType = m_outputTensorDescs[0].GetDmlDataType();
         bool hasZeroPointTensor = kernelInfo.IsInputValid(2);
 
+        BroadcastQuantizationParameters(kernelInfo, gsl::make_span(outputShape));
+
         std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
         std::vector<DML_TENSOR_DESC> outputDescs = GetDmlOutputDescs();
 

diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorHelper.h
@@ -972,8 +972,23 @@ class MLOperatorKernel : public Microsoft::WRL::RuntimeClass<
     {
         ORT_TRY
         {
-            Microsoft::WRL::ComPtr<MLOperatorKernel> kernel = wil::MakeOrThrow<MLOperatorKernel>(MLOperatorKernelCreationContext(&info));
-
+            // Use placement new instead of wil::MakeOrThrow to control allocation
+            // and deallocation directly, ensuring correctly-sized cleanup if the
+            // constructor throws an exception.
+            void* buffer = ::operator new(sizeof(MLOperatorKernel));
+            MLOperatorKernel* kernelRaw = nullptr;
+            try
+            {
+                kernelRaw = new (buffer) MLOperatorKernel(MLOperatorKernelCreationContext(&info));
+            }
+            catch (...)
+            {
+                ::operator delete(buffer, sizeof(MLOperatorKernel));
+                throw;
+            }
+
+            Microsoft::WRL::ComPtr<MLOperatorKernel> kernel;
+            kernel.Attach(kernelRaw);
             *opKernel = kernel.Detach();
             return S_OK;
         }

diff --git a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc
@@ -516,6 +516,90 @@ TEST(QuantizeLinearOpTest, Int8) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
 }
 
+// Regression test for a new-delete-type-mismatch in the DML EP during graph fusion.
+// Under ASan, QuantizeLinear float32→int8 on a 5D input reported a type-size
+// mismatch (192 bytes allocated, 1 byte deallocated).
+TEST(QuantizeLinearOpTest, Int8_5D_DML_TypeMismatch) {
+  auto dml_provider = DefaultDmlExecutionProvider();
+  if (dml_provider == nullptr) {
+    GTEST_SKIP() << "Skipping because DML EP is not available.";
+  }
+
+  std::vector<int64_t> shape{6, 1, 1, 1, 1};
+  OpTester tester("QuantizeLinear", 13);
+  tester.AddInput<float>("x", shape, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  tester.AddInput<float>("y_scale", {}, {1.0f});
+  tester.AddInput<int8_t>("y_zero_point", {}, {0});
+  tester.AddOutput<int8_t>("y", shape, {1, 2, 3, 4, 5, 6});
+
+  std::vector<std::unique_ptr<IExecutionProvider>> eps;
+  eps.push_back(std::move(dml_provider));
+  tester.ConfigEps(std::move(eps));
+  tester.RunWithConfig();
+}
+
+// Per-axis variant of the preceding test: quantizes along axis 0 so that the
+// DML graph fusion path runs with per-channel int8 scale and zero point.
+TEST(QuantizeLinearOpTest, Int8_5D_PerAxis_DML_TypeMismatch) {
+  auto dml_provider = DefaultDmlExecutionProvider();
+  if (dml_provider == nullptr) {
+    GTEST_SKIP() << "Skipping because DML EP is not available.";
+  }
+
+  std::vector<int64_t> shape{6, 1, 1, 1, 1};
+  OpTester tester("QuantizeLinear", 13);
+  tester.AddAttribute<int64_t>("axis", 0);
+  tester.AddInput<float>("x", shape, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  tester.AddInput<float>("y_scale", {6}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
+  tester.AddInput<int8_t>("y_zero_point", {6}, {0, 0, 0, 0, 0, 0});
+  tester.AddOutput<int8_t>("y", shape, {1, 2, 3, 4, 5, 6});
+
+  std::vector<std::unique_ptr<IExecutionProvider>> eps;
+  eps.push_back(std::move(dml_provider));
+  tester.ConfigEps(std::move(eps));
+  tester.RunWithConfig();
+}
+
+// Opset 21 QuantizeLinear float32→uint8 with the zero_point input omitted.
+// With no zero_point supplied, the output type defaults to uint8.
+TEST(QuantizeLinearOpTest, Uint8_5D_NoZeroPoint_Opset21_DML) {
+  auto dml_provider = DefaultDmlExecutionProvider();
+  if (dml_provider == nullptr) {
+    GTEST_SKIP() << "Skipping because DML EP is not available.";
+  }
+
+  std::vector<int64_t> shape{6, 1, 1, 1, 1};
+  OpTester tester("QuantizeLinear", 21);
+  tester.AddInput<float>("x", shape, {0.0f, 51.0f, 102.0f, 153.0f, 204.0f, 255.0f});
+  tester.AddInput<float>("y_scale", {}, {1.0f});
+  tester.AddOutput<uint8_t>("y", shape, {0, 51, 102, 153, 204, 255});
+
+  std::vector<std::unique_ptr<IExecutionProvider>> eps;
+  eps.push_back(std::move(dml_provider));
+  tester.ConfigEps(std::move(eps));
+  tester.RunWithConfig();
+}
+
+// Opset 21 QuantizeLinear float32→int8 with an explicit zero_point (the exact customer scenario).
+TEST(QuantizeLinearOpTest, Int8_5D_WithZeroPoint_Opset21_DML) {
+  auto dml_provider = DefaultDmlExecutionProvider();
+  if (dml_provider == nullptr) {
+    GTEST_SKIP() << "Skipping because DML EP is not available.";
+  }
+
+  std::vector<int64_t> shape{6, 1, 1, 1, 1};
+  OpTester tester("QuantizeLinear", 21);
+  tester.AddInput<float>("x", shape, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  tester.AddInput<float>("y_scale", {}, {1.0f});
+  tester.AddInput<int8_t>("y_zero_point", {}, {0});
+  tester.AddOutput<int8_t>("y", shape, {1, 2, 3, 4, 5, 6});
+
+  std::vector<std::unique_ptr<IExecutionProvider>> eps;
+  eps.push_back(std::move(dml_provider));
+  tester.ConfigEps(std::move(eps));
+  tester.RunWithConfig();
+}
+
 // Test uint16 QuantizeLinear (per tensor)
 TEST(QuantizeLinearOpTest, Uint16) {
   OpTester test("QuantizeLinear", 21);