Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -907,4 +907,71 @@
bufferTensorDesc->TotalTensorSizeInBytes = (elementSize + 3) & ~3;
}

void DmlOperator::BroadcastQuantizationParameters(
const MLOperatorKernelCreationContext& kernelInfo,
gsl::span<const uint32_t> outputShape
)
{
const uint32_t outputShapeDimCount = gsl::narrow_cast<uint32_t>(outputShape.size());

uint32_t axis = 0;

// If an axis was explicitly passed (or the default value 1 is set from the schema),
// then other inputs are broadcasting to the shape of the input data tensor.
if (kernelInfo.HasAttribute(AttrName::Axis, MLOperatorAttributeType::Int))
{
// Avoid validating the axis until later because the axis parameter is ignorable unless
// broadcasting is actually needed. ONNX opset 13 returns a default value of 1 for the
// "axis" attribute even when the attribute doesn't actually exist in the model, which
// would cause a validation failure here.
const int32_t signedAxis = gsl::narrow_cast<int32_t>(kernelInfo.GetAttribute<int64_t>(AttrName::Axis));
axis = Dml::HandleNegativeAxis(signedAxis, outputShapeDimCount, /*validateAxis*/ false);
}

// Explicitly reshape each of the inputs after the first input (scale tensor and optional zero point tensor).
for (uint32_t index = 1, inputCount = gsl::narrow_cast<uint32_t>(m_inputTensorDescs.size()); index < inputCount; ++index)
{
if (!kernelInfo.IsInputValid(index))
{
continue;
}

auto edgeDesc = kernelInfo.GetInputEdgeDescription(index);
assert(edgeDesc.edgeType == MLOperatorEdgeType::Tensor);

// Fix up the tensor shape by filling with trailing ones. So input[2,3] with axis=0 and scale[2]
// becomes scale[2,1], so that broadcasting works correctly.
std::vector<uint32_t> inputTensorShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(index);

// If the input tensor is a 1D vector, then extra massaging is needed to project their
// 1D vectors back to the full shape for broadcasting along the given axis.
// The 1D vector should have a length equal to the output tensor's dimension on that axis.
if (inputTensorShape.size() == 1 && inputTensorShape != std::vector<uint32_t>(outputShape.begin(), outputShape.end()))

Check warning on line 949 in onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <vector> for vector<> [build/include_what_you_use] [4] Raw Output: onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp:949: Add #include <vector> for vector<> [build/include_what_you_use] [4]
{
ML_CHECK_VALID_ARGUMENT(axis < outputShapeDimCount);
uint32_t broadcastAxisLength = outputShape[axis];
ML_CHECK_VALID_ARGUMENT(
(inputTensorShape[0] == broadcastAxisLength) ||
// Treat as broadcast dimension to match CPU behavior.
(inputTensorShape[0] == 1)
);
inputTensorShape.insert(inputTensorShape.begin(), axis, 1);
inputTensorShape.insert(inputTensorShape.end(), outputShapeDimCount - 1 - axis, 1);
}
// For any other shape (scalar/ND), leave it alone, and the TensorDesc constructor
// will apply broadcasting with standard elementwise alignment.

m_inputTensorDescs[index] = TensorDesc(
edgeDesc.tensorDataType,
outputShape,
gsl::make_span(inputTensorShape),
TensorAxis::DoNotCoerce,
TensorAxis::W,
TensorAxis::RightAligned,
NchwDimensionCount, // minDimensionCount
0 // guaranteedBaseOffsetAlignment
);
}
}

} // namespace Dml
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,15 @@ namespace Dml
uint32_t minDimensionCount = NchwDimensionCount
) const;

// Reshapes scale and zero_point tensor descriptors (inputs after index 0) so that their
// dimension count matches the output shape, enabling correct broadcasting in DML.
// For 1D per-axis tensors, the shape is projected along the given axis (e.g. scale[6]
// with axis=0 on a 5D output becomes [6,1,1,1,1]).
//
// kernelInfo  - kernel creation context; the "axis" attribute (when present), input
//               validity, edge descriptions, and input shapes are read from it.
// outputShape - shape the parameters are broadcast against.
//
// Inputs that are not valid are skipped. A 1D parameter must have a length equal to
// outputShape[axis] or 1, otherwise the argument check fails. Scalar and ND parameters
// are left as-is and rely on the standard right-aligned broadcasting of TensorDesc.
// Overwrites the corresponding entries of m_inputTensorDescs.
void BroadcastQuantizationParameters(
const MLOperatorKernelCreationContext& kernelInfo,
gsl::span<const uint32_t> outputShape
);

static void TryConvertTensorToBroadcastScalar(
const MLOperatorKernelCreationContext& kernelInfo,
const DML_TENSOR_DESC* tensor,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -542,64 +542,7 @@ class DmlOperatorElementwiseQLinear : public DmlOperator
const DML_TENSOR_DATA_TYPE outputDataType = m_outputTensorDescs[0].GetDmlDataType();
bool hasZeroPointTensor = kernelInfo.IsInputValid(2);

uint32_t axis = 0;

// If an axis was explicitly passed (or the default value 1 is set from the schema),
// then other inputs are broadcasting to the shape of the input data tensor.
if (kernelInfo.HasAttribute(AttrName::Axis, MLOperatorAttributeType::Int))
{
// Avoid validating the axis until later because the axis parameter is ignorable unless
// broadcasting is actually needed. ONNX opset 13 returns a default value of 1 for the
// "axis" attribute even when the attribute doesn't actually exist in the model, which
// would cause a validation failure here.
const int32_t signedAxis = gsl::narrow_cast<int32_t>(kernelInfo.GetAttribute<int64_t>(AttrName::Axis));
axis = Dml::HandleNegativeAxis(signedAxis, outputShapeDimCount, /*validateAxis*/ false);
}

// Explicitly reshape each of the inputs after the first input (scale tensor and optional zero point tensor).
for (uint32_t index = 1, inputCount = gsl::narrow_cast<uint32_t>(m_inputTensorDescs.size()); index < inputCount; ++index)
{
if (!kernelInfo.IsInputValid(index))
{
continue;
}

auto edgeDesc = kernelInfo.GetInputEdgeDescription(index);
assert(edgeDesc.edgeType == MLOperatorEdgeType::Tensor);

// Fix up the tensor shape by filling with trailing ones. So input[2,3] with axis=0 and scale[2]
// becomes scale[2,1], so that broadcasting works correctly.
std::vector<uint32_t> inputTensorShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(index);

// If the input tensor is a 1D vector, then extra massaging is needed to project their
// 1D vectors back to the full shape for broadcasting along the given axis.
// The 1D vector should have a length equal to the output tensor's dimension on that axis.
if (inputTensorShape.size() == 1 && inputTensorShape != outputShape)
{
ML_CHECK_VALID_ARGUMENT(axis < outputShapeDimCount);
uint32_t broadcastAxisLength = outputShape[axis];
ML_CHECK_VALID_ARGUMENT(
(inputTensorShape[0] == broadcastAxisLength) ||
// Treat as broadcast dimension to match CPU behavior.
(inputTensorShape[0] == 1)
);
inputTensorShape.insert(inputTensorShape.begin(), axis, 1);
inputTensorShape.insert(inputTensorShape.end(), outputShapeDimCount - 1 - axis, 1);
}
// For any other shape (scalar/ND), leave it alone, and the TensorDesc constructor
// will apply broadcasting with standard elementwise alignment.

m_inputTensorDescs[index] = TensorDesc(
edgeDesc.tensorDataType,
gsl::make_span(outputShape),
gsl::make_span(inputTensorShape),
TensorAxis::DoNotCoerce,
TensorAxis::W,
TensorAxis::RightAligned,
NchwDimensionCount, // minDimensionCount
0 // guaranteedBaseOffsetAlignment
);
}
BroadcastQuantizationParameters(kernelInfo, gsl::make_span(outputShape));

std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
std::vector<DML_TENSOR_DESC> outputDescs = GetDmlOutputDescs();
Expand Down Expand Up @@ -630,6 +573,8 @@ class DmlOperatorQuantization21 : public DmlOperator
const DML_TENSOR_DATA_TYPE outputDataType = m_outputTensorDescs[0].GetDmlDataType();
bool hasZeroPointTensor = kernelInfo.IsInputValid(2);

BroadcastQuantizationParameters(kernelInfo, gsl::make_span(outputShape));

std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
std::vector<DML_TENSOR_DESC> outputDescs = GetDmlOutputDescs();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -972,8 +972,23 @@ class MLOperatorKernel : public Microsoft::WRL::RuntimeClass<
{
ORT_TRY
{
Microsoft::WRL::ComPtr<MLOperatorKernel> kernel = wil::MakeOrThrow<MLOperatorKernel>(MLOperatorKernelCreationContext(&info));
Comment thread
adrastogi marked this conversation as resolved.

// Use placement new instead of wil::MakeOrThrow to control allocation
// and deallocation directly, ensuring correctly-sized cleanup if the
// constructor throws an exception.
void* buffer = ::operator new(sizeof(MLOperatorKernel));
MLOperatorKernel* kernelRaw = nullptr;
try
{
kernelRaw = new (buffer) MLOperatorKernel(MLOperatorKernelCreationContext(&info));
}
catch (...)
{
::operator delete(buffer, sizeof(MLOperatorKernel));
throw;
}

Microsoft::WRL::ComPtr<MLOperatorKernel> kernel;
kernel.Attach(kernelRaw);
Comment thread
adrastogi marked this conversation as resolved.
Outdated
*opKernel = kernel.Detach();
return S_OK;
}
Expand Down
84 changes: 84 additions & 0 deletions onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,90 @@ TEST(QuantizeLinearOpTest, Int8) {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
}

// Regression test for the new-delete-type-mismatch seen in the DML EP during graph fusion.
// A float32 -> int8 QuantizeLinear over a 5D input triggered a type-size mismatch
// (192 bytes allocated, 1 byte deallocated) that is visible under ASan.
TEST(QuantizeLinearOpTest, Int8_5D_DML_TypeMismatch) {
  auto dml_provider = DefaultDmlExecutionProvider();
  if (dml_provider == nullptr) {
    GTEST_SKIP() << "Skipping because DML EP is not available.";
  }

  // Restrict the run to the DML EP only.
  std::vector<std::unique_ptr<IExecutionProvider>> providers;
  providers.push_back(std::move(dml_provider));

  // Per-tensor quantization: scalar scale and scalar zero point.
  const std::vector<int64_t> shape_5d{6, 1, 1, 1, 1};
  OpTester tester("QuantizeLinear", 13);
  tester.AddInput<float>("x", shape_5d, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
  tester.AddInput<float>("y_scale", {}, {1.0f});
  tester.AddInput<int8_t>("y_zero_point", {}, {0});
  tester.AddOutput<int8_t>("y", shape_5d, {1, 2, 3, 4, 5, 6});

  tester.ConfigEps(std::move(providers)).RunWithConfig();
}

// Per-axis variant of the 5D float32 -> int8 DML repro: quantizes along axis 0 with 1D
// scale and zero point tensors to exercise the DML graph fusion path with per-channel
// int8 quantization.
TEST(QuantizeLinearOpTest, Int8_5D_PerAxis_DML_TypeMismatch) {
  auto dml_provider = DefaultDmlExecutionProvider();
  if (dml_provider == nullptr) {
    GTEST_SKIP() << "Skipping because DML EP is not available.";
  }

  // Restrict the run to the DML EP only.
  std::vector<std::unique_ptr<IExecutionProvider>> providers;
  providers.push_back(std::move(dml_provider));

  const std::vector<int64_t> shape_5d{6, 1, 1, 1, 1};
  OpTester tester("QuantizeLinear", 13);
  tester.AddAttribute<int64_t>("axis", 0);
  tester.AddInput<float>("x", shape_5d, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
  // One scale / zero point per element along axis 0.
  tester.AddInput<float>("y_scale", {6}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
  tester.AddInput<int8_t>("y_zero_point", {6}, {0, 0, 0, 0, 0, 0});
  tester.AddOutput<int8_t>("y", shape_5d, {1, 2, 3, 4, 5, 6});

  tester.ConfigEps(std::move(providers)).RunWithConfig();
}

// Opset 21 QuantizeLinear, float32 -> uint8, with the zero_point input omitted.
// Without zero_point, the output type defaults to uint8.
TEST(QuantizeLinearOpTest, Uint8_5D_NoZeroPoint_Opset21_DML) {
  auto dml_provider = DefaultDmlExecutionProvider();
  if (dml_provider == nullptr) {
    GTEST_SKIP() << "Skipping because DML EP is not available.";
  }

  // Restrict the run to the DML EP only.
  std::vector<std::unique_ptr<IExecutionProvider>> providers;
  providers.push_back(std::move(dml_provider));

  const std::vector<int64_t> shape_5d{6, 1, 1, 1, 1};
  OpTester tester("QuantizeLinear", 21);
  tester.AddInput<float>("x", shape_5d, {0.0f, 51.0f, 102.0f, 153.0f, 204.0f, 255.0f});
  tester.AddInput<float>("y_scale", {}, {1.0f});
  tester.AddOutput<uint8_t>("y", shape_5d, {0, 51, 102, 153, 204, 255});

  tester.ConfigEps(std::move(providers)).RunWithConfig();
}

// Opset 21 QuantizeLinear, float32 -> int8, with an explicit zero_point
// (the customer's exact scenario).
TEST(QuantizeLinearOpTest, Int8_5D_WithZeroPoint_Opset21_DML) {
  auto dml_provider = DefaultDmlExecutionProvider();
  if (dml_provider == nullptr) {
    GTEST_SKIP() << "Skipping because DML EP is not available.";
  }

  // Restrict the run to the DML EP only.
  std::vector<std::unique_ptr<IExecutionProvider>> providers;
  providers.push_back(std::move(dml_provider));

  const std::vector<int64_t> shape_5d{6, 1, 1, 1, 1};
  OpTester tester("QuantizeLinear", 21);
  tester.AddInput<float>("x", shape_5d, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
  tester.AddInput<float>("y_scale", {}, {1.0f});
  tester.AddInput<int8_t>("y_zero_point", {}, {0});
  tester.AddOutput<int8_t>("y", shape_5d, {1, 2, 3, 4, 5, 6});

  tester.ConfigEps(std::move(providers)).RunWithConfig();
}

// Test uint16 QuantizeLinear (per tensor)
TEST(QuantizeLinearOpTest, Uint16) {
OpTester test("QuantizeLinear", 21);
Expand Down
Loading