Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions onnxruntime/python/tools/quantization/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
CalibrationDataReader,
CalibrationMethod,
MinMaxCalibrater,
TensorData,
TensorsData,
create_calibrator,
load_tensors_data,
save_tensors_data,
)
from .qdq_quantizer import QDQQuantizer # noqa: F401
from .quant_utils import QuantFormat, QuantType, write_calibration_table # noqa: F401
Expand Down
86 changes: 86 additions & 0 deletions onnxruntime/python/tools/quantization/calibrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@
# license information.
# --------------------------------------------------------------------------
import abc
import contextlib
import copy
import itertools
import json
import os
import tempfile
import uuid
from collections.abc import Sequence
from enum import Enum
Expand Down Expand Up @@ -98,6 +101,21 @@ def to_dict(self):
data["CLS"] = self.__class__.__name__
return data

@classmethod
def from_dict(cls, d: dict) -> "TensorData":
    """Build an instance from a mapping shaped like the output of ``to_dict()``.

    The ``"CLS"`` marker entry is dropped. Every other entry is decoded and
    forwarded to the constructor as a keyword argument:

    * a dict tagged ``{"CLS": "numpy.array"}`` becomes ``np.array(data, dtype)``;
    * a plain ``int``/``float`` stored under a name listed in ``cls._floats``
      becomes a 0-d ``float32`` array;
    * anything else is passed through unchanged.
    """

    def _decode(field, raw):
        # Arrays are serialized as a tagged dict carrying the data and dtype.
        tagged_array = isinstance(raw, dict) and raw.get("CLS") == "numpy.array"
        if tagged_array:
            return np.array(raw["data"], dtype=np.dtype(raw["dtype"]))
        # Bare Python numbers in float-valued fields are promoted to float32.
        if field in cls._floats and isinstance(raw, (int, float)):
            return np.array(raw, dtype=np.float32)
        return raw

    fields = {name: _decode(name, raw) for name, raw in d.items() if name != "CLS"}
    return cls(**fields)


class TensorsData:
def __init__(self, calibration_method, data: dict[str, TensorData | tuple]):
Expand Down Expand Up @@ -150,6 +168,18 @@ def to_dict(self):
}
return data

@classmethod
def from_dict(cls, d: dict) -> "TensorsData":
    """Build an instance from a mapping shaped like the output of ``to_dict()``.

    ``d["calibration_method"]`` may be a dict tagged
    ``{"CLS": "CalibrationMethod", "value": ...}``, in which case the text
    after the last dot of ``value`` is used as the enum member name. Any
    other value is passed to the constructor as-is. Each entry of
    ``d["data"]`` is rebuilt with ``TensorData.from_dict``.
    """
    method = d["calibration_method"]
    is_tagged_enum = isinstance(method, dict) and method.get("CLS") == "CalibrationMethod"
    if is_tagged_enum:
        member_name = method["value"].rsplit(".", 1)[-1]
        method = CalibrationMethod[member_name]

    tensors = {}
    for tensor_name, encoded in d["data"].items():
        tensors[tensor_name] = TensorData.from_dict(encoded)
    return cls(method, tensors)


class CalibrationMethod(Enum):
MinMax = 0
Expand Down Expand Up @@ -184,6 +214,62 @@ def set_range(self, start_index: int, end_index: int):
raise NotImplementedError


class CalibrationCacheEncoder(json.JSONEncoder):
    """JSON encoder used for calibration cache files.

    On top of what ``json.JSONEncoder`` supports, it encodes:

    * ``TensorData`` / ``TensorsData`` via their ``to_dict()`` method;
    * ``np.ndarray`` as ``{"data": [...], "dtype": "...", "CLS": "numpy.array"}``;
    * ``CalibrationMethod`` as ``{"CLS": <class name>, "value": str(member)}``;
    * numpy integer and floating scalars as plain ``int`` / ``float``.

    Both ``save_tensors_data`` and ``quant_utils.write_calibration_table``
    use this encoder.
    """

    def default(self, obj):
        if isinstance(obj, (TensorData, TensorsData)):
            encoded = obj.to_dict()
        elif isinstance(obj, np.ndarray):
            encoded = {"data": obj.tolist(), "dtype": str(obj.dtype), "CLS": "numpy.array"}
        elif isinstance(obj, CalibrationMethod):
            encoded = {"CLS": type(obj).__name__, "value": str(obj)}
        elif isinstance(obj, np.integer):
            encoded = int(obj)
        elif isinstance(obj, np.floating):
            encoded = float(obj)
        else:
            # Unknown type: defer to the base class, which raises TypeError.
            encoded = super().default(obj)
        return encoded


def save_tensors_data(tensors_data: "TensorsData", path: "str | Path", *, smooth_quant: bool = False) -> None:
    """Write calibration tensor ranges to *path* as JSON.

    The payload is ``tensors_data.to_dict()`` plus a ``"smooth_quant"`` entry,
    encoded with ``CalibrationCacheEncoder``. It is written to a temporary
    file created in the destination directory and moved onto *path* with
    ``os.replace`` only after the JSON has been fully written and the file
    closed. On any failure the temporary file is removed and the exception
    is re-raised.

    :param tensors_data: object whose ``to_dict()`` result is serialized.
    :param path: destination file; missing parent directories are created.
    :param smooth_quant: whether the producing run used SmoothQuant. Stored in
        the cache so a later load can detect a mismatch and recompute.
    """
    destination = Path(path)
    folder = destination.parent
    folder.mkdir(parents=True, exist_ok=True)

    handle, scratch = tempfile.mkstemp(dir=folder, prefix=".calibcache_", suffix=".tmp")
    try:
        with os.fdopen(handle, "w") as out:
            record = tensors_data.to_dict()
            record["smooth_quant"] = smooth_quant
            json.dump(record, out, cls=CalibrationCacheEncoder)
            out.flush()
        os.replace(scratch, destination)
    except BaseException:
        # Best-effort cleanup; the scratch file may already have been moved or removed.
        try:
            os.unlink(scratch)
        except FileNotFoundError:
            pass
        raise


def load_tensors_data(path: "str | Path") -> "TensorsData":
    """Read a JSON calibration cache written by ``save_tensors_data()``.

    :param path: location of the cache file.
    :returns: the result of ``TensorsData.from_dict`` applied to the parsed JSON.
    :raises FileNotFoundError: if *path* does not exist.
    :raises ValueError: if *path* exists but is not a regular file.
    """
    cache_file = Path(path)
    if not cache_file.exists():
        raise FileNotFoundError(f"Calibration cache not found: {cache_file}")
    if not cache_file.is_file():
        raise ValueError(f"Calibration cache path is not a file: {cache_file}")

    payload = json.loads(cache_file.read_text())
    return TensorsData.from_dict(payload)


class CalibraterBase:
def __init__(
self,
Expand Down
17 changes: 5 additions & 12 deletions onnxruntime/python/tools/quantization/quant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -796,21 +796,14 @@ def write_calibration_table(calibration_cache, dir="."):

import onnxruntime.quantization.CalTableFlatBuffers.KeyValue as KeyValue # noqa: PLC0415
import onnxruntime.quantization.CalTableFlatBuffers.TrtTable as TrtTable # noqa: PLC0415
from onnxruntime.quantization.calibrate import CalibrationMethod, TensorData, TensorsData # noqa: PLC0415

# Use the shared encoder from calibrate.py so write_calibration_table and
# save_tensors_data produce identical JSON for numpy scalar/array values.
from onnxruntime.quantization.calibrate import CalibrationCacheEncoder # noqa: PLC0415

logging.info(f"calibration cache: {calibration_cache}")

class MyEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, (TensorData, TensorsData)):
return obj.to_dict()
if isinstance(obj, np.ndarray):
return {"data": obj.tolist(), "dtype": str(obj.dtype), "CLS": "numpy.array"}
if isinstance(obj, CalibrationMethod):
return {"CLS": obj.__class__.__name__, "value": str(obj)}
return json.JSONEncoder.default(self, obj)

json_data = json.dumps(calibration_cache, cls=MyEncoder)
json_data = json.dumps(calibration_cache, cls=CalibrationCacheEncoder)

with open(os.path.join(dir, "calibration.json"), "w") as file:
file.write(json_data) # use `json.loads` to do the reverse
Expand Down
139 changes: 97 additions & 42 deletions onnxruntime/python/tools/quantization/quantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from __future__ import annotations

import copy
import json
import logging
import tempfile
from collections.abc import Callable
Expand All @@ -14,7 +15,14 @@

import onnx

from .calibrate import CalibrationDataReader, CalibrationMethod, TensorsData, create_calibrator
from .calibrate import (
CalibrationDataReader,
CalibrationMethod,
TensorsData,
create_calibrator,
load_tensors_data,
save_tensors_data,
)
from .onnx_quantizer import ONNXQuantizer
from .qdq_quantizer import QDQQuantizer
from .quant_utils import (
Expand Down Expand Up @@ -479,7 +487,7 @@ def check_static_quant_arguments(quant_format: QuantFormat, activation_type: Qua
def quantize_static(
model_input: str | Path | onnx.ModelProto,
model_output: str | Path,
calibration_data_reader: CalibrationDataReader,
calibration_data_reader: CalibrationDataReader | None = None,
quant_format=QuantFormat.QDQ,
op_types_to_quantize=None,
per_channel=False,
Expand All @@ -492,6 +500,7 @@ def quantize_static(
calibrate_method=CalibrationMethod.MinMax,
calibration_providers=None,
extra_options=None,
calibration_cache_path: str | Path | None = None,
):
Comment thread
Rishi-Dave marked this conversation as resolved.
"""
Given an onnx model and calibration data reader, create a quantized onnx model and save it into a file
Expand All @@ -506,7 +515,13 @@ def quantize_static(
model_output: file path of quantized model
calibration_data_reader: a calibration data reader. It
enumerates calibration data and generates inputs for the
original model.
original model. May be None if calibration_cache_path points to an
existing cache file.
calibration_cache_path: optional path to a JSON calibration cache. If
the file already exists, calibration inference is skipped and the
cached tensor ranges are loaded instead. If the file does not yet
exist, calibration runs normally and the result is saved to this
path for future reuse.
quant_format: QuantFormat{QOperator, QDQ}.
QOperator format quantizes the model with quantized operators directly.
QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
Expand Down Expand Up @@ -673,6 +688,11 @@ def quantize_static(
}

if extra_options.get("SmoothQuant", False):
if calibration_data_reader is None:
Comment thread
tianleiwu marked this conversation as resolved.
raise ValueError(
"SmoothQuant requires a non-None calibration_data_reader; the calibration cache "
"stores per-tensor ranges only and cannot drive the SmoothQuant transform."
)
import importlib # noqa: PLC0415

try:
Expand Down Expand Up @@ -704,48 +724,83 @@ def inc_dataloader():
if is_model_updated:
model = updated_model

with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
if is_model_updated:
# Update model_input and avoid to use the original one
model_input = copy.deepcopy(model)

if isinstance(model_input, onnx.ModelProto):
output_path = Path(quant_tmp_dir).joinpath("model_input.onnx").as_posix()
onnx.save_model(
model_input,
output_path,
save_as_external_data=True,
_cache_path = Path(calibration_cache_path) if calibration_cache_path is not None else None
if _cache_path is not None and _cache_path.exists() and not _cache_path.is_file():
raise ValueError(f"calibration_cache_path is not a file: {_cache_path}")
_cache_hit = _cache_path is not None and _cache_path.is_file()
_smooth_quant = bool(extra_options.get("SmoothQuant", False))

if _cache_hit:
with _cache_path.open("r") as _f:
_raw = json.load(_f)
_cached_sq = bool(_raw.get("smooth_quant", False))
if _cached_sq != _smooth_quant:
logging.warning(
Comment thread
tianleiwu marked this conversation as resolved.
"Calibration cache at %s was produced with smooth_quant=%s; "
"current run uses smooth_quant=%s. Recomputing ranges and overwriting cache.",
_cache_path,
Comment thread
tianleiwu marked this conversation as resolved.
_cached_sq,
_smooth_quant,
)
model_input = output_path

calibrator = create_calibrator(
Path(model_input),
op_types_to_quantize,
augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(),
calibrate_method=calibrate_method,
use_external_data_format=use_external_data_format,
providers=calibration_providers,
extra_options=calib_extra_options,
)

stride = extra_options.get("CalibStridedMinMax", None)
if stride:
total_data_size = len(calibration_data_reader)
if total_data_size % stride != 0:
raise ValueError(f"Total data size ({total_data_size}) is not divisible by stride size ({stride}).")

for start in range(0, total_data_size, stride):
end_index = start + stride
calibration_data_reader.set_range(start_index=start, end_index=end_index)
calibrator.collect_data(calibration_data_reader)
_cache_hit = False
else:
calibrator.collect_data(calibration_data_reader)
tensors_range = calibrator.compute_data()
if not isinstance(tensors_range, TensorsData):
raise TypeError(
f"Unexpected type {type(tensors_range)} for tensors_range and calibrator={type(calibrator)}."
tensors_range = load_tensors_data(_cache_path)
if tensors_range.calibration_method != calibrate_method:
raise ValueError(
f"Calibration cache at {_cache_path} was produced with "
f"{tensors_range.calibration_method}, but quantize_static was called "
f"with calibrate_method={calibrate_method}. Delete the cache or "
f"pass a matching calibrate_method."
)

if not _cache_hit:
if calibration_data_reader is None:
raise ValueError("Either calibration_data_reader or an existing calibration_cache_path must be provided.")
with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
if is_model_updated:
# Update model_input and avoid to use the original one
model_input = copy.deepcopy(model)

if isinstance(model_input, onnx.ModelProto):
output_path = Path(quant_tmp_dir).joinpath("model_input.onnx").as_posix()
onnx.save_model(
model_input,
output_path,
save_as_external_data=True,
)
model_input = output_path

calibrator = create_calibrator(
Path(model_input),
op_types_to_quantize,
augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(),
calibrate_method=calibrate_method,
use_external_data_format=use_external_data_format,
providers=calibration_providers,
extra_options=calib_extra_options,
)
del calibrator

stride = extra_options.get("CalibStridedMinMax", None)
if stride:
total_data_size = len(calibration_data_reader)
if total_data_size % stride != 0:
raise ValueError(f"Total data size ({total_data_size}) is not divisible by stride size ({stride}).")

for start in range(0, total_data_size, stride):
end_index = start + stride
calibration_data_reader.set_range(start_index=start, end_index=end_index)
calibrator.collect_data(calibration_data_reader)
else:
calibrator.collect_data(calibration_data_reader)
tensors_range = calibrator.compute_data()
if not isinstance(tensors_range, TensorsData):
raise TypeError(
f"Unexpected type {type(tensors_range)} for tensors_range and calibrator={type(calibrator)}."
)
del calibrator

if _cache_path is not None:
save_tensors_data(tensors_range, _cache_path, smooth_quant=_smooth_quant)

check_static_quant_arguments(quant_format, activation_type, weight_type)

Expand Down
Loading
Loading