From becce1f7f467fe96e67bc2848cff2f61072aafa4 Mon Sep 17 00:00:00 2001 From: Rishi Dave Date: Fri, 24 Apr 2026 11:14:26 +0000 Subject: [PATCH 1/6] feat(quantization): add calibration cache to quantize_static Introduce an optional calibration_cache_path parameter on quantize_static so users can save the computed TensorsData after calibration and reload it on subsequent runs. This avoids repeating the expensive calibration inference pass when only post-calibration knobs (e.g. nodes_to_exclude, quant types) change between runs. The cache is a human-readable JSON file whose schema mirrors the encoder used by write_calibration_table: TensorData / TensorsData round-trip through new from_dict classmethods and module-level save_tensors_data / load_tensors_data helpers in calibrate.py. calibration_data_reader is now optional; at least one of it or an existing cache file must be provided. Fixes #21908 --- .../python/tools/quantization/__init__.py | 4 + .../python/tools/quantization/calibrate.py | 64 +++++++ .../python/tools/quantization/quantize.py | 116 +++++++----- .../python/quantization/test_calibration.py | 170 +++++++++++++++++- 4 files changed, 311 insertions(+), 43 deletions(-) diff --git a/onnxruntime/python/tools/quantization/__init__.py b/onnxruntime/python/tools/quantization/__init__.py index ac99de348f612..50b0bd08ae360 100644 --- a/onnxruntime/python/tools/quantization/__init__.py +++ b/onnxruntime/python/tools/quantization/__init__.py @@ -3,7 +3,11 @@ CalibrationDataReader, CalibrationMethod, MinMaxCalibrater, + TensorData, + TensorsData, create_calibrator, + load_tensors_data, + save_tensors_data, ) from .qdq_quantizer import QDQQuantizer # noqa: F401 from .quant_utils import QuantFormat, QuantType, write_calibration_table # noqa: F401 diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index 05a5b0873d93d..f266922574a94 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ 
b/onnxruntime/python/tools/quantization/calibrate.py @@ -7,6 +7,7 @@ import abc import copy import itertools +import json import os import uuid from collections.abc import Sequence @@ -98,6 +99,19 @@ def to_dict(self): data["CLS"] = self.__class__.__name__ return data + @classmethod + def from_dict(cls, d: dict) -> "TensorData": + """Reconstruct a TensorData from a dict produced by to_dict().""" + kwargs = {} + for k, v in d.items(): + if k == "CLS": + continue + value = v + if isinstance(value, dict) and value.get("CLS") == "numpy.array": + value = np.array(value["data"], dtype=np.dtype(value["dtype"])) + kwargs[k] = value + return cls(**kwargs) + class TensorsData: def __init__(self, calibration_method, data: dict[str, TensorData | tuple]): @@ -150,6 +164,18 @@ def to_dict(self): } return data + @classmethod + def from_dict(cls, d: dict) -> "TensorsData": + """Reconstruct a TensorsData from a dict produced by to_dict().""" + method_val = d["calibration_method"] + if isinstance(method_val, dict) and method_val.get("CLS") == "CalibrationMethod": + name = method_val["value"].split(".")[-1] + method = CalibrationMethod[name] + else: + method = method_val + reconstructed = {k: TensorData.from_dict(v) for k, v in d["data"].items()} + return cls(method, reconstructed) + class CalibrationMethod(Enum): MinMax = 0 @@ -184,6 +210,44 @@ def set_range(self, start_index: int, end_index: int): raise NotImplementedError +class _CalibrationCacheEncoder(json.JSONEncoder): + """JSON encoder for calibration cache serialization.""" + + def default(self, obj): + if isinstance(obj, (TensorData, TensorsData)): + return obj.to_dict() + if isinstance(obj, np.ndarray): + return {"data": obj.tolist(), "dtype": str(obj.dtype), "CLS": "numpy.array"} + if isinstance(obj, CalibrationMethod): + return {"CLS": obj.__class__.__name__, "value": str(obj)} + if isinstance(obj, np.integer): + return int(obj) + if isinstance(obj, np.floating): + return float(obj) + return json.JSONEncoder.default(self, 
obj) + + +def save_tensors_data(tensors_data: "TensorsData", path: "str | Path") -> None: + """Serialize calibration tensor ranges to a JSON file at *path*.""" + path = Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(path.suffix + ".tmp") + with tmp.open("w") as f: + json.dump(tensors_data, f, cls=_CalibrationCacheEncoder) + f.flush() + os.replace(tmp, path) + + +def load_tensors_data(path: "str | Path") -> "TensorsData": + """Load calibration tensor ranges from a JSON file written by save_tensors_data().""" + path = Path(path) + if not path.exists(): + raise FileNotFoundError(f"Calibration cache not found: {path}") + with path.open("r") as f: + d = json.load(f) + return TensorsData.from_dict(d) + + class CalibraterBase: def __init__( self, diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index b8b239b85e7ad..f10a9f1fc561a 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -14,7 +14,14 @@ import onnx -from .calibrate import CalibrationDataReader, CalibrationMethod, TensorsData, create_calibrator +from .calibrate import ( + CalibrationDataReader, + CalibrationMethod, + TensorsData, + create_calibrator, + load_tensors_data, + save_tensors_data, +) from .onnx_quantizer import ONNXQuantizer from .qdq_quantizer import QDQQuantizer from .quant_utils import ( @@ -479,7 +486,7 @@ def check_static_quant_arguments(quant_format: QuantFormat, activation_type: Qua def quantize_static( model_input: str | Path | onnx.ModelProto, model_output: str | Path, - calibration_data_reader: CalibrationDataReader, + calibration_data_reader: CalibrationDataReader | None = None, quant_format=QuantFormat.QDQ, op_types_to_quantize=None, per_channel=False, @@ -492,6 +499,7 @@ def quantize_static( calibrate_method=CalibrationMethod.MinMax, calibration_providers=None, extra_options=None, + calibration_cache_path: str | Path | 
None = None, ): """ Given an onnx model and calibration data reader, create a quantized onnx model and save it into a file @@ -506,7 +514,13 @@ def quantize_static( model_output: file path of quantized model calibration_data_reader: a calibration data reader. It enumerates calibration data and generates inputs for the - original model. + original model. May be None if calibration_cache_path points to an + existing cache file. + calibration_cache_path: optional path to a JSON calibration cache. If + the file already exists, calibration inference is skipped and the + cached tensor ranges are loaded instead. If the file does not yet + exist, calibration runs normally and the result is saved to this + path for future reuse. quant_format: QuantFormat{QOperator, QDQ}. QOperator format quantizes the model with quantized operators directly. QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. @@ -704,48 +718,66 @@ def inc_dataloader(): if is_model_updated: model = updated_model - with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir: - if is_model_updated: - # Update model_input and avoid to use the original one - model_input = copy.deepcopy(model) - - if isinstance(model_input, onnx.ModelProto): - output_path = Path(quant_tmp_dir).joinpath("model_input.onnx").as_posix() - onnx.save_model( - model_input, - output_path, - save_as_external_data=True, + _cache_path = Path(calibration_cache_path) if calibration_cache_path is not None else None + _cache_hit = _cache_path is not None and _cache_path.exists() + + if _cache_hit: + tensors_range = load_tensors_data(_cache_path) + if tensors_range.calibration_method != calibrate_method: + raise ValueError( + f"Calibration cache at {_cache_path} was produced with " + f"{tensors_range.calibration_method}, but quantize_static was called " + f"with calibrate_method={calibrate_method}. Delete the cache or " + f"pass a matching calibrate_method." 
+ ) + else: + if calibration_data_reader is None: + raise ValueError("Either calibration_data_reader or an existing calibration_cache_path must be provided.") + with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir: + if is_model_updated: + # Update model_input and avoid to use the original one + model_input = copy.deepcopy(model) + + if isinstance(model_input, onnx.ModelProto): + output_path = Path(quant_tmp_dir).joinpath("model_input.onnx").as_posix() + onnx.save_model( + model_input, + output_path, + save_as_external_data=True, + ) + model_input = output_path + + calibrator = create_calibrator( + Path(model_input), + op_types_to_quantize, + augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(), + calibrate_method=calibrate_method, + use_external_data_format=use_external_data_format, + providers=calibration_providers, + extra_options=calib_extra_options, ) - model_input = output_path - - calibrator = create_calibrator( - Path(model_input), - op_types_to_quantize, - augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(), - calibrate_method=calibrate_method, - use_external_data_format=use_external_data_format, - providers=calibration_providers, - extra_options=calib_extra_options, - ) - - stride = extra_options.get("CalibStridedMinMax", None) - if stride: - total_data_size = len(calibration_data_reader) - if total_data_size % stride != 0: - raise ValueError(f"Total data size ({total_data_size}) is not divisible by stride size ({stride}).") - for start in range(0, total_data_size, stride): - end_index = start + stride - calibration_data_reader.set_range(start_index=start, end_index=end_index) + stride = extra_options.get("CalibStridedMinMax", None) + if stride: + total_data_size = len(calibration_data_reader) + if total_data_size % stride != 0: + raise ValueError(f"Total data size ({total_data_size}) is not divisible by stride size ({stride}).") + + for start in range(0, total_data_size, 
stride): + end_index = start + stride + calibration_data_reader.set_range(start_index=start, end_index=end_index) + calibrator.collect_data(calibration_data_reader) + else: calibrator.collect_data(calibration_data_reader) - else: - calibrator.collect_data(calibration_data_reader) - tensors_range = calibrator.compute_data() - if not isinstance(tensors_range, TensorsData): - raise TypeError( - f"Unexpected type {type(tensors_range)} for tensors_range and calibrator={type(calibrator)}." - ) - del calibrator + tensors_range = calibrator.compute_data() + if not isinstance(tensors_range, TensorsData): + raise TypeError( + f"Unexpected type {type(tensors_range)} for tensors_range and calibrator={type(calibrator)}." + ) + del calibrator + + if _cache_path is not None: + save_tensors_data(tensors_range, _cache_path) check_static_quant_arguments(quant_format, activation_type, weight_type) diff --git a/onnxruntime/test/python/quantization/test_calibration.py b/onnxruntime/test/python/quantization/test_calibration.py index 60c5f9d404258..41f5624bd6eac 100644 --- a/onnxruntime/test/python/quantization/test_calibration.py +++ b/onnxruntime/test/python/quantization/test_calibration.py @@ -14,7 +14,16 @@ from onnx import TensorProto, helper, numpy_helper import onnxruntime -from onnxruntime.quantization.calibrate import CalibrationDataReader, CalibrationMethod, create_calibrator +from onnxruntime.quantization import quantize_static +from onnxruntime.quantization.calibrate import ( + CalibrationDataReader, + CalibrationMethod, + TensorData, + TensorsData, + create_calibrator, + load_tensors_data, + save_tensors_data, +) def generate_input_initializer(tensor_shape, tensor_dtype, input_name): @@ -528,5 +537,164 @@ def test_compute_data_per_channel(self): np.testing.assert_equal(min_max, tensors_range[output_name].range_value) +class TestCalibrationCache(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._tmp_dir = 
tempfile.TemporaryDirectory(prefix="test_calibration_cache.") + + @classmethod + def tearDownClass(cls): + cls._tmp_dir.cleanup() + + def _make_simple_model(self, path): + """Build a tiny Conv+Relu model for end-to-end cache tests.""" + vi_input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 1, 3]) + vi_output = helper.make_tensor_value_info("X6", TensorProto.FLOAT, [1, 3, 1, 3]) + w1 = generate_input_initializer([3, 3, 1, 1], np.float32, "W1") + b1 = generate_input_initializer([3], np.float32, "B1") + conv_node = helper.make_node("Conv", ["input", "W1", "B1"], ["X2"], name="Conv1") + relu_node = helper.make_node("Relu", ["X2"], ["X6"], name="Relu1") + graph = helper.make_graph([conv_node, relu_node], "cache_test_graph", [vi_input], [vi_output]) + graph.initializer.add().CopyFrom(w1) + graph.initializer.add().CopyFrom(b1) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) + onnx.save(model, path) + + def test_save_load_tensors_data_minmax_roundtrip(self): + td = TensorsData( + CalibrationMethod.MinMax, + {"x": TensorData(lowest=np.array(-1.0, dtype=np.float32), highest=np.array(2.0, dtype=np.float32))}, + ) + cache_path = Path(self._tmp_dir.name) / "minmax_cache.json" + save_tensors_data(td, cache_path) + self.assertTrue(cache_path.exists()) + + loaded = load_tensors_data(cache_path) + self.assertEqual(loaded.calibration_method, CalibrationMethod.MinMax) + self.assertEqual(list(loaded.keys()), ["x"]) + lo, hi = loaded["x"].range_value + np.testing.assert_array_equal(lo, np.array(-1.0, dtype=np.float32)) + np.testing.assert_array_equal(hi, np.array(2.0, dtype=np.float32)) + self.assertEqual(lo.shape, ()) + self.assertEqual(hi.shape, ()) + + def test_save_load_tensors_data_entropy_roundtrip(self): + hist = np.array([1.0, 2.0, 3.0], dtype=np.float32) + hist_edges = np.array([0.0, 1.0, 2.0, 3.0], dtype=np.float32) + td = TensorsData( + CalibrationMethod.Entropy, + { + "y": TensorData( + lowest=np.array(-0.5, 
dtype=np.float32), + highest=np.array(0.5, dtype=np.float32), + hist=hist, + hist_edges=hist_edges, + ) + }, + ) + cache_path = Path(self._tmp_dir.name) / "entropy_cache.json" + save_tensors_data(td, cache_path) + + loaded = load_tensors_data(cache_path) + self.assertEqual(loaded.calibration_method, CalibrationMethod.Entropy) + lo, hi = loaded["y"].range_value + np.testing.assert_array_almost_equal(lo, np.array(-0.5, dtype=np.float32)) + np.testing.assert_array_almost_equal(hi, np.array(0.5, dtype=np.float32)) + np.testing.assert_array_almost_equal(loaded["y"].hist, hist) + np.testing.assert_array_almost_equal(loaded["y"].hist_edges, hist_edges) + + def test_load_tensors_data_invalid_path(self): + with self.assertRaises(FileNotFoundError): + load_tensors_data("/nonexistent/path/cache.json") + + def test_quantize_static_calibration_cache_path(self): + model_path = Path(self._tmp_dir.name) / "tiny_model.onnx" + self._make_simple_model(str(model_path)) + + cache_path = Path(self._tmp_dir.name) / "quant_cache.json" + out1_path = Path(self._tmp_dir.name) / "quantized1.onnx" + out2_path = Path(self._tmp_dir.name) / "quantized2.onnx" + + # First call: calibration_data_reader provided, cache written + data_reader = TestDataReader() + quantize_static( + str(model_path), + str(out1_path), + calibration_data_reader=data_reader, + calibration_cache_path=cache_path, + ) + self.assertTrue(cache_path.exists()) + td1 = load_tensors_data(cache_path) + + # Second call: no data_reader, load from cache + quantize_static( + str(model_path), + str(out2_path), + calibration_data_reader=None, + calibration_cache_path=cache_path, + ) + self.assertTrue(out2_path.exists()) + td2 = load_tensors_data(cache_path) + self.assertEqual(td1.calibration_method, td2.calibration_method) + + def test_quantize_static_no_reader_no_cache_raises(self): + model_path = Path(self._tmp_dir.name) / "tiny_model2.onnx" + self._make_simple_model(str(model_path)) + out_path = Path(self._tmp_dir.name) / 
"quantized_err.onnx" + + with self.assertRaises(ValueError): + quantize_static(str(model_path), str(out_path), calibration_data_reader=None) + + def test_save_tensors_data_creates_parent_dir(self): + nested_path = Path(self._tmp_dir.name) / "nested" / "dir" / "cache.json" + td = TensorsData( + CalibrationMethod.MinMax, + {"x": TensorData(lowest=np.array(-1.0, dtype=np.float32), highest=np.array(1.0, dtype=np.float32))}, + ) + save_tensors_data(td, nested_path) + self.assertTrue(nested_path.exists()) + + def test_save_tensors_data_handles_scalar_bins(self): + td = TensorsData( + CalibrationMethod.Entropy, + { + "z": TensorData( + lowest=np.array(0.0, dtype=np.float32), + highest=np.array(1.0, dtype=np.float32), + hist=np.array([1, 2], dtype=np.int64), + bins=np.int64(5), + ) + }, + ) + cache_path = Path(self._tmp_dir.name) / "scalar_bins_cache.json" + save_tensors_data(td, cache_path) + loaded = load_tensors_data(cache_path) + self.assertEqual(loaded["z"].bins, 5) + + def test_load_tensors_data_method_mismatch_raises(self): + model_path = Path(self._tmp_dir.name) / "tiny_mismatch.onnx" + self._make_simple_model(str(model_path)) + cache_path = Path(self._tmp_dir.name) / "mismatch_cache.json" + out_path = Path(self._tmp_dir.name) / "quantized_mismatch.onnx" + + data_reader = TestDataReader() + quantize_static( + str(model_path), + str(out_path), + calibration_data_reader=data_reader, + calibrate_method=CalibrationMethod.MinMax, + calibration_cache_path=cache_path, + ) + + with self.assertRaises(ValueError): + quantize_static( + str(model_path), + str(out_path), + calibration_data_reader=None, + calibrate_method=CalibrationMethod.Entropy, + calibration_cache_path=cache_path, + ) + + if __name__ == "__main__": unittest.main() From aa4521fcb36142e58443d02498a472c0de1acfce Mon Sep 17 00:00:00 2001 From: Rishi Dave Date: Fri, 1 May 2026 00:58:42 +0000 Subject: [PATCH 2/6] fix(quantization): tighten calibration cache path validation - load_tensors_data: reject non-file 
paths up front with a ValueError instead of letting Path.open raise IsADirectoryError or similar. - quantize_static: when extra_options['SmoothQuant']=True, require a non-None calibration_data_reader since the cache stores per-tensor ranges only and cannot drive the SmoothQuant transform. - quantize_static: treat the cache path as a hit only when it is a regular file; raise ValueError if it exists but is e.g. a directory, so callers get a clear message instead of a low-level IOError. --- onnxruntime/python/tools/quantization/calibrate.py | 2 ++ onnxruntime/python/tools/quantization/quantize.py | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index f266922574a94..453fa82188f55 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -243,6 +243,8 @@ def load_tensors_data(path: "str | Path") -> "TensorsData": path = Path(path) if not path.exists(): raise FileNotFoundError(f"Calibration cache not found: {path}") + if not path.is_file(): + raise ValueError(f"Calibration cache path is not a file: {path}") with path.open("r") as f: d = json.load(f) return TensorsData.from_dict(d) diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index f10a9f1fc561a..b80451f275acd 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -687,6 +687,11 @@ def quantize_static( } if extra_options.get("SmoothQuant", False): + if calibration_data_reader is None: + raise ValueError( + "SmoothQuant requires a non-None calibration_data_reader; the calibration cache " + "stores per-tensor ranges only and cannot drive the SmoothQuant transform." 
+ ) import importlib # noqa: PLC0415 try: @@ -719,7 +724,9 @@ def inc_dataloader(): model = updated_model _cache_path = Path(calibration_cache_path) if calibration_cache_path is not None else None - _cache_hit = _cache_path is not None and _cache_path.exists() + if _cache_path is not None and _cache_path.exists() and not _cache_path.is_file(): + raise ValueError(f"calibration_cache_path is not a file: {_cache_path}") + _cache_hit = _cache_path is not None and _cache_path.is_file() if _cache_hit: tensors_range = load_tensors_data(_cache_path) From 755a1c916ef9fac61a734cdb2f10314dceff22ac Mon Sep 17 00:00:00 2001 From: Rishi Dave Date: Sun, 3 May 2026 11:43:43 +0000 Subject: [PATCH 3/6] fix(quantization): coerce float scalars and clean tmp on cache write failure - TensorData.from_dict: wrap plain int/float values for _floats keys as np.array(value, dtype=np.float32) so cache round-trip works for Entropy/Distribution calibration, where hist_edges.min()/.max() are serialized as numpy scalars and deserialize as plain Python floats. - save_tensors_data: wrap json.dump + os.replace in try/except BaseException that unlinks the .tmp file on failure, so partial serialization (or KeyboardInterrupt mid-write) does not leave stray .tmp files behind. 
--- onnxruntime/python/tools/quantization/calibrate.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index 453fa82188f55..fc14a589e7ff5 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -109,6 +109,8 @@ def from_dict(cls, d: dict) -> "TensorData": value = v if isinstance(value, dict) and value.get("CLS") == "numpy.array": value = np.array(value["data"], dtype=np.dtype(value["dtype"])) + elif k in cls._floats and isinstance(value, (int, float)): + value = np.array(value, dtype=np.float32) kwargs[k] = value return cls(**kwargs) @@ -232,10 +234,14 @@ def save_tensors_data(tensors_data: "TensorsData", path: "str | Path") -> None: path = Path(path) path.parent.mkdir(parents=True, exist_ok=True) tmp = path.with_suffix(path.suffix + ".tmp") - with tmp.open("w") as f: - json.dump(tensors_data, f, cls=_CalibrationCacheEncoder) - f.flush() - os.replace(tmp, path) + try: + with tmp.open("w") as f: + json.dump(tensors_data, f, cls=_CalibrationCacheEncoder) + f.flush() + os.replace(tmp, path) + except BaseException: + tmp.unlink(missing_ok=True) + raise def load_tensors_data(path: "str | Path") -> "TensorsData": From c8d11ce14fbc05f6a2d556b8255b44adaff9a61e Mon Sep 17 00:00:00 2001 From: Rishi Dave Date: Mon, 4 May 2026 11:35:29 +0000 Subject: [PATCH 4/6] refactor(quantization): share calibration JSON encoder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback on PR #28221 by hoisting the previously private `_CalibrationCacheEncoder` in `calibrate.py` to a module-level `CalibrationCacheEncoder` and reusing it from `quant_utils.write_calibration_table` instead of redefining a narrower inner `MyEncoder`. 
The shared encoder is a superset of the old inner one — `write_calibration_table` now also serializes bare numpy scalar types (np.integer, np.floating) that previously would have raised TypeError. This is the intended consolidation: both helpers now produce identical JSON. Also parametrize the invalid-path test against `TemporaryDirectory` so it no longer relies on a hardcoded POSIX absolute path, making it deterministic on Windows. --- .../python/tools/quantization/calibrate.py | 11 ++++++++--- .../python/tools/quantization/quant_utils.py | 17 +++++------------ .../python/quantization/test_calibration.py | 3 ++- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index fc14a589e7ff5..b92eccd2989aa 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -212,8 +212,13 @@ def set_range(self, start_index: int, end_index: int): raise NotImplementedError -class _CalibrationCacheEncoder(json.JSONEncoder): - """JSON encoder for calibration cache serialization.""" +class CalibrationCacheEncoder(json.JSONEncoder): + """Shared JSON encoder for calibration caches. + + Handles numpy ndarrays and numpy scalar types (integer/floating) so + calibration JSON output is consistent across ``save_tensors_data`` and + ``quant_utils.write_calibration_table``.
+ """ def default(self, obj): if isinstance(obj, (TensorData, TensorsData)): @@ -236,7 +241,7 @@ def save_tensors_data(tensors_data: "TensorsData", path: "str | Path") -> None: tmp = path.with_suffix(path.suffix + ".tmp") try: with tmp.open("w") as f: - json.dump(tensors_data, f, cls=_CalibrationCacheEncoder) + json.dump(tensors_data, f, cls=CalibrationCacheEncoder) f.flush() os.replace(tmp, path) except BaseException: diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index 0ce1e1a0d75de..c8deb0d3e395a 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -796,21 +796,14 @@ def write_calibration_table(calibration_cache, dir="."): import onnxruntime.quantization.CalTableFlatBuffers.KeyValue as KeyValue # noqa: PLC0415 import onnxruntime.quantization.CalTableFlatBuffers.TrtTable as TrtTable # noqa: PLC0415 - from onnxruntime.quantization.calibrate import CalibrationMethod, TensorData, TensorsData # noqa: PLC0415 + + # Use the shared encoder from calibrate.py so write_calibration_table and + # save_tensors_data produce identical JSON for numpy scalar/array values. 
+ from onnxruntime.quantization.calibrate import CalibrationCacheEncoder # noqa: PLC0415 logging.info(f"calibration cache: {calibration_cache}") - class MyEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, (TensorData, TensorsData)): - return obj.to_dict() - if isinstance(obj, np.ndarray): - return {"data": obj.tolist(), "dtype": str(obj.dtype), "CLS": "numpy.array"} - if isinstance(obj, CalibrationMethod): - return {"CLS": obj.__class__.__name__, "value": str(obj)} - return json.JSONEncoder.default(self, obj) - - json_data = json.dumps(calibration_cache, cls=MyEncoder) + json_data = json.dumps(calibration_cache, cls=CalibrationCacheEncoder) with open(os.path.join(dir, "calibration.json"), "w") as file: file.write(json_data) # use `json.loads` to do the reverse diff --git a/onnxruntime/test/python/quantization/test_calibration.py b/onnxruntime/test/python/quantization/test_calibration.py index 41f5624bd6eac..e74fe1554e267 100644 --- a/onnxruntime/test/python/quantization/test_calibration.py +++ b/onnxruntime/test/python/quantization/test_calibration.py @@ -604,8 +604,9 @@ def test_save_load_tensors_data_entropy_roundtrip(self): np.testing.assert_array_almost_equal(loaded["y"].hist_edges, hist_edges) def test_load_tensors_data_invalid_path(self): + bogus = Path(self._tmp_dir.name) / "does_not_exist.json" with self.assertRaises(FileNotFoundError): - load_tensors_data("/nonexistent/path/cache.json") + load_tensors_data(bogus) def test_quantize_static_calibration_cache_path(self): model_path = Path(self._tmp_dir.name) / "tiny_model.onnx" From 91c30c6189a16a76cf2e4c766168004eb6981e0e Mon Sep 17 00:00:00 2001 From: Rishi Dave Date: Tue, 5 May 2026 11:18:23 +0000 Subject: [PATCH 5/6] fix(quantization): use unique temp filename for calibration cache writes Previously save_tensors_data used a deterministic '.tmp' suffix for the intermediate write. 
If two processes wrote to the same calibration cache path concurrently, they raced on the same temp filename and the atomic-write guarantee was lost (one writer could unlink the partial file of another). Switch to tempfile.mkstemp in the destination directory so each writer gets a unique temp file. Restore prior 0o644 permissions before os.replace since mkstemp defaults to 0o600. Use contextlib.suppress for the FileNotFoundError cleanup path. --- onnxruntime/python/tools/quantization/calibrate.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index b92eccd2989aa..e35fdc545c5ec 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -5,10 +5,12 @@ # license information. # -------------------------------------------------------------------------- import abc +import contextlib import copy import itertools import json import os +import tempfile import uuid from collections.abc import Sequence from enum import Enum @@ -238,14 +240,16 @@ def save_tensors_data(tensors_data: "TensorsData", path: "str | Path") -> None: """Serialize calibration tensor ranges to a JSON file at *path*.""" path = Path(path) path.parent.mkdir(parents=True, exist_ok=True) - tmp = path.with_suffix(path.suffix + ".tmp") + fd, tmp_name = tempfile.mkstemp(dir=path.parent, prefix=".calibcache_", suffix=".tmp") try: - with tmp.open("w") as f: + with os.fdopen(fd, "w") as f: json.dump(tensors_data, f, cls=CalibrationCacheEncoder) f.flush() - os.replace(tmp, path) + os.chmod(tmp_name, 0o644) + os.replace(tmp_name, path) except BaseException: - tmp.unlink(missing_ok=True) + with contextlib.suppress(FileNotFoundError): + os.unlink(tmp_name) raise From 40d6038528cef1c385f3f50ebb3c7436da1c9b1c Mon Sep 17 00:00:00 2001 From: Rishi Dave Date: Wed, 6 May 2026 11:24:52 +0000 Subject: [PATCH 6/6] 
fix(quantization): guard calibration cache against SmoothQuant mismatch - Persist `smooth_quant` flag in the calibration cache JSON payload. - On load, if the cached flag differs from the current run, log a warning and recompute (treat as a cache miss); a missing field is assumed False for backwards compatibility with caches written by earlier revisions of this PR. - Drop the explicit `os.chmod(..., 0o644)` in save_tensors_data; the tempfile is created with mode 0o600 by mkstemp and os.replace preserves that mode, so the final cache file stays owner-only (CodeQL flagged 0o644 as overly permissive). - Add unit tests covering field persistence, mismatch->recompute+warn, matching->cache hit, and legacy (missing field)->cache hit. --- .../python/tools/quantization/calibrate.py | 13 +- .../python/tools/quantization/quantize.py | 34 ++++-- .../python/quantization/test_calibration.py | 113 ++++++++++++++++++ 3 files changed, 147 insertions(+), 13 deletions(-) diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index e35fdc545c5ec..305804661cf64 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -236,16 +236,21 @@ def default(self, obj): return json.JSONEncoder.default(self, obj) -def save_tensors_data(tensors_data: "TensorsData", path: "str | Path") -> None: - """Serialize calibration tensor ranges to a JSON file at *path*.""" +def save_tensors_data(tensors_data: "TensorsData", path: "str | Path", *, smooth_quant: bool = False) -> None: + """Serialize calibration tensor ranges to a JSON file at *path*. + + :param smooth_quant: whether the producing run used SmoothQuant. Stored in + the cache so a later load can detect a mismatch and recompute.
+ """ path = Path(path) path.parent.mkdir(parents=True, exist_ok=True) fd, tmp_name = tempfile.mkstemp(dir=path.parent, prefix=".calibcache_", suffix=".tmp") try: with os.fdopen(fd, "w") as f: - json.dump(tensors_data, f, cls=CalibrationCacheEncoder) + payload = tensors_data.to_dict() + payload["smooth_quant"] = smooth_quant + json.dump(payload, f, cls=CalibrationCacheEncoder) f.flush() - os.chmod(tmp_name, 0o644) os.replace(tmp_name, path) except BaseException: with contextlib.suppress(FileNotFoundError): diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index b80451f275acd..d6b2ecb2b17ed 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -6,6 +6,7 @@ from __future__ import annotations import copy +import json import logging import tempfile from collections.abc import Callable @@ -727,17 +728,32 @@ def inc_dataloader(): if _cache_path is not None and _cache_path.exists() and not _cache_path.is_file(): raise ValueError(f"calibration_cache_path is not a file: {_cache_path}") _cache_hit = _cache_path is not None and _cache_path.is_file() + _smooth_quant = bool(extra_options.get("SmoothQuant", False)) if _cache_hit: - tensors_range = load_tensors_data(_cache_path) - if tensors_range.calibration_method != calibrate_method: - raise ValueError( - f"Calibration cache at {_cache_path} was produced with " - f"{tensors_range.calibration_method}, but quantize_static was called " - f"with calibrate_method={calibrate_method}. Delete the cache or " - f"pass a matching calibrate_method." + with _cache_path.open("r") as _f: + _raw = json.load(_f) + _cached_sq = bool(_raw.get("smooth_quant", False)) + if _cached_sq != _smooth_quant: + logging.warning( + "Calibration cache at %s was produced with smooth_quant=%s; " + "current run uses smooth_quant=%s. 
Recomputing ranges and overwriting cache.", + _cache_path, + _cached_sq, + _smooth_quant, ) - else: + _cache_hit = False + else: + tensors_range = load_tensors_data(_cache_path) + if tensors_range.calibration_method != calibrate_method: + raise ValueError( + f"Calibration cache at {_cache_path} was produced with " + f"{tensors_range.calibration_method}, but quantize_static was called " + f"with calibrate_method={calibrate_method}. Delete the cache or " + f"pass a matching calibrate_method." + ) + + if not _cache_hit: if calibration_data_reader is None: raise ValueError("Either calibration_data_reader or an existing calibration_cache_path must be provided.") with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir: @@ -784,7 +800,7 @@ def inc_dataloader(): del calibrator if _cache_path is not None: - save_tensors_data(tensors_range, _cache_path) + save_tensors_data(tensors_range, _cache_path, smooth_quant=_smooth_quant) check_static_quant_arguments(quant_format, activation_type, weight_type) diff --git a/onnxruntime/test/python/quantization/test_calibration.py b/onnxruntime/test/python/quantization/test_calibration.py index e74fe1554e267..7afb77f9dfab2 100644 --- a/onnxruntime/test/python/quantization/test_calibration.py +++ b/onnxruntime/test/python/quantization/test_calibration.py @@ -5,6 +5,8 @@ # license information. 
# -------------------------------------------------------------------------- +import json +import logging import tempfile import unittest from pathlib import Path @@ -696,6 +698,117 @@ def test_load_tensors_data_method_mismatch_raises(self): calibration_cache_path=cache_path, ) + def test_save_tensors_data_writes_smooth_quant_field(self): + """save_tensors_data persists the smooth_quant flag in the JSON payload.""" + td = TensorsData( + CalibrationMethod.MinMax, + {"x": TensorData(lowest=np.array(-1.0, dtype=np.float32), highest=np.array(1.0, dtype=np.float32))}, + ) + for flag in (False, True): + cache_path = Path(self._tmp_dir.name) / f"sq_{flag}_cache.json" + save_tensors_data(td, cache_path, smooth_quant=flag) + with cache_path.open("r") as f: + raw = json.load(f) + self.assertIn("smooth_quant", raw) + self.assertEqual(raw["smooth_quant"], flag) + + def test_smooth_quant_mismatch_triggers_recompute(self): + """Cache produced with smooth_quant=True must not be used for a smooth_quant=False run.""" + model_path = Path(self._tmp_dir.name) / "sq_mismatch_model.onnx" + self._make_simple_model(str(model_path)) + cache_path = Path(self._tmp_dir.name) / "sq_mismatch_cache.json" + out1_path = Path(self._tmp_dir.name) / "sq_mismatch_out1.onnx" + + # Write a cache that claims smooth_quant=True by injecting the field directly. + td = TensorsData( + CalibrationMethod.MinMax, + {"x": TensorData(lowest=np.array(-1.0, dtype=np.float32), highest=np.array(1.0, dtype=np.float32))}, + ) + save_tensors_data(td, cache_path, smooth_quant=True) + with cache_path.open("r") as f: + self.assertEqual(json.load(f)["smooth_quant"], True) + + # Run with smooth_quant=False (default): the cache must be treated as a miss. + # We supply a real data_reader so recompute can proceed. 
+ data_reader = TestDataReader() + with self.assertLogs("root", level=logging.WARNING) as log_cm: + quantize_static( + str(model_path), + str(out1_path), + calibration_data_reader=data_reader, + calibration_cache_path=cache_path, + # SmoothQuant not set -> defaults to False + ) + # At least one WARNING about the mismatch must have been emitted. + self.assertTrue( + any("smooth_quant" in msg for msg in log_cm.output), + msg=f"Expected smooth_quant warning; got: {log_cm.output}", + ) + # The rewritten cache must now have smooth_quant=False. + with cache_path.open("r") as f: + self.assertEqual(json.load(f)["smooth_quant"], False) + + def test_smooth_quant_match_produces_cache_hit(self): + """Cache with smooth_quant=False is reused when the run also uses smooth_quant=False.""" + model_path = Path(self._tmp_dir.name) / "sq_hit_model.onnx" + self._make_simple_model(str(model_path)) + cache_path = Path(self._tmp_dir.name) / "sq_hit_cache.json" + out1_path = Path(self._tmp_dir.name) / "sq_hit_out1.onnx" + out2_path = Path(self._tmp_dir.name) / "sq_hit_out2.onnx" + + # First run: write cache with smooth_quant=False (the default). + data_reader = TestDataReader() + quantize_static( + str(model_path), + str(out1_path), + calibration_data_reader=data_reader, + calibration_cache_path=cache_path, + ) + with cache_path.open("r") as f: + self.assertEqual(json.load(f)["smooth_quant"], False) + + # Second run: no data_reader, cache should be a hit (smooth_quant matches). 
+ quantize_static( + str(model_path), + str(out2_path), + calibration_data_reader=None, + calibration_cache_path=cache_path, + ) + self.assertTrue(out2_path.exists()) + + def test_old_cache_without_smooth_quant_field_treated_as_false(self): + """A legacy cache without a smooth_quant key is assumed smooth_quant=False.""" + model_path = Path(self._tmp_dir.name) / "legacy_sq_model.onnx" + self._make_simple_model(str(model_path)) + cache_path = Path(self._tmp_dir.name) / "legacy_sq_cache.json" + out1_path = Path(self._tmp_dir.name) / "legacy_sq_out1.onnx" + out2_path = Path(self._tmp_dir.name) / "legacy_sq_out2.onnx" + + # First run: populate a real cache against the actual model so tensor names match. + data_reader = TestDataReader() + quantize_static( + str(model_path), + str(out1_path), + calibration_data_reader=data_reader, + calibration_cache_path=cache_path, + ) + + # Strip the smooth_quant field to simulate a legacy cache file. + with cache_path.open("r") as f: + raw = json.load(f) + raw.pop("smooth_quant", None) + with cache_path.open("w") as f: + json.dump(raw, f) + + # A run with smooth_quant=False (default) must treat the legacy cache as a hit (no recompute needed). + quantize_static( + str(model_path), + str(out2_path), + calibration_data_reader=None, + calibration_cache_path=cache_path, + ) + self.assertTrue(out2_path.exists()) + if __name__ == "__main__": unittest.main()