From becce1f7f467fe96e67bc2848cff2f61072aafa4 Mon Sep 17 00:00:00 2001
From: Rishi Dave <rishipdave@gmail.com>
Date: Fri, 24 Apr 2026 11:14:26 +0000
Subject: [PATCH 1/6] feat(quantization): add calibration cache to
 quantize_static

Introduce an optional calibration_cache_path parameter on quantize_static
so users can save the computed TensorsData after calibration and reload
it on subsequent runs. This avoids repeating the expensive calibration
inference pass when only post-calibration knobs (e.g. nodes_to_exclude,
quant types) change between runs.

The cache is a human-readable JSON file whose schema mirrors the encoder
used by write_calibration_table: TensorData / TensorsData round-trip
through new from_dict classmethods and module-level save_tensors_data /
load_tensors_data helpers in calibrate.py. calibration_data_reader is now
optional; a reader, an existing cache file, or both must be provided.

Fixes #21908
---
 .../python/tools/quantization/__init__.py     |   4 +
 .../python/tools/quantization/calibrate.py    |  64 +++++++
 .../python/tools/quantization/quantize.py     | 116 +++++++-----
 .../python/quantization/test_calibration.py   | 170 +++++++++++++++++-
 4 files changed, 311 insertions(+), 43 deletions(-)

diff --git a/onnxruntime/python/tools/quantization/__init__.py b/onnxruntime/python/tools/quantization/__init__.py
index ac99de348f612..50b0bd08ae360 100644
--- a/onnxruntime/python/tools/quantization/__init__.py
+++ b/onnxruntime/python/tools/quantization/__init__.py
@@ -3,7 +3,11 @@
     CalibrationDataReader,
     CalibrationMethod,
     MinMaxCalibrater,
+    TensorData,
+    TensorsData,
     create_calibrator,
+    load_tensors_data,
+    save_tensors_data,
 )
 from .qdq_quantizer import QDQQuantizer  # noqa: F401
 from .quant_utils import QuantFormat, QuantType, write_calibration_table  # noqa: F401
diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py
index 05a5b0873d93d..f266922574a94 100644
--- a/onnxruntime/python/tools/quantization/calibrate.py
+++ b/onnxruntime/python/tools/quantization/calibrate.py
@@ -7,6 +7,7 @@
 import abc
 import copy
 import itertools
+import json
 import os
 import uuid
 from collections.abc import Sequence
@@ -98,6 +99,19 @@ def to_dict(self):
         data["CLS"] = self.__class__.__name__
         return data
 
+    @classmethod
+    def from_dict(cls, d: dict) -> "TensorData":
+        """Reconstruct a TensorData from a dict produced by to_dict()."""
+        kwargs = {}
+        for k, v in d.items():
+            if k == "CLS":
+                continue
+            value = v
+            if isinstance(value, dict) and value.get("CLS") == "numpy.array":
+                value = np.array(value["data"], dtype=np.dtype(value["dtype"]))
+            kwargs[k] = value
+        return cls(**kwargs)
+
 
 class TensorsData:
     def __init__(self, calibration_method, data: dict[str, TensorData | tuple]):
@@ -150,6 +164,18 @@ def to_dict(self):
         }
         return data
 
+    @classmethod
+    def from_dict(cls, d: dict) -> "TensorsData":
+        """Reconstruct a TensorsData from a dict produced by to_dict()."""
+        method_val = d["calibration_method"]
+        if isinstance(method_val, dict) and method_val.get("CLS") == "CalibrationMethod":
+            name = method_val["value"].split(".")[-1]
+            method = CalibrationMethod[name]
+        else:
+            method = method_val
+        reconstructed = {k: TensorData.from_dict(v) for k, v in d["data"].items()}
+        return cls(method, reconstructed)
+
 
 class CalibrationMethod(Enum):
     MinMax = 0
@@ -184,6 +210,44 @@ def set_range(self, start_index: int, end_index: int):
         raise NotImplementedError
 
 
+class _CalibrationCacheEncoder(json.JSONEncoder):
+    """JSON encoder for calibration cache serialization."""
+
+    def default(self, obj):
+        if isinstance(obj, (TensorData, TensorsData)):
+            return obj.to_dict()
+        if isinstance(obj, np.ndarray):
+            return {"data": obj.tolist(), "dtype": str(obj.dtype), "CLS": "numpy.array"}
+        if isinstance(obj, CalibrationMethod):
+            return {"CLS": obj.__class__.__name__, "value": str(obj)}
+        if isinstance(obj, np.integer):
+            return int(obj)
+        if isinstance(obj, np.floating):
+            return float(obj)
+        return json.JSONEncoder.default(self, obj)
+
+
+def save_tensors_data(tensors_data: "TensorsData", path: "str | Path") -> None:
+    """Serialize calibration tensor ranges to a JSON file at *path*."""
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp = path.with_suffix(path.suffix + ".tmp")
+    with tmp.open("w") as f:
+        json.dump(tensors_data, f, cls=_CalibrationCacheEncoder)
+        f.flush()
+    os.replace(tmp, path)
+
+
+def load_tensors_data(path: "str | Path") -> "TensorsData":
+    """Load calibration tensor ranges from a JSON file written by save_tensors_data()."""
+    path = Path(path)
+    if not path.exists():
+        raise FileNotFoundError(f"Calibration cache not found: {path}")
+    with path.open("r") as f:
+        d = json.load(f)
+    return TensorsData.from_dict(d)
+
+
 class CalibraterBase:
     def __init__(
         self,
diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
index b8b239b85e7ad..f10a9f1fc561a 100644
--- a/onnxruntime/python/tools/quantization/quantize.py
+++ b/onnxruntime/python/tools/quantization/quantize.py
@@ -14,7 +14,14 @@
 
 import onnx
 
-from .calibrate import CalibrationDataReader, CalibrationMethod, TensorsData, create_calibrator
+from .calibrate import (
+    CalibrationDataReader,
+    CalibrationMethod,
+    TensorsData,
+    create_calibrator,
+    load_tensors_data,
+    save_tensors_data,
+)
 from .onnx_quantizer import ONNXQuantizer
 from .qdq_quantizer import QDQQuantizer
 from .quant_utils import (
@@ -479,7 +486,7 @@ def check_static_quant_arguments(quant_format: QuantFormat, activation_type: Qua
 def quantize_static(
     model_input: str | Path | onnx.ModelProto,
     model_output: str | Path,
-    calibration_data_reader: CalibrationDataReader,
+    calibration_data_reader: CalibrationDataReader | None = None,
     quant_format=QuantFormat.QDQ,
     op_types_to_quantize=None,
     per_channel=False,
@@ -492,6 +499,7 @@ def quantize_static(
     calibrate_method=CalibrationMethod.MinMax,
     calibration_providers=None,
     extra_options=None,
+    calibration_cache_path: str | Path | None = None,
 ):
     """
     Given an onnx model and calibration data reader, create a quantized onnx model and save it into a file
@@ -506,7 +514,13 @@ def quantize_static(
         model_output: file path of quantized model
         calibration_data_reader: a calibration data reader. It
             enumerates calibration data and generates inputs for the
-            original model.
+            original model. May be None if calibration_cache_path points to an
+            existing cache file.
+        calibration_cache_path: optional path to a JSON calibration cache. If
+            the file already exists, calibration inference is skipped and the
+            cached tensor ranges are loaded instead. If the file does not yet
+            exist, calibration runs normally and the result is saved to this
+            path for future reuse.
         quant_format: QuantFormat{QOperator, QDQ}.
             QOperator format quantizes the model with quantized operators directly.
             QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
@@ -704,48 +718,66 @@ def inc_dataloader():
     if is_model_updated:
         model = updated_model
 
-    with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
-        if is_model_updated:
-            # Update model_input and avoid to use the original one
-            model_input = copy.deepcopy(model)
-
-        if isinstance(model_input, onnx.ModelProto):
-            output_path = Path(quant_tmp_dir).joinpath("model_input.onnx").as_posix()
-            onnx.save_model(
-                model_input,
-                output_path,
-                save_as_external_data=True,
+    _cache_path = Path(calibration_cache_path) if calibration_cache_path is not None else None
+    _cache_hit = _cache_path is not None and _cache_path.exists()
+
+    if _cache_hit:
+        tensors_range = load_tensors_data(_cache_path)
+        if tensors_range.calibration_method != calibrate_method:
+            raise ValueError(
+                f"Calibration cache at {_cache_path} was produced with "
+                f"{tensors_range.calibration_method}, but quantize_static was called "
+                f"with calibrate_method={calibrate_method}. Delete the cache or "
+                f"pass a matching calibrate_method."
+            )
+    else:
+        if calibration_data_reader is None:
+            raise ValueError("Either calibration_data_reader or an existing calibration_cache_path must be provided.")
+        with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
+            if is_model_updated:
+                # Update model_input and avoid to use the original one
+                model_input = copy.deepcopy(model)
+
+            if isinstance(model_input, onnx.ModelProto):
+                output_path = Path(quant_tmp_dir).joinpath("model_input.onnx").as_posix()
+                onnx.save_model(
+                    model_input,
+                    output_path,
+                    save_as_external_data=True,
+                )
+                model_input = output_path
+
+            calibrator = create_calibrator(
+                Path(model_input),
+                op_types_to_quantize,
+                augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(),
+                calibrate_method=calibrate_method,
+                use_external_data_format=use_external_data_format,
+                providers=calibration_providers,
+                extra_options=calib_extra_options,
             )
-            model_input = output_path
-
-        calibrator = create_calibrator(
-            Path(model_input),
-            op_types_to_quantize,
-            augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(),
-            calibrate_method=calibrate_method,
-            use_external_data_format=use_external_data_format,
-            providers=calibration_providers,
-            extra_options=calib_extra_options,
-        )
-
-        stride = extra_options.get("CalibStridedMinMax", None)
-        if stride:
-            total_data_size = len(calibration_data_reader)
-            if total_data_size % stride != 0:
-                raise ValueError(f"Total data size ({total_data_size}) is not divisible by stride size ({stride}).")
 
-            for start in range(0, total_data_size, stride):
-                end_index = start + stride
-                calibration_data_reader.set_range(start_index=start, end_index=end_index)
+            stride = extra_options.get("CalibStridedMinMax", None)
+            if stride:
+                total_data_size = len(calibration_data_reader)
+                if total_data_size % stride != 0:
+                    raise ValueError(f"Total data size ({total_data_size}) is not divisible by stride size ({stride}).")
+
+                for start in range(0, total_data_size, stride):
+                    end_index = start + stride
+                    calibration_data_reader.set_range(start_index=start, end_index=end_index)
+                    calibrator.collect_data(calibration_data_reader)
+            else:
                 calibrator.collect_data(calibration_data_reader)
-        else:
-            calibrator.collect_data(calibration_data_reader)
-        tensors_range = calibrator.compute_data()
-        if not isinstance(tensors_range, TensorsData):
-            raise TypeError(
-                f"Unexpected type {type(tensors_range)} for tensors_range and calibrator={type(calibrator)}."
-            )
-        del calibrator
+            tensors_range = calibrator.compute_data()
+            if not isinstance(tensors_range, TensorsData):
+                raise TypeError(
+                    f"Unexpected type {type(tensors_range)} for tensors_range and calibrator={type(calibrator)}."
+                )
+            del calibrator
+
+        if _cache_path is not None:
+            save_tensors_data(tensors_range, _cache_path)
 
     check_static_quant_arguments(quant_format, activation_type, weight_type)
 
diff --git a/onnxruntime/test/python/quantization/test_calibration.py b/onnxruntime/test/python/quantization/test_calibration.py
index 60c5f9d404258..41f5624bd6eac 100644
--- a/onnxruntime/test/python/quantization/test_calibration.py
+++ b/onnxruntime/test/python/quantization/test_calibration.py
@@ -14,7 +14,16 @@
 from onnx import TensorProto, helper, numpy_helper
 
 import onnxruntime
-from onnxruntime.quantization.calibrate import CalibrationDataReader, CalibrationMethod, create_calibrator
+from onnxruntime.quantization import quantize_static
+from onnxruntime.quantization.calibrate import (
+    CalibrationDataReader,
+    CalibrationMethod,
+    TensorData,
+    TensorsData,
+    create_calibrator,
+    load_tensors_data,
+    save_tensors_data,
+)
 
 
 def generate_input_initializer(tensor_shape, tensor_dtype, input_name):
@@ -528,5 +537,164 @@ def test_compute_data_per_channel(self):
             np.testing.assert_equal(min_max, tensors_range[output_name].range_value)
 
 
+class TestCalibrationCache(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls._tmp_dir = tempfile.TemporaryDirectory(prefix="test_calibration_cache.")
+
+    @classmethod
+    def tearDownClass(cls):
+        cls._tmp_dir.cleanup()
+
+    def _make_simple_model(self, path):
+        """Build a tiny Conv+Relu model for end-to-end cache tests."""
+        vi_input = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 3, 1, 3])
+        vi_output = helper.make_tensor_value_info("X6", TensorProto.FLOAT, [1, 3, 1, 3])
+        w1 = generate_input_initializer([3, 3, 1, 1], np.float32, "W1")
+        b1 = generate_input_initializer([3], np.float32, "B1")
+        conv_node = helper.make_node("Conv", ["input", "W1", "B1"], ["X2"], name="Conv1")
+        relu_node = helper.make_node("Relu", ["X2"], ["X6"], name="Relu1")
+        graph = helper.make_graph([conv_node, relu_node], "cache_test_graph", [vi_input], [vi_output])
+        graph.initializer.add().CopyFrom(w1)
+        graph.initializer.add().CopyFrom(b1)
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+        onnx.save(model, path)
+
+    def test_save_load_tensors_data_minmax_roundtrip(self):
+        td = TensorsData(
+            CalibrationMethod.MinMax,
+            {"x": TensorData(lowest=np.array(-1.0, dtype=np.float32), highest=np.array(2.0, dtype=np.float32))},
+        )
+        cache_path = Path(self._tmp_dir.name) / "minmax_cache.json"
+        save_tensors_data(td, cache_path)
+        self.assertTrue(cache_path.exists())
+
+        loaded = load_tensors_data(cache_path)
+        self.assertEqual(loaded.calibration_method, CalibrationMethod.MinMax)
+        self.assertEqual(list(loaded.keys()), ["x"])
+        lo, hi = loaded["x"].range_value
+        np.testing.assert_array_equal(lo, np.array(-1.0, dtype=np.float32))
+        np.testing.assert_array_equal(hi, np.array(2.0, dtype=np.float32))
+        self.assertEqual(lo.shape, ())
+        self.assertEqual(hi.shape, ())
+
+    def test_save_load_tensors_data_entropy_roundtrip(self):
+        hist = np.array([1.0, 2.0, 3.0], dtype=np.float32)
+        hist_edges = np.array([0.0, 1.0, 2.0, 3.0], dtype=np.float32)
+        td = TensorsData(
+            CalibrationMethod.Entropy,
+            {
+                "y": TensorData(
+                    lowest=np.array(-0.5, dtype=np.float32),
+                    highest=np.array(0.5, dtype=np.float32),
+                    hist=hist,
+                    hist_edges=hist_edges,
+                )
+            },
+        )
+        cache_path = Path(self._tmp_dir.name) / "entropy_cache.json"
+        save_tensors_data(td, cache_path)
+
+        loaded = load_tensors_data(cache_path)
+        self.assertEqual(loaded.calibration_method, CalibrationMethod.Entropy)
+        lo, hi = loaded["y"].range_value
+        np.testing.assert_array_almost_equal(lo, np.array(-0.5, dtype=np.float32))
+        np.testing.assert_array_almost_equal(hi, np.array(0.5, dtype=np.float32))
+        np.testing.assert_array_almost_equal(loaded["y"].hist, hist)
+        np.testing.assert_array_almost_equal(loaded["y"].hist_edges, hist_edges)
+
+    def test_load_tensors_data_invalid_path(self):
+        with self.assertRaises(FileNotFoundError):
+            load_tensors_data("/nonexistent/path/cache.json")
+
+    def test_quantize_static_calibration_cache_path(self):
+        model_path = Path(self._tmp_dir.name) / "tiny_model.onnx"
+        self._make_simple_model(str(model_path))
+
+        cache_path = Path(self._tmp_dir.name) / "quant_cache.json"
+        out1_path = Path(self._tmp_dir.name) / "quantized1.onnx"
+        out2_path = Path(self._tmp_dir.name) / "quantized2.onnx"
+
+        # First call: calibration_data_reader provided, cache written
+        data_reader = TestDataReader()
+        quantize_static(
+            str(model_path),
+            str(out1_path),
+            calibration_data_reader=data_reader,
+            calibration_cache_path=cache_path,
+        )
+        self.assertTrue(cache_path.exists())
+        td1 = load_tensors_data(cache_path)
+
+        # Second call: no data_reader, load from cache
+        quantize_static(
+            str(model_path),
+            str(out2_path),
+            calibration_data_reader=None,
+            calibration_cache_path=cache_path,
+        )
+        self.assertTrue(out2_path.exists())
+        td2 = load_tensors_data(cache_path)
+        self.assertEqual(td1.calibration_method, td2.calibration_method)
+
+    def test_quantize_static_no_reader_no_cache_raises(self):
+        model_path = Path(self._tmp_dir.name) / "tiny_model2.onnx"
+        self._make_simple_model(str(model_path))
+        out_path = Path(self._tmp_dir.name) / "quantized_err.onnx"
+
+        with self.assertRaises(ValueError):
+            quantize_static(str(model_path), str(out_path), calibration_data_reader=None)
+
+    def test_save_tensors_data_creates_parent_dir(self):
+        nested_path = Path(self._tmp_dir.name) / "nested" / "dir" / "cache.json"
+        td = TensorsData(
+            CalibrationMethod.MinMax,
+            {"x": TensorData(lowest=np.array(-1.0, dtype=np.float32), highest=np.array(1.0, dtype=np.float32))},
+        )
+        save_tensors_data(td, nested_path)
+        self.assertTrue(nested_path.exists())
+
+    def test_save_tensors_data_handles_scalar_bins(self):
+        td = TensorsData(
+            CalibrationMethod.Entropy,
+            {
+                "z": TensorData(
+                    lowest=np.array(0.0, dtype=np.float32),
+                    highest=np.array(1.0, dtype=np.float32),
+                    hist=np.array([1, 2], dtype=np.int64),
+                    bins=np.int64(5),
+                )
+            },
+        )
+        cache_path = Path(self._tmp_dir.name) / "scalar_bins_cache.json"
+        save_tensors_data(td, cache_path)
+        loaded = load_tensors_data(cache_path)
+        self.assertEqual(loaded["z"].bins, 5)
+
+    def test_load_tensors_data_method_mismatch_raises(self):
+        model_path = Path(self._tmp_dir.name) / "tiny_mismatch.onnx"
+        self._make_simple_model(str(model_path))
+        cache_path = Path(self._tmp_dir.name) / "mismatch_cache.json"
+        out_path = Path(self._tmp_dir.name) / "quantized_mismatch.onnx"
+
+        data_reader = TestDataReader()
+        quantize_static(
+            str(model_path),
+            str(out_path),
+            calibration_data_reader=data_reader,
+            calibrate_method=CalibrationMethod.MinMax,
+            calibration_cache_path=cache_path,
+        )
+
+        with self.assertRaises(ValueError):
+            quantize_static(
+                str(model_path),
+                str(out_path),
+                calibration_data_reader=None,
+                calibrate_method=CalibrationMethod.Entropy,
+                calibration_cache_path=cache_path,
+            )
+
+
 if __name__ == "__main__":
     unittest.main()

From aa4521fcb36142e58443d02498a472c0de1acfce Mon Sep 17 00:00:00 2001
From: Rishi Dave <rishipdave@gmail.com>
Date: Fri, 1 May 2026 00:58:42 +0000
Subject: [PATCH 2/6] fix(quantization): tighten calibration cache path
 validation

- load_tensors_data: reject non-file paths up front with a ValueError instead
  of letting Path.open raise IsADirectoryError or similar.
- quantize_static: when extra_options['SmoothQuant']=True, require a non-None
  calibration_data_reader since the cache stores per-tensor ranges only and
  cannot drive the SmoothQuant transform.
- quantize_static: treat the cache path as a hit only when it is a regular
  file; raise ValueError if it exists but is e.g. a directory, so callers get
  a clear message instead of a low-level IOError.
---
 onnxruntime/python/tools/quantization/calibrate.py | 2 ++
 onnxruntime/python/tools/quantization/quantize.py  | 9 ++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py
index f266922574a94..453fa82188f55 100644
--- a/onnxruntime/python/tools/quantization/calibrate.py
+++ b/onnxruntime/python/tools/quantization/calibrate.py
@@ -243,6 +243,8 @@ def load_tensors_data(path: "str | Path") -> "TensorsData":
     path = Path(path)
     if not path.exists():
         raise FileNotFoundError(f"Calibration cache not found: {path}")
+    if not path.is_file():
+        raise ValueError(f"Calibration cache path is not a file: {path}")
     with path.open("r") as f:
         d = json.load(f)
     return TensorsData.from_dict(d)
diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
index f10a9f1fc561a..b80451f275acd 100644
--- a/onnxruntime/python/tools/quantization/quantize.py
+++ b/onnxruntime/python/tools/quantization/quantize.py
@@ -687,6 +687,11 @@ def quantize_static(
     }
 
     if extra_options.get("SmoothQuant", False):
+        if calibration_data_reader is None:
+            raise ValueError(
+                "SmoothQuant requires a non-None calibration_data_reader; the calibration cache "
+                "stores per-tensor ranges only and cannot drive the SmoothQuant transform."
+            )
         import importlib  # noqa: PLC0415
 
         try:
@@ -719,7 +724,9 @@ def inc_dataloader():
         model = updated_model
 
     _cache_path = Path(calibration_cache_path) if calibration_cache_path is not None else None
-    _cache_hit = _cache_path is not None and _cache_path.exists()
+    if _cache_path is not None and _cache_path.exists() and not _cache_path.is_file():
+        raise ValueError(f"calibration_cache_path is not a file: {_cache_path}")
+    _cache_hit = _cache_path is not None and _cache_path.is_file()
 
     if _cache_hit:
         tensors_range = load_tensors_data(_cache_path)

From 755a1c916ef9fac61a734cdb2f10314dceff22ac Mon Sep 17 00:00:00 2001
From: Rishi Dave <rishipdave@gmail.com>
Date: Sun, 3 May 2026 11:43:43 +0000
Subject: [PATCH 3/6] fix(quantization): coerce float scalars and clean tmp on
 cache write failure

- TensorData.from_dict: wrap plain int/float values for _floats keys as
  np.array(value, dtype=np.float32) so cache round-trip works for
  Entropy/Distribution calibration, where hist_edges.min()/.max() are
  serialized as numpy scalars and deserialize as plain Python floats.
- save_tensors_data: wrap json.dump + os.replace in try/except
  BaseException that unlinks the .tmp file on failure, so partial
  serialization (or KeyboardInterrupt mid-write) does not leave stray
  .tmp files behind.
---
 onnxruntime/python/tools/quantization/calibrate.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py
index 453fa82188f55..fc14a589e7ff5 100644
--- a/onnxruntime/python/tools/quantization/calibrate.py
+++ b/onnxruntime/python/tools/quantization/calibrate.py
@@ -109,6 +109,8 @@ def from_dict(cls, d: dict) -> "TensorData":
             value = v
             if isinstance(value, dict) and value.get("CLS") == "numpy.array":
                 value = np.array(value["data"], dtype=np.dtype(value["dtype"]))
+            elif k in cls._floats and isinstance(value, (int, float)):
+                value = np.array(value, dtype=np.float32)
             kwargs[k] = value
         return cls(**kwargs)
 
@@ -232,10 +234,14 @@ def save_tensors_data(tensors_data: "TensorsData", path: "str | Path") -> None:
     path = Path(path)
     path.parent.mkdir(parents=True, exist_ok=True)
     tmp = path.with_suffix(path.suffix + ".tmp")
-    with tmp.open("w") as f:
-        json.dump(tensors_data, f, cls=_CalibrationCacheEncoder)
-        f.flush()
-    os.replace(tmp, path)
+    try:
+        with tmp.open("w") as f:
+            json.dump(tensors_data, f, cls=_CalibrationCacheEncoder)
+            f.flush()
+        os.replace(tmp, path)
+    except BaseException:
+        tmp.unlink(missing_ok=True)
+        raise
 
 
 def load_tensors_data(path: "str | Path") -> "TensorsData":

From c8d11ce14fbc05f6a2d556b8255b44adaff9a61e Mon Sep 17 00:00:00 2001
From: Rishi Dave <rishipdave@gmail.com>
Date: Mon, 4 May 2026 11:35:29 +0000
Subject: [PATCH 4/6] refactor(quantization): share calibration JSON encoder
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address review feedback on PR #28221 by hoisting the previously
private `_CalibrationCacheEncoder` in `calibrate.py` to a module-level
`CalibrationCacheEncoder` and reusing it from
`quant_utils.write_calibration_table` instead of redefining a
narrower inner `MyEncoder`. The shared encoder is a superset of the
old inner one — `write_calibration_table` now also serializes bare
numpy scalar types (np.integer, np.floating) that
previously would have raised TypeError. This is the intended
consolidation: both helpers now produce identical JSON. Also
build the invalid-path test's missing file path under the test's
`TemporaryDirectory` so it no longer relies on a hardcoded POSIX
absolute path, making it deterministic on Windows.
---
 .../python/tools/quantization/calibrate.py      | 11 ++++++++---
 .../python/tools/quantization/quant_utils.py    | 17 +++++------------
 .../python/quantization/test_calibration.py     |  3 ++-
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py
index fc14a589e7ff5..b92eccd2989aa 100644
--- a/onnxruntime/python/tools/quantization/calibrate.py
+++ b/onnxruntime/python/tools/quantization/calibrate.py
@@ -212,8 +212,13 @@ def set_range(self, start_index: int, end_index: int):
         raise NotImplementedError
 
 
-class _CalibrationCacheEncoder(json.JSONEncoder):
-    """JSON encoder for calibration cache serialization."""
+class CalibrationCacheEncoder(json.JSONEncoder):
+    """Shared JSON encoder for calibration caches.
+
+    Handles numpy ndarrays and numpy scalar types (integer/floating) so
+    calibration JSON output is consistent across ``save_tensors_data`` and
+    ``quant_utils.write_calibration_table``.
+    """
 
     def default(self, obj):
         if isinstance(obj, (TensorData, TensorsData)):
@@ -236,7 +241,7 @@ def save_tensors_data(tensors_data: "TensorsData", path: "str | Path") -> None:
     tmp = path.with_suffix(path.suffix + ".tmp")
     try:
         with tmp.open("w") as f:
-            json.dump(tensors_data, f, cls=_CalibrationCacheEncoder)
+            json.dump(tensors_data, f, cls=CalibrationCacheEncoder)
             f.flush()
         os.replace(tmp, path)
     except BaseException:
diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py
index 0ce1e1a0d75de..c8deb0d3e395a 100644
--- a/onnxruntime/python/tools/quantization/quant_utils.py
+++ b/onnxruntime/python/tools/quantization/quant_utils.py
@@ -796,21 +796,14 @@ def write_calibration_table(calibration_cache, dir="."):
 
     import onnxruntime.quantization.CalTableFlatBuffers.KeyValue as KeyValue  # noqa: PLC0415
     import onnxruntime.quantization.CalTableFlatBuffers.TrtTable as TrtTable  # noqa: PLC0415
-    from onnxruntime.quantization.calibrate import CalibrationMethod, TensorData, TensorsData  # noqa: PLC0415
+
+    # Use the shared encoder from calibrate.py so write_calibration_table and
+    # save_tensors_data produce identical JSON for numpy scalar/array values.
+    from onnxruntime.quantization.calibrate import CalibrationCacheEncoder  # noqa: PLC0415
 
     logging.info(f"calibration cache: {calibration_cache}")
 
-    class MyEncoder(json.JSONEncoder):
-        def default(self, obj):
-            if isinstance(obj, (TensorData, TensorsData)):
-                return obj.to_dict()
-            if isinstance(obj, np.ndarray):
-                return {"data": obj.tolist(), "dtype": str(obj.dtype), "CLS": "numpy.array"}
-            if isinstance(obj, CalibrationMethod):
-                return {"CLS": obj.__class__.__name__, "value": str(obj)}
-            return json.JSONEncoder.default(self, obj)
-
-    json_data = json.dumps(calibration_cache, cls=MyEncoder)
+    json_data = json.dumps(calibration_cache, cls=CalibrationCacheEncoder)
 
     with open(os.path.join(dir, "calibration.json"), "w") as file:
         file.write(json_data)  # use `json.loads` to do the reverse
diff --git a/onnxruntime/test/python/quantization/test_calibration.py b/onnxruntime/test/python/quantization/test_calibration.py
index 41f5624bd6eac..e74fe1554e267 100644
--- a/onnxruntime/test/python/quantization/test_calibration.py
+++ b/onnxruntime/test/python/quantization/test_calibration.py
@@ -604,8 +604,9 @@ def test_save_load_tensors_data_entropy_roundtrip(self):
         np.testing.assert_array_almost_equal(loaded["y"].hist_edges, hist_edges)
 
     def test_load_tensors_data_invalid_path(self):
+        bogus = Path(self._tmp_dir.name) / "does_not_exist.json"
         with self.assertRaises(FileNotFoundError):
-            load_tensors_data("/nonexistent/path/cache.json")
+            load_tensors_data(bogus)
 
     def test_quantize_static_calibration_cache_path(self):
         model_path = Path(self._tmp_dir.name) / "tiny_model.onnx"

From 91c30c6189a16a76cf2e4c766168004eb6981e0e Mon Sep 17 00:00:00 2001
From: Rishi Dave <rishipdave@gmail.com>
Date: Tue, 5 May 2026 11:18:23 +0000
Subject: [PATCH 5/6] fix(quantization): use unique temp filename for
 calibration cache writes

Previously save_tensors_data used a deterministic '<path>.tmp' suffix for the
intermediate write. If two processes wrote to the same calibration cache path
concurrently, they raced on the same temp filename and the atomic-write
guarantee was lost (one writer could unlink the partial file of another).

Switch to tempfile.mkstemp in the destination directory so each writer gets a
unique temp file. Restore prior 0o644 permissions before os.replace since
mkstemp defaults to 0o600. Use contextlib.suppress for the FileNotFoundError
cleanup path.
---
 onnxruntime/python/tools/quantization/calibrate.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py
index b92eccd2989aa..e35fdc545c5ec 100644
--- a/onnxruntime/python/tools/quantization/calibrate.py
+++ b/onnxruntime/python/tools/quantization/calibrate.py
@@ -5,10 +5,12 @@
 # license information.
 # --------------------------------------------------------------------------
 import abc
+import contextlib
 import copy
 import itertools
 import json
 import os
+import tempfile
 import uuid
 from collections.abc import Sequence
 from enum import Enum
@@ -238,14 +240,16 @@ def save_tensors_data(tensors_data: "TensorsData", path: "str | Path") -> None:
     """Serialize calibration tensor ranges to a JSON file at *path*."""
     path = Path(path)
     path.parent.mkdir(parents=True, exist_ok=True)
-    tmp = path.with_suffix(path.suffix + ".tmp")
+    fd, tmp_name = tempfile.mkstemp(dir=path.parent, prefix=".calibcache_", suffix=".tmp")
     try:
-        with tmp.open("w") as f:
+        with os.fdopen(fd, "w") as f:
             json.dump(tensors_data, f, cls=CalibrationCacheEncoder)
             f.flush()
-        os.replace(tmp, path)
+        os.chmod(tmp_name, 0o644)
+        os.replace(tmp_name, path)
     except BaseException:
-        tmp.unlink(missing_ok=True)
+        with contextlib.suppress(FileNotFoundError):
+            os.unlink(tmp_name)
         raise
 
 

From 40d6038528cef1c385f3f50ebb3c7436da1c9b1c Mon Sep 17 00:00:00 2001
From: Rishi Dave <rishipdave@gmail.com>
Date: Wed, 6 May 2026 11:24:52 +0000
Subject: [PATCH 6/6] fix(quantization): guard calibration cache against
 SmoothQuant mismatch

- Persist `smooth_quant` flag in the calibration cache JSON payload.
- On load, if the cached flag differs from the current run, log a
  warning and recompute (treat as a cache miss); a missing field is
  assumed False for backwards compatibility with caches written by
  earlier revisions of this PR.
- Drop the explicit `os.chmod(..., 0o644)` in save_tensors_data (CodeQL
  flagged 0o644 as overly permissive). The tempfile is created with mode
  0o600 by mkstemp and os.replace keeps that mode, so the cache file is
  now readable and writable by its owner only; the umask is not applied.
- Add unit tests covering field persistence, mismatch->recompute+warn,
  matching->cache hit, and legacy (missing field)->cache hit.
---
 .../python/tools/quantization/calibrate.py    |  13 +-
 .../python/tools/quantization/quantize.py     |  34 ++++--
 .../python/quantization/test_calibration.py   | 113 ++++++++++++++++++
 3 files changed, 147 insertions(+), 13 deletions(-)

diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py
index e35fdc545c5ec..305804661cf64 100644
--- a/onnxruntime/python/tools/quantization/calibrate.py
+++ b/onnxruntime/python/tools/quantization/calibrate.py
@@ -236,16 +236,21 @@ def default(self, obj):
         return json.JSONEncoder.default(self, obj)
 
 
-def save_tensors_data(tensors_data: "TensorsData", path: "str | Path") -> None:
-    """Serialize calibration tensor ranges to a JSON file at *path*."""
+def save_tensors_data(tensors_data: "TensorsData", path: "str | Path", *, smooth_quant: bool = False) -> None:
+    """Serialize calibration tensor ranges to a JSON file at *path*.
+
+    :param smooth_quant: whether the producing run used SmoothQuant.  Stored in
+        the cache so a later load can detect a mismatch and recompute.
+    """
     path = Path(path)
     path.parent.mkdir(parents=True, exist_ok=True)
     fd, tmp_name = tempfile.mkstemp(dir=path.parent, prefix=".calibcache_", suffix=".tmp")
     try:
         with os.fdopen(fd, "w") as f:
-            json.dump(tensors_data, f, cls=CalibrationCacheEncoder)
+            payload = tensors_data.to_dict()
+            payload["smooth_quant"] = smooth_quant
+            json.dump(payload, f, cls=CalibrationCacheEncoder)
             f.flush()
-        os.chmod(tmp_name, 0o644)
         os.replace(tmp_name, path)
     except BaseException:
         with contextlib.suppress(FileNotFoundError):
diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
index b80451f275acd..d6b2ecb2b17ed 100644
--- a/onnxruntime/python/tools/quantization/quantize.py
+++ b/onnxruntime/python/tools/quantization/quantize.py
@@ -6,6 +6,7 @@
 from __future__ import annotations
 
 import copy
+import json
 import logging
 import tempfile
 from collections.abc import Callable
@@ -727,17 +728,32 @@ def inc_dataloader():
     if _cache_path is not None and _cache_path.exists() and not _cache_path.is_file():
         raise ValueError(f"calibration_cache_path is not a file: {_cache_path}")
     _cache_hit = _cache_path is not None and _cache_path.is_file()
+    _smooth_quant = bool(extra_options.get("SmoothQuant", False))
 
     if _cache_hit:
-        tensors_range = load_tensors_data(_cache_path)
-        if tensors_range.calibration_method != calibrate_method:
-            raise ValueError(
-                f"Calibration cache at {_cache_path} was produced with "
-                f"{tensors_range.calibration_method}, but quantize_static was called "
-                f"with calibrate_method={calibrate_method}. Delete the cache or "
-                f"pass a matching calibrate_method."
+        with _cache_path.open("r") as _f:
+            _raw = json.load(_f)
+        _cached_sq = bool(_raw.get("smooth_quant", False))
+        if _cached_sq != _smooth_quant:
+            logging.warning(
+                "Calibration cache at %s was produced with smooth_quant=%s; "
+                "current run uses smooth_quant=%s. Recomputing ranges and overwriting cache.",
+                _cache_path,
+                _cached_sq,
+                _smooth_quant,
             )
-    else:
+            _cache_hit = False
+        else:
+            tensors_range = load_tensors_data(_cache_path)
+            if tensors_range.calibration_method != calibrate_method:
+                raise ValueError(
+                    f"Calibration cache at {_cache_path} was produced with "
+                    f"{tensors_range.calibration_method}, but quantize_static was called "
+                    f"with calibrate_method={calibrate_method}. Delete the cache or "
+                    f"pass a matching calibrate_method."
+                )
+
+    if not _cache_hit:
         if calibration_data_reader is None:
             raise ValueError("Either calibration_data_reader or an existing calibration_cache_path must be provided.")
         with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
@@ -784,7 +800,7 @@ def inc_dataloader():
             del calibrator
 
         if _cache_path is not None:
-            save_tensors_data(tensors_range, _cache_path)
+            save_tensors_data(tensors_range, _cache_path, smooth_quant=_smooth_quant)
 
     check_static_quant_arguments(quant_format, activation_type, weight_type)
 
diff --git a/onnxruntime/test/python/quantization/test_calibration.py b/onnxruntime/test/python/quantization/test_calibration.py
index e74fe1554e267..7afb77f9dfab2 100644
--- a/onnxruntime/test/python/quantization/test_calibration.py
+++ b/onnxruntime/test/python/quantization/test_calibration.py
@@ -5,6 +5,8 @@
 # license information.
 # --------------------------------------------------------------------------
 
+import json
+import logging
 import tempfile
 import unittest
 from pathlib import Path
@@ -696,6 +698,117 @@ def test_load_tensors_data_method_mismatch_raises(self):
                 calibration_cache_path=cache_path,
             )
 
+    def test_save_tensors_data_writes_smooth_quant_field(self):
+        """save_tensors_data persists the smooth_quant flag in the JSON payload."""
+        td = TensorsData(
+            CalibrationMethod.MinMax,
+            {"x": TensorData(lowest=np.array(-1.0, dtype=np.float32), highest=np.array(1.0, dtype=np.float32))},
+        )
+        for flag in (False, True):
+            cache_path = Path(self._tmp_dir.name) / f"sq_{flag}_cache.json"
+            save_tensors_data(td, cache_path, smooth_quant=flag)
+            with cache_path.open("r") as f:
+                raw = json.load(f)
+            self.assertIn("smooth_quant", raw)
+            self.assertEqual(raw["smooth_quant"], flag)
+
+    def test_smooth_quant_mismatch_triggers_recompute(self):
+        """Cache produced with smooth_quant=True must not be used for a smooth_quant=False run."""
+        model_path = Path(self._tmp_dir.name) / "sq_mismatch_model.onnx"
+        self._make_simple_model(str(model_path))
+        cache_path = Path(self._tmp_dir.name) / "sq_mismatch_cache.json"
+        out1_path = Path(self._tmp_dir.name) / "sq_mismatch_out1.onnx"
+
+        # Write a cache that records smooth_quant=True via save_tensors_data.
+        td = TensorsData(
+            CalibrationMethod.MinMax,
+            {"x": TensorData(lowest=np.array(-1.0, dtype=np.float32), highest=np.array(1.0, dtype=np.float32))},
+        )
+        save_tensors_data(td, cache_path, smooth_quant=True)
+        with cache_path.open("r") as f:
+            self.assertEqual(json.load(f)["smooth_quant"], True)
+
+        # Run with smooth_quant=False (default): the cache must be treated as a miss.
+        # We supply a real data_reader so recompute can proceed.
+        data_reader = TestDataReader()
+        with self.assertLogs("root", level=logging.WARNING) as log_cm:
+            quantize_static(
+                str(model_path),
+                str(out1_path),
+                calibration_data_reader=data_reader,
+                calibration_cache_path=cache_path,
+                # SmoothQuant not set -> defaults to False
+            )
+        # At least one WARNING about the mismatch must have been emitted.
+        self.assertTrue(
+            any("smooth_quant" in msg for msg in log_cm.output),
+            msg=f"Expected smooth_quant warning; got: {log_cm.output}",
+        )
+        # The rewritten cache must now have smooth_quant=False.
+        with cache_path.open("r") as f:
+            self.assertEqual(json.load(f)["smooth_quant"], False)
+
+    def test_smooth_quant_match_produces_cache_hit(self):
+        """Cache with smooth_quant=False is reused when the run also uses smooth_quant=False."""
+        model_path = Path(self._tmp_dir.name) / "sq_hit_model.onnx"
+        self._make_simple_model(str(model_path))
+        cache_path = Path(self._tmp_dir.name) / "sq_hit_cache.json"
+        out1_path = Path(self._tmp_dir.name) / "sq_hit_out1.onnx"
+        out2_path = Path(self._tmp_dir.name) / "sq_hit_out2.onnx"
+
+        # First run: write cache with smooth_quant=False (the default).
+        data_reader = TestDataReader()
+        quantize_static(
+            str(model_path),
+            str(out1_path),
+            calibration_data_reader=data_reader,
+            calibration_cache_path=cache_path,
+        )
+        with cache_path.open("r") as f:
+            self.assertEqual(json.load(f)["smooth_quant"], False)
+
+        # Second run: no data_reader, cache should be a hit (smooth_quant matches).
+        quantize_static(
+            str(model_path),
+            str(out2_path),
+            calibration_data_reader=None,
+            calibration_cache_path=cache_path,
+        )
+        self.assertTrue(out2_path.exists())
+
+    def test_old_cache_without_smooth_quant_field_treated_as_false(self):
+        """A legacy cache without a smooth_quant key is assumed smooth_quant=False."""
+        model_path = Path(self._tmp_dir.name) / "legacy_sq_model.onnx"
+        self._make_simple_model(str(model_path))
+        cache_path = Path(self._tmp_dir.name) / "legacy_sq_cache.json"
+        out1_path = Path(self._tmp_dir.name) / "legacy_sq_out1.onnx"
+        out2_path = Path(self._tmp_dir.name) / "legacy_sq_out2.onnx"
+
+        # First run: populate a real cache against the actual model so tensor names match.
+        data_reader = TestDataReader()
+        quantize_static(
+            str(model_path),
+            str(out1_path),
+            calibration_data_reader=data_reader,
+            calibration_cache_path=cache_path,
+        )
+
+        # Strip the smooth_quant field to simulate a legacy cache file.
+        with cache_path.open("r") as f:
+            raw = json.load(f)
+        raw.pop("smooth_quant", None)
+        with cache_path.open("w") as f:
+            json.dump(raw, f)
+
+        # A run with smooth_quant=False (default) must treat the legacy cache as a hit (no recompute needed).
+        quantize_static(
+            str(model_path),
+            str(out2_path),
+            calibration_data_reader=None,
+            calibration_cache_path=cache_path,
+        )
+        self.assertTrue(out2_path.exists())
+
 
 if __name__ == "__main__":
     unittest.main()