microsoft · tianleiwu · May 11, 2026 · Apr 27, 2026 · May 3, 2026 · May 3, 2026
diff --git a/onnxruntime/python/tools/quantization/base_quantizer.py b/onnxruntime/python/tools/quantization/base_quantizer.py
@@ -104,6 +104,7 @@ def __init__(
         # the symmetry (i.e., signed integer types will use symmetric quantization). See `def is_weight_symmetric()`
         self._is_weight_symmetric: bool | None = self.extra_options.get("WeightSymmetric", None)
         self.is_activation_symmetric = self.extra_options.get("ActivationSymmetric", False)
+        self.is_activation_restricted_asymmetric = self.extra_options.get("ActivationRestrictedAsymmetric", False)
         self.min_real_range = self.extra_options.get("MinimumRealRange")
 
         self.activation_qType = getattr(activation_qType, "tensor_type", activation_qType)

diff --git a/onnxruntime/python/tools/quantization/onnx_quantizer.py b/onnxruntime/python/tools/quantization/onnx_quantizer.py
@@ -30,6 +30,7 @@
     ms_domain,
     quantize_onnx_initializer,
     save_and_reload_model_with_shape_infer,
+    snap_zero_point_to_uint8,
     tensor_proto_to_array,
 )
 from .registry import CreateOpQuantizer
@@ -1157,6 +1158,8 @@ def calculate_quantization_params(self):
                 reduce_range = quant_overrides.get("reduce_range", False)
                 qmin, qmax = get_qmin_qmax_for_qType(quant_type, reduce_range=reduce_range, symmetric=symmetric)
                 zero, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, self.min_real_range)
+                if self.is_activation_restricted_asymmetric and quant_type == onnx.TensorProto.UINT8 and not symmetric:
+                    zero, scale = snap_zero_point_to_uint8(rmin, rmax)
 
             quantization_params[tensor_name] = QuantizationParams(zero_point=zero, scale=scale, quant_type=quant_type)
 

diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py
@@ -38,6 +38,7 @@
     ms_domain,
     normalize_axis,
     quantize_onnx_initializer,
+    snap_zero_point_to_uint8,
     tensor_proto_to_array,
 )
 from .registry import CreateQDQQuantizer
@@ -1320,6 +1321,12 @@ def calc_quant_params(self, tensor_data: TensorData, quant_overrides: dict[str,
             reduce_range = quant_overrides.get("reduce_range", False)
             qmin, qmax = get_qmin_qmax_for_qType(quant_type, reduce_range=reduce_range, symmetric=symmetric)
             zero, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, self.min_real_range)
+            if (
+                self.is_activation_restricted_asymmetric
+                and quant_type == onnx.TensorProto.UINT8
+                and not symmetric
+            ):
+                zero, scale = snap_zero_point_to_uint8(rmin, rmax)
 
         return QuantizationParams(zero_point=zero.squeeze(), scale=scale.squeeze(), quant_type=quant_type)
 

diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py
@@ -297,6 +297,33 @@ def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False, min_real_range=Non
     return [zero_point, scale]
 
 
+def snap_zero_point_to_uint8(rmin, rmax):
+    """Return a uint8 (zero_point, scale) pair with the zero-point snapped to 0 or 128.
+
+    Used by the ActivationRestrictedAsymmetric quantization option. zero_point is 0 when
+    rmin >= 0 and 128 when rmin < 0; scale is chosen so the dequantized range covers
+    [rmin, rmax] (and 0) without clipping. A constant tensor (rmin == rmax != 0) is a valid
+    range. Neutral values (zero_point=0, scale=1.0) are returned when rmax < rmin or when
+    the scale would not be a normal, finite float32 (e.g. an all-zero range), so callers
+    never receive a zero or non-finite scale.
+
+    :parameter rmin: calibrated minimum activation value (numpy scalar)
+    :parameter rmax: calibrated maximum activation value (numpy scalar)
+    :return: (zero_point, scale) with zero_point dtype uint8 and scale dtype float32
+    """
+    rmin = float(numpy.squeeze(rmin))
+    rmax = float(numpy.squeeze(rmax))
+    if rmin >= 0.0:
+        zero_point, scale = 0, rmax / 255.0  # q=255 dequantizes exactly to rmax
+    else:
+        # q=0 must reach rmin (128 steps below zp) and q=255 must reach rmax (127 above)
+        zero_point, scale = 128, max(-rmin / 128.0, rmax / 127.0)
+    limits = numpy.finfo(numpy.float32)
+    if rmax < rmin or not (limits.tiny <= scale <= limits.max):
+        zero_point, scale = 0, 1.0  # degenerate range: fall back to neutral values
+    return numpy.array(zero_point, dtype=numpy.uint8), numpy.array(scale, dtype=numpy.float32)
+
+
 def compute_scale_zp_float8(element_type, std):
     """Calculate the scale s for a float8 type (E4M3FN).
     The function assumes the coefficient distribution and the float 8

diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
@@ -120,6 +120,8 @@ def __init__(
                 key value pair dictionary for various options in different case. Current used:
                     extra.Sigmoid.nnapi = True/False  (Default is False)
                     ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
+                    ActivationRestrictedAsymmetric = True/False: (uint8 activations only) snap zero-point to 0
+                        (rmin>=0) or 128 (rmin<0); recompute scale accordingly (default is False).
                     WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
                     EnableSubgraph = True/False : Default is False. If enabled, subgraph will be quantized.
                                                   Dyanmic mode currently is supported. Will support more in future.
@@ -419,6 +421,8 @@ def __init__(
             extra_options: key value pair dictionary for various options in different case. Current used:
                 extra.Sigmoid.nnapi = True/False  (Default is False)
                 ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
+                ActivationRestrictedAsymmetric = True/False: (uint8 activations only) snap zero-point to 0
+                    (rmin>=0) or 128 (rmin<0); recompute scale accordingly (default is False).
                 WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
                 EnableSubgraph = True/False :
                     Default is False. If enabled, subgraph will be quantized. Dynamic mode currently is supported. Will
@@ -544,6 +548,8 @@ def quantize_static(
             key value pair dictionary for various options in different case. Current used:
                 extra.Sigmoid.nnapi = True/False  (Default is False)
                 ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
+                ActivationRestrictedAsymmetric = True/False: (uint8 activations only) snap zero-point to 0
+                    (rmin>=0) or 128 (rmin<0); recompute scale accordingly (default is False).
                 WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
                 EnableSubgraph = True/False : Default is False. If enabled, subgraph will be quantized.
                                               Dyanmic mode currently is supported. Will support more in the future.
@@ -834,6 +840,8 @@ def quantize_dynamic(
             key value pair dictionary for various options in different case. Current used:
                 extra.Sigmoid.nnapi = True/False  (Default is False)
                 ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
+                ActivationRestrictedAsymmetric = True/False: (uint8 activations only) snap zero-point to 0
+                    (rmin>=0) or 128 (rmin<0); recompute scale accordingly (default is False).
                 WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
                 EnableSubgraph = True/False :
                     Default is False. If enabled, subgraph will be quantized. Dynamic mode currently is supported. Will

diff --git a/onnxruntime/test/python/quantization/test_symmetric_flag.py b/onnxruntime/test/python/quantization/test_symmetric_flag.py
@@ -150,3 +150,84 @@ def test_3(self):
 
 if __name__ == "__main__":
     unittest.main()
+
+
+class TestRestrictedAsymmetricFlag(unittest.TestCase):
+    """Tests for ActivationRestrictedAsymmetric extra-option (uint8 zero-point snapping)."""
+
+    def setUp(self):
+        # All-positive activations (post-ReLU-like): rmin >= 0, expect zp == 0
+        self.positive_activations = [
+            np.zeros([1, 2, 32, 32], dtype="float32"),
+            np.ones([1, 2, 32, 32], dtype="float32") * 2.0,
+        ]
+        # Signed-range activations: rmin < 0, expect zp == 128
+        self.signed_activations = [
+            -1.0 * np.ones([1, 2, 32, 32], dtype="float32"),
+            +2.0 * np.ones([1, 2, 32, 32], dtype="float32"),
+        ]
+        self.weights = np.concatenate(
+            (
+                -1 * np.ones([1, 1, 2, 2], dtype="float32"),
+                +1 * np.ones([1, 1, 2, 2], dtype="float32"),
+            ),
+            axis=1,
+        )
+
+    def _quantize(self, activations, extra_options):
+        """Quantize a one-Conv model statically and return (ACT zero-point, ACT scale).
+
+        Model files go to a temporary directory removed on return, so runs leave no artifacts.
+        """
+        import os
+        import tempfile
+
+        act = helper.make_tensor_value_info("ACT", TensorProto.FLOAT, activations[0].shape)
+        res = helper.make_tensor_value_info("RES", TensorProto.FLOAT, [None, None, None, None])
+        wgt_init = numpy_helper.from_array(self.weights, "WGT")
+        conv_node = onnx.helper.make_node("Conv", ["ACT", "WGT"], ["RES"])
+        graph = helper.make_graph([conv_node], "test", [act], [res], initializer=[wgt_init])
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 11)])
+
+        class DummyDataReader(quantization.CalibrationDataReader):
+            def __init__(self):
+                self.iterator = ({"ACT": batch} for batch in activations)
+
+            def get_next(self):
+                return next(self.iterator, None)
+
+        with tempfile.TemporaryDirectory(prefix="ort.quant.") as tmp_dir:
+            float_path = os.path.join(tmp_dir, "model_restricted.onnx")
+            quant_path = os.path.join(tmp_dir, "quantized_restricted.onnx")
+            onnx.save(model, float_path)
+            quantization.quantize_static(
+                model_input=float_path,
+                model_output=quant_path,
+                calibration_data_reader=DummyDataReader(),
+                quant_format=quantization.QuantFormat.QOperator,
+                activation_type=quantization.QuantType.QUInt8,
+                weight_type=quantization.QuantType.QUInt8,
+                op_types_to_quantize=["Conv", "MatMul"],
+                extra_options=extra_options,
+            )
+            quantized = onnx.load(quant_path)
+        inits = {init.name: init for init in quantized.graph.initializer}
+        return inits["ACT_zero_point"].int32_data[0], inits["ACT_scale"].float_data[0]
+
+    def test_positive_activations_zp_is_zero(self):
+        """All-positive range [0, 2]: zero-point snaps to 0 and scale maps q=255 to rmax."""
+        act_zp, act_sc = self._quantize(self.positive_activations, {"ActivationRestrictedAsymmetric": True})
+        self.assertEqual(act_zp, 0, f"Expected zp=0 for rmin>=0, got {act_zp}")
+        self.assertAlmostEqual(act_sc, 2.0 / 255.0, places=6)
+
+    def test_signed_activations_zp_is_128(self):
+        """Signed range [-1, 2]: zero-point snaps to 128 and scale is max(1/128, 2/127)."""
+        act_zp, act_sc = self._quantize(self.signed_activations, {"ActivationRestrictedAsymmetric": True})
+        self.assertEqual(act_zp, 128, f"Expected zp=128 for rmin<0, got {act_zp}")
+        self.assertAlmostEqual(act_sc, 2.0 / 127.0, places=6)
+
+    def test_option_false_does_not_snap(self):
+        """With the option off, a signed range keeps the standard asymmetric zero-point (not 128)."""
+        act_zp, _ = self._quantize(self.signed_activations, {"ActivationRestrictedAsymmetric": False})
+        # Standard asymmetric uint8 with rmin=-1, rmax=2 should give non-128 zp (it's ~85)
+        self.assertNotEqual(act_zp, 128, f"Option=False should not snap to 128, got {act_zp}")