
Commit c53dad0

link new API with old API using version 2

1 parent 9a51cae

File tree: 3 files changed, +152 additions, -31 deletions

test/quantization/quantize_/workflows/int8/test_int8_tensor.py

Lines changed: 99 additions & 2 deletions
```diff
@@ -4,11 +4,19 @@
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
 
+import copy
 import unittest
+from contextlib import nullcontext
+from typing import Tuple
 
 import torch
-from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal import common_utils
 
+from torchao.quantization import (
+    Int8DynamicActivationInt8WeightConfig,
+    Int8WeightOnlyConfig,
+    quantize_,
+)
 from torchao.quantization.quantize_.workflows.int8.int8_tensor import (
     Int8Tensor,
     QuantizeTensorToInt8Kwargs,
@@ -17,7 +25,46 @@
 from torchao.testing.utils import TorchAOIntegrationTestCase
 
 
+# TODO: Refactor after https://github.com/pytorch/ao/pull/2729 is merged
+class ToyTwoLinearModel(torch.nn.Module):
+    def __init__(
+        self,
+        input_dim,
+        hidden_dim,
+        output_dim,
+        has_bias=False,
+        dtype=None,
+        device=None,
+    ):
+        super().__init__()
+        self.dtype = dtype
+        self.device = device
+        self.linear1 = torch.nn.Linear(
+            input_dim, hidden_dim, bias=has_bias, dtype=dtype, device=device
+        )
+        self.linear2 = torch.nn.Linear(
+            hidden_dim, output_dim, bias=has_bias, dtype=dtype, device=device
+        )
+
+    # Note: tinygemm kernel only uses bfloat16 inputs
+    def example_inputs(self, batch_size=1):
+        return (
+            torch.randn(
+                batch_size,
+                self.linear1.in_features,
+                dtype=self.dtype,
+                device=self.device,
+            ),
+        )
+
+    def forward(self, x):
+        x = self.linear1(x)
+        x = self.linear2(x)
+        return x
+
+
 @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+@common_utils.instantiate_parametrized_tests
 class TestInt8Tensor(TorchAOIntegrationTestCase):
     def setUp(self):
         super().setUp()
@@ -37,6 +84,56 @@ def test_creation_and_attributes(self):
             torch.all(tensor.qdata >= -128) and torch.all(tensor.qdata <= 127)
         )
 
+    @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32])
+    @common_utils.parametrize("compile", [False, True])
+    @common_utils.parametrize(
+        "sizes",
+        [
+            ((128,), 256, 128),
+            ((32, 128), 64, 256),
+        ],
+    )
+    @common_utils.parametrize(
+        "config",
+        [
+            Int8DynamicActivationInt8WeightConfig(version=2),
+            Int8WeightOnlyConfig(version=2),
+        ],
+    )
+    def test_int8_linear_variants(
+        self,
+        dtype: torch.dtype,
+        compile: bool,
+        sizes: Tuple,
+        config,
+    ):
+        error_message = None
+
+        error_context = (
+            self.assertRaisesRegex(AssertionError, error_message)
+            if error_message
+            else nullcontext()
+        )
+
+        with error_context:
+            M, N, K = sizes
+            input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
+
+            # Create a two-linear model and a copy to quantize
+            m = ToyTwoLinearModel(K, N, K).eval().to(dtype).to("cuda")
+            m_q = copy.deepcopy(m)
+
+            # Quantize
+            quantize_(m_q, config)
+
+            output_original = m(input_tensor)
+            output_quantized = m_q(input_tensor)
+
+            error = compute_error(output_original, output_quantized)
+            assert error > 20, (
+                f"Quantization error is too high, got SQNR of {error}"
+            )
+
     def test_linear_operations(self):
         """Test fp+int8 and int8+int8 linear ops with quantization error check"""
         weight_q8 = Int8Tensor.from_hp(self.weight_fp, self.block_size)
@@ -85,4 +182,4 @@ def test_error_handling_and_dequant(self):
 
 
 if __name__ == "__main__":
-    run_tests()
+    common_utils.run_tests()
```
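For context, a minimal sketch of the end-to-end path the new test exercises, reusing the `ToyTwoLinearModel` defined above. It assumes a CUDA device and that `compute_error` (the SQNR helper from `torchao.quantization.utils`) is available, as the test file already relies on it:

```python
import copy

import torch

from torchao.quantization import Int8WeightOnlyConfig, quantize_
from torchao.quantization.utils import compute_error

# Build a small bfloat16 model on CUDA and keep a high-precision reference copy
model = ToyTwoLinearModel(128, 256, 128).eval().to(torch.bfloat16).to("cuda")
model_q = copy.deepcopy(model)

# version=2 routes quantization through the new Int8Tensor workflow
quantize_(model_q, Int8WeightOnlyConfig(version=2))

x = torch.randn(32, 128, dtype=torch.bfloat16, device="cuda")
sqnr = compute_error(model(x), model_q(x))
assert sqnr > 20  # same SQNR threshold the test asserts
```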

torchao/quantization/quant_api.py

Lines changed: 51 additions & 29 deletions
```diff
@@ -78,6 +78,7 @@
     Int4PreshuffledTensor,
     Int4Tensor,
     Int4TilePackedTo4dTensor,
+    Int8Tensor,
     IntxOpaqueTensor,
     IntxPackingFormat,
     IntxUnpackedToInt8Tensor,
@@ -1352,10 +1353,12 @@ class Int8WeightOnlyConfig(AOBaseConfig):
             Otherwise, applies per-group quantization with the specified group size.
         set_inductor_config: bool = True - If True, adjusts `torchinductor` settings to recommended values
             for better performance with this quantization scheme.
+        version: int = 1 - Version of the config to use. Version 1 uses AffineQuantizedTensor, version 2 uses Int8Tensor.
     """
 
     group_size: Optional[int] = None
     set_inductor_config: bool = True
+    version: int = 1
 
     def __post_init__(self):
         torch._C._log_api_usage_once("torchao.quantization.Int8WeightOnlyConfig")
@@ -1366,22 +1369,30 @@ def __post_init__(self):
 
 
 def _int8_weight_only_quantize_tensor(weight, config):
-    mapping_type = MappingType.SYMMETRIC
-    target_dtype = torch.int8
-    eps = torch.finfo(torch.float32).eps
-    zero_point_dtype = torch.int64
-    group_size = config.group_size
-    if group_size is None:
-        group_size = weight.shape[-1]
-    block_size = tuple([1 for x in range(weight.dim() - 1)] + [group_size])
-    new_weight = to_affine_quantized_intx(
-        weight,
-        mapping_type,
-        block_size,
-        target_dtype,
-        eps=eps,
-        zero_point_dtype=zero_point_dtype,
-    )
+    if config.version == 1:
+        warnings.warn(
+            "Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2649 for more details"
+        )
+        mapping_type = MappingType.SYMMETRIC
+        target_dtype = torch.int8
+        eps = torch.finfo(torch.float32).eps
+        zero_point_dtype = torch.int64
+        group_size = config.group_size
+        if group_size is None:
+            group_size = weight.shape[-1]
+        block_size = tuple([1 for x in range(weight.dim() - 1)] + [group_size])
+        new_weight = to_affine_quantized_intx(
+            weight,
+            mapping_type,
+            block_size,
+            target_dtype,
+            eps=eps,
+            zero_point_dtype=zero_point_dtype,
+        )
+    else:
+        assert config.version == 2, f"Unexpected version: {config.version}"
+        block_size = [weight.shape[0], weight.shape[1]]
+        new_weight = Int8Tensor.from_hp(weight, block_size=block_size)
     return new_weight
 
 
@@ -1509,12 +1520,14 @@ class Int8DynamicActivationInt8WeightConfig(AOBaseConfig):
             in original precision during decode operations.
         set_inductor_config: bool = True - If True, adjusts `torchinductor` settings to recommended values
             for better performance with this quantization scheme.
+        version (int): the version of the config; version 1 uses AffineQuantizedTensor (which we plan to deprecate/split), version 2 uses Int8Tensor
     """
 
     layout: Optional[Layout] = PlainLayout()
     act_mapping_type: Optional[MappingType] = MappingType.SYMMETRIC
     weight_only_decode: bool = False
     set_inductor_config: bool = True
+    version: int = 1
 
     def __post_init__(self):
         torch._C._log_api_usage_once(
@@ -1562,19 +1575,28 @@ def get_weight_block_size(x):
     else:
         input_quant_func = _int8_asymm_per_token_quant
 
-    block_size = get_weight_block_size(weight)
-    new_weight = to_affine_quantized_intx(
-        weight,
-        mapping_type,
-        block_size,
-        target_dtype,
-        eps=eps,
-        zero_point_dtype=zero_point_dtype,
-        _layout=layout,
-        zero_point_domain=weight_zero_point_domain,
-    )
-    new_weight = to_linear_activation_quantized(new_weight, input_quant_func)
-    return new_weight
+    if config.version == 1:
+        block_size = get_weight_block_size(weight)
+        quantized_weight = to_affine_quantized_intx(
+            weight,
+            mapping_type,
+            block_size,
+            target_dtype,
+            eps=eps,
+            zero_point_dtype=zero_point_dtype,
+            _layout=layout,
+            zero_point_domain=weight_zero_point_domain,
+        )
+        quantized_weight = to_linear_activation_quantized(
+            quantized_weight, input_quant_func
+        )
+    else:
+        quantized_weight = Int8Tensor.from_hp(
+            weight,
+            block_size=get_weight_block_size(weight),
+        )
+
+    return quantized_weight
 
 
 @register_quantize_module_handler(Int8DynamicActivationInt8WeightConfig)
```
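To illustrate the version gate, here is a small sketch (mine, not from the commit) of the observable difference between the two paths: version 1 still quantizes through `to_affine_quantized_intx` but now emits the deprecation warning, while version 2 swaps the weight for the new `Int8Tensor` subclass:

```python
import copy
import warnings

import torch

from torchao.quantization import Int8WeightOnlyConfig, quantize_
from torchao.quantization.quantize_.workflows.int8.int8_tensor import Int8Tensor

m = torch.nn.Linear(128, 128, dtype=torch.bfloat16, device="cuda")

# version=1 (still the default) keeps the AffineQuantizedTensor path,
# but now warns that it is deprecated
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    quantize_(copy.deepcopy(m), Int8WeightOnlyConfig(version=1))
assert any("Config Deprecation" in str(w.message) for w in caught)

# version=2 replaces the weight with the new Int8Tensor subclass
quantize_(m, Int8WeightOnlyConfig(version=2))
assert isinstance(m.weight, Int8Tensor)
```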

torchao/quantization/quantize_/workflows/int8/int8_tensor.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -29,6 +29,7 @@ class QuantizeTensorToInt8Kwargs(QuantizeTensorKwargs):
     """
 
     block_size: Optional[list[int]] = None
+    kernel_preference: Optional[str] = None
 
 
 # TODO: Implement block-wise quantization using block_size
@@ -102,6 +103,7 @@ def from_hp(
         w: torch.Tensor,
         block_size: list[int],
         act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None,
+        kernel_preference: Optional[str] = None,
     ):
         if w.dim() != 2 or len(block_size) != 2:
             raise ValueError("Expected 2D tensor and block_size length 2")
```
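A short sketch of the extended constructor (my example, not from the commit). This change only threads `kernel_preference` through the signature and the kwargs dataclass, so passing it is optional and `None` keeps the default kernel selection; `dequantize()` is assumed here because the test file above exercises it in `test_error_handling_and_dequant`:

```python
import torch

from torchao.quantization.quantize_.workflows.int8.int8_tensor import Int8Tensor

w = torch.randn(256, 128, dtype=torch.bfloat16, device="cuda")

# from_hp requires a 2D weight and a length-2 block_size;
# here the block spans the whole tensor (a single per-tensor scale)
w_q8 = Int8Tensor.from_hp(w, block_size=[256, 128], kernel_preference=None)
w_dq = w_q8.dequantize()  # back to high precision for inspection
```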
