
Commit 062f3cc

update int8
1 parent 49a7a89 commit 062f3cc

File tree

3 files changed (+63, -46 lines)


test/quantization/quantize_/workflows/int8/test_int8_tensor.py

Lines changed: 18 additions & 8 deletions
@@ -111,12 +111,16 @@ def test_int8_linear_variants(
         assert error > 20, f"Quantization error is too high got a SQNR of {error}"
 
     @common_utils.parametrize("dtype", [torch.bfloat16, torch.float16])
-    def test_static_quantization(self, dtype):
-        """Test static quantization with pre-computed scale"""
+    def test_static_dynamic_quantization(self, dtype):
+        """Test static and dynamic quantization"""
         K, N = 128, 64
         weight = torch.randn(N, K, dtype=dtype, device="cuda")
         input_tensor = torch.randn(32, K, dtype=dtype, device="cuda")
 
+        # Dynamic quantization (runtime scale computation)
+        dynamic_tensor = Int8Tensor.from_hp(weight, block_size=[N, K])
+
+        # Static quantization (pre-computed scale)
         act_scale, _ = choose_qparams_affine(
             input=input_tensor,
             mapping_type=MappingType.SYMMETRIC,
@@ -128,8 +132,8 @@ def test_static_quantization(self, dtype):
             zero_point_dtype=torch.int8,
         )
 
-        # Create weight with static quantization
-        weight_int8 = Int8Tensor.from_hp(
+        # Static quantization (with pre-computed scale)
+        static_tensor = Int8Tensor.from_hp(
             weight,
             block_size=[N, K],
             act_quant_kwargs=QuantizeTensorToInt8Kwargs(
@@ -138,9 +142,13 @@ def test_static_quantization(self, dtype):
             ),
         )
 
-        output = torch.nn.functional.linear(input_tensor, weight_int8)
-        self.assertEqual(output.shape, (32, N))
-        self.assertEqual(output.dtype, dtype)
+        dynamic_output = torch.nn.functional.linear(input_tensor, dynamic_tensor)
+        static_output = torch.nn.functional.linear(input_tensor, static_tensor)
+
+        self.assertEqual(dynamic_output.shape, (32, N))
+        self.assertEqual(static_output.shape, (32, N))
+        self.assertEqual(dynamic_output.dtype, dtype)
+        self.assertEqual(static_output.dtype, dtype)
 
     @unittest.skip("granularity parameter not supported in current API")
     @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
@@ -190,6 +198,8 @@ def test_slice(self, config, device, dtype):
             # PerTensor: scale unchanged by slicing
             self.assertEqual(weight1.scale, dummy.weight.scale)
             self.assertEqual(weight2.scale, dummy.weight.scale)
+        with self.assertRaises(NotImplementedError):
+            _ = dummy.weight[::2]
 
     def test_index_select(self):
         """test that `x_0 = x[0]` works when `x` is a 2D `Int8Tensor`."""
@@ -212,7 +222,7 @@ def test_error_handling_and_dequant(self):
         test_data = torch.tensor([[1.0, -1.0]], dtype=torch.bfloat16)
         tensor = Int8Tensor.from_hp(test_data, [1, 2])
 
-        dequantized = torch.ops.aten.dequantize.self(tensor)
+        dequantized = tensor.dequantize()
         self.assertEqual(dequantized.shape, test_data.shape)
         self.assertLess(torch.abs(dequantized - test_data).max().item(), 0.1)
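The renamed test now exercises both quantization paths. Below is a minimal standalone sketch of the same flow, assuming the Int8Tensor API shown in this diff; the per-tensor activation scale is an illustrative stand-in for the choose_qparams_affine result the test computes, and CUDA is assumed as in the test:

# Sketch, not part of the commit: dynamic vs. static int8 quantization.
import torch
from torchao.quantization.quantize_.workflows.int8.int8_tensor import (
    Int8Tensor,
    QuantizeTensorToInt8Kwargs,
)

K, N = 128, 64
weight = torch.randn(N, K, dtype=torch.bfloat16, device="cuda")
x = torch.randn(32, K, dtype=torch.bfloat16, device="cuda")

# Dynamic: the activation scale is computed at runtime from each input.
w_dyn = Int8Tensor.from_hp(weight, block_size=[N, K])

# Static: a pre-computed activation scale rides along in act_quant_kwargs.
act_scale = (x.abs().amax() / 127.0).float()  # illustrative per-tensor scale
w_static = Int8Tensor.from_hp(
    weight,
    block_size=[N, K],
    act_quant_kwargs=QuantizeTensorToInt8Kwargs(
        block_size=list(x.shape), static_scale=act_scale
    ),
)

y_dyn = torch.nn.functional.linear(x, w_dyn)        # (32, N), bfloat16
y_static = torch.nn.functional.linear(x, w_static)  # (32, N), bfloat16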

torchao/quantization/quant_api.py

Lines changed: 8 additions & 4 deletions
@@ -1522,6 +1522,7 @@ class Int8DynamicActivationInt8WeightConfig(AOBaseConfig):
     layout: Optional[Layout] = PlainLayout()
     act_mapping_type: Optional[MappingType] = MappingType.SYMMETRIC
     weight_only_decode: bool = False
+    granularity: Optional[Union[PerRow, PerTensor]] = PerRow()
     set_inductor_config: bool = True
     version: int = 2
 
@@ -1555,9 +1556,6 @@ def _int8_dynamic_activation_int8_weight_quantize_tensor(weight, config):
     mapping_type = MappingType.SYMMETRIC
     weight_zero_point_domain = ZeroPointDomain.NONE
 
-    def get_weight_block_size(x):
-        return tuple([1 for _ in range(x.dim() - 1)] + [x.shape[-1]])
-
     target_dtype = torch.int8
     eps = torch.finfo(torch.float32).eps
     zero_point_dtype = torch.int64
@@ -1571,7 +1569,13 @@ def get_weight_block_size(x):
     else:
         input_quant_func = _int8_asymm_per_token_quant
 
-    block_size = get_weight_block_size(weight)
+    if isinstance(config.granularity, PerTensor):
+        # Tensor granularity
+        block_size = weight.shape
+    else:
+        # Per row granularity
+        block_size = tuple([1 for _ in range(weight.dim() - 1)] + [weight.shape[-1]])
+
     if config.version == 1:
         warnings.warn(
             "Config Deprecation: version 1 of Int8DynamicActivationInt8WeightConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details"

torchao/quantization/quantize_/workflows/int8/int8_tensor.py

Lines changed: 37 additions & 34 deletions
@@ -5,11 +5,12 @@
 # LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass
-from typing import List, Optional
+from typing import Optional
 
 import torch
 from torch.utils._python_dispatch import return_and_correct_aliasing
 
+from torchao.float8.inference import _slice_scale_for_dimension
 from torchao.quantization.quant_primitives import (
     MappingType,
     _maybe_expand_scale_to_tensor_shape,
@@ -20,7 +21,7 @@
     QuantizeTensorKwargs,
     _choose_quant_func_and_quantize_tensor,
 )
-from torchao.utils import TorchAOBaseTensor
+from torchao.utils import TorchAOBaseTensor, fill_defaults
 
 __all__ = ["Int8Tensor", "QuantizeTensorToInt8Kwargs"]
 
@@ -32,11 +33,11 @@ class QuantizeTensorToInt8Kwargs(QuantizeTensorKwargs):
     """Tensor kwargs for creating int8 tensor (either activation or weight)
 
     Args:
-        block_size (List[int]): block size for quantization granularity
+        block_size (list[int]): block size for quantization granularity
         static_scale (Optional[torch.Tensor]): pre-computed scale for static quantization
     """
 
-    block_size: List[int]
+    block_size: list[int]
     static_scale: Optional[torch.Tensor] = None
 
 
@@ -64,7 +65,7 @@ def __new__(
         cls: type,
         qdata: torch.Tensor,
         scale: torch.Tensor,
-        block_size: List[int],
+        block_size: list[int],
         act_quant_kwargs=None,
         dtype=None,
     ):
@@ -73,13 +74,13 @@ def __new__(
             "dtype": dtype or scale.dtype,
             "requires_grad": False,
         }
-        return torch.Tensor._make_wrapper_subclass(cls, list(qdata.shape), **kwargs)
+        return torch.Tensor._make_wrapper_subclass(cls, qdata.shape, **kwargs)
 
     def __init__(
         self,
         qdata: torch.Tensor,
         scale: torch.Tensor,
-        block_size: List[int],
+        block_size: list[int],
         act_quant_kwargs=None,
        dtype=None,
     ):
@@ -99,13 +100,13 @@ def __repr__(self):
     def from_hp(
         cls,
         w: torch.Tensor,
-        block_size: List[int],
+        block_size: list[int],
         act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None,
     ):
         if w.dim() != 2 or len(block_size) != 2:
             raise ValueError("Expected 2D tensor and block_size length 2")
 
-        if act_quant_kwargs and act_quant_kwargs.static_scale is not None:
+        if act_quant_kwargs is not None and act_quant_kwargs.static_scale is not None:
             # INT8 × INT8 (static)
             scale = act_quant_kwargs.static_scale
             zero_point = torch.zeros_like(scale, dtype=torch.int8)
@@ -114,7 +115,7 @@ def from_hp(
             scale, zero_point = choose_qparams_affine(
                 input=w,
                 mapping_type=MappingType.SYMMETRIC,
-                block_size=tuple(block_size),
+                block_size=block_size,
                 target_dtype=torch.int8,
                 quant_min=-128,
                 quant_max=127,
@@ -124,12 +125,19 @@ def from_hp(
 
         int_data = quantize_affine(
             w,
-            block_size=tuple(block_size),
+            block_size=block_size,
             scale=scale,
             zero_point=zero_point,
             output_dtype=torch.int8,
         )
 
+        if tuple(block_size) == w.shape:
+            # per-tensor
+            scale = scale.expand(w.shape)
+        elif len(scale.shape) == 1:
+            # per-row, 1D -> 2D
+            scale = scale.unsqueeze(-1)
+
         return cls(
             int_data,
             scale,
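The new branch after quantize_affine normalizes the scale's shape. A plain-tensor sketch of the effect (shapes only, no torchao dependency):

# Sketch: per-tensor scalar scales expand to the weight's shape; per-row
# 1D scales of length N become (N, 1) so they broadcast over the K dim.
import torch

w = torch.randn(4, 8)
per_tensor_scale = torch.tensor(0.05).expand(w.shape)  # () -> (4, 8)
per_row_scale = torch.rand(4).unsqueeze(-1)            # (4,) -> (4, 1)
assert per_tensor_scale.shape == (4, 8)
assert per_row_scale.shape == (4, 1)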
@@ -208,37 +216,32 @@ def _(func, types, args, kwargs):
     return result + bias if bias is not None else result
 
 
-@implements([aten.slice.Tensor])
+@implements(aten.slice.Tensor)
 def _(func, types, args, kwargs):
     """Slice operation for Int8Tensor"""
-    tensor, dim, start, end, step = (
-        args[0],
-        args[1],
-        args[2],
-        args[3],
-        args[4] if len(args) > 4 else 1,
-    )
+    self, dim, start, end, step = fill_defaults(args, 5, [0, None, None, 1])
 
-    assert dim in (0, 1), f"Only dim 0 or 1 supported, got {dim}"
+    if step != 1:
+        raise NotImplementedError("Slicing with step > 1 is not supported")
 
-    if end >= tensor.shape[dim]:
-        end = tensor.shape[dim]
+    if end >= self.shape[dim]:
+        end = self.shape[dim]
 
-    # Always slice the qdata
-    sliced_qdata = func(tensor.qdata, dim, start, end, step)
+    sliced_qdata = aten.slice.Tensor(self.qdata, dim, start, end, step)
 
-    if tensor.scale.numel() == 1:
+    if self.scale.numel() == 1:
         # Per-tensor quantization - scale doesn't change
-        sliced_scale = tensor.scale
-    elif dim < tensor.scale.ndim and tensor.scale.shape[dim] > 1:
+        sliced_scale = self.scale
+    elif dim < self.scale.ndim and self.scale.shape[dim] > 1:
         # Block-wise quantization - need to slice the scale appropriately
-        sliced_scale = func(tensor.scale, dim, start, end, step)
+        sliced_scale = aten.slice.Tensor(self.scale, dim, start, end, step)
     else:
-        sliced_scale = tensor.scale
-
-    # adjust block_size since the shape has changed, block_size[i] should not be greater than shape[i]
-    block_size = list(tensor.block_size)
+        # Block-wise quantization - need to slice the scale appropriately
+        sliced_scale = _slice_scale_for_dimension(
+            self.scale, self.qdata.shape, dim, start, end, step
+        )
 
+    block_size = list(self.block_size)
     for i in range(len(block_size)):
         block_size[i] = min(block_size[i], sliced_qdata.shape[i])
 
@@ -250,8 +253,8 @@ def _(func, types, args, kwargs):
             sliced_qdata,
             sliced_scale,
             block_size,
-            tensor.act_quant_kwargs,
-            tensor.dtype,
+            self.act_quant_kwargs,
+            dtype=self.dtype,
         ),
     )
 
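The rewritten slice handler keeps qdata and scale consistent. A plain-tensor sketch of the semantics (illustration only; the real code dispatches through aten.slice.Tensor and _slice_scale_for_dimension):

# Sketch: per-tensor scale (numel == 1) is unchanged by slicing; a per-row
# scale is sliced along the same dim as the data. step != 1 now raises.
import torch

qdata = torch.randint(-128, 128, (8, 16), dtype=torch.int8)
row_scale = torch.rand(8, 1)                 # per-row scale, shape (N, 1)
per_tensor_scale = torch.tensor(0.05)        # scalar scale

dim, start, end = 0, 2, 6
sliced_q = qdata.narrow(dim, start, end - start)       # rows 2..5 of qdata
sliced_s = row_scale.narrow(dim, start, end - start)   # scale follows the rows
assert sliced_q.shape == (4, 16) and sliced_s.shape == (4, 1)
assert per_tensor_scale.numel() == 1                   # unchanged by slicing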
