pytorch · namgyu-youn · Oct 24, 2025 · Oct 26, 2025 · Oct 26, 2025 · Oct 31, 2025
diff --git a/docs/source/quantization_overview.rst b/docs/source/quantization_overview.rst
@@ -5,7 +5,7 @@ First we want to lay out the torchao stack::
 
   Quantization Algorithms/Flows: weight only/dynamic/static quantization, hqq, awq, gptq etc.
   ---------------------------------------------------------------------------------------------
-      Quantized Tensors (derived dtypes): Int4Tensor, Int4PreshuffledTensor, Float8Tensor
+      Quantized Tensors (derived dtypes): Int4Tensor, Int4PreshuffledTensor, Int8Tensor, Float8Tensor
   ---------------------------------------------------------------------------------------------
     Quantization Primitive Ops/Efficient Kernels: matmul, quantize, dequantize
   ---------------------------------------------------------------------------------------------
@@ -88,6 +88,8 @@ So in general we structure Tensor subclasses by dervied dtpype and packing forma
      - scaled int4
      - preshuffled (special format to optimize for loading)
      - float8 act + int4 weight dynamic quantization and int4 weight only quantization
+   * - Int8Tensor
+     - plain
 
 .. note::
    We don't have granularity specific tensor subclasses, i.e. no Float8RowwiseTensor or Float8BlockwiseTensor, all granularities are implemented in the same Tensor, we typically use a general `block_size` attribute to distinguish between different granularities, and each Tensor is allowed to support only a subset of all possible granularity options.

diff --git a/test/quantization/quantize_/workflows/int8/test_int8_tensor.py b/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
@@ -0,0 +1,235 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+import unittest
+
+import torch
+from torch.testing._internal import common_utils
+
+from torchao.quantization import (
+    Int8DynamicActivationInt8WeightConfig,
+    Int8WeightOnlyConfig,
+    quantize_,
+)
+from torchao.quantization.quantize_.workflows.int8.int8_tensor import (
+    Int8Tensor,
+)
+from torchao.quantization.utils import compute_error
+from torchao.testing.utils import TorchAOIntegrationTestCase
+
+
+# TODO: Refactor after https://github.com/pytorch/ao/pull/2729 is merged
+class ToyTwoLinearModel(torch.nn.Module):  # two stacked nn.Linear layers, no activation
+    def __init__(
+        self,
+        input_dim,
+        hidden_dim,
+        output_dim,
+        has_bias=False,
+        dtype=None,
+        device=None,
+    ):
+        super().__init__()
+        self.dtype = dtype  # stored exactly as passed to the constructor (None by default)
+        self.device = device  # stored exactly as passed to the constructor (None by default)
+        self.linear1 = torch.nn.Linear(
+            input_dim, hidden_dim, bias=has_bias, dtype=dtype, device=device
+        )
+        self.linear2 = torch.nn.Linear(
+            hidden_dim, output_dim, bias=has_bias, dtype=dtype, device=device
+        )
+
+    def forward(self, x):
+        x = self.linear1(x)  # input_dim -> hidden_dim
+        x = self.linear2(x)  # hidden_dim -> output_dim
+        return x
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+@common_utils.instantiate_parametrized_tests
+class TestInt8Tensor(TorchAOIntegrationTestCase):
+    def setUp(self):
+        super().setUp()
+
+        self.test_shape = (4, 3)
+        self.dtype = torch.bfloat16
+        self.batch_size = 32
+
+        torch.manual_seed(42)  # fixed seed so the random fixtures are reproducible
+        self.weight_fp = torch.randn(*self.test_shape, dtype=self.dtype)  # no device given
+        self.input_fp = torch.randn(*self.test_shape, dtype=self.dtype)  # NOTE(review): unused in the tests visible here
+        self.bias = torch.randn(self.test_shape[0], dtype=self.dtype)  # NOTE(review): unused in the tests visible here
+        self.block_size = list(self.test_shape)  # block size equals the full tensor shape
+
+    def test_creation_and_attributes(self):
+        """Check that from_hp keeps the shape and stores int8 qdata within [-128, 127]."""
+        tensor = Int8Tensor.from_hp(self.weight_fp, self.block_size)
+
+        self.assertEqual(tensor.shape, self.test_shape)
+        self.assertEqual(tensor.qdata.dtype, torch.int8)
+        self.assertTrue(  # NOTE(review): always holds for int8 storage; sanity check only
+            torch.all(tensor.qdata >= -128) and torch.all(tensor.qdata <= 127)
+        )
+
+    @common_utils.parametrize("dtype", [torch.bfloat16, torch.float16])
+    @common_utils.parametrize(
+        "sizes",
+        [
+            ((128,), 256, 128),
+        ],
+    )
+    @common_utils.parametrize(
+        "config",
+        [
+            Int8DynamicActivationInt8WeightConfig(version=2),
+            Int8WeightOnlyConfig(version=2),
+        ],
+    )
+    def test_int8_linear_quantization_accuracy(
+        self,
+        dtype: torch.dtype,
+        sizes: tuple,
+        config,
+    ):
+        """Check a quantized two-layer model stays above 20 dB SQNR vs. the original."""
+        M, N, K = sizes  # M is a tuple of leading input dims; N is the hidden width
+        input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
+
+        # Reference model (two linear layers, K -> N -> K) and a copy to quantize
+        m = ToyTwoLinearModel(K, N, K).eval().to(dtype).to("cuda")
+        m_q = copy.deepcopy(m)
+
+        # Quantize only the copy; `m` stays in high precision
+        quantize_(m_q, config)
+
+        output_original = m(input_tensor)
+        output_quantized = m_q(input_tensor)
+
+        error = compute_error(output_original, output_quantized)
+        self.assertGreater(  # unittest assertion: not stripped under `python -O`
+            error, 20, f"Quantization quality is too low, SQNR: {error}dB"
+        )
+
+    @common_utils.parametrize("dtype", [torch.bfloat16, torch.float16])
+    @common_utils.parametrize(
+        "config",
+        [
+            Int8DynamicActivationInt8WeightConfig(version=2),
+            Int8WeightOnlyConfig(version=2),
+        ],
+    )
+    def test_per_row_scale_shape(self, dtype, config):
+        """Weight scale is 1D [N] for the dynamic config and one element for weight-only."""
+        N, K = 64, 128
+        linear = torch.nn.Linear(K, N, bias=False, dtype=dtype, device="cuda")
+        quantize_(linear, config)
+
+        # Dynamic: per-row (1D scale [N]), Weight-only: per-tensor (scalar)
+        if isinstance(config, Int8DynamicActivationInt8WeightConfig):
+            self.assertEqual(linear.weight.scale.shape, (N,))
+            self.assertEqual(linear.weight.scale.ndim, 1)
+        else:
+            self.assertEqual(linear.weight.scale.numel(), 1)
+
+    @common_utils.parametrize("dtype", [torch.bfloat16, torch.float16])
+    @common_utils.parametrize("has_bias", [True, False])
+    def test_weight_only_linear_with_bias(self, dtype, has_bias):
+        """Test weight-only quantization with and without bias"""
+        K, N = 128, 64
+        linear = torch.nn.Linear(K, N, bias=has_bias, dtype=dtype, device="cuda")
+        input_tensor = torch.randn(self.batch_size, K, dtype=dtype, device="cuda")
+
+        output_fp = linear(input_tensor)  # reference output, taken before quantize_
+
+        quantize_(linear, Int8WeightOnlyConfig(version=2))
+        output_q = linear(input_tensor)  # same module, now quantized
+
+        self.assertEqual(output_q.shape, output_fp.shape)
+        error = compute_error(output_fp, output_q)
+        self.assertGreater(error, 20)  # same threshold as the accuracy test (SQNR in dB)
+
+    @common_utils.parametrize(
+        "config",
+        [
+            Int8DynamicActivationInt8WeightConfig(version=2),
+            Int8WeightOnlyConfig(version=2),
+        ],
+    )
+    @common_utils.parametrize("device", ["cpu", "cuda"])
+    @common_utils.parametrize("dtype", [torch.bfloat16, torch.float16])
+    def test_slice(self, config, device, dtype):
+        """Check narrow() slices qdata and scale consistently; strided slicing raises."""
+        tensor_size = 256
+        slice_sizes = (64, 128)
+
+        dummy = torch.nn.Linear(
+            tensor_size, tensor_size, bias=False, dtype=dtype, device=device
+        )
+        quantize_(dummy, config)
+
+        weight1 = dummy.weight.clone().narrow(0, 0, slice_sizes[0])
+        weight2 = dummy.weight.clone().narrow(1, 0, slice_sizes[1])
+
+        self.assertEqual(weight1.qdata, dummy.weight.qdata.narrow(0, 0, slice_sizes[0]))
+        self.assertEqual(weight2.qdata, dummy.weight.qdata.narrow(1, 0, slice_sizes[1]))
+
+        # Int8DynamicActivationInt8WeightConfig uses per-row (PerRow)
+        # Int8WeightOnlyConfig uses per-tensor (PerTensor)
+        if isinstance(config, Int8DynamicActivationInt8WeightConfig):
+            # PerRow: dim 0 slicing affects scale, dim 1 doesn't
+            self.assertEqual(
+                weight1.scale, dummy.weight.scale.narrow(0, 0, slice_sizes[0])
+            )
+            self.assertEqual(weight2.scale, dummy.weight.scale)
+        else:
+            # PerTensor: scale unchanged by slicing
+            self.assertEqual(weight1.scale, dummy.weight.scale)
+            self.assertEqual(weight2.scale, dummy.weight.scale)
+        with self.assertRaises(NotImplementedError):
+            _ = dummy.weight[::2]  # step > 1 slicing is expected to be unsupported
+
+    def test_index_select(self):
+        """test that `x_0 = x[0]` works when `x` is a 2D `Int8Tensor`."""
+        N, K = 256, 512
+        x = torch.randn(N, K, device="cuda", dtype=torch.bfloat16)
+        x_int8 = Int8Tensor.from_hp(x, block_size=[N, K])  # block size equals the full shape
+        x_int8_0 = x_int8[0]
+        torch.testing.assert_close(
+            x_int8.dequantize()[0], x_int8_0.dequantize(), atol=0, rtol=0  # exact match
+        )
+
+    def test_invalid_input_handling(self):
+        """Test input validation with specific error types"""
+        invalid_tensor = torch.randn(5)  # 1-D input
+        incompatible_block_size = [1]  # NOTE(review): same rank as the 1-D tensor; confirm what from_hp rejects here
+
+        with self.assertRaises(
+            ValueError, msg="Should reject incompatible tensor dimensions"
+        ):
+            Int8Tensor.from_hp(invalid_tensor, incompatible_block_size)
+
+        with self.assertRaises(
+            ValueError, msg="Should reject mismatched block size dimensions"
+        ):
+            Int8Tensor.from_hp(self.weight_fp, [1])  # 2-D weight with a 1-element block size
+
+    def test_dequantization_accuracy(self):
+        """Check the quantize/dequantize round trip of [[1, -1]] stays within 0.1."""
+        test_data = torch.tensor([[1.0, -1.0]], dtype=torch.bfloat16)
+        tensor = Int8Tensor.from_hp(test_data, [1, 2])  # block size equals the full 1x2 shape
+
+        dequantized = tensor.dequantize()
+        self.assertEqual(dequantized.shape, test_data.shape)
+        self.assertLess(
+            torch.abs(dequantized - test_data).max().item(),  # max absolute error
+            0.1,
+            msg=f"Dequantization error exceeds tolerance of {0.1}",
+        )
+
+
+if __name__ == "__main__":
+    common_utils.run_tests()
diff --git a/torchao/float8/inference.py b/torchao/float8/inference.py
@@ -140,7 +140,18 @@ def _slice_scale_for_dimension(
     """
     aten = torch.ops.aten
 
-    # Unsupported case for now, this would be 1 scale per data element
+    # Per-tensor quantization (scalar scale)
+    if scale.numel() == 1:
+        return scale
+
+    # Per-row quantization (1D scale)
+    if scale.ndim == 1:
+        if dim == 0:
+            return aten.slice.Tensor(scale, 0, start, end, step)
+        else:
+            return scale
+
+    # Block-wise quantization (2D scale)
     if scale.shape == data_shape:
         return aten.slice.Tensor(scale, dim, start, end, step)
 
@@ -158,6 +169,12 @@ def _slice_scale_for_dimension(
         # Slice away as normal
         return aten.slice.Tensor(scale, dim, start, end, step)
     else:
+        # Error on Step > 1
+        if step > 1:
+            raise NotImplementedError(
+                "Slicing with step > 1 is not implemented for scale tensors."
+            )
+
         # There is blocking in this dimension
         # Calculate which scale elements correspond to the sliced data
         scale_start = start // block_size_for_dim if start is not None else None
@@ -167,12 +184,6 @@ def _slice_scale_for_dimension(
             else None
         )
 
-        # Error on Step > 1
-        if step > 1:
-            raise NotImplementedError(
-                "Slicing with step > 1 is not implemented for scale tensors."
-            )
-
         return aten.slice.Tensor(scale, dim, scale_start, scale_end, 1)
 
 

diff --git a/torchao/quantization/__init__.py b/torchao/quantization/__init__.py
@@ -97,6 +97,7 @@
     Int4PreshuffledTensor,
     Int4Tensor,
     Int4TilePackedTo4dTensor,
+    Int8Tensor,
     IntxOpaqueTensor,
     IntxUnpackedToInt8Tensor,
 )
@@ -168,6 +169,7 @@
     "IntxOpaqueTensor",
     "IntxUnpackedToInt8Tensor",
     "Int4TilePackedTo4dTensor",
+    "Int8Tensor",
     "Float8Tensor",
     "Int4OpaqueTensor",
     # smooth quant - subject to change