pytorch · namgyu-youn · Sep 21, 2025 · Sep 22, 2025 · Sep 23, 2025 · Sep 24, 2025
diff --git a/docs/source/quantization_overview.rst b/docs/source/quantization_overview.rst
@@ -5,7 +5,7 @@ First we want to lay out the torchao stack::
 
   Quantization Algorithms/Flows: weight only/dynamic/static quantization, hqq, awq, gptq etc.
   ---------------------------------------------------------------------------------------------
-      Quantized Tensors (derived dtypes): Int4Tensor, Int4PreshuffledTensor, Float8Tensor
+      Quantized Tensors (derived dtypes): Int4Tensor, Int4PreshuffledTensor, Int8PlainInt8Tensor, Float8Tensor
   ---------------------------------------------------------------------------------------------
     Quantization Primitive Ops/Efficient Kernels: matmul, quantize, dequantize
   ---------------------------------------------------------------------------------------------
@@ -88,6 +88,10 @@ So in general we structure Tensor subclasses by dervied dtpype and packing forma
      - scaled int4
      - preshuffled (special format to optimize for loading)
      - float8 act + int4 weight dynamic quantization and int4 weight only quantization
+   * - Int8PlainInt8Tensor
+     - scaled int8
+     - plain
+     - int8 act + int8 weight quantization and int8 weight only quantization
 
 .. note::
    We don't have granularity specific tensor subclasses, i.e. no Float8RowwiseTensor or Float8BlockwiseTensor, all granularities are implemented in the same Tensor, we typically use a general `block_size` attribute to distinguish between different granularities, and each Tensor is allowed to support only a subset of all possible granularity options.

diff --git a/test/quantization/quantize_/workflows/int8/test_int8_tensor.py b/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
@@ -0,0 +1,73 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from torch.testing._internal.common_utils import run_tests
+
+from torchao.quantization.quantize_.workflows.int8.int8_tensor import (
+    Int8PlainInt8Tensor,
+)
+from torchao.quantization.utils import compute_error
+from torchao.testing.utils import TorchAOIntegrationTestCase
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+class TestInt8PlainInt8Tensor(TorchAOIntegrationTestCase):
+    """Tests for Int8PlainInt8Tensor: creation, linear ops and dequantization."""
+
+    def setUp(self):
+        """Create a seeded (4, 3) weight, a (2, 3) input and a length-4 bias."""
+        super().setUp()
+        torch.manual_seed(42)
+        self.weight_fp = torch.randn(4, 3, dtype=torch.float32)
+        self.input_fp = torch.randn(2, 3, dtype=torch.float32)
+        self.bias = torch.randn(4)
+        # One block per row: this tiles both the (4, 3) weight and the (2, 3)
+        # input, which are quantized with the same block_size below.
+        self.block_size = [1, 3]
+
+    def test_creation_and_attributes(self):
+        """Test tensor creation, dtypes, and ranges"""
+        tensor = Int8PlainInt8Tensor.from_hp(self.weight_fp, self.block_size)
+
+        self.assertEqual(tensor.shape, (4, 3))
+        self.assertEqual(tensor.qdata.dtype, torch.int8)
+        self.assertTrue(
+            torch.all(tensor.qdata >= -128) and torch.all(tensor.qdata <= 127)
+        )
+
+    def test_linear_operations(self):
+        """Test fp+int8 and int8+int8 linear ops with quantization error check"""
+        weight_q8 = Int8PlainInt8Tensor.from_hp(self.weight_fp, self.block_size)
+        input_q8 = Int8PlainInt8Tensor.from_hp(self.input_fp, self.block_size)
+
+        reference = torch.nn.functional.linear(self.input_fp, self.weight_fp, self.bias)
+        result_fp = torch.nn.functional.linear(self.input_fp, weight_q8, self.bias)
+        result_q8 = torch.nn.functional.linear(input_q8, weight_q8, self.bias)
+
+        self.assertEqual(result_fp.shape, reference.shape)
+        self.assertEqual(result_q8.shape, reference.shape)
+        # Both quantized results must stay close to the float reference.
+        self.assertTrue(compute_error(result_fp, reference) > 10)
+        self.assertTrue(compute_error(result_q8, reference) > 10)
+
+    def test_error_handling_and_dequant(self):
+        """Test input validation and dequantization accuracy"""
+        # Test 1D tensor validation
+        with self.assertRaises((AssertionError, ValueError, RuntimeError)):
+            Int8PlainInt8Tensor.from_hp(torch.randn(5), [1])
+
+        # Test wrong block_size validation
+        with self.assertRaises((AssertionError, ValueError, RuntimeError)):
+            Int8PlainInt8Tensor.from_hp(self.weight_fp, [1])
+
+        # Test dequantization with exact values
+        test_data = torch.tensor([[1.0, -1.0]], dtype=torch.float32)
+        tensor = Int8PlainInt8Tensor.from_hp(test_data, [1, 1])
+
+        dequantized = tensor.dequantize()
+        self.assertEqual(dequantized.shape, test_data.shape)
+        self.assertLess(torch.abs(dequantized - test_data).max().item(), 0.1)
+
+
+# Run the tests in this file when it is executed directly as a script.
+if __name__ == "__main__":
+    run_tests()
diff --git a/torchao/quantization/__init__.py b/torchao/quantization/__init__.py
@@ -95,6 +95,7 @@
     Int4PreshuffledTensor,
     Int4Tensor,
     Int4TilePackedTo4dTensor,
+    Int8PlainInt8Tensor,
     IntxOpaqueTensor,
     IntxUnpackedToInt8Tensor,
 )
@@ -168,6 +169,7 @@
     "IntxOpaqueTensor",
     "IntxUnpackedToInt8Tensor",
     "Int4TilePackedTo4dTensor",
+    "Int8PlainInt8Tensor",
     "Float8Tensor",
     "Int4OpaqueTensor",
     # smooth quant - subject to change

diff --git a/torchao/quantization/quantize_/workflows/__init__.py b/torchao/quantization/quantize_/workflows/__init__.py
@@ -1,3 +1,5 @@
+from .int8.int8_tensor import Int8PlainInt8Tensor
+
 from .float8.float8_tensor import (
     Float8Tensor,
     QuantizeTensorToFloat8Kwargs,
@@ -36,6 +38,7 @@
     "Int4MarlinSparseTensor",
     "Int4PlainInt32Tensor",
     "Int4TilePackedTo4dTensor",
+    "Int8PlainInt8Tensor",
     "Float8Tensor",
     "QuantizeTensorToFloat8Kwargs",
     "Int4OpaqueTensor",

diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
@@ -0,0 +1,106 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import torch
+
+from torchao.utils import TorchAOBaseTensor
+
+# Names exported by `from ... import *` on this module.
+__all__ = ["Int8PlainInt8Tensor"]
+
+# Shorthand for the ATen operator namespace referenced by the op overrides below.
+aten = torch.ops.aten
+
+
+# TODO: Implement block-wise quantization using block_size
+class Int8PlainInt8Tensor(TorchAOBaseTensor):
+    """
+    int8 quantized tensor with plain layout
+
+    Tensor Attributes:
+        qdata: (N, K) int8 quantized weight data
+        scale: scale factors for dequantization (one per row when created by
+            ``from_hp``)
+        zero_point: zero points for dequantization (all zeros when created by
+            ``from_hp``, which quantizes symmetrically)
+
+    Non-Tensor Attributes:
+        block_size: block size for quantization granularity (stored only;
+            ``from_hp`` currently always computes one scale per row)
+
+    The wrapper shape is not stored as an attribute: it is passed to the
+    constructor and defaults to ``qdata.shape``.
+    """
+
+    tensor_data_names = ["qdata", "scale", "zero_point"]
+    tensor_attribute_names = ["block_size"]
+
+    def __new__(cls, qdata, scale, zero_point, block_size, shape=None):
+        # ``shape`` is optional so the tensor can be rebuilt from just
+        # ``tensor_data_names`` + ``tensor_attribute_names``.
+        # NOTE(review): presumably TorchAOBaseTensor's generic helpers rebuild
+        # instances that way -- confirm against its implementation.
+        if shape is None:
+            shape = qdata.shape
+        kwargs = {"device": qdata.device, "dtype": scale.dtype, "requires_grad": False}
+        return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs)
+
+    def __init__(self, qdata, scale, zero_point, block_size, shape=None):
+        # ``shape`` is consumed by ``__new__``; it is accepted here only because
+        # Python passes the same arguments to both.
+        self.qdata = qdata
+        self.scale = scale
+        self.zero_point = zero_point
+        self.block_size = block_size
+
+    @classmethod
+    def from_hp(cls, w: torch.Tensor, block_size: list[int]):
+        """Quantize a 2D high-precision tensor to int8, one symmetric scale per row.
+
+        Args:
+            w: 2D tensor to quantize.
+            block_size: list of length 2. It is validated and stored on the
+                result, but not yet used to pick the quantization granularity.
+
+        Returns:
+            An ``Int8PlainInt8Tensor`` with the shape of ``w``; ``scale`` and
+            ``zero_point`` hold one entry per row of ``w``.
+
+        Raises:
+            ValueError: if ``w`` is not 2D or ``block_size`` is not of length 2.
+        """
+        if w.dim() != 2 or len(block_size) != 2:
+            raise ValueError("Expected 2D tensor and block_size length 2")
+
+        # Symmetric per-row scale: each row's largest magnitude maps to 127.
+        # The lower bound keeps an all-zero row from producing a zero scale
+        # (and a division by zero below).
+        scale = (w.abs().amax(dim=-1, keepdim=True) / 127.0).clamp(min=1e-6)
+
+        int_data = torch.round(w / scale).clamp(-128, 127).to(torch.int8)
+        scale = scale.squeeze(-1)
+
+        return cls(
+            int_data,
+            scale,
+            torch.zeros_like(scale, dtype=torch.int8),
+            block_size,
+            w.shape,
+        )
+
+
+# Decorator used below to register the op overrides for Int8PlainInt8Tensor.
+implements = Int8PlainInt8Tensor.implements
+
+
+@implements([aten.dequantize.self])
+def _(func, types, args, kwargs):
+    """dequantization: (qdata - zero_point) * scale, computed in the dtype of scale"""
+    qtensor = args[0]
+    out_dtype = qtensor.scale.dtype
+    # scale / zero_point get a trailing dim so they broadcast across the
+    # columns of the 2D qdata.
+    zero_point = qtensor.zero_point.to(out_dtype).unsqueeze(1)
+    centered = qtensor.qdata.to(out_dtype) - zero_point
+    return centered * qtensor.scale.unsqueeze(1)
+
+
+@implements([torch.nn.functional.linear, aten.linear.default])
+def _(func, types, args, kwargs):
+    """linear: input @ weight.T (+ bias) where input and/or weight is int8-quantized
+
+    When both operands are ``Int8PlainInt8Tensor`` the product is accumulated in
+    int32 and rescaled by the outer product of the two scales. Otherwise the
+    quantized operand is dequantized and the regular linear is used.
+
+    ``args`` holds ``(input, weight[, bias])``; ``bias`` may also be given as
+    the ``bias`` keyword. Returns the result tensor with ``bias`` added when
+    one is provided.
+    """
+    input_tensor, weight_tensor = args[0], args[1]
+    # bias can arrive positionally or as a keyword (``F.linear(x, w, bias=b)``);
+    # reading only ``args`` would silently drop a keyword bias.
+    if len(args) > 2:
+        bias = args[2]
+    else:
+        bias = (kwargs or {}).get("bias")
+
+    input_is_int8 = isinstance(input_tensor, Int8PlainInt8Tensor)
+    weight_is_int8 = isinstance(weight_tensor, Int8PlainInt8Tensor)
+
+    if input_is_int8 and weight_is_int8:
+        # INT8 × INT8
+        x_int32 = input_tensor.qdata.to(torch.int32)
+        w_int32 = weight_tensor.qdata.to(torch.int32).t()
+
+        result = torch.mm(x_int32.view(-1, x_int32.size(-1)), w_int32)
+        # (M, 1) * (1, N): combined scale for every output element
+        scale = input_tensor.scale.view(-1, 1) * weight_tensor.scale.unsqueeze(0)
+        result = result.to(scale.dtype) * scale
+        result = result.view(*input_tensor.shape[:-1], -1)
+    else:
+        # Only one operand is quantized (FP × INT8 or INT8 × FP): dequantize it
+        # so a plain weight is never asked for ``.qdata``.
+        if input_is_int8:
+            input_tensor = input_tensor.dequantize()
+        if weight_is_int8:
+            weight_tensor = weight_tensor.dequantize()
+        result = torch.nn.functional.linear(input_tensor, weight_tensor, None)
+
+    return result + bias if bias is not None else result
+
+
+# Make the class report the public `torchao.quantization` namespace as its module.
+Int8PlainInt8Tensor.__module__ = "torchao.quantization"
+# Allowlist the class so torch.load(weights_only=True) accepts it when unpickling.
+torch.serialization.add_safe_globals([Int8PlainInt8Tensor])