[low-bit optim] Add coat for float8 optimizer #1231

Draft · MirMustafaAli wants to merge 52 commits into main from add_coat_optimizer

Changes from 37 commits (52 commits total)
c62fcd3
added dynamic range expansion
MirMustafaAli Nov 6, 2024
b11b4f6
created optimstate with DRE class
MirMustafaAli Nov 6, 2024
b887e69
implement copy_.default for OptimStateFp8WithDynamicRangeExpansion cl…
MirMustafaAli Nov 6, 2024
ab02605
implements _to_copy
MirMustafaAli Nov 6, 2024
43f5c08
removed implemented classes
MirMustafaAli Nov 6, 2024
7d98b15
dynamic_range_expansion -> apply_dynamic_range_expansion
MirMustafaAli Nov 6, 2024
e03c79c
add DRE flags to class
MirMustafaAli Nov 6, 2024
5faa7de
implementing contraction for dequantize
MirMustafaAli Nov 6, 2024
b43c88f
copy k values as well for copy method
MirMustafaAli Nov 6, 2024
ac41627
added dynamic range expansion
MirMustafaAli Nov 6, 2024
79c9461
created optimstate with DRE class
MirMustafaAli Nov 6, 2024
47a7bb0
implement copy_.default for OptimStateFp8WithDynamicRangeExpansion cl…
MirMustafaAli Nov 6, 2024
a162f94
implements _to_copy
MirMustafaAli Nov 6, 2024
5c1a3f4
removed implemented classes
MirMustafaAli Nov 6, 2024
1458d65
dynamic_range_expansion -> apply_dynamic_range_expansion
MirMustafaAli Nov 6, 2024
42fbb09
add DRE flags to class
MirMustafaAli Nov 6, 2024
9d1c00c
implementing contraction for dequantize
MirMustafaAli Nov 6, 2024
7bc6ea4
copy k values as well for copy method
MirMustafaAli Nov 6, 2024
7be5a6b
Merge branch 'add_coat_optimizer' of https://github.com/MirMustafaAli…
MirMustafaAli Nov 8, 2024
70937c8
combine range_expansion into quantize_fp8 function
MirMustafaAli Nov 8, 2024
3583de7
passing apply_range_expansion to quantize_fp8
MirMustafaAli Nov 8, 2024
c754893
remove apply_dynamic_range_expansion method
MirMustafaAli Nov 8, 2024
c47b987
pass destination's dynamic range expasnsion variable to quantize fp8
MirMustafaAli Nov 8, 2024
7a754ce
change type annotation to optional
MirMustafaAli Nov 8, 2024
4d37d86
k is none when dynamic range expansion is False
MirMustafaAli Nov 9, 2024
2d1834a
referencing paper for calculation of dynamic range expansion
MirMustafaAli Nov 9, 2024
3d0d5d6
replaced condition check using variable k
MirMustafaAli Nov 9, 2024
c413ac4
added parameter dynamic_range_expansion
MirMustafaAli Nov 9, 2024
c3f5d29
pass bool condition for quantizing src tensor
MirMustafaAli Nov 9, 2024
1ec9335
readded the torchversion safe_global exports
MirMustafaAli Nov 9, 2024
122530e
initialize k to none and later assign value if dynamic range expansio…
MirMustafaAli Nov 9, 2024
77e1371
conditional statement by checking if k is None instead of directly ap…
MirMustafaAli Nov 9, 2024
366743c
checking if k is available in dst to copy it
MirMustafaAli Nov 9, 2024
38951ae
matching parameters counts with constructor of optimStateFp8
MirMustafaAli Nov 12, 2024
4b3fb6b
copy to k tensor only if k is not None
MirMustafaAli Nov 12, 2024
7185b00
passing k tensor if values are available
MirMustafaAli Nov 12, 2024
0d7edae
providing dynamic range expansion to the adamfloat8 class
MirMustafaAli Nov 12, 2024
58ff635
change of _subclass_zeros from static method to normal class method
MirMustafaAli Nov 12, 2024
6c536a9
added dynamic range expansion to adamwfp8
MirMustafaAli Nov 12, 2024
767ccab
adding smoke test for additional parameters for float8 optimizers
MirMustafaAli Nov 12, 2024
8fa5e3d
added new line
MirMustafaAli Nov 13, 2024
f34bfdd
remove newline
MirMustafaAli Nov 13, 2024
41598a0
removed optim_addon parameter
MirMustafaAli Nov 13, 2024
c189dc7
rename test_optim_addon to test_optim_fp8_coat_smoke
MirMustafaAli Nov 13, 2024
6bb49ea
code formatting
MirMustafaAli Nov 13, 2024
6707425
Merge branch 'main' into add_coat_optimizer
MirMustafaAli Nov 13, 2024
b1aea26
Moved device compatibility check for FP8 optimizer tests from pytest …
MirMustafaAli Nov 13, 2024
92ca7b2
formatting for `ruff check F,I`
MirMustafaAli Nov 13, 2024
861423d
removing duplicate
MirMustafaAli Nov 13, 2024
7661b61
checking if device is cuda before calling device capability
MirMustafaAli Nov 13, 2024
e1fa683
Updating Readme with dynamic range Expansion and Reference to Paper
MirMustafaAli Nov 13, 2024
62eac8b
Merge branch 'main' into add_coat_optimizer
MirMustafaAli Nov 15, 2024
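The commits above add dynamic range expansion (DRE) from the COAT paper (https://arxiv.org/abs/2410.19313) to the FP8 optimizer state: before a block of optimizer state is quantized to E4M3, every value is raised to a per-block exponent k chosen so that the block's dynamic range matches the dynamic range of the FP8 format, and dequantization applies the inverse exponent 1/k. Below is a minimal, self-contained sketch of that expand → quantize → dequantize → contract round trip on a single block. It mirrors the formulas in the diff, but the format's dynamic range is taken here as the ratio of the largest to the smallest positive normal E4M3 value (an assumption on my part), and the helper name and toy tensor are illustrative only.

```python
import torch

DTYPE = torch.float8_e4m3fn  # same storage dtype the PR targets


def dre_roundtrip(block: torch.Tensor) -> torch.Tensor:
    """Expand -> quantize to FP8 -> dequantize -> contract a single 1-D block."""
    finfo = torch.finfo(DTYPE)
    # Dynamic range of the FP8 format; taken here as max / smallest positive normal.
    r_dtype = finfo.max / finfo.smallest_normal
    # Dynamic range of the block itself (clipped to avoid division by zero).
    r_x = block.abs().max().clamp(min=1e-12) / block.abs().min().clamp(min=1e-12)
    # Per-block expansion exponent k = log(R_dtype) / log(R_x), as in the paper.
    k = torch.log(torch.tensor(r_dtype)) / torch.log(r_x)
    expanded = block.sign() * block.abs() ** k

    # Plain per-block absmax FP8 quantization, mirroring quantize_fp8 below.
    scale = expanded.abs().max().clamp(min=1e-12) / finfo.max
    codes = (expanded / scale).to(DTYPE)

    # Dequantize, then undo the expansion with exponent 1/k.
    deq = codes.float() * scale
    return deq.sign() * deq.abs() ** (1.0 / k)


block = torch.randn(256) * 1e-3  # toy optimizer-state block
print((dre_roundtrip(block) - block).abs().max())  # small reconstruction error
```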
22 changes: 20 additions & 2 deletions torchao/prototype/low_bit_optim/adam.py
@@ -253,6 +253,7 @@ def __init__(
*,
block_size=256,
bf16_stochastic_round=False,
dynamic_range_expansion=False,
) -> None:
super().__init__(
params,
@@ -265,11 +266,28 @@
bf16_stochastic_round=bf16_stochastic_round,
is_adamw=False,
)
self.dynamic_range_expansion = dynamic_range_expansion

@staticmethod
def _subclass_zeros(p: Tensor, signed: bool, block_size: int):
return OptimStateFp8.zeros(p.shape, block_size, p.device)
def _subclass_zeros(p: Tensor, signed: bool, block_size: int, dynamic_range_expansion: bool):
return OptimStateFp8.zeros(p.shape, block_size, p.device, dynamic_range_expansion)

def _new_buffer(self, p: Tensor, signed: bool):
if p.numel() >= 4096 and p.numel() % self.block_size == 0:
if isinstance(p, DTensor):
out = DTensor.from_local(
local_tensor=self._subclass_zeros(
p.to_local(), signed, self.block_size, self.dynamic_range_expansion
),
device_mesh=p.device_mesh,
placements=p.placements,
run_check=False,
)
else:
out = self._subclass_zeros(p, signed, self.block_size, self.dynamic_range_expansion)
else:
out = torch.zeros_like(p)
return out

class AdamW8bit(_AdamBase):
def __init__(
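For context, here is how the new flag would be passed through from user code. The diff above only touches the shared `_AdamBase` constructor and `_new_buffer`; the optimizer class name `AdamFp8`, its import path, and the CUDA device are assumptions based on the existing low-bit-optim prototype and the commit messages ("providing dynamic range expansion to the adamfloat8 class"), so treat this as a sketch rather than the PR's documented API.

```python
import torch
from torchao.prototype.low_bit_optim import AdamFp8  # class name assumed from the commits above

model = torch.nn.Linear(4096, 4096).cuda()

# With dynamic_range_expansion=True the FP8 optimizer state stores a per-block
# exponent k (COAT dynamic range expansion) alongside the usual codes and scale.
optim = AdamFp8(model.parameters(), lr=1e-3, dynamic_range_expansion=True)

loss = model(torch.randn(8, 4096, device="cuda")).sum()
loss.backward()
optim.step()
optim.zero_grad()
```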
51 changes: 41 additions & 10 deletions torchao/prototype/low_bit_optim/subclass_fp8.py
@@ -1,5 +1,6 @@
import torch
from torch import Tensor
from typing import Optional
from torch.utils._python_dispatch import return_and_correct_aliasing

from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, TorchAOBaseTensor
@@ -11,13 +12,27 @@
DTYPE = torch.float8_e4m3fn


def quantize_fp8(input: Tensor, block_size: int):
def quantize_fp8(input: Tensor, block_size: int, dynamic_range_expansion: bool):

shape = input.shape
input = input.view(-1, block_size)
k = None

if dynamic_range_expansion:
# NOTE: the calculation is from the paper https://arxiv.org/abs/2410.19313
# The idea is to align optimizer state distributions more closely
# with the FP8 representation range, reducing the quantization error.
k = aten.ones(input.shape[0], dtype=DTYPE, device=input.device)
Rdtype = torch.finfo(DTYPE).max / torch.finfo(DTYPE).min  # dynamic range of the FP8 dtype
Rx = input.abs().amax(-1).clip(1e-12) / input.abs().amin(-1).clip(1e-12)  # per-block ratio of input max to input min
k = torch.log(Rdtype) / torch.log(Rx)  # per-block expansion exponent k
input = input.sign() * (input.abs() ** k.view(-1, 1))

scale = input.abs().amax(-1).clip(1e-12) / torch.finfo(DTYPE).max
input = input / scale.view(-1, 1)
codes = input.to(DTYPE).view(-1)
return codes.view(shape), scale

return codes.view(shape), scale, k


# NOTE: FP8 sign bit is redundant for unsigned optim state.
Expand All @@ -27,10 +42,10 @@ class OptimStateFp8(TorchAOBaseTensor):
tensor_attrs = ["codes", "scale"]

@staticmethod
def __new__(cls, codes: Tensor, scale: Tensor):
def __new__(cls, codes: Tensor, scale: Tensor, k: Optional[Tensor] = None):
return Tensor._make_wrapper_subclass(cls, codes.shape, device=codes.device)

def __init__(self, codes: Tensor, scale: Tensor):
def __init__(self, codes: Tensor, scale: Tensor, k: Optional[Tensor] = None):
"""Create quantized FP8 optimizer state.

Args
Expand All @@ -45,6 +60,7 @@ def __init__(self, codes: Tensor, scale: Tensor):
assert scale.ndim == 1
self.codes = codes
self.scale = scale
self.k = k
self.block_size = codes.numel() // scale.numel()

def __tensor_flatten__(self):
Expand All @@ -62,15 +78,22 @@ def dequantize(self, output_dtype=None):
float_data = self.codes.float()
float_data = float_data.view(-1, self.block_size) * self.scale.view(-1, 1)

if self.k is not None:
float_data = float_data.view(-1, self.block_size)
float_data = float_data ** (1 / self.k.view(-1, 1))

if output_dtype is not None:
float_data = float_data.to(output_dtype)

return float_data.view(self.codes.shape)

@classmethod
def zeros(cls, shape, block_size: int = 256, device=None):
def zeros(cls, shape, block_size: int = 256, device=None, dynamic_range_expansion: bool = False):

codes = torch.zeros(shape, dtype=DTYPE, device=device)
scale = torch.zeros(codes.numel() // block_size, device=device)
return cls(codes, scale)
k = torch.ones(codes.numel() // block_size, device=device) if dynamic_range_expansion else None
return cls(codes, scale, k)

def __repr__(self):
return (
Expand All @@ -88,12 +111,18 @@ def _(func, types, args, kwargs):
assert dst.block_size == src.block_size
dst.codes.copy_(src.codes)
dst.scale.copy_(src.scale)
if dst.k is not None:
dst.k.copy_(src.k)

elif isinstance(dst, OptimStateFp8):
codes, scale = quantize_fp8(src, dst.block_size)

codes, scale, k = quantize_fp8(src, dst.block_size, dst.k is not None)

dst.codes.copy_(codes)
dst.scale.copy_(scale)


if dst.k is not None:
dst.k.copy_(k)
else:
dst.copy_(src.dequantize())

Expand All @@ -107,6 +136,7 @@ def _(func, types, args, kwargs):
out = OptimStateFp8(
args[0].codes.to(device=device),
args[0].scale.to(device=device),
args[0].k.to(device=device) if args[0].k is not None else None
)
return return_and_correct_aliasing(func, args, kwargs, out)

Expand All @@ -121,7 +151,7 @@ def _(func, types, args, kwargs):
@OptimStateFp8.implements(aten.view.default)
def _(func, types, args, kwargs):
x, shape = args
return OptimStateFp8(x.codes.view(shape), x.scale)
return OptimStateFp8(x.codes.view(shape), x.scale, x.k)


# this is needed for DTensor.full_tensor()
Expand All @@ -142,10 +172,11 @@ def _(func, types, args, kwargs):
return OptimStateFp8(
func(x.codes, *args[1:], **kwargs),
func(x.scale, *args[1:], **kwargs),
func(x.k, *args[1:], **kwargs) if x.k is not None else None
)


if TORCH_VERSION_AT_LEAST_2_5:
from torch.serialization import add_safe_globals

add_safe_globals([OptimStateFp8])
add_safe_globals([OptimStateFp8])
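The `add_safe_globals([OptimStateFp8])` registration at the end of the diff is what lets checkpoints containing this tensor subclass be loaded with `torch.load(..., weights_only=True)` on PyTorch >= 2.5. A minimal sketch of the save/load round trip that registration enables is shown below; the optimizer class name and the checkpoint filename are assumptions, as above.

```python
import torch
from torchao.prototype.low_bit_optim import AdamFp8  # assumed, as above

model = torch.nn.Linear(4096, 4096).cuda()
optim = AdamFp8(model.parameters(), dynamic_range_expansion=True)

model(torch.randn(2, 4096, device="cuda")).sum().backward()
optim.step()

# codes, scale and (with DRE) k travel inside OptimStateFp8 tensors in the state dict.
torch.save(optim.state_dict(), "optim.pt")

# weights_only=True only deserializes OptimStateFp8 because it was registered
# via add_safe_globals above (PyTorch >= 2.5).
state = torch.load("optim.pt", weights_only=True)
optim.load_state_dict(state)
```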