pytorch · weifengpy · Oct 1, 2024 · Sep 19, 2024 · Sep 19, 2024 · Sep 19, 2024
diff --git a/test/float8/test_base.py b/test/float8/test_base.py
@@ -15,6 +15,9 @@
 
 import torch
 import torch.nn as nn
+from torchao.float8.float8_scaling_utils import (
+    hp_tensor_to_float8_dynamic,
+)
 
 from torchao.utils import TORCH_VERSION_AT_LEAST_2_5
 
@@ -604,6 +607,40 @@ def test_small_amax_float16(self, float8_dtype):
         x = torch.tensor([target_amax], dtype=torch.float16, device="cuda")
         scale = tensor_to_scale(x, float8_dtype)
         assert not torch.any(torch.isinf(scale))
+
+    @unittest.skipIf(not is_cuda_8_9, "CUDA 8.9 not available")
+    @pytest.mark.parametrize(
+        "dtype", [torch.float32, torch.bfloat16, torch.float16]
+    )
+    def test_dynamic_scale_parity(self, dtype: torch.dtype):
+        """Eager and torch.compile dynamic float8 casts must match exactly.
+
+        Casts the same seeded high-precision tensor through
+        hp_tensor_to_float8_dynamic in eager mode and under torch.compile,
+        then requires equal `_scale` and `_data` from both paths.
+        """
+        torch.manual_seed(0)  # fixed seed so the random input is reproducible
+        hp_tensor = torch.randn(768, 32, device="cuda", dtype=dtype)
+        # Only the weight cast config is passed explicitly.
+        float8_config = Float8LinearConfig(
+            cast_config_weight=CastConfig(scaling_type=ScalingType.DYNAMIC),
+        )
+
+        def cast(fn):
+            # Same arguments for both paths; only the callable differs.
+            return fn(
+                hp_tensor,
+                torch.float8_e4m3fn,
+                float8_config,
+                gemm_input_role=GemmInputRole.WEIGHT,
+            )
+
+        float8_eager = cast(hp_tensor_to_float8_dynamic)
+        float8_compile = cast(torch.compile(hp_tensor_to_float8_dynamic))
+
+        # torch.equal is exact: any eager/compile mismatch fails the test.
+        assert torch.equal(float8_eager._scale, float8_compile._scale)
+        assert torch.equal(float8_eager._data, float8_compile._data)
 
 
 class TestFloat8LinearUtils(unittest.TestCase):

diff --git a/torchao/float8/float8_tensor.py b/torchao/float8/float8_tensor.py
@@ -163,7 +163,8 @@ def forward(
 
         DTensor Invariant: DTensor must always be the outer most tensor subclass
         """
-        tensor_scaled = tensor * scale
+        # scale is float32; upcast the tensor so the multiply runs in float32
+        tensor_scaled = tensor.to(torch.float32) * scale
         bits_fp8 = to_fp8_saturated(tensor_scaled, float8_dtype)
 
         if isinstance(bits_fp8, DTensor):

diff --git a/torchao/float8/float8_utils.py b/torchao/float8/float8_utils.py
@@ -42,6 +42,8 @@ def amax_to_scale(
         float8_dtype: The float8 dtype.
         orig_dtype: The original dtype of the tensor.
     """
+    # _scaled_mm requires float32 scale
+    amax = amax.to(torch.float32)
     if float8_dtype in FP8_TYPES:
         res = torch.finfo(float8_dtype).max / torch.clamp(amax, min=EPS)
     else:
@@ -52,7 +54,7 @@ def amax_to_scale(
     # to care about this for float32/bfloat16.
     if orig_dtype is torch.float16:
         res = torch.clamp(res, max=torch.finfo(torch.float16).max)
-    return res.to(torch.float32)
+    return res
 
 
 @torch.no_grad()