
[float8] improve eager numerics for dynamic scales and gets on par with torch.compile #904

Merged
43 commits, merged on Oct 1, 2024
Changes from 10 commits

Commits (43)
6bf0f5c
[float8] improve eager numerics for dynamic scales
weifengpy Sep 19, 2024
553687f
leave torch.linalg.vector_norm for another PR
weifengpy Sep 19, 2024
19a592d
cuda
weifengpy Sep 19, 2024
218290e
remove _data and investigate
weifengpy Sep 19, 2024
24ec914
remove _data comment
weifengpy Sep 19, 2024
c099486
upcast to float32 is enough
weifengpy Sep 21, 2024
b93ffc8
explain why float32
weifengpy Sep 21, 2024
ebff416
_data parity
weifengpy Sep 21, 2024
8978ab2
handle sm8.9
weifengpy Sep 21, 2024
f17dc12
fix transformer unit test
weifengpy Sep 22, 2024
511c751
print if error
weifengpy Sep 26, 2024
9becda1
Add tutorial for trainable tensor subclass (#908)
andrewor14 Sep 20, 2024
e4fdca9
Introducing 1-bit quantization for Llama in torchchat (#910)
vaishnavi17 Sep 20, 2024
0cd4d37
Rename Floating point to fp8 (#909)
jainapurva Sep 20, 2024
014558d
[float8] fix typo in bitwise_identical unit test (#918)
weifengpy Sep 23, 2024
3267402
Adding example for quantized tensor + tensor parallelism (#785)
jerryzh168 Sep 23, 2024
1e07eff
rename cuda mode -> gpu mode (#925)
msaroufim Sep 24, 2024
ebdeed0
Add workaround to recover the perf for quantized vit in torch.compile…
jerryzh168 Sep 24, 2024
09ffa22
clean up device checks in float8 unit test files (#923)
vkuzo Sep 24, 2024
0b8dd85
[low-bit optim] Change 8-bit and FP8 optim block size from 2048 to 25…
gau-nernst Sep 24, 2024
87faf04
Float8 autoquant weight only (#866)
jainapurva Sep 24, 2024
3a9fdb0
Fix failing FP6 benchmark (#931)
tobiasvanderwerff Sep 25, 2024
fc6c393
Remove two if statements in fp8 padding (#935)
y-sq Sep 25, 2024
0043ace
[Distributed] Improve sharding example (#937)
kwen2501 Sep 25, 2024
ab3435c
Add composable QAT quantizer (#938)
andrewor14 Sep 25, 2024
a05a40f
resolve conflict with latest main
weifengpy Sep 26, 2024
334891b
Add torchchat quantizer
metascroy Sep 25, 2024
c706139
Add compile tests to test suite (#906)
jerryzh168 Sep 26, 2024
93554c0
Fix up CMakeLists and reorganize some code locations
metascroy Sep 26, 2024
efd9bb9
[float8] all-reduce amax on dp mesh instead of global pg (#933)
weifengpy Sep 26, 2024
85126cc
int8 dynamic quant + bsr support (#821)
jcaip Sep 26, 2024
a5a426e
fixing some issues with our support for 70/405B models (#941)
HDCharles Sep 26, 2024
e7270f1
Update INT8 mixed-precision training test to be less flaky (#950)
gau-nernst Sep 26, 2024
352685c
Add executorch parallel
metascroy Sep 26, 2024
168cfe9
Merge branch 'weifengpy-dynamic_scale_numerics' into dynamic_scale_nu…
weifengpy Sep 26, 2024
5900c3e
Merge branch 'main' into dynamic_scale_numerics
weifengpy Sep 26, 2024
37e1479
test CI
weifengpy Sep 26, 2024
2efde49
better comment on why upcasting
weifengpy Sep 26, 2024
8c04f4f
control seed
weifengpy Sep 26, 2024
04b229b
move unit test to test_compile
weifengpy Sep 26, 2024
8b7c2ef
fix typo
weifengpy Sep 26, 2024
9346afd
float64 upcasting after allreduce
weifengpy Sep 27, 2024
3d0da20
use LinearMMConfig
weifengpy Sep 30, 2024
37 changes: 37 additions & 0 deletions test/float8/test_base.py
@@ -15,6 +15,9 @@

import torch
import torch.nn as nn
from torchao.float8.float8_scaling_utils import (
hp_tensor_to_float8_dynamic,
)

from torchao.utils import TORCH_VERSION_AT_LEAST_2_5

@@ -604,6 +607,40 @@ def test_small_amax_float16(self, float8_dtype):
x = torch.tensor([target_amax], dtype=torch.float16, device="cuda")
scale = tensor_to_scale(x, float8_dtype)
assert not torch.any(torch.isinf(scale))

@unittest.skipIf(
not is_cuda_8_9,
"CUDA not available",
)
@pytest.mark.parametrize(
"dtype",
[
torch.float32,
torch.bfloat16,
torch.float16,
],
)
def test_dynamic_scale_parity(self, dtype: torch.dtype):
Contributor:
nit: move to test_compile.py since this is testing compile vs eager?

scaling_type_weight = ScalingType.DYNAMIC
torch.manual_seed(0)
hp_tensor = torch.randn(768, 32, device="cuda", dtype=dtype)
float8_config = Float8LinearConfig(
cast_config_weight=CastConfig(scaling_type=scaling_type_weight),
)
float8_eager = hp_tensor_to_float8_dynamic(
hp_tensor,
torch.float8_e4m3fn,
float8_config,
gemm_input_role=GemmInputRole.WEIGHT,
)
float8_compile = torch.compile(hp_tensor_to_float8_dynamic)(
hp_tensor,
torch.float8_e4m3fn,
float8_config,
gemm_input_role=GemmInputRole.WEIGHT,
)
assert torch.equal(float8_eager._scale, float8_compile._scale)
Contributor Author (@weifengpy, Sep 19, 2024):
Without the PR, the numerics look like the following:
eager _scale=106.5000 vs compile _scale=106.1925...

After the PR, eager is also 106.1925...

torch.testing.assert_close(float8_eager._data, float8_compile._data)


class TestFloat8LinearUtils(unittest.TestCase):
5 changes: 1 addition & 4 deletions test/float8/test_fsdp2/fsdp2_common.py
@@ -48,10 +48,7 @@ def check_parity_no_mp(
):
precompute_float8_dynamic_scale_for_fsdp(model)

- if compile_transformer_block:
-     test_cls.assertEqual(losses[0], losses[1], atol=1e-4, rtol=1e-4)
- else:
-     test_cls.assertEqual(losses[0], losses[1])
+ test_cls.assertEqual(losses[0], losses[1])


def check_parity_bf16_mp(
3 changes: 2 additions & 1 deletion torchao/float8/float8_tensor.py
@@ -163,7 +163,8 @@ def forward(

DTensor Invariant: DTensor must always be the outer most tensor subclass
"""
- tensor_scaled = tensor * scale
+ # scale is float32 thus upcasting tensor to match
Contributor:
can we make this comment contain the context? something like

# Note: when the line below is compiled with `torch.compile`, `tensor` is automatically upcasted to `float32` to multiply with the scale
# In order to match numerics between eager and compile, we upcast manually here.

+ tensor_scaled = tensor.to(torch.float32) * scale
Contributor Author:
Without upcasting, the eager numeric is -157.00000000000000000000 while compile gives -157.06507873535156250000.

Contributor Author:
torch.compile upcasts the tensor ahead of the multiply; see tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32) in the following output code:

@triton.jit
def triton_(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 24576
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = tl.full([XBLOCK], True, tl.int1)
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (x0), None).to(tl.float32)

bits_fp8 = to_fp8_saturated(tensor_scaled, float8_dtype)

if isinstance(bits_fp8, DTensor):
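
To make the upcasting discussion above concrete, here is a minimal, self-contained repro sketch; it is not code from this PR, it assumes a CUDA device and a recent PyTorch with torch.compile, and the shapes, scale value, and function name are illustrative:

import torch

def scale_hp_tensor(tensor: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # In eager, the product of a bfloat16 tensor and a 0-dim float32 scale stays
    # in bfloat16; under torch.compile, inductor loads the tensor as float32
    # before the multiply (see the triton output quoted above).
    return tensor * scale

hp_tensor = torch.randn(768, 32, device="cuda", dtype=torch.bfloat16)
scale = torch.tensor(106.1925, device="cuda", dtype=torch.float32)

eager_out = scale_hp_tensor(hp_tensor, scale)
compiled_out = torch.compile(scale_hp_tensor)(hp_tensor, scale)
# The two can disagree in the low-order bits; upcasting the tensor manually,
# as the diff above does, closes the gap.
matched_out = scale_hp_tensor(hp_tensor.to(torch.float32), scale).to(eager_out.dtype)
print(torch.equal(eager_out, compiled_out), torch.equal(matched_out, compiled_out))
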
2 changes: 2 additions & 0 deletions torchao/float8/float8_utils.py
@@ -42,6 +42,8 @@ def amax_to_scale(
float8_dtype: The float8 dtype.
orig_dtype: The original dtype of the tensor.
"""
# _scaled_mm requires float32 scale
Contributor:
nit: can we describe in more detail why we are upcasting here

amax = amax.to(torch.float64)
Contributor Author (@weifengpy, Sep 19, 2024):
Upcast amax in amax_to_scale instead of tensor_to_amax for two reasons:

  • we can still do the bfloat16 all-reduce for amax
  • it is safer for delayed scaling, since it won't change the dtype of amax_buffer

Contributor:
could you share why the upcasting happens?

Contributor Author:
I can look into inductor more to see how it achieves fp64.

Contributor Author:
torch.compile actually upcasts to float32 with tl.load(in_ptr0 + (x0), None).to(tl.float32). Upcasting to float64 helps further because torch.compile and eager show different numerics for 1.0 / float32 (but the same numerics for float64).

The float32 numeric difference can be verified with:

import torch

def upcast_reciprocal(inp: torch.Tensor):
    return inp.reciprocal()

inp = torch.full([], 0.00817871093750000000, device="cuda", dtype=torch.float32)
eager_scale = upcast_reciprocal(inp)
compile_scale = torch.compile(upcast_reciprocal)(inp)
fp64_ground_truth = inp.to(torch.float64).reciprocal()
# For this input the eager and compiled float32 reciprocals differ slightly, so the
# assert below fires; computing in float64 gives matching results.
assert torch.equal(eager_scale, compile_scale), f"{eager_scale=} vs {compile_scale=}, {fp64_ground_truth=}"

if float8_dtype in FP8_TYPES:
res = torch.finfo(float8_dtype).max / torch.clamp(amax, min=EPS)
else:
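
For context, a standalone sketch of the scale computation after this change, assuming a recent PyTorch with float8 dtypes; EPS, the signature, and the float16 handling are simplified relative to torchao.float8.float8_utils, so treat this as an illustration rather than the library API:

import torch

EPS = 1e-12  # assumption: stand-in for the EPS constant in float8_utils

def amax_to_scale_sketch(amax: torch.Tensor, float8_dtype: torch.dtype = torch.float8_e4m3fn) -> torch.Tensor:
    # Upcast to float64 so eager and torch.compile agree on the division
    # (in float32 the two can differ by one ulp, as shown in the thread above).
    amax = amax.to(torch.float64)
    res = torch.finfo(float8_dtype).max / torch.clamp(amax, min=EPS)
    # _scaled_mm requires a float32 scale, so downcast once at the end.
    return res.to(torch.float32)

scale = amax_to_scale_sketch(torch.tensor(0.00817871093750000000))
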
6 changes: 3 additions & 3 deletions torchao/float8/fsdp_utils.py
@@ -59,17 +59,17 @@ def precompute_float8_dynamic_scale_for_fsdp(module: nn.Module) -> None:
return

# inf-norm is equivalent to max(abs(w))
- max_weights = torch._foreach_norm(weights, ord=math.inf) # Partial
+ max_weights = torch._foreach_norm(weights, ord=math.inf, dtype=torch.float64) # Partial
Contributor:
add comment to describe upcasting

Contributor Author:
improved comment

Contributor:
dtype=torch.float64 only changes the accumulation dtype? if there is no noticeable cost to this, I wonder if we should be doing this in more places 🤔

Contributor Author:
Good question! Actually I just updated the code to do _foreach_norm in the original precision and do the float64 upcasting before calculating scales. That ensures a consistent implementation between precompute and float8_utils.amax_to_scale.

Back to your question: I checked ForeachReduceOp.cu and it dispatches to lpnorm_cleanup<scalar_t, NormType::LInf, out_t>. I'm not sure what's inside lpnorm_cleanup, but the inf-norm is just max(abs), so I'm not sure they accumulate numerics at all:
https://github.com/pytorch/pytorch/blob/a28b40fa74470058ca57d77652b9601bece2f4d5/aten/src/ATen/native/cuda/ForeachReduceOp.cu#L534-L535C19

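As a quick sanity check of the point above that the inf-norm is just max(abs) with no accumulation, a small illustration (not from the PR):

import math
import torch

w = torch.randn(16, 8)
# _foreach_norm with ord=inf reduces each tensor to max(abs(w)); there is no
# summation, hence no accumulation dtype to worry about.
inf_norm = torch._foreach_norm([w], ord=math.inf)[0]
assert torch.equal(inf_norm, w.abs().max())
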
amax_tensor = torch.stack(max_weights) # Partial
# clamp is dispatched through DTensor
# it will issue a single all-reduce
amax_tensor = torch.clamp(amax_tensor, EPS) # Replicate
scale_tensor = torch.finfo(torch.float8_e4m3fn).max / amax_tensor # Replicate
if amax_tensor.dtype is torch.float16:
scale_tensor = torch.clamp(scale_tensor, max=torch.finfo(torch.float16).max)
- local_scale_tensor = scale_tensor.to_local()
+ local_scale_tensor = scale_tensor.to_local().to(torch.float32)
for i, float8_linear in enumerate(float8_linears):
- float8_linear.weight._local_tensor._precomputed_scale = local_scale_tensor[i].to(torch.float32)
+ float8_linear.weight._local_tensor._precomputed_scale = local_scale_tensor[i]


# FSDP pads its local tensor on dim-0. The subclass should be preserved such
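
Putting the fsdp_utils.py changes together, here is a simplified sketch of the precompute flow the author describes settling on (norm in the original precision, float64 upcast before computing scales); the DTensor/FSDP mechanics, the all-reduce, and the float16 clamp are omitted, and the helper name is hypothetical:

import math
import torch

def precompute_fp8_scales_sketch(weights: list[torch.Tensor]) -> torch.Tensor:
    # Per-weight amax via the inf-norm (== max(abs)), kept in the original dtype.
    max_weights = torch._foreach_norm(weights, ord=math.inf)
    # Upcast to float64 before computing scales so eager matches torch.compile.
    amax_tensor = torch.stack(max_weights).to(torch.float64)
    amax_tensor = torch.clamp(amax_tensor, 1e-12)  # EPS stand-in
    scale_tensor = torch.finfo(torch.float8_e4m3fn).max / amax_tensor
    # torch._scaled_mm consumes float32 scales, so downcast once at the end.
    return scale_tensor.to(torch.float32)

weights = [torch.randn(64, 32), torch.randn(32, 16)]
scales = precompute_fp8_scales_sketch(weights)
print(scales.dtype, scales.shape)  # torch.float32, torch.Size([2])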