pytorch · msaroufim · Aug 22, 2024 · Aug 20, 2024 · Aug 20, 2024 · Aug 22, 2024
diff --git a/test/float8/test_fsdp2/test_fsdp2.py b/test/float8/test_fsdp2/test_fsdp2.py
@@ -59,7 +59,7 @@ def init_multi_module(self) -> nn.Module:
         self.broadcast_module(module)
         return module
 
-    def init_transformer(self, weight_tying: bool) -> nn.Module:
+    def init_transformer(self, weight_tying: bool, dtype: torch.dtype | None = None) -> nn.Module:
         torch.manual_seed(42)
         args = ModelArgs(
             n_layers=3,
@@ -70,6 +70,8 @@ def init_transformer(self, weight_tying: bool) -> nn.Module:
             vocab_size=32,
         )
         module = Transformer(args).cuda()
+        if dtype is not None:
+            module = module.to(dtype=dtype)
         self.broadcast_module(module)
         return module
 
@@ -96,6 +98,7 @@ def test_transformer_parity(self):
                     ScalingType.DELAYED,
                 ],
                 "compile_transformer_block": [False, True],
+                "dtype": [torch.float32, torch.bfloat16],
             },
             self._test_transformer_parity,
         )
@@ -106,6 +109,7 @@ def _test_transformer_parity(
         precompute: bool,
         scaling_type_weight: ScalingType,
         compile_transformer_block: bool,
+        dtype: torch.dtype | None = None,
     ):
         if not enable_fsdp_float8_all_gather and precompute:
             return
@@ -117,7 +121,7 @@ def _test_transformer_parity(
         # latter uses fp8 compute. With fp8 all-gather, FSDP would pre-cast to
         # fp8 for that tied weight, incorrectly using fp8 for the embedding.
         weight_tying = not enable_fsdp_float8_all_gather
-        module = self.init_transformer(weight_tying=weight_tying).cuda()
+        module = self.init_transformer(weight_tying=weight_tying, dtype=dtype)
         ref_module = copy.deepcopy(module)
         float8_linear_config1 = Float8LinearConfig(
             cast_config_weight=CastConfig(scaling_type=scaling_type_weight),

diff --git a/torchao/float8/fsdp_utils.py b/torchao/float8/fsdp_utils.py
@@ -67,7 +67,7 @@ def precompute_float8_dynamic_scale_for_fsdp(module: nn.Module) -> None:
     scale_tensor = torch.finfo(torch.float8_e4m3fn).max / amax_tensor  # Replicate
     if amax_tensor.dtype is torch.float16:
         scale_tensor = torch.clamp(scale_tensor, max=torch.finfo(torch.float16).max)
-    local_scale_tensor = scale_tensor.to_local()
+    local_scale_tensor = scale_tensor.to_local().to(dtype=torch.float32)
     for i, float8_linear in enumerate(float8_linears):
         float8_linear.weight._local_tensor._precomputed_scale = local_scale_tensor[i]