103 commits
d64ee35
fix for float8 tensor fsdp2 training
vthumbe1503 Oct 7, 2025
e64f5bb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 7, 2025
980330c
zeros_like should return fp32 for fsdp2 to work
vthumbe1503 Oct 7, 2025
737852c
Merge branch 'fsdp2_issue_fix' of github.com:vthumbe1503/TransformerE…
vthumbe1503 Oct 7, 2025
2a3ca77
minor cleanup
vthumbe1503 Oct 7, 2025
65f50af
fix unsharded weights not releasing memory
vthumbe1503 Oct 8, 2025
1360381
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 8, 2025
2ad5624
Merge branch 'main' into fsdp2_issue_fix
vthumbe1503 Oct 8, 2025
09c6848
implement using fsdp preallgather and postallgather functions
vthumbe1503 Oct 13, 2025
322853c
Merge branch 'fsdp2_issue_fix' of github.com:vthumbe1503/TransformerE…
vthumbe1503 Oct 13, 2025
485cab3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 13, 2025
f5bedbb
FSDP2 works on Hopper/L40
vthumbe1503 Oct 13, 2025
4a72968
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 13, 2025
fddfd21
minor comment
vthumbe1503 Oct 13, 2025
099a92b
fix merge conflict
vthumbe1503 Oct 13, 2025
0b81d32
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 13, 2025
ae2bde5
some fixes for fp8 + handwavy changes for mxfp8
vthumbe1503 Oct 17, 2025
c4c4347
only transpose saved for backward pass allgather in case of L40/Hoppe…
vthumbe1503 Oct 17, 2025
eec56da
missed minor change to hopper use-case
vthumbe1503 Oct 17, 2025
79475fe
communicate only required data in mxfp8, fix for updating weight usag…
vthumbe1503 Oct 20, 2025
f881535
changes for meta Dtensors for weights and better all gather data hand…
vthumbe1503 Oct 22, 2025
9d87fb7
better solution to figure out forward pass in FSDP2
vthumbe1503 Oct 22, 2025
40d4dfd
address review comments
vthumbe1503 Oct 22, 2025
71d9f5d
Update transformer_engine/pytorch/tensor/mxfp8_tensor.py
vthumbe1503 Oct 23, 2025
20228c4
resolve merge conflict
vthumbe1503 Oct 24, 2025
67dc86e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 24, 2025
0a6e23c
Merge branch 'main' into fsdp2_issue_fix
vthumbe1503 Oct 25, 2025
631f01f
everything functioning except hack for transformerlayer
vthumbe1503 Oct 28, 2025
fb49cfe
fix merge conflict
vthumbe1503 Oct 28, 2025
9516213
fix merge conflict
vthumbe1503 Oct 28, 2025
c9dc10c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 28, 2025
8c8ef93
revert change of commit id for cudnn-frontend
vthumbe1503 Oct 28, 2025
04d7416
Merge branch 'fsdp2_issue_fix' of github.com:vthumbe1503/TransformerE…
vthumbe1503 Oct 28, 2025
c97e204
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 28, 2025
dbbbaab
Merge branch 'main' into fsdp2_issue_fix
vthumbe1503 Oct 28, 2025
0ac3bfa
unnecessary change
vthumbe1503 Oct 28, 2025
2c3b958
Merge branch 'fsdp2_issue_fix' of github.com:vthumbe1503/TransformerE…
vthumbe1503 Oct 28, 2025
36a6d01
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 28, 2025
44d8c72
minor issues with linting, add some comments
vthumbe1503 Oct 28, 2025
bca48a8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 28, 2025
a50928d
minor stuff
vthumbe1503 Oct 29, 2025
b5bd42e
revert space removal
vthumbe1503 Oct 29, 2025
2d75310
fix the fsdp state collection issue, and minor review comments addres…
vthumbe1503 Oct 31, 2025
eb6964d
Merge branch 'fsdp2_issue_fix' of github.com:vthumbe1503/TransformerE…
vthumbe1503 Oct 31, 2025
c16a844
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 31, 2025
5db9a33
revert change for dgrad redundant computation
vthumbe1503 Oct 31, 2025
e207845
Merge branch 'fsdp2_issue_fix' of github.com:vthumbe1503/TransformerE…
vthumbe1503 Oct 31, 2025
32365ce
bug: get fsdp param group's training state instead of root training s…
vthumbe1503 Nov 1, 2025
8c7f375
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 1, 2025
63b517f
address review comments
vthumbe1503 Nov 1, 2025
26d617c
Merge branch 'fsdp2_issue_fix' of github.com:vthumbe1503/TransformerE…
vthumbe1503 Nov 1, 2025
6f7f037
Merge branch 'main' into fsdp2_issue_fix
vthumbe1503 Nov 1, 2025
6034f5d
address review comments
vthumbe1503 Nov 1, 2025
cd454e0
Merge branch 'fsdp2_issue_fix' of github.com:vthumbe1503/TransformerE…
vthumbe1503 Nov 1, 2025
7269ab3
address coderabbit review comments
vthumbe1503 Nov 1, 2025
044d098
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 1, 2025
ee3c33e
address review comments
vthumbe1503 Nov 1, 2025
7d1726e
Merge branch 'fsdp2_issue_fix' of github.com:vthumbe1503/Transformer…
vthumbe1503 Nov 1, 2025
bf3546a
address review comments
vthumbe1503 Nov 1, 2025
fb58afe
address review comments; fix fp8 allgather test to do after fsdp lazy …
vthumbe1503 Nov 2, 2025
7ca1961
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 2, 2025
9894d95
address review comments
vthumbe1503 Nov 3, 2025
c6b5b42
Merge branch 'fsdp2_issue_fix' of github.com:vthumbe1503/TransformerE…
vthumbe1503 Nov 3, 2025
b661461
remove detach
vthumbe1503 Nov 3, 2025
c99a92b
do what makes sense
vthumbe1503 Nov 3, 2025
0d291d9
Update transformer_engine/pytorch/tensor/float8_tensor.py
vthumbe1503 Nov 3, 2025
5588e96
Update transformer_engine/pytorch/tensor/mxfp8_tensor.py
vthumbe1503 Nov 3, 2025
3f4957d
Update transformer_engine/pytorch/tensor/mxfp8_tensor.py
vthumbe1503 Nov 3, 2025
3a2b2c1
Update transformer_engine/pytorch/tensor/mxfp8_tensor.py
vthumbe1503 Nov 3, 2025
61f4fc8
Update transformer_engine/pytorch/tensor/mxfp8_tensor.py
vthumbe1503 Nov 3, 2025
94a8126
Update transformer_engine/pytorch/tensor/mxfp8_tensor.py
vthumbe1503 Nov 3, 2025
62ad0aa
address review comments
vthumbe1503 Nov 3, 2025
29db9af
address review comments
vthumbe1503 Nov 3, 2025
16c77c0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 3, 2025
023a4f4
address review comments
vthumbe1503 Nov 3, 2025
651ebd8
address review comments
vthumbe1503 Nov 3, 2025
15793fa
fix merge conflicts
vthumbe1503 Nov 3, 2025
c6e0a08
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 3, 2025
4aeb6ca
Merge branch 'main' into fsdp2_issue_fix
vthumbe1503 Nov 3, 2025
8e09624
address review comments
vthumbe1503 Nov 3, 2025
207c2eb
have better dtype for fsdp_post_all_gather arguments
vthumbe1503 Nov 3, 2025
948ab91
minor comment
vthumbe1503 Nov 3, 2025
1f1de13
improve comment
vthumbe1503 Nov 3, 2025
acec1b5
fix the error in CI
vthumbe1503 Nov 4, 2025
ff20145
minor comment add
vthumbe1503 Nov 4, 2025
2593a8a
accidentally removed view function
vthumbe1503 Nov 4, 2025
4cde1c3
Merge branch 'main' into fsdp2_issue_fix
vthumbe1503 Nov 4, 2025
892024e
fix minor bug for h100
vthumbe1503 Nov 4, 2025
205be41
minor addition
vthumbe1503 Nov 5, 2025
0300a5f
Merge branch 'fsdp2_issue_fix' of github.com:vthumbe1503/TransformerE…
vthumbe1503 Nov 5, 2025
adbb31e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 5, 2025
fa71d84
implement padding removal/addition for allgather
vthumbe1503 Nov 5, 2025
cc41893
fix merge conflict
vthumbe1503 Nov 5, 2025
5a40f2b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 5, 2025
2d0b2d5
Merge branch 'main' into fsdp2_issue_fix
vthumbe1503 Nov 6, 2025
0e849ce
address review comments
vthumbe1503 Nov 6, 2025
019ae7f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 6, 2025
11f72dd
address review comments
vthumbe1503 Nov 6, 2025
d6a8dbb
Merge branch 'fsdp2_issue_fix' of github.com:vthumbe1503/TransformerE…
vthumbe1503 Nov 6, 2025
92f932a
Merge branch 'main' into fsdp2_issue_fix
vthumbe1503 Nov 6, 2025
a72eedc
Merge branch 'main' into fsdp2_issue_fix
vthumbe1503 Nov 6, 2025
c69bd00
Merge branch 'main' into fsdp2_issue_fix
vthumbe1503 Nov 7, 2025
8abffa3
Merge branch 'main' into fsdp2_issue_fix
vthumbe1503 Nov 8, 2025
38 changes: 21 additions & 17 deletions tests/pytorch/distributed/run_fsdp2_model.py
@@ -9,7 +9,7 @@
import argparse

import transformer_engine.pytorch as te
from transformer_engine.common.recipe import Format, DelayedScaling
from transformer_engine.common.recipe import Format, DelayedScaling, Float8CurrentScaling, MXFP8BlockScaling

import torch
import torch.distributed as dist
@@ -18,6 +18,7 @@
from torch.distributed import DeviceMesh
from torch.distributed._composable.fsdp import fully_shard
from torch.distributed.device_mesh import init_device_mesh
from transformer_engine.pytorch import QuantizedTensor
from contextlib import nullcontext


@@ -36,8 +37,14 @@ def forward(self, x):
def save_custom_attrs(module):
custom_attrs = {}
for name, param in module.named_parameters():
if isinstance(param, QuantizedTensor):
# Ignore FP8 metadata attributes. Otherwise we will save duplicate copies
# for data/transpose FP8 tensors on top of FP8 tensors that FSDP2 will save.
ignore_keys = [key for key in param.__dict__.keys() if key.startswith("_")]
else:
ignore_keys = []
attrs = vars(param)
custom_attrs[name] = {k: v for k, v in attrs.items()}
custom_attrs[name] = {k: v for k, v in attrs.items() if k not in ignore_keys}
return custom_attrs


@@ -103,25 +110,23 @@ def _train(args):

# FP8 Configuration
fp8_format = Format.HYBRID
fp8_recipe = DelayedScaling(fp8_format=fp8_format, amax_history_len=16, amax_compute_algo="max")
# fp8_recipe = DelayedScaling(fp8_format=fp8_format, amax_history_len=16, amax_compute_algo="max")
# fp8_recipe = Float8CurrentScaling(fp8_format=fp8_format)
fp8_recipe = MXFP8BlockScaling(fp8_format=fp8_format)

build_model_context_args = {}
if not args.fp8_init:
# Build model context (FP8 init)
build_model_context = nullcontext
build_model_context_args = {}

else:
from transformer_engine.pytorch import fp8_model_init

build_model_context = fp8_model_init
build_model_context_args["enabled"] = True

# Build the model with the specified context
with build_model_context(**build_model_context_args):
model = SimpleNet(args.input_size, args.hidden_size, args.output_size)
else:
model = SimpleNet(args.input_size, args.hidden_size, args.output_size)
build_model_context_args["recipe"] = fp8_recipe
# Move the model to the correct device

# Build the model with the specified context
with build_model_context(**build_model_context_args):
model = SimpleNet(args.input_size, args.hidden_size, args.output_size)
model.to(device)

if LOCAL_RANK == 0:
@@ -146,7 +151,6 @@ def _train(args):
)
else:
assert False

# Apply FSDP/HSDP
custom_attrs = save_custom_attrs(model)
for sub_module in model.modules():
@@ -156,20 +160,20 @@ def _train(args):
fully_shard(sub_module, mesh=mesh)
fully_shard(model, mesh=mesh)
restore_custom_attrs(model, custom_attrs)

optimizer = optim.Adam(model.parameters(), lr=1e-3)

for iteration in range(args.iter):
# Zero the parameter gradients
optimizer.zero_grad()
input_data = torch.randn(args.batch_size, args.input_size).to(device)
output = model(input_data)
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
output = model(input_data)
target = torch.randn(args.batch_size, args.output_size).to(device)
loss = F.mse_loss(output, target)
loss.backward()
optimizer.step()
if LOCAL_RANK == 0:
print(f"Rank {LOCAL_RANK}: Iteration {iteration} completed.")
print(f"Rank {LOCAL_RANK}: Iteration {iteration} completed with loss {loss.item()}")

dist.destroy_process_group()
if LOCAL_RANK == 0:
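The test-script changes above choose the recipe once (MXFP8BlockScaling in this revision, with DelayedScaling and Float8CurrentScaling left as commented-out alternatives), thread it into fp8_model_init so parameters are created directly as quantized tensors, and run the forward pass under te.fp8_autocast with the same recipe. A minimal single-GPU sketch of that flow; the layer type and sizes are illustrative, not from the PR:

```python
import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import Format, MXFP8BlockScaling

# MXFP8 needs hardware support; the script's commented-out DelayedScaling /
# Float8CurrentScaling recipes are drop-in alternatives on older GPUs.
fp8_recipe = MXFP8BlockScaling(fp8_format=Format.HYBRID)

# Passing the recipe at init time creates the weights directly as quantized
# tensors, so FSDP2 can later all-gather low-precision data instead of FP32.
with te.fp8_model_init(enabled=True, recipe=fp8_recipe):
    model = te.Linear(1024, 1024)
model.cuda()

x = torch.randn(32, 1024, device="cuda")
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    out = model(x)
out.sum().backward()
```

In the actual script, fully_shard(sub_module, mesh=mesh) is additionally applied per submodule and fully_shard(model, mesh=mesh) at the root before the training loop.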
1 change: 1 addition & 0 deletions transformer_engine/pytorch/module/base.py
@@ -1371,6 +1371,7 @@ def get_weight_workspace(
# Note: Make sure weights have required usages, but do not
# destroy unnecessary usages since they may be used later.
if isinstance(tensor, QuantizedTensor):
quantizer = tensor._quantizer
update_rowwise_usage = True if quantizer.rowwise_usage else None
update_columnwise_usage = True if quantizer.columnwise_usage else None
tensor.update_usage(
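The added line derives the usage flags from the quantizer attached to the tensor itself, rather than a quantizer passed in from an outer scope, which can disagree with an FSDP2 all-gathered weight. The idiom only ever adds usages: passing None to update_usage leaves that usage untouched, matching the comment about not destroying usages that may be needed later. A sketch of the pattern, with a hypothetical helper name:

```python
def ensure_required_usages(tensor):
    # Ask the tensor's own quantizer which layouts are required, then add
    # (never remove) them: None means "leave this usage flag as it is".
    quantizer = tensor._quantizer
    tensor.update_usage(
        rowwise_usage=True if quantizer.rowwise_usage else None,
        columnwise_usage=True if quantizer.columnwise_usage else None,
    )
```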
21 changes: 8 additions & 13 deletions transformer_engine/pytorch/module/grouped_linear.py
@@ -44,6 +44,7 @@

from ..tensor.float8_tensor import Float8CurrentScalingQuantizer, Float8Quantizer
from ..tensor.quantized_tensor import (
QuantizedTensor,
QuantizedTensorStorage,
Quantizer,
prepare_for_saving,
@@ -107,7 +108,8 @@ def forward(
is_fp8_activation_recompute_enabled()
and not in_fp8_activation_recompute_phase()
)
if weight_quantizers[0] is not None:
# No need to set the quantizer states if weight is already quantized
if weight_quantizers[0] is not None and not isinstance(weights[0], QuantizedTensor):
for weight_quantizer in weight_quantizers:
weight_quantizer.set_usage(rowwise=True, columnwise=columnwise_usage)
if output_quantizers[0] is not None:
@@ -204,11 +206,6 @@ def forward(
inputmat.update_usage(rowwise_usage=False, columnwise_usage=True)
else:
inputmats = [None] * num_gemms
if inp.requires_grad:
for weight in weights_fp8:
if isinstance(weight, QuantizedTensorStorage):
weight.update_usage(columnwise_usage=True)

tensors_to_save, tensor_objects = prepare_for_saving(
*inputmats,
*weights_fp8,
@@ -336,13 +333,11 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None],
dtype=ctx.activation_dtype,
device=ctx.device,
)

for weight, quantizer in zip(weights, ctx.weight_quantizers):
if quantizer is not None and isinstance(weight, QuantizedTensorStorage):
weight.update_usage(
rowwise_usage=quantizer.rowwise_usage,
columnwise_usage=quantizer.columnwise_usage,
)
# Make sure weights are available in column-wise format
# for dgrad computation.
for weight in weights:
if isinstance(weight, QuantizedTensorStorage):
weight.update_usage(columnwise_usage=True)
general_grouped_gemm(
weights,
grad_output,
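For context on the backward change: dgrad consumes the weight in its transposed (column-wise) layout, so every quantized weight saved for backward must carry that usage regardless of what the forward quantizer requested; the old code mirrored the quantizer's flags and could miss it. A plain-PyTorch sketch of the three GEMMs and the copy each one needs (shapes illustrative; exact kernel layouts are architecture-dependent):

```python
import torch

x = torch.randn(32, 64)    # input           [batch, in]
w = torch.randn(128, 64)   # weight          [out, in]
g = torch.randn(32, 128)   # output gradient [batch, out]

out = x @ w.T   # forward GEMM -> served by the weight's row-wise FP8 copy
dx = g @ w      # dgrad GEMM   -> weight used transposed relative to forward,
                #                 served by the column-wise FP8 copy
dw = g.T @ x    # wgrad GEMM   -> input used transposed, which is why the
                #                 forward saves inputmat with column-wise usage
```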
5 changes: 3 additions & 2 deletions transformer_engine/pytorch/module/layernorm_linear.py
@@ -280,8 +280,9 @@ def forward(
if fp8 or debug:
quantized_weight = not isinstance(weight, QuantizedTensorStorage)

# Configure quantizer
if weight_quantizer is not None:
# Configure quantizer.
# No need to set the quantizer states if weight is already quantized
if weight_quantizer is not None and not isinstance(weight, QuantizedTensor):
weight_quantizer.set_usage(rowwise=True, columnwise=is_grad_enabled)

# Get quantized weight
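The same guard recurs in grouped_linear.py above and in layernorm_mlp.py and linear.py below: once FSDP2's all-gather has already reassembled the weight as a QuantizedTensor, the module must leave the quantizer's usage flags alone, since the gathered tensor carries exactly the usages that were communicated. A minimal sketch of the shared pattern (helper name hypothetical):

```python
from transformer_engine.pytorch.tensor.quantized_tensor import QuantizedTensor

def configure_weight_quantizer_if_needed(weight, quantizer, is_grad_enabled):
    # Skip already-quantized weights (e.g. produced by fsdp_post_all_gather):
    # their row-/column-wise copies are fixed by what was all-gathered.
    if quantizer is not None and not isinstance(weight, QuantizedTensor):
        quantizer.set_usage(rowwise=True, columnwise=is_grad_enabled)
```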
8 changes: 6 additions & 2 deletions transformer_engine/pytorch/module/layernorm_mlp.py
@@ -71,6 +71,7 @@
from ._common import apply_normalization, WeightGradStore
from ..cpu_offload import is_cpu_offload_enabled, mark_activation_offload
from ..tensor.quantized_tensor import (
QuantizedTensor,
QuantizedTensorStorage,
Quantizer,
prepare_for_saving,
@@ -347,8 +348,11 @@ def forward(
# which handles weight caching etc.
# FP8 cast to workspace buffer
update_workspace = is_first_microbatch is None or is_first_microbatch
fc1_weight_quantizer.set_usage(rowwise=True, columnwise=is_grad_enabled)
fc2_weight_quantizer.set_usage(rowwise=True, columnwise=is_grad_enabled)
# No need to set the quantizer states if weights are already quantized
if not isinstance(fc1_weight, QuantizedTensor):
fc1_weight_quantizer.set_usage(rowwise=True, columnwise=is_grad_enabled)
if not isinstance(fc2_weight, QuantizedTensor):
fc2_weight_quantizer.set_usage(rowwise=True, columnwise=is_grad_enabled)
fc1_weight_final = module.get_weight_workspace(
tensor=fc1_weight,
quantizer=fc1_weight_quantizer,
8 changes: 2 additions & 6 deletions transformer_engine/pytorch/module/linear.py
@@ -240,7 +240,8 @@ def forward(
weightmat = weight
if fp8 or debug:
# Configure quantizer
if weight_quantizer is not None:
# No need to set the quantizer states if weight is already quantized
if weight_quantizer is not None and not isinstance(weight, QuantizedTensor):
columnwise_usage = is_grad_enabled and inp.requires_grad
if not columnwise_usage:
columnwise_usage = (
@@ -389,11 +390,6 @@ def forward(
if backward_needs_input:
saved_inputmat = inputmat

# Weight with column-wise usage is needed for dgrad GEMM.
if inp.requires_grad:
if isinstance(weightmat, QuantizedTensorStorage):
weightmat.update_usage(columnwise_usage=True)

if cpu_offloading and saved_inputmat is not None:
mark_activation_offload(saved_inputmat)
