NVIDIA · pggPL · Nov 5, 2025 · Oct 23, 2025 · Oct 23, 2025 · Oct 31, 2025
diff --git a/tests/jax/test_layer.py b/tests/jax/test_layer.py
@@ -311,6 +311,10 @@ def _loss_fn(self, diff_xs, no_diff_xs, params, others, model):
         variables = {"params": params, **others}
         output = model.apply(variables, *diff_xs, *no_diff_xs, rngs=self.apply_rng)
         return jnp.mean(output, dtype=jnp.float32).astype(output.dtype)
+
+    def _output_fn(self, params, others, model, diff_xs, no_diff_xs):
+        variables = {"params": params, **others}
+        return model.apply(variables, *diff_xs, *no_diff_xs, rngs=self.apply_rng)
 
     def _sync_params(self, ref, target):
         """Copy the reference params to target"""
@@ -334,11 +338,14 @@ def test_forward(
         test_layer, test_params, test_others = self._generate_layer(layer_cls, inputs, test_masks)
         ref_params, test_params = self._sync_params(ref_params, test_params)
 
-        ref_out = self._loss_fn(inputs, ref_masks, ref_params, ref_others, ref_layer)
-        test_out = self._loss_fn(inputs, test_masks, test_params, test_others, test_layer)
+        ref_out = self._output_fn(ref_params, ref_others, ref_layer, inputs, ref_masks)
+        test_out = self._output_fn(test_params, test_others, test_layer, inputs, test_masks)
 
         tols = dtype_tols(dtype, rtol=rtol, atol=atol)
-        assert_allclose(ref_out, test_out, **tols)
+        if not get_quantize_config().is_fp8_enabled():
+            assert_allclose(ref_out, test_out, **tols)
+        else:
+            assert_allclose(ref_out.mean(), test_out.mean(), **tols)
 
     def test_backward(
         self,

diff --git a/transformer_engine/jax/flax/transformer.py b/transformer_engine/jax/flax/transformer.py
@@ -197,6 +197,7 @@ def __call__(
             fused_scale_factor = scale_factor
             if self.attn_bias_type == AttnBiasType.PRE_SCALE_BIAS:
                 attn_weights += bias
+                bias = None
 
         def apply_swa_mask(original_mask: Array) -> Array:
             """Apply the sliding window mask to a given mask"""