
Commit b89d568

Merge remote-tracking branch 'upstream/main' into jax_attn_use_nonpacked

2 parents 78a340d + d0d4063 · commit b89d568


55 files changed (+3499, -3435 lines)

3rdparty/cudnn-frontend

Submodule cudnn-frontend updated 87 files

examples/jax/encoder/test_multiprocessing_encoder.py

Lines changed: 2 additions & 2 deletions
@@ -672,7 +672,7 @@ def test_te_mxfp8(self):
     def test_te_nvfp4(self):
         """Test Transformer Engine with NVFP4"""
         result = self.exec(True, "NVFP4BlockScaling")
-        assert result[0] < 0.451 and result[1] > 0.788
+        assert result[0] < 0.451 and result[1] > 0.787

     @unittest.skipIf(not is_bf16_supported(), "Device compute capability 8.0+ is required for BF16")
     def test_te_bf16_shardy(self):
@@ -710,7 +710,7 @@ def test_te_mxfp8_shardy(self):
     def test_te_nvfp4_shardy(self):
         """Test Transformer Engine with NVFP4"""
         result = self.exec(True, "NVFP4BlockScaling", enable_shardy=True)
-        assert result[0] < 0.451 and result[1] > 0.788
+        assert result[0] < 0.451 and result[1] > 0.787


 if __name__ == "__main__":

tests/cpp/operator/test_normalization.h

Lines changed: 11 additions & 1 deletion
@@ -114,8 +114,18 @@ void compute_ref_output(NormType norm_type,
         tmp = current * rsigma[i] * g;
       }

+      // Write output (scaled only for fp8 paths)
       output[i * H + j] = static_cast<OutputType>(tmp * scale);
-      current_max = fmaxf(current_max, fabsf(tmp));
+
+      // amax semantics:
+      //  - fp8_out (scale != 1): amax on pre-scale compute value 'tmp'
+      //  - non-fp8_out (scale == 1): amax on value converted to OutputType (e.g., bf16)
+      if (scale != 1.f) {
+        current_max = fmaxf(current_max, fabsf(tmp));
+      } else {
+        OutputType out_t_val = static_cast<OutputType>(tmp);
+        current_max = fmaxf(current_max, fabsf(static_cast<compute_t>(out_t_val)));
+      }
     }
   }
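
The change above makes the reference distinguish how amax is accumulated for FP8 versus non-FP8 outputs. A minimal jax.numpy sketch of that convention follows; the function name, arguments, and the bf16 default are illustrative assumptions, not part of the Transformer Engine API.

```python
import jax.numpy as jnp

def reference_row_amax(tmp, scale, out_dtype=jnp.bfloat16):
    """Sketch of the reference amax convention described in the diff above.

    - FP8 output path (scale != 1): amax is taken over the pre-scale compute
      values, i.e. before multiplying by `scale` and casting to FP8.
    - Non-FP8 output path (scale == 1): amax is taken over the values after
      casting to the output dtype (e.g. bf16), matching what is actually stored.
    """
    if scale != 1.0:
        return jnp.max(jnp.abs(tmp))
    return jnp.max(jnp.abs(tmp.astype(out_dtype)).astype(jnp.float32))

# Example: with scale == 1.0 the amax reflects bf16 rounding of the values.
print(reference_row_amax(jnp.array([0.3, -1.7, 2.05]), scale=1.0))
```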

tests/jax/test_custom_call_compute.py

Lines changed: 0 additions & 1 deletion
@@ -45,7 +45,6 @@
 from transformer_engine.jax.activation import activation
 from transformer_engine.jax.dense import dense, grouped_dense
 from transformer_engine.jax.layernorm_dense import layernorm_dense
-from transformer_engine.common import recipe

 GEMM_CASES = [
     (256, 256, 512),

tests/jax/test_distributed_layernorm_mlp.py

Lines changed: 2 additions & 0 deletions
@@ -389,6 +389,7 @@ def _test_layernorm_mlp(
         intermediate_dim=INTERMEDIATE,
         activations=activation_type,
         use_bias=use_bias,
+        return_layernorm_output=True,
     )
     params_single = ln_mlp_single.init(init_rngs, x, deterministic=True)
     mlp_out_single, ln_out_single = ln_mlp_single.apply(
@@ -417,6 +418,7 @@ def _test_layernorm_mlp(
         dot_1_input_axes=DOT_1_INPUT_AXES,
         dot_2_input_axes=DOT_2_INPUT_AXES,
         name="mlp",
+        return_layernorm_output=True,
     )
     params_sharded = ln_mlp_sharded.init(init_rngs, x, deterministic=True)
     mlp_out_sharded, ln_out_sharded = ln_mlp_sharded.apply(

tests/jax/test_helper.py renamed to tests/jax/test_recipe_characteristics.py

Lines changed: 66 additions & 1 deletion
@@ -11,7 +11,7 @@
 import numpy as np
 from flax import linen as nn

-from utils import assert_allclose
+from utils import assert_allclose, pytest_parametrize_wrapper
 from transformer_engine.common.recipe import (
     DelayedScaling,
     MXFP8BlockScaling,
@@ -22,6 +22,7 @@
 from transformer_engine.jax import autocast
 from transformer_engine.jax.quantize import (
     get_quantize_config,
+    get_supported_quantization_recipes,
     is_scaling_mode_supported,
     ScalingMode,
     update_collections,
@@ -32,11 +33,15 @@
 from transformer_engine.jax.quantize.helper import _format2dtypes
 from transformer_engine.jax.sharding import MeshResource, global_mesh_resource
 from transformer_engine.jax.flax.module import TransformerEngineBase
+from transformer_engine.jax import flax as te_flax
+import transformer_engine.jax as te

 is_fp8_supported, reason = is_scaling_mode_supported(ScalingMode.DELAYED_TENSOR_SCALING)
 is_mxfp8_supported, mxfp8_reason = is_scaling_mode_supported(ScalingMode.MXFP8_1D_SCALING)
 is_nvfp4_supported, nvfp4_reason = is_scaling_mode_supported(ScalingMode.NVFP4_1D_SCALING)

+SUPPORTED_RECIPES = get_supported_quantization_recipes()
+

 def quantizer_check_vjp(outer_quantizer_set, assertion_func, x):
     """Check that the quantizers in the quantizer set are as expected and reconstructed correctly from flattened pytree representations across VJP boundaries."""
@@ -253,3 +258,63 @@ def test_autocast_nvfp4_block_scaling(self):
         self._compare_nvfp4_scaling_quantizers(bs)

         self._check_default_state()
+
+
+class TestJaxprAndHlo:
+    """Tests to verify Jaxpr and/or HLO of compiled modules apply expected recipe functionality and optimizations."""
+
+    @pytest_parametrize_wrapper(
+        "quantization_recipe",
+        [
+            quantization_recipe
+            for quantization_recipe in SUPPORTED_RECIPES
+            if isinstance(quantization_recipe, NVFP4BlockScaling)
+        ],
+    )
+    def test_layernorm_mlp_reuses_amax_nvfp4(self, quantization_recipe):
+        """Tests that layernorm_mlp reuses the amax computed in layernorm and the activation and does not recompute it during quantization."""
+
+        with te.autocast(enabled=True, recipe=quantization_recipe, mesh_resource=te.MeshResource()):
+            model = te_flax.LayerNormMLP(
+                layernorm_type="rmsnorm",
+                return_layernorm_output=False,
+                intermediate_dropout_rate=0.0,
+                dtype=jnp.bfloat16,
+            )
+
+            var_collect = model.init(
+                jax.random.PRNGKey(0),
+                jnp.ones((128, 128), dtype=jnp.bfloat16),
+            )
+
+            def loss_fn(x, rngs):
+                return jnp.mean(model.apply(var_collect, x, rngs=rngs)[0])
+
+            x = jax.random.normal(jax.random.PRNGKey(0), (128, 128), dtype=jnp.bfloat16)
+            rngs = {"sr_rng": jax.random.PRNGKey(1), "dropout": jax.random.PRNGKey(2)}
+            jaxpr = jax.make_jaxpr(jax.value_and_grad(loss_fn))(x, rngs=rngs)
+
+            rht_amax_eqns = [
+                eqn for eqn in jaxpr.jaxpr.eqns if eqn.primitive.name == "te_rht_amax_ffi_wrapper"
+            ]
+
+            assert len(rht_amax_eqns) == 4, f"Expected 4 rht_amax_eqns, got {len(rht_amax_eqns)}"
+
+            def assert_param(index, tensor_name, expected_value: bool):
+                if expected_value:
+                    assert rht_amax_eqns[index].params["produce_regular_amax"] == True, (
+                        f"Expected produce_regular_amax for {tensor_name} to be True, indicating no"
+                        " reuse of amax as this tensor does not have a previous operation to fuse"
+                        " with"
+                    )
+                else:
+                    assert rht_amax_eqns[index].params["produce_regular_amax"] == False, (
+                        f"Expected produce_regular_amax for {tensor_name} to be False, indicating"
+                        " reuse of amax"
+                    )
+
+            assert_param(0, "fwd ln+q", False)
+            assert_param(1, "fwd act+q", False)
+            # No previous op before incoming dgrad in the backward so amax is not reused
+            assert_param(2, "bwd dgrad", True)
+            assert_param(3, "bwd dact+q", False)
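
The new test hinges on inspecting the traced jaxpr for Transformer Engine's te_rht_amax_ffi_wrapper primitive and its produce_regular_amax parameter. The sketch below shows the same inspection pattern against a standard JAX primitive (dot_general) so it runs without Transformer Engine; the loss function and tensor shapes are arbitrary placeholders.

```python
import jax
import jax.numpy as jnp

def loss_fn(x, w):
    # Arbitrary scalar loss so value_and_grad traces both forward and backward passes.
    return jnp.mean(jnp.dot(x, w) ** 2)

x = jnp.ones((8, 16), dtype=jnp.float32)
w = jnp.ones((16, 4), dtype=jnp.float32)

# make_jaxpr records every primitive equation the traced function would execute.
jaxpr = jax.make_jaxpr(jax.value_and_grad(loss_fn))(x, w)

# Filter equations by primitive name and inspect their static params,
# the same pattern the test applies to "te_rht_amax_ffi_wrapper".
dot_eqns = [eqn for eqn in jaxpr.jaxpr.eqns if eqn.primitive.name == "dot_general"]
print(f"found {len(dot_eqns)} dot_general equations")
for eqn in dot_eqns:
    print(eqn.primitive.name, eqn.params["dimension_numbers"])
```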

tests/jax/utils.py

Lines changed: 6 additions & 6 deletions
@@ -364,9 +364,9 @@ class MlpBlock(nn.Module):

     transpose_batch_sequence: bool
     intermediate_dim: int = 2048
-    activations: Sequence[Union[str, Callable]] = ("relu",)
+    activations: Sequence[Union[str, Callable]] = ("gelu",)
     kernel_init: Initializer = None
-    intermediate_dropout_rate: float = 0.1
+    intermediate_dropout_rate: float = 0.0
     intermediate_dropout_dims: Sequence[int] = ()
     use_bias: bool = False
     dtype: Any = jnp.float32
@@ -1035,14 +1035,14 @@ class EncoderLayer(nn.Module):
     hidden_dropout: float = 0.1
     hidden_dropout_dims: Sequence[int] = ()
     attention_dropout: float = 0.1
-    intermediate_dropout: float = 0.1
+    intermediate_dropout: float = 0.0
     intermediate_dropout_dims: Sequence[int] = ()
     transpose_batch_sequence: bool = True
     float32_attention_logits: bool = False
     scale_attn_logits: bool = False
     scaled_query_init: bool = True
     mlp_dim: int = 2048
-    mlp_activations: Sequence[str] = ("relu",)
+    mlp_activations: Sequence[str] = ("gelu",)
     use_bias: bool = False
     dtype: Any = jnp.float32
     apply_residual_connection_post_layernorm: bool = False
@@ -1199,14 +1199,14 @@ class DecoderLayer(nn.Module):
     hidden_dropout: float = 0.1
     hidden_dropout_dims: Sequence[int] = ()
     attention_dropout: float = 0.1
-    intermediate_dropout: float = 0.1
+    intermediate_dropout: float = 0.0
     intermediate_dropout_dims: Sequence[int] = ()
     transpose_batch_sequence: bool = True
     float32_attention_logits: bool = False
     scale_attn_logits: bool = False
     scaled_query_init: bool = True
     mlp_dim: int = 2048
-    mlp_activations: Sequence[str] = ("relu",)
+    mlp_activations: Sequence[str] = ("gelu",)
     use_bias: bool = False
     dtype: Any = jnp.float32
     apply_residual_connection_post_layernorm: bool = False
