NVIDIA · jaimec00 · Oct 27, 2025 · Oct 27, 2025 · Oct 27, 2025 · Oct 28, 2025
diff --git a/tests/pytorch/distributed/run_numerics.py b/tests/pytorch/distributed/run_numerics.py
@@ -1030,6 +1030,7 @@ def test_layernorm_mlp():
         {"return_bias": True},
         {"return_layernorm_output": True},
         {"delay_wgrad_compute": True},
+        {"checkpoint": True},
     ]
 
     for kwargs in kwargs_list:

diff --git a/tests/pytorch/distributed/test_numerics.py b/tests/pytorch/distributed/test_numerics.py
@@ -13,7 +13,7 @@
 """
     Distributed numerics tests
 
-    These tests test the numerical corectness of the TransformerEngine layers.
+    These tests test the numerical correctness of the TransformerEngine layers.
     Tests are parametrized by the layer and fp8 precision.
     One test consists of running multiple configurations from file run_numerics.py
     Such design is due to the fact the initialization of one test is long

diff --git a/tests/pytorch/layernorm_mlp/test_selective_activation_checkpoint.py b/tests/pytorch/layernorm_mlp/test_selective_activation_checkpoint.py
@@ -0,0 +1,156 @@
+import torch
+from transformer_engine.pytorch import LayerNormMLP
+import pytest
+
+torch.manual_seed(1234)
+device = torch.device("cuda")
+
+
+class _Sequential(torch.nn.Sequential):
+    """Sequential container whose forward hands the same keyword arguments to every child"""
+
+    def forward(self, input_: torch.Tensor, **kwargs) -> torch.Tensor:
+        """Feed ``input_`` through each child module in order, passing ``kwargs`` to all."""
+        hidden = input_
+        for layer in self:
+            hidden = layer(hidden, **kwargs)
+        return hidden
+
+
+class ModelConfig:
+    """Sizes for a stack of LayerNormMLP layers, plus a builder for paired models."""
+
+    def __init__(
+        self,
+        hidden_size: int = 128,
+        ffn_hidden_size: int = 512,
+        layers: int = 1,
+    ):
+        self._hidden_size = hidden_size
+        self._ffn_hidden_size = ffn_hidden_size
+        self._layers = layers
+
+    def build(self):
+        """Return ``(ln_model, sln_model)``: two stacks with identical parameter values.
+
+        Layers of ``ln_model`` are created with ``checkpoint=False`` and layers of
+        ``sln_model`` with ``checkpoint=True``; every checkpointing layer receives
+        clones of its counterpart's parameters.
+        """
+        copied = (
+            "layer_norm_weight",
+            "layer_norm_bias",
+            "fc1_weight",
+            "fc2_weight",
+            "fc1_bias",
+            "fc2_bias",
+        )
+        plain_layers, ckpt_layers = [], []
+        for _ in range(self._layers):
+            plain = LayerNormMLP(self._hidden_size, self._ffn_hidden_size, checkpoint=False).to(device)
+            ckpt = LayerNormMLP(self._hidden_size, self._ffn_hidden_size, checkpoint=True).to(device)
+            with torch.no_grad():
+                for attr in copied:
+                    setattr(ckpt, attr, torch.nn.Parameter(getattr(plain, attr).clone()))
+            plain_layers.append(plain)
+            ckpt_layers.append(ckpt)
+
+        return _Sequential(*plain_layers), _Sequential(*ckpt_layers)
+
+# Named model shapes; positional arguments are (hidden_size, ffn_hidden_size, layers).
+config = {
+    "small": ModelConfig(128, 512, 12),
+    "medium": ModelConfig(512, 2048, 12),
+    "large": ModelConfig(1024, 4096, 12),
+    "huge": ModelConfig(2048, 8192, 12),
+}
+
+# Sizes used for the first dimension of the random input tensor in the test.
+seq_sizes = [2**7, 2**10, 2**14, 2**16]
+
+def _warmup(model, tensor):
+    """Run ten unmeasured forward/backward passes of ``model`` on ``tensor``."""
+    for _ in range(10):
+        loss = model(tensor).sum()
+        loss.backward()
+
+def _run_fwd(model, tensor):
+    """Run one forward pass of ``model`` on ``tensor`` and measure it.
+
+    Returns ``(out, elapsed, mem)``: the model output, the time between the two
+    CUDA events recorded around the forward call (``Event.elapsed_time`` reports
+    milliseconds), and the peak allocated bytes since the stats reset below minus
+    the bytes allocated just before the forward call.
+    """
+
+    torch.cuda.reset_peak_memory_stats(device)
+    start_time, end_time = torch.cuda.Event(enable_timing=True), torch.cuda.Event(
+        enable_timing=True
+    )
+
+    # Wait for previously queued work before taking the baseline and starting the timer.
+    torch.cuda.synchronize()
+    start_mem = torch.cuda.memory_allocated(device)
+    start_time.record()
+    out = model(tensor)
+    end_time.record()
+    # Block until the end event has completed so elapsed_time can be read.
+    end_time.synchronize()
+    elapsed = start_time.elapsed_time(end_time)
+    peak_mem = torch.cuda.max_memory_allocated(device)
+    mem = float(peak_mem - start_mem)
+
+    return out, elapsed, mem
+
+def _run_bwd(model, out):
+    """Run one backward pass from ``out.sum()`` and measure it.
+
+    Returns ``(param_grads, elapsed, mem)``: the result of
+    ``_collect_param_grads(model)``, the time between the two CUDA events
+    recorded around ``loss.backward()`` (``Event.elapsed_time`` reports
+    milliseconds), and the peak allocated bytes since the stats reset below minus
+    the bytes allocated just before the backward call.
+    """
+
+    # Zero any existing gradients in place rather than setting them to None.
+    model.zero_grad(set_to_none=False)
+    loss = out.sum()
+
+    torch.cuda.reset_peak_memory_stats(device)
+    start_time, end_time = torch.cuda.Event(enable_timing=True), torch.cuda.Event(
+        enable_timing=True
+    )
+
+    # Wait for previously queued work before taking the baseline and starting the timer.
+    torch.cuda.synchronize()
+    start_mem = torch.cuda.memory_allocated(device)
+    start_time.record()
+    loss.backward()
+    end_time.record()
+    # Block until the end event has completed so elapsed_time can be read.
+    end_time.synchronize()
+    elapsed = start_time.elapsed_time(end_time)
+    peak_mem = torch.cuda.max_memory_allocated(device)
+    mem = float(peak_mem - start_mem)
+
+    param_grads = _collect_param_grads(model)
+    return param_grads, elapsed, mem
+
+def _max_diff(ref, other):
+    """Return the max absolute difference between two tensors or collections.
+
+    Two ``None`` values compare as equal (0.0). A value present on only one side,
+    or sequences of different lengths, yield ``inf`` so that an equality assertion
+    on the result fails instead of silently passing. Empty sequences give 0.0.
+    """
+    if ref is None and other is None:
+        return 0.0
+    if ref is None or other is None:
+        # Exactly one side is missing: report a mismatch rather than "no difference".
+        return float("inf")
+    if isinstance(ref, (list, tuple)):
+        if len(ref) != len(other):
+            # zip() would silently drop the unmatched tail.
+            return float("inf")
+        return max((_max_diff(r, o) for r, o in zip(ref, other)), default=0.0)
+    return torch.max(torch.abs(ref.detach() - other.detach())).item()
+
+def _collect_param_grads(model):
+    """Gather cloned parameter gradients of ``model``, grouped by short parameter name.
+
+    ``_param_key`` reduces each name to its last dotted component, so the same
+    parameter in different child modules (e.g. ``0.fc1_weight`` and
+    ``1.fc1_weight``) shares one key. Each key maps to a list with one gradient
+    per occurrence, in ``named_parameters()`` order, so a later layer no longer
+    overwrites an earlier one. Parameters whose ``grad`` is ``None`` are skipped.
+    """
+    grads = {}
+    for name, param in model.named_parameters():
+        if param.grad is None:
+            continue
+        key = _param_key(name)
+        if key is not None:
+            grads.setdefault(key, []).append(param.grad.detach().clone())
+    return grads
+
+def _param_key(name):
+    """Return the part of ``name`` after its last ``.`` (the whole name if there is none)."""
+    _, _, leaf = name.rpartition(".")
+    return leaf
+
+
+@pytest.mark.parametrize("size", config.keys())
+@pytest.mark.parametrize("seq_size", seq_sizes)
+def test_selective_activation_checkpoint(size, seq_size):
+    """Compare LayerNormMLP stacks built with ``checkpoint=False`` and ``checkpoint=True``.
+
+    Checks that the checkpointing stack uses less than one sixth of the forward
+    memory, has a slower backward pass, and produces identical outputs and
+    parameter gradients.
+    """
+
+    ln_model, sln_model = config[size].build()
+    data = torch.randn((seq_size, config[size]._hidden_size), device=device)
+
+    _warmup(ln_model, data.clone())
+    ln_fwd_out, _, ln_fwd_mem = _run_fwd(ln_model, data.clone())
+    ln_grads, ln_bwd_time, _ = _run_bwd(ln_model, ln_fwd_out)
+
+    _warmup(sln_model, data.clone())
+    sln_fwd_out, _, sln_fwd_mem = _run_fwd(sln_model, data.clone())
+    sln_grads, sln_bwd_time, _ = _run_bwd(sln_model, sln_fwd_out)
+
+    # Report raw measurements so a failure is diagnosable from the message alone.
+    assert ln_fwd_mem > 6 * sln_fwd_mem, (
+        "selective activation checkpointing does not reduce forward memory by 6X: "
+        f"native={ln_fwd_mem} bytes, checkpointed={sln_fwd_mem} bytes!"
+    )
+    assert ln_bwd_time < sln_bwd_time, (
+        "selective activation checkpointing backward pass is not slower than native: "
+        f"native={ln_bwd_time} ms, checkpointed={sln_bwd_time} ms!"
+    )
+    assert _max_diff(ln_fwd_out, sln_fwd_out) == 0.0, "outputs are not equal!"
+    for key in [
+        "layer_norm_weight",
+        "layer_norm_bias",
+        "fc1_weight",
+        "fc1_bias",
+        "fc2_weight",
+        "fc2_bias",
+    ]:
+        assert _max_diff(ln_grads[key], sln_grads[key]) == 0.0, f"gradients for {key} are not equal!"
+
diff --git a/tests/pytorch/selective_layernorm_mlp/compare.py b/tests/pytorch/selective_layernorm_mlp/compare.py
@@ -0,0 +1,156 @@
+import torch
+from transformer_engine.pytorch import LayerNormMLP
+import pytest
+
+torch.manual_seed(1234)
+device = torch.device("cuda")
+
+
+class _Sequential(torch.nn.Sequential):
+    """Sequential container whose forward hands the same keyword arguments to every child"""
+
+    def forward(self, input_: torch.Tensor, **kwargs) -> torch.Tensor:
+        """Feed ``input_`` through each child module in order, passing ``kwargs`` to all."""
+        hidden = input_
+        for layer in self:
+            hidden = layer(hidden, **kwargs)
+        return hidden
+
+
+class ModelConfig:
+    """Sizes for a stack of LayerNormMLP layers, plus a builder for paired models."""
+
+    def __init__(
+        self,
+        hidden_size: int = 128,
+        ffn_hidden_size: int = 512,
+        layers: int = 1,
+    ):
+        self._hidden_size = hidden_size
+        self._ffn_hidden_size = ffn_hidden_size
+        self._layers = layers
+
+    def build(self):
+        """Return ``(ln_model, sln_model)``: two stacks with identical parameter values.
+
+        Layers of ``ln_model`` are created with ``checkpoint=False`` and layers of
+        ``sln_model`` with ``checkpoint=True``; every checkpointing layer receives
+        clones of its counterpart's parameters.
+        """
+        copied = (
+            "layer_norm_weight",
+            "layer_norm_bias",
+            "fc1_weight",
+            "fc2_weight",
+            "fc1_bias",
+            "fc2_bias",
+        )
+        plain_layers, ckpt_layers = [], []
+        for _ in range(self._layers):
+            plain = LayerNormMLP(self._hidden_size, self._ffn_hidden_size, checkpoint=False).to(device)
+            ckpt = LayerNormMLP(self._hidden_size, self._ffn_hidden_size, checkpoint=True).to(device)
+            with torch.no_grad():
+                for attr in copied:
+                    setattr(ckpt, attr, torch.nn.Parameter(getattr(plain, attr).clone()))
+            plain_layers.append(plain)
+            ckpt_layers.append(ckpt)
+
+        return _Sequential(*plain_layers), _Sequential(*ckpt_layers)
+
+# Named model shapes; positional arguments are (hidden_size, ffn_hidden_size, layers).
+config = {
+    "small": ModelConfig(128, 512, 12),
+    "medium": ModelConfig(512, 2048, 12),
+    "large": ModelConfig(1024, 4096, 12),
+    "huge": ModelConfig(2048, 8192, 12),
+}
+
+# Sizes used for the first dimension of the random input tensor in the test.
+seq_sizes = [2**7, 2**10, 2**14, 2**16]
+
+def _warmup(model, tensor):
+    """Run three unmeasured forward/backward passes of ``model`` on ``tensor``."""
+    for _ in range(3):
+        loss = model(tensor).sum()
+        loss.backward()
+
+def _run_fwd(model, tensor):
+    """Run one forward pass of ``model`` on ``tensor`` and measure it.
+
+    Returns ``(out, elapsed, mem)``: the model output, the time between the two
+    CUDA events recorded around the forward call (``Event.elapsed_time`` reports
+    milliseconds), and the peak allocated bytes since the stats reset below minus
+    the bytes allocated just before the forward call.
+    """
+
+    torch.cuda.reset_peak_memory_stats(device)
+    start_time, end_time = torch.cuda.Event(enable_timing=True), torch.cuda.Event(
+        enable_timing=True
+    )
+
+    # Wait for previously queued work before taking the baseline and starting the timer.
+    torch.cuda.synchronize()
+    start_mem = torch.cuda.memory_allocated(device)
+    start_time.record()
+    out = model(tensor)
+    end_time.record()
+    # Block until the end event has completed so elapsed_time can be read.
+    end_time.synchronize()
+    elapsed = start_time.elapsed_time(end_time)
+    peak_mem = torch.cuda.max_memory_allocated(device)
+    mem = float(peak_mem - start_mem)
+
+    return out, elapsed, mem
+
+def _run_bwd(model, out):
+    """Run one backward pass from ``out.sum()`` and measure it.
+
+    Returns ``(param_grads, elapsed, mem)``: the result of
+    ``_collect_param_grads(model)``, the time between the two CUDA events
+    recorded around ``loss.backward()`` (``Event.elapsed_time`` reports
+    milliseconds), and the peak allocated bytes since the stats reset below minus
+    the bytes allocated just before the backward call.
+    """
+
+    # Zero any existing gradients in place rather than setting them to None.
+    model.zero_grad(set_to_none=False)
+    loss = out.sum()
+
+    torch.cuda.reset_peak_memory_stats(device)
+    start_time, end_time = torch.cuda.Event(enable_timing=True), torch.cuda.Event(
+        enable_timing=True
+    )
+
+    # Wait for previously queued work before taking the baseline and starting the timer.
+    torch.cuda.synchronize()
+    start_mem = torch.cuda.memory_allocated(device)
+    start_time.record()
+    loss.backward()
+    end_time.record()
+    # Block until the end event has completed so elapsed_time can be read.
+    end_time.synchronize()
+    elapsed = start_time.elapsed_time(end_time)
+    peak_mem = torch.cuda.max_memory_allocated(device)
+    mem = float(peak_mem - start_mem)
+
+    param_grads = _collect_param_grads(model)
+    return param_grads, elapsed, mem
+
+def _max_diff(ref, other):
+    """Return the max absolute difference between two tensors or collections.
+
+    Two ``None`` values compare as equal (0.0). A value present on only one side,
+    or sequences of different lengths, yield ``inf`` so that an equality assertion
+    on the result fails instead of silently passing. Empty sequences give 0.0.
+    """
+    if ref is None and other is None:
+        return 0.0
+    if ref is None or other is None:
+        # Exactly one side is missing: report a mismatch rather than "no difference".
+        return float("inf")
+    if isinstance(ref, (list, tuple)):
+        if len(ref) != len(other):
+            # zip() would silently drop the unmatched tail.
+            return float("inf")
+        return max((_max_diff(r, o) for r, o in zip(ref, other)), default=0.0)
+    return torch.max(torch.abs(ref.detach() - other.detach())).item()
+
+def _collect_param_grads(model):
+    """Gather cloned parameter gradients of ``model``, grouped by short parameter name.
+
+    ``_param_key`` reduces each name to its last dotted component, so the same
+    parameter in different child modules (e.g. ``0.fc1_weight`` and
+    ``1.fc1_weight``) shares one key. Each key maps to a list with one gradient
+    per occurrence, in ``named_parameters()`` order, so a later layer no longer
+    overwrites an earlier one. Parameters whose ``grad`` is ``None`` are skipped.
+    """
+    grads = {}
+    for name, param in model.named_parameters():
+        if param.grad is None:
+            continue
+        key = _param_key(name)
+        if key is not None:
+            grads.setdefault(key, []).append(param.grad.detach().clone())
+    return grads
+
+def _param_key(name):
+    """Return the part of ``name`` after its last ``.`` (the whole name if there is none)."""
+    _, _, leaf = name.rpartition(".")
+    return leaf
+
+
+@pytest.mark.parametrize("size", config.keys())
+@pytest.mark.parametrize("seq_size", seq_sizes)
+def test_selective_activation_checkpoint(size, seq_size):
+    """Compare LayerNormMLP stacks built with ``checkpoint=False`` and ``checkpoint=True``.
+
+    Checks that the checkpointing stack uses less than one sixth of the forward
+    memory, has a slower backward pass, and produces identical outputs and
+    parameter gradients.
+    """
+
+    ln_model, sln_model = config[size].build()
+    data = torch.randn((seq_size, config[size]._hidden_size), device=device)
+
+    _warmup(ln_model, data.clone())
+    ln_fwd_out, _, ln_fwd_mem = _run_fwd(ln_model, data.clone())
+    ln_grads, ln_bwd_time, _ = _run_bwd(ln_model, ln_fwd_out)
+
+    _warmup(sln_model, data.clone())
+    sln_fwd_out, _, sln_fwd_mem = _run_fwd(sln_model, data.clone())
+    sln_grads, sln_bwd_time, _ = _run_bwd(sln_model, sln_fwd_out)
+
+    # Report raw byte counts instead of a ratio: dividing by sln_fwd_mem would
+    # raise ZeroDivisionError while building the message if it were 0.
+    assert ln_fwd_mem > 6 * sln_fwd_mem, (
+        "selective activation checkpointing does not reduce forward memory by 6X: "
+        f"native={ln_fwd_mem} bytes, checkpointed={sln_fwd_mem} bytes!"
+    )
+    # This assertion fails when the checkpointed backward pass is NOT slower.
+    assert ln_bwd_time < sln_bwd_time, (
+        "selective activation checkpointing backward pass is not slower than native: "
+        f"native={ln_bwd_time} ms, checkpointed={sln_bwd_time} ms!"
+    )
+    assert _max_diff(ln_fwd_out, sln_fwd_out) == 0.0, "outputs are not equal!"
+    for key in [
+        "layer_norm_weight",
+        "layer_norm_bias",
+        "fc1_weight",
+        "fc1_bias",
+        "fc2_weight",
+        "fc2_bias",
+    ]:
+        assert _max_diff(ln_grads[key], sln_grads[key]) == 0.0, f"gradients for {key} are not equal!"
+
diff --git a/tests/pytorch/test_cuda_graphs.py b/tests/pytorch/test_cuda_graphs.py
@@ -176,7 +176,8 @@ def forward(self, input_: torch.Tensor, **kwargs) -> torch.Tensor:
     # creating TMA descriptor for MXFP8 quantization.
     "linear",
     "transformer",
-    "layernorm_mlp",
+    "layernorm_mlp_nocheckpoint",
+    "layernorm_mlp_checkpoint",
     "layernorm_linear",
     "mha",
     "linear_op",
@@ -218,15 +219,26 @@ def _test_cuda_graphs(
                 )
                 for _ in range(num_layers)
             ]
-        elif module == "layernorm_mlp":
+        elif module == "layernorm_mlp_nocheckpoint":
             modules = [
                 LayerNormMLP(
                     model_config.hidden_size,
                     model_config.hidden_size,
                     params_dtype=dtype,
+                    checkpoint=False,
                 )
                 for _ in range(num_layers)
             ]
+        elif module == "layernorm_mlp_checkpoint":
+            modules = [
+                LayerNormMLP(
+                    model_config.hidden_size,
+                    model_config.hidden_size,
+                    params_dtype=dtype,
+                    checkpoint=True,
+                )
+                for _ in range(num_layers)
+            ]            
         elif module == "layernorm_linear":
             modules = [
                 LayerNormLinear(
@@ -383,7 +395,8 @@ def test_make_graphed_callables(
 
 _test_make_graphed_callables_with_fp8_weight_caching_modules = [
     "transformer",
-    "layernorm_mlp",
+    "layernorm_mlp_nocheckpoint",
+    "layernorm_mlp_checkpoint",
     "layernorm_linear",
     "linear",
     "mha",