From ce6fd8abf1553f3ba46af09953e405dc9f44863f Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Thu, 16 Oct 2025 10:12:37 -0700
Subject: [PATCH 01/17] enable megatron calculate_per_token_loss

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 examples/configs/dpo.yaml                            | 1 -
 examples/configs/grpo_math_1B.yaml                   | 1 -
 examples/configs/grpo_math_1B_megatron.yaml          | 1 -
 examples/configs/rm.yaml                             | 1 -
 examples/configs/sft.yaml                            | 1 -
 examples/configs/sft_openmathinstruct2_megatron.yaml | 1 -
 examples/configs/vlm_grpo_3B.yaml                    | 1 -
 examples/configs/vlm_grpo_3B_megatron.yaml           | 1 -
 nemo_rl/models/policy/megatron_policy_worker.py      | 8 +++++---
 9 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml
index 6af1405533..a450e9e397 100755
--- a/examples/configs/dpo.yaml
+++ b/examples/configs/dpo.yaml
@@ -153,7 +153,6 @@ policy:
       grad_reduce_in_fp32: false
       overlap_grad_reduce: true
       overlap_param_gather: true
-      average_in_collective: true
       data_parallel_sharding_strategy: "optim_grads_params"
     
 data:
diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml
index 2c3642b4c8..c3b8ed22dd 100644
--- a/examples/configs/grpo_math_1B.yaml
+++ b/examples/configs/grpo_math_1B.yaml
@@ -130,7 +130,6 @@ policy:
       grad_reduce_in_fp32: false
       overlap_grad_reduce: true
       overlap_param_gather: true
-      average_in_collective: true
       use_custom_fsdp: false
       data_parallel_sharding_strategy: "optim_grads_params"
 
diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml
index 4246fee777..bb6b2ab5f1 100644
--- a/examples/configs/grpo_math_1B_megatron.yaml
+++ b/examples/configs/grpo_math_1B_megatron.yaml
@@ -133,7 +133,6 @@ policy:
       grad_reduce_in_fp32: false
       overlap_grad_reduce: true
       overlap_param_gather: true
-      average_in_collective: true
       use_custom_fsdp: false
       data_parallel_sharding_strategy: "optim_grads_params"
 
diff --git a/examples/configs/rm.yaml b/examples/configs/rm.yaml
index cdbf900fb4..d5a026ec98 100644
--- a/examples/configs/rm.yaml
+++ b/examples/configs/rm.yaml
@@ -123,7 +123,6 @@ policy:
       grad_reduce_in_fp32: false
       overlap_grad_reduce: true
       overlap_param_gather: false
-      average_in_collective: true
       data_parallel_sharding_strategy: "optim_grads_params"
 
     
diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml
index 62873bc9d4..dbc2270923 100644
--- a/examples/configs/sft.yaml
+++ b/examples/configs/sft.yaml
@@ -134,7 +134,6 @@ policy:
       grad_reduce_in_fp32: false
       overlap_grad_reduce: true
       overlap_param_gather: true
-      average_in_collective: true
       data_parallel_sharding_strategy: "optim_grads_params"
       use_custom_fsdp: false
 
diff --git a/examples/configs/sft_openmathinstruct2_megatron.yaml b/examples/configs/sft_openmathinstruct2_megatron.yaml
index 9621bdfe4b..3a42c4cf2e 100644
--- a/examples/configs/sft_openmathinstruct2_megatron.yaml
+++ b/examples/configs/sft_openmathinstruct2_megatron.yaml
@@ -35,7 +35,6 @@ policy:
     activation_checkpointing: false
     context_parallel_size: 1
     distributed_data_parallel_config:
-      average_in_collective: true
       data_parallel_sharding_strategy: optim_grads_params
       grad_reduce_in_fp32: true
       overlap_grad_reduce: true
diff --git a/examples/configs/vlm_grpo_3B.yaml b/examples/configs/vlm_grpo_3B.yaml
index f3172a3131..e25856cad8 100644
--- a/examples/configs/vlm_grpo_3B.yaml
+++ b/examples/configs/vlm_grpo_3B.yaml
@@ -117,7 +117,6 @@ policy:
       grad_reduce_in_fp32: false
       overlap_grad_reduce: true
       overlap_param_gather: true
-      average_in_collective: true
       use_custom_fsdp: false
       data_parallel_sharding_strategy: "optim_grads_params"
 
diff --git a/examples/configs/vlm_grpo_3B_megatron.yaml b/examples/configs/vlm_grpo_3B_megatron.yaml
index 2bf0e86184..9b5714f087 100644
--- a/examples/configs/vlm_grpo_3B_megatron.yaml
+++ b/examples/configs/vlm_grpo_3B_megatron.yaml
@@ -146,7 +146,6 @@ policy:
       grad_reduce_in_fp32: false
       overlap_grad_reduce: false
       overlap_param_gather: true
-      average_in_collective: true
       use_custom_fsdp: false
       data_parallel_sharding_strategy: optim_grads_params
 data:
diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py
index 2d84509879..6a9f759fe3 100644
--- a/nemo_rl/models/policy/megatron_policy_worker.py
+++ b/nemo_rl/models/policy/megatron_policy_worker.py
@@ -15,6 +15,7 @@
 import math
 import os
 import time
+from tokenize import triple_quoted
 import warnings
 from collections import defaultdict
 from contextlib import AbstractContextManager, contextmanager, nullcontext
@@ -658,6 +659,9 @@ def __init__(
             "https://github.com/NVIDIA-NeMo/RL/blob/bccbc377705a81a1f4b3c31ad9767bcc15f735a8/nemo_rl/algorithms/sft.py#L175-L179."
         )
 
+        ## TODO: make sure this works with sequence-level losses as well
+        model_cfg.calculate_per_token_loss = True
+
         self.megatron_cfg = ConfigContainer(
             model=model_cfg,
             checkpoint=checkpoint_config,
@@ -683,9 +687,7 @@ def __init__(
                 overlap_param_gather=self.cfg["megatron_cfg"][
                     "distributed_data_parallel_config"
                 ]["overlap_param_gather"],
-                average_in_collective=self.cfg["megatron_cfg"][
-                    "distributed_data_parallel_config"
-                ]["average_in_collective"],
+                average_in_collective=False, # average in collective is not supported with per-token loss
                 use_distributed_optimizer=self.cfg["megatron_cfg"]["optimizer"][
                     "use_distributed_optimizer"
                 ],

From bdcee3cc836b78c0e6b69c9202bd3b3bb0369d6e Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Fri, 17 Oct 2025 16:43:24 -0700
Subject: [PATCH 02/17] remove unneeded import

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 nemo_rl/models/policy/megatron_policy_worker.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py
index 6a9f759fe3..56c6c02ccf 100644
--- a/nemo_rl/models/policy/megatron_policy_worker.py
+++ b/nemo_rl/models/policy/megatron_policy_worker.py
@@ -15,7 +15,6 @@
 import math
 import os
 import time
-from tokenize import triple_quoted
 import warnings
 from collections import defaultdict
 from contextlib import AbstractContextManager, contextmanager, nullcontext

From 408900a4b33ef7ab16275692f2680bed6c4cbe22 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Thu, 23 Oct 2025 15:27:54 -0700
Subject: [PATCH 03/17] TP bug fix and add WIP unit test

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 .../models/policy/megatron_policy_worker.py   |  15 +-
 .../models/policy/test_megatron_worker.py     | 155 ++++++++++++++++++
 2 files changed, 168 insertions(+), 2 deletions(-)

diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py
index 56c6c02ccf..ef61ac76fc 100644
--- a/nemo_rl/models/policy/megatron_policy_worker.py
+++ b/nemo_rl/models/policy/megatron_policy_worker.py
@@ -658,8 +658,13 @@ def __init__(
             "https://github.com/NVIDIA-NeMo/RL/blob/bccbc377705a81a1f4b3c31ad9767bcc15f735a8/nemo_rl/algorithms/sft.py#L175-L179."
         )
 
-        ## TODO: make sure this works with sequence-level losses as well
+        ## These settings are required for correct gradient computations in mcore
+        ## when calculate_per_token_loss is True, there is no scaling of the gradient in mcore,
+        ## so we handle the scaling in nemo-rl.
+        ## perform_initialization = True is a workaround to ensure the correct tensor parallel attributes are set
+        ## on the TP-sharded parameters.
         model_cfg.calculate_per_token_loss = True
+        model_cfg.perform_initialization = True
 
         self.megatron_cfg = ConfigContainer(
             model=model_cfg,
@@ -686,7 +691,9 @@ def __init__(
                 overlap_param_gather=self.cfg["megatron_cfg"][
                     "distributed_data_parallel_config"
                 ]["overlap_param_gather"],
-                average_in_collective=False, # average in collective is not supported with per-token loss
+                # we need to set average_in_collective=False with calculate_per_token_loss=True.
+                # otherwise, mcore throws an assertion error.
+                average_in_collective=False,
                 use_distributed_optimizer=self.cfg["megatron_cfg"]["optimizer"][
                     "use_distributed_optimizer"
                 ],
@@ -704,6 +711,10 @@ def __init__(
             ),
         )
         self.megatron_cfg.validate()
+        assert (
+           "aux_loss" not in self.megatron_cfg.moe_router_load_balancing_type or 
+           self.megatron_cfg.moe_aux_loss_coeff == 0
+        ), "MoE aux loss is currently not supported due to a known but in Megatron-LM. See ## TODO: link to GH issue"
         (
             self.mcore_state,
             self.model,
diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py
index 48b2c01dc8..c3a401c634 100644
--- a/tests/unit/models/policy/test_megatron_worker.py
+++ b/tests/unit/models/policy/test_megatron_worker.py
@@ -16,6 +16,7 @@
 import time
 from typing import Optional
 
+import numpy as np
 import pytest
 import torch
 
@@ -1824,6 +1825,160 @@ def test_megatron_context_parallel_training_agreement(tiny_llama_model_path):
     )
 
 
+@pytest.mark.hf_gated
+@pytest.mark.timeout(300)
+def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_path):
+    """Test that gradient norms are consistent across different TP and DP configurations.
+    
+    This test validates that the same model produces identical gradient norms
+    regardless of tensor parallelism (TP) and data parallelism (DP) settings.
+    """
+    batch_size = 8
+    seq_len = 64
+    vocab_size = 32000
+    
+    # Create reproducible test data
+    torch.manual_seed(42)
+    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))
+    attention_mask = torch.ones(batch_size, seq_len)
+    input_lengths = attention_mask.sum(dim=1).to(torch.int32)
+    labels = torch.randint(0, vocab_size, (batch_size, seq_len))
+    
+    data = BatchedDataDict(
+        {
+            "input_ids": input_ids,
+            "input_lengths": input_lengths,
+            "attention_mask": attention_mask,
+            "labels": labels,
+            "sample_mask": torch.ones(batch_size),
+        }
+    )
+    
+    # Test configurations: (num_gpus, tp, pp, description)
+    test_configs = [
+        (1, 1, 1, "DP1TP1")
+        (2, 1, 1, "DP2"),  # Data parallel with 2 GPUs
+        (2, 2, 1, "TP2"),  # Tensor parallel with 2 GPUs
+    ]
+    
+    grad_norms = {}
+    losses = {}
+    
+    for num_gpus, tp, pp, desc in test_configs:
+        print(f"\n=== Testing {desc} configuration (GPUs={num_gpus}, TP={tp}, PP={pp}) ===")
+        
+        cluster = RayVirtualCluster(
+            name=f"test-grad-norm-{desc.lower()}",
+            bundle_ct_per_node_list=[num_gpus],
+            use_gpus=True,
+            num_gpus_per_node=num_gpus,
+            max_colocated_worker_groups=1,
+        )
+        
+        config = create_megatron_test_config(
+            model_name=tiny_llama_model_path,
+            tp=tp,
+            pp=pp,
+            precision="float32",  # Use float32 for more stable gradient comparisons
+        )
+        
+        tokenizer = get_tokenizer(config["tokenizer"])
+        config["generation"] = configure_generation_config(
+            config["generation"], tokenizer
+        )
+        
+        policy = Policy(
+            cluster=cluster,
+            config=config,
+            tokenizer=tokenizer,
+            init_reference_model=False,
+        )
+        
+        # Use SimpleLoss for consistent comparison
+        loss_fn = SimpleLoss()
+        
+        try:
+            # Prepare for training
+            policy.prepare_for_training()
+            
+            # Perform one forward/backward step
+            print(f"Performing forward/backward pass for {desc}...")
+            results = policy.train(data, loss_fn)
+            
+            # Extract metrics
+            loss_tensor = results["loss"]
+            all_metrics = results["all_mb_metrics"]
+            
+            # Verify loss is valid
+            assert not torch.isnan(loss_tensor).any(), f"Loss should not be NaN for {desc}"
+            assert not torch.isinf(loss_tensor).any(), f"Loss should not be Inf for {desc}"
+            
+            # Extract gradient norm
+            assert "grad_norm" in all_metrics, f"grad_norm should be in metrics for {desc}"
+            grad_norm = all_metrics["grad_norm"]
+            
+            # Store results for comparison
+            grad_norms[desc] = grad_norm
+            losses[desc] = loss_tensor.cpu().numpy()
+            
+            print(f"{desc} - Loss: {loss_tensor}")
+            print(f"{desc} - Grad norm: {grad_norm}")
+            
+        finally:
+            policy.shutdown()
+            cluster.shutdown()
+    
+    # Compare gradient norms across configurations
+    print("\n=== Comparing gradient norms across configurations ===")
+    
+    # Get reference values from DP2 configuration
+    reference_config = "DP1TP1"
+    reference_grad_norm = grad_norms[reference_config]
+    reference_loss = losses[reference_config]
+    
+    for config_name, grad_norm in grad_norms.items():
+        if config_name == reference_config:
+            continue
+            
+        print(f"\nComparing {config_name} with {reference_config}:")
+        print(f"  {reference_config} grad norm: {reference_grad_norm}")
+        print(f"  {config_name} grad norm: {grad_norm}")
+        
+        # Compare gradient norms
+        if not isinstance(grad_norm, list):
+            grad_norm = [grad_norm]
+            reference_grad_norm = [reference_grad_norm]
+        if isinstance(grad_norm, list) and isinstance(reference_grad_norm, list):
+            # Handle case where grad_norm is a list (multiple microbatches)
+            assert len(grad_norm) == len(reference_grad_norm), (
+                f"Number of gradient norm values should match: {len(grad_norm)} vs {len(reference_grad_norm)}"
+            )
+            
+            for i, (gn, ref_gn) in enumerate(zip(grad_norm, reference_grad_norm)):
+                grad_diff = abs(gn - ref_gn)
+                relative_diff = grad_diff / (ref_gn + 1e-8)
+                print(f"    Microbatch {i}: {ref_gn} vs {gn}, diff={grad_diff:.6f}, rel_diff={relative_diff:.6f}")
+                
+                # Allow small differences due to floating point precision and parallelization
+                assert relative_diff < 0.01 or grad_diff < 1e-6, (
+                    f"Gradient norm difference too large for microbatch {i}: "
+                    f"{ref_gn} vs {gn} (diff={grad_diff:.6f}, rel_diff={relative_diff:.6f})"
+                )
+        
+        # Compare losses (should also be identical for same computation)
+        loss_diff = np.max(np.abs(reference_loss - losses[config_name]))
+        relative_loss_diff = loss_diff / (np.mean(np.abs(reference_loss)) + 1e-8)
+        print(f"    Loss diff: {loss_diff:.6f}, relative loss diff: {relative_loss_diff:.6f}")
+        
+        # Allow small differences in loss as well
+        assert relative_loss_diff < 0.01 or loss_diff < 1e-6, (
+            f"Loss difference too large: "
+            f"max diff={loss_diff:.6f}, rel_diff={relative_loss_diff:.6f}"
+        )
+    
+    print("\n✓ SUCCESS: Gradient norms are consistent across all parallelization configurations!")
+
+
 @pytest.mark.hf_gated
 @pytest.mark.timeout(300)
 def test_megatron_policy_flops_range_check(tiny_llama_model_path):

From d8348d1f63e92ba111710f6628d39ca9ba14cbec Mon Sep 17 00:00:00 2001
From: Anna Shors <ashors@nvidia.com>
Date: Fri, 24 Oct 2025 16:00:07 -0700
Subject: [PATCH 04/17] fixes

Signed-off-by: Anna Shors <ashors@nvidia.com>
---
 nemo_rl/models/policy/megatron_policy_worker.py  | 11 ++++++-----
 tests/unit/models/policy/test_megatron_worker.py |  7 +++++--
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py
index ef61ac76fc..9e4e9952a4 100644
--- a/nemo_rl/models/policy/megatron_policy_worker.py
+++ b/nemo_rl/models/policy/megatron_policy_worker.py
@@ -664,7 +664,12 @@ def __init__(
         ## perform_initialization = True is a workaround to ensure the correct tensor parallel attributes are set
         ## on the TP-sharded parameters.
         model_cfg.calculate_per_token_loss = True
-        model_cfg.perform_initialization = True
+        model_cfg.perform_initialization = False
+
+        assert (
+           "aux_loss" not in self.model_cfg.moe_router_load_balancing_type or
+           self.model_cfg.moe_aux_loss_coeff == 0
+        ), "MoE aux loss is currently not supported due to a known but in Megatron-LM. See ## TODO: link to GH issue"
 
         self.megatron_cfg = ConfigContainer(
             model=model_cfg,
@@ -711,10 +716,6 @@ def __init__(
             ),
         )
         self.megatron_cfg.validate()
-        assert (
-           "aux_loss" not in self.megatron_cfg.moe_router_load_balancing_type or 
-           self.megatron_cfg.moe_aux_loss_coeff == 0
-        ), "MoE aux loss is currently not supported due to a known but in Megatron-LM. See ## TODO: link to GH issue"
         (
             self.mcore_state,
             self.model,
diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py
index c3a401c634..47fffec1ee 100644
--- a/tests/unit/models/policy/test_megatron_worker.py
+++ b/tests/unit/models/policy/test_megatron_worker.py
@@ -1851,12 +1851,13 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_
             "attention_mask": attention_mask,
             "labels": labels,
             "sample_mask": torch.ones(batch_size),
+            "token_mask": torch.ones_like(input_ids),
         }
     )
     
     # Test configurations: (num_gpus, tp, pp, description)
     test_configs = [
-        (1, 1, 1, "DP1TP1")
+        (1, 1, 1, "DP1TP1"),
         (2, 1, 1, "DP2"),  # Data parallel with 2 GPUs
         (2, 2, 1, "TP2"),  # Tensor parallel with 2 GPUs
     ]
@@ -1895,7 +1896,7 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_
         )
         
         # Use SimpleLoss for consistent comparison
-        loss_fn = SimpleLoss()
+        loss_fn = NLLLoss()
         
         try:
             # Prepare for training
@@ -1932,6 +1933,8 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_
     print("\n=== Comparing gradient norms across configurations ===")
     
     # Get reference values from DP2 configuration
+    # NOTE: even if TP2 config passes these tests, it doesn't necessarily imply
+    # there are no bugs. Sometimes bugs with grad norm are hard to catch.
     reference_config = "DP1TP1"
     reference_grad_norm = grad_norms[reference_config]
     reference_loss = losses[reference_config]

From 9d0c41021ab570ab5e03120eea0479561ecf9ecf Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Fri, 24 Oct 2025 16:04:57 -0700
Subject: [PATCH 05/17] lint

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 .../models/policy/megatron_policy_worker.py   |  8 +-
 .../models/policy/test_megatron_worker.py     | 86 +++++++++++--------
 2 files changed, 55 insertions(+), 39 deletions(-)

diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py
index 9e4e9952a4..3a1801fad6 100644
--- a/nemo_rl/models/policy/megatron_policy_worker.py
+++ b/nemo_rl/models/policy/megatron_policy_worker.py
@@ -667,9 +667,11 @@ def __init__(
         model_cfg.perform_initialization = False
 
         assert (
-           "aux_loss" not in self.model_cfg.moe_router_load_balancing_type or
-           self.model_cfg.moe_aux_loss_coeff == 0
-        ), "MoE aux loss is currently not supported due to a known but in Megatron-LM. See ## TODO: link to GH issue"
+            "aux_loss" not in self.model_cfg.moe_router_load_balancing_type
+            or self.model_cfg.moe_aux_loss_coeff == 0
+        ), (
+            "MoE aux loss is currently not supported due to a known but in Megatron-LM. See ## TODO: link to GH issue"
+        )
 
         self.megatron_cfg = ConfigContainer(
             model=model_cfg,
diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py
index 47fffec1ee..1ad6a7052f 100644
--- a/tests/unit/models/policy/test_megatron_worker.py
+++ b/tests/unit/models/policy/test_megatron_worker.py
@@ -1829,21 +1829,21 @@ def test_megatron_context_parallel_training_agreement(tiny_llama_model_path):
 @pytest.mark.timeout(300)
 def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_path):
     """Test that gradient norms are consistent across different TP and DP configurations.
-    
+
     This test validates that the same model produces identical gradient norms
     regardless of tensor parallelism (TP) and data parallelism (DP) settings.
     """
     batch_size = 8
     seq_len = 64
     vocab_size = 32000
-    
+
     # Create reproducible test data
     torch.manual_seed(42)
     input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))
     attention_mask = torch.ones(batch_size, seq_len)
     input_lengths = attention_mask.sum(dim=1).to(torch.int32)
     labels = torch.randint(0, vocab_size, (batch_size, seq_len))
-    
+
     data = BatchedDataDict(
         {
             "input_ids": input_ids,
@@ -1854,20 +1854,22 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_
             "token_mask": torch.ones_like(input_ids),
         }
     )
-    
+
     # Test configurations: (num_gpus, tp, pp, description)
     test_configs = [
         (1, 1, 1, "DP1TP1"),
         (2, 1, 1, "DP2"),  # Data parallel with 2 GPUs
         (2, 2, 1, "TP2"),  # Tensor parallel with 2 GPUs
     ]
-    
+
     grad_norms = {}
     losses = {}
-    
+
     for num_gpus, tp, pp, desc in test_configs:
-        print(f"\n=== Testing {desc} configuration (GPUs={num_gpus}, TP={tp}, PP={pp}) ===")
-        
+        print(
+            f"\n=== Testing {desc} configuration (GPUs={num_gpus}, TP={tp}, PP={pp}) ==="
+        )
+
         cluster = RayVirtualCluster(
             name=f"test-grad-norm-{desc.lower()}",
             bundle_ct_per_node_list=[num_gpus],
@@ -1875,78 +1877,84 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_
             num_gpus_per_node=num_gpus,
             max_colocated_worker_groups=1,
         )
-        
+
         config = create_megatron_test_config(
             model_name=tiny_llama_model_path,
             tp=tp,
             pp=pp,
             precision="float32",  # Use float32 for more stable gradient comparisons
         )
-        
+
         tokenizer = get_tokenizer(config["tokenizer"])
         config["generation"] = configure_generation_config(
             config["generation"], tokenizer
         )
-        
+
         policy = Policy(
             cluster=cluster,
             config=config,
             tokenizer=tokenizer,
             init_reference_model=False,
         )
-        
+
         # Use SimpleLoss for consistent comparison
         loss_fn = NLLLoss()
-        
+
         try:
             # Prepare for training
             policy.prepare_for_training()
-            
+
             # Perform one forward/backward step
             print(f"Performing forward/backward pass for {desc}...")
             results = policy.train(data, loss_fn)
-            
+
             # Extract metrics
             loss_tensor = results["loss"]
             all_metrics = results["all_mb_metrics"]
-            
+
             # Verify loss is valid
-            assert not torch.isnan(loss_tensor).any(), f"Loss should not be NaN for {desc}"
-            assert not torch.isinf(loss_tensor).any(), f"Loss should not be Inf for {desc}"
-            
+            assert not torch.isnan(loss_tensor).any(), (
+                f"Loss should not be NaN for {desc}"
+            )
+            assert not torch.isinf(loss_tensor).any(), (
+                f"Loss should not be Inf for {desc}"
+            )
+
             # Extract gradient norm
-            assert "grad_norm" in all_metrics, f"grad_norm should be in metrics for {desc}"
+            assert "grad_norm" in all_metrics, (
+                f"grad_norm should be in metrics for {desc}"
+            )
             grad_norm = all_metrics["grad_norm"]
-            
+
             # Store results for comparison
             grad_norms[desc] = grad_norm
             losses[desc] = loss_tensor.cpu().numpy()
-            
+
             print(f"{desc} - Loss: {loss_tensor}")
             print(f"{desc} - Grad norm: {grad_norm}")
-            
+
         finally:
             policy.shutdown()
             cluster.shutdown()
-    
+
     # Compare gradient norms across configurations
     print("\n=== Comparing gradient norms across configurations ===")
-    
+
     # Get reference values from DP2 configuration
     # NOTE: even if TP2 config passes these tests, it doesn't necessarily imply
     # there are no bugs. Sometimes bugs with grad norm are hard to catch.
     reference_config = "DP1TP1"
     reference_grad_norm = grad_norms[reference_config]
     reference_loss = losses[reference_config]
-    
+
     for config_name, grad_norm in grad_norms.items():
         if config_name == reference_config:
             continue
-            
+
         print(f"\nComparing {config_name} with {reference_config}:")
         print(f"  {reference_config} grad norm: {reference_grad_norm}")
         print(f"  {config_name} grad norm: {grad_norm}")
-        
+
         # Compare gradient norms
         if not isinstance(grad_norm, list):
             grad_norm = [grad_norm]
@@ -1956,30 +1964,36 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_
             assert len(grad_norm) == len(reference_grad_norm), (
                 f"Number of gradient norm values should match: {len(grad_norm)} vs {len(reference_grad_norm)}"
             )
-            
+
             for i, (gn, ref_gn) in enumerate(zip(grad_norm, reference_grad_norm)):
                 grad_diff = abs(gn - ref_gn)
                 relative_diff = grad_diff / (ref_gn + 1e-8)
-                print(f"    Microbatch {i}: {ref_gn} vs {gn}, diff={grad_diff:.6f}, rel_diff={relative_diff:.6f}")
-                
+                print(
+                    f"    Microbatch {i}: {ref_gn} vs {gn}, diff={grad_diff:.6f}, rel_diff={relative_diff:.6f}"
+                )
+
                 # Allow small differences due to floating point precision and parallelization
                 assert relative_diff < 0.01 or grad_diff < 1e-6, (
                     f"Gradient norm difference too large for microbatch {i}: "
                     f"{ref_gn} vs {gn} (diff={grad_diff:.6f}, rel_diff={relative_diff:.6f})"
                 )
-        
+
         # Compare losses (should also be identical for same computation)
         loss_diff = np.max(np.abs(reference_loss - losses[config_name]))
         relative_loss_diff = loss_diff / (np.mean(np.abs(reference_loss)) + 1e-8)
-        print(f"    Loss diff: {loss_diff:.6f}, relative loss diff: {relative_loss_diff:.6f}")
-        
+        print(
+            f"    Loss diff: {loss_diff:.6f}, relative loss diff: {relative_loss_diff:.6f}"
+        )
+
         # Allow small differences in loss as well
         assert relative_loss_diff < 0.01 or loss_diff < 1e-6, (
             f"Loss difference too large: "
             f"max diff={loss_diff:.6f}, rel_diff={relative_loss_diff:.6f}"
         )
-    
-    print("\n✓ SUCCESS: Gradient norms are consistent across all parallelization configurations!")
+
+    print(
+        "\n✓ SUCCESS: Gradient norms are consistent across all parallelization configurations!"
+    )
 
 
 @pytest.mark.hf_gated

From 72a5c73ced440cbd94b0d4e207573dd41ea588a3 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Tue, 28 Oct 2025 08:57:49 -0700
Subject: [PATCH 06/17] add tensor parallel attributes check

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 .../models/policy/megatron_policy_worker.py   | 44 +++++++++++++++++++
 .../models/policy/test_megatron_worker.py     | 40 +++++++++++++++++
 2 files changed, 84 insertions(+)

diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py
index 3a1801fad6..daf5f0acd3 100644
--- a/nemo_rl/models/policy/megatron_policy_worker.py
+++ b/nemo_rl/models/policy/megatron_policy_worker.py
@@ -2034,3 +2034,47 @@ def start_gpu_profiling(self) -> None:
     def stop_gpu_profiling(self) -> None:
         """Stop GPU profiling."""
         torch.cuda.profiler.stop()
+
+    def check_tensor_parallel_attributes(self) -> dict[str, Any]:
+        """Check tensor parallel attributes on model parameters.
+
+        Returns:
+            Dictionary containing information about tensor parallel parameters:
+            - tp_params: List of parameter names that have tensor_model_parallel=True
+            - non_tp_params: List of parameter names that have tensor_model_parallel=False
+            - total_params: Total number of parameters checked
+            - tp_size: Tensor parallel size from config
+        """
+        tp_params = []
+        non_tp_params = []
+        total_params = 0
+
+        for name, param in self.model.named_parameters():
+            total_params += 1
+            tensor_model_parallel = getattr(param, "tensor_model_parallel", False)
+
+            if tensor_model_parallel:
+                tp_params.append(
+                    {
+                        "name": name,
+                        "tensor_model_parallel": tensor_model_parallel,
+                        "partition_dim": getattr(param, "partition_dim", None),
+                        "partition_stride": getattr(param, "partition_stride", None),
+                        "shape": list(param.shape),
+                    }
+                )
+            else:
+                non_tp_params.append(
+                    {
+                        "name": name,
+                        "tensor_model_parallel": tensor_model_parallel,
+                        "shape": list(param.shape),
+                    }
+                )
+
+        return {
+            "tp_params": tp_params,
+            "non_tp_params": non_tp_params,
+            "total_params": total_params,
+            "tp_size": self.megatron_cfg.model.tensor_model_parallel_size,
+        }
diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py
index 1ad6a7052f..0411726fbf 100644
--- a/tests/unit/models/policy/test_megatron_worker.py
+++ b/tests/unit/models/policy/test_megatron_worker.py
@@ -18,6 +18,7 @@
 
 import numpy as np
 import pytest
+import ray
 import torch
 
 from nemo_rl.algorithms.interfaces import LossFunction
@@ -1933,6 +1934,45 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_
             print(f"{desc} - Loss: {loss_tensor}")
             print(f"{desc} - Grad norm: {grad_norm}")
 
+            # Check tensor parallel attributes on model parameters
+            print(f"Checking tensor parallel attributes for {desc}...")
+            tp_check_futures = policy.worker_group.run_all_workers_single_data(
+                "check_tensor_parallel_attributes"
+            )
+            tp_check_results = [ray.get(future) for future in tp_check_futures]
+
+            # Analyze the first worker's results (all workers should have the same structure)
+            tp_info = tp_check_results[0]
+
+            print(f"{desc} - TP size: {tp_info['tp_size']}")
+            print(f"{desc} - Total params: {tp_info['total_params']}")
+            print(f"{desc} - TP params: {len(tp_info['tp_params'])}")
+            print(f"{desc} - Non-TP params: {len(tp_info['non_tp_params'])}")
+
+            # Validate tensor parallel attributes
+            expected_tp_size = tp
+            assert tp_info["tp_size"] == expected_tp_size, (
+                f"Expected TP size {expected_tp_size}, got {tp_info['tp_size']}"
+            )
+
+            if tp > 1:
+                # When tensor parallelism is enabled, we should have some TP parameters
+                assert len(tp_info["tp_params"]) > 0, (
+                    f"Expected tensor parallel parameters when TP={tp}, but found none"
+                )
+            else:
+                # When tensor parallelism is disabled, no parameters should have TP attributes
+                if len(tp_info["tp_params"]) > 0:
+                    print(
+                        f"WARNING: Found {len(tp_info['tp_params'])} TP parameters when TP=1:"
+                    )
+                    for tp_param in tp_info["tp_params"][:3]:  # Show first 3
+                        print(f"  - {tp_param['name']}")
+
+                print(
+                    f"✓ {desc} - Model configured for TP={tp} (no tensor parallel sharding expected)"
+                )
+
         finally:
             policy.shutdown()
             cluster.shutdown()

From dcdd016f79dda0afe658df8fbfee080a235069a0 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Tue, 28 Oct 2025 09:13:06 -0700
Subject: [PATCH 07/17] add convergence test

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 .../llm/sft-qwen2.5-math-7b-megatron.yaml     | 149 ++++++++++++++++++
 .../llm/sft-qwen2.5-math-7b-megatron.sh       |  42 +++++
 tests/test_suites/nightly.txt                 |   2 +
 3 files changed, 193 insertions(+)
 create mode 100644 examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
 create mode 100644 tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh

diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
new file mode 100644
index 0000000000..95586bf3fb
--- /dev/null
+++ b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
@@ -0,0 +1,149 @@
+# SFT Algorithm Configuration
+defaults: ../../sft.yaml
+sft:
+  ## total number of steps to train will equal
+  ## min((max_num_epochs * len(train_dataloader)), max_num_steps)
+  max_num_epochs: 1
+  max_num_steps: 80
+
+  val_period: 10
+  val_batches: 8
+  val_global_batch_size: 32
+  val_micro_batch_size: 1
+  val_at_start: true
+  seed: 42
+
+checkpointing:
+  enabled: false
+  checkpoint_dir: "results/sft"
+  metric_name: "val_loss" ## set to null to save most recent k checkpoints
+  higher_is_better: false
+  keep_top_k: 3
+  save_period: 10
+  checkpoint_must_save_by: null
+
+policy:
+  model_name: "Qwen/Qwen2.5-Math-7B"
+  tokenizer:
+    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
+  train_global_batch_size: 512
+  train_micro_batch_size: 1
+  generation_batch_size: 32 # Only used when generating using HF backend
+  logprob_batch_size: 1 #4
+  max_total_sequence_length: 16384 # max_prompt_length + max_response_length
+  precision: "bfloat16"
+
+  dtensor_cfg:
+    enabled: false
+  
+  megatron_cfg:
+    enabled: true
+    empty_unused_memory_level: 1
+    activation_checkpointing: false
+    tensor_model_parallel_size: 4
+    pipeline_model_parallel_size: 1
+    num_layers_in_first_pipeline_stage: null
+    num_layers_in_last_pipeline_stage: null
+    context_parallel_size: 2
+    expert_tensor_parallel_size: 1
+    expert_model_parallel_size: 1
+    pipeline_dtype: ${policy.precision}
+
+    sequence_parallel: true
+    freeze_moe_router: true
+    moe_router_dtype: "fp64"
+    moe_router_load_balancing_type: "seq_aux_loss" # causes logprob error divergence for grpo
+    moe_aux_loss_coeff: 0.01
+    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
+    #gives ~20% training perf speedup with sequence packing
+    apply_rope_fusion: True
+    moe_permute_fusion: True
+
+    optimizer:
+      optimizer: "adam"
+      lr: 1.0e-6
+      min_lr: 1.0e-6
+      weight_decay: 0.1
+      bf16: true
+      fp16: false
+      params_dtype: "float32"
+
+      #adam
+      adam_beta1: 0.9
+      adam_beta2: 0.999
+      adam_eps: 1e-8
+
+      #sgd
+      sgd_momentum: 0.9
+
+      #distributed optimizer
+      use_distributed_optimizer: False #true
+      use_precision_aware_optimizer: False #true
+
+      clip_grad: ${policy.max_grad_norm}
+
+    scheduler:
+      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
+      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
+      weight_decay_incr_style: "constant"
+      lr_decay_style: "constant"
+      lr_decay_iters: null
+      lr_warmup_iters: 10
+      lr_warmup_init: 0.00000000001
+
+    distributed_data_parallel_config:
+      grad_reduce_in_fp32: false
+      overlap_grad_reduce: true
+      overlap_param_gather: true
+      use_custom_fsdp: false
+      data_parallel_sharding_strategy: "optim_grads_params"
+
+  dynamic_batching:
+    enabled: false
+
+  sequence_packing:
+    enabled: True
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
+    algorithm: "modified_first_fit_decreasing"
+    sequence_length_round: 64
+
+  # makes the training sequence length divisible by the tensor parallel size
+  # this is useful for sequence parallel training
+  make_sequence_length_divisible_by: 32
+  max_grad_norm: 1.0
+
+data:
+  max_input_seq_length: ${policy.max_total_sequence_length}
+  dataset_name: "openmathinstruct2"
+  prompt_file: examples/prompts/math.txt
+  split: "train_1M"
+  add_bos: true
+  add_eos: true
+  add_generation_prompt: true
+  output_key: 'generated_solution'
+  shuffle: true
+  num_workers: 8
+
+logger:
+  log_dir: "logs"  # Base directory for all logs
+  wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running
+  tensorboard_enabled: true
+  mlflow_enabled: false
+  swanlab_enabled: false # Disable SwanLab logging
+  monitor_gpus: true  # If true, will monitor GPU usage and log to wandb and/or tensorboard
+  wandb:
+    project: "nemo-rl"
+    name: "sft-qwen2.5-math-7b-megatron"
+  tensorboard:
+    log_dir: "tb_logs-sft-qwen2.5-math-7b-megatron"
+  mlflow:
+    experiment_name: "sft-dev"
+    run_name: "sft-qwen2.5-math-7b-megatron"
+  gpu_monitoring:
+    collection_interval: 10  # How often to collect GPU usage metrics (in seconds)
+    flush_interval: 10  # How often to flush GPU usage metrics to the loggers (in seconds)
+
+cluster:
+  gpus_per_node: 8
+  num_nodes: 2
diff --git a/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh b/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh
new file mode 100644
index 0000000000..e782ff3986
--- /dev/null
+++ b/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+
+# TODO: this config can crash on OOM
+# https://github.com/NVIDIA-NeMo/RL/issues/263
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=2
+STEPS_PER_RUN=80  # step_time ~ 29sec
+MAX_STEPS=80
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=30
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_sft.py \
+    --config $CONFIG_PATH \
+    sft.max_num_steps=$MAX_STEPS \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'data["train/loss"]["80"] < 0.301' \
+        'data["validation/loss"]["80"] < 0.304'
+fi
diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt
index b7f2410785..1f828fb85b 100644
--- a/tests/test_suites/nightly.txt
+++ b/tests/test_suites/nightly.txt
@@ -66,6 +66,8 @@ tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh
 tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh
 # sequence packing
 tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh
+# validate TP/DP
+tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh
 
 #######
 # DPO #

From 9f99d5315fe6377cfdc5b0a5ab6567f5e69132ac Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Tue, 28 Oct 2025 09:14:37 -0700
Subject: [PATCH 08/17] address comment

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 nemo_rl/models/policy/megatron_policy_worker.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py
index daf5f0acd3..8406059c48 100644
--- a/nemo_rl/models/policy/megatron_policy_worker.py
+++ b/nemo_rl/models/policy/megatron_policy_worker.py
@@ -667,8 +667,8 @@ def __init__(
         model_cfg.perform_initialization = False
 
         assert (
-            "aux_loss" not in self.model_cfg.moe_router_load_balancing_type
-            or self.model_cfg.moe_aux_loss_coeff == 0
+            "aux_loss" not in model_cfg.moe_router_load_balancing_type
+            or model_cfg.moe_aux_loss_coeff == 0
         ), (
             "MoE aux loss is currently not supported due to a known but in Megatron-LM. See ## TODO: link to GH issue"
         )

From 3007940beccf4c5f407ffe36ca39e1b60a0ce13c Mon Sep 17 00:00:00 2001
From: Anna Shors <ashors@nvidia.com>
Date: Tue, 28 Oct 2025 10:04:12 -0700
Subject: [PATCH 09/17] improve unit test

Signed-off-by: Anna Shors <ashors@nvidia.com>
---
 .../models/policy/megatron_policy_worker.py   |  2 +-
 .../models/policy/test_megatron_worker.py     | 34 ++++++-------------
 2 files changed, 11 insertions(+), 25 deletions(-)

diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py
index 5ba521d907..afd8da4ccb 100644
--- a/nemo_rl/models/policy/megatron_policy_worker.py
+++ b/nemo_rl/models/policy/megatron_policy_worker.py
@@ -670,7 +670,7 @@ def __init__(
         ## perform_initialization = True is a workaround to ensure the correct tensor parallel attributes are set
         ## on the TP-sharded parameters.
         model_cfg.calculate_per_token_loss = True
-        model_cfg.perform_initialization = False
+        model_cfg.perform_initialization = True
 
         assert (
             "aux_loss" not in model_cfg.moe_router_load_balancing_type
diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py
index bfc9557c00..25074bdc3b 100644
--- a/tests/unit/models/policy/test_megatron_worker.py
+++ b/tests/unit/models/policy/test_megatron_worker.py
@@ -2354,7 +2354,7 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_
 
             # Extract metrics
             loss_tensor = results["loss"]
-            all_metrics = results["all_mb_metrics"]
+            grad_norm = results["grad_norm"]
 
             # Verify loss is valid
             assert not torch.isnan(loss_tensor).any(), (
@@ -2364,12 +2364,6 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_
                 f"Loss should not be Inf for {desc}"
             )
 
-            # Extract gradient norm
-            assert "grad_norm" in all_metrics, (
-                f"grad_norm should be in metrics for {desc}"
-            )
-            grad_norm = all_metrics["grad_norm"]
-
             # Store results for comparison
             grad_norms[desc] = grad_norm
             losses[desc] = loss_tensor.cpu().numpy()
@@ -2399,21 +2393,10 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_
             )
 
             if tp > 1:
+                tp_sharded_names = [item["name"] for item in tp_info["tp_params"]]
                 # When tensor parallelism is enabled, we should have some TP parameters
-                assert len(tp_info["tp_params"]) > 0, (
-                    f"Expected tensor parallel parameters when TP={tp}, but found none"
-                )
-            else:
-                # When tensor parallelism is disabled, no parameters should have TP attributes
-                if len(tp_info["tp_params"]) > 0:
-                    print(
-                        f"WARNING: Found {len(tp_info['tp_params'])} TP parameters when TP=1:"
-                    )
-                    for tp_param in tp_info["tp_params"][:3]:  # Show first 3
-                        print(f"  - {tp_param['name']}")
-
-                print(
-                    f"✓ {desc} - Model configured for TP={tp} (no tensor parallel sharding expected)"
+                assert "module.embedding.word_embeddings.weight" in tp_sharded_names, (
+                    f"Expected module.embedding.word_embeddings.weight to be TP-sharded when TP={tp}"
                 )
 
         finally:
@@ -2425,7 +2408,7 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_
 
     # Get reference values from DP2 configuration
     # NOTE: even if TP2 config passes these tests, it doesn't necessarily imply
-    # there are no bugs. Sometimes bugs with grad norm are hard to catch.
+    # there are no bugs. That's why we also check that TP attributes are set correctly above
     reference_config = "DP1TP1"
     reference_grad_norm = grad_norms[reference_config]
     reference_loss = losses[reference_config]
@@ -2434,6 +2417,9 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_
         if config_name == reference_config:
             continue
 
+        if not isinstance(grad_norm, list):
+            grad_norm = [grad_norm]
+
         print(f"\nComparing {config_name} with {reference_config}:")
         print(f"  {reference_config} grad norm: {reference_grad_norm}")
         print(f"  {config_name} grad norm: {grad_norm}")
@@ -2452,13 +2438,13 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_
                 grad_diff = abs(gn - ref_gn)
                 relative_diff = grad_diff / (ref_gn + 1e-8)
                 print(
-                    f"    Microbatch {i}: {ref_gn} vs {gn}, diff={grad_diff:.6f}, rel_diff={relative_diff:.6f}"
+                    f"    Microbatch {i}: {ref_gn} vs {gn}, diff={grad_diff.item():.6f}, rel_diff={relative_diff.item():.6f}"
                 )
 
                 # Allow small differences due to floating point precision and parallelization
                 assert relative_diff < 0.01 or grad_diff < 1e-6, (
                     f"Gradient norm difference too large for microbatch {i}: "
-                    f"{ref_gn} vs {gn} (diff={grad_diff:.6f}, rel_diff={relative_diff:.6f})"
+                    f"{ref_gn} vs {gn} (diff={grad_diff.item():.6f}, rel_diff={relative_diff.item():.6f})"
                 )
 
         # Compare losses (should also be identical for same computation)

From ba86f3a7939123b346450613681708804312d7ac Mon Sep 17 00:00:00 2001
From: Anna Shors <ashors@nvidia.com>
Date: Tue, 28 Oct 2025 10:08:28 -0700
Subject: [PATCH 10/17] minimize config

Signed-off-by: Anna Shors <ashors@nvidia.com>
---
 .../llm/sft-qwen2.5-math-7b-megatron.yaml     | 135 +++---------------
 1 file changed, 22 insertions(+), 113 deletions(-)

diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
index 95586bf3fb..25627fe080 100644
--- a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
@@ -1,149 +1,58 @@
-# SFT Algorithm Configuration
 defaults: ../../sft.yaml
 sft:
-  ## total number of steps to train will equal
-  ## min((max_num_epochs * len(train_dataloader)), max_num_steps)
-  max_num_epochs: 1
   max_num_steps: 80
-
-  val_period: 10
-  val_batches: 8
-  val_global_batch_size: 32
-  val_micro_batch_size: 1
-  val_at_start: true
-  seed: 42
-
 checkpointing:
   enabled: false
-  checkpoint_dir: "results/sft"
-  metric_name: "val_loss" ## set to null to save most recent k checkpoints
-  higher_is_better: false
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
-
 policy:
-  model_name: "Qwen/Qwen2.5-Math-7B"
-  tokenizer:
-    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
+  model_name: Qwen/Qwen2.5-Math-7B
   train_global_batch_size: 512
-  train_micro_batch_size: 1
-  generation_batch_size: 32 # Only used when generating using HF backend
-  logprob_batch_size: 1 #4
-  max_total_sequence_length: 16384 # max_prompt_length + max_response_length
-  precision: "bfloat16"
-
+  generation_batch_size: 32
+  logprob_batch_size: 1
+  max_total_sequence_length: 16384
   dtensor_cfg:
     enabled: false
-  
   megatron_cfg:
     enabled: true
-    empty_unused_memory_level: 1
-    activation_checkpointing: false
     tensor_model_parallel_size: 4
-    pipeline_model_parallel_size: 1
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
     context_parallel_size: 2
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
-
     sequence_parallel: true
     freeze_moe_router: true
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "seq_aux_loss" # causes logprob error divergence for grpo
+    moe_router_dtype: fp64
+    moe_router_load_balancing_type: seq_aux_loss
     moe_aux_loss_coeff: 0.01
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    #gives ~20% training perf speedup with sequence packing
-    apply_rope_fusion: True
-    moe_permute_fusion: True
-
+    moe_router_bias_update_rate: 0.0
+    moe_permute_fusion: true
     optimizer:
-      optimizer: "adam"
-      lr: 1.0e-6
-      min_lr: 1.0e-6
-      weight_decay: 0.1
+      lr: 1.0e-06
+      min_lr: 1.0e-06
       bf16: true
-      fp16: false
-      params_dtype: "float32"
-
-      #adam
-      adam_beta1: 0.9
       adam_beta2: 0.999
-      adam_eps: 1e-8
-
-      #sgd
-      sgd_momentum: 0.9
-
-      #distributed optimizer
-      use_distributed_optimizer: False #true
-      use_precision_aware_optimizer: False #true
-
-      clip_grad: ${policy.max_grad_norm}
-
+      adam_eps: 1.0e-08
+      use_distributed_optimizer: false
+      use_precision_aware_optimizer: false
     scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
       lr_decay_iters: null
       lr_warmup_iters: 10
-      lr_warmup_init: 0.00000000001
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      use_custom_fsdp: false
-      data_parallel_sharding_strategy: "optim_grads_params"
-
-  dynamic_batching:
-    enabled: false
-
+      lr_warmup_init: 1.0e-11
   sequence_packing:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+    enabled: true
     logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
-
-  # makes the training sequence length divisible by the tensor parallel size
-  # this is useful for sequence parallel training
   make_sequence_length_divisible_by: 32
-  max_grad_norm: 1.0
-
 data:
-  max_input_seq_length: ${policy.max_total_sequence_length}
-  dataset_name: "openmathinstruct2"
+  dataset_name: openmathinstruct2
   prompt_file: examples/prompts/math.txt
-  split: "train_1M"
-  add_bos: true
-  add_eos: true
+  split: train_1M
   add_generation_prompt: true
-  output_key: 'generated_solution'
-  shuffle: true
+  output_key: generated_solution
   num_workers: 8
-
 logger:
-  log_dir: "logs"  # Base directory for all logs
-  wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running
-  tensorboard_enabled: true
-  mlflow_enabled: false
-  swanlab_enabled: false # Disable SwanLab logging
-  monitor_gpus: true  # If true, will monitor GPU usage and log to wandb and/or tensorboard
   wandb:
-    project: "nemo-rl"
-    name: "sft-qwen2.5-math-7b-megatron"
+    project: nemo-rl
+    name: sft-qwen2.5-math-7b-megatron
   tensorboard:
-    log_dir: "tb_logs-sft-qwen2.5-math-7b-megatron"
+    log_dir: tb_logs-sft-qwen2.5-math-7b-megatron
   mlflow:
-    experiment_name: "sft-dev"
-    run_name: "sft-qwen2.5-math-7b-megatron"
-  gpu_monitoring:
-    collection_interval: 10  # How often to collect GPU usage metrics (in seconds)
-    flush_interval: 10  # How often to flush GPU usage metrics to the loggers (in seconds)
-
+    run_name: sft-qwen2.5-math-7b-megatron
 cluster:
   gpus_per_node: 8
   num_nodes: 2

From 37d57d62d0e8d3ca0e87ed54f524b4ab36dc30fd Mon Sep 17 00:00:00 2001
From: Anna Shors <ashors@nvidia.com>
Date: Tue, 28 Oct 2025 10:09:29 -0700
Subject: [PATCH 11/17] fix permissions

Signed-off-by: Anna Shors <ashors@nvidia.com>
---
 tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh

diff --git a/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh b/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh
old mode 100644
new mode 100755

From 7a657971868473c812cb1d9383351cae70885a3f Mon Sep 17 00:00:00 2001
From: Anna Shors <ashors@nvidia.com>
Date: Tue, 28 Oct 2025 16:46:26 -0700
Subject: [PATCH 12/17] fix convergence test

Signed-off-by: Anna Shors <ashors@nvidia.com>
---
 examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml | 1 +
 tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh          | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
index 25627fe080..d7a08770b6 100644
--- a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
@@ -1,6 +1,7 @@
 defaults: ../../sft.yaml
 sft:
   max_num_steps: 80
+  seed: 42
 checkpointing:
   enabled: false
 policy:
diff --git a/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh b/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh
index e782ff3986..897f4fbb60 100755
--- a/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh
+++ b/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh
@@ -28,6 +28,7 @@ uv run examples/run_sft.py \
     logger.tensorboard_enabled=True \
     checkpointing.enabled=True \
     checkpointing.checkpoint_dir=$CKPT_DIR \
+    ~policy.tokenizer.chat_template \
     $@ \
     2>&1 | tee $RUN_LOG
 
@@ -38,5 +39,5 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
         'data["train/loss"]["80"] < 0.301' \
-        'data["validation/loss"]["80"] < 0.304'
+        'data["validation/val_loss"]["80"] < 0.304'
 fi

From 198027abc4ebce05fbf8006553f7c0c1d08721a1 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Tue, 28 Oct 2025 17:26:24 -0700
Subject: [PATCH 13/17] minimize config

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
index d7a08770b6..25627fe080 100644
--- a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
@@ -1,7 +1,6 @@
 defaults: ../../sft.yaml
 sft:
   max_num_steps: 80
-  seed: 42
 checkpointing:
   enabled: false
 policy:

From 64594d08166eea4cc436ea218453ee21641e8be8 Mon Sep 17 00:00:00 2001
From: Anna Shors <ashors@nvidia.com>
Date: Wed, 29 Oct 2025 15:50:29 -0700
Subject: [PATCH 14/17] fix unit test

Signed-off-by: Anna Shors <ashors@nvidia.com>
---
 .../configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml    | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
index 25627fe080..151319df3a 100644
--- a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
@@ -6,8 +6,6 @@ checkpointing:
 policy:
   model_name: Qwen/Qwen2.5-Math-7B
   train_global_batch_size: 512
-  generation_batch_size: 32
-  logprob_batch_size: 1
   max_total_sequence_length: 16384
   dtensor_cfg:
     enabled: false
@@ -18,8 +16,6 @@ policy:
     sequence_parallel: true
     freeze_moe_router: true
     moe_router_dtype: fp64
-    moe_router_load_balancing_type: seq_aux_loss
-    moe_aux_loss_coeff: 0.01
     moe_router_bias_update_rate: 0.0
     moe_permute_fusion: true
     optimizer:
@@ -36,7 +32,6 @@ policy:
       lr_warmup_init: 1.0e-11
   sequence_packing:
     enabled: true
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
   make_sequence_length_divisible_by: 32
 data:
   dataset_name: openmathinstruct2

From f2cb2ba5df6156f329e440340fe7f45dc3187282 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Thu, 30 Oct 2025 09:12:05 -0700
Subject: [PATCH 15/17] remove more occurrences of average_in_collective

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 examples/configs/distillation_math.yaml              | 1 -
 examples/configs/distillation_math_megatron.yaml     | 1 -
 nemo_rl/models/policy/__init__.py                    | 1 -
 tests/unit/models/generation/test_vllm_generation.py | 1 -
 tests/unit/models/policy/test_megatron_worker.py     | 1 -
 tools/refit_verifier.py                              | 1 -
 6 files changed, 6 deletions(-)

diff --git a/examples/configs/distillation_math.yaml b/examples/configs/distillation_math.yaml
index e0b8bcf283..ae9aff5e10 100644
--- a/examples/configs/distillation_math.yaml
+++ b/examples/configs/distillation_math.yaml
@@ -145,7 +145,6 @@ policy: &POLICY_BASE
             grad_reduce_in_fp32: false
             overlap_grad_reduce: true
             overlap_param_gather: true
-            average_in_collective: true
             use_custom_fsdp: false
             data_parallel_sharding_strategy: "optim_grads_params"
 
diff --git a/examples/configs/distillation_math_megatron.yaml b/examples/configs/distillation_math_megatron.yaml
index 3df59eba84..b005f13c29 100644
--- a/examples/configs/distillation_math_megatron.yaml
+++ b/examples/configs/distillation_math_megatron.yaml
@@ -99,7 +99,6 @@ policy: &POLICY_BASE
             grad_reduce_in_fp32: false
             overlap_grad_reduce: true
             overlap_param_gather: true
-            average_in_collective: true
             use_custom_fsdp: false
             data_parallel_sharding_strategy: "optim_grads_params"
 
diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py
index fa988ae1bd..ea21728611 100644
--- a/nemo_rl/models/policy/__init__.py
+++ b/nemo_rl/models/policy/__init__.py
@@ -82,7 +82,6 @@ class MegatronDDPConfig(TypedDict):
     grad_reduce_in_fp32: bool
     overlap_grad_reduce: bool
     overlap_param_gather: bool
-    average_in_collective: bool
     use_custom_fsdp: bool
     data_parallel_sharding_strategy: str
 
diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py
index d6bab8b2c0..87dbd6abf2 100644
--- a/tests/unit/models/generation/test_vllm_generation.py
+++ b/tests/unit/models/generation/test_vllm_generation.py
@@ -209,7 +209,6 @@ def get_basic_megatron_test_config(
                 "grad_reduce_in_fp32": False,
                 "overlap_grad_reduce": True,
                 "overlap_param_gather": False,
-                "average_in_collective": True,
                 "data_parallel_sharding_strategy": "optim_grads_params",
             },
         },
diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py
index 25074bdc3b..d79b0d2453 100644
--- a/tests/unit/models/policy/test_megatron_worker.py
+++ b/tests/unit/models/policy/test_megatron_worker.py
@@ -151,7 +151,6 @@ def create_megatron_test_config(
                 "grad_reduce_in_fp32": False,
                 "overlap_grad_reduce": True,
                 "overlap_param_gather": False,
-                "average_in_collective": True,
                 "data_parallel_sharding_strategy": "optim_grads_params",
             },
             "fp8_cfg": {
diff --git a/tools/refit_verifier.py b/tools/refit_verifier.py
index 475bfd1215..ee21370797 100644
--- a/tools/refit_verifier.py
+++ b/tools/refit_verifier.py
@@ -249,7 +249,6 @@ def setup_configs(args, tokenizer):
                 "grad_reduce_in_fp32": False,
                 "overlap_grad_reduce": False,
                 "overlap_param_gather": False,
-                "average_in_collective": False,
                 "use_custom_fsdp": False,
                 "data_parallel_sharding_strategy": "optim_grads_params",
             },

From cc8b6d74f15de37910c295c0b9f8bc36576efa58 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Thu, 30 Oct 2025 11:23:39 -0700
Subject: [PATCH 16/17] link to GH issue

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 nemo_rl/models/policy/megatron_policy_worker.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py
index afd8da4ccb..dffd2e4d07 100644
--- a/nemo_rl/models/policy/megatron_policy_worker.py
+++ b/nemo_rl/models/policy/megatron_policy_worker.py
@@ -676,7 +676,8 @@ def __init__(
             "aux_loss" not in model_cfg.moe_router_load_balancing_type
             or model_cfg.moe_aux_loss_coeff == 0
         ), (
-            "MoE aux loss is currently not supported due to a known but in Megatron-LM. See ## TODO: link to GH issue"
+            "MoE aux loss is currently not supported due to a known bug in Megatron-LM. "
+            "See https://github.com/NVIDIA/Megatron-LM/issues/1984 for more details."
         )
 
         self.megatron_cfg = ConfigContainer(

From f12e113432408035c803843f0d50cd6d482798f5 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Thu, 30 Oct 2025 18:09:50 -0700
Subject: [PATCH 17/17] fix test failure

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 ...h-7b-megatron.yaml => sft-qwen2.5-math7b-2n8g-megatron.yaml} | 0
 nemo_rl/models/policy/__init__.py                               | 2 +-
 ...-math-7b-megatron.sh => sft-qwen2.5-math7b-2n8g-megatron.sh} | 0
 tests/test_suites/nightly.txt                                   | 2 +-
 4 files changed, 2 insertions(+), 2 deletions(-)
 rename examples/configs/recipes/llm/{sft-qwen2.5-math-7b-megatron.yaml => sft-qwen2.5-math7b-2n8g-megatron.yaml} (100%)
 rename tests/test_suites/llm/{sft-qwen2.5-math-7b-megatron.sh => sft-qwen2.5-math7b-2n8g-megatron.sh} (100%)

diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml
similarity index 100%
rename from examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml
rename to examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml
diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py
index da043d9f94..bf8599a595 100644
--- a/nemo_rl/models/policy/__init__.py
+++ b/nemo_rl/models/policy/__init__.py
@@ -81,7 +81,7 @@ class MegatronSchedulerConfig(TypedDict):
     end_weight_decay: float
     weight_decay_incr_style: str
     lr_decay_style: str
-    lr_decay_iters: NotRequired[int]
+    lr_decay_iters: NotRequired[int | None]
     lr_warmup_iters: int
     lr_warmup_init: float
 
diff --git a/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh b/tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh
similarity index 100%
rename from tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh
rename to tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh
diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt
index c6e779c3de..91c24aada9 100644
--- a/tests/test_suites/nightly.txt
+++ b/tests/test_suites/nightly.txt
@@ -68,7 +68,7 @@ tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh
 # sequence packing
 tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh
 # validate TP/DP
-tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh
+tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh
 
 #######
 # DPO #