From ce6fd8abf1553f3ba46af09953e405dc9f44863f Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 16 Oct 2025 10:12:37 -0700 Subject: [PATCH 01/17] enable megatron calculate_per_token_loss Signed-off-by: ashors1 --- examples/configs/dpo.yaml | 1 - examples/configs/grpo_math_1B.yaml | 1 - examples/configs/grpo_math_1B_megatron.yaml | 1 - examples/configs/rm.yaml | 1 - examples/configs/sft.yaml | 1 - examples/configs/sft_openmathinstruct2_megatron.yaml | 1 - examples/configs/vlm_grpo_3B.yaml | 1 - examples/configs/vlm_grpo_3B_megatron.yaml | 1 - nemo_rl/models/policy/megatron_policy_worker.py | 8 +++++--- 9 files changed, 5 insertions(+), 11 deletions(-) diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml index 6af1405533..a450e9e397 100755 --- a/examples/configs/dpo.yaml +++ b/examples/configs/dpo.yaml @@ -153,7 +153,6 @@ policy: grad_reduce_in_fp32: false overlap_grad_reduce: true overlap_param_gather: true - average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" data: diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 2c3642b4c8..c3b8ed22dd 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -130,7 +130,6 @@ policy: grad_reduce_in_fp32: false overlap_grad_reduce: true overlap_param_gather: true - average_in_collective: true use_custom_fsdp: false data_parallel_sharding_strategy: "optim_grads_params" diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml index 4246fee777..bb6b2ab5f1 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -133,7 +133,6 @@ policy: grad_reduce_in_fp32: false overlap_grad_reduce: true overlap_param_gather: true - average_in_collective: true use_custom_fsdp: false data_parallel_sharding_strategy: "optim_grads_params" diff --git a/examples/configs/rm.yaml b/examples/configs/rm.yaml index cdbf900fb4..d5a026ec98 100644 --- a/examples/configs/rm.yaml +++ b/examples/configs/rm.yaml @@ -123,7 +123,6 @@ policy: grad_reduce_in_fp32: false overlap_grad_reduce: true overlap_param_gather: false - average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 62873bc9d4..dbc2270923 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -134,7 +134,6 @@ policy: grad_reduce_in_fp32: false overlap_grad_reduce: true overlap_param_gather: true - average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" use_custom_fsdp: false diff --git a/examples/configs/sft_openmathinstruct2_megatron.yaml b/examples/configs/sft_openmathinstruct2_megatron.yaml index 9621bdfe4b..3a42c4cf2e 100644 --- a/examples/configs/sft_openmathinstruct2_megatron.yaml +++ b/examples/configs/sft_openmathinstruct2_megatron.yaml @@ -35,7 +35,6 @@ policy: activation_checkpointing: false context_parallel_size: 1 distributed_data_parallel_config: - average_in_collective: true data_parallel_sharding_strategy: optim_grads_params grad_reduce_in_fp32: true overlap_grad_reduce: true diff --git a/examples/configs/vlm_grpo_3B.yaml b/examples/configs/vlm_grpo_3B.yaml index f3172a3131..e25856cad8 100644 --- a/examples/configs/vlm_grpo_3B.yaml +++ b/examples/configs/vlm_grpo_3B.yaml @@ -117,7 +117,6 @@ policy: grad_reduce_in_fp32: false overlap_grad_reduce: true overlap_param_gather: true - average_in_collective: true use_custom_fsdp: false data_parallel_sharding_strategy: "optim_grads_params" diff --git a/examples/configs/vlm_grpo_3B_megatron.yaml b/examples/configs/vlm_grpo_3B_megatron.yaml index 2bf0e86184..9b5714f087 100644 --- a/examples/configs/vlm_grpo_3B_megatron.yaml +++ b/examples/configs/vlm_grpo_3B_megatron.yaml @@ -146,7 +146,6 @@ policy: grad_reduce_in_fp32: false overlap_grad_reduce: false overlap_param_gather: true - average_in_collective: true use_custom_fsdp: false data_parallel_sharding_strategy: optim_grads_params data: diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index 2d84509879..6a9f759fe3 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -15,6 +15,7 @@ import math import os import time +from tokenize import triple_quoted import warnings from collections import defaultdict from contextlib import AbstractContextManager, contextmanager, nullcontext @@ -658,6 +659,9 @@ def __init__( "https://github.com/NVIDIA-NeMo/RL/blob/bccbc377705a81a1f4b3c31ad9767bcc15f735a8/nemo_rl/algorithms/sft.py#L175-L179." ) + ## TODO: make sure this works with sequence-level losses as well + model_cfg.calculate_per_token_loss = True + self.megatron_cfg = ConfigContainer( model=model_cfg, checkpoint=checkpoint_config, @@ -683,9 +687,7 @@ def __init__( overlap_param_gather=self.cfg["megatron_cfg"][ "distributed_data_parallel_config" ]["overlap_param_gather"], - average_in_collective=self.cfg["megatron_cfg"][ - "distributed_data_parallel_config" - ]["average_in_collective"], + average_in_collective=False, # average in collective is not supported with per-token loss use_distributed_optimizer=self.cfg["megatron_cfg"]["optimizer"][ "use_distributed_optimizer" ], From bdcee3cc836b78c0e6b69c9202bd3b3bb0369d6e Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 17 Oct 2025 16:43:24 -0700 Subject: [PATCH 02/17] remove unneeded import Signed-off-by: ashors1 --- nemo_rl/models/policy/megatron_policy_worker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index 6a9f759fe3..56c6c02ccf 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -15,7 +15,6 @@ import math import os import time -from tokenize import triple_quoted import warnings from collections import defaultdict from contextlib import AbstractContextManager, contextmanager, nullcontext From 408900a4b33ef7ab16275692f2680bed6c4cbe22 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 23 Oct 2025 15:27:54 -0700 Subject: [PATCH 03/17] TP bug fix and add WIP unit test Signed-off-by: ashors1 --- .../models/policy/megatron_policy_worker.py | 15 +- .../models/policy/test_megatron_worker.py | 155 ++++++++++++++++++ 2 files changed, 168 insertions(+), 2 deletions(-) diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index 56c6c02ccf..ef61ac76fc 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -658,8 +658,13 @@ def __init__( "https://github.com/NVIDIA-NeMo/RL/blob/bccbc377705a81a1f4b3c31ad9767bcc15f735a8/nemo_rl/algorithms/sft.py#L175-L179." ) - ## TODO: make sure this works with sequence-level losses as well + ## These settings are required for correct gradient computations in mcore + ## when calculate_per_token_loss is True, there is no scaling of the gradient in mcore, + ## so we handle the scaling in nemo-rl. + ## perform_initialization = True is a workaround to ensure the correct tensor parallel attributes are set + ## on the TP-sharded parameters. model_cfg.calculate_per_token_loss = True + model_cfg.perform_initialization = True self.megatron_cfg = ConfigContainer( model=model_cfg, @@ -686,7 +691,9 @@ def __init__( overlap_param_gather=self.cfg["megatron_cfg"][ "distributed_data_parallel_config" ]["overlap_param_gather"], - average_in_collective=False, # average in collective is not supported with per-token loss + # we need to set average_in_collective=False with calculate_per_token_loss=True. + # otherwise, mcore throws an assertion error. + average_in_collective=False, use_distributed_optimizer=self.cfg["megatron_cfg"]["optimizer"][ "use_distributed_optimizer" ], @@ -704,6 +711,10 @@ def __init__( ), ) self.megatron_cfg.validate() + assert ( + "aux_loss" not in self.megatron_cfg.moe_router_load_balancing_type or + self.megatron_cfg.moe_aux_loss_coeff == 0 + ), "MoE aux loss is currently not supported due to a known but in Megatron-LM. See ## TODO: link to GH issue" ( self.mcore_state, self.model, diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py index 48b2c01dc8..c3a401c634 100644 --- a/tests/unit/models/policy/test_megatron_worker.py +++ b/tests/unit/models/policy/test_megatron_worker.py @@ -16,6 +16,7 @@ import time from typing import Optional +import numpy as np import pytest import torch @@ -1824,6 +1825,160 @@ def test_megatron_context_parallel_training_agreement(tiny_llama_model_path): ) +@pytest.mark.hf_gated +@pytest.mark.timeout(300) +def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_path): + """Test that gradient norms are consistent across different TP and DP configurations. + + This test validates that the same model produces identical gradient norms + regardless of tensor parallelism (TP) and data parallelism (DP) settings. + """ + batch_size = 8 + seq_len = 64 + vocab_size = 32000 + + # Create reproducible test data + torch.manual_seed(42) + input_ids = torch.randint(0, vocab_size, (batch_size, seq_len)) + attention_mask = torch.ones(batch_size, seq_len) + input_lengths = attention_mask.sum(dim=1).to(torch.int32) + labels = torch.randint(0, vocab_size, (batch_size, seq_len)) + + data = BatchedDataDict( + { + "input_ids": input_ids, + "input_lengths": input_lengths, + "attention_mask": attention_mask, + "labels": labels, + "sample_mask": torch.ones(batch_size), + } + ) + + # Test configurations: (num_gpus, tp, pp, description) + test_configs = [ + (1, 1, 1, "DP1TP1") + (2, 1, 1, "DP2"), # Data parallel with 2 GPUs + (2, 2, 1, "TP2"), # Tensor parallel with 2 GPUs + ] + + grad_norms = {} + losses = {} + + for num_gpus, tp, pp, desc in test_configs: + print(f"\n=== Testing {desc} configuration (GPUs={num_gpus}, TP={tp}, PP={pp}) ===") + + cluster = RayVirtualCluster( + name=f"test-grad-norm-{desc.lower()}", + bundle_ct_per_node_list=[num_gpus], + use_gpus=True, + num_gpus_per_node=num_gpus, + max_colocated_worker_groups=1, + ) + + config = create_megatron_test_config( + model_name=tiny_llama_model_path, + tp=tp, + pp=pp, + precision="float32", # Use float32 for more stable gradient comparisons + ) + + tokenizer = get_tokenizer(config["tokenizer"]) + config["generation"] = configure_generation_config( + config["generation"], tokenizer + ) + + policy = Policy( + cluster=cluster, + config=config, + tokenizer=tokenizer, + init_reference_model=False, + ) + + # Use SimpleLoss for consistent comparison + loss_fn = SimpleLoss() + + try: + # Prepare for training + policy.prepare_for_training() + + # Perform one forward/backward step + print(f"Performing forward/backward pass for {desc}...") + results = policy.train(data, loss_fn) + + # Extract metrics + loss_tensor = results["loss"] + all_metrics = results["all_mb_metrics"] + + # Verify loss is valid + assert not torch.isnan(loss_tensor).any(), f"Loss should not be NaN for {desc}" + assert not torch.isinf(loss_tensor).any(), f"Loss should not be Inf for {desc}" + + # Extract gradient norm + assert "grad_norm" in all_metrics, f"grad_norm should be in metrics for {desc}" + grad_norm = all_metrics["grad_norm"] + + # Store results for comparison + grad_norms[desc] = grad_norm + losses[desc] = loss_tensor.cpu().numpy() + + print(f"{desc} - Loss: {loss_tensor}") + print(f"{desc} - Grad norm: {grad_norm}") + + finally: + policy.shutdown() + cluster.shutdown() + + # Compare gradient norms across configurations + print("\n=== Comparing gradient norms across configurations ===") + + # Get reference values from DP2 configuration + reference_config = "DP1TP1" + reference_grad_norm = grad_norms[reference_config] + reference_loss = losses[reference_config] + + for config_name, grad_norm in grad_norms.items(): + if config_name == reference_config: + continue + + print(f"\nComparing {config_name} with {reference_config}:") + print(f" {reference_config} grad norm: {reference_grad_norm}") + print(f" {config_name} grad norm: {grad_norm}") + + # Compare gradient norms + if not isinstance(grad_norm, list): + grad_norm = [grad_norm] + reference_grad_norm = [reference_grad_norm] + if isinstance(grad_norm, list) and isinstance(reference_grad_norm, list): + # Handle case where grad_norm is a list (multiple microbatches) + assert len(grad_norm) == len(reference_grad_norm), ( + f"Number of gradient norm values should match: {len(grad_norm)} vs {len(reference_grad_norm)}" + ) + + for i, (gn, ref_gn) in enumerate(zip(grad_norm, reference_grad_norm)): + grad_diff = abs(gn - ref_gn) + relative_diff = grad_diff / (ref_gn + 1e-8) + print(f" Microbatch {i}: {ref_gn} vs {gn}, diff={grad_diff:.6f}, rel_diff={relative_diff:.6f}") + + # Allow small differences due to floating point precision and parallelization + assert relative_diff < 0.01 or grad_diff < 1e-6, ( + f"Gradient norm difference too large for microbatch {i}: " + f"{ref_gn} vs {gn} (diff={grad_diff:.6f}, rel_diff={relative_diff:.6f})" + ) + + # Compare losses (should also be identical for same computation) + loss_diff = np.max(np.abs(reference_loss - losses[config_name])) + relative_loss_diff = loss_diff / (np.mean(np.abs(reference_loss)) + 1e-8) + print(f" Loss diff: {loss_diff:.6f}, relative loss diff: {relative_loss_diff:.6f}") + + # Allow small differences in loss as well + assert relative_loss_diff < 0.01 or loss_diff < 1e-6, ( + f"Loss difference too large: " + f"max diff={loss_diff:.6f}, rel_diff={relative_loss_diff:.6f}" + ) + + print("\n✓ SUCCESS: Gradient norms are consistent across all parallelization configurations!") + + @pytest.mark.hf_gated @pytest.mark.timeout(300) def test_megatron_policy_flops_range_check(tiny_llama_model_path): From d8348d1f63e92ba111710f6628d39ca9ba14cbec Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Fri, 24 Oct 2025 16:00:07 -0700 Subject: [PATCH 04/17] fixes Signed-off-by: Anna Shors --- nemo_rl/models/policy/megatron_policy_worker.py | 11 ++++++----- tests/unit/models/policy/test_megatron_worker.py | 7 +++++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index ef61ac76fc..9e4e9952a4 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -664,7 +664,12 @@ def __init__( ## perform_initialization = True is a workaround to ensure the correct tensor parallel attributes are set ## on the TP-sharded parameters. model_cfg.calculate_per_token_loss = True - model_cfg.perform_initialization = True + model_cfg.perform_initialization = False + + assert ( + "aux_loss" not in self.model_cfg.moe_router_load_balancing_type or + self.model_cfg.moe_aux_loss_coeff == 0 + ), "MoE aux loss is currently not supported due to a known but in Megatron-LM. See ## TODO: link to GH issue" self.megatron_cfg = ConfigContainer( model=model_cfg, @@ -711,10 +716,6 @@ def __init__( ), ) self.megatron_cfg.validate() - assert ( - "aux_loss" not in self.megatron_cfg.moe_router_load_balancing_type or - self.megatron_cfg.moe_aux_loss_coeff == 0 - ), "MoE aux loss is currently not supported due to a known but in Megatron-LM. See ## TODO: link to GH issue" ( self.mcore_state, self.model, diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py index c3a401c634..47fffec1ee 100644 --- a/tests/unit/models/policy/test_megatron_worker.py +++ b/tests/unit/models/policy/test_megatron_worker.py @@ -1851,12 +1851,13 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_ "attention_mask": attention_mask, "labels": labels, "sample_mask": torch.ones(batch_size), + "token_mask": torch.ones_like(input_ids), } ) # Test configurations: (num_gpus, tp, pp, description) test_configs = [ - (1, 1, 1, "DP1TP1") + (1, 1, 1, "DP1TP1"), (2, 1, 1, "DP2"), # Data parallel with 2 GPUs (2, 2, 1, "TP2"), # Tensor parallel with 2 GPUs ] @@ -1895,7 +1896,7 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_ ) # Use SimpleLoss for consistent comparison - loss_fn = SimpleLoss() + loss_fn = NLLLoss() try: # Prepare for training @@ -1932,6 +1933,8 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_ print("\n=== Comparing gradient norms across configurations ===") # Get reference values from DP2 configuration + # NOTE: even if TP2 config passes these tests, it doesn't necessarily imply + # there are no bugs. Sometimes bugs with grad norm are hard to catch. reference_config = "DP1TP1" reference_grad_norm = grad_norms[reference_config] reference_loss = losses[reference_config] From 9d0c41021ab570ab5e03120eea0479561ecf9ecf Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 24 Oct 2025 16:04:57 -0700 Subject: [PATCH 05/17] lint Signed-off-by: ashors1 --- .../models/policy/megatron_policy_worker.py | 8 +- .../models/policy/test_megatron_worker.py | 86 +++++++++++-------- 2 files changed, 55 insertions(+), 39 deletions(-) diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index 9e4e9952a4..3a1801fad6 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -667,9 +667,11 @@ def __init__( model_cfg.perform_initialization = False assert ( - "aux_loss" not in self.model_cfg.moe_router_load_balancing_type or - self.model_cfg.moe_aux_loss_coeff == 0 - ), "MoE aux loss is currently not supported due to a known but in Megatron-LM. See ## TODO: link to GH issue" + "aux_loss" not in self.model_cfg.moe_router_load_balancing_type + or self.model_cfg.moe_aux_loss_coeff == 0 + ), ( + "MoE aux loss is currently not supported due to a known but in Megatron-LM. See ## TODO: link to GH issue" + ) self.megatron_cfg = ConfigContainer( model=model_cfg, diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py index 47fffec1ee..1ad6a7052f 100644 --- a/tests/unit/models/policy/test_megatron_worker.py +++ b/tests/unit/models/policy/test_megatron_worker.py @@ -1829,21 +1829,21 @@ def test_megatron_context_parallel_training_agreement(tiny_llama_model_path): @pytest.mark.timeout(300) def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_path): """Test that gradient norms are consistent across different TP and DP configurations. - + This test validates that the same model produces identical gradient norms regardless of tensor parallelism (TP) and data parallelism (DP) settings. """ batch_size = 8 seq_len = 64 vocab_size = 32000 - + # Create reproducible test data torch.manual_seed(42) input_ids = torch.randint(0, vocab_size, (batch_size, seq_len)) attention_mask = torch.ones(batch_size, seq_len) input_lengths = attention_mask.sum(dim=1).to(torch.int32) labels = torch.randint(0, vocab_size, (batch_size, seq_len)) - + data = BatchedDataDict( { "input_ids": input_ids, @@ -1854,20 +1854,22 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_ "token_mask": torch.ones_like(input_ids), } ) - + # Test configurations: (num_gpus, tp, pp, description) test_configs = [ (1, 1, 1, "DP1TP1"), (2, 1, 1, "DP2"), # Data parallel with 2 GPUs (2, 2, 1, "TP2"), # Tensor parallel with 2 GPUs ] - + grad_norms = {} losses = {} - + for num_gpus, tp, pp, desc in test_configs: - print(f"\n=== Testing {desc} configuration (GPUs={num_gpus}, TP={tp}, PP={pp}) ===") - + print( + f"\n=== Testing {desc} configuration (GPUs={num_gpus}, TP={tp}, PP={pp}) ===" + ) + cluster = RayVirtualCluster( name=f"test-grad-norm-{desc.lower()}", bundle_ct_per_node_list=[num_gpus], @@ -1875,78 +1877,84 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_ num_gpus_per_node=num_gpus, max_colocated_worker_groups=1, ) - + config = create_megatron_test_config( model_name=tiny_llama_model_path, tp=tp, pp=pp, precision="float32", # Use float32 for more stable gradient comparisons ) - + tokenizer = get_tokenizer(config["tokenizer"]) config["generation"] = configure_generation_config( config["generation"], tokenizer ) - + policy = Policy( cluster=cluster, config=config, tokenizer=tokenizer, init_reference_model=False, ) - + # Use SimpleLoss for consistent comparison loss_fn = NLLLoss() - + try: # Prepare for training policy.prepare_for_training() - + # Perform one forward/backward step print(f"Performing forward/backward pass for {desc}...") results = policy.train(data, loss_fn) - + # Extract metrics loss_tensor = results["loss"] all_metrics = results["all_mb_metrics"] - + # Verify loss is valid - assert not torch.isnan(loss_tensor).any(), f"Loss should not be NaN for {desc}" - assert not torch.isinf(loss_tensor).any(), f"Loss should not be Inf for {desc}" - + assert not torch.isnan(loss_tensor).any(), ( + f"Loss should not be NaN for {desc}" + ) + assert not torch.isinf(loss_tensor).any(), ( + f"Loss should not be Inf for {desc}" + ) + # Extract gradient norm - assert "grad_norm" in all_metrics, f"grad_norm should be in metrics for {desc}" + assert "grad_norm" in all_metrics, ( + f"grad_norm should be in metrics for {desc}" + ) grad_norm = all_metrics["grad_norm"] - + # Store results for comparison grad_norms[desc] = grad_norm losses[desc] = loss_tensor.cpu().numpy() - + print(f"{desc} - Loss: {loss_tensor}") print(f"{desc} - Grad norm: {grad_norm}") - + finally: policy.shutdown() cluster.shutdown() - + # Compare gradient norms across configurations print("\n=== Comparing gradient norms across configurations ===") - + # Get reference values from DP2 configuration # NOTE: even if TP2 config passes these tests, it doesn't necessarily imply # there are no bugs. Sometimes bugs with grad norm are hard to catch. reference_config = "DP1TP1" reference_grad_norm = grad_norms[reference_config] reference_loss = losses[reference_config] - + for config_name, grad_norm in grad_norms.items(): if config_name == reference_config: continue - + print(f"\nComparing {config_name} with {reference_config}:") print(f" {reference_config} grad norm: {reference_grad_norm}") print(f" {config_name} grad norm: {grad_norm}") - + # Compare gradient norms if not isinstance(grad_norm, list): grad_norm = [grad_norm] @@ -1956,30 +1964,36 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_ assert len(grad_norm) == len(reference_grad_norm), ( f"Number of gradient norm values should match: {len(grad_norm)} vs {len(reference_grad_norm)}" ) - + for i, (gn, ref_gn) in enumerate(zip(grad_norm, reference_grad_norm)): grad_diff = abs(gn - ref_gn) relative_diff = grad_diff / (ref_gn + 1e-8) - print(f" Microbatch {i}: {ref_gn} vs {gn}, diff={grad_diff:.6f}, rel_diff={relative_diff:.6f}") - + print( + f" Microbatch {i}: {ref_gn} vs {gn}, diff={grad_diff:.6f}, rel_diff={relative_diff:.6f}" + ) + # Allow small differences due to floating point precision and parallelization assert relative_diff < 0.01 or grad_diff < 1e-6, ( f"Gradient norm difference too large for microbatch {i}: " f"{ref_gn} vs {gn} (diff={grad_diff:.6f}, rel_diff={relative_diff:.6f})" ) - + # Compare losses (should also be identical for same computation) loss_diff = np.max(np.abs(reference_loss - losses[config_name])) relative_loss_diff = loss_diff / (np.mean(np.abs(reference_loss)) + 1e-8) - print(f" Loss diff: {loss_diff:.6f}, relative loss diff: {relative_loss_diff:.6f}") - + print( + f" Loss diff: {loss_diff:.6f}, relative loss diff: {relative_loss_diff:.6f}" + ) + # Allow small differences in loss as well assert relative_loss_diff < 0.01 or loss_diff < 1e-6, ( f"Loss difference too large: " f"max diff={loss_diff:.6f}, rel_diff={relative_loss_diff:.6f}" ) - - print("\n✓ SUCCESS: Gradient norms are consistent across all parallelization configurations!") + + print( + "\n✓ SUCCESS: Gradient norms are consistent across all parallelization configurations!" + ) @pytest.mark.hf_gated From 72a5c73ced440cbd94b0d4e207573dd41ea588a3 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 28 Oct 2025 08:57:49 -0700 Subject: [PATCH 06/17] add tensor parallel attributes check Signed-off-by: ashors1 --- .../models/policy/megatron_policy_worker.py | 44 +++++++++++++++++++ .../models/policy/test_megatron_worker.py | 40 +++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index 3a1801fad6..daf5f0acd3 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -2034,3 +2034,47 @@ def start_gpu_profiling(self) -> None: def stop_gpu_profiling(self) -> None: """Stop GPU profiling.""" torch.cuda.profiler.stop() + + def check_tensor_parallel_attributes(self) -> dict[str, Any]: + """Check tensor parallel attributes on model parameters. + + Returns: + Dictionary containing information about tensor parallel parameters: + - tp_params: List of parameter names that have tensor_model_parallel=True + - non_tp_params: List of parameter names that have tensor_model_parallel=False + - total_params: Total number of parameters checked + - tp_size: Tensor parallel size from config + """ + tp_params = [] + non_tp_params = [] + total_params = 0 + + for name, param in self.model.named_parameters(): + total_params += 1 + tensor_model_parallel = getattr(param, "tensor_model_parallel", False) + + if tensor_model_parallel: + tp_params.append( + { + "name": name, + "tensor_model_parallel": tensor_model_parallel, + "partition_dim": getattr(param, "partition_dim", None), + "partition_stride": getattr(param, "partition_stride", None), + "shape": list(param.shape), + } + ) + else: + non_tp_params.append( + { + "name": name, + "tensor_model_parallel": tensor_model_parallel, + "shape": list(param.shape), + } + ) + + return { + "tp_params": tp_params, + "non_tp_params": non_tp_params, + "total_params": total_params, + "tp_size": self.megatron_cfg.model.tensor_model_parallel_size, + } diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py index 1ad6a7052f..0411726fbf 100644 --- a/tests/unit/models/policy/test_megatron_worker.py +++ b/tests/unit/models/policy/test_megatron_worker.py @@ -18,6 +18,7 @@ import numpy as np import pytest +import ray import torch from nemo_rl.algorithms.interfaces import LossFunction @@ -1933,6 +1934,45 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_ print(f"{desc} - Loss: {loss_tensor}") print(f"{desc} - Grad norm: {grad_norm}") + # Check tensor parallel attributes on model parameters + print(f"Checking tensor parallel attributes for {desc}...") + tp_check_futures = policy.worker_group.run_all_workers_single_data( + "check_tensor_parallel_attributes" + ) + tp_check_results = [ray.get(future) for future in tp_check_futures] + + # Analyze the first worker's results (all workers should have the same structure) + tp_info = tp_check_results[0] + + print(f"{desc} - TP size: {tp_info['tp_size']}") + print(f"{desc} - Total params: {tp_info['total_params']}") + print(f"{desc} - TP params: {len(tp_info['tp_params'])}") + print(f"{desc} - Non-TP params: {len(tp_info['non_tp_params'])}") + + # Validate tensor parallel attributes + expected_tp_size = tp + assert tp_info["tp_size"] == expected_tp_size, ( + f"Expected TP size {expected_tp_size}, got {tp_info['tp_size']}" + ) + + if tp > 1: + # When tensor parallelism is enabled, we should have some TP parameters + assert len(tp_info["tp_params"]) > 0, ( + f"Expected tensor parallel parameters when TP={tp}, but found none" + ) + else: + # When tensor parallelism is disabled, no parameters should have TP attributes + if len(tp_info["tp_params"]) > 0: + print( + f"WARNING: Found {len(tp_info['tp_params'])} TP parameters when TP=1:" + ) + for tp_param in tp_info["tp_params"][:3]: # Show first 3 + print(f" - {tp_param['name']}") + + print( + f"✓ {desc} - Model configured for TP={tp} (no tensor parallel sharding expected)" + ) + finally: policy.shutdown() cluster.shutdown() From dcdd016f79dda0afe658df8fbfee080a235069a0 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 28 Oct 2025 09:13:06 -0700 Subject: [PATCH 07/17] add convergence test Signed-off-by: ashors1 --- .../llm/sft-qwen2.5-math-7b-megatron.yaml | 149 ++++++++++++++++++ .../llm/sft-qwen2.5-math-7b-megatron.sh | 42 +++++ tests/test_suites/nightly.txt | 2 + 3 files changed, 193 insertions(+) create mode 100644 examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml create mode 100644 tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml new file mode 100644 index 0000000000..95586bf3fb --- /dev/null +++ b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml @@ -0,0 +1,149 @@ +# SFT Algorithm Configuration +defaults: ../../sft.yaml +sft: + ## total number of steps to train will equal + ## min((max_num_epochs * len(train_dataloader)), max_num_steps) + max_num_epochs: 1 + max_num_steps: 80 + + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 + +checkpointing: + enabled: false + checkpoint_dir: "results/sft" + metric_name: "val_loss" ## set to null to save most recent k checkpoints + higher_is_better: false + keep_top_k: 3 + save_period: 10 + checkpoint_must_save_by: null + +policy: + model_name: "Qwen/Qwen2.5-Math-7B" + tokenizer: + name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 # Only used when generating using HF backend + logprob_batch_size: 1 #4 + max_total_sequence_length: 16384 # max_prompt_length + max_response_length + precision: "bfloat16" + + dtensor_cfg: + enabled: false + + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + activation_checkpointing: false + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 1 + num_layers_in_first_pipeline_stage: null + num_layers_in_last_pipeline_stage: null + context_parallel_size: 2 + expert_tensor_parallel_size: 1 + expert_model_parallel_size: 1 + pipeline_dtype: ${policy.precision} + + sequence_parallel: true + freeze_moe_router: true + moe_router_dtype: "fp64" + moe_router_load_balancing_type: "seq_aux_loss" # causes logprob error divergence for grpo + moe_aux_loss_coeff: 0.01 + moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + #gives ~20% training perf speedup with sequence packing + apply_rope_fusion: True + moe_permute_fusion: True + + optimizer: + optimizer: "adam" + lr: 1.0e-6 + min_lr: 1.0e-6 + weight_decay: 0.1 + bf16: true + fp16: false + params_dtype: "float32" + + #adam + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1e-8 + + #sgd + sgd_momentum: 0.9 + + #distributed optimizer + use_distributed_optimizer: False #true + use_precision_aware_optimizer: False #true + + clip_grad: ${policy.max_grad_norm} + + scheduler: + start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} + weight_decay_incr_style: "constant" + lr_decay_style: "constant" + lr_decay_iters: null + lr_warmup_iters: 10 + lr_warmup_init: 0.00000000001 + + distributed_data_parallel_config: + grad_reduce_in_fp32: false + overlap_grad_reduce: true + overlap_param_gather: true + use_custom_fsdp: false + data_parallel_sharding_strategy: "optim_grads_params" + + dynamic_batching: + enabled: false + + sequence_packing: + enabled: True + train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} + algorithm: "modified_first_fit_decreasing" + sequence_length_round: 64 + + # makes the training sequence length divisible by the tensor parallel size + # this is useful for sequence parallel training + make_sequence_length_divisible_by: 32 + max_grad_norm: 1.0 + +data: + max_input_seq_length: ${policy.max_total_sequence_length} + dataset_name: "openmathinstruct2" + prompt_file: examples/prompts/math.txt + split: "train_1M" + add_bos: true + add_eos: true + add_generation_prompt: true + output_key: 'generated_solution' + shuffle: true + num_workers: 8 + +logger: + log_dir: "logs" # Base directory for all logs + wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running + tensorboard_enabled: true + mlflow_enabled: false + swanlab_enabled: false # Disable SwanLab logging + monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard + wandb: + project: "nemo-rl" + name: "sft-qwen2.5-math-7b-megatron" + tensorboard: + log_dir: "tb_logs-sft-qwen2.5-math-7b-megatron" + mlflow: + experiment_name: "sft-dev" + run_name: "sft-qwen2.5-math-7b-megatron" + gpu_monitoring: + collection_interval: 10 # How often to collect GPU usage metrics (in seconds) + flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) + +cluster: + gpus_per_node: 8 + num_nodes: 2 diff --git a/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh b/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh new file mode 100644 index 0000000000..e782ff3986 --- /dev/null +++ b/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh @@ -0,0 +1,42 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# TODO: this config can crash on OOM +# https://github.com/NVIDIA-NeMo/RL/issues/263 + +# ===== BEGIN CONFIG ===== +NUM_NODES=2 +STEPS_PER_RUN=80 # step_time ~ 29sec +MAX_STEPS=80 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=30 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["80"] < 0.301' \ + 'data["validation/loss"]["80"] < 0.304' +fi diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index b7f2410785..1f828fb85b 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -66,6 +66,8 @@ tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.sh tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh # sequence packing tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh +# validate TP/DP +tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh ####### # DPO # From 9f99d5315fe6377cfdc5b0a5ab6567f5e69132ac Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 28 Oct 2025 09:14:37 -0700 Subject: [PATCH 08/17] address comment Signed-off-by: ashors1 --- nemo_rl/models/policy/megatron_policy_worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index daf5f0acd3..8406059c48 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -667,8 +667,8 @@ def __init__( model_cfg.perform_initialization = False assert ( - "aux_loss" not in self.model_cfg.moe_router_load_balancing_type - or self.model_cfg.moe_aux_loss_coeff == 0 + "aux_loss" not in model_cfg.moe_router_load_balancing_type + or model_cfg.moe_aux_loss_coeff == 0 ), ( "MoE aux loss is currently not supported due to a known but in Megatron-LM. See ## TODO: link to GH issue" ) From 3007940beccf4c5f407ffe36ca39e1b60a0ce13c Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Tue, 28 Oct 2025 10:04:12 -0700 Subject: [PATCH 09/17] improve unit test Signed-off-by: Anna Shors --- .../models/policy/megatron_policy_worker.py | 2 +- .../models/policy/test_megatron_worker.py | 34 ++++++------------- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index 5ba521d907..afd8da4ccb 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -670,7 +670,7 @@ def __init__( ## perform_initialization = True is a workaround to ensure the correct tensor parallel attributes are set ## on the TP-sharded parameters. model_cfg.calculate_per_token_loss = True - model_cfg.perform_initialization = False + model_cfg.perform_initialization = True assert ( "aux_loss" not in model_cfg.moe_router_load_balancing_type diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py index bfc9557c00..25074bdc3b 100644 --- a/tests/unit/models/policy/test_megatron_worker.py +++ b/tests/unit/models/policy/test_megatron_worker.py @@ -2354,7 +2354,7 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_ # Extract metrics loss_tensor = results["loss"] - all_metrics = results["all_mb_metrics"] + grad_norm = results["grad_norm"] # Verify loss is valid assert not torch.isnan(loss_tensor).any(), ( @@ -2364,12 +2364,6 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_ f"Loss should not be Inf for {desc}" ) - # Extract gradient norm - assert "grad_norm" in all_metrics, ( - f"grad_norm should be in metrics for {desc}" - ) - grad_norm = all_metrics["grad_norm"] - # Store results for comparison grad_norms[desc] = grad_norm losses[desc] = loss_tensor.cpu().numpy() @@ -2399,21 +2393,10 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_ ) if tp > 1: + tp_sharded_names = [item["name"] for item in tp_info["tp_params"]] # When tensor parallelism is enabled, we should have some TP parameters - assert len(tp_info["tp_params"]) > 0, ( - f"Expected tensor parallel parameters when TP={tp}, but found none" - ) - else: - # When tensor parallelism is disabled, no parameters should have TP attributes - if len(tp_info["tp_params"]) > 0: - print( - f"WARNING: Found {len(tp_info['tp_params'])} TP parameters when TP=1:" - ) - for tp_param in tp_info["tp_params"][:3]: # Show first 3 - print(f" - {tp_param['name']}") - - print( - f"✓ {desc} - Model configured for TP={tp} (no tensor parallel sharding expected)" + assert "module.embedding.word_embeddings.weight" in tp_sharded_names, ( + f"Expected module.embedding.word_embeddings.weight to be TP-sharded when TP={tp}" ) finally: @@ -2425,7 +2408,7 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_ # Get reference values from DP2 configuration # NOTE: even if TP2 config passes these tests, it doesn't necessarily imply - # there are no bugs. Sometimes bugs with grad norm are hard to catch. + # there are no bugs. That's why we also check that TP attributes are set correctly above reference_config = "DP1TP1" reference_grad_norm = grad_norms[reference_config] reference_loss = losses[reference_config] @@ -2434,6 +2417,9 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_ if config_name == reference_config: continue + if not isinstance(grad_norm, list): + grad_norm = [grad_norm] + print(f"\nComparing {config_name} with {reference_config}:") print(f" {reference_config} grad norm: {reference_grad_norm}") print(f" {config_name} grad norm: {grad_norm}") @@ -2452,13 +2438,13 @@ def test_megatron_gradient_norm_consistency_across_parallelism(tiny_llama_model_ grad_diff = abs(gn - ref_gn) relative_diff = grad_diff / (ref_gn + 1e-8) print( - f" Microbatch {i}: {ref_gn} vs {gn}, diff={grad_diff:.6f}, rel_diff={relative_diff:.6f}" + f" Microbatch {i}: {ref_gn} vs {gn}, diff={grad_diff.item():.6f}, rel_diff={relative_diff.item():.6f}" ) # Allow small differences due to floating point precision and parallelization assert relative_diff < 0.01 or grad_diff < 1e-6, ( f"Gradient norm difference too large for microbatch {i}: " - f"{ref_gn} vs {gn} (diff={grad_diff:.6f}, rel_diff={relative_diff:.6f})" + f"{ref_gn} vs {gn} (diff={grad_diff.item():.6f}, rel_diff={relative_diff.item():.6f})" ) # Compare losses (should also be identical for same computation) From ba86f3a7939123b346450613681708804312d7ac Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Tue, 28 Oct 2025 10:08:28 -0700 Subject: [PATCH 10/17] minimize config Signed-off-by: Anna Shors --- .../llm/sft-qwen2.5-math-7b-megatron.yaml | 135 +++--------------- 1 file changed, 22 insertions(+), 113 deletions(-) diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml index 95586bf3fb..25627fe080 100644 --- a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml @@ -1,149 +1,58 @@ -# SFT Algorithm Configuration defaults: ../../sft.yaml sft: - ## total number of steps to train will equal - ## min((max_num_epochs * len(train_dataloader)), max_num_steps) - max_num_epochs: 1 max_num_steps: 80 - - val_period: 10 - val_batches: 8 - val_global_batch_size: 32 - val_micro_batch_size: 1 - val_at_start: true - seed: 42 - checkpointing: enabled: false - checkpoint_dir: "results/sft" - metric_name: "val_loss" ## set to null to save most recent k checkpoints - higher_is_better: false - keep_top_k: 3 - save_period: 10 - checkpoint_must_save_by: null - policy: - model_name: "Qwen/Qwen2.5-Math-7B" - tokenizer: - name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default + model_name: Qwen/Qwen2.5-Math-7B train_global_batch_size: 512 - train_micro_batch_size: 1 - generation_batch_size: 32 # Only used when generating using HF backend - logprob_batch_size: 1 #4 - max_total_sequence_length: 16384 # max_prompt_length + max_response_length - precision: "bfloat16" - + generation_batch_size: 32 + logprob_batch_size: 1 + max_total_sequence_length: 16384 dtensor_cfg: enabled: false - megatron_cfg: enabled: true - empty_unused_memory_level: 1 - activation_checkpointing: false tensor_model_parallel_size: 4 - pipeline_model_parallel_size: 1 - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null context_parallel_size: 2 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_dtype: ${policy.precision} - sequence_parallel: true freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "seq_aux_loss" # causes logprob error divergence for grpo + moe_router_dtype: fp64 + moe_router_load_balancing_type: seq_aux_loss moe_aux_loss_coeff: 0.01 - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - moe_permute_fusion: True - + moe_router_bias_update_rate: 0.0 + moe_permute_fusion: true optimizer: - optimizer: "adam" - lr: 1.0e-6 - min_lr: 1.0e-6 - weight_decay: 0.1 + lr: 1.0e-06 + min_lr: 1.0e-06 bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: False #true - use_precision_aware_optimizer: False #true - - clip_grad: ${policy.max_grad_norm} - + adam_eps: 1.0e-08 + use_distributed_optimizer: false + use_precision_aware_optimizer: false scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" lr_decay_iters: null lr_warmup_iters: 10 - lr_warmup_init: 0.00000000001 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - use_custom_fsdp: false - data_parallel_sharding_strategy: "optim_grads_params" - - dynamic_batching: - enabled: false - + lr_warmup_init: 1.0e-11 sequence_packing: - enabled: True - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} + enabled: true logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 - - # makes the training sequence length divisible by the tensor parallel size - # this is useful for sequence parallel training make_sequence_length_divisible_by: 32 - max_grad_norm: 1.0 - data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' - shuffle: true + output_key: generated_solution num_workers: 8 - logger: - log_dir: "logs" # Base directory for all logs - wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running - tensorboard_enabled: true - mlflow_enabled: false - swanlab_enabled: false # Disable SwanLab logging - monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard wandb: - project: "nemo-rl" - name: "sft-qwen2.5-math-7b-megatron" + project: nemo-rl + name: sft-qwen2.5-math-7b-megatron tensorboard: - log_dir: "tb_logs-sft-qwen2.5-math-7b-megatron" + log_dir: tb_logs-sft-qwen2.5-math-7b-megatron mlflow: - experiment_name: "sft-dev" - run_name: "sft-qwen2.5-math-7b-megatron" - gpu_monitoring: - collection_interval: 10 # How often to collect GPU usage metrics (in seconds) - flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) - + run_name: sft-qwen2.5-math-7b-megatron cluster: gpus_per_node: 8 num_nodes: 2 From 37d57d62d0e8d3ca0e87ed54f524b4ab36dc30fd Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Tue, 28 Oct 2025 10:09:29 -0700 Subject: [PATCH 11/17] fix permissions Signed-off-by: Anna Shors --- tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh diff --git a/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh b/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh old mode 100644 new mode 100755 From 7a657971868473c812cb1d9383351cae70885a3f Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Tue, 28 Oct 2025 16:46:26 -0700 Subject: [PATCH 12/17] fix convergence test Signed-off-by: Anna Shors --- examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml | 1 + tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml index 25627fe080..d7a08770b6 100644 --- a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml @@ -1,6 +1,7 @@ defaults: ../../sft.yaml sft: max_num_steps: 80 + seed: 42 checkpointing: enabled: false policy: diff --git a/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh b/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh index e782ff3986..897f4fbb60 100755 --- a/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh +++ b/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh @@ -28,6 +28,7 @@ uv run examples/run_sft.py \ logger.tensorboard_enabled=True \ checkpointing.enabled=True \ checkpointing.checkpoint_dir=$CKPT_DIR \ + ~policy.tokenizer.chat_template \ $@ \ 2>&1 | tee $RUN_LOG @@ -38,5 +39,5 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["80"] < 0.301' \ - 'data["validation/loss"]["80"] < 0.304' + 'data["validation/val_loss"]["80"] < 0.304' fi From 198027abc4ebce05fbf8006553f7c0c1d08721a1 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 28 Oct 2025 17:26:24 -0700 Subject: [PATCH 13/17] minimize config Signed-off-by: ashors1 --- examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml index d7a08770b6..25627fe080 100644 --- a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml @@ -1,7 +1,6 @@ defaults: ../../sft.yaml sft: max_num_steps: 80 - seed: 42 checkpointing: enabled: false policy: From 64594d08166eea4cc436ea218453ee21641e8be8 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Wed, 29 Oct 2025 15:50:29 -0700 Subject: [PATCH 14/17] fix unit test Signed-off-by: Anna Shors --- .../configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml index 25627fe080..151319df3a 100644 --- a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml @@ -6,8 +6,6 @@ checkpointing: policy: model_name: Qwen/Qwen2.5-Math-7B train_global_batch_size: 512 - generation_batch_size: 32 - logprob_batch_size: 1 max_total_sequence_length: 16384 dtensor_cfg: enabled: false @@ -18,8 +16,6 @@ policy: sequence_parallel: true freeze_moe_router: true moe_router_dtype: fp64 - moe_router_load_balancing_type: seq_aux_loss - moe_aux_loss_coeff: 0.01 moe_router_bias_update_rate: 0.0 moe_permute_fusion: true optimizer: @@ -36,7 +32,6 @@ policy: lr_warmup_init: 1.0e-11 sequence_packing: enabled: true - logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} make_sequence_length_divisible_by: 32 data: dataset_name: openmathinstruct2 From f2cb2ba5df6156f329e440340fe7f45dc3187282 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 30 Oct 2025 09:12:05 -0700 Subject: [PATCH 15/17] remove more occurrences of average_in_collective Signed-off-by: ashors1 --- examples/configs/distillation_math.yaml | 1 - examples/configs/distillation_math_megatron.yaml | 1 - nemo_rl/models/policy/__init__.py | 1 - tests/unit/models/generation/test_vllm_generation.py | 1 - tests/unit/models/policy/test_megatron_worker.py | 1 - tools/refit_verifier.py | 1 - 6 files changed, 6 deletions(-) diff --git a/examples/configs/distillation_math.yaml b/examples/configs/distillation_math.yaml index e0b8bcf283..ae9aff5e10 100644 --- a/examples/configs/distillation_math.yaml +++ b/examples/configs/distillation_math.yaml @@ -145,7 +145,6 @@ policy: &POLICY_BASE grad_reduce_in_fp32: false overlap_grad_reduce: true overlap_param_gather: true - average_in_collective: true use_custom_fsdp: false data_parallel_sharding_strategy: "optim_grads_params" diff --git a/examples/configs/distillation_math_megatron.yaml b/examples/configs/distillation_math_megatron.yaml index 3df59eba84..b005f13c29 100644 --- a/examples/configs/distillation_math_megatron.yaml +++ b/examples/configs/distillation_math_megatron.yaml @@ -99,7 +99,6 @@ policy: &POLICY_BASE grad_reduce_in_fp32: false overlap_grad_reduce: true overlap_param_gather: true - average_in_collective: true use_custom_fsdp: false data_parallel_sharding_strategy: "optim_grads_params" diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index fa988ae1bd..ea21728611 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -82,7 +82,6 @@ class MegatronDDPConfig(TypedDict): grad_reduce_in_fp32: bool overlap_grad_reduce: bool overlap_param_gather: bool - average_in_collective: bool use_custom_fsdp: bool data_parallel_sharding_strategy: str diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index d6bab8b2c0..87dbd6abf2 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -209,7 +209,6 @@ def get_basic_megatron_test_config( "grad_reduce_in_fp32": False, "overlap_grad_reduce": True, "overlap_param_gather": False, - "average_in_collective": True, "data_parallel_sharding_strategy": "optim_grads_params", }, }, diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py index 25074bdc3b..d79b0d2453 100644 --- a/tests/unit/models/policy/test_megatron_worker.py +++ b/tests/unit/models/policy/test_megatron_worker.py @@ -151,7 +151,6 @@ def create_megatron_test_config( "grad_reduce_in_fp32": False, "overlap_grad_reduce": True, "overlap_param_gather": False, - "average_in_collective": True, "data_parallel_sharding_strategy": "optim_grads_params", }, "fp8_cfg": { diff --git a/tools/refit_verifier.py b/tools/refit_verifier.py index 475bfd1215..ee21370797 100644 --- a/tools/refit_verifier.py +++ b/tools/refit_verifier.py @@ -249,7 +249,6 @@ def setup_configs(args, tokenizer): "grad_reduce_in_fp32": False, "overlap_grad_reduce": False, "overlap_param_gather": False, - "average_in_collective": False, "use_custom_fsdp": False, "data_parallel_sharding_strategy": "optim_grads_params", }, From cc8b6d74f15de37910c295c0b9f8bc36576efa58 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 30 Oct 2025 11:23:39 -0700 Subject: [PATCH 16/17] link to GH issue Signed-off-by: ashors1 --- nemo_rl/models/policy/megatron_policy_worker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index afd8da4ccb..dffd2e4d07 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -676,7 +676,8 @@ def __init__( "aux_loss" not in model_cfg.moe_router_load_balancing_type or model_cfg.moe_aux_loss_coeff == 0 ), ( - "MoE aux loss is currently not supported due to a known but in Megatron-LM. See ## TODO: link to GH issue" + "MoE aux loss is currently not supported due to a known bug in Megatron-LM. " + "See https://github.com/NVIDIA/Megatron-LM/issues/1984 for more details." ) self.megatron_cfg = ConfigContainer( From f12e113432408035c803843f0d50cd6d482798f5 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 30 Oct 2025 18:09:50 -0700 Subject: [PATCH 17/17] fix test failure Signed-off-by: ashors1 --- ...h-7b-megatron.yaml => sft-qwen2.5-math7b-2n8g-megatron.yaml} | 0 nemo_rl/models/policy/__init__.py | 2 +- ...-math-7b-megatron.sh => sft-qwen2.5-math7b-2n8g-megatron.sh} | 0 tests/test_suites/nightly.txt | 2 +- 4 files changed, 2 insertions(+), 2 deletions(-) rename examples/configs/recipes/llm/{sft-qwen2.5-math-7b-megatron.yaml => sft-qwen2.5-math7b-2n8g-megatron.yaml} (100%) rename tests/test_suites/llm/{sft-qwen2.5-math-7b-megatron.sh => sft-qwen2.5-math7b-2n8g-megatron.sh} (100%) diff --git a/examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml b/examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml similarity index 100% rename from examples/configs/recipes/llm/sft-qwen2.5-math-7b-megatron.yaml rename to examples/configs/recipes/llm/sft-qwen2.5-math7b-2n8g-megatron.yaml diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index da043d9f94..bf8599a595 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -81,7 +81,7 @@ class MegatronSchedulerConfig(TypedDict): end_weight_decay: float weight_decay_incr_style: str lr_decay_style: str - lr_decay_iters: NotRequired[int] + lr_decay_iters: NotRequired[int | None] lr_warmup_iters: int lr_warmup_init: float diff --git a/tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh b/tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh similarity index 100% rename from tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh rename to tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index c6e779c3de..91c24aada9 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -68,7 +68,7 @@ tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron.sh # sequence packing tests/test_suites/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.sh # validate TP/DP -tests/test_suites/llm/sft-qwen2.5-math-7b-megatron.sh +tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh ####### # DPO #