diff --git a/.gitmodules b/.gitmodules index 4c94abbb10..9181358bb2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,12 +1,12 @@ [submodule "3rdparty/Megatron-LM"] path = 3rdparty/Megatron-LM-workspace/Megatron-LM url = https://github.com/terrykong/Megatron-LM.git - branch = yuya/nemo-rl-use + branch = yuya/nemo-rl-use-2 shallow = true [submodule "3rdparty/Megatron-Bridge"] path = 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge url = https://github.com/NVIDIA-NeMo/Megatron-Bridge.git - branch = yuya/nemo-rl-use-chunkpatch + branch = yifu/nemo-rl-use-chunkpatch-ds shallow = true [submodule "3rdparty/Automodel-workspace/Automodel"] path = 3rdparty/Automodel-workspace/Automodel diff --git a/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge b/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge index a1bbfc2429..abd52c89fe 160000 --- a/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge +++ b/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge @@ -1 +1 @@ -Subproject commit a1bbfc2429a23786a0a288ac55437fc931c567bd +Subproject commit abd52c89fe969869b8969acc181630c273cca4fd diff --git a/3rdparty/Megatron-Bridge-workspace/setup.py b/3rdparty/Megatron-Bridge-workspace/setup.py index d12fa2d8cb..06657bab31 100644 --- a/3rdparty/Megatron-Bridge-workspace/setup.py +++ b/3rdparty/Megatron-Bridge-workspace/setup.py @@ -49,6 +49,7 @@ "nvidia-modelopt[torch,onnx]>=0.33.0a0,<0.34.0; sys_platform != 'darwin'", "nvidia-resiliency-ext>=0.4.0a0,<0.5.0; sys_platform != 'darwin'", "transformer-engine[pytorch]>=2.5.0a0,<2.6.0; sys_platform != 'darwin'", + "filelock", ] # If the bridge source exists, compare cached dependencies with the submodule's pyproject diff --git a/3rdparty/Megatron-LM-workspace/Megatron-LM b/3rdparty/Megatron-LM-workspace/Megatron-LM index e2d5bcd605..383d1144c3 160000 --- a/3rdparty/Megatron-LM-workspace/Megatron-LM +++ b/3rdparty/Megatron-LM-workspace/Megatron-LM @@ -1 +1 @@ -Subproject commit e2d5bcd605108e2cf64fdb91fdfc669f10a57f56 +Subproject commit 383d1144c3b3f77096c63b7308402a0ea6ba47dd diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml index d2db9b274d..e4a296be06 100755 --- a/examples/configs/dpo.yaml +++ b/examples/configs/dpo.yaml @@ -109,6 +109,7 @@ policy: moe_router_dtype: "fp64" moe_router_load_balancing_type: "aux_loss" moe_router_bias_update_rate: 1e-3 + moe_permute_fusion: false #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 621022b57f..6ba7e4d54b 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -74,6 +74,7 @@ policy: moe_router_dtype: "fp64" moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + moe_permute_fusion: false #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True defer_fp32_logits: null diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml index 2cd8563037..511e38c5b5 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -89,6 +89,7 @@ policy: moe_router_dtype: "fp64" moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + moe_permute_fusion: false #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml index 1d7de8d507..f5f0b2e5d7 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml @@ -62,6 +62,7 @@ policy: moe_router_dtype: "fp64" moe_router_load_balancing_type: "aux_loss" moe_router_bias_update_rate: 1e-3 + moe_permute_fusion: false #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml index 3684fa866f..9dd723ec22 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml @@ -62,6 +62,7 @@ policy: moe_router_dtype: "fp64" moe_router_load_balancing_type: "aux_loss" moe_router_bias_update_rate: 1e-3 + moe_permute_fusion: false #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8.yaml index f54633eb29..ca39a10190 100644 --- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8.yaml @@ -71,6 +71,7 @@ policy: moe_router_dtype: "fp64" moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + moe_permute_fusion: false apply_rope_fusion: True activation_checkpointing: True defer_fp32_logits: True diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml index c991a708ca..153bd64e58 100755 --- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml @@ -56,6 +56,7 @@ policy: moe_router_dtype: "fp64" moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + moe_permute_fusion: false #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True diff --git a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml index 900bf7d7d3..0f8d4b1867 100644 --- a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml +++ b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml @@ -77,6 +77,7 @@ policy: moe_router_dtype: "fp64" moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + moe_permute_fusion: false apply_rope_fusion: True activation_checkpointing: True defer_fp32_logits: True diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml index f585aebd19..8944064ba4 100644 --- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml @@ -85,6 +85,7 @@ policy: moe_router_dtype: "fp64" moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + moe_permute_fusion: false #gives ~20% training perf speedup with sequence packing # Causes logprob error divergence for moonlight apply_rope_fusion: False diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml index e3ff04a9d1..58d44a4da4 100755 --- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml @@ -58,6 +58,7 @@ policy: moe_router_dtype: "fp64" moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + moe_permute_fusion: false #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml index 64ea7eef2f..5c7d1ed78f 100755 --- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml @@ -73,6 +73,7 @@ policy: moe_router_dtype: "fp64" moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + moe_permute_fusion: false #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml index f21c1c3dc8..33435fbd15 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml @@ -42,6 +42,7 @@ policy: moe_router_dtype: "fp64" moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + moe_permute_fusion: false #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml index 934feb6dbf..0bb610fff3 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml @@ -52,6 +52,7 @@ policy: moe_router_dtype: null moe_router_load_balancing_type: "aux_loss" moe_router_bias_update_rate: 1e-3 + moe_permute_fusion: false #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml index 1366d111d1..648f45ab12 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml @@ -52,6 +52,7 @@ policy: moe_router_dtype: null moe_router_load_balancing_type: "aux_loss" moe_router_bias_update_rate: 1e-3 + moe_permute_fusion: false #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 2b5e69e66a..0db82b6a2f 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -86,6 +86,7 @@ policy: moe_router_dtype: null moe_router_load_balancing_type: "aux_loss" moe_router_bias_update_rate: 1e-3 + moe_permute_fusion: false #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True diff --git a/examples/configs/sft_openmathinstruct2_megatron.yaml b/examples/configs/sft_openmathinstruct2_megatron.yaml index a9b79c5af3..67734cebe4 100644 --- a/examples/configs/sft_openmathinstruct2_megatron.yaml +++ b/examples/configs/sft_openmathinstruct2_megatron.yaml @@ -82,6 +82,7 @@ policy: moe_router_dtype: "fp64" moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo + moe_permute_fusion: false #gives ~20% training perf speedup with sequence packing apply_rope_fusion: True diff --git a/nemo_rl/models/megatron/common.py b/nemo_rl/models/megatron/common.py index c6efee6e93..38078dca13 100644 --- a/nemo_rl/models/megatron/common.py +++ b/nemo_rl/models/megatron/common.py @@ -333,7 +333,13 @@ def forward_step_arbitrary_loss( else: input_ids_cp_sharded = input_ids attention_mask, _, position_ids = get_ltor_masks_and_position_ids( - input_ids, 0, False, False, False + data=input_ids, + eod_token=0, # used for loss_mask, which we don't use + pad_token=0, # used for loss_mask, which we don't use + reset_position_ids=False, + reset_attention_mask=False, + eod_mask_loss=False, + pad_mask_loss=False, ) with straggler_timer: diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index c3356badb1..7629d0761d 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -61,7 +61,7 @@ from megatron.bridge.utils.instantiate_utils import InstantiationMode from megatron.core import parallel_state from megatron.core.distributed import DistributedDataParallel -from megatron.core.distributed.custom_fsdp import ( +from megatron.core.distributed.fsdp.mcore_fsdp_adapter import ( FullyShardedDataParallel as custom_FSDP, ) from megatron.core.inference.engines import ( @@ -234,6 +234,7 @@ def setup_megatron_model( make_vocab_size_divisible_by=cfg.model.make_vocab_size_divisible_by // cfg.model.tensor_model_parallel_size, tensor_model_parallel_size=cfg.model.tensor_model_parallel_size, + trust_remote_code=True, ) if not cfg.model.vocab_size: cfg.model.vocab_size = cfg.tokenizer.padded_vocab_size @@ -562,6 +563,7 @@ def __init__( "moe_router_bias_update_rate" ] + model_cfg.moe_permute_fusion = self.cfg["megatron_cfg"]["moe_permute_fusion"] if "layernorm_epsilon" in self.cfg["megatron_cfg"]: model_cfg.layernorm_epsilon = self.cfg["megatron_cfg"]["layernorm_epsilon"] @@ -767,6 +769,7 @@ def __init__( tensor_model_parallel_size=self.cfg["megatron_cfg"][ "tensor_model_parallel_size" ], + trust_remote_code=True, ) self.final_padded_vocab_size = tokenizer_config.padded_vocab_size self.dp_size = worker_sharding_annotations.get_axis_size("data_parallel") @@ -1164,7 +1167,13 @@ def forward_step_fn( input_ids = data_dict["input_ids"] input_ids_cp_sharded = input_ids attention_mask, _, position_ids = get_ltor_masks_and_position_ids( - input_ids, 0, False, False, False + data=input_ids, + eod_token=0, # used for loss_mask, which we don't use + pad_token=0, # used for loss_mask, which we don't use + reset_position_ids=False, + reset_attention_mask=False, + eod_mask_loss=False, + pad_mask_loss=False, ) packed_seq_params = None unpacked_input_ids = input_ids diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 61f77d8c67..1f632951cf 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -165,6 +165,7 @@ def get_basic_megatron_test_config( "moe_router_dtype": "fp64", "moe_router_load_balancing_type": "none", "moe_router_bias_update_rate": 0.0, + "moe_permute_fusion": False, "apply_rope_fusion": True, "train_iters": 100, # Required for Megatron training "optimizer": { diff --git a/tests/unit/models/policy/test_megatron_worker.py b/tests/unit/models/policy/test_megatron_worker.py index 94cdafeaca..80c9febc13 100644 --- a/tests/unit/models/policy/test_megatron_worker.py +++ b/tests/unit/models/policy/test_megatron_worker.py @@ -95,6 +95,7 @@ def create_megatron_test_config( "moe_router_dtype": "fp64", "moe_router_load_balancing_type": "none", "moe_router_bias_update_rate": 0.0, + "moe_permute_fusion": False, "apply_rope_fusion": True, "defer_fp32_logits": defer_fp32_logits, "train_iters": 100, # Required for Megatron training diff --git a/tools/refit_verifier.py b/tools/refit_verifier.py index 67321beb4b..f4d1059c84 100644 --- a/tools/refit_verifier.py +++ b/tools/refit_verifier.py @@ -210,6 +210,7 @@ def setup_configs(args, tokenizer): "moe_router_dtype": "fp64", "moe_router_load_balancing_type": "none", "moe_router_bias_update_rate": 0.0, + "moe_permute_fusion": False, "pipeline_dtype": "bfloat16", "freeze_moe_router": False, "apply_rope_fusion": False, diff --git a/uv.lock b/uv.lock index 2d9beb2297..5852373c05 100644 --- a/uv.lock +++ b/uv.lock @@ -2245,6 +2245,7 @@ dependencies = [ { name = "accelerate" }, { name = "datasets" }, { name = "einops" }, + { name = "filelock" }, { name = "hydra-core" }, { name = "megatron-core" }, { name = "numpy" }, @@ -2273,6 +2274,7 @@ requires-dist = [ { name = "accelerate", specifier = ">=1.6.0" }, { name = "datasets" }, { name = "einops", specifier = ">=0.8.1" }, + { name = "filelock" }, { name = "hydra-core", specifier = ">1.3,<=1.3.2" }, { name = "megatron-core", editable = "3rdparty/Megatron-LM-workspace" }, { name = "numpy", specifier = "<2" },