diff --git a/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge b/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge index f003cd8ca3..8aa287df3c 160000 --- a/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge +++ b/3rdparty/Megatron-Bridge-workspace/Megatron-Bridge @@ -1 +1 @@ -Subproject commit f003cd8ca3e4876853b6097e816f0a94ea8fefc1 +Subproject commit 8aa287df3ca6833c78733460f0c0f0bcfb79f5de diff --git a/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml new file mode 100644 index 0000000000..86690abcc2 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-nano-v2-12b-1n8g-megatron.yaml @@ -0,0 +1,34 @@ +defaults: ../../grpo_math_1B.yaml +grpo: + max_num_steps: 30 +checkpointing: + checkpoint_dir: results/grpo-nano-v2-12b-1n8g-megatron +policy: + model_name: nvidia/NVIDIA-Nemotron-Nano-12B-v2 + tokenizer: + name: nvidia/NVIDIA-Nemotron-Nano-12B-v2 + optimizer: null + megatron_cfg: + enabled: true + bias_activation_fusion: false + tensor_model_parallel_size: 8 + dtensor_cfg: + enabled: false + make_sequence_length_divisible_by: 1 + generation: + max_new_tokens: 512 + vllm_cfg: + max_model_len: 512 + sequence_packing: + enabled: false +data: + max_input_seq_length: 512 +logger: + log_dir: logs/grpo-nano-v2-12b-1n8g-megatron + wandb_enabled: true + tensorboard_enabled: true + wandb: + project: nemo-rl + name: grpo-nano-v2-12b-1n8g-megatron +cluster: + gpus_per_node: 8 diff --git a/examples/configs/recipes/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.yaml new file mode 100644 index 0000000000..7f77edbb43 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.yaml @@ -0,0 +1,44 @@ +defaults: ../../grpo_math_1B.yaml +grpo: + max_num_steps: 30 +checkpointing: + checkpoint_dir: results/grpo-nano-v2-12b-2n8g-fsdp2tp1 +policy: + model_name: nvidia/NVIDIA-Nemotron-Nano-12B-v2 + tokenizer: + name: nvidia/NVIDIA-Nemotron-Nano-12B-v2 + dtensor_cfg: + cpu_offload: true + activation_checkpointing: true + dynamic_batching: + enabled: true + sequence_packing: + enabled: false + make_sequence_length_divisible_by: 1 + generation: + max_new_tokens: 512 + vllm_cfg: + max_model_len: 512 + scheduler: + - name: "torch.optim.lr_scheduler.LinearLR" + kwargs: + start_factor: 0.1 + end_factor: 1.0 + total_iters: 13 + - name: "torch.optim.lr_scheduler.ConstantLR" + kwargs: + factor: 1.0 + total_iters: 10000000000 + - milestones: [13] +data: + max_input_seq_length: 512 +logger: + log_dir: logs/grpo-nano-v2-12b-2n8g-fsdp2tp1 + wandb_enabled: true + tensorboard_enabled: true + wandb: + project: nemo-rl + name: grpo-nano-v2-12b-2n8g-fsdp2tp1 +cluster: + gpus_per_node: 8 + num_nodes: 2 diff --git a/nemo_rl/models/megatron/common.py b/nemo_rl/models/megatron/common.py index 87a0ddb1d5..e56855b410 100644 --- a/nemo_rl/models/megatron/common.py +++ b/nemo_rl/models/megatron/common.py @@ -348,12 +348,17 @@ def forward_step_arbitrary_loss( if len(multimodal_data) > 0: position_ids = None + additional_kwargs = {} + # Mamba models currently do not support packed_seq_params + if packed_seq_params is not None: + additional_kwargs["packed_seq_params"] = packed_seq_params + with straggler_timer: output_tensor = model( input_ids=input_ids_cp_sharded, position_ids=position_ids, attention_mask=attention_mask, - packed_seq_params=packed_seq_params, + **additional_kwargs, **multimodal_data, ) diff --git a/nemo_rl/models/megatron/community_import.py b/nemo_rl/models/megatron/community_import.py index 33aca51444..a1d4efe0d8 100644 --- a/nemo_rl/models/megatron/community_import.py +++ b/nemo_rl/models/megatron/community_import.py @@ -42,6 +42,7 @@ def import_model_from_hf_name( # Keep track of defaults so can restore them to the config after loading the model orig_tensor_model_parallel_size = model_provider.tensor_model_parallel_size orig_pipeline_model_parallel_size = model_provider.pipeline_model_parallel_size + orig_context_parallel_size = model_provider.context_parallel_size orig_expert_model_parallel_size = model_provider.expert_model_parallel_size orig_expert_tensor_parallel_size = model_provider.expert_tensor_parallel_size orig_num_layers_in_first_pipeline_stage = ( @@ -59,6 +60,7 @@ def import_model_from_hf_name( model_provider.pipeline_model_parallel_size = megatron_config[ "pipeline_model_parallel_size" ] + model_provider.context_parallel_size = megatron_config["context_parallel_size"] model_provider.expert_model_parallel_size = megatron_config[ "expert_model_parallel_size" ] @@ -82,6 +84,7 @@ def import_model_from_hf_name( config = megatron_model[0].config config.tensor_model_parallel_size = orig_tensor_model_parallel_size config.pipeline_model_parallel_size = orig_pipeline_model_parallel_size + config.context_parallel_size = orig_context_parallel_size config.expert_model_parallel_size = orig_expert_model_parallel_size config.expert_tensor_parallel_size = orig_expert_tensor_parallel_size config.num_layers_in_first_pipeline_stage = orig_num_layers_in_first_pipeline_stage @@ -122,6 +125,11 @@ def export_model_from_megatron( # Export performs on CPU with proper distributed context with temporary_distributed_context(backend="gloo"): + # Need to set model parallel cuda manual seed for mamba mixer + from megatron.core.tensor_parallel import model_parallel_cuda_manual_seed + + model_parallel_cuda_manual_seed(0) + # Load the Megatron model megatron_model = bridge.load_megatron_model( input_path, skip_temp_dist_context=True diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index 3bf211be5c..c507b43acb 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -269,7 +269,7 @@ def freeze_moe_router(megatron_model): if hasattr(model_module, "language_model"): model_module = model_module.language_model for layer in model_module.decoder.layers: - if hasattr(layer.mlp, "router"): + if hasattr(layer, "mlp") and hasattr(layer.mlp, "router"): layer.mlp.router.weight.requires_grad = False mixed_precision_wrapper = CustomFloat16Module @@ -1271,12 +1271,17 @@ def forward_step_fn( if len(multimodal_data) > 0: position_ids = None + additional_kwargs = {} + # Mamba models currently do not support packed_seq_params + if packed_seq_params is not None: + additional_kwargs["packed_seq_params"] = packed_seq_params + output_tensor = model( input_ids=input_ids_cp_sharded, position_ids=position_ids, attention_mask=attention_mask, - packed_seq_params=packed_seq_params, **multimodal_data, + **additional_kwargs, ) # Apply temperature scaling to logits for training @@ -1550,11 +1555,15 @@ def forward_step_fn( if len(multimodal_data) > 0: position_ids = None + additional_kwargs = {} + if packed_seq_params is not None: + additional_kwargs["packed_seq_params"] = packed_seq_params + output_tensor = model( input_ids=input_ids_cp_sharded, position_ids=position_ids, attention_mask=attention_mask, - packed_seq_params=packed_seq_params, + **additional_kwargs, **multimodal_data, ) diff --git a/tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh b/tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh new file mode 100755 index 0000000000..68a694098c --- /dev/null +++ b/tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh @@ -0,0 +1,41 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=30 +MAX_STEPS=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=60 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.05' \ + 'data["train/token_mult_prob_error"]["30"] < 1.05' \ + 'data["train/reward"]["30"] > 0.4' \ + 'mean(data["timing/train/total_step_time"], -6, -1) < 80' +fi diff --git a/tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..d1ad766b5b --- /dev/null +++ b/tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh @@ -0,0 +1,41 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=2 +STEPS_PER_RUN=30 +MAX_STEPS=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=60 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.05' \ + 'data["train/token_mult_prob_error"]["30"] < 1.05' \ + 'data["train/reward"]["30"] > 0.4' \ + 'mean(data["timing/train/total_step_time"], -6, -1) < 60' +fi diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index 91c24aada9..ea353d1131 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -48,6 +48,10 @@ tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.sh #https://github.com/NVIDIA-NeMo/RL/issues/1374 #tests/test_suites/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.sh +# Nano-v2 +tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh +tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh + ####### # SFT # #######