Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
Submodule Megatron-Bridge updated 48 files
+5 −0 .github/workflows/cicd-main.yml
+1 −1 README.md
+1 −0 docs/models/vlm/index.md
+192 −0 docs/models/vlm/nemotron-nano-v2-vl.md
+2 −2 examples/conversion/compare_hf_and_megatron/compare.py
+1 −1 examples/conversion/hf_megatron_roundtrip.py
+2 −0 src/megatron/bridge/models/deepseek/deepseek_provider.py
+1 −4 src/megatron/bridge/models/gemma/gemma3_provider.py
+7 −6 src/megatron/bridge/models/gpt_provider.py
+10 −8 src/megatron/bridge/models/llama/llama_provider.py
+14 −1 src/megatron/bridge/models/mamba/mamba_provider.py
+12 −11 src/megatron/bridge/models/model_provider.py
+3 −4 src/megatron/bridge/models/nemotron/nemotron_provider.py
+1 −0 src/megatron/bridge/models/nemotronh/nemotron_h_bridge.py
+4 −3 src/megatron/bridge/models/nemotronh/nemotron_h_provider.py
+2 −0 src/megatron/bridge/models/qwen/qwen_provider.py
+2 −2 src/megatron/bridge/recipes/nemotronh/nemotronh.py
+0 −2 src/megatron/bridge/recipes/qwen/qwen3_moe.py
+13 −10 src/megatron/bridge/training/config.py
+18 −1 src/megatron/bridge/training/initialize.py
+10 −0 src/megatron/bridge/training/utils/train_utils.py
+12 −106 src/megatron/bridge/utils/fusions.py
+3 −0 tests/functional_tests/data/test_loaders.py
+2 −12 tests/functional_tests/models/test_gemma2_conversion.py
+1 −10 tests/functional_tests/models/test_gemma2_provider.py
+3 −11 tests/functional_tests/models/test_gemma3_conversion.py
+1 −7 tests/functional_tests/models/test_gemma3_provider.py
+2 −12 tests/functional_tests/models/test_gemma_conversion.py
+1 −10 tests/functional_tests/models/test_gemma_provider.py
+1 −10 tests/functional_tests/models/test_glm45_provider.py
+16 −11 tests/functional_tests/recipes/test_llama_recipes_pretrain_1b.py
+13 −22 tests/functional_tests/recipes/test_mamba_recipes_pretrain.py
+13 −7 tests/functional_tests/recipes/test_qwen_recipes_pretrain.py
+13 −4 tests/functional_tests/recipes/test_qwen_vl_recipes_finetune.py
+13 −3 tests/functional_tests/recipes/utils.py
+3 −2 tests/functional_tests/training/test_inprocess_restart.py
+5 −5 tests/functional_tests/training/test_megatron_fsdp.py
+5 −4 tests/functional_tests/training/test_pretrain.py
+3 −3 tests/functional_tests/training/test_pretrain_resume.py
+1 −0 tests/functional_tests/training/test_sample_based_training.py
+0 −1 tests/unit_tests/models/llama/test_llama_provider.py
+1 −1 tests/unit_tests/models/nemotronh/test_nemotron_h_provider.py
+10 −5 tests/unit_tests/recipes/nemotronh/test_nemotronh.py
+146 −0 tests/unit_tests/training/test_config.py
+311 −0 tests/unit_tests/training/test_initialize.py
+250 −1 tests/unit_tests/training/test_tokenizer.py
+68 −0 tests/unit_tests/training/utils/test_train_utils.py
+41 −391 tests/unit_tests/utils/test_fusions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# GRPO nightly test config: Nemotron-Nano-v2 12B, 1 node x 8 GPUs, Megatron backend.
# Inherits everything not overridden here from the 1B math recipe.
# NOTE(review): nesting below was reconstructed from key names after the original
# indentation was lost in extraction — confirm against the upstream recipe schema.
defaults: ../../grpo_math_1B.yaml
grpo:
  max_num_steps: 30
checkpointing:
  checkpoint_dir: results/grpo-nano-v2-12b-1n8g-megatron
policy:
  model_name: nvidia/NVIDIA-Nemotron-Nano-12B-v2
  tokenizer:
    name: nvidia/NVIDIA-Nemotron-Nano-12B-v2
  # Use the backend's default optimizer settings rather than the base recipe's.
  optimizer: null
  megatron_cfg:
    enabled: true
    # Disabled — presumably unsupported/unstable for this hybrid (Mamba) model; verify.
    bias_activation_fusion: false
    tensor_model_parallel_size: 8
  dtensor_cfg:
    enabled: false  # Megatron path is active instead of DTensor/FSDP2.
  make_sequence_length_divisible_by: 1
  generation:
    max_new_tokens: 512
    vllm_cfg:
      max_model_len: 512  # Must cover prompt + max_new_tokens for this short-seq test.
  sequence_packing:
    enabled: false
data:
  max_input_seq_length: 512
logger:
  log_dir: logs/grpo-nano-v2-12b-1n8g-megatron
  wandb_enabled: true
  tensorboard_enabled: true
  wandb:
    project: nemo-rl
    name: grpo-nano-v2-12b-1n8g-megatron
cluster:
  gpus_per_node: 8
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# GRPO nightly test config: Nemotron-Nano-v2 12B, 2 nodes x 8 GPUs, FSDP2 (DTensor) TP=1.
# Inherits everything not overridden here from the 1B math recipe.
# NOTE(review): nesting below was reconstructed from key names after the original
# indentation was lost in extraction — confirm against the upstream recipe schema.
defaults: ../../grpo_math_1B.yaml
grpo:
  max_num_steps: 30
checkpointing:
  checkpoint_dir: results/grpo-nano-v2-12b-2n8g-fsdp2tp1
policy:
  model_name: nvidia/NVIDIA-Nemotron-Nano-12B-v2
  tokenizer:
    name: nvidia/NVIDIA-Nemotron-Nano-12B-v2
  dtensor_cfg:
    # Offload + activation checkpointing to fit the 12B model per GPU at TP=1.
    cpu_offload: true
    activation_checkpointing: true
  dynamic_batching:
    enabled: true
  sequence_packing:
    enabled: false  # Mutually exclusive with dynamic batching here.
  make_sequence_length_divisible_by: 1
  generation:
    max_new_tokens: 512
    vllm_cfg:
      max_model_len: 512  # Must cover prompt + max_new_tokens for this short-seq test.
  # Warmup for 13 steps (LinearLR 0.1 -> 1.0), then hold constant;
  # the trailing `milestones` entry switches schedulers after step 13.
  scheduler:
    - name: "torch.optim.lr_scheduler.LinearLR"
      kwargs:
        start_factor: 0.1
        end_factor: 1.0
        total_iters: 13
    - name: "torch.optim.lr_scheduler.ConstantLR"
      kwargs:
        factor: 1.0
        total_iters: 10000000000  # Effectively "forever" for a 30-step test run.
    - milestones: [13]
data:
  max_input_seq_length: 512
logger:
  log_dir: logs/grpo-nano-v2-12b-2n8g-fsdp2tp1
  wandb_enabled: true
  tensorboard_enabled: true
  wandb:
    project: nemo-rl
    name: grpo-nano-v2-12b-2n8g-fsdp2tp1
cluster:
  gpus_per_node: 8
  num_nodes: 2
7 changes: 6 additions & 1 deletion nemo_rl/models/megatron/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,12 +348,17 @@ def forward_step_arbitrary_loss(
if len(multimodal_data) > 0:
position_ids = None

additional_kwargs = {}
# Mamba models currently do not support packed_seq_params
if packed_seq_params is not None:
additional_kwargs["packed_seq_params"] = packed_seq_params

with straggler_timer:
output_tensor = model(
input_ids=input_ids_cp_sharded,
position_ids=position_ids,
attention_mask=attention_mask,
packed_seq_params=packed_seq_params,
**additional_kwargs,
**multimodal_data,
)

Expand Down
8 changes: 8 additions & 0 deletions nemo_rl/models/megatron/community_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def import_model_from_hf_name(
# Keep track of defaults so can restore them to the config after loading the model
orig_tensor_model_parallel_size = model_provider.tensor_model_parallel_size
orig_pipeline_model_parallel_size = model_provider.pipeline_model_parallel_size
orig_context_parallel_size = model_provider.context_parallel_size
orig_expert_model_parallel_size = model_provider.expert_model_parallel_size
orig_expert_tensor_parallel_size = model_provider.expert_tensor_parallel_size
orig_num_layers_in_first_pipeline_stage = (
Expand All @@ -59,6 +60,7 @@ def import_model_from_hf_name(
model_provider.pipeline_model_parallel_size = megatron_config[
"pipeline_model_parallel_size"
]
model_provider.context_parallel_size = megatron_config["context_parallel_size"]
model_provider.expert_model_parallel_size = megatron_config[
"expert_model_parallel_size"
]
Expand All @@ -82,6 +84,7 @@ def import_model_from_hf_name(
config = megatron_model[0].config
config.tensor_model_parallel_size = orig_tensor_model_parallel_size
config.pipeline_model_parallel_size = orig_pipeline_model_parallel_size
config.context_parallel_size = orig_context_parallel_size
config.expert_model_parallel_size = orig_expert_model_parallel_size
config.expert_tensor_parallel_size = orig_expert_tensor_parallel_size
config.num_layers_in_first_pipeline_stage = orig_num_layers_in_first_pipeline_stage
Expand Down Expand Up @@ -122,6 +125,11 @@ def export_model_from_megatron(

# Export performs on CPU with proper distributed context
with temporary_distributed_context(backend="gloo"):
# Need to set model parallel cuda manual seed for mamba mixer
from megatron.core.tensor_parallel import model_parallel_cuda_manual_seed

model_parallel_cuda_manual_seed(0)

# Load the Megatron model
megatron_model = bridge.load_megatron_model(
input_path, skip_temp_dist_context=True
Expand Down
15 changes: 12 additions & 3 deletions nemo_rl/models/policy/megatron_policy_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ def freeze_moe_router(megatron_model):
if hasattr(model_module, "language_model"):
model_module = model_module.language_model
for layer in model_module.decoder.layers:
if hasattr(layer.mlp, "router"):
if hasattr(layer, "mlp") and hasattr(layer.mlp, "router"):
layer.mlp.router.weight.requires_grad = False

mixed_precision_wrapper = CustomFloat16Module
Expand Down Expand Up @@ -1271,12 +1271,17 @@ def forward_step_fn(
if len(multimodal_data) > 0:
position_ids = None

additional_kwargs = {}
# Mamba models currently do not support packed_seq_params
if packed_seq_params is not None:
additional_kwargs["packed_seq_params"] = packed_seq_params

output_tensor = model(
input_ids=input_ids_cp_sharded,
position_ids=position_ids,
attention_mask=attention_mask,
packed_seq_params=packed_seq_params,
**multimodal_data,
**additional_kwargs,
)

# Apply temperature scaling to logits for training
Expand Down Expand Up @@ -1550,11 +1555,15 @@ def forward_step_fn(
if len(multimodal_data) > 0:
position_ids = None

additional_kwargs = {}
if packed_seq_params is not None:
additional_kwargs["packed_seq_params"] = packed_seq_params

output_tensor = model(
input_ids=input_ids_cp_sharded,
position_ids=position_ids,
attention_mask=attention_mask,
packed_seq_params=packed_seq_params,
**additional_kwargs,
**multimodal_data,
)

Expand Down
41 changes: 41 additions & 0 deletions tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash
# Nightly GRPO test: Nemotron-Nano-v2 12B, 1 node x 8 GPUs, Megatron backend.
# Runs the experiment, dumps TensorBoard logs to JSON, and checks convergence
# and timing metrics once the target step count is reached.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
# Quote path expansions: unquoted $VAR word-splits on spaces (ShellCheck SC2086).
source "$SCRIPT_DIR/common.env"

# ===== BEGIN CONFIG =====
NUM_NODES=1
STEPS_PER_RUN=30
MAX_STEPS=30
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
NUM_MINUTES=60
# ===== END CONFIG =====

# Defined in common.env: bail out early if a previous run already hit MAX_STEPS.
exit_if_max_steps_reached

# Run the experiment
cd "$PROJECT_ROOT"
uv run examples/run_grpo_math.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps=$MAX_STEPS \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached
# (jq extracts the highest recorded step number for train/loss).
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'mean(data["train/token_mult_prob_error"]) < 1.05' \
        'data["train/token_mult_prob_error"]["30"] < 1.05' \
        'data["train/reward"]["30"] > 0.4' \
        'mean(data["timing/train/total_step_time"], -6, -1) < 80'
fi
41 changes: 41 additions & 0 deletions tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash
# Nightly GRPO test: Nemotron-Nano-v2 12B, 2 nodes x 8 GPUs, FSDP2 (DTensor) TP=1.
# Runs the experiment, dumps TensorBoard logs to JSON, and checks convergence
# and timing metrics once the target step count is reached.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
# Quote path expansions: unquoted $VAR word-splits on spaces (ShellCheck SC2086).
source "$SCRIPT_DIR/common.env"

# ===== BEGIN CONFIG =====
NUM_NODES=2
STEPS_PER_RUN=30
MAX_STEPS=30
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
NUM_MINUTES=60
# ===== END CONFIG =====

# Defined in common.env: bail out early if a previous run already hit MAX_STEPS.
exit_if_max_steps_reached

# Run the experiment
cd "$PROJECT_ROOT"
uv run examples/run_grpo_math.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps=$MAX_STEPS \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached
# (jq extracts the highest recorded step number for train/loss).
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'mean(data["train/token_mult_prob_error"]) < 1.05' \
        'data["train/token_mult_prob_error"]["30"] < 1.05' \
        'data["train/reward"]["30"] > 0.4' \
        'mean(data["timing/train/total_step_time"], -6, -1) < 60'
fi
4 changes: 4 additions & 0 deletions tests/test_suites/nightly.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.sh
#https://github.com/NVIDIA-NeMo/RL/issues/1374
#tests/test_suites/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.sh

# Nano-v2
tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh
tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh

#######
# SFT #
#######
Expand Down
Loading