Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
Submodule Megatron-Bridge updated 48 files
+5 −0 .github/workflows/cicd-main.yml
+1 −1 README.md
+1 −0 docs/models/vlm/index.md
+192 −0 docs/models/vlm/nemotron-nano-v2-vl.md
+2 −2 examples/conversion/compare_hf_and_megatron/compare.py
+1 −1 examples/conversion/hf_megatron_roundtrip.py
+2 −0 src/megatron/bridge/models/deepseek/deepseek_provider.py
+1 −4 src/megatron/bridge/models/gemma/gemma3_provider.py
+7 −6 src/megatron/bridge/models/gpt_provider.py
+10 −8 src/megatron/bridge/models/llama/llama_provider.py
+14 −1 src/megatron/bridge/models/mamba/mamba_provider.py
+12 −11 src/megatron/bridge/models/model_provider.py
+3 −4 src/megatron/bridge/models/nemotron/nemotron_provider.py
+1 −0 src/megatron/bridge/models/nemotronh/nemotron_h_bridge.py
+4 −3 src/megatron/bridge/models/nemotronh/nemotron_h_provider.py
+2 −0 src/megatron/bridge/models/qwen/qwen_provider.py
+2 −2 src/megatron/bridge/recipes/nemotronh/nemotronh.py
+0 −2 src/megatron/bridge/recipes/qwen/qwen3_moe.py
+13 −10 src/megatron/bridge/training/config.py
+18 −1 src/megatron/bridge/training/initialize.py
+10 −0 src/megatron/bridge/training/utils/train_utils.py
+12 −106 src/megatron/bridge/utils/fusions.py
+3 −0 tests/functional_tests/data/test_loaders.py
+2 −12 tests/functional_tests/models/test_gemma2_conversion.py
+1 −10 tests/functional_tests/models/test_gemma2_provider.py
+3 −11 tests/functional_tests/models/test_gemma3_conversion.py
+1 −7 tests/functional_tests/models/test_gemma3_provider.py
+2 −12 tests/functional_tests/models/test_gemma_conversion.py
+1 −10 tests/functional_tests/models/test_gemma_provider.py
+1 −10 tests/functional_tests/models/test_glm45_provider.py
+16 −11 tests/functional_tests/recipes/test_llama_recipes_pretrain_1b.py
+13 −22 tests/functional_tests/recipes/test_mamba_recipes_pretrain.py
+13 −7 tests/functional_tests/recipes/test_qwen_recipes_pretrain.py
+13 −4 tests/functional_tests/recipes/test_qwen_vl_recipes_finetune.py
+13 −3 tests/functional_tests/recipes/utils.py
+3 −2 tests/functional_tests/training/test_inprocess_restart.py
+5 −5 tests/functional_tests/training/test_megatron_fsdp.py
+5 −4 tests/functional_tests/training/test_pretrain.py
+3 −3 tests/functional_tests/training/test_pretrain_resume.py
+1 −0 tests/functional_tests/training/test_sample_based_training.py
+0 −1 tests/unit_tests/models/llama/test_llama_provider.py
+1 −1 tests/unit_tests/models/nemotronh/test_nemotron_h_provider.py
+10 −5 tests/unit_tests/recipes/nemotronh/test_nemotronh.py
+146 −0 tests/unit_tests/training/test_config.py
+311 −0 tests/unit_tests/training/test_initialize.py
+250 −1 tests/unit_tests/training/test_tokenizer.py
+68 −0 tests/unit_tests/training/utils/test_train_utils.py
+41 −391 tests/unit_tests/utils/test_fusions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# GRPO nightly test config: Nemotron-Nano-v2 12B, 1 node x 8 GPUs, Megatron backend.
# Inherits everything not overridden here from the 1B math recipe.
# NOTE(review): nesting below was reconstructed from key names after the original
# indentation was lost in extraction — confirm against the upstream recipe schema.
defaults: ../../grpo_math_1B.yaml
grpo:
  max_num_steps: 30
checkpointing:
  checkpoint_dir: results/grpo-nano-v2-12b-1n8g-megatron
policy:
  model_name: nvidia/NVIDIA-Nemotron-Nano-12B-v2
  tokenizer:
    name: nvidia/NVIDIA-Nemotron-Nano-12B-v2
  # Use the backend's default optimizer settings rather than the base recipe's.
  optimizer: null
  megatron_cfg:
    enabled: true
    # Disabled — presumably unsupported/unstable for this hybrid (Mamba) model; verify.
    bias_activation_fusion: false
    tensor_model_parallel_size: 8
  dtensor_cfg:
    enabled: false  # Megatron path is active instead of DTensor/FSDP2.
  make_sequence_length_divisible_by: 1
  generation:
    max_new_tokens: 512
    vllm_cfg:
      max_model_len: 512  # Must cover prompt + max_new_tokens for this short-seq test.
  sequence_packing:
    enabled: false
data:
  max_input_seq_length: 512
logger:
  log_dir: logs/grpo-nano-v2-12b-1n8g-megatron
  wandb_enabled: true
  tensorboard_enabled: true
  wandb:
    project: nemo-rl
    name: grpo-nano-v2-12b-1n8g-megatron
cluster:
  gpus_per_node: 8
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# GRPO nightly test config: Nemotron-Nano-v2 12B, 2 nodes x 8 GPUs, FSDP2 (DTensor) TP=1.
# Inherits everything not overridden here from the 1B math recipe.
# NOTE(review): nesting below was reconstructed from key names after the original
# indentation was lost in extraction — confirm against the upstream recipe schema.
defaults: ../../grpo_math_1B.yaml
grpo:
  max_num_steps: 30
checkpointing:
  checkpoint_dir: results/grpo-nano-v2-12b-2n8g-fsdp2tp1
policy:
  model_name: nvidia/NVIDIA-Nemotron-Nano-12B-v2
  tokenizer:
    name: nvidia/NVIDIA-Nemotron-Nano-12B-v2
  dtensor_cfg:
    # Offload + activation checkpointing to fit the 12B model per GPU at TP=1.
    cpu_offload: true
    activation_checkpointing: true
  dynamic_batching:
    enabled: true
  sequence_packing:
    enabled: false  # Mutually exclusive with dynamic batching here.
  make_sequence_length_divisible_by: 1
  generation:
    max_new_tokens: 512
    vllm_cfg:
      max_model_len: 512  # Must cover prompt + max_new_tokens for this short-seq test.
  # Warmup for 13 steps (LinearLR 0.1 -> 1.0), then hold constant;
  # the trailing `milestones` entry switches schedulers after step 13.
  scheduler:
    - name: "torch.optim.lr_scheduler.LinearLR"
      kwargs:
        start_factor: 0.1
        end_factor: 1.0
        total_iters: 13
    - name: "torch.optim.lr_scheduler.ConstantLR"
      kwargs:
        factor: 1.0
        total_iters: 10000000000  # Effectively "forever" for a 30-step test run.
    - milestones: [13]
data:
  max_input_seq_length: 512
logger:
  log_dir: logs/grpo-nano-v2-12b-2n8g-fsdp2tp1
  wandb_enabled: true
  tensorboard_enabled: true
  wandb:
    project: nemo-rl
    name: grpo-nano-v2-12b-2n8g-fsdp2tp1
cluster:
  gpus_per_node: 8
  num_nodes: 2
7 changes: 6 additions & 1 deletion nemo_rl/models/megatron/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,12 +348,17 @@ def forward_step_arbitrary_loss(
if len(multimodal_data) > 0:
position_ids = None

additional_kwargs = {}
# Mamba models currently do not support packed_seq_params
if packed_seq_params is not None:
additional_kwargs["packed_seq_params"] = packed_seq_params

with straggler_timer:
output_tensor = model(
input_ids=input_ids_cp_sharded,
position_ids=position_ids,
attention_mask=attention_mask,
packed_seq_params=packed_seq_params,
**additional_kwargs,
**multimodal_data,
)

Expand Down
8 changes: 8 additions & 0 deletions nemo_rl/models/megatron/community_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def import_model_from_hf_name(
# Keep track of defaults so can restore them to the config after loading the model
orig_tensor_model_parallel_size = model_provider.tensor_model_parallel_size
orig_pipeline_model_parallel_size = model_provider.pipeline_model_parallel_size
orig_context_parallel_size = model_provider.context_parallel_size
orig_expert_model_parallel_size = model_provider.expert_model_parallel_size
orig_expert_tensor_parallel_size = model_provider.expert_tensor_parallel_size
orig_num_layers_in_first_pipeline_stage = (
Expand All @@ -59,6 +60,7 @@ def import_model_from_hf_name(
model_provider.pipeline_model_parallel_size = megatron_config[
"pipeline_model_parallel_size"
]
model_provider.context_parallel_size = megatron_config["context_parallel_size"]
model_provider.expert_model_parallel_size = megatron_config[
"expert_model_parallel_size"
]
Expand All @@ -82,6 +84,7 @@ def import_model_from_hf_name(
config = megatron_model[0].config
config.tensor_model_parallel_size = orig_tensor_model_parallel_size
config.pipeline_model_parallel_size = orig_pipeline_model_parallel_size
config.context_parallel_size = orig_context_parallel_size
config.expert_model_parallel_size = orig_expert_model_parallel_size
config.expert_tensor_parallel_size = orig_expert_tensor_parallel_size
config.num_layers_in_first_pipeline_stage = orig_num_layers_in_first_pipeline_stage
Expand Down Expand Up @@ -122,6 +125,11 @@ def export_model_from_megatron(

# Export performs on CPU with proper distributed context
with temporary_distributed_context(backend="gloo"):
# Need to set model parallel cuda manual seed for mamba mixer
from megatron.core.tensor_parallel import model_parallel_cuda_manual_seed

model_parallel_cuda_manual_seed(0)

# Load the Megatron model
megatron_model = bridge.load_megatron_model(
input_path, skip_temp_dist_context=True
Expand Down
15 changes: 12 additions & 3 deletions nemo_rl/models/policy/megatron_policy_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ def freeze_moe_router(megatron_model):
if hasattr(model_module, "language_model"):
model_module = model_module.language_model
for layer in model_module.decoder.layers:
if hasattr(layer.mlp, "router"):
if hasattr(layer, "mlp") and hasattr(layer.mlp, "router"):
layer.mlp.router.weight.requires_grad = False

mixed_precision_wrapper = CustomFloat16Module
Expand Down Expand Up @@ -1271,12 +1271,17 @@ def forward_step_fn(
if len(multimodal_data) > 0:
position_ids = None

additional_kwargs = {}
# Mamba models currently do not support packed_seq_params
if packed_seq_params is not None:
additional_kwargs["packed_seq_params"] = packed_seq_params

output_tensor = model(
input_ids=input_ids_cp_sharded,
position_ids=position_ids,
attention_mask=attention_mask,
packed_seq_params=packed_seq_params,
**multimodal_data,
**additional_kwargs,
)

# Apply temperature scaling to logits for training
Expand Down Expand Up @@ -1550,11 +1555,15 @@ def forward_step_fn(
if len(multimodal_data) > 0:
position_ids = None

additional_kwargs = {}
if packed_seq_params is not None:
additional_kwargs["packed_seq_params"] = packed_seq_params

output_tensor = model(
input_ids=input_ids_cp_sharded,
position_ids=position_ids,
attention_mask=attention_mask,
packed_seq_params=packed_seq_params,
**additional_kwargs,
**multimodal_data,
)

Expand Down
41 changes: 41 additions & 0 deletions tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash
# Nightly GRPO test: Nemotron-Nano-v2 12B, 1 node x 8 GPUs, Megatron backend.
# Runs the experiment, dumps TensorBoard logs to JSON, and checks convergence
# and timing metrics once the target step count is reached.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
# Quote path expansions: unquoted $VAR word-splits on spaces (ShellCheck SC2086).
source "$SCRIPT_DIR/common.env"

# ===== BEGIN CONFIG =====
NUM_NODES=1
STEPS_PER_RUN=30
MAX_STEPS=30
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
NUM_MINUTES=60
# ===== END CONFIG =====

# Defined in common.env: bail out early if a previous run already hit MAX_STEPS.
exit_if_max_steps_reached

# Run the experiment
cd "$PROJECT_ROOT"
uv run examples/run_grpo_math.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps=$MAX_STEPS \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached
# (jq extracts the highest recorded step number for train/loss).
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'mean(data["train/token_mult_prob_error"]) < 1.05' \
        'data["train/token_mult_prob_error"]["30"] < 1.05' \
        'data["train/reward"]["30"] > 0.4' \
        'mean(data["timing/train/total_step_time"], -6, -1) < 80'
fi
41 changes: 41 additions & 0 deletions tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash
# Nightly GRPO test: Nemotron-Nano-v2 12B, 2 nodes x 8 GPUs, FSDP2 (DTensor) TP=1.
# Runs the experiment, dumps TensorBoard logs to JSON, and checks convergence
# and timing metrics once the target step count is reached.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
# Quote path expansions: unquoted $VAR word-splits on spaces (ShellCheck SC2086).
source "$SCRIPT_DIR/common.env"

# ===== BEGIN CONFIG =====
NUM_NODES=2
STEPS_PER_RUN=30
MAX_STEPS=30
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
NUM_MINUTES=60
# ===== END CONFIG =====

# Defined in common.env: bail out early if a previous run already hit MAX_STEPS.
exit_if_max_steps_reached

# Run the experiment
cd "$PROJECT_ROOT"
uv run examples/run_grpo_math.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps=$MAX_STEPS \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached
# (jq extracts the highest recorded step number for train/loss).
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'mean(data["train/token_mult_prob_error"]) < 1.05' \
        'data["train/token_mult_prob_error"]["30"] < 1.05' \
        'data["train/reward"]["30"] > 0.4' \
        'mean(data["timing/train/total_step_time"], -6, -1) < 60'
fi
4 changes: 4 additions & 0 deletions tests/test_suites/nightly.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ tests/test_suites/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.sh
#https://github.com/NVIDIA-NeMo/RL/issues/1374
#tests/test_suites/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.sh

# Nano-v2
tests/test_suites/llm/grpo-nano-v2-12b-1n8g-megatron.sh
tests/test_suites/llm/grpo-nano-v2-12b-2n8g-fsdp2tp1.sh

#######
# SFT #
#######
Expand Down
Loading