diff --git a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml new file mode 100644 index 0000000000..d13b13f92a --- /dev/null +++ b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml @@ -0,0 +1,26 @@ +defaults: ../../sft.yaml +sft: + max_num_steps: 100 +checkpointing: + enabled: false +policy: + model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 + train_global_batch_size: 16 + max_total_sequence_length: 2048 + dtensor_cfg: + lora_cfg: + enabled: true + dim: 256 + alpha: 512 + use_triton: false +logger: + wandb: + project: nemo-rl + name: sft-nanov3-30BA3B-2n8g-fsdp2-lora + tensorboard: + log_dir: tb_logs-sft-nanov3-30BA3B-2n8g-fsdp2-lora + mlflow: + run_name: sft-nanov3-30BA3B-2n8g-fsdp2-lora +cluster: + gpus_per_node: 8 + num_nodes: 2 diff --git a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml new file mode 100644 index 0000000000..6639de189b --- /dev/null +++ b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml @@ -0,0 +1,20 @@ +defaults: ../../sft.yaml +sft: + max_num_steps: 100 +checkpointing: + enabled: false +policy: + model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 + train_global_batch_size: 16 + max_total_sequence_length: 2048 +logger: + wandb: + project: nemo-rl + name: sft-nanov3-30BA3B-2n8g-fsdp2 + tensorboard: + log_dir: tb_logs-sft-nanov3-30BA3B-2n8g-fsdp2 + mlflow: + run_name: sft-nanov3-30BA3B-2n8g-fsdp2 +cluster: + gpus_per_node: 8 + num_nodes: 2 diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh new file mode 100755 index 0000000000..f20120f158 --- /dev/null +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh @@ -0,0 +1,38 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=2 +STEPS_PER_RUN=20 # step_time ~ 10sec +MAX_STEPS=20 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=15 # Usually 15 minutes is enough for 20 steps, but we add a buffer of 3 minutes in metrics check +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["20"] < 2.03' \ + 'mean(data["timing/train/total_step_time"], 2) < 18' +fi diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh new file mode 100755 index 0000000000..90eda7713f --- /dev/null +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh @@ -0,0 +1,38 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=2 +STEPS_PER_RUN=20 # step_time ~ 15sec +MAX_STEPS=20 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=15 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["20"] < 1.98' \ + 'mean(data["timing/train/total_step_time"], 2) < 15' +fi diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index 7d830fbfd5..e95507105a 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -90,6 +90,10 @@ tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh # gpt-oss 20b DeepEP test tests/test_suites/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.sh +# Nemotron 3 Nano 30B A3B Base BF16 tests +tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh +tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh + ####### # DPO # ####### diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py index ade6d49d87..bea4897459 100644 --- a/tests/unit/test_recipes_and_test_suites.py +++ b/tests/unit/test_recipes_and_test_suites.py @@ -180,7 +180,7 @@ def test_all_recipe_yamls_accounted_for_in_test_suites( ) -def test_nightly_compute_stays_below_1130_hours(nightly_test_suite, tracker): +def test_nightly_compute_stays_below_1140_hours(nightly_test_suite, tracker): command = f"DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=... CONTAINER= ACCOUNT= PARTITION= ./tools/launch {' '.join(nightly_test_suite)}" print(f"Running command: {command}") @@ -212,8 +212,8 @@ def test_nightly_compute_stays_below_1130_hours(nightly_test_suite, tracker): f"Last line of output was not as expected: '{last_line}'" ) total_gpu_hours = float(last_line.split(":")[-1].strip()) - assert total_gpu_hours <= 1130, ( - f"Total GPU hours exceeded 1130: {last_line}. We should revisit the test suites to reduce the total GPU hours." + assert total_gpu_hours <= 1140, ( + f"Total GPU hours exceeded 1140: {last_line}. We should revisit the test suites to reduce the total GPU hours." ) tracker.track("total_nightly_gpu_hours", total_gpu_hours)