Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# SFT + LoRA recipe for Nemotron 3 Nano 30B A3B Base (BF16) on 2 nodes x 8 GPUs.
# Overrides the base config pulled in via `defaults`.
# NOTE(review): nesting indentation appears lost in this rendering — the keys
# below are children of their section headers in the real file; verify on disk.
defaults: ../../sft.yaml
sft:
# Short run with checkpointing disabled (recipe is used as a CI smoke test).
max_num_steps: 100
checkpointing:
enabled: false
policy:
model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
train_global_batch_size: 16
max_total_sequence_length: 2048
dtensor_cfg:
lora_cfg:
enabled: true
# LoRA rank 256 with alpha = 2 * rank; Triton kernels disabled.
dim: 256
alpha: 512
use_triton: false
logger:
wandb:
project: nemo-rl
# The same run identifier is used for the wandb name, tensorboard log dir,
# and mlflow run name so the three sinks line up.
name: sft-nanov3-30BA3B-2n8g-fsdp2-lora
tensorboard:
log_dir: tb_logs-sft-nanov3-30BA3B-2n8g-fsdp2-lora
mlflow:
run_name: sft-nanov3-30BA3B-2n8g-fsdp2-lora
cluster:
gpus_per_node: 8
num_nodes: 2
20 changes: 20 additions & 0 deletions examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# SFT (full fine-tune, no LoRA) recipe for Nemotron 3 Nano 30B A3B Base (BF16)
# on 2 nodes x 8 GPUs. Overrides the base config pulled in via `defaults`.
# NOTE(review): nesting indentation appears lost in this rendering — the keys
# below are children of their section headers in the real file; verify on disk.
defaults: ../../sft.yaml
sft:
# Short run with checkpointing disabled (recipe is used as a CI smoke test).
max_num_steps: 100
checkpointing:
enabled: false
policy:
model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
train_global_batch_size: 16
max_total_sequence_length: 2048
logger:
wandb:
project: nemo-rl
# The same run identifier is used for the wandb name, tensorboard log dir,
# and mlflow run name so the three sinks line up.
name: sft-nanov3-30BA3B-2n8g-fsdp2
tensorboard:
log_dir: tb_logs-sft-nanov3-30BA3B-2n8g-fsdp2
mlflow:
run_name: sft-nanov3-30BA3B-2n8g-fsdp2
cluster:
gpus_per_node: 8
num_nodes: 2
38 changes: 38 additions & 0 deletions tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash
# Nightly test: SFT + LoRA of Nemotron 3 Nano 30B A3B (BF16) on 2 nodes, FSDP2.
# Runs the experiment, dumps TensorBoard metrics to JSON, then checks loss and
# step-time thresholds once the target step count has been reached.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
source "$SCRIPT_DIR/common.env"

# ===== BEGIN CONFIG =====
NUM_NODES=2
STEPS_PER_RUN=20 # step_time ~ 10sec
MAX_STEPS=20
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
NUM_MINUTES=15 # Usually 15 minutes is enough for 20 steps, but we add a buffer of 3 minutes in metrics check
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment.
# "$@" is quoted so extra CLI overrides are forwarded without word-splitting.
cd "$PROJECT_ROOT"
uv run examples/run_sft.py \
    --config "$CONFIG_PATH" \
    sft.max_num_steps=$MAX_STEPS \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached.
# Default to 0 when "train/loss" is absent (jq then prints nothing), which
# would otherwise make the [[ ... -ge ... ]] comparison a bash syntax error.
last_step=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS")
if [[ ${last_step:-0} -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'data["train/loss"]["20"] < 2.03' \
        'mean(data["timing/train/total_step_time"], 2) < 18'
fi
38 changes: 38 additions & 0 deletions tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash
# Nightly test: full SFT of Nemotron 3 Nano 30B A3B (BF16) on 2 nodes, FSDP2.
# Runs the experiment, dumps TensorBoard metrics to JSON, then checks loss and
# step-time thresholds once the target step count has been reached.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
source "$SCRIPT_DIR/common.env"

# ===== BEGIN CONFIG =====
NUM_NODES=2
STEPS_PER_RUN=20 # step_time ~ 15sec
MAX_STEPS=20
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
NUM_MINUTES=15
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment.
# "$@" is quoted so extra CLI overrides are forwarded without word-splitting.
cd "$PROJECT_ROOT"
uv run examples/run_sft.py \
    --config "$CONFIG_PATH" \
    sft.max_num_steps=$MAX_STEPS \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached.
# Default to 0 when "train/loss" is absent (jq then prints nothing), which
# would otherwise make the [[ ... -ge ... ]] comparison a bash syntax error.
last_step=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS")
if [[ ${last_step:-0} -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'data["train/loss"]["20"] < 1.98' \
        'mean(data["timing/train/total_step_time"], 2) < 15'
fi
4 changes: 4 additions & 0 deletions tests/test_suites/nightly.txt
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh
# gpt-oss 20b DeepEP test
tests/test_suites/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.sh

# Nemotron 3 Nano 30B A3B Base BF16 tests
tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh
tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh

#######
# DPO #
#######
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/test_recipes_and_test_suites.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def test_all_recipe_yamls_accounted_for_in_test_suites(
)


def test_nightly_compute_stays_below_1130_hours(nightly_test_suite, tracker):
def test_nightly_compute_stays_below_1140_hours(nightly_test_suite, tracker):
command = f"DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=... CONTAINER= ACCOUNT= PARTITION= ./tools/launch {' '.join(nightly_test_suite)}"

print(f"Running command: {command}")
Expand Down Expand Up @@ -212,8 +212,8 @@ def test_nightly_compute_stays_below_1130_hours(nightly_test_suite, tracker):
f"Last line of output was not as expected: '{last_line}'"
)
total_gpu_hours = float(last_line.split(":")[-1].strip())
assert total_gpu_hours <= 1130, (
f"Total GPU hours exceeded 1130: {last_line}. We should revisit the test suites to reduce the total GPU hours."
assert total_gpu_hours <= 1140, (
f"Total GPU hours exceeded 1140: {last_line}. We should revisit the test suites to reduce the total GPU hours."
)
tracker.track("total_nightly_gpu_hours", total_gpu_hours)

Expand Down
Loading