Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ checkpointing:
checkpoint_dir: results/dapo-qwen2.5-7b
keep_top_k: 5
save_period: 5
model_save_format: "dcp"
model_save_format: null
policy:
model_name: Qwen/Qwen2.5-Math-7B
hf_config_overrides:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
uv run tests/check_metrics.py $JSON_METRICS \
'data["train/loss"]["1"] < 3.6' \
'data["train/loss"]["1"] < 3.65' \
'data["train/loss"]["150"] < 3.0' \
'data["train/preference_loss"]["1"] > 0.69314' \
'data["train/preference_loss"]["1"] < 0.69316' \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
uv run tests/check_metrics.py $JSON_METRICS \
'data["train/loss"]["1"] < 3.6' \
'data["train/loss"]["1"] < 3.65' \
'data["train/loss"]["150"] < 3.0' \
'data["train/preference_loss"]["1"] > 0.69314' \
'data["train/preference_loss"]["1"] < 0.69316' \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ NUM_NODES=16
STEPS_PER_RUN=10
MAX_STEPS=10
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
NUM_MINUTES=100
NUM_MINUTES=115
# ===== END CONFIG =====

exit_if_max_steps_reached
Expand All @@ -24,6 +24,7 @@ uv run examples/run_grpo_math.py \
logger.wandb.project=nemo-rl \
logger.wandb.name=$EXP_NAME \
logger.monitor_gpus=True \
logger.tensorboard_enabled=True \
checkpointing.enabled=True \
checkpointing.checkpoint_dir=$CKPT_DIR \
$@ \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ uv run examples/run_grpo_math.py \
logger.wandb.project=nemo-rl \
logger.wandb.name=$EXP_NAME \
logger.monitor_gpus=True \
logger.tensorboard_enabled=True \
checkpointing.enabled=True \
checkpointing.checkpoint_dir=$CKPT_DIR \
$@ \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,5 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma
uv run tests/check_metrics.py $JSON_METRICS \
'data["train/loss"]["1"] < 0.55' \
'data["train/loss"]["300"] < 0.285' \
'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \
'mean(data["timing/train/total_step_time"], 2) < 20'
fi
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
# TODO: the memory check is known to OOM. see https://github.com/NVIDIA-NeMo/RL/issues/263
# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
# Last observed memory around 72.6 (But can be noisy)
uv run tests/check_metrics.py $JSON_METRICS \
'data["train/loss"]["1"] < 0.6' \
'data["train/loss"]["250"] < 0.36' \
'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \
'max(data["ray/node.0.gpu.0.mem_gb"]) < 75' \
'mean(data["timing/train/total_step_time"], -6, -1) < 10'
fi
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ NUM_NODES=1
STEPS_PER_RUN=250
MAX_STEPS=250
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
NUM_MINUTES=15
NUM_MINUTES=30
# ===== END CONFIG =====

exit_if_max_steps_reached
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ NUM_NODES=2
STEPS_PER_RUN=20 # step_time ~ 10sec
MAX_STEPS=20
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
NUM_MINUTES=15 # Usually 15 minutes is enough for 20 steps, but we add a buffer of 3 minutes in metrics check
NUM_MINUTES=30 # Usually 15 minutes is enough for 20 steps, but we add a buffer of 3 minutes in metrics check (30min to buffer for initial ckpt download)
# ===== END CONFIG =====

exit_if_max_steps_reached
Expand Down
2 changes: 1 addition & 1 deletion tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ NUM_NODES=2
STEPS_PER_RUN=20 # step_time ~ 15sec
MAX_STEPS=20
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
NUM_MINUTES=15
NUM_MINUTES=30
# ===== END CONFIG =====

exit_if_max_steps_reached
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/test_recipes_and_test_suites.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def test_all_recipe_yamls_accounted_for_in_test_suites(
)


def test_nightly_compute_stays_below_1140_hours(nightly_test_suite, tracker):
def test_nightly_compute_stays_below_1180_hours(nightly_test_suite, tracker):
command = f"DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=... CONTAINER= ACCOUNT= PARTITION= ./tools/launch {' '.join(nightly_test_suite)}"

print(f"Running command: {command}")
Expand Down Expand Up @@ -212,8 +212,8 @@ def test_nightly_compute_stays_below_1140_hours(nightly_test_suite, tracker):
f"Last line of output was not as expected: '{last_line}'"
)
total_gpu_hours = float(last_line.split(":")[-1].strip())
assert total_gpu_hours <= 1140, (
f"Total GPU hours exceeded 1140: {last_line}. We should revisit the test suites to reduce the total GPU hours."
assert total_gpu_hours <= 1180, (
f"Total GPU hours exceeded 1180: {last_line}. We should revisit the test suites to reduce the total GPU hours."
)
tracker.track("total_nightly_gpu_hours", total_gpu_hours)

Expand Down
Loading