From 4daf34689cad99c3f8de29502b78b641e098c450 Mon Sep 17 00:00:00 2001 From: ruit Date: Wed, 17 Dec 2025 01:42:09 -0800 Subject: [PATCH 01/10] add nano v3 nightly test Signed-off-by: ruit --- .../sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml | 26 ++++++++++++ .../llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml | 22 ++++++++++ .../llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh | 40 +++++++++++++++++++ .../llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh | 40 +++++++++++++++++++ tests/test_suites/nightly.txt | 4 ++ 5 files changed, 132 insertions(+) create mode 100644 examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml create mode 100644 examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml create mode 100755 tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh create mode 100755 tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh diff --git a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml new file mode 100644 index 0000000000..471419a8f7 --- /dev/null +++ b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml @@ -0,0 +1,26 @@ +defaults: ../../sft.yaml +sft: + max_num_steps: 100 +checkpointing: + enabled: false +policy: + model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 + train_global_batch_size: 16 + max_total_sequence_length: 2048 + dtensor_cfg: + lora_cfg: + enabled: true + use_triton: false + optimizer: + name: torch.optim.Adam +logger: + wandb: + project: nemo-rl + name: sft-nanov3-30BA3B-2n8g-fsdp2 + tensorboard: + log_dir: tb_logs-sft-nanov3-30BA3B-2n8g-fsdp2 + mlflow: + run_name: sft-nanov3-30BA3B-2n8g-fsdp2 +cluster: + gpus_per_node: 8 + num_nodes: 2 diff --git a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml new file mode 100644 index 0000000000..131c7aa873 --- /dev/null +++ b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml @@ -0,0 +1,22 @@ +defaults: 
../../sft.yaml +sft: + max_num_steps: 100 +checkpointing: + enabled: false +policy: + model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 + train_global_batch_size: 16 + max_total_sequence_length: 2048 + optimizer: + name: torch.optim.Adam +logger: + wandb: + project: nemo-rl + name: sft-nanov3-30BA3B-2n8g-fsdp2 + tensorboard: + log_dir: tb_logs-sft-nanov3-30BA3B-2n8g-fsdp2 + mlflow: + run_name: sft-nanov3-30BA3B-2n8g-fsdp2 +cluster: + gpus_per_node: 8 + num_nodes: 2 diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh new file mode 100755 index 0000000000..cd46170061 --- /dev/null +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=2 +STEPS_PER_RUN=50 # step_time ~ 9sec +MAX_STEPS=50 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=30 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + ~policy.tokenizer.chat_template \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["50"] < 0.375' \ + 
'data["validation/val_loss"]["50"] < 0.271' +fi diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh new file mode 100755 index 0000000000..0e3d47c170 --- /dev/null +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=2 +STEPS_PER_RUN=50 # step_time ~ 8sec +MAX_STEPS=50 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=15 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + ~policy.tokenizer.chat_template \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["50"] < 0.306' \ + 'data["validation/val_loss"]["50"] < 0.25' +fi diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index 7d830fbfd5..e95507105a 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -90,6 +90,10 @@ tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh # gpt-oss 20b DeepEP test tests/test_suites/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.sh +# Nemotron 3 Nano 30B A3B Base BF16 tests 
+tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh +tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh + ####### # DPO # ####### From 703e28ebc1dbc09d9334b092c08d6ea5559d45fd Mon Sep 17 00:00:00 2001 From: ruit Date: Wed, 17 Dec 2025 18:55:15 -0800 Subject: [PATCH 02/10] refresh nano v3 recipe metric Signed-off-by: ruit --- tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh | 4 ++-- tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh index cd46170061..77ae549da4 100755 --- a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh @@ -35,6 +35,6 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["50"] < 0.375' \ - 'data["validation/val_loss"]["50"] < 0.271' + 'data["train/loss"]["50"] < 2.32' \ + 'data["validation/val_loss"]["50"] < 2.35' fi diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh index 0e3d47c170..a5f715598d 100755 --- a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh @@ -35,6 +35,6 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["50"] < 0.306' \ - 'data["validation/val_loss"]["50"] < 0.25' + 
'data["train/loss"]["50"] < 1.34' \ + 'data["validation/val_loss"]["50"] < 1.41' fi From 70bc9095154b0ff1982079d9a1dac7cf858b81d2 Mon Sep 17 00:00:00 2001 From: ruit Date: Thu, 18 Dec 2025 18:53:37 -0800 Subject: [PATCH 03/10] update metrics Signed-off-by: ruit --- .../llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh | 10 +++++----- tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh index 77ae549da4..bb20913a8b 100755 --- a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh @@ -4,10 +4,10 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=2 -STEPS_PER_RUN=50 # step_time ~ 9sec -MAX_STEPS=50 +STEPS_PER_RUN=20 # step_time ~ 8sec +MAX_STEPS=20 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=30 +NUM_MINUTES=15 # ===== END CONFIG ===== exit_if_max_steps_reached @@ -35,6 +35,6 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["50"] < 2.32' \ - 'data["validation/val_loss"]["50"] < 2.35' + 'data["train/loss"]["20"] < 4.14' \ + 'data["train/loss"]["0"] < 5.28' fi diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh index a5f715598d..1189302b3c 100755 --- a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh @@ -4,8 +4,8 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=2 -STEPS_PER_RUN=50 # step_time ~ 8sec -MAX_STEPS=50 
+STEPS_PER_RUN=20 # step_time ~ 15sec +MAX_STEPS=20 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=15 # ===== END CONFIG ===== @@ -35,6 +35,6 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["50"] < 1.34' \ - 'data["validation/val_loss"]["50"] < 1.41' + 'data["train/loss"]["20"] < 3.14' \ + 'data["train/loss"]["0"] < 5.28' fi From a1729a1f77a0ff8d677380ebb7b1503f1a391aaa Mon Sep 17 00:00:00 2001 From: ruit Date: Sun, 21 Dec 2025 21:31:00 -0800 Subject: [PATCH 04/10] fix lora recipe Signed-off-by: ruit --- tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh | 5 ++--- tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh index bb20913a8b..51f8ef8fec 100755 --- a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh @@ -23,7 +23,6 @@ uv run examples/run_sft.py \ logger.wandb.name=$EXP_NAME \ logger.monitor_gpus=True \ logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ checkpointing.checkpoint_dir=$CKPT_DIR \ ~policy.tokenizer.chat_template \ $@ \ @@ -35,6 +34,6 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["20"] < 4.14' \ - 'data["train/loss"]["0"] < 5.28' + 'data["train/loss"]["20"] < 4.20' \ + 
'mean(data["timing/train/total_step_time"], 2) < 15' fi diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh index 1189302b3c..71cfefa374 100755 --- a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh @@ -23,7 +23,6 @@ uv run examples/run_sft.py \ logger.wandb.name=$EXP_NAME \ logger.monitor_gpus=True \ logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ checkpointing.checkpoint_dir=$CKPT_DIR \ ~policy.tokenizer.chat_template \ $@ \ @@ -35,6 +34,6 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["20"] < 3.14' \ - 'data["train/loss"]["0"] < 5.28' + 'data["train/loss"]["20"] < 3.20' \ + 'mean(data["timing/train/total_step_time"], 2) < 15' fi From d1ccee7b6ec520514e4744839ef2001351176422 Mon Sep 17 00:00:00 2001 From: ruit Date: Mon, 22 Dec 2025 21:19:00 -0800 Subject: [PATCH 05/10] change model to base model Signed-off-by: ruit --- .../configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml | 2 +- examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml | 2 +- tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh | 1 - tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh | 1 - 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml index 471419a8f7..59c6f906e5 100644 --- a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml +++ b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml @@ -4,7 +4,7 @@ sft: checkpointing: enabled: false policy: - model_name: 
nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 + model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 train_global_batch_size: 16 max_total_sequence_length: 2048 dtensor_cfg: diff --git a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml index 131c7aa873..8e5d2709ec 100644 --- a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml +++ b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml @@ -4,7 +4,7 @@ sft: checkpointing: enabled: false policy: - model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 + model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 train_global_batch_size: 16 max_total_sequence_length: 2048 optimizer: diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh index 51f8ef8fec..a570c654d0 100755 --- a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh @@ -24,7 +24,6 @@ uv run examples/run_sft.py \ logger.monitor_gpus=True \ logger.tensorboard_enabled=True \ checkpointing.checkpoint_dir=$CKPT_DIR \ - ~policy.tokenizer.chat_template \ $@ \ 2>&1 | tee $RUN_LOG diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh index 71cfefa374..d1b052af5b 100755 --- a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh @@ -24,7 +24,6 @@ uv run examples/run_sft.py \ logger.monitor_gpus=True \ logger.tensorboard_enabled=True \ checkpointing.checkpoint_dir=$CKPT_DIR \ - ~policy.tokenizer.chat_template \ $@ \ 2>&1 | tee $RUN_LOG From 46c82706b04645cf1e81f25e3fe4bfed47f82947 Mon Sep 17 00:00:00 2001 From: ruit Date: Mon, 22 Dec 2025 21:28:37 -0800 Subject: [PATCH 06/10] update wandb name Signed-off-by: ruit --- .../recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml index 59c6f906e5..d6cb43a2ef 100644 --- a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml +++ b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml @@ -16,11 +16,11 @@ policy: logger: wandb: project: nemo-rl - name: sft-nanov3-30BA3B-2n8g-fsdp2 + name: sft-nanov3-30BA3B-2n8g-fsdp2-lora tensorboard: - log_dir: tb_logs-sft-nanov3-30BA3B-2n8g-fsdp2 + log_dir: tb_logs-sft-nanov3-30BA3B-2n8g-fsdp2-lora mlflow: - run_name: sft-nanov3-30BA3B-2n8g-fsdp2 + run_name: sft-nanov3-30BA3B-2n8g-fsdp2-lora cluster: gpus_per_node: 8 num_nodes: 2 From ae05d23ad512ef314ab976a7b6afe59d11918174 Mon Sep 17 00:00:00 2001 From: ruit Date: Mon, 22 Dec 2025 21:33:29 -0800 Subject: [PATCH 07/10] update nightly test time Signed-off-by: ruit --- tests/unit/test_recipes_and_test_suites.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py index ade6d49d87..bea4897459 100644 --- a/tests/unit/test_recipes_and_test_suites.py +++ b/tests/unit/test_recipes_and_test_suites.py @@ -180,7 +180,7 @@ def test_all_recipe_yamls_accounted_for_in_test_suites( ) -def test_nightly_compute_stays_below_1130_hours(nightly_test_suite, tracker): +def test_nightly_compute_stays_below_1140_hours(nightly_test_suite, tracker): command = f"DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=... CONTAINER= ACCOUNT= PARTITION= ./tools/launch {' '.join(nightly_test_suite)}" print(f"Running command: {command}") @@ -212,8 +212,8 @@ def test_nightly_compute_stays_below_1130_hours(nightly_test_suite, tracker): f"Last line of output was not as expected: '{last_line}'" ) total_gpu_hours = float(last_line.split(":")[-1].strip()) - assert total_gpu_hours <= 1130, ( - f"Total GPU hours exceeded 1130: {last_line}. 
We should revisit the test suites to reduce the total GPU hours." + assert total_gpu_hours <= 1140, ( + f"Total GPU hours exceeded 1140: {last_line}. We should revisit the test suites to reduce the total GPU hours." ) tracker.track("total_nightly_gpu_hours", total_gpu_hours) From 327d8706d9c7304b72643e21b12a9268a35a1848 Mon Sep 17 00:00:00 2001 From: ruit Date: Tue, 23 Dec 2025 04:26:05 -0800 Subject: [PATCH 08/10] change optimizer to adamW Signed-off-by: ruit --- .../configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml | 2 -- examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml | 2 -- tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh | 2 +- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml index d6cb43a2ef..7fe62c16b6 100644 --- a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml +++ b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml @@ -11,8 +11,6 @@ policy: lora_cfg: enabled: true use_triton: false - optimizer: - name: torch.optim.Adam logger: wandb: project: nemo-rl diff --git a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml index 8e5d2709ec..6639de189b 100644 --- a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml +++ b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml @@ -7,8 +7,6 @@ policy: model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 train_global_batch_size: 16 max_total_sequence_length: 2048 - optimizer: - name: torch.optim.Adam logger: wandb: project: nemo-rl diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh index a570c654d0..06cc510707 100755 --- a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh +++ 
b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh @@ -4,7 +4,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=2 -STEPS_PER_RUN=20 # step_time ~ 8sec +STEPS_PER_RUN=20 # step_time ~ 10sec MAX_STEPS=20 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=15 From 0a0f4428adb05368e045a166acc11cf1d0877fab Mon Sep 17 00:00:00 2001 From: ruit Date: Tue, 23 Dec 2025 18:00:24 -0800 Subject: [PATCH 09/10] update lora config Signed-off-by: ruit --- .../configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml index 7fe62c16b6..846156ea2a 100644 --- a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml +++ b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml @@ -10,6 +10,8 @@ policy: dtensor_cfg: lora_cfg: enabled: true + dim: 128 + alpha: 256 use_triton: false logger: wandb: From 83582e74e015cafe8d5e810f1ec3377ce1c6d6ee Mon Sep 17 00:00:00 2001 From: ruit Date: Tue, 23 Dec 2025 19:27:46 -0800 Subject: [PATCH 10/10] update lora config Signed-off-by: ruit --- .../recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml | 4 ++-- tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh | 6 +++--- tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml index 846156ea2a..d13b13f92a 100644 --- a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml +++ b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml @@ -10,8 +10,8 @@ policy: dtensor_cfg: lora_cfg: enabled: true - dim: 128 - alpha: 256 + dim: 256 + alpha: 512 use_triton: false logger: wandb: diff --git 
a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh index 06cc510707..f20120f158 100755 --- a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh @@ -7,7 +7,7 @@ NUM_NODES=2 STEPS_PER_RUN=20 # step_time ~ 10sec MAX_STEPS=20 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=15 +NUM_MINUTES=15 # 15 minutes is usually enough for 20 steps; the step-time metric check below allows a 3-second buffer (mean step time < 18s instead of < 15s) # ===== END CONFIG ===== exit_if_max_steps_reached @@ -33,6 +33,6 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["20"] < 4.20' \ - 'mean(data["timing/train/total_step_time"], 2) < 15' + 'data["train/loss"]["20"] < 2.03' \ + 'mean(data["timing/train/total_step_time"], 2) < 18' fi diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh index d1b052af5b..90eda7713f 100755 --- a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh +++ b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh @@ -33,6 +33,6 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["20"] < 3.20' \ + 'data["train/loss"]["20"] < 1.98' \ 'mean(data["timing/train/total_step_time"], 2) < 15' fi