diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.yaml index a1aa9a62fc..f9551a954e 100644 --- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.yaml +++ b/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.yaml @@ -2,7 +2,7 @@ defaults: ../../distillation_math.yaml distillation: num_prompts_per_step: 64 max_num_steps: 20 - val_batch_size: 32 + val_batch_size: 256 val_period: 10 max_val_samples: 256 loss_fn: @@ -11,43 +11,15 @@ checkpointing: checkpoint_dir: checkpoints/distillation-qwen3-32b-to-4b-base-dynamicbatch policy: model_name: Qwen/Qwen3-4B-Base - train_global_batch_size: 32 - generation_batch_size: 32 dtensor_cfg: context_parallel_size: 1 make_sequence_length_divisible_by: 2 - scheduler: - - name: torch.optim.lr_scheduler.LinearLR - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 20 - - name: torch.optim.lr_scheduler.ConstantLR - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: - - 20 teacher: model_name: Qwen/Qwen3-32B - train_global_batch_size: 32 - generation_batch_size: 32 dtensor_cfg: tensor_parallel_size: 8 context_parallel_size: 1 make_sequence_length_divisible_by: 2 - scheduler: - - name: torch.optim.lr_scheduler.LinearLR - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 20 - - name: torch.optim.lr_scheduler.ConstantLR - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: - - 20 logger: log_dir: logs/distillation-qwen3-32b-to-4b-base-dynamicbatch wandb: diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.yaml index 0f7ebfae4d..d2b4ec620f 100644 --- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.yaml +++ b/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.yaml @@ -2,58 +2,22 @@ defaults: ../../distillation_math.yaml distillation: num_prompts_per_step: 64 max_num_steps: 500 - val_batch_size: 32 + val_batch_size: 512 val_period: 50 - max_val_samples: 256 +loss_fn: + kl_type: reverse checkpointing: checkpoint_dir: checkpoints/distillation-qwen3-32b-to-4b-base-long - save_period: 50 + save_period: 10 policy: model_name: Qwen/Qwen3-4B-Base - train_global_batch_size: 32 - generation_batch_size: 32 - max_total_sequence_length: 32768 - dynamic_batching: - enabled: false - make_sequence_length_divisible_by: 2 - optimizer: - kwargs: - lr: 1.0e-05 - scheduler: - - name: torch.optim.lr_scheduler.LinearLR - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 100 - - name: torch.optim.lr_scheduler.CosineAnnealingLR - kwargs: - T_max: 900 - eta_min: 1.0e-07 - - milestones: - - 100 + max_total_sequence_length: 20480 + generation: + vllm_cfg: + tensor_parallel_size: 2 teacher: model_name: Qwen/Qwen3-32B - train_global_batch_size: 32 - generation_batch_size: 32 - max_total_sequence_length: 32768 - dynamic_batching: - enabled: false - make_sequence_length_divisible_by: 2 - optimizer: - kwargs: - lr: 1.0e-05 - scheduler: - - name: torch.optim.lr_scheduler.LinearLR - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 100 - - name: torch.optim.lr_scheduler.CosineAnnealingLR - kwargs: - T_max: 900 - eta_min: 1.0e-07 - - milestones: - - 100 + max_total_sequence_length: 20480 logger: log_dir: logs/distillation-qwen3-32b-to-4b-base-long wandb: diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-seqpack.v1.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-seqpack.v1.yaml new file mode 100644 index 0000000000..9d7b8746dc --- /dev/null +++ b/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-seqpack.v1.yaml @@ -0,0 +1,37 @@ +defaults: ../../distillation_math.yaml +distillation: + num_prompts_per_step: 64 + max_num_steps: 20 + val_batch_size: 256 + val_period: 10 + max_val_samples: 256 +loss_fn: + kl_type: reverse +checkpointing: + checkpoint_dir: checkpoints/distillation-qwen3-32b-to-4b-base-seqpack +policy: + model_name: Qwen/Qwen3-4B-Base + dtensor_cfg: + context_parallel_size: 1 + dynamic_batching: + enabled: false + sequence_packing: + enabled: true + make_sequence_length_divisible_by: 2 +teacher: + model_name: Qwen/Qwen3-32B + dtensor_cfg: + tensor_parallel_size: 8 + context_parallel_size: 1 + dynamic_batching: + enabled: false + sequence_packing: + enabled: true + make_sequence_length_divisible_by: 2 +logger: + log_dir: logs/distillation-qwen3-32b-to-4b-base-seqpack + wandb: + project: nemo-rl + name: distillation-qwen3-32b-to-4b-base-seqpack +cluster: + num_nodes: 2 diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.yaml index 5adcbe9cab..8f1d235d69 100644 --- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.yaml +++ b/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.yaml @@ -2,7 +2,7 @@ defaults: ../../distillation_math.yaml distillation: num_prompts_per_step: 64 max_num_steps: 20 - val_batch_size: 32 + val_batch_size: 256 val_period: 10 max_val_samples: 256 loss_fn: @@ -12,29 +12,10 @@ checkpointing: save_period: 50 policy: model_name: Qwen/Qwen3-4B-Base - train_global_batch_size: 32 - generation_batch_size: 32 dtensor_cfg: tensor_parallel_size: 8 context_parallel_size: 1 - dynamic_batching: - enabled: false make_sequence_length_divisible_by: 2 - optimizer: - kwargs: - lr: 1.0e-05 - scheduler: - - name: torch.optim.lr_scheduler.LinearLR - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 100 - - name: torch.optim.lr_scheduler.CosineAnnealingLR - kwargs: - T_max: 900 - eta_min: 1.0e-07 - - milestones: - - 100 generation: colocated: enabled: false @@ -43,29 +24,10 @@ policy: num_nodes: 1 teacher: model_name: Qwen/Qwen3-32B - train_global_batch_size: 32 - generation_batch_size: 32 dtensor_cfg: tensor_parallel_size: 8 context_parallel_size: 1 - dynamic_batching: - enabled: false make_sequence_length_divisible_by: 2 - optimizer: - kwargs: - lr: 1.0e-05 - scheduler: - - name: torch.optim.lr_scheduler.LinearLR - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 100 - - name: torch.optim.lr_scheduler.CosineAnnealingLR - kwargs: - T_max: 900 - eta_min: 1.0e-07 - - milestones: - - 100 generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-instruct-2n8g-fsdp2tp2-seqpack.v1.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-instruct-2n8g-fsdp2tp2-seqpack.v1.yaml deleted file mode 100644 index b11b27fd54..0000000000 --- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-4b-instruct-2n8g-fsdp2tp2-seqpack.v1.yaml +++ /dev/null @@ -1,65 +0,0 @@ -defaults: ../../distillation_math.yaml -distillation: - num_prompts_per_step: 64 - max_num_steps: 20 - val_batch_size: 32 - val_period: 10 - max_val_samples: 256 -loss_fn: - kl_type: reverse -checkpointing: - checkpoint_dir: checkpoints/distillation-qwen3-32b-to-4b-instruct-seqpack -policy: - model_name: Qwen/Qwen3-4B-Instruct - train_global_batch_size: 32 - generation_batch_size: 32 - dtensor_cfg: - context_parallel_size: 1 - dynamic_batching: - enabled: false - sequence_packing: - enabled: true - make_sequence_length_divisible_by: 2 - scheduler: - - name: torch.optim.lr_scheduler.LinearLR - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 20 - - name: torch.optim.lr_scheduler.ConstantLR - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: - - 20 -teacher: - model_name: Qwen/Qwen3-32B - train_global_batch_size: 32 - generation_batch_size: 32 - dtensor_cfg: - tensor_parallel_size: 8 - context_parallel_size: 1 - dynamic_batching: - enabled: false - sequence_packing: - enabled: true - make_sequence_length_divisible_by: 2 - scheduler: - - name: torch.optim.lr_scheduler.LinearLR - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 20 - - name: torch.optim.lr_scheduler.ConstantLR - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: - - 20 -logger: - log_dir: logs/distillation-qwen3-32b-to-4b-instruct-seqpack - wandb: - project: nemo-rl - name: distillation-qwen3-32b-to-4b-instruct-seqpack -cluster: - num_nodes: 2 diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-8b-base-2n8g-fsdp2tp2.v1.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-8b-base-2n8g-fsdp2tp2.v1.yaml deleted file mode 100644 index 6dd08a3f66..0000000000 --- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-8b-base-2n8g-fsdp2tp2.v1.yaml +++ /dev/null @@ -1,57 +0,0 @@ -defaults: ../../distillation_math.yaml -distillation: - num_prompts_per_step: 64 - max_num_steps: 100 - val_batch_size: 32 - val_period: 10 - max_val_samples: 256 -loss_fn: - kl_type: reverse -checkpointing: - checkpoint_dir: checkpoints/distillation-qwen3-32b-to-8b-base -policy: - model_name: Qwen/Qwen3-8B-Base - train_global_batch_size: 32 - generation_batch_size: 32 - max_total_sequence_length: 16384 - make_sequence_length_divisible_by: 2 - scheduler: - - name: torch.optim.lr_scheduler.LinearLR - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 20 - - name: torch.optim.lr_scheduler.ConstantLR - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: - - 20 -teacher: - model_name: Qwen/Qwen3-32B - train_global_batch_size: 32 - generation_batch_size: 32 - max_total_sequence_length: 16384 - dtensor_cfg: - tensor_parallel_size: 8 - context_parallel_size: 1 - make_sequence_length_divisible_by: 2 - scheduler: - - name: torch.optim.lr_scheduler.LinearLR - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 20 - - name: torch.optim.lr_scheduler.ConstantLR - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: - - 20 -logger: - log_dir: logs/distillation-qwen3-32b-to-8b-base - wandb: - project: nemo-rl - name: distillation-qwen3-32b-to-8b-base -cluster: - num_nodes: 2 diff --git a/examples/configs/recipes/llm/distillation-qwen3-32b-to-8b-base-4n8g-fsdp2tp8-long.v1.yaml b/examples/configs/recipes/llm/distillation-qwen3-32b-to-8b-base-4n8g-fsdp2tp8-long.v1.yaml deleted file mode 100644 index 1da1e231b6..0000000000 --- a/examples/configs/recipes/llm/distillation-qwen3-32b-to-8b-base-4n8g-fsdp2tp8-long.v1.yaml +++ /dev/null @@ -1,64 +0,0 @@ -defaults: ../../distillation_math.yaml -distillation: - num_prompts_per_step: 64 - max_num_steps: 500 - val_batch_size: 32 - val_period: 50 - max_val_samples: 256 -loss_fn: - kl_type: reverse -checkpointing: - checkpoint_dir: checkpoints/distillation-qwen3-32b-to-8b-base-long - save_period: 50 -policy: - model_name: Qwen/Qwen3-8B-Base - train_global_batch_size: 32 - generation_batch_size: 32 - max_total_sequence_length: 32768 - dtensor_cfg: - tensor_parallel_size: 8 - context_parallel_size: 1 - make_sequence_length_divisible_by: 4 - optimizer: - kwargs: - lr: 1.0e-05 - scheduler: - - name: torch.optim.lr_scheduler.LinearLR - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 150 - - name: torch.optim.lr_scheduler.CosineAnnealingLR - kwargs: - T_max: 1350 - eta_min: 1.0e-07 - - milestones: - - 150 -teacher: - model_name: Qwen/Qwen3-32B - train_global_batch_size: 32 - generation_batch_size: 32 - max_total_sequence_length: 32768 - make_sequence_length_divisible_by: 4 - optimizer: - kwargs: - lr: 1.0e-05 - scheduler: - - name: torch.optim.lr_scheduler.LinearLR - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 150 - - name: torch.optim.lr_scheduler.CosineAnnealingLR - kwargs: - T_max: 1350 - eta_min: 1.0e-07 - - milestones: - - 150 -logger: - log_dir: logs/distillation-qwen3-32b-to-8b-base-long - wandb: - project: nemo-rl - name: distillation-qwen3-32b-to-8b-base-long -cluster: - num_nodes: 4 diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.sh index 0573d0bcba..52f17c2c28 100755 --- a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.sh +++ b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.sh @@ -7,7 +7,7 @@ NUM_NODES=1 STEPS_PER_RUN=20 MAX_STEPS=20 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=240 +NUM_MINUTES=120 # ===== END CONFIG ===== exit_if_max_steps_reached @@ -35,7 +35,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["1"] < 1.5' \ - 'data["train/loss"]["20"] < 0.5' \ - 'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \ + 'data["train/loss"]["20"] < 0.3' \ + 'data["validation/accuracy"]["20"] > 0.1' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 1000' fi diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.sh index 0b759078a3..cd4b635e72 100755 --- a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.sh +++ b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.sh @@ -4,10 +4,10 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=2 -STEPS_PER_RUN=100 -MAX_STEPS=500 +STEPS_PER_RUN=50 +MAX_STEPS=100 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=1200 +NUM_MINUTES=240 # ===== END CONFIG ===== exit_if_max_steps_reached @@ -35,7 +35,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["1"] < 1.5' \ - 'data["train/loss"]["100"] < 0.3' \ - 'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \ + 'data["train/loss"]["100"] < 0.25' \ + 'data["validation/accuracy"]["100"] > 0.2' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 1600' fi diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-instruct-2n8g-fsdp2tp2-seqpack.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-seqpack.v1.sh similarity index 91% rename from tests/test_suites/llm/distillation-qwen3-32b-to-4b-instruct-2n8g-fsdp2tp2-seqpack.v1.sh rename to tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-seqpack.v1.sh index a366f77ac6..df8d6daed7 100755 --- a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-instruct-2n8g-fsdp2tp2-seqpack.v1.sh +++ b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-seqpack.v1.sh @@ -7,7 +7,7 @@ NUM_NODES=2 STEPS_PER_RUN=20 MAX_STEPS=20 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=300 +NUM_MINUTES=120 # ===== END CONFIG ===== exit_if_max_steps_reached @@ -35,7 +35,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["1"] < 1.5' \ - 'data["train/loss"]["20"] < 0.5' \ - 'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \ + 'data["train/loss"]["20"] < 0.3' \ + 'data["validation/accuracy"]["20"] > 0.1' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 1000' fi diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.sh index 463dc3c3b1..df8d6daed7 100755 --- a/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.sh +++ b/tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.sh @@ -7,7 +7,7 @@ NUM_NODES=2 STEPS_PER_RUN=20 MAX_STEPS=20 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=240 +NUM_MINUTES=120 # ===== END CONFIG ===== exit_if_max_steps_reached @@ -35,7 +35,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["1"] < 1.5' \ - 'data["train/loss"]["20"] < 0.5' \ - 'max(data["ray/node.0.gpu.0.mem_gb"]) < 70' \ + 'data["train/loss"]["20"] < 0.3' \ + 'data["validation/accuracy"]["20"] > 0.1' \ 'mean(data["timing/train/total_step_time"], -6, -1) < 1000' fi diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-8b-base-2n8g-fsdp2tp2.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-8b-base-2n8g-fsdp2tp2.v1.sh deleted file mode 100755 index 9705c8e155..0000000000 --- a/tests/test_suites/llm/distillation-qwen3-32b-to-8b-base-2n8g-fsdp2tp2.v1.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -source $SCRIPT_DIR/common.env - -# ===== BEGIN CONFIG ===== -NUM_NODES=2 -STEPS_PER_RUN=100 -MAX_STEPS=100 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=240 -# ===== END CONFIG ===== - -exit_if_max_steps_reached - -# Run the experiment -cd $PROJECT_ROOT -uv run examples/run_distillation_math.py \ - --config $CONFIG_PATH \ - distillation.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl-distillation \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 1.5' \ - 'data["train/loss"]["100"] < 0.5' \ - 'max(data["ray/node.0.gpu.0.mem_gb"]) < 80' \ - 'mean(data["timing/train/total_step_time"], -6, -1) < 500' -fi diff --git a/tests/test_suites/llm/distillation-qwen3-32b-to-8b-base-4n8g-fsdp2tp8-long.v1.sh b/tests/test_suites/llm/distillation-qwen3-32b-to-8b-base-4n8g-fsdp2tp8-long.v1.sh deleted file mode 100755 index 2686ebe281..0000000000 --- a/tests/test_suites/llm/distillation-qwen3-32b-to-8b-base-4n8g-fsdp2tp8-long.v1.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -source $SCRIPT_DIR/common.env - -# ===== BEGIN CONFIG ===== -NUM_NODES=4 -STEPS_PER_RUN=100 -MAX_STEPS=500 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=1200 -# ===== END CONFIG ===== - -exit_if_max_steps_reached - -# Run the experiment -cd $PROJECT_ROOT -uv run examples/run_distillation_math.py \ - --config $CONFIG_PATH \ - distillation.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl-distillation \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 1.5' \ - 'data["train/loss"]["100"] < 0.3' \ - 'max(data["ray/node.0.gpu.0.mem_gb"]) < 80' \ - 'mean(data["timing/train/total_step_time"], -6, -1) < 1600' -fi diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt index 83cf4009ba..2eddf0011d 100644 --- a/tests/test_suites/release.txt +++ b/tests/test_suites/release.txt @@ -40,14 +40,10 @@ tests/test_suites/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.sh # Distillation # ################ -# 100 step 4b convergence -tests/test_suites/llm/distillation-qwen3-32b-to-8b-base-2n8g-fsdp2tp2.v1.sh - -# Long 4b and 8b convergence +# Long 4b convergence tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.sh -tests/test_suites/llm/distillation-qwen3-32b-to-8b-base-4n8g-fsdp2tp8-long.v1.sh # 20 step functional tests on dynamic batching, non-colocated and seqence packing features tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-1n8g-fsdp2tp2-dynamicbatch.v1.sh tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp8-noncolocated.v1.sh -tests/test_suites/llm/distillation-qwen3-32b-to-4b-instruct-2n8g-fsdp2tp2-seqpack.v1.sh +tests/test_suites/llm/distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-seqpack.v1.sh