From c35d0f4591b6ff5921d70cc7c93d041630932b00 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Thu, 18 Dec 2025 16:04:58 -0800 Subject: [PATCH 1/9] Perf recipe for v0.5 Signed-off-by: Guyue Huang --- ...oonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml | 7 ------ .../performance/grpo-deepseek-v3-32n4g.yaml | 21 ++++++++++++++++++ .../grpo-deepseek-v3-64n4g-async-1off.yaml | 22 +++++++++++++++++++ ...grpo-deepseek-v3-64n8g-fp8-async-1off.yaml | 21 ++++++++++++++++++ ...-llama3.1-8b-instruct-2n8g-async-1off.yaml | 2 ++ ...ma3.1-8b-instruct-2n8g-fp8-async-1off.yaml | 20 +++++++++++++++++ .../performance/grpo-qwen3-235b-16n4g.yaml | 20 +++++++++++++++++ .../grpo-qwen3-235b-32n4g-async-1off.yaml | 20 +++++++++++++++++ 8 files changed, 126 insertions(+), 7 deletions(-) create mode 100644 examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml create mode 100644 examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml create mode 100644 examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml create mode 100644 examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml index 19c86c8a04..98fbed9812 100644 --- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml +++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml @@ -16,8 +16,6 @@ policy: max_total_sequence_length: 8192 dtensor_cfg: enabled: false - sequence_packing: - algorithm: modified_ffd make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null megatron_cfg: @@ -45,11 +43,6 @@ policy: precision: fp8 use_deep_gemm: true gpu_memory_utilization: 0.5 - expert_parallel_size: 4 - quantization_ignored_layer_kws: [ - a_proj, - b_proj - ] logger: monitor_gpus: false wandb: diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml new file mode 100644 index 0000000000..812fdfae9d --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml @@ -0,0 +1,21 @@ +defaults: ./grpo-deepseek-v3-32n8g.yaml +checkpointing: + checkpoint_dir: results/grpo-deepseek-v3-32n4g +policy: + sequence_packing: + enabled: false + megatron_cfg: + pipeline_model_parallel_size: 8 + expert_model_parallel_size: 16 + num_layers_in_first_pipeline_stage: 7 + num_layers_in_last_pipeline_stage: 6 + generation: + vllm_cfg: + tensor_parallel_size: 16 +logger: + log_dir: logs/grpo-deepseek-v3-32n4g + wandb: + name: grpo-deepseek-v3-32n4g +cluster: + gpus_per_node: 4 + num_nodes: 32 diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml new file mode 100644 index 0000000000..c019e34962 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml @@ -0,0 +1,22 @@ +defaults: ./grpo-deepseek-v3-64n8g-async-1off.yaml +checkpointing: + checkpoint_dir: results/grpo-deepseek-v3-64n4g-async-1off +policy: + sequence_packing: + enabled: false + megatron_cfg: + pipeline_model_parallel_size: 8 + expert_model_parallel_size: 16 + num_layers_in_first_pipeline_stage: 7 + num_layers_in_last_pipeline_stage: 6 + generation: + vllm_cfg: + tensor_parallel_size: 16 + gpu_memory_utilization: 0.8 +logger: + log_dir: logs/grpo-deepseek-v3-64n4g-async-32T32G-1off + wandb: + name: grpo-deepseek-v3-64n4g-async-32T32G-1off +cluster: + gpus_per_node: 4 + num_nodes: 64 diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml new file mode 100644 index 0000000000..b47aa65fb6 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml @@ -0,0 +1,21 @@ +defaults: ./grpo-deepseek-v3-64n8g-async-1off.yaml +checkpointing: + checkpoint_dir: results/grpo-deepseek-v3-64n8g-fp8-async-1off +policy: + megatron_cfg: + fp8_cfg: + enabled: true + fp8: "e4m3" + fp8_recipe: "blockwise" + fp8_param: false + env_vars: + NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" + generation: + vllm_cfg: + tensor_parallel_size: 16 + precision: "fp8" + use_deep_gemm: true +logger: + log_dir: logs/grpo-deepseek-v3-64n8g-fp8-async-1off + wandb: + name: grpo-deepseek-v3-64n8g-fp8-async-1off \ No newline at end of file diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml index b6d7ed441d..c0263f68fb 100644 --- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml @@ -9,6 +9,8 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-async-1off policy: + megatron_cfg: + pipeline_model_parallel_size: 1 generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml new file mode 100644 index 0000000000..b32786f7d7 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml @@ -0,0 +1,20 @@ +defaults: ./grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml +checkpointing: + checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off +policy: + megatron_cfg: + fp8_cfg: + enabled: true + fp8: "e4m3" + fp8_recipe: "blockwise" + fp8_param: false + env_vars: + NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" + generation: + vllm_cfg: + precision: "fp8" + use_deep_gemm: true +logger: + log_dir: logs/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off + wandb: + name: grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off \ No newline at end of file diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml new file mode 100644 index 0000000000..0b1120b64a --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml @@ -0,0 +1,20 @@ +defaults: ./grpo-qwen3-235b-16n8g.yaml +checkpointing: + checkpoint_dir: results/grpo-qwen3-235b-16n4g +policy: + sequence_packing: + enabled: false + megatron_cfg: + pipeline_model_parallel_size: 4 + num_layers_in_first_pipeline_stage: 23 + num_layers_in_last_pipeline_stage: 23 + generation: + vllm_cfg: + tensor_parallel_size: 8 +logger: + log_dir: logs/grpo-qwen3-235b-16n4g + wandb: + name: grpo-qwen3-235b-16n4g +cluster: + gpus_per_node: 4 + num_nodes: 16 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml new file mode 100644 index 0000000000..ae8f4bb25b --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml @@ -0,0 +1,20 @@ +defaults: ./grpo-qwen3-235b-32n8g-async-1off.yaml +checkpointing: + checkpoint_dir: results/grpo-qwen3-235b-32n4g-async-1off +policy: + sequence_packing: + enabled: false + megatron_cfg: + pipeline_model_parallel_size: 4 + num_layers_in_first_pipeline_stage: 23 + num_layers_in_last_pipeline_stage: 23 + generation: + vllm_cfg: + tensor_parallel_size: 8 +logger: + log_dir: logs/grpo-qwen3-235b-32n4g-async-1off + wandb: + name: grpo-qwen3-235b-32n4g-async-1off +cluster: + gpus_per_node: 4 + num_nodes: 32 From 7f128544d4d1f17e57cdd0f8eb4f17095b9b2558 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Fri, 19 Dec 2025 10:11:35 -0800 Subject: [PATCH 2/9] Add tests to performance.txt Signed-off-by: Guyue Huang --- .../llm/performance/grpo-deepseek-v3-32n4g.sh | 45 +++++++++++++++++++ .../grpo-deepseek-v3-64n4g-async-1off.sh | 45 +++++++++++++++++++ .../grpo-deepseek-v3-64n8g-fp8-async-1off.sh | 45 +++++++++++++++++++ ...lama3.1-8b-instruct-2n8g-fp8-async-1off.sh | 39 ++++++++++++++++ .../llm/performance/grpo-qwen3-235b-16n4g.sh | 40 +++++++++++++++++ .../grpo-qwen3-235b-32n4g-async-1off.sh | 40 +++++++++++++++++ tests/test_suites/performance.txt | 23 +++++++++- 7 files changed, 276 insertions(+), 1 deletion(-) create mode 100755 tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh create mode 100755 tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh create mode 100755 tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh create mode 100755 tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh create mode 100755 tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh create mode 100755 tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh new file mode 100755 index 0000000000..738b38dd5b --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh @@ -0,0 +1,45 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env +# disable NVLS to avoid OOM issue +export NCCL_NVLS_ENABLE=0 +# allow user to pass an existing HF checkpoint path based on instruction in https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md +export MODEL_NAME=${NRL_DEEPSEEK_V3_HF_CKPT:-"unsloth/DeepSeek-V3-0324-BF16"} + +# ===== BEGIN CONFIG ===== +NUM_NODES=32 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + policy.model_name=$MODEL_NAME \ + policy.tokenizer.name=$MODEL_NAME \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh new file mode 100755 index 0000000000..14138486e1 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh @@ -0,0 +1,45 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env +# disable NVLS to avoid OOM issue +export NCCL_NVLS_ENABLE=0 +# allow user to pass an existing HF checkpoint path based on instruction in https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md +export MODEL_NAME=${NRL_DEEPSEEK_V3_HF_CKPT:-"unsloth/DeepSeek-V3-0324-BF16"} + +# ===== BEGIN CONFIG ===== +NUM_NODES=64 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + policy.model_name=$MODEL_NAME \ + policy.tokenizer.name=$MODEL_NAME \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh new file mode 100755 index 0000000000..14138486e1 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh @@ -0,0 +1,45 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env +# disable NVLS to avoid OOM issue +export NCCL_NVLS_ENABLE=0 +# allow user to pass an existing HF checkpoint path based on instruction in https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md +export MODEL_NAME=${NRL_DEEPSEEK_V3_HF_CKPT:-"unsloth/DeepSeek-V3-0324-BF16"} + +# ===== BEGIN CONFIG ===== +NUM_NODES=64 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + policy.model_name=$MODEL_NAME \ + policy.tokenizer.name=$MODEL_NAME \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh new file mode 100755 index 0000000000..e7636f3e93 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=2 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh new file mode 100755 index 0000000000..0f9bf9289f --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env +# disable NVLS to avoid OOM issue +export NCCL_NVLS_ENABLE=0 + +# ===== BEGIN CONFIG ===== +NUM_NODES=16 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh new file mode 100755 index 0000000000..f7dac553af --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env +# disable NVLS to avoid OOM issue +export NCCL_NVLS_ENABLE=0 + +# ===== BEGIN CONFIG ===== +NUM_NODES=32 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/performance.txt b/tests/test_suites/performance.txt index bf714b0e74..e96ba7e110 100644 --- a/tests/test_suites/performance.txt +++ b/tests/test_suites/performance.txt @@ -2,16 +2,37 @@ # GRPO # ######## +# H100 BF16 + +## SYNC tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g.sh tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh +## ASYNC 1-off tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh -tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh \ No newline at end of file +## ASYNC many-off +tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh + +# H100 FP8 + +## ASYNC 1-off +tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh +tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh + +# GB200 BF16 + +## SYNC +tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh +tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh + +## ASYNC 1-off +tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh +tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh From 7977eaa73c1174e867bea3791cce6fc969abb9f5 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Fri, 19 Dec 2025 10:11:51 -0800 Subject: [PATCH 3/9] Revert a change to moonlight16b fp8 Signed-off-by: Guyue Huang --- .../llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml index 98fbed9812..27108c55c7 100644 --- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml +++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml @@ -43,6 +43,10 @@ policy: precision: fp8 use_deep_gemm: true gpu_memory_utilization: 0.5 + quantization_ignored_layer_kws: [ + a_proj, + b_proj + ] logger: monitor_gpus: false wandb: From fedf77081fb4e4f95ee22a6ee7f3389cae54ce74 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Fri, 19 Dec 2025 10:28:36 -0800 Subject: [PATCH 4/9] fix deepseek fp8 recipe Signed-off-by: Guyue Huang --- .../performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml index b47aa65fb6..a9be31b136 100644 --- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml @@ -8,6 +8,7 @@ policy: fp8: "e4m3" fp8_recipe: "blockwise" fp8_param: false + moe_router_dtype: fp32 env_vars: NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" generation: @@ -15,6 +16,10 @@ policy: tensor_parallel_size: 16 precision: "fp8" use_deep_gemm: true + quantization_ignored_layer_kws: [ + a_proj, + b_proj + ] logger: log_dir: logs/grpo-deepseek-v3-64n8g-fp8-async-1off wandb: From 0a3c512c0fb467a60ad540628681e8b3c6b73726 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Fri, 19 Dec 2025 15:41:58 -0800 Subject: [PATCH 5/9] script fix Signed-off-by: Guyue Huang --- .../llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml | 3 +++ .../performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml | 4 +++- .../recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml | 2 -- .../llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml | 5 +++-- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml index c019e34962..bf9a30a5d3 100644 --- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml @@ -10,6 +10,9 @@ policy: num_layers_in_first_pipeline_stage: 7 num_layers_in_last_pipeline_stage: 6 generation: + colocated: + resources: + gpus_per_node: 4 vllm_cfg: tensor_parallel_size: 16 gpu_memory_utilization: 0.8 diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml index a9be31b136..7f6b5ae86b 100644 --- a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml @@ -20,7 +20,9 @@ policy: a_proj, b_proj ] + vllm_kwargs: + max_num_seqs: 32 logger: log_dir: logs/grpo-deepseek-v3-64n8g-fp8-async-1off wandb: - name: grpo-deepseek-v3-64n8g-fp8-async-1off \ No newline at end of file + name: grpo-deepseek-v3-64n8g-fp8-async-1off diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml index 0b1120b64a..1640deda09 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml @@ -2,8 +2,6 @@ defaults: ./grpo-qwen3-235b-16n8g.yaml checkpointing: checkpoint_dir: results/grpo-qwen3-235b-16n4g policy: - sequence_packing: - enabled: false megatron_cfg: pipeline_model_parallel_size: 4 num_layers_in_first_pipeline_stage: 23 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml index ae8f4bb25b..f55b383686 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml @@ -2,13 +2,14 @@ defaults: ./grpo-qwen3-235b-32n8g-async-1off.yaml checkpointing: checkpoint_dir: results/grpo-qwen3-235b-32n4g-async-1off policy: - sequence_packing: - enabled: false megatron_cfg: pipeline_model_parallel_size: 4 num_layers_in_first_pipeline_stage: 23 num_layers_in_last_pipeline_stage: 23 generation: + colocated: + resources: + gpus_per_node: 4 vllm_cfg: tensor_parallel_size: 8 logger: From 477dddab7211ed0610921c99f18b3ba1adee7973 Mon Sep 17 00:00:00 2001 From: Seonjin Date: Fri, 19 Dec 2025 16:26:26 -0800 Subject: [PATCH 6/9] feat: GB200 Perf recipes for Qwen3-30BA3, Qwen3-32B, LLaMA3.1-8B (#1666) --- ...-llama3.1-8b-instruct-2n4g-async-1off.yaml | 32 ++++++++++ .../grpo-llama3.1-8b-instruct-2n4g.yaml | 58 +++++++++++++++++++ .../performance/grpo-qwen3-30ba3b-4n4g.yaml | 45 ++++++++++++++ .../grpo-qwen3-30ba3b-8n4g-async-1off.yaml | 33 +++++++++++ .../llm/performance/grpo-qwen3-32b-4n4g.yaml | 42 ++++++++++++++ .../grpo-qwen3-32b-8n4g-async-1off.yaml | 33 +++++++++++ ...po-llama3.1-8b-instruct-2n4g-async-1off.sh | 39 +++++++++++++ .../grpo-llama3.1-8b-instruct-2n4g.sh | 39 +++++++++++++ .../llm/performance/grpo-qwen3-30ba3b-4n4g.sh | 40 +++++++++++++ .../grpo-qwen3-30ba3b-8n4g-async-1off.sh | 40 +++++++++++++ .../llm/performance/grpo-qwen3-32b-4n4g.sh | 40 +++++++++++++ .../grpo-qwen3-32b-8n4g-async-1off.sh | 39 +++++++++++++ tests/test_suites/performance.txt | 6 ++ 13 files changed, 486 insertions(+) create mode 100644 examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml create mode 100644 examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml create mode 100644 examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml create mode 100755 tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh create mode 100755 tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh create mode 100755 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh create mode 100644 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh create mode 100755 tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh create mode 100644 tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml new file mode 100644 index 0000000000..d906eda2b4 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml @@ -0,0 +1,32 @@ +defaults: ./grpo-llama3.1-8b-instruct-2n4g.yaml +grpo: + async_grpo: + enabled: true + max_trajectory_age_steps: 1 + in_flight_weight_updates: true +loss_fn: + use_importance_sampling_correction: true +checkpointing: + checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n4g-async-1off +policy: + megatron_cfg: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + sequence_parallel: false + generation: + colocated: + enabled: false + resources: + num_nodes: 1 + gpus_per_node: 4 + vllm_cfg: + async_engine: true + tensor_parallel_size: 1 + gpu_memory_utilization: 0.8 +logger: + log_dir: logs/grpo-llama3.1-8b-instruct-2n4g-async-1off + wandb: + name: grpo-llama3.1-8b-instruct-2n4g-async-1off +cluster: + gpus_per_node: 4 + num_nodes: 2 diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml new file mode 100644 index 0000000000..a99f7c1498 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml @@ -0,0 +1,58 @@ +defaults: ../../../grpo_math_1B.yaml +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_num_steps: 500 +loss_fn: + use_importance_sampling_correction: true +checkpointing: + checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n4g +policy: + model_name: meta-llama/Llama-3.1-8B-Instruct + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + train_micro_batch_size: 1 + logprob_batch_size: 2 + max_total_sequence_length: 4096 + make_sequence_length_divisible_by: 1 + dtensor_cfg: + enabled: false + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + converter_type: LlamaForCausalLM + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + sequence_parallel: false + activation_checkpointing: true + defer_fp32_logits: true + optimizer: + lr: 5.0e-07 + min_lr: 5.0e-08 + weight_decay: 0.0 + use_precision_aware_optimizer: true + scheduler: + lr_warmup_iters: 2 + lr_warmup_init: 5.0e-08 + fp8_cfg: + enabled: false + generation: + max_new_tokens: 4096 + stop_token_ids: + - 128009 + vllm_cfg: + max_model_len: 4096 + tensor_parallel_size: 1 +data: + max_input_seq_length: 4096 +logger: + log_dir: logs/grpo-llama3.1-8b-instruct-2n4g + wandb_enabled: true + tensorboard_enabled: true + wandb: + project: nemo-rl + name: grpo-llama3.1-8b-instruct-2n4g +cluster: + gpus_per_node: 4 + num_nodes: 2 + diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml new file mode 100644 index 0000000000..21b9746f4b --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml @@ -0,0 +1,45 @@ +defaults: ../../../grpo_math_1B.yaml +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 +checkpointing: + enabled: false + checkpoint_dir: results/grpo-qwen3-30ba3b-4n4g +policy: + model_name: Qwen/Qwen3-30B-A3B + train_micro_batch_size: 1 + max_total_sequence_length: 4096 + dtensor_cfg: + enabled: false + optimizer: null + scheduler: null + make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 16 + sequence_parallel: false + optimizer: + lr: 3.0e-07 + min_lr: 3.0e-08 + scheduler: + lr_warmup_iters: 50 + lr_warmup_init: 3.0e-08 + env_vars: + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:False + generation: + vllm_cfg: + tensor_parallel_size: 1 +logger: + log_dir: logs/grpo-qwen3-30ba3b-4n4g + wandb_enabled: true + tensorboard_enabled: true + wandb: + project: nemo-rl + name: grpo-qwen3-30ba3b-4n4g +cluster: + gpus_per_node: 4 + num_nodes: 4 + diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml new file mode 100644 index 0000000000..a9837c87f2 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml @@ -0,0 +1,33 @@ +defaults: ./grpo-qwen3-30ba3b-4n4g.yaml +grpo: + async_grpo: + enabled: true + max_trajectory_age_steps: 1 + in_flight_weight_updates: true +loss_fn: + use_importance_sampling_correction: true +checkpointing: + checkpoint_dir: results/grpo-qwen3-30ba3b-8n4g-async-1off +policy: + megatron_cfg: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 16 + sequence_parallel: false + generation: + colocated: + enabled: false + resources: + num_nodes: 4 + gpus_per_node: 4 + vllm_cfg: + async_engine: true + tensor_parallel_size: 1 + gpu_memory_utilization: 0.8 +logger: + log_dir: logs/grpo-qwen3-30ba3b-8n4g-async-1off + wandb: + name: grpo-qwen3-30ba3b-8n4g-async-1off +cluster: + gpus_per_node: 4 + num_nodes: 8 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml new file mode 100644 index 0000000000..2e441cdb5f --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml @@ -0,0 +1,42 @@ +defaults: ../../../grpo_math_1B.yaml +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 +checkpointing: + enabled: false + checkpoint_dir: results/grpo-qwen3-32b-4n4g +policy: + model_name: Qwen/Qwen3-32B + train_micro_batch_size: 1 + max_total_sequence_length: 4096 + dtensor_cfg: + enabled: false + optimizer: null + scheduler: null + make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + sequence_parallel: true + optimizer: + lr: 3.0e-07 + min_lr: 3.0e-08 + scheduler: + lr_warmup_iters: 2 + lr_warmup_init: 3.0e-08 + generation: + vllm_cfg: + tensor_parallel_size: 1 +logger: + log_dir: logs/grpo-qwen3-32b-4n4g + wandb_enabled: true + tensorboard_enabled: true + wandb: + project: nemo-rl + name: grpo-qwen3-32b-4n4g +cluster: + gpus_per_node: 4 + num_nodes: 4 + diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml new file mode 100644 index 0000000000..4f8a0a03bb --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml @@ -0,0 +1,33 @@ +defaults: ./grpo-qwen3-32b-4n4g.yaml +grpo: + async_grpo: + enabled: true + max_trajectory_age_steps: 1 + in_flight_weight_updates: true +loss_fn: + use_importance_sampling_correction: true +checkpointing: + checkpoint_dir: results/grpo-qwen3-32b-8n4g-async-1off +policy: + megatron_cfg: + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + sequence_parallel: true + generation: + colocated: + enabled: false + resources: + num_nodes: 4 + gpus_per_node: 4 + vllm_cfg: + async_engine: true + tensor_parallel_size: 1 + gpu_memory_utilization: 0.8 +logger: + log_dir: logs/grpo-qwen3-32b-8n4g-async-1off + wandb: + name: grpo-qwen3-32b-8n4g-async-1off +cluster: + gpus_per_node: 4 + num_nodes: 8 + diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh new file mode 100755 index 0000000000..e7636f3e93 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=2 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh new file mode 100755 index 0000000000..e7636f3e93 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=2 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh new file mode 100755 index 0000000000..2a56609ffd --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi + diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh new file mode 100644 index 0000000000..8350d128e8 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=8 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi + diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh new file mode 100755 index 0000000000..2a56609ffd --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi + diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh new file mode 100644 index 0000000000..35d58c98f7 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=8 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/performance.txt b/tests/test_suites/performance.txt index e96ba7e110..2bc3e13efd 100644 --- a/tests/test_suites/performance.txt +++ b/tests/test_suites/performance.txt @@ -30,9 +30,15 @@ tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off. # GB200 BF16 ## SYNC +tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh +tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh +tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh ## ASYNC 1-off +tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh +tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh +tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh From dd196a642ee71def344ba93b9fd5f1b8c7942387 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Fri, 19 Dec 2025 16:28:34 -0800 Subject: [PATCH 7/9] Separate gb200 and h100 perf test Signed-off-by: Guyue Huang --- tests/test_suites/performance_gb200.txt | 19 +++++++++++++++++++ .../{performance.txt => performance_h100.txt} | 15 --------------- 2 files changed, 19 insertions(+), 15 deletions(-) create mode 100644 tests/test_suites/performance_gb200.txt rename tests/test_suites/{performance.txt => performance_h100.txt} (58%) diff --git a/tests/test_suites/performance_gb200.txt b/tests/test_suites/performance_gb200.txt new file mode 100644 index 0000000000..d958386001 --- /dev/null +++ b/tests/test_suites/performance_gb200.txt @@ -0,0 +1,19 @@ +######## +# GRPO # +######## + +# GB200 BF16 + +## SYNC +tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh +tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh +tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh +tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh +tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh + +## ASYNC 1-off +tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh +tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh +tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh +tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh +tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh diff --git a/tests/test_suites/performance.txt b/tests/test_suites/performance_h100.txt similarity index 58% rename from tests/test_suites/performance.txt rename to tests/test_suites/performance_h100.txt index 2bc3e13efd..9e3eb208ce 100644 --- a/tests/test_suites/performance.txt +++ b/tests/test_suites/performance_h100.txt @@ -27,18 +27,3 @@ tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh -# GB200 BF16 - -## SYNC -tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh -tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh -tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh -tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh -tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh - -## ASYNC 1-off -tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh -tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh -tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh -tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh -tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh From 22dc2ba6cae56278339dc4f4d598d3f0888d8431 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Fri, 19 Dec 2025 17:18:42 -0800 Subject: [PATCH 8/9] Fix unit test Signed-off-by: Guyue Huang --- tests/unit/test_recipes_and_test_suites.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py index 435742ec4a..101f4b21e5 100644 --- a/tests/unit/test_recipes_and_test_suites.py +++ b/tests/unit/test_recipes_and_test_suites.py @@ -28,7 +28,8 @@ nightly_test_suite_path = os.path.join(test_suites_dir, "nightly.txt") release_test_suite_path = os.path.join(test_suites_dir, "release.txt") -performance_test_suite_path = os.path.join(test_suites_dir, "performance.txt") +h100_performance_test_suite_path = os.path.join(test_suites_dir, "performance_h100.txt") +gb200_performance_test_suite_path = os.path.join(test_suites_dir, "performance_gb200.txt") # Relative to project root ALGO_MAPPING_TO_BASE_YAML = { @@ -72,7 +73,12 @@ def release_test_suite(): @pytest.fixture def performance_test_suite(): performance_suite = [] - with open(performance_test_suite_path, "r") as f: + with open(h100_performance_test_suite_path, "r") as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + performance_suite.append(line) + with open(gb200_performance_test_suite_path, "r") as f: for line in f: line = line.strip() if line and not line.startswith("#"): @@ -104,12 +110,14 @@ def all_recipe_yaml_rel_paths(): [ nightly_test_suite_path, release_test_suite_path, - performance_test_suite_path, + h100_performance_test_suite_path, + gb200_performance_test_suite_path, ], ids=[ "nightly_test_suite", "release_test_suite", - "performance_test_suite", + "h100_performance_test_suite", + "gb200_performance_test_suite", ], ) def test_test_suites_exist(test_suite_path): From f75a469fc64e191375d87d4c5aeaee0b74ec74aa Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Fri, 19 Dec 2025 20:27:20 -0800 Subject: [PATCH 9/9] Fix lint and unit test Signed-off-by: Guyue Huang --- .../llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh | 0 .../llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh | 0 tests/unit/test_recipes_and_test_suites.py | 4 +++- 3 files changed, 3 insertions(+), 1 deletion(-) mode change 100644 => 100755 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh mode change 100644 => 100755 tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh old mode 100644 new mode 100755 diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh old mode 100644 new mode 100755 diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py index 101f4b21e5..ade6d49d87 100644 --- a/tests/unit/test_recipes_and_test_suites.py +++ b/tests/unit/test_recipes_and_test_suites.py @@ -29,7 +29,9 @@ nightly_test_suite_path = os.path.join(test_suites_dir, "nightly.txt") release_test_suite_path = os.path.join(test_suites_dir, "release.txt") h100_performance_test_suite_path = os.path.join(test_suites_dir, "performance_h100.txt") -gb200_performance_test_suite_path = os.path.join(test_suites_dir, "performance_gb200.txt") +gb200_performance_test_suite_path = os.path.join( + test_suites_dir, "performance_gb200.txt" +) # Relative to project root ALGO_MAPPING_TO_BASE_YAML = {