diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml index 19c86c8a04..27108c55c7 100644 --- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml +++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml @@ -16,8 +16,6 @@ policy: max_total_sequence_length: 8192 dtensor_cfg: enabled: false - sequence_packing: - algorithm: modified_ffd make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} optimizer: null megatron_cfg: @@ -45,7 +43,6 @@ policy: precision: fp8 use_deep_gemm: true gpu_memory_utilization: 0.5 - expert_parallel_size: 4 quantization_ignored_layer_kws: [ a_proj, b_proj diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml new file mode 100644 index 0000000000..812fdfae9d --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml @@ -0,0 +1,21 @@ +defaults: ./grpo-deepseek-v3-32n8g.yaml +checkpointing: + checkpoint_dir: results/grpo-deepseek-v3-32n4g +policy: + sequence_packing: + enabled: false + megatron_cfg: + pipeline_model_parallel_size: 8 + expert_model_parallel_size: 16 + num_layers_in_first_pipeline_stage: 7 + num_layers_in_last_pipeline_stage: 6 + generation: + vllm_cfg: + tensor_parallel_size: 16 +logger: + log_dir: logs/grpo-deepseek-v3-32n4g + wandb: + name: grpo-deepseek-v3-32n4g +cluster: + gpus_per_node: 4 + num_nodes: 32 diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml new file mode 100644 index 0000000000..bf9a30a5d3 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n4g-async-1off.yaml @@ -0,0 +1,25 @@ +defaults: ./grpo-deepseek-v3-64n8g-async-1off.yaml 
+checkpointing: + checkpoint_dir: results/grpo-deepseek-v3-64n4g-async-1off +policy: + sequence_packing: + enabled: false + megatron_cfg: + pipeline_model_parallel_size: 8 + expert_model_parallel_size: 16 + num_layers_in_first_pipeline_stage: 7 + num_layers_in_last_pipeline_stage: 6 + generation: + colocated: + resources: + gpus_per_node: 4 + vllm_cfg: + tensor_parallel_size: 16 + gpu_memory_utilization: 0.8 +logger: + log_dir: logs/grpo-deepseek-v3-64n4g-async-1off + wandb: + name: grpo-deepseek-v3-64n4g-async-1off +cluster: + gpus_per_node: 4 + num_nodes: 64 diff --git a/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml new file mode 100644 index 0000000000..7f6b5ae86b --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.yaml @@ -0,0 +1,28 @@ +defaults: ./grpo-deepseek-v3-64n8g-async-1off.yaml +checkpointing: + checkpoint_dir: results/grpo-deepseek-v3-64n8g-fp8-async-1off +policy: + megatron_cfg: + fp8_cfg: + enabled: true + fp8: "e4m3" + fp8_recipe: "blockwise" + fp8_param: false + moe_router_dtype: fp32 + env_vars: + NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" + generation: + vllm_cfg: + tensor_parallel_size: 16 + precision: "fp8" + use_deep_gemm: true + quantization_ignored_layer_kws: [ + a_proj, + b_proj + ] + vllm_kwargs: + max_num_seqs: 32 +logger: + log_dir: logs/grpo-deepseek-v3-64n8g-fp8-async-1off + wandb: + name: grpo-deepseek-v3-64n8g-fp8-async-1off diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml new file mode 100644 index 0000000000..d906eda2b4 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.yaml @@ -0,0 +1,32 @@ +defaults: ./grpo-llama3.1-8b-instruct-2n4g.yaml +grpo: + async_grpo: + 
enabled: true + max_trajectory_age_steps: 1 + in_flight_weight_updates: true +loss_fn: + use_importance_sampling_correction: true +checkpointing: + checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n4g-async-1off +policy: + megatron_cfg: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + sequence_parallel: false + generation: + colocated: + enabled: false + resources: + num_nodes: 1 + gpus_per_node: 4 + vllm_cfg: + async_engine: true + tensor_parallel_size: 1 + gpu_memory_utilization: 0.8 +logger: + log_dir: logs/grpo-llama3.1-8b-instruct-2n4g-async-1off + wandb: + name: grpo-llama3.1-8b-instruct-2n4g-async-1off +cluster: + gpus_per_node: 4 + num_nodes: 2 diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml new file mode 100644 index 0000000000..a99f7c1498 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n4g.yaml @@ -0,0 +1,58 @@ +defaults: ../../../grpo_math_1B.yaml +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_num_steps: 500 +loss_fn: + use_importance_sampling_correction: true +checkpointing: + checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n4g +policy: + model_name: meta-llama/Llama-3.1-8B-Instruct + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + train_micro_batch_size: 1 + logprob_batch_size: 2 + max_total_sequence_length: 4096 + make_sequence_length_divisible_by: 1 + dtensor_cfg: + enabled: false + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + converter_type: LlamaForCausalLM + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + sequence_parallel: false + activation_checkpointing: true + defer_fp32_logits: true + optimizer: + lr: 5.0e-07 + min_lr: 5.0e-08 + weight_decay: 0.0 + use_precision_aware_optimizer: true + scheduler: + lr_warmup_iters: 2 + lr_warmup_init: 5.0e-08 + fp8_cfg: + enabled: false + generation: + 
max_new_tokens: 4096 + stop_token_ids: + - 128009 + vllm_cfg: + max_model_len: 4096 + tensor_parallel_size: 1 +data: + max_input_seq_length: 4096 +logger: + log_dir: logs/grpo-llama3.1-8b-instruct-2n4g + wandb_enabled: true + tensorboard_enabled: true + wandb: + project: nemo-rl + name: grpo-llama3.1-8b-instruct-2n4g +cluster: + gpus_per_node: 4 + num_nodes: 2 + diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml index b6d7ed441d..c0263f68fb 100644 --- a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml @@ -9,6 +9,8 @@ loss_fn: checkpointing: checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-async-1off policy: + megatron_cfg: + pipeline_model_parallel_size: 1 generation: colocated: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml new file mode 100644 index 0000000000..b32786f7d7 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml @@ -0,0 +1,20 @@ +defaults: ./grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml +checkpointing: + checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off +policy: + megatron_cfg: + fp8_cfg: + enabled: true + fp8: "e4m3" + fp8_recipe: "blockwise" + fp8_param: false + env_vars: + NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1" + generation: + vllm_cfg: + precision: "fp8" + use_deep_gemm: true +logger: + log_dir: logs/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off + wandb: + name: grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off \ No newline at end of file diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml 
b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml new file mode 100644 index 0000000000..1640deda09 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml @@ -0,0 +1,18 @@ +defaults: ./grpo-qwen3-235b-16n8g.yaml +checkpointing: + checkpoint_dir: results/grpo-qwen3-235b-16n4g +policy: + megatron_cfg: + pipeline_model_parallel_size: 4 + num_layers_in_first_pipeline_stage: 23 + num_layers_in_last_pipeline_stage: 23 + generation: + vllm_cfg: + tensor_parallel_size: 8 +logger: + log_dir: logs/grpo-qwen3-235b-16n4g + wandb: + name: grpo-qwen3-235b-16n4g +cluster: + gpus_per_node: 4 + num_nodes: 16 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml new file mode 100644 index 0000000000..f55b383686 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-235b-32n4g-async-1off.yaml @@ -0,0 +1,21 @@ +defaults: ./grpo-qwen3-235b-32n8g-async-1off.yaml +checkpointing: + checkpoint_dir: results/grpo-qwen3-235b-32n4g-async-1off +policy: + megatron_cfg: + pipeline_model_parallel_size: 4 + num_layers_in_first_pipeline_stage: 23 + num_layers_in_last_pipeline_stage: 23 + generation: + colocated: + resources: + gpus_per_node: 4 + vllm_cfg: + tensor_parallel_size: 8 +logger: + log_dir: logs/grpo-qwen3-235b-32n4g-async-1off + wandb: + name: grpo-qwen3-235b-32n4g-async-1off +cluster: + gpus_per_node: 4 + num_nodes: 32 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml new file mode 100644 index 0000000000..21b9746f4b --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml @@ -0,0 +1,45 @@ +defaults: ../../../grpo_math_1B.yaml +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 +checkpointing: + enabled: false + checkpoint_dir: 
results/grpo-qwen3-30ba3b-4n4g +policy: + model_name: Qwen/Qwen3-30B-A3B + train_micro_batch_size: 1 + max_total_sequence_length: 4096 + dtensor_cfg: + enabled: false + optimizer: null + scheduler: null + make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 16 + sequence_parallel: false + optimizer: + lr: 3.0e-07 + min_lr: 3.0e-08 + scheduler: + lr_warmup_iters: 50 + lr_warmup_init: 3.0e-08 + env_vars: + PYTORCH_CUDA_ALLOC_CONF: expandable_segments:False + generation: + vllm_cfg: + tensor_parallel_size: 1 +logger: + log_dir: logs/grpo-qwen3-30ba3b-4n4g + wandb_enabled: true + tensorboard_enabled: true + wandb: + project: nemo-rl + name: grpo-qwen3-30ba3b-4n4g +cluster: + gpus_per_node: 4 + num_nodes: 4 + diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml new file mode 100644 index 0000000000..a9837c87f2 --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.yaml @@ -0,0 +1,33 @@ +defaults: ./grpo-qwen3-30ba3b-4n4g.yaml +grpo: + async_grpo: + enabled: true + max_trajectory_age_steps: 1 + in_flight_weight_updates: true +loss_fn: + use_importance_sampling_correction: true +checkpointing: + checkpoint_dir: results/grpo-qwen3-30ba3b-8n4g-async-1off +policy: + megatron_cfg: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 16 + sequence_parallel: false + generation: + colocated: + enabled: false + resources: + num_nodes: 4 + gpus_per_node: 4 + vllm_cfg: + async_engine: true + tensor_parallel_size: 1 + gpu_memory_utilization: 0.8 +logger: + log_dir: logs/grpo-qwen3-30ba3b-8n4g-async-1off + wandb: + name: grpo-qwen3-30ba3b-8n4g-async-1off +cluster: + gpus_per_node: 4 + num_nodes: 
8 diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml new file mode 100644 index 0000000000..2e441cdb5f --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n4g.yaml @@ -0,0 +1,42 @@ +defaults: ../../../grpo_math_1B.yaml +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 +checkpointing: + enabled: false + checkpoint_dir: results/grpo-qwen3-32b-4n4g +policy: + model_name: Qwen/Qwen3-32B + train_micro_batch_size: 1 + max_total_sequence_length: 4096 + dtensor_cfg: + enabled: false + optimizer: null + scheduler: null + make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} + megatron_cfg: + enabled: true + empty_unused_memory_level: 1 + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + sequence_parallel: true + optimizer: + lr: 3.0e-07 + min_lr: 3.0e-08 + scheduler: + lr_warmup_iters: 2 + lr_warmup_init: 3.0e-08 + generation: + vllm_cfg: + tensor_parallel_size: 1 +logger: + log_dir: logs/grpo-qwen3-32b-4n4g + wandb_enabled: true + tensorboard_enabled: true + wandb: + project: nemo-rl + name: grpo-qwen3-32b-4n4g +cluster: + gpus_per_node: 4 + num_nodes: 4 + diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml new file mode 100644 index 0000000000..4f8a0a03bb --- /dev/null +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-8n4g-async-1off.yaml @@ -0,0 +1,33 @@ +defaults: ./grpo-qwen3-32b-4n4g.yaml +grpo: + async_grpo: + enabled: true + max_trajectory_age_steps: 1 + in_flight_weight_updates: true +loss_fn: + use_importance_sampling_correction: true +checkpointing: + checkpoint_dir: results/grpo-qwen3-32b-8n4g-async-1off +policy: + megatron_cfg: + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + sequence_parallel: true + generation: + colocated: + 
enabled: false + resources: + num_nodes: 4 + gpus_per_node: 4 + vllm_cfg: + async_engine: true + tensor_parallel_size: 1 + gpu_memory_utilization: 0.8 +logger: + log_dir: logs/grpo-qwen3-32b-8n4g-async-1off + wandb: + name: grpo-qwen3-32b-8n4g-async-1off +cluster: + gpus_per_node: 4 + num_nodes: 8 + diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh new file mode 100755 index 0000000000..738b38dd5b --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh @@ -0,0 +1,45 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env +# disable NVLS to avoid OOM issue +export NCCL_NVLS_ENABLE=0 +# allow user to pass an existing HF checkpoint path based on instruction in https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md +export MODEL_NAME=${NRL_DEEPSEEK_V3_HF_CKPT:-"unsloth/DeepSeek-V3-0324-BF16"} + +# ===== BEGIN CONFIG ===== +NUM_NODES=32 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + policy.model_name=$MODEL_NAME \ + policy.tokenizer.name=$MODEL_NAME \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge 
$MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh new file mode 100755 index 0000000000..14138486e1 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh @@ -0,0 +1,45 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env +# disable NVLS to avoid OOM issue +export NCCL_NVLS_ENABLE=0 +# allow user to pass an existing HF checkpoint path based on instruction in https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md +export MODEL_NAME=${NRL_DEEPSEEK_V3_HF_CKPT:-"unsloth/DeepSeek-V3-0324-BF16"} + +# ===== BEGIN CONFIG ===== +NUM_NODES=64 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + policy.model_name=$MODEL_NAME \ + policy.tokenizer.name=$MODEL_NAME \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 
'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh new file mode 100755 index 0000000000..14138486e1 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh @@ -0,0 +1,45 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env +# disable NVLS to avoid OOM issue +export NCCL_NVLS_ENABLE=0 +# allow user to pass an existing HF checkpoint path based on instruction in https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md +export MODEL_NAME=${NRL_DEEPSEEK_V3_HF_CKPT:-"unsloth/DeepSeek-V3-0324-BF16"} + +# ===== BEGIN CONFIG ===== +NUM_NODES=64 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + policy.model_name=$MODEL_NAME \ + policy.tokenizer.name=$MODEL_NAME \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 
'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh new file mode 100755 index 0000000000..e7636f3e93 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=2 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh new file mode 100755 index 0000000000..e7636f3e93 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- 
"${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=2 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh new file mode 100755 index 0000000000..e7636f3e93 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=2 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + 
grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh new file mode 100755 index 0000000000..0f9bf9289f --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env +# disable NVLS to avoid OOM issue +export NCCL_NVLS_ENABLE=0 + +# ===== BEGIN CONFIG ===== +NUM_NODES=16 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if 
the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh new file mode 100755 index 0000000000..f7dac553af --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env +# disable NVLS to avoid OOM issue +export NCCL_NVLS_ENABLE=0 + +# ===== BEGIN CONFIG ===== +NUM_NODES=32 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh 
b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh new file mode 100755 index 0000000000..2a56609ffd --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi + diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh new file mode 100755 index 0000000000..8350d128e8 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=8 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + 
STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi + diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh new file mode 100755 index 0000000000..2a56609ffd --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh @@ -0,0 +1,40 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + 
checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi + diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh new file mode 100755 index 0000000000..35d58c98f7 --- /dev/null +++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=8 +STEPS_PER_RUN=10 +MAX_STEPS=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=100 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 
'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["10"] < 1.1' +fi diff --git a/tests/test_suites/performance_gb200.txt b/tests/test_suites/performance_gb200.txt new file mode 100644 index 0000000000..d958386001 --- /dev/null +++ b/tests/test_suites/performance_gb200.txt @@ -0,0 +1,19 @@ +######## +# GRPO # +######## + +# GB200 BF16 + +## SYNC +tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g.sh +tests/test_suites/llm/performance/grpo-qwen3-32b-4n4g.sh +tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n4g.sh +tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh +tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh + +## ASYNC 1-off +tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n4g-async-1off.sh +tests/test_suites/llm/performance/grpo-qwen3-32b-8n4g-async-1off.sh +tests/test_suites/llm/performance/grpo-qwen3-30ba3b-8n4g-async-1off.sh +tests/test_suites/llm/performance/grpo-deepseek-v3-64n4g-async-1off.sh +tests/test_suites/llm/performance/grpo-qwen3-235b-32n4g-async-1off.sh diff --git a/tests/test_suites/performance.txt b/tests/test_suites/performance_h100.txt similarity index 75% rename from tests/test_suites/performance.txt rename to tests/test_suites/performance_h100.txt index bf714b0e74..9e3eb208ce 100644 --- a/tests/test_suites/performance.txt +++ b/tests/test_suites/performance_h100.txt @@ -2,16 +2,28 @@ # GRPO # ######## +# H100 BF16 + +## SYNC tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g.sh tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh +## ASYNC 1-off tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh 
tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh -tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh \ No newline at end of file +## ASYNC many-off +tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh + +# H100 FP8 + +## ASYNC 1-off +tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-fp8-async-1off.sh +tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.sh + diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py index 435742ec4a..ade6d49d87 100644 --- a/tests/unit/test_recipes_and_test_suites.py +++ b/tests/unit/test_recipes_and_test_suites.py @@ -28,7 +28,10 @@ nightly_test_suite_path = os.path.join(test_suites_dir, "nightly.txt") release_test_suite_path = os.path.join(test_suites_dir, "release.txt") -performance_test_suite_path = os.path.join(test_suites_dir, "performance.txt") +h100_performance_test_suite_path = os.path.join(test_suites_dir, "performance_h100.txt") +gb200_performance_test_suite_path = os.path.join( + test_suites_dir, "performance_gb200.txt" +) # Relative to project root ALGO_MAPPING_TO_BASE_YAML = { @@ -72,7 +75,12 @@ def release_test_suite(): @pytest.fixture def performance_test_suite(): performance_suite = [] - with open(performance_test_suite_path, "r") as f: + with open(h100_performance_test_suite_path, "r") as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + performance_suite.append(line) + with open(gb200_performance_test_suite_path, "r") as f: for line in f: line = line.strip() if line and not line.startswith("#"): @@ -104,12 +112,14 @@ def all_recipe_yaml_rel_paths(): [ nightly_test_suite_path, release_test_suite_path, - performance_test_suite_path, + h100_performance_test_suite_path, + gb200_performance_test_suite_path, ], 
ids=[ "nightly_test_suite", "release_test_suite", - "performance_test_suite", + "h100_performance_test_suite", + "gb200_performance_test_suite", ], ) def test_test_suites_exist(test_suite_path):