Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@ policy:
max_total_sequence_length: 8192
dtensor_cfg:
enabled: false
sequence_packing:
algorithm: modified_ffd
make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
optimizer: null
megatron_cfg:
Expand Down Expand Up @@ -45,7 +43,6 @@ policy:
precision: fp8
use_deep_gemm: true
gpu_memory_utilization: 0.5
expert_parallel_size: 4
quantization_ignored_layer_kws: [
a_proj,
b_proj
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Recipe overlay: inherits ./grpo-deepseek-v3-32n8g.yaml and retargets it to a
# cluster of 32 nodes with 4 GPUs per node.
# NOTE(review): the nesting below was reconstructed -- the reviewed copy had no
# indentation, which would make every key top-level (parents null and
# `enabled` detached from `sequence_packing`). Confirm the hierarchy against
# the base config.
defaults: ./grpo-deepseek-v3-32n8g.yaml
checkpointing:
  checkpoint_dir: results/grpo-deepseek-v3-32n4g
policy:
  sequence_packing:
    enabled: false
  megatron_cfg:
    pipeline_model_parallel_size: 8
    expert_model_parallel_size: 16
    # Uneven first/last stages; presumably sized so the model's layer count
    # splits across the 8 pipeline stages -- TODO confirm.
    num_layers_in_first_pipeline_stage: 7
    num_layers_in_last_pipeline_stage: 6
  generation:
    vllm_cfg:
      tensor_parallel_size: 16
logger:
  log_dir: logs/grpo-deepseek-v3-32n4g
  wandb:
    name: grpo-deepseek-v3-32n4g
cluster:
  gpus_per_node: 4
  num_nodes: 32
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Recipe overlay: inherits ./grpo-deepseek-v3-64n8g-async-1off.yaml and
# retargets it to a cluster of 64 nodes with 4 GPUs per node.
# NOTE(review): the nesting below was reconstructed -- the reviewed copy had no
# indentation, which would make every key top-level (parents null and
# `enabled` detached from `sequence_packing`). Confirm the hierarchy against
# the base config.
defaults: ./grpo-deepseek-v3-64n8g-async-1off.yaml
checkpointing:
  checkpoint_dir: results/grpo-deepseek-v3-64n4g-async-1off
policy:
  sequence_packing:
    enabled: false
  megatron_cfg:
    pipeline_model_parallel_size: 8
    expert_model_parallel_size: 16
    # Uneven first/last stages; presumably sized so the model's layer count
    # splits across the 8 pipeline stages -- TODO confirm.
    num_layers_in_first_pipeline_stage: 7
    num_layers_in_last_pipeline_stage: 6
  generation:
    vllm_cfg:
      tensor_parallel_size: 16
      gpu_memory_utilization: 0.8
logger:
  # NOTE(review): these names carry a "-32T32G-" infix that checkpoint_dir
  # above does not; confirm whether the difference is intentional.
  log_dir: logs/grpo-deepseek-v3-64n4g-async-32T32G-1off
  wandb:
    name: grpo-deepseek-v3-64n4g-async-32T32G-1off
cluster:
  gpus_per_node: 4
  num_nodes: 64
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Recipe overlay: inherits ./grpo-deepseek-v3-64n8g-async-1off.yaml and turns
# on FP8 for both the Megatron policy (fp8_cfg) and vLLM generation.
# NOTE(review): the nesting below was reconstructed -- the reviewed copy had no
# indentation, which would make every key top-level and leave `fp8_cfg`,
# `env_vars` and `vllm_cfg` as null parents. Confirm against the base config.
defaults: ./grpo-deepseek-v3-64n8g-async-1off.yaml
checkpointing:
  checkpoint_dir: results/grpo-deepseek-v3-64n8g-fp8-async-1off
policy:
  megatron_cfg:
    fp8_cfg:
      enabled: true
      fp8: "e4m3"
      fp8_recipe: "blockwise"
      fp8_param: false
    # Presumed to sit beside fp8_cfg under megatron_cfg -- TODO confirm.
    moe_router_dtype: fp32
    env_vars:
      # Quoted so the value stays the string "1" rather than an integer.
      NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
  generation:
    vllm_cfg:
      tensor_parallel_size: 16
      precision: "fp8"
      use_deep_gemm: true
      quantization_ignored_layer_kws: [a_proj, b_proj]
logger:
  log_dir: logs/grpo-deepseek-v3-64n8g-fp8-async-1off
  wandb:
    name: grpo-deepseek-v3-64n8g-fp8-async-1off
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ loss_fn:
checkpointing:
checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-async-1off
policy:
megatron_cfg:
pipeline_model_parallel_size: 1
generation:
colocated:
enabled: false
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Recipe overlay: inherits ./grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml and
# turns on FP8 for both the Megatron policy (fp8_cfg) and vLLM generation.
# NOTE(review): the nesting below was reconstructed -- the reviewed copy had no
# indentation, which would make every key top-level and leave `fp8_cfg`,
# `env_vars` and `vllm_cfg` as null parents. Confirm against the base config.
defaults: ./grpo-llama3.1-8b-instruct-2n8g-async-1off.yaml
checkpointing:
  checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off
policy:
  megatron_cfg:
    fp8_cfg:
      enabled: true
      fp8: "e4m3"
      fp8_recipe: "blockwise"
      fp8_param: false
    # Presumed to sit beside fp8_cfg under megatron_cfg -- TODO confirm.
    env_vars:
      # Quoted so the value stays the string "1" rather than an integer.
      NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
  generation:
    vllm_cfg:
      precision: "fp8"
      use_deep_gemm: true
logger:
  log_dir: logs/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off
  wandb:
    name: grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Recipe overlay: inherits ./grpo-qwen3-235b-16n8g.yaml and retargets it to a
# cluster of 16 nodes with 4 GPUs per node.
# NOTE(review): the nesting below was reconstructed -- the reviewed copy had no
# indentation, which would make every key top-level (parents null and
# `enabled` detached from `sequence_packing`). Confirm the hierarchy against
# the base config.
defaults: ./grpo-qwen3-235b-16n8g.yaml
checkpointing:
  checkpoint_dir: results/grpo-qwen3-235b-16n4g
policy:
  sequence_packing:
    enabled: false
  megatron_cfg:
    pipeline_model_parallel_size: 4
    # First/last stage sizes are pinned explicitly; presumably chosen so the
    # model's layer count splits across the 4 pipeline stages -- TODO confirm.
    num_layers_in_first_pipeline_stage: 23
    num_layers_in_last_pipeline_stage: 23
  generation:
    vllm_cfg:
      tensor_parallel_size: 8
logger:
  log_dir: logs/grpo-qwen3-235b-16n4g
  wandb:
    name: grpo-qwen3-235b-16n4g
cluster:
  gpus_per_node: 4
  num_nodes: 16
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Recipe overlay: inherits ./grpo-qwen3-235b-32n8g-async-1off.yaml and
# retargets it to a cluster of 32 nodes with 4 GPUs per node.
# NOTE(review): the nesting below was reconstructed -- the reviewed copy had no
# indentation, which would make every key top-level (parents null and
# `enabled` detached from `sequence_packing`). Confirm the hierarchy against
# the base config.
defaults: ./grpo-qwen3-235b-32n8g-async-1off.yaml
checkpointing:
  checkpoint_dir: results/grpo-qwen3-235b-32n4g-async-1off
policy:
  sequence_packing:
    enabled: false
  megatron_cfg:
    pipeline_model_parallel_size: 4
    # First/last stage sizes are pinned explicitly; presumably chosen so the
    # model's layer count splits across the 4 pipeline stages -- TODO confirm.
    num_layers_in_first_pipeline_stage: 23
    num_layers_in_last_pipeline_stage: 23
  generation:
    vllm_cfg:
      tensor_parallel_size: 8
logger:
  log_dir: logs/grpo-qwen3-235b-32n4g-async-1off
  wandb:
    name: grpo-qwen3-235b-32n4g-async-1off
cluster:
  gpus_per_node: 4
  num_nodes: 32
45 changes: 45 additions & 0 deletions tests/test_suites/llm/performance/grpo-deepseek-v3-32n4g.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# GRPO performance run for DeepSeek-V3 on 32 nodes: launches
# examples/run_grpo_math.py, dumps the TensorBoard logs to JSON, and checks
# train/token_mult_prob_error once the final step has been logged.
#
# CONFIG_PATH, LOG_DIR, EXP_NAME, CKPT_DIR, RUN_LOG, JSON_METRICS, PROJECT_ROOT
# and exit_if_max_steps_reached are not defined in this script; presumably
# they come from common.env -- confirm there.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
source "$SCRIPT_DIR/common.env"
# disable NVLS to avoid OOM issue
export NCCL_NVLS_ENABLE=0
# allow user to pass an existing HF checkpoint path based on instruction in https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md
export MODEL_NAME=${NRL_DEEPSEEK_V3_HF_CKPT:-"unsloth/DeepSeek-V3-0324-BF16"}

# ===== BEGIN CONFIG =====
NUM_NODES=32
STEPS_PER_RUN=10
MAX_STEPS=10
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=240
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd "$PROJECT_ROOT"
# "$@" is quoted so extra overrides supplied by the caller reach the trainer
# exactly as given (no word splitting or globbing of their values).
uv run examples/run_grpo_math.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps="$MAX_STEPS" \
    policy.model_name="$MODEL_NAME" \
    policy.tokenizer.name="$MODEL_NAME" \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    # The final-step key follows MAX_STEPS so the check stays valid if the
    # step count in the config section is changed.
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'mean(data["train/token_mult_prob_error"]) < 1.1' \
        "data[\"train/token_mult_prob_error\"][\"$MAX_STEPS\"] < 1.1"
fi
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# GRPO performance run for DeepSeek-V3 on 64 nodes: launches
# examples/run_grpo_math.py, dumps the TensorBoard logs to JSON, and checks
# train/token_mult_prob_error once the final step has been logged.
#
# CONFIG_PATH, LOG_DIR, EXP_NAME, CKPT_DIR, RUN_LOG, JSON_METRICS, PROJECT_ROOT
# and exit_if_max_steps_reached are not defined in this script; presumably
# they come from common.env -- confirm there.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
source "$SCRIPT_DIR/common.env"
# disable NVLS to avoid OOM issue
export NCCL_NVLS_ENABLE=0
# allow user to pass an existing HF checkpoint path based on instruction in https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md
export MODEL_NAME=${NRL_DEEPSEEK_V3_HF_CKPT:-"unsloth/DeepSeek-V3-0324-BF16"}

# ===== BEGIN CONFIG =====
NUM_NODES=64
STEPS_PER_RUN=10
MAX_STEPS=10
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=240
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd "$PROJECT_ROOT"
# "$@" is quoted so extra overrides supplied by the caller reach the trainer
# exactly as given (no word splitting or globbing of their values).
uv run examples/run_grpo_math.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps="$MAX_STEPS" \
    policy.model_name="$MODEL_NAME" \
    policy.tokenizer.name="$MODEL_NAME" \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    # The final-step key follows MAX_STEPS so the check stays valid if the
    # step count in the config section is changed.
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'mean(data["train/token_mult_prob_error"]) < 1.1' \
        "data[\"train/token_mult_prob_error\"][\"$MAX_STEPS\"] < 1.1"
fi
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# GRPO performance run for DeepSeek-V3 on 64 nodes: launches
# examples/run_grpo_math.py, dumps the TensorBoard logs to JSON, and checks
# train/token_mult_prob_error once the final step has been logged.
#
# CONFIG_PATH, LOG_DIR, EXP_NAME, CKPT_DIR, RUN_LOG, JSON_METRICS, PROJECT_ROOT
# and exit_if_max_steps_reached are not defined in this script; presumably
# they come from common.env -- confirm there.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
source "$SCRIPT_DIR/common.env"
# disable NVLS to avoid OOM issue
export NCCL_NVLS_ENABLE=0
# allow user to pass an existing HF checkpoint path based on instruction in https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md
export MODEL_NAME=${NRL_DEEPSEEK_V3_HF_CKPT:-"unsloth/DeepSeek-V3-0324-BF16"}

# ===== BEGIN CONFIG =====
NUM_NODES=64
STEPS_PER_RUN=10
MAX_STEPS=10
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=240
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd "$PROJECT_ROOT"
# "$@" is quoted so extra overrides supplied by the caller reach the trainer
# exactly as given (no word splitting or globbing of their values).
uv run examples/run_grpo_math.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps="$MAX_STEPS" \
    policy.model_name="$MODEL_NAME" \
    policy.tokenizer.name="$MODEL_NAME" \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    # The final-step key follows MAX_STEPS so the check stays valid if the
    # step count in the config section is changed.
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'mean(data["train/token_mult_prob_error"]) < 1.1' \
        "data[\"train/token_mult_prob_error\"][\"$MAX_STEPS\"] < 1.1"
fi
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash
# GRPO performance run on 2 nodes: launches examples/run_grpo_math.py, dumps
# the TensorBoard logs to JSON, and checks train/token_mult_prob_error once
# the final step has been logged.
#
# CONFIG_PATH, LOG_DIR, EXP_NAME, CKPT_DIR, RUN_LOG, JSON_METRICS, PROJECT_ROOT
# and exit_if_max_steps_reached are not defined in this script; presumably
# they come from common.env -- confirm there.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
source "$SCRIPT_DIR/common.env"

# ===== BEGIN CONFIG =====
NUM_NODES=2
STEPS_PER_RUN=10
MAX_STEPS=10
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=100
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd "$PROJECT_ROOT"
# "$@" is quoted so extra overrides supplied by the caller reach the trainer
# exactly as given (no word splitting or globbing of their values).
uv run examples/run_grpo_math.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps="$MAX_STEPS" \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    # The final-step key follows MAX_STEPS so the check stays valid if the
    # step count in the config section is changed.
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'mean(data["train/token_mult_prob_error"]) < 1.1' \
        "data[\"train/token_mult_prob_error\"][\"$MAX_STEPS\"] < 1.1"
fi
40 changes: 40 additions & 0 deletions tests/test_suites/llm/performance/grpo-qwen3-235b-16n4g.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
# GRPO performance run on 16 nodes: launches examples/run_grpo_math.py, dumps
# the TensorBoard logs to JSON, and checks train/token_mult_prob_error once
# the final step has been logged.
#
# CONFIG_PATH, LOG_DIR, EXP_NAME, CKPT_DIR, RUN_LOG, JSON_METRICS, PROJECT_ROOT
# and exit_if_max_steps_reached are not defined in this script; presumably
# they come from common.env -- confirm there.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
source "$SCRIPT_DIR/common.env"
# disable NVLS to avoid OOM issue
export NCCL_NVLS_ENABLE=0

# ===== BEGIN CONFIG =====
NUM_NODES=16
STEPS_PER_RUN=10
MAX_STEPS=10
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=100
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd "$PROJECT_ROOT"
# "$@" is quoted so extra overrides supplied by the caller reach the trainer
# exactly as given (no word splitting or globbing of their values).
# logger.tensorboard_enabled=True is passed explicitly because the steps below
# read TensorBoard logs from LOG_DIR; whether the recipe already enables it
# cannot be seen from this script.
uv run examples/run_grpo_math.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps="$MAX_STEPS" \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    # The final-step key follows MAX_STEPS so the check stays valid if the
    # step count in the config section is changed.
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'mean(data["train/token_mult_prob_error"]) < 1.1' \
        "data[\"train/token_mult_prob_error\"][\"$MAX_STEPS\"] < 1.1"
fi
Loading
Loading