Commit 3350ba2

Authored by guyueh1, youngeunkwon0405, ZhiyuLi-Nvidia, terrykong, ashors1
feat: Onboard perf recipes in tests (#1322)
Signed-off-by: Guyue Huang <[email protected]>
Signed-off-by: Youngeun Kwon <[email protected]>
Signed-off-by: Zhiyu Li <[email protected]>
Signed-off-by: Zhiyu Li <[email protected]>
Signed-off-by: Terry Kong <[email protected]>
Signed-off-by: ashors1 <[email protected]>
Signed-off-by: Yuki Huang <[email protected]>
Signed-off-by: Anna Shors <[email protected]>
Signed-off-by: Yi-Fu Wu <[email protected]>
Signed-off-by: Parth Chadha <[email protected]>
Signed-off-by: Guyue Huang <[email protected]>
Co-authored-by: Youngeun Kwon <[email protected]>
Co-authored-by: Zhiyu Li <[email protected]>
Co-authored-by: Terry Kong <[email protected]>
Co-authored-by: Anna Shors <[email protected]>
Co-authored-by: Yuki Huang <[email protected]>
Co-authored-by: Yi-Fu Wu <[email protected]>
Co-authored-by: Parth Chadha <[email protected]>
Co-authored-by: Terry Kong <[email protected]>
1 parent (ba68386), commit 3350ba2

23 files changed: +895 −1 lines changed
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
defaults: ../../../grpo_math_1B.yaml
grpo:
  num_prompts_per_step: 32
  num_generations_per_prompt: 16
  max_num_steps: 500
  val_batch_size: 5
  max_val_samples: 16
loss_fn:
  use_importance_sampling_correction: true
checkpointing:
  checkpoint_dir: results/grpo-deepseek-v3-32n8g
policy:
  model_name: unsloth/DeepSeek-V3-0324-BF16
  tokenizer:
    name: unsloth/DeepSeek-V3-0324-BF16
  train_micro_batch_size: 1
  logprob_batch_size: 1
  max_total_sequence_length: 1536
  make_sequence_length_divisible_by: 1
  dtensor_cfg:
    enabled: false
  megatron_cfg:
    enabled: true
    empty_unused_memory_level: 1
    converter_type: LlamaForCausalLM
    pipeline_model_parallel_size: 16
    expert_model_parallel_size: 16
    activation_checkpointing: true
    num_layers_in_first_pipeline_stage: 3
    num_layers_in_last_pipeline_stage: 2
    apply_rope_fusion: false
    moe_permute_fusion: true
    defer_fp32_logits: true
    optimizer:
      lr: 5.0e-07
      min_lr: 5.0e-08
      weight_decay: 0.0
      use_precision_aware_optimizer: true
    scheduler:
      lr_warmup_iters: 2
      lr_warmup_init: 5.0e-08
    fp8_cfg:
      enabled: false
  generation:
    vllm_cfg:
      tensor_parallel_size: 32
      async_engine: true
logger:
  log_dir: logs/grpo-deepseek-v3-32n8g
  wandb_enabled: true
  tensorboard_enabled: true
  wandb:
    project: nemo-rl
    name: grpo-deepseek-v3-32n8g
cluster:
  gpus_per_node: 8
  num_nodes: 32
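Each recipe above starts from the shared base config referenced by its `defaults:` key (here `grpo_math_1B.yaml`) and overrides only the keys it lists. The sketch below shows one plausible way such a recipe could be resolved: load the base, then recursively overlay the recipe's values. This is a minimal sketch under that assumption; the helper names (`deep_merge`, `load_recipe`) and the example path are hypothetical and NeMo RL's actual loader may behave differently.

    # Hedged sketch: resolve a recipe's `defaults:` chain with a plain recursive merge.
    import yaml
    from pathlib import Path

    def deep_merge(base: dict, override: dict) -> dict:
        """Return a new dict where `override` wins, merging nested dicts key by key."""
        merged = dict(base)
        for key, value in override.items():
            if isinstance(value, dict) and isinstance(merged.get(key), dict):
                merged[key] = deep_merge(merged[key], value)
            else:
                merged[key] = value
        return merged

    def load_recipe(path: str) -> dict:
        """Hypothetical helper: follow the `defaults:` reference relative to the recipe file."""
        recipe_path = Path(path)
        cfg = yaml.safe_load(recipe_path.read_text())
        base_ref = cfg.pop("defaults", None)
        if base_ref is None:
            return cfg
        base = load_recipe(str((recipe_path.parent / base_ref).resolve()))
        return deep_merge(base, cfg)

    # Example (hypothetical path): the resolved DeepSeek-V3 recipe keeps every key
    # from grpo_math_1B.yaml except the overrides listed in the file above.
    # cfg = load_recipe("grpo-deepseek-v3-32n8g.yaml")
    # print(cfg["policy"]["megatron_cfg"]["pipeline_model_parallel_size"])  # 16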
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
defaults: ./grpo-deepseek-v3-32n8g.yaml
grpo:
  async_grpo:
    enabled: true
    max_trajectory_age_steps: 1
    in_flight_weight_updates: true
loss_fn:
  use_importance_sampling_correction: true
checkpointing:
  checkpoint_dir: results/grpo-deepseek-v3-64n8g-async-1off
policy:
  logprob_batch_size: 2
  megatron_cfg:
    tensor_model_parallel_size: 1
    pipeline_model_parallel_size: 16
    expert_model_parallel_size: 16
  generation:
    colocated:
      enabled: false
      resources:
        num_nodes: 32
        gpus_per_node: 8
    vllm_cfg:
      tensor_parallel_size: 32
      gpu_memory_utilization: 0.8
      async_engine: true
logger:
  log_dir: logs/grpo-deepseek-v3-64n8g-async-32T32G-1off
  wandb:
    name: grpo-deepseek-v3-64n8g-async-32T32G-1off
cluster:
  gpus_per_node: 8
  num_nodes: 64
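The async variant layers on top of the 32-node synchronous recipe: it disables colocated generation (`colocated.enabled: false`), gives generation its own 32-node pool, and doubles `cluster.num_nodes` to 64. The "32T32G" suffix in the run name suggests a 32 training / 32 generation node split; the small check below makes that arithmetic explicit, assuming generation nodes are carved out of `cluster.num_nodes` when colocation is disabled (an assumption about the accounting, not confirmed by this diff).

    # Hedged sanity check of the node budget implied by the async recipe above.
    cluster_nodes = 64      # cluster.num_nodes
    generation_nodes = 32   # policy.generation.colocated.resources.num_nodes
    training_nodes = cluster_nodes - generation_nodes  # assumed accounting
    assert training_nodes == 32  # matches the 32-node synchronous base recipe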
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
defaults: ./grpo-llama3.1-8b-instruct-2n8g.yaml
grpo:
  async_grpo:
    enabled: true
    max_trajectory_age_steps: 1
    in_flight_weight_updates: true
loss_fn:
  use_importance_sampling_correction: true
checkpointing:
  checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-async-1off
policy:
  generation:
    colocated:
      enabled: false
      resources:
        num_nodes: 1
        gpus_per_node: 8
    vllm_cfg:
      async_engine: true
      gpu_memory_utilization: 0.8
logger:
  log_dir: logs/grpo-llama3.1-8b-instruct-2n8g-1T1G-async-1off
  wandb:
    name: grpo-llama3.1-8b-instruct-2n8g-1T1G-async-1off
cluster:
  gpus_per_node: 8
  num_nodes: 2
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
defaults: ../../../grpo_math_1B.yaml
grpo:
  num_prompts_per_step: 64
  num_generations_per_prompt: 32
  max_num_steps: 500
loss_fn:
  use_importance_sampling_correction: true
checkpointing:
  checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g
policy:
  model_name: meta-llama/Llama-3.1-8B-Instruct
  tokenizer:
    name: meta-llama/Llama-3.1-8B-Instruct
  train_micro_batch_size: 1
  logprob_batch_size: 2
  max_total_sequence_length: 4096
  make_sequence_length_divisible_by: 1
  dtensor_cfg:
    enabled: false
  megatron_cfg:
    enabled: true
    empty_unused_memory_level: 1
    converter_type: LlamaForCausalLM
    pipeline_model_parallel_size: 2
    activation_checkpointing: true
    defer_fp32_logits: true
    optimizer:
      lr: 5.0e-07
      min_lr: 5.0e-08
      weight_decay: 0.0
      use_precision_aware_optimizer: true
    scheduler:
      lr_warmup_iters: 2
      lr_warmup_init: 5.0e-08
    fp8_cfg:
      enabled: false
  generation:
    max_new_tokens: 4096
    stop_token_ids:
      - 128009
    vllm_cfg:
      max_model_len: 4096
data:
  max_input_seq_length: 4096
logger:
  log_dir: logs/grpo-llama3.1-8b-instruct-2n8g
  wandb_enabled: true
  tensorboard_enabled: true
  wandb:
    project: nemo-rl
    name: grpo-llama3.1-8b-instruct-2n8g
cluster:
  gpus_per_node: 8
  num_nodes: 2
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
defaults: ../../../grpo_math_1B.yaml
grpo:
  num_prompts_per_step: 16
  num_generations_per_prompt: 32
  max_num_steps: 500
  val_batch_size: 5
  max_val_samples: 16
loss_fn:
  use_importance_sampling_correction: true
checkpointing:
  checkpoint_dir: results/grpo-qwen3-235b-16n8g
policy:
  model_name: Qwen/Qwen3-235B-A22B
  tokenizer:
    name: Qwen/Qwen3-235B-A22B
  train_micro_batch_size: 1
  logprob_batch_size: 1
  max_total_sequence_length: 8192
  make_sequence_length_divisible_by: 1
  dtensor_cfg:
    enabled: false
  megatron_cfg:
    enabled: true
    empty_unused_memory_level: 1
    converter_type: LlamaForCausalLM
    tensor_model_parallel_size: 2
    sequence_parallel: true
    pipeline_model_parallel_size: 8
    context_parallel_size: 2
    expert_model_parallel_size: 16
    activation_checkpointing: true
    num_layers_in_first_pipeline_stage: 11
    num_layers_in_last_pipeline_stage: 11
    moe_permute_fusion: true
    defer_fp32_logits: true
    optimizer:
      lr: 5.0e-07
      min_lr: 5.0e-08
      weight_decay: 0.0
      use_precision_aware_optimizer: true
    scheduler:
      lr_warmup_iters: 2
      lr_warmup_init: 5.0e-08
    fp8_cfg:
      enabled: false
  generation:
    vllm_cfg:
      tensor_parallel_size: 16
      async_engine: true
logger:
  log_dir: logs/grpo-qwen3-235b-16n8g
  wandb_enabled: true
  tensorboard_enabled: false # to avoid a bug
  wandb:
    project: nemo-rl
    name: grpo-qwen3-235b-16n8g
cluster:
  gpus_per_node: 8
  num_nodes: 16
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
defaults: ./grpo-qwen3-235b-16n8g.yaml
grpo:
  async_grpo:
    enabled: true
    max_trajectory_age_steps: 1
    in_flight_weight_updates: true
loss_fn:
  use_importance_sampling_correction: true
checkpointing:
  checkpoint_dir: results/grpo-qwen3-235b-32n8g-async-1off
policy:
  megatron_cfg:
    tensor_model_parallel_size: 4
    sequence_parallel: true
    context_parallel_size: 1
    pipeline_model_parallel_size: 8
    expert_model_parallel_size: 16
    defer_fp32_logits: false
  generation:
    colocated:
      enabled: false
      resources:
        num_nodes: 16
        gpus_per_node: 8
    vllm_cfg:
      tensor_parallel_size: 8
      gpu_memory_utilization: 0.8
      async_engine: true
logger:
  log_dir: logs/grpo-qwen3-235b-32n8g-16T16G-async-1off
  wandb:
    name: grpo-qwen3-235b-32n8g-16T16G-async-1off
cluster:
  gpus_per_node: 8
  num_nodes: 32
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
defaults: ./grpo-qwen3-30ba3b-4n8g.yaml
grpo:
  async_grpo:
    enabled: true
    max_trajectory_age_steps: 1
    in_flight_weight_updates: true
loss_fn:
  use_importance_sampling_correction: true
checkpointing:
  checkpoint_dir: results/grpo-qwen3-30ba3b-4n8g-async-1off
policy:
  megatron_cfg:
    tensor_model_parallel_size: 2
    pipeline_model_parallel_size: 1
    expert_model_parallel_size: 8
    sequence_parallel: true
  generation:
    colocated:
      enabled: false
      resources:
        num_nodes: 2
        gpus_per_node: 8
    vllm_cfg:
      async_engine: true
      tensor_parallel_size: 4
      gpu_memory_utilization: 0.8
logger:
  log_dir: logs/grpo-qwen3-30ba3b-4n8g-2T2G-async-1off
  wandb:
    name: grpo-qwen3-30ba3b-4n8g-2T2G-async-1off
cluster:
  gpus_per_node: 8
  num_nodes: 4
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
defaults: ../../../grpo_math_1B.yaml
grpo:
  num_prompts_per_step: 64
  num_generations_per_prompt: 32
checkpointing:
  enabled: false
  checkpoint_dir: results/grpo-qwen3-30ba3b-4n8g
policy:
  model_name: Qwen/Qwen3-30B-A3B
  train_micro_batch_size: 1
  max_total_sequence_length: 4096
  dtensor_cfg:
    enabled: false
  optimizer: null
  scheduler: null
  make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
  megatron_cfg:
    enabled: true
    empty_unused_memory_level: 1
    tensor_model_parallel_size: 2
    pipeline_model_parallel_size: 1
    expert_model_parallel_size: 8
    sequence_parallel: true
    optimizer:
      lr: 3.0e-07
      min_lr: 3.0e-08
    scheduler:
      lr_warmup_iters: 50
      lr_warmup_init: 3.0e-08
    env_vars:
      PYTORCH_CUDA_ALLOC_CONF: expandable_segments:False
  generation:
    vllm_cfg:
      tensor_parallel_size: 4
logger:
  log_dir: logs/grpo-qwen3-30ba3b-4n8g
  wandb_enabled: true
  tensorboard_enabled: true
  wandb:
    project: nemo-rl
    name: grpo-qwen3-30ba3b-4n8g
cluster:
  gpus_per_node: 8
  num_nodes: 4
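The `${policy.megatron_cfg.tensor_model_parallel_size}` value above is an interpolation reference: `make_sequence_length_divisible_by` is resolved against the merged config rather than hard-coded, so it tracks the tensor parallel size automatically. The snippet below is a minimal sketch assuming OmegaConf-style interpolation semantics (the `${...}` syntax matches OmegaConf, but treat the choice of library as an assumption); the values are copied from the Qwen3-30B-A3B recipe above.

    # Hedged sketch: resolving a ${...} reference with OmegaConf.
    from omegaconf import OmegaConf

    cfg = OmegaConf.create(
        {
            "policy": {
                "make_sequence_length_divisible_by": "${policy.megatron_cfg.tensor_model_parallel_size}",
                "megatron_cfg": {"tensor_model_parallel_size": 2},
            }
        }
    )
    # Interpolations resolve from the config root at access time.
    print(cfg.policy.make_sequence_length_divisible_by)  # 2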
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
defaults: ../../../grpo_math_1B.yaml
grpo:
  num_prompts_per_step: 64
  num_generations_per_prompt: 32
checkpointing:
  enabled: false
  checkpoint_dir: results/grpo-qwen3-32b-4n8g
policy:
  model_name: Qwen/Qwen3-32B
  train_micro_batch_size: 1
  max_total_sequence_length: 4096
  dtensor_cfg:
    enabled: false
  optimizer: null
  scheduler: null
  make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
  megatron_cfg:
    enabled: true
    empty_unused_memory_level: 1
    tensor_model_parallel_size: 4
    pipeline_model_parallel_size: 4
    sequence_parallel: true
    optimizer:
      lr: 3.0e-07
      min_lr: 3.0e-08
    scheduler:
      lr_warmup_iters: 2
      lr_warmup_init: 3.0e-08
  generation:
    vllm_cfg:
      tensor_parallel_size: 4
logger:
  log_dir: logs/grpo-qwen3-32b-4n8g
  wandb_enabled: true
  tensorboard_enabled: true
  wandb:
    project: nemo-rl
    name: grpo-qwen3-32b-4n8g
cluster:
  gpus_per_node: 8
  num_nodes: 4
