Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 0 additions & 71 deletions examples/configs/distillation_math.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,77 +84,6 @@ policy: &POLICY_BASE
foreach: False
fused: False

megatron_cfg: &MEGATRON_BASE
enabled: false
empty_unused_memory_level: 0
activation_checkpointing: false
converter_type: "Qwen3ForCausalLM"
tensor_model_parallel_size: 2
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 2
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
context_parallel_size: 2
pipeline_dtype: ${policy.precision}
sequence_parallel: false
freeze_moe_router: true
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True
bias_activation_fusion: True
defer_fp32_logits: False
moe_per_layer_logging: False
moe_enable_deepep: false
moe_token_dispatcher_type: "allgather"
moe_shared_expert_overlap: false

optimizer:
optimizer: "adam"
lr: 2.00001e-5
min_lr: 2.0e-5
weight_decay: 0.01
bf16: true
fp16: false
params_dtype: "float32"

#adam
adam_beta1: 0.9
adam_beta2: 0.999
adam_eps: 1e-8

#sgd
sgd_momentum: 0.9

#distributed optimizer
use_distributed_optimizer: true
use_precision_aware_optimizer: true

# optimizer cpu offload
optimizer_cpu_offload: false
optimizer_offload_fraction: 0.0

clip_grad: ${policy.max_grad_norm}

scheduler:
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_decay_iters: 1000
lr_warmup_iters: 10
lr_warmup_init: 2.0e-6

distributed_data_parallel_config:
grad_reduce_in_fp32: false
overlap_grad_reduce: true
overlap_param_gather: true
use_custom_fsdp: false
data_parallel_sharding_strategy: "optim_grads_params"

scheduler:
- name: "torch.optim.lr_scheduler.LinearLR"
kwargs:
Expand Down
3 changes: 3 additions & 0 deletions examples/configs/distillation_math_megatron.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ policy: &POLICY_BASE

make_sequence_length_divisible_by: ${mul:${mul:${.megatron_cfg.tensor_model_parallel_size}, ${.megatron_cfg.context_parallel_size}}, 2}

megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_1p7b_pretrain_config

megatron_cfg: &MEGATRON_BASE
enabled: true
empty_unused_memory_level: 0
Expand Down Expand Up @@ -140,6 +142,7 @@ policy: &POLICY_BASE
teacher:
<<: *POLICY_BASE
model_name: "Qwen/Qwen3-4B"
megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_4b_pretrain_config
megatron_cfg:
<<: *MEGATRON_BASE
context_parallel_size: 2
Expand Down
71 changes: 0 additions & 71 deletions examples/configs/dpo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -106,78 +106,7 @@ policy:
factor: 1.0
total_iters: 10000000000
- milestones: [20]

## ignored since enabled=false, but needed for testing purposes
megatron_cfg:
enabled: false
empty_unused_memory_level: 1
activation_checkpointing: false
tensor_model_parallel_size: 2
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 1
context_parallel_size: 1
pipeline_dtype: ${policy.precision}
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
sequence_parallel: true
freeze_moe_router: false
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "aux_loss"
moe_router_bias_update_rate: 1e-3
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True
# gives ~25% training perf speedup with sequence packing and apply_rope_fusion
bias_activation_fusion: True
defer_fp32_logits: False
moe_per_layer_logging: False
moe_enable_deepep: false
moe_token_dispatcher_type: "allgather"
moe_shared_expert_overlap: false

optimizer:
optimizer: "adam"
lr: 5.0e-6 #4.0e-5
min_lr: 5.0e-6 #4.0e-5
weight_decay: 0.1
bf16: true
fp16: false
params_dtype: "float32"

#adam
adam_beta1: 0.9
adam_beta2: 0.98
adam_eps: 1e-8

#sgd
sgd_momentum: 0.9

#distributed optimizer
use_distributed_optimizer: true
use_precision_aware_optimizer: true

clip_grad: ${policy.max_grad_norm}

# optimizer cpu offload
optimizer_cpu_offload: false
optimizer_offload_fraction: 0.0

scheduler:
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_warmup_iters: 1
lr_warmup_init: 0.00000001

distributed_data_parallel_config:
grad_reduce_in_fp32: false
overlap_grad_reduce: true
overlap_param_gather: true
data_parallel_sharding_strategy: "optim_grads_params"
use_custom_fsdp: false

data:
max_input_seq_length: ${policy.max_total_sequence_length}
shuffle: true
Expand Down
75 changes: 0 additions & 75 deletions examples/configs/grpo_math_1B.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,81 +105,6 @@ policy:
lora_A_init: "xavier" # Initialization method for LoRA A matrix: "xavier" or "uniform"
use_triton: true # Use Triton-optimized kernels for LoRA (faster but requires flash-attn). Disable when tensor_parallel_size > 1

megatron_cfg:
enabled: false
empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
activation_checkpointing: false
converter_type: "Qwen2ForCausalLM"
tensor_model_parallel_size: 1
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 1
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
context_parallel_size: 1
pipeline_dtype: ${policy.precision}
sequence_parallel: false
freeze_moe_router: true
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True
# gives ~25% training perf speedup with sequence packing and apply_rope_fusion
bias_activation_fusion: True
defer_fp32_logits: False
moe_per_layer_logging: False
moe_enable_deepep: false
moe_token_dispatcher_type: "allgather"
moe_shared_expert_overlap: false

optimizer:
optimizer: "adam"
lr: 5.0e-6
min_lr: 5.0e-7
weight_decay: 0.01
bf16: true
fp16: false
params_dtype: "float32"

#adam
adam_beta1: 0.9
adam_beta2: 0.999
adam_eps: 1e-8

#sgd
sgd_momentum: 0.9

#distributed optimizer
use_distributed_optimizer: true
use_precision_aware_optimizer: true

clip_grad: ${policy.max_grad_norm}

# optimizer cpu offload
optimizer_cpu_offload: false
optimizer_offload_fraction: 0.0

scheduler:
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_decay_iters: 1000
lr_warmup_iters: 13
lr_warmup_init: 5.0e-7

distributed_data_parallel_config:
grad_reduce_in_fp32: false
overlap_grad_reduce: true
overlap_param_gather: true
use_custom_fsdp: false
data_parallel_sharding_strategy: "optim_grads_params"

fp8_cfg: null

env_vars: null

# See docs/design-docs/sequence-packing-and-dynamic-batching.md
# for more details on dynamic batching and sequence packing.
Expand Down
68 changes: 1 addition & 67 deletions examples/configs/grpo_math_1B_megatron.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,75 +70,9 @@ policy:
sequence_length_round: 64

max_grad_norm: 1.0
# makes the training sequence length divisible by the tensor parallel size
# this is useful for sequence parallel training
make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}

optimizer: null # remove default FSDP optimizer

megatron_cfg:
enabled: true
empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
activation_checkpointing: false
converter_type: "Qwen2ForCausalLM"
tensor_model_parallel_size: 1
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 1
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
context_parallel_size: 1
pipeline_dtype: ${policy.precision}
sequence_parallel: false
freeze_moe_router: true
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
moe_permute_fusion: false
moe_enable_deepep: false
moe_token_dispatcher_type: "allgather"
moe_shared_expert_overlap: false
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True

optimizer:
optimizer: "adam"
lr: 5.0e-6
min_lr: 5.0e-7
weight_decay: 0.01
bf16: true
fp16: false
params_dtype: "float32"

#adam
adam_beta1: 0.9
adam_beta2: 0.999
adam_eps: 1e-8

#sgd
sgd_momentum: 0.9

#distributed optimizer
use_distributed_optimizer: true
use_precision_aware_optimizer: true

clip_grad: ${policy.max_grad_norm}

scheduler:
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_decay_iters: 1000
lr_warmup_iters: 13
lr_warmup_init: 5.0e-7

distributed_data_parallel_config:
grad_reduce_in_fp32: false
overlap_grad_reduce: true
overlap_param_gather: true
use_custom_fsdp: false
data_parallel_sharding_strategy: "optim_grads_params"
optimizer: null # remove default FSDP optimizer

generation:
backend: "vllm"
Expand Down
2 changes: 2 additions & 0 deletions examples/configs/grpo_math_70B_megatron.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ policy:

scheduler: null # remove default FSDP scheduler

megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_70b_pretrain_config

megatron_cfg:
enabled: true
empty_unused_memory_level: 1
Expand Down
2 changes: 2 additions & 0 deletions examples/configs/grpo_math_8B_megatron.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ policy:

scheduler: null # remove default FSDP scheduler

megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config

megatron_cfg:
enabled: true
empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
Expand Down
2 changes: 1 addition & 1 deletion examples/configs/grpo_math_8B_megatron_fp8.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@ policy:
optimizer:
use_precision_aware_optimizer: false
env_vars:
NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
2 changes: 2 additions & 0 deletions examples/configs/grpo_math_qwen30ba3b_megatron.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ policy:

scheduler: null # remove default FSDP scheduler

megatron_recipe: megatron.bridge.recipes.qwen.qwen3_30b_a3b_finetune_config

megatron_cfg:
enabled: true
empty_unused_memory_level: 1
Expand Down
1 change: 1 addition & 0 deletions examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ policy:
dtensor_cfg:
_v2: false
context_parallel_size: 4
megatron_recipe: megatron.bridge.recipes.qwen.qwen25_7b_finetune_config
megatron_cfg:
tensor_model_parallel_size: 4
pipeline_model_parallel_size: 2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ policy:
${.megatron_cfg.context_parallel_size}}, 2}
megatron_cfg:
enabled: true
megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config
teacher:
model_name: Qwen/Qwen3-32B
dtensor_cfg:
Expand All @@ -30,6 +31,7 @@ teacher:
enabled: false
sequence_packing:
enabled: true
megatron_recipe: megatron.bridge.recipes.qwen.qwen3.qwen3_32b_pretrain_config
megatron_cfg:
enabled: true
tensor_model_parallel_size: 4
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ policy:
enabled: false
make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
optimizer: null
megatron_recipe: megatron.bridge.recipes.llama.llama3.llama31_8b_pretrain_config
megatron_cfg:
enabled: true
tensor_model_parallel_size: 4
Expand Down
Loading
Loading