Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion examples/configs/dpo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,6 @@ logger:
tensorboard_enabled: false
mlflow_enabled: false # Disable MLflow logging
monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard
num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
wandb:
project: "dpo-dev"
name: "dpo"
Expand Down
2 changes: 2 additions & 0 deletions examples/configs/grpo_math_1B.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ policy:
use_custom_fsdp: false
data_parallel_sharding_strategy: "optim_grads_params"

env_vars: null

# See docs/design-docs/sequence-packing-and-dynamic-batching.md
# for more details on dynamic batching and sequence packing.
dynamic_batching:
Expand Down
5 changes: 4 additions & 1 deletion examples/configs/grpo_math_qwen30ba3b_megatron.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,10 @@ policy:
lr_decay_iters: null
lr_warmup_iters: 13
lr_warmup_init: 3.0e-8


env_vars:
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"

generation:
backend: "vllm"
max_new_tokens: ${policy.max_total_sequence_length}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ logger:
tensorboard_enabled: true
mlflow_enabled: false
monitor_gpus: true
num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
wandb:
project: nemo-rl
name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ checkpointing:
metric_name: "val_loss"
higher_is_better: false
keep_top_k: 3
save_period: 10000
save_period: 50
checkpoint_must_save_by: null

policy:
Expand All @@ -29,14 +29,14 @@ policy:
name: ${policy.model_name}
train_global_batch_size: 256
train_micro_batch_size: 1
max_total_sequence_length: 2048
max_total_sequence_length: 8192
precision: "bfloat16"
dtensor_cfg:
enabled: true
cpu_offload: False
sequence_parallel: false
activation_checkpointing: false
tensor_parallel_size: 1
tensor_parallel_size: 4
context_parallel_size: 1
custom_parallel_plan: null

Expand Down Expand Up @@ -82,10 +82,9 @@ logger:
tensorboard_enabled: true
mlflow_enabled: false
monitor_gpus: true
num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
wandb:
project: nemo-rl
name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1
name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4
tensorboard: {}
gpu_monitoring:
collection_interval: 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ checkpointing:
metric_name: "val_loss"
higher_is_better: false
keep_top_k: 3
save_period: 10000
save_period: 50
checkpoint_must_save_by: null

policy:
Expand All @@ -29,7 +29,7 @@ policy:
name: ${policy.model_name}
train_global_batch_size: 256
train_micro_batch_size: 1
max_total_sequence_length: 2048
max_total_sequence_length: 8192
precision: "bfloat16"
dtensor_cfg:
enabled: false
Expand All @@ -49,7 +49,7 @@ policy:
enabled: true
empty_unused_memory_level: 1
activation_checkpointing: false
tensor_model_parallel_size: 2
tensor_model_parallel_size: 4
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 1
Expand Down Expand Up @@ -115,10 +115,9 @@ logger:
tensorboard_enabled: true
mlflow_enabled: false
monitor_gpus: true
num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
wandb:
project: nemo-rl
name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1
name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-megatron
tensorboard: {}
gpu_monitoring:
collection_interval: 10
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ logger:
tensorboard_enabled: true
mlflow_enabled: false
monitor_gpus: true
num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
wandb:
project: nemo-rl
name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ logger:
tensorboard_enabled: true
mlflow_enabled: false
monitor_gpus: true
num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
wandb:
project: nemo-rl
name: dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# GRPO recipe: meta-llama/Llama-3.2-1B-Instruct on 1 node x 8 GPUs, trained
# with the Megatron backend (megatron_cfg enabled, dtensor_cfg disabled) and
# vLLM generation colocated with training.
#
# NOTE(review): the nesting below was reconstructed from the `${policy.*}`
# interpolation paths used in this file; confirm placement of `scheduler`,
# `distributed_data_parallel_config`, `generation` and `colocated` against
# the repository copy.
grpo:
  num_prompts_per_step: 32
  num_generations_per_prompt: 16
  max_rollout_turns: 1
  max_num_steps: 500
  normalize_rewards: true
  use_leave_one_out_baseline: true
  val_period: 10
  val_at_start: false
  max_val_samples: 256
  val_batch_size: 256
  seed: 42

loss_fn:
  reference_policy_kl_penalty: 0.01
  ratio_clip_min: 0.2
  ratio_clip_max: 0.2
  ratio_clip_c: null
  use_on_policy_kl_approximation: false
  use_importance_sampling_correction: false
  token_level_loss: true

checkpointing:
  enabled: false
  checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-megatron
  metric_name: val_reward
  higher_is_better: true
  keep_top_k: 3
  save_period: 100
  checkpoint_must_save_by: null

policy:
  model_name: meta-llama/Llama-3.2-1B-Instruct
  tokenizer:
    name: meta-llama/Llama-3.2-1B-Instruct
  train_global_batch_size: 512
  train_micro_batch_size: 4
  generation_batch_size: 32
  logprob_batch_size: 4
  max_total_sequence_length: 512
  precision: bfloat16
  # NOTE(review): presumably null because the optimizer is configured under
  # megatron_cfg.optimizer below -- confirm.
  optimizer: null

  megatron_cfg:
    enabled: true
    empty_unused_memory_level: 0
    activation_checkpointing: false
    tensor_model_parallel_size: 1
    expert_tensor_parallel_size: 1
    expert_model_parallel_size: 1
    pipeline_model_parallel_size: 1
    num_layers_in_first_pipeline_stage: null
    num_layers_in_last_pipeline_stage: null
    context_parallel_size: 1
    pipeline_dtype: ${policy.precision}
    sequence_parallel: false
    freeze_moe_router: true
    moe_router_dtype: "fp64"
    moe_router_load_balancing_type: "none"  # "seq_aux_loss" causes logprob error divergence for grpo
    moe_router_bias_update_rate: 0.0  # by default, disable bias updates for grpo
    # gives ~20% training perf speedup with sequence packing
    apply_rope_fusion: true

    optimizer:
      optimizer: "adam"
      lr: 5.0e-6
      min_lr: 5.0e-7
      weight_decay: 0.01
      bf16: true
      fp16: false
      params_dtype: "float32"

      # adam
      adam_beta1: 0.9
      adam_beta2: 0.999
      # Written with an explicit mantissa dot so YAML 1.1 loaders (e.g. PyYAML)
      # resolve it as a float rather than the string "1e-8".
      adam_eps: 1.0e-8

      # sgd
      sgd_momentum: 0.9

      # distributed optimizer
      use_distributed_optimizer: true
      use_precision_aware_optimizer: true

      clip_grad: ${policy.max_grad_norm}

    scheduler:
      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      weight_decay_incr_style: "constant"
      lr_decay_style: "constant"
      lr_decay_iters: null
      lr_warmup_iters: 50
      lr_warmup_init: 5.0e-7

    distributed_data_parallel_config:
      grad_reduce_in_fp32: false
      overlap_grad_reduce: true
      overlap_param_gather: true
      average_in_collective: true
      use_custom_fsdp: false
      data_parallel_sharding_strategy: "optim_grads_params"

  dtensor_cfg:
    enabled: false
  dynamic_batching:
    enabled: false
  sequence_packing:
    enabled: true
    # Per-microbatch token budgets: sequence length times the matching batch size.
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
    algorithm: "modified_first_fit_decreasing"
    sequence_length_round: 64
  make_sequence_length_divisible_by: 1
  max_grad_norm: 1

  generation:
    backend: vllm
    max_new_tokens: 512
    temperature: 1
    top_p: 1
    top_k: null
    stop_token_ids:
      - 128009
    stop_strings: null
    vllm_cfg:
      async_engine: false
      precision: ${policy.precision}
      tensor_parallel_size: 1
      pipeline_parallel_size: 1
      gpu_memory_utilization: 0.6
      max_model_len: 512
      enforce_eager: false
    colocated:
      enabled: true
      resources:
        gpus_per_node: null
        num_nodes: null

data:
  max_input_seq_length: 512
  prompt_file: examples/prompts/cot.txt
  system_prompt_file: null
  dataset_name: OpenMathInstruct-2
  shuffle: true

env:
  math:
    num_workers: 8

logger:
  log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-megatron
  num_val_samples_to_print: 0
  wandb_enabled: true
  tensorboard_enabled: true
  mlflow_enabled: false
  monitor_gpus: true
  wandb:
    project: nemo-rl
    name: grpo-llama3.2-1b-instruct-1n8g-megatron
  tensorboard: {}
  gpu_monitoring:
    collection_interval: 10
    flush_interval: 10

cluster:
  gpus_per_node: 8
  num_nodes: 1
Loading