# GRPO config: DAPO-17k (BytedTsinghua) math RL on Qwen/Qwen3-4B-Instruct-2507
# File: examples/penguin/grpo_dapo17k_bytedtsinghua_qwen3_4binstruct_nf.yaml
# GRPO rollout/training-loop settings.
grpo:
  max_num_epochs: 1
  # 64 prompts x 16 generations = 1024 rollouts per step (mirrored by
  # policy.train_global_batch_size via the ${mul:...} interpolation below).
  num_prompts_per_step: 64
  num_generations_per_prompt: 16
  max_rollout_turns: 1 # for multi-turn rollouts. Math Environments just have 1 turn (answering the question)
  # Effectively "run until max_num_epochs"; this cap is never the binding limit.
  max_num_steps: 1000000
  normalize_rewards: true
  use_leave_one_out_baseline: true
  # Run validation every 10 steps, plus once before training starts.
  val_period: 10
  val_at_start: true
  overlong_filtering: false
  max_val_samples: null # inferred from size of val dataset. for multi evals, repeat val ds via `num_repeats` in `ng_prepare_data`.
  val_batch_size: null
  seed: 42
  use_dynamic_sampling: false
  dynamic_sampling_max_gen_batches: 10
  batch_multiplier: 1
  # Disabled: DAPO-style overlong-response penalty shaping.
  reward_shaping:
    enabled: false
    overlong_buffer_length: 128
    overlong_buffer_penalty: 1
    max_response_length: ${policy.max_total_sequence_length}
  # Disabled: linear rescaling of rewards from [source_min, source_max]
  # to [target_min, target_max] (identity mapping as configured).
  reward_scaling:
    enabled: false
    source_min: 0.0
    source_max: 1.0
    target_min: 0.0
    target_max: 1.0
  # Safe to skip since loss_fn.reference_policy_kl_penalty is 0 below.
  skip_reference_policy_logprobs_calculation: true

# GRPO loss configuration.
loss_fn:
  # KL penalty disabled (0); pairs with
  # grpo.skip_reference_policy_logprobs_calculation above.
  reference_policy_kl_penalty: 0
  reference_policy_kl_type: "k3"
  kl_input_clamp_value: 20.0
  kl_output_clamp_value: 10.0
  # Symmetric PPO-style ratio clipping: [1 - 0.2, 1 + 0.2] presumably —
  # TODO(review): confirm the min/max convention against the loss implementation.
  ratio_clip_min: 0.2
  ratio_clip_max: 0.2
  ratio_clip_c: null
  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
  use_on_policy_kl_approximation: false
  truncated_importance_sampling_ratio: null
  use_importance_sampling_correction: false
  token_level_loss: true

# Checkpoint saving/retention policy.
checkpointing:
  enabled: true
  checkpoint_dir: "results/grpo"
  # Keep the 3 best checkpoints ranked by validation accuracy (higher = better).
  metric_name: "val:accuracy"
  higher_is_better: true
  keep_top_k: 3
  # Save every step.
  save_period: 1
  checkpoint_must_save_by: null

# Policy model, batching, and training-backend configuration.
policy:
  model_name: "Qwen/Qwen3-4B-Instruct-2507"
  tokenizer:
    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
    chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true
  hf_config_overrides: {}
  # Equals grpo.num_prompts_per_step * grpo.num_generations_per_prompt (1024).
  train_global_batch_size: ${mul:${grpo.num_prompts_per_step}, ${grpo.num_generations_per_prompt}} # Match the total rollouts per step
  train_micro_batch_size: 1
  logprob_batch_size: 1
  generation_batch_size: 32 # Only used when generating using HF backend
  # 32k context; also used as generation.max_new_tokens and vllm max_model_len
  # via interpolation below.
  max_total_sequence_length: 32768
  precision: "bfloat16"
  logprob_chunk_size: 1024

dtensor_cfg:
_v2: false
enabled: true
cpu_offload: False
sequence_parallel: false
activation_checkpointing: true
tensor_parallel_size: 2
context_parallel_size: 1
custom_parallel_plan: null
clear_cache_every_n_steps: null

megatron_cfg:
enabled: false
# We might want to consider setting this value higher (e.g. to 1) and raising the vllm generation max mem utilization
empty_unused_memory_level: 0
activation_checkpointing: true
converter_type: "Qwen2ForCausalLM" # Apparently this is comptible with Qwen 3 dense models.
tensor_model_parallel_size: 1
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 1
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
context_parallel_size: 1
pipeline_dtype: ${policy.precision}
sequence_parallel: false
freeze_moe_router: true
moe_router_dtype: "fp64"
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
#gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True
defer_fp32_logits: true
moe_permute_fusion: false
bias_activation_fusion: True

optimizer:
optimizer: "adam"
lr: 5.0e-6
min_lr: 5.0e-7
weight_decay: 0.01
bf16: true
fp16: false
params_dtype: "float32"

#adam
adam_beta1: 0.9
adam_beta2: 0.999
adam_eps: 1e-8

#sgd
sgd_momentum: 0.9

#distributed optimizer
use_distributed_optimizer: true
use_precision_aware_optimizer: true

# optimizer cpu offload
optimizer_cpu_offload: false
optimizer_offload_fraction: 0.0

clip_grad: ${policy.max_grad_norm}

scheduler:
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_decay_iters: null
lr_warmup_iters: 13
lr_warmup_init: 5.0e-7

distributed_data_parallel_config:
grad_reduce_in_fp32: false
overlap_grad_reduce: true
overlap_param_gather: true
use_custom_fsdp: false
data_parallel_sharding_strategy: "optim_grads_params"

env_vars: null

# See docs/design-docs/sequence-packing-and-dynamic-batching.md
# for more details on dynamic batching and sequence packing.
dynamic_batching:
enabled: False
train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
sequence_length_round: 64

sequence_packing:
enabled: false
train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
algorithm: "modified_first_fit_decreasing"
sequence_length_round: 64

# makes the training sequence length divisible by the tensor parallel size
# this is useful for sequence parallel training
make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
max_grad_norm: 1.0

optimizer:
name: "torch.optim.AdamW"
kwargs:
lr: 1.0e-6
weight_decay: 0.01
betas: [0.9, 0.999]
eps: 1e-8
# when using Dtensor, we need to set foreach
# and fused to False
foreach: False
fused: False

scheduler:
- name: "torch.optim.lr_scheduler.ConstantLR"
kwargs:
factor: 1.0
total_iters: 10000000000
- milestones: []

generation:
backend: "vllm"
max_new_tokens: ${policy.max_total_sequence_length}
temperature: 1.0
top_p: 1.0
top_k: null
stop_token_ids: null
stop_strings: null
vllm_cfg:
async_engine: true
precision: ${policy.precision}
tensor_parallel_size: 1
pipeline_parallel_size: 1
enable_expert_parallel: false
expert_parallel_size: 1
gpu_memory_utilization: 0.8
max_model_len: ${policy.max_total_sequence_length}
enforce_eager: false
use_deep_gemm: False
num_last_layers_in_bf16: 0
num_first_layers_in_bf16: 0
expose_http_server: true
skip_tokenizer_init: false
http_server_serving_chat_kwargs:
# This is the tool parser for Qwen 3 4B Instruct. This needs to be changed for other models.
enable_auto_tools: true
tool_parser: hermes
# Enable the appropriate reasoning parser here. Since this model is an instruct model, we comment it out.
# reasoning_parser: deepseek_r1
vllm_kwargs:
compilation_config:
# when enforce_eager is False, set ++policy.generation.vllm_kwargs.compilation_config.use_inductor=False for better accuracy,
# with the flag, vllm will use the custom CUDA kernels instead of the Triton kernels generated by torch.compile
# for more details, see convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998
use_inductor: False
colocated:
# true: generation shares training GPUs
# false: uses dedicated generation resources
enabled: true
# only relevant when enabled is false
resources:
gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1
num_nodes: null # Decides number of nodes to be dedicated to generation

# Train/validation datasets (JSONL, DAPO-17k from BytedTsinghua).
data:
  train_jsonl_fpath: 3rdparty/Penguin-workspace/Penguin/data/bytedtsinghua_dapo17k/train.jsonl
  validation_jsonl_fpath: 3rdparty/Penguin-workspace/Penguin/data/bytedtsinghua_dapo17k/validation.jsonl
  shuffle: true
  # 0 = load data in the main process (no worker subprocesses).
  num_workers: 0

# Environment configuration (Penguin-based math environment).
env:
  should_use_penguin: true
  should_log_penguin_responses: true # If you have low logging storage, set this to false
  penguin: # This is passed into Penguin as the initial_global_config_dict
    config_paths:
      - responses_api_models/vllm_model/configs/vllm_model_for_training.yaml # Required! And it must be *for_training
      - resources_servers/library_judge_math/configs/library_judge_math.yaml
    # Overrides for the math judge: point it at the policy model but keep
    # the LLM judge itself disabled (rule-based checking only).
    library_judge_math:
      resources_servers:
        library_judge_math:
          judge_model_server:
            name: policy_model
          should_use_judge: false

# Logging backends — only wandb (plus GPU monitoring) is enabled.
logger:
  log_dir: "logs" # Base directory for all logs
  num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
  wandb_enabled: true
  tensorboard_enabled: false
  mlflow_enabled: false # Disable MLflow logging
  swanlab_enabled: false
  monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard
  wandb:
    project: "grpo-dev"
    name: "grpo-dev-logger"
  tensorboard: {}
  mlflow:
    experiment_name: "grpo-dev"
    run_name: "grpo-dev-logger"
  gpu_monitoring:
    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)

# Ray cluster shape: 8 nodes x 8 GPUs = 64 GPUs total.
cluster:
  gpus_per_node: 8
  num_nodes: 8