10 changes: 7 additions & 3 deletions examples/grpo_trainer/run_mistral13b_skyworkrm_hhrlhf.sh
@@ -34,10 +34,14 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
reward_model.enable=True \
reward_model.model.fsdp_config.param_offload=True \
reward_model.model.path=Skywork/Skywork-Reward-Llama-3.1-8B \
reward_model.model.input_tokenizer=mistralai/Mistral-Nemo-Instruct-2407 \
reward_model.micro_batch_size_per_gpu=4 \
reward_model.use_reward_loop=True \
reward_model.rollout.name=vllm \
reward_model.rollout.gpu_memory_utilization=0.8 \
reward_model.rollout.tensor_model_parallel_size=1 \
reward_model.rollout.prompt_length=8192 \
reward_model.rollout.response_length=4096 \
reward_model.num_workers=8 \
algorithm.use_kl_in_reward=False \
trainer.logger='["console","wandb"]' \
trainer.val_before_train=False \
10 changes: 7 additions & 3 deletions examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh
@@ -25,10 +25,14 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat
critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
critic.ppo_micro_batch_size_per_gpu=4 \
reward_model.enable=True \
reward_model.megatron.tensor_model_parallel_size=4 \
reward_model.model.path=deepseek-ai/deepseek-llm-7b-chat \
reward_model.micro_batch_size_per_gpu=4 \
reward_model.param_offload=False \
reward_model.use_reward_loop=True \
reward_model.rollout.name=vllm \
reward_model.rollout.gpu_memory_utilization=0.8 \
reward_model.rollout.tensor_model_parallel_size=4 \
reward_model.rollout.prompt_length=256 \
reward_model.rollout.response_length=128 \
reward_model.num_workers=8 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
10 changes: 7 additions & 3 deletions examples/ppo_trainer/run_qwen2-7b_rm.sh
@@ -55,9 +55,13 @@ python3 -m verl.trainer.main_ppo \
critic.model.fsdp_config.optimizer_offload=False \
reward_model.enable=True \
reward_model.model.path="$HOME/models/FsfairX-LLaMA3-RM-v0.1" \
reward_model.model.use_remove_padding=True \
reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size_per_gpu=32 \
reward_model.use_reward_loop=True \
reward_model.rollout.name=vllm \
reward_model.rollout.gpu_memory_utilization=0.8 \
reward_model.rollout.tensor_model_parallel_size=1 \
reward_model.rollout.prompt_length=2048 \
reward_model.rollout.response_length=1024 \
reward_model.num_workers=8 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
12 changes: 7 additions & 5 deletions examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh
@@ -42,11 +42,13 @@ python3 -m verl.trainer.main_ppo \
critic.model.fsdp_config.optimizer_offload=False \
reward_model.enable=True \
reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \
reward_model.model.use_remove_padding=True \
reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size_per_gpu=32 \
reward_model.use_dynamic_bsz=True \
reward_model.forward_max_token_len_per_gpu=98304 \
reward_model.use_reward_loop=True \
reward_model.rollout.name=vllm \
reward_model.rollout.gpu_memory_utilization=0.8 \
reward_model.rollout.tensor_model_parallel_size=1 \
reward_model.rollout.prompt_length=8192 \
reward_model.rollout.response_length=4096 \
reward_model.num_workers=8 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
@@ -45,12 +45,14 @@ python3 -m verl.trainer.main_ppo \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
reward_model.enable=True \
reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \
reward_model.model.use_remove_padding=True \
reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size_per_gpu=32 \
reward_model.use_dynamic_bsz=True \
reward_model.forward_max_token_len_per_gpu=98304 \
reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \
reward_model.use_reward_loop=True \
reward_model.rollout.name=vllm \
reward_model.rollout.gpu_memory_utilization=0.8 \
reward_model.rollout.tensor_model_parallel_size=1 \
reward_model.rollout.prompt_length=8192 \
reward_model.rollout.response_length=4096 \
reward_model.num_workers=8 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
15 changes: 7 additions & 8 deletions examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh
@@ -55,14 +55,13 @@ python3 -m verl.trainer.main_ppo \
critic.profiler.all_ranks=$PROFILE_RANKS_ALL \
reward_model.enable=True \
reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \
reward_model.model.use_remove_padding=True \
reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size_per_gpu=32 \
reward_model.use_dynamic_bsz=True \
reward_model.forward_max_token_len_per_gpu=98304 \
reward_model.profiler.enable=True \
reward_model.profiler.ranks=$PROFILE_RANKS \
reward_model.profiler.all_ranks=$PROFILE_RANKS_ALL \
reward_model.use_reward_loop=True \
reward_model.rollout.name=vllm \
reward_model.rollout.gpu_memory_utilization=0.8 \
reward_model.rollout.tensor_model_parallel_size=1 \
reward_model.rollout.prompt_length=8192 \
reward_model.rollout.response_length=4096 \
reward_model.num_workers=8 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
63 changes: 63 additions & 0 deletions examples/ppo_trainer/run_qwen2.5-3b_rm_legacy.sh
@@ -0,0 +1,63 @@
# download datasets and models
# python3 examples/data_preprocess/gsm8k.py
# python3 examples/data_preprocess/math_dataset.py
# huggingface-cli download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B
# huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct

gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet

train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=gae \
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=1024 \
data.max_prompt_length=1024 \
data.max_response_length=2048 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.return_raw_chat=True \
actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
critic.model.enable_gradient_checkpointing=True \
critic.ppo_micro_batch_size_per_gpu=32 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
reward_model.enable=True \
reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \
reward_model.use_reward_loop=False \
reward_model.model.use_remove_padding=True \
reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size_per_gpu=32 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='verl_test_qwen25_rm' \
trainer.val_before_train=True \
trainer.experiment_name='legacy_fsdp_reward_model' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=10 \
trainer.total_epochs=15 $@
66 changes: 66 additions & 0 deletions examples/ppo_trainer/run_qwen2.5-3b_rm_reward_loop_colocate.sh
@@ -0,0 +1,66 @@
# download datasets and models
# python3 examples/data_preprocess/gsm8k.py
# python3 examples/data_preprocess/math_dataset.py
# huggingface-cli download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B
# huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct

gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet

train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=gae \
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=1024 \
data.max_prompt_length=1024 \
data.max_response_length=2048 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.return_raw_chat=True \
actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
critic.model.enable_gradient_checkpointing=True \
critic.ppo_micro_batch_size_per_gpu=32 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
reward_model.enable=True \
reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \
reward_model.use_reward_loop=True \
reward_model.rollout.name=vllm \
reward_model.rollout.gpu_memory_utilization=0.8 \
reward_model.rollout.tensor_model_parallel_size=1 \
reward_model.rollout.prompt_length=4096 \
reward_model.rollout.response_length=4096 \
reward_model.num_workers=8 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='verl_test_qwen25_rm' \
trainer.val_before_train=False \
trainer.experiment_name='reward_loop_colocate_reward_model' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=10 \
trainer.total_epochs=15 $@
9 changes: 9 additions & 0 deletions recipe/fapo/README.md
@@ -78,3 +78,12 @@ bash recipe/fapo/run_fapo_32b.sh # 32b fapo model
We implement RewardLoop to enable efficient and flexible reward computation.
The core implementation can be found in `verl/experimental/reward/`.
Refer to [this official document](https://verl.readthedocs.io/en/latest/advance/reward_loop.html) for more implementation details.

```bibtex
@article{ding2025fapo,
title={FAPO: Flawed-Aware Policy Optimization for Efficient and Reliable Reasoning},
author={Ding, Yuyang and Zhang, Chi and Li, Juntao and Lin, Haibin and Liu, Xin and Zhang, Min},
journal={arXiv preprint arXiv:2510.22543},
year={2025}
}
```
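
For quick reference, the snippet below sketches the overrides that the example scripts in this PR use to enable the RewardLoop path in place of the legacy FSDP reward-model worker. It is only a sketch: the model path, sequence lengths, GPU memory fraction, and worker count are illustrative values copied from `run_qwen2.5-3b_rm_reward_loop_colocate.sh` above and should be tuned per model and hardware.

```bash
# Sketch: reward-loop overrides as used by run_qwen2.5-3b_rm_reward_loop_colocate.sh
# (the remaining data/actor/critic/trainer overrides from the full script are elided here).
python3 -m verl.trainer.main_ppo \
    reward_model.enable=True \
    reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \
    reward_model.use_reward_loop=True \
    reward_model.rollout.name=vllm \
    reward_model.rollout.gpu_memory_utilization=0.8 \
    reward_model.rollout.tensor_model_parallel_size=1 \
    reward_model.rollout.prompt_length=4096 \
    reward_model.rollout.response_length=4096 \
    reward_model.num_workers=8
```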
5 changes: 0 additions & 5 deletions recipe/fapo/run_baseline_32b.sh
@@ -53,15 +53,10 @@ offload=True
gen_tp=4
fsdp_size=32

PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config"

ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
--address "${RAY_ADDRESS}" \
--working-dir "${WORKING_DIR}" \
-- python3 -m verl.trainer.main_ppo \
--config-path $CONFIG_PATH \
--config-name rm_config.yaml \
data.train_files="${TRAIN_FILE}" \
data.val_files="${TEST_FILE}" \
data.prompt_key=prompt \
5 changes: 0 additions & 5 deletions recipe/fapo/run_baseline_7b.sh
@@ -54,15 +54,10 @@ offload=True
gen_tp=1
fsdp_size=8

PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config"

ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
--address "${RAY_ADDRESS}" \
--working-dir "${WORKING_DIR}" \
-- python3 -m verl.trainer.main_ppo \
--config-path $CONFIG_PATH \
--config-name rm_config.yaml \
data.train_files="${TRAIN_FILE}" \
data.val_files="${TEST_FILE}" \
data.prompt_key=prompt \
5 changes: 0 additions & 5 deletions recipe/fapo/run_fapo_32b.sh
@@ -55,15 +55,10 @@ offload=True
gen_tp=4
fsdp_size=32

PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config"

ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
--address "${RAY_ADDRESS}" \
--working-dir "${WORKING_DIR}" \
-- python3 -m verl.trainer.main_ppo \
--config-path $CONFIG_PATH \
--config-name rm_config.yaml \
data.train_files="${TRAIN_FILE}" \
data.val_files="${TEST_FILE}" \
data.prompt_key=prompt \
5 changes: 0 additions & 5 deletions recipe/fapo/run_fapo_32b_remote.sh
@@ -53,15 +53,10 @@ offload=True
gen_tp=4
fsdp_size=32

PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config"

ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
--address "${RAY_ADDRESS}" \
--working-dir "${WORKING_DIR}" \
-- python3 -m verl.trainer.main_ppo \
--config-path $CONFIG_PATH \
--config-name rm_config.yaml \
data.train_files="${TRAIN_FILE}" \
data.val_files="${TEST_FILE}" \
data.prompt_key=prompt \
5 changes: 0 additions & 5 deletions recipe/fapo/run_fapo_7b.sh
@@ -56,15 +56,10 @@ offload=True
gen_tp=1
fsdp_size=8

PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config"

ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
--address "${RAY_ADDRESS}" \
--working-dir "${WORKING_DIR}" \
-- python3 -m verl.trainer.main_ppo \
--config-path $CONFIG_PATH \
--config-name rm_config.yaml \
data.train_files="${TRAIN_FILE}" \
data.val_files="${TEST_FILE}" \
data.prompt_key=prompt \
5 changes: 0 additions & 5 deletions recipe/fapo/run_fapo_7b_remote.sh
@@ -54,15 +54,10 @@ offload=True
gen_tp=1
fsdp_size=8

PROJECT_DIR="$(pwd)"
CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config"

ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
--address "${RAY_ADDRESS}" \
--working-dir "${WORKING_DIR}" \
-- python3 -m verl.trainer.main_ppo \
--config-path $CONFIG_PATH \
--config-name rm_config.yaml \
data.train_files="${TRAIN_FILE}" \
data.val_files="${TEST_FILE}" \
data.prompt_key=prompt \
6 changes: 5 additions & 1 deletion recipe/spin/utils.py
@@ -92,7 +92,11 @@ def check_mutually_exclusive(mbs, mbs_per_gpu, name: str):
)

# Check for reward model micro-batch size conflicts
if config.reward_model.enable and not config.reward_model.use_dynamic_bsz:
if (
config.reward_model.enable
and not config.reward_model.use_dynamic_bsz
and not config.reward_model.use_reward_loop
):
check_mutually_exclusive(
config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model"
)
4 changes: 2 additions & 2 deletions tests/experimental/reward/test_agent_loop_reward_manager.py
@@ -35,8 +35,8 @@ def test_agent_loop_reward_manager():
}
}
)
with initialize_config_dir(config_dir=os.path.abspath("recipe/fapo/config")):
config = compose("rm_config")
with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
config = compose(config_name="ppo_trainer")

rollout_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct")
reward_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct")
4 changes: 2 additions & 2 deletions tests/experimental/reward/test_agent_reward_loop_colocate.py
@@ -39,8 +39,8 @@ def test_agent_loop_reward_manager():
}
}
)
with initialize_config_dir(config_dir=os.path.abspath("recipe/fapo/config")):
config = compose("rm_config")
with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
config = compose(config_name="ppo_trainer")

rollout_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct")
reward_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct")