63 changes: 63 additions & 0 deletions examples/ppo_trainer/run_qwen2-7b_rm_legacy.sh
@@ -0,0 +1,63 @@
# download datasets and models
# python3 examples/data_preprocess/gsm8k.py
# python3 examples/data_preprocess/math_dataset.py
# huggingface-cli download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B
# huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct

gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet

train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=gae \
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=1024 \
data.max_prompt_length=1024 \
data.max_response_length=2048 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.return_raw_chat=True \
actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
critic.model.enable_gradient_checkpointing=True \
critic.ppo_micro_batch_size_per_gpu=32 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
reward_model.enable=True \
reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \
reward_model.use_reward_loop=False \
reward_model.model.use_remove_padding=True \
reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size_per_gpu=32 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='verl_test_qwen25_rm' \
trainer.val_before_train=True \
trainer.experiment_name='legacy_fsdp_reward_model' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=10 \
trainer.total_epochs=15 $@
69 changes: 69 additions & 0 deletions examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh
@@ -0,0 +1,69 @@
# download datasets and models
# python3 examples/data_preprocess/gsm8k.py
# python3 examples/data_preprocess/math_dataset.py
# huggingface-cli download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B
# huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct

gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet

train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=gae \
data.train_files="$train_files" \
data.val_files="$test_files" \
data.train_batch_size=1024 \
data.max_prompt_length=1024 \
data.max_response_length=2048 \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.return_raw_chat=True \
actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
critic.model.enable_gradient_checkpointing=True \
critic.ppo_micro_batch_size_per_gpu=32 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
reward_model.enable=True \
reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \
reward_model.use_reward_loop=True \
reward_model.rollout.name=vllm \
reward_model.rollout.gpu_memory_utilization=0.8 \
reward_model.rollout.prompt_length=4096 \
reward_model.rollout.response_length=4096 \
reward_model.rollout.tensor_model_parallel_size=1 \
reward_model.num_workers=8 \
reward_model.model.use_remove_padding=True \
reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size_per_gpu=32 \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='verl_test_qwen25_rm' \
trainer.val_before_train=False \
trainer.experiment_name='reward_loop_colocate_reward_model' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=10 \
trainer.total_epochs=15 $@
1 change: 1 addition & 0 deletions tests/special_e2e/ppo_trainer/run_model_reward.sh
@@ -79,6 +79,7 @@ python3 -m verl.trainer.main_ppo \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
reward_model.enable=True \
+reward_model.use_reward_loop=False \
reward_model.ulysses_sequence_parallel_size="${SP_SIZE}" \
reward_model.model.path="${MODEL_PATH}" \
reward_model.model.use_remove_padding="${RM_PAD}" \
Expand Down
1 change: 1 addition & 0 deletions tests/special_e2e/run_ppo_trainer_megatron.sh
@@ -244,6 +244,7 @@ python3 -m verl.trainer.main_ppo --config-path=config \
critic.profiler.ranks=$PROFILE_RANKS \
critic.profiler.all_ranks=$PROFILE_RANKS_ALL \
reward_model.enable=True \
+reward_model.use_reward_loop=False \
reward_model.model.path="${MODEL_PATH}" \
reward_model.micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
reward_model.megatron.use_mbridge=${USE_MBRIDGE} \
Expand Down
12 changes: 10 additions & 2 deletions verl/experimental/reward/reward_manager.py
@@ -136,6 +136,14 @@ async def _preprocess_reward_inputs(self, data: DataProto) -> str:
            add_generation_prompt=False,
            tokenize=False,
        )
+
+        # the llama tokenizer adds a bos token by default;
+        # this strip can be removed in vllm >= 0.11.2, where we can pass "add_special_tokens": False
+        if self.reward_model_tokenizer.bos_token is not None and rm_prompt.startswith(
+            self.reward_model_tokenizer.bos_token
+        ):
+            rm_prompt = rm_prompt[len(self.reward_model_tokenizer.bos_token) :]
+
        return rm_prompt

@@ -148,7 +156,7 @@ async def compute_score_disrm(self, data: DataProto) -> dict:
"model": model_name,
"input": disrm_prompt,
"activation": False,
"add_special_tokens": False,
# "add_special_tokens": False, # vllm >= 0.11.2
}
output = await self._post_request(payloads, "classify")
rm_score = output["data"][-1]["probs"][-1]
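
For reference, a minimal sketch of the bos-strip behavior added in _preprocess_reward_inputs above. StubTokenizer and its template string are hypothetical stand-ins for a real Llama tokenizer, not verl code:

# Sketch of the bos strip; the stub mimics a Llama-style chat template.
class StubTokenizer:
    bos_token = "<|begin_of_text|>"

    def apply_chat_template(self, messages, add_generation_prompt=False, tokenize=False):
        # Llama-style chat templates prepend the bos token to the rendered string.
        return self.bos_token + " ".join(f"[{m['role']}] {m['content']}" for m in messages)

tokenizer = StubTokenizer()
rm_prompt = tokenizer.apply_chat_template([{"role": "user", "content": "2+2?"}])

# Same guard as the diff: drop the leading bos so the vllm server, which adds
# special tokens itself, does not end up with a duplicated bos token.
if tokenizer.bos_token is not None and rm_prompt.startswith(tokenizer.bos_token):
    rm_prompt = rm_prompt[len(tokenizer.bos_token):]

assert rm_prompt == "[user] 2+2?"
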
@@ -187,7 +195,7 @@ def __init__(self, config: DictConfig, rm_resource_pool: RayResourcePool = None)
    def _init_reward_loop_workers(self):
        self.reward_loop_workers = []
-        num_workers = self.config.reward_model.get("num_workers", 1)
+        num_workers = self.config.reward_model.num_workers
        node_ids = [node["NodeID"] for node in ray.nodes() if node["Alive"] and node["Resources"].get("CPU", 0) > 0]

        for i in range(num_workers):
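
The num_workers access change works together with the num_workers: 1 default this PR adds to the reward-model configs below. A standalone OmegaConf sketch of the difference (illustrative only, not verl code):

from omegaconf import OmegaConf

cfg = OmegaConf.create({"reward_model": {"num_workers": 1}})

# Old pattern: .get() with a fallback silently masks a missing or misspelled key.
num_workers = cfg.reward_model.get("num_workers", 1)

# New pattern: the key is declared in the config defaults, so plain attribute
# access works, and a truly missing key now fails loudly at startup.
num_workers = cfg.reward_model.num_workers
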
4 changes: 3 additions & 1 deletion verl/single_controller/ray/base.py
@@ -220,13 +220,15 @@ def split_resource_pool(
    else:
        start_bundle_idx_list = np.cumsum([0] + split_size_list[:-1])

+    # ensure resource_pool.pgs has been initialized
+    placement_groups = resource_pool.get_placement_groups()
    split_resource_pools = [
        SubRayResourcePool(
            process_on_nodes=resource_pool.store,
            use_gpu=resource_pool.use_gpu,
            name_prefix=f"{resource_pool.name_prefix}_split_{split_idx}",
            max_colocate_count=resource_pool.max_colocate_count,
-            placement_groups=resource_pool.pgs,
+            placement_groups=placement_groups,
            start_bundle_index=start_bundle_idx_list[split_idx],
            subgroup_world_size=split_size_list[split_idx],
        )
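
A small sketch of why the accessor is called before the split. The stub class is hypothetical and stands in for the actual RayResourcePool lazy initialization:

class StubResourcePool:
    def __init__(self):
        self.pgs = None  # placement groups are created lazily, not in __init__

    def get_placement_groups(self):
        if self.pgs is None:
            self.pgs = ["pg_0", "pg_1"]  # stand-in for real Ray placement groups
        return self.pgs

pool = StubResourcePool()
# Reading pool.pgs directly here would hand None to every sub-pool; calling the
# accessor first, as the diff does, guarantees the groups exist before sharing.
placement_groups = pool.get_placement_groups()
assert placement_groups is not None
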
9 changes: 5 additions & 4 deletions verl/trainer/config/_generated_ppo_megatron_trainer.yaml
@@ -521,7 +521,7 @@ critic:
reward_model:
  enable: false
  enable_resource_pool: false
-  n_gpus_per_node: 0
+  n_gpus_per_node: 8
  nnodes: 0
  strategy: megatron
  model:
@@ -572,6 +572,7 @@
    dtype: bfloat16
    load_weight: true
  use_reward_loop: true
+  num_workers: 1
  rollout:
    _target_: verl.workers.config.RolloutConfig
    name: ???
@@ -592,9 +593,9 @@
    enable_chunked_prefill: true
    enable_prefix_caching: true
    disable_log_stats: true
-    skip_tokenizer_init: true
-    prompt_length: 512
-    response_length: 512
+    skip_tokenizer_init: false
+    prompt_length: 2048
+    response_length: 2048
algorithm:
  rollout_correction:
    rollout_is: null
9 changes: 5 additions & 4 deletions verl/trainer/config/_generated_ppo_trainer.yaml
@@ -455,7 +455,7 @@ critic:
reward_model:
  enable: false
  enable_resource_pool: false
-  n_gpus_per_node: 0
+  n_gpus_per_node: 8
  nnodes: 0
  strategy: fsdp
  model:
@@ -496,6 +496,7 @@
    tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
  ulysses_sequence_parallel_size: 1
  use_reward_loop: true
+  num_workers: 1
  rollout:
    _target_: verl.workers.config.RolloutConfig
    name: ???
@@ -516,9 +517,9 @@
    enable_chunked_prefill: true
    enable_prefix_caching: true
    disable_log_stats: true
-    skip_tokenizer_init: true
-    prompt_length: 512
-    response_length: 512
+    skip_tokenizer_init: false
+    prompt_length: 2048
+    response_length: 2048
algorithm:
  rollout_correction:
    rollout_is: null
9 changes: 5 additions & 4 deletions verl/trainer/config/reward_model/dp_reward_loop.yaml
@@ -8,7 +8,8 @@ enable: False

# Whether to deploy the model to a separate resource pool.
enable_resource_pool: False
-n_gpus_per_node: 0
+n_gpus_per_node: 8
+num_workers: 1
nnodes: 0

model:
@@ -36,7 +37,7 @@ rollout:
  enable_chunked_prefill: true
  enable_prefix_caching: true
  disable_log_stats: true
-  skip_tokenizer_init: true
+  skip_tokenizer_init: false

-  prompt_length: 512
-  response_length: 512
+  prompt_length: 2048
+  response_length: 2048
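
A rough way to sanity-check the larger length defaults, assuming the reward model scores the full prompt-plus-response conversation (the template overhead figure is a guess):

# Back-of-envelope token budget for the reward-model input.
max_prompt_length = 1024    # data.max_prompt_length in the example scripts above
max_response_length = 2048  # data.max_response_length
template_overhead = 64      # hypothetical allowance for chat-template tokens

rm_input_tokens = max_prompt_length + max_response_length + template_overhead
print(rm_input_tokens)  # 3136 -- comfortably over the old 512 default, and why the
                        # colocate script raises reward_model.rollout.prompt_length to 4096
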
9 changes: 5 additions & 4 deletions verl/trainer/config/reward_model/megatron_reward_loop.yaml
@@ -8,7 +8,8 @@ enable: False

# Whether to deploy the model to a separate resource pool.
enable_resource_pool: False
-n_gpus_per_node: 0
+n_gpus_per_node: 8
+num_workers: 1
nnodes: 0

model:
@@ -36,7 +37,7 @@ rollout:
  enable_chunked_prefill: true
  enable_prefix_caching: true
  disable_log_stats: true
-  skip_tokenizer_init: true
+  skip_tokenizer_init: false

-  prompt_length: 512
-  response_length: 512
+  prompt_length: 2048
+  response_length: 2048