Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: utils_cpu_test
name: cpu_unit_tests

on:
# Trigger the workflow on push or pull request,
Expand All @@ -13,7 +13,7 @@ on:
- v0.*
paths:
- "**/*.py"
- .github/workflows/utils_cpu_test.yml
- .github/workflows/cpu_unit_tests.yml
- "!recipe/**/*.py"

# Cancel jobs on the same ref if a new one is triggered
Expand All @@ -26,7 +26,7 @@ permissions:
contents: read

jobs:
utils_cpu_test:
cpu_unit_tests:
runs-on: ubuntu-latest
timeout-minutes: 10 # Increase this timeout value as needed
strategy:
Expand All @@ -41,7 +41,7 @@ jobs:
- name: Install the current repository
run: |
pip install -e .[test]
- name: Running test protocol.py
- name: Running data proto test
run: |
cd tests
pytest -s -x test_protocol.py
Expand All @@ -53,3 +53,7 @@ jobs:
run: |
cd tests/trainer
pytest -s -x .
- name: Running worker tests
run: |
cd tests/workers/reward_manager
pytest -s -x .
88 changes: 28 additions & 60 deletions .github/workflows/e2e_ppo_trainer.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ jobs:

e2e_ppo_trainer_vllm:
runs-on: [L20x8]
timeout-minutes: 40 # Increase this timeout value as needed
timeout-minutes: 60 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
Expand Down Expand Up @@ -161,6 +161,14 @@ jobs:
run: |
ray stop --force
LIGER=True bash tests/e2e/ppo_trainer/run_model_reward.sh
- name: Running GSM8K E2E with rmpad using model rm with Fused Kernel enabled
run: |
ray stop --force
FUSED_KERNELS=True bash tests/e2e/ppo_trainer/run_model_reward.sh
- name: Running GSM8K E2E with rmpad using model rm with Triton Fused Kernel enabled
run: |
ray stop --force
FUSED_KERNELS=True FUSED_KERNEL_BACKEND=triton bash tests/e2e/ppo_trainer/run_model_reward.sh

e2e_ppo_trainer_vllm_vlm:
runs-on: [L20x8]
Expand All @@ -181,13 +189,13 @@ jobs:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -e .[test,geo,vllm]
pip3 install -e .[test,gpu,vllm,geo,trl]
# Geo3k
- name: Prepare Geo3k dataset
run: |
ray stop --force
python3 examples/data_preprocess/geo3k.py
- name: Running Geo3k VLM E2E training tests on 8 L20 GPUs with rmpad using function rm
- name: Running Geo3k VLM GRPO E2E training tests on 8 L20 GPUs with rmpad using function rm
run: |
ray stop --force
TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
Expand All @@ -197,6 +205,16 @@ jobs:
SP_SIZE=2 \
bash tests/e2e/ppo_trainer/run_function_reward.sh

- name: Running Geo3k VLM PPO E2E training tests on 8 L20 GPUs with rmpad using function rm
run: |
ray stop --force
TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
MODEL_ID=Qwen/Qwen2-VL-2B-Instruct \
ADV_ESTIMATOR=gae RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
SP_SIZE=2 \
bash tests/e2e/ppo_trainer/run_function_reward.sh

e2e_ppo_trainer_sglang:
runs-on: [L20x8]
needs: pre_commit_for_ppo
Expand Down Expand Up @@ -262,7 +280,7 @@ jobs:
e2e_ppo_trainer_sglang_vlm:
runs-on: [L20x8]
needs: pre_commit_for_ppo
timeout-minutes: 40 # Increase this timeout value as needed
timeout-minutes: 60 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
Expand Down Expand Up @@ -294,74 +312,24 @@ jobs:
ENGINE=sglang GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
bash tests/e2e/ppo_trainer/run_function_reward.sh

e2e_ppo_trainer_fused_kernels_vllm:
runs-on: [L20x8]
needs: pre_commit_for_ppo
timeout-minutes: 40 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container:
image: hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0
options: --gpus all --shm-size=50g # Visual dataloader requires large memory
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -e .[test,geo,vllm]
# Geo3k
- name: Prepare Geo3k dataset
run: |
ray stop --force
python3 examples/data_preprocess/geo3k.py
- name: Running Geo3k VLM E2E with rmpad using fused kernel (Qwen2.5-VL)
- name: Running Geo3k VLM E2E with rmpad using torch fused kernel (Qwen2.5-VL)
run: |
ray stop --force
FUSED_KERNELS=True TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct \
ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
ENGINE=sglang GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
bash tests/e2e/ppo_trainer/run_function_reward.sh

e2e_ppo_trainer_fused_kernels_sglang:
runs-on: [L20x8]
needs: pre_commit_for_ppo
timeout-minutes: 40 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container:
image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
options: --gpus all --shm-size=50g # Visual dataloader requires large memory
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -e .[test,geo,gpu,sglang]
- name: Prepare Geo3k dataset
run: |
ray stop --force
python3 examples/data_preprocess/geo3k.py
- name: Running Geo3k VLM E2E with rmpad using fused kernel (Qwen2.5-VL)
- name: Running Geo3k VLM E2E with rmpad using triton fused kernel (Qwen2.5-VL)
run: |
ray stop --force
FUSED_KERNELS=True TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
FUSED_KERNELS=True FUSED_KERNEL_BACKEND=triton \
TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct \
ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
ENGINE=sglang GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
bash tests/e2e/ppo_trainer/run_function_reward.sh
bash tests/e2e/ppo_trainer/run_function_reward.sh
7 changes: 5 additions & 2 deletions .github/workflows/kernels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ permissions:
contents: read

jobs:
e2e_gsm8k_megatron:
kernels:
runs-on: [L20x8]
timeout-minutes: 40 # Increase this timeout value as needed
env:
Expand All @@ -59,4 +59,7 @@ jobs:
pip3 install --no-deps -e .[test]
- name: Testing LinearCrossEntropy Correction, Computation Time and Memory Consumption
run: |
python3 tests/kernels/test_linear_cross_entropy.py
python3 tests/kernels/test_linear_cross_entropy.py
- name: Testing LinearCrossEntropyTP Correction, Computation Time and Memory Consumption
run: |
torchrun --standalone --nnodes=1 --nproc-per-node=8 tests/kernels/test_linear_cross_entropy_tp.py
9 changes: 5 additions & 4 deletions docs/api/trainer.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,18 @@ Trainers drive the training loop. Introducing new trainer classes in case of new
Core APIs
~~~~~~~~~~~~~~~~~

.. autoclass:: verl.trainer.ppo.ray_trainer.RayPPOTrainer
.. autoclass:: verl.trainer.ppo.ray_trainer.RayPPOTrainer
:members: __init__, init_workers, fit


.. automodule:: verl.utils.tokenizer
:members: hf_tokenizer


.. automodule:: verl.trainer.ppo.core_algos
:members: agg_loss, kl_penalty, compute_policy_loss, kl_penalty


.. automodule:: verl.trainer.ppo.reward
:members: load_reward_manager, compute_reward, compute_reward_async

.. autoclass:: verl.workers.reward_manager.NaiveRewardManager

.. autoclass:: verl.workers.reward_manager.DAPORewardManager
64 changes: 64 additions & 0 deletions examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
set -x

# PPO training example: Qwen2-7B with a hybrid (model-based) reward model,
# sequence balancing (dynamic batch size), and fused kernels enabled.

# Dataset locations (produced by examples/data_preprocess/{gsm8k,math}.py).
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet

train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

# Fused-kernel implementation backend: 'triton' or 'torch'.
FUSED_KERNEL_BACKEND=triton

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=gae \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=4096 \
    data.max_prompt_length=4096 \
    data.max_response_length=4096 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    data.return_raw_chat=True \
    actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.model.use_fused_kernels=True \
    actor_rollout_ref.model.fused_kernel_options.impl_backend=$FUSED_KERNEL_BACKEND \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=512 \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=24000 \
    critic.optim.lr=1e-5 \
    critic.model.use_remove_padding=True \
    critic.model.path=Qwen/Qwen2-7B-Instruct \
    critic.model.enable_gradient_checkpointing=True \
    critic.use_dynamic_bsz=True \
    critic.ppo_max_token_len_per_gpu=98304 \
    critic.model.fsdp_config.param_offload=False \
    critic.model.fsdp_config.optimizer_offload=False \
    reward_model.enable=True \
    reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \
    reward_model.model.use_remove_padding=True \
    reward_model.model.fsdp_config.param_offload=True \
    reward_model.micro_batch_size_per_gpu=32 \
    reward_model.use_dynamic_bsz=True \
    reward_model.forward_max_token_len_per_gpu=98304 \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger=['console','wandb'] \
    trainer.project_name='verl_example_gsm8k' \
    trainer.experiment_name='qwen2-7b_hybrid_rm_bsz8k_p4k_r4k_seq_packing_fused_kernel' \
    trainer.n_gpus_per_node=8 \
    trainer.val_before_train=False \
    trainer.nnodes=1 \
    trainer.save_freq=20 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 "$@"
File renamed without changes.
2 changes: 1 addition & 1 deletion recipe/char_count/train_grpo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,5 @@ python3 -m verl.trainer.main_ppo \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.total_epochs=2 \
custom_reward_function.path=/home/chi/Developer/verl/recipe/char_count/reward_function.py \
custom_reward_function.path=recipe/char_count/reward_function.py \
custom_reward_function.name=char_count_reward_function
2 changes: 1 addition & 1 deletion recipe/char_count/train_sft.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
data.max_length=256 \
data.train_batch_size=256 \
use_remove_padding=True \
model.partial_pretrain=$HOME/models/SmolLM2-135M-Instruct \
model.partial_pretrain=HuggingFaceTB/SmolLM2-135M-Instruct \
trainer.default_local_dir=$save_path \
trainer.project_name=char_count-sft \
trainer.experiment_name=char_count-sft-SmolLM2-135M-Instruct \
Expand Down
1 change: 0 additions & 1 deletion recipe/dapo/dapo_ray_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,6 @@ def fit(self):

new_batch.batch["token_level_scores"] = reward_tensor

print(f"{list(reward_extra_infos_dict.keys())=}")
if reward_extra_infos_dict:
new_batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})

Expand Down
19 changes: 5 additions & 14 deletions recipe/dapo/main_dapo.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,21 +118,12 @@ def run(self, config):
role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
mapping[Role.RefPolicy] = global_pool_id

reward_manager_name = config.reward_model.get("reward_manager", "naive")
if reward_manager_name == "naive":
from verl.workers.reward_manager import NaiveRewardManager

reward_manager_cls = NaiveRewardManager
elif reward_manager_name == "prime":
from verl.workers.reward_manager import PrimeRewardManager

reward_manager_cls = PrimeRewardManager
elif reward_manager_name == "dapo":
from verl.workers.reward_manager import DAPORewardManager
from verl.workers.reward_manager import get_reward_manager_cls

reward_manager_cls = DAPORewardManager
else:
raise NotImplementedError
# Note(haibin.lin): please make sure custom reward managers are imported and
# registered via `verl.workers.reward_manager.register`
reward_manager_name = config.reward_model.get("reward_manager", "naive")
reward_manager_cls = get_reward_manager_cls(reward_manager_name)

compute_score = get_custom_reward_fn(config)
reward_fn = reward_manager_cls(
Expand Down
2 changes: 2 additions & 0 deletions recipe/prime/config/prime_trainer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ reward_model:
ref_path: ${reward_model.model.path}
use_remove_padding: True
use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
fused_kernel_options:
impl_backend: torch # triton, torch
tokenizer_path: ${actor_rollout_ref.model.path}
enable_gradient_checkpointing: ${actor_rollout_ref.model.enable_gradient_checkpointing}
ref_type: freeze
Expand Down
2 changes: 2 additions & 0 deletions recipe/prime/prime_dp_rm.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def _forward_micro_batch(self, micro_batch, prompt_length):
attention_mask=None,
position_ids=position_ids_rmpad,
use_cache=False,
return_dict=self.use_fused_kernels,
)

if self.use_fused_kernels:
Expand All @@ -100,6 +101,7 @@ def _forward_micro_batch(self, micro_batch, prompt_length):
attention_mask=micro_batch["attention_mask"],
position_ids=micro_batch["position_ids"],
use_cache=False,
return_dict=self.use_fused_kernels,
)

if self.use_fused_kernels:
Expand Down
4 changes: 4 additions & 0 deletions recipe/prime/prime_fsdp_workers.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,15 @@ def _build_reward_ref_model_optimizer(self, config):
trust_remote_code=trust_remote_code,
)

fused_kernel_options = config.model.get("fused_kernel_options", None)
fused_kernels_backend = fused_kernel_options.get("impl_backend", None) if fused_kernel_options is not None else None

apply_monkey_patch(
model=reward_module,
ulysses_sp_size=self.ulysses_sequence_parallel_size,
use_remove_padding=config.model.get("use_remove_padding", False),
use_fused_kernels=config.model.get("use_fused_kernels", False),
fused_kernels_backend=fused_kernels_backend,
)

# some parameters may not in torch_dtype
Expand Down
Loading