Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: utils_cpu_test
name: cpu_unit_tests

on:
# Trigger the workflow on push or pull request,
Expand All @@ -13,7 +13,7 @@ on:
- v0.*
paths:
- "**/*.py"
- .github/workflows/utils_cpu_test.yml
- .github/workflows/cpu_unit_tests.yml
- "!recipe/**/*.py"

# Cancel jobs on the same ref if a new one is triggered
Expand All @@ -26,7 +26,7 @@ permissions:
contents: read

jobs:
utils_cpu_test:
cpu_unit_tests:
runs-on: ubuntu-latest
timeout-minutes: 10 # Increase this timeout value as needed
strategy:
Expand All @@ -41,7 +41,7 @@ jobs:
- name: Install the current repository
run: |
pip install -e .[test]
- name: Running test protocol.py
- name: Running data proto test
run: |
cd tests
pytest -s -x test_protocol.py
Expand All @@ -53,3 +53,7 @@ jobs:
run: |
cd tests/trainer
pytest -s -x .
- name: Running worker tests
run: |
cd tests/workers/reward_manager
pytest -s -x .
88 changes: 28 additions & 60 deletions .github/workflows/e2e_ppo_trainer.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ jobs:

e2e_ppo_trainer_vllm:
runs-on: [L20x8]
timeout-minutes: 40 # Increase this timeout value as needed
timeout-minutes: 60 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
Expand Down Expand Up @@ -161,6 +161,14 @@ jobs:
run: |
ray stop --force
LIGER=True bash tests/e2e/ppo_trainer/run_model_reward.sh
- name: Running GSM8K E2E with rmpad using model rm with Fused Kernel enabled
run: |
ray stop --force
FUSED_KERNELS=True bash tests/e2e/ppo_trainer/run_model_reward.sh
- name: Running GSM8K E2E with rmpad using model rm with Triton Fused Kernel enabled
run: |
ray stop --force
FUSED_KERNELS=True FUSED_KERNEL_BACKEND=triton bash tests/e2e/ppo_trainer/run_model_reward.sh

e2e_ppo_trainer_vllm_vlm:
runs-on: [L20x8]
Expand All @@ -181,13 +189,13 @@ jobs:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -e .[test,geo,vllm]
pip3 install -e .[test,gpu,vllm,geo,trl]
# Geo3k
- name: Prepare Geo3k dataset
run: |
ray stop --force
python3 examples/data_preprocess/geo3k.py
- name: Running Geo3k VLM E2E training tests on 8 L20 GPUs with rmpad using function rm
- name: Running Geo3k VLM GRPO E2E training tests on 8 L20 GPUs with rmpad using function rm
run: |
ray stop --force
TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
Expand All @@ -197,6 +205,16 @@ jobs:
SP_SIZE=2 \
bash tests/e2e/ppo_trainer/run_function_reward.sh

- name: Running Geo3k VLM PPO E2E training tests on 8 L20 GPUs with rmpad using function rm
run: |
ray stop --force
TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
MODEL_ID=Qwen/Qwen2-VL-2B-Instruct \
ADV_ESTIMATOR=gae RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
SP_SIZE=2 \
bash tests/e2e/ppo_trainer/run_function_reward.sh

e2e_ppo_trainer_sglang:
runs-on: [L20x8]
needs: pre_commit_for_ppo
Expand Down Expand Up @@ -262,7 +280,7 @@ jobs:
e2e_ppo_trainer_sglang_vlm:
runs-on: [L20x8]
needs: pre_commit_for_ppo
timeout-minutes: 40 # Increase this timeout value as needed
timeout-minutes: 60 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
Expand Down Expand Up @@ -294,74 +312,24 @@ jobs:
ENGINE=sglang GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
bash tests/e2e/ppo_trainer/run_function_reward.sh

e2e_ppo_trainer_fused_kernels_vllm:
runs-on: [L20x8]
needs: pre_commit_for_ppo
timeout-minutes: 40 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container:
image: hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0
options: --gpus all --shm-size=50g # Visual dataloader requires large memory
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -e .[test,geo,vllm]
# Geo3k
- name: Prepare Geo3k dataset
run: |
ray stop --force
python3 examples/data_preprocess/geo3k.py
- name: Running Geo3k VLM E2E with rmpad using fused kernel (Qwen2.5-VL)
- name: Running Geo3k VLM E2E with rmpad using torch fused kernel (Qwen2.5-VL)
run: |
ray stop --force
FUSED_KERNELS=True TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct \
ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
ENGINE=sglang GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
bash tests/e2e/ppo_trainer/run_function_reward.sh

e2e_ppo_trainer_fused_kernels_sglang:
runs-on: [L20x8]
needs: pre_commit_for_ppo
timeout-minutes: 40 # Increase this timeout value as needed
env:
HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container:
image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
options: --gpus all --shm-size=50g # Visual dataloader requires large memory
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: Install the current repository
run: |
pip3 install -e .[test,geo,gpu,sglang]
- name: Prepare Geo3k dataset
run: |
ray stop --force
python3 examples/data_preprocess/geo3k.py
- name: Running Geo3k VLM E2E with rmpad using fused kernel (Qwen2.5-VL)
- name: Running Geo3k VLM E2E with rmpad using triton fused kernel (Qwen2.5-VL)
run: |
ray stop --force
FUSED_KERNELS=True TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
FUSED_KERNELS=True FUSED_KERNEL_BACKEND=triton \
TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct \
ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
ENGINE=sglang GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
bash tests/e2e/ppo_trainer/run_function_reward.sh
bash tests/e2e/ppo_trainer/run_function_reward.sh
7 changes: 5 additions & 2 deletions .github/workflows/kernels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ permissions:
contents: read

jobs:
e2e_gsm8k_megatron:
kernels:
runs-on: [L20x8]
timeout-minutes: 40 # Increase this timeout value as needed
env:
Expand All @@ -59,4 +59,7 @@ jobs:
pip3 install --no-deps -e .[test]
- name: Testing LinearCrossEntropy Correction, Computation Time and Memory Consumption
run: |
python3 tests/kernels/test_linear_cross_entropy.py
python3 tests/kernels/test_linear_cross_entropy.py
- name: Testing LinearCrossEntropyTP Correction, Computation Time and Memory Consumption
run: |
torchrun --standalone --nnodes=1 --nproc-per-node=8 tests/kernels/test_linear_cross_entropy_tp.py
9 changes: 5 additions & 4 deletions docs/api/trainer.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,18 @@ Trainers drive the training loop. Introducing new trainer classes in case of new
Core APIs
~~~~~~~~~~~~~~~~~

.. autoclass:: verl.trainer.ppo.ray_trainer.RayPPOTrainer
.. autoclass:: verl.trainer.ppo.ray_trainer.RayPPOTrainer
:members: __init__, init_workers, fit


.. automodule:: verl.utils.tokenizer
:members: hf_tokenizer


.. automodule:: verl.trainer.ppo.core_algos
:members: agg_loss, kl_penalty, compute_policy_loss, kl_penalty


.. automodule:: verl.trainer.ppo.reward
:members: load_reward_manager, compute_reward, compute_reward_async

.. autoclass:: verl.workers.reward_manager.NaiveRewardManager

.. autoclass:: verl.workers.reward_manager.DAPORewardManager
64 changes: 64 additions & 0 deletions examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
set -x

# PPO training example: Qwen2-7B with a hybrid (model-based) reward model,
# sequence balancing (dynamic batch size), and fused kernels enabled.

# Dataset locations (produced by examples/data_preprocess/{gsm8k,math}.py).
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet

train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

# Fused-kernel implementation backend: 'triton' or 'torch'.
FUSED_KERNEL_BACKEND=triton

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=gae \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=4096 \
    data.max_prompt_length=4096 \
    data.max_response_length=4096 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    data.return_raw_chat=True \
    actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.model.use_fused_kernels=True \
    actor_rollout_ref.model.fused_kernel_options.impl_backend=$FUSED_KERNEL_BACKEND \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=512 \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=24000 \
    critic.optim.lr=1e-5 \
    critic.model.use_remove_padding=True \
    critic.model.path=Qwen/Qwen2-7B-Instruct \
    critic.model.enable_gradient_checkpointing=True \
    critic.use_dynamic_bsz=True \
    critic.ppo_max_token_len_per_gpu=98304 \
    critic.model.fsdp_config.param_offload=False \
    critic.model.fsdp_config.optimizer_offload=False \
    reward_model.enable=True \
    reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \
    reward_model.model.use_remove_padding=True \
    reward_model.model.fsdp_config.param_offload=True \
    reward_model.micro_batch_size_per_gpu=32 \
    reward_model.use_dynamic_bsz=True \
    reward_model.forward_max_token_len_per_gpu=98304 \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger=['console','wandb'] \
    trainer.project_name='verl_example_gsm8k' \
    trainer.experiment_name='qwen2-7b_hybrid_rm_bsz8k_p4k_r4k_seq_packing_fused_kernel' \
    trainer.n_gpus_per_node=8 \
    trainer.val_before_train=False \
    trainer.nnodes=1 \
    trainer.save_freq=20 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 "$@"
File renamed without changes.
2 changes: 1 addition & 1 deletion recipe/char_count/train_grpo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,5 @@ python3 -m verl.trainer.main_ppo \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.total_epochs=2 \
custom_reward_function.path=/home/chi/Developer/verl/recipe/char_count/reward_function.py \
custom_reward_function.path=recipe/char_count/reward_function.py \
custom_reward_function.name=char_count_reward_function
2 changes: 1 addition & 1 deletion recipe/char_count/train_sft.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
data.max_length=256 \
data.train_batch_size=256 \
use_remove_padding=True \
model.partial_pretrain=$HOME/models/SmolLM2-135M-Instruct \
model.partial_pretrain=HuggingFaceTB/SmolLM2-135M-Instruct \
trainer.default_local_dir=$save_path \
trainer.project_name=char_count-sft \
trainer.experiment_name=char_count-sft-SmolLM2-135M-Instruct \
Expand Down
1 change: 0 additions & 1 deletion recipe/dapo/dapo_ray_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,6 @@ def fit(self):

new_batch.batch["token_level_scores"] = reward_tensor

print(f"{list(reward_extra_infos_dict.keys())=}")
if reward_extra_infos_dict:
new_batch.non_tensor_batch.update({k: np.array(v) for k, v in reward_extra_infos_dict.items()})

Expand Down
19 changes: 5 additions & 14 deletions recipe/dapo/main_dapo.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,21 +118,12 @@ def run(self, config):
role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
mapping[Role.RefPolicy] = global_pool_id

reward_manager_name = config.reward_model.get("reward_manager", "naive")
if reward_manager_name == "naive":
from verl.workers.reward_manager import NaiveRewardManager

reward_manager_cls = NaiveRewardManager
elif reward_manager_name == "prime":
from verl.workers.reward_manager import PrimeRewardManager

reward_manager_cls = PrimeRewardManager
elif reward_manager_name == "dapo":
from verl.workers.reward_manager import DAPORewardManager
from verl.workers.reward_manager import get_reward_manager_cls

reward_manager_cls = DAPORewardManager
else:
raise NotImplementedError
# Note(haibin.lin): please make sure custom reward managers are imported and
# registered via `verl.workers.reward_manager.register`
reward_manager_name = config.reward_model.get("reward_manager", "naive")
reward_manager_cls = get_reward_manager_cls(reward_manager_name)

compute_score = get_custom_reward_fn(config)
reward_fn = reward_manager_cls(
Expand Down
2 changes: 2 additions & 0 deletions recipe/prime/config/prime_trainer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ reward_model:
ref_path: ${reward_model.model.path}
use_remove_padding: True
use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
fused_kernel_options:
impl_backend: torch # triton, torch
tokenizer_path: ${actor_rollout_ref.model.path}
enable_gradient_checkpointing: ${actor_rollout_ref.model.enable_gradient_checkpointing}
ref_type: freeze
Expand Down
2 changes: 2 additions & 0 deletions recipe/prime/prime_dp_rm.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def _forward_micro_batch(self, micro_batch, prompt_length):
attention_mask=None,
position_ids=position_ids_rmpad,
use_cache=False,
return_dict=self.use_fused_kernels,
)

if self.use_fused_kernels:
Expand All @@ -100,6 +101,7 @@ def _forward_micro_batch(self, micro_batch, prompt_length):
attention_mask=micro_batch["attention_mask"],
position_ids=micro_batch["position_ids"],
use_cache=False,
return_dict=self.use_fused_kernels,
)

if self.use_fused_kernels:
Expand Down
4 changes: 4 additions & 0 deletions recipe/prime/prime_fsdp_workers.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,15 @@ def _build_reward_ref_model_optimizer(self, config):
trust_remote_code=trust_remote_code,
)

fused_kernel_options = config.model.get("fused_kernel_options", None)
fused_kernels_backend = fused_kernel_options.get("impl_backend", None) if fused_kernel_options is not None else None

apply_monkey_patch(
model=reward_module,
ulysses_sp_size=self.ulysses_sequence_parallel_size,
use_remove_padding=config.model.get("use_remove_padding", False),
use_fused_kernels=config.model.get("use_fused_kernels", False),
fused_kernels_backend=fused_kernels_backend,
)

# some parameters may not in torch_dtype
Expand Down
Loading