diff --git a/.github/workflows/.deprecate/e2e_ppo_trainer.yml b/.github/workflows/.deprecate/e2e_ppo_trainer.yml index 00ecd79152b..94ef83db3d3 100644 --- a/.github/workflows/.deprecate/e2e_ppo_trainer.yml +++ b/.github/workflows/.deprecate/e2e_ppo_trainer.yml @@ -77,7 +77,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -110,7 +110,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml b/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml index 177af9ec8b9..0e5f7a487a1 100644 --- a/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml +++ b/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml @@ -75,7 +75,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: diff --git a/.github/workflows/checkpoint_converter.yml b/.github/workflows/checkpoint_converter.yml index 90ad640dff4..65baa00e956 100644 --- a/.github/workflows/checkpoint_converter.yml +++ b/.github/workflows/checkpoint_converter.yml @@ -81,7 +81,7 @@ jobs: NO_PROXY: "localhost,127.0.0.1" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -116,7 +116,7 @@ jobs: HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable HF_ENDPOINT: "https://hf-mirror.com" container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/cpu_unit_tests.yml
b/.github/workflows/cpu_unit_tests.yml index 698816ce11e..95cbc8be370 100644 --- a/.github/workflows/cpu_unit_tests.yml +++ b/.github/workflows/cpu_unit_tests.yml @@ -68,7 +68,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: diff --git a/.github/workflows/e2e_dapo.yml b/.github/workflows/e2e_dapo.yml index 60119e8436f..b4e28286069 100644 --- a/.github/workflows/e2e_dapo.yml +++ b/.github/workflows/e2e_dapo.yml @@ -94,7 +94,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/e2e_eval_aime24.yml b/.github/workflows/e2e_eval_aime24.yml index f5718603b9e..b0da8f2acc2 100644 --- a/.github/workflows/e2e_eval_aime24.yml +++ b/.github/workflows/e2e_eval_aime24.yml @@ -88,7 +88,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: diff --git a/.github/workflows/e2e_genrm_remote.yml b/.github/workflows/e2e_genrm_remote.yml index 8c7bc690718..6574ef3b61e 100644 --- a/.github/workflows/e2e_genrm_remote.yml +++ b/.github/workflows/e2e_genrm_remote.yml @@ -87,7 +87,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/e2e_ppo_trainer.yml b/.github/workflows/e2e_ppo_trainer.yml index f27da026aaf..27fa3ba5448 100644 --- a/.github/workflows/e2e_ppo_trainer.yml +++ b/.github/workflows/e2e_ppo_trainer.yml @@ -87,7 +87,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -229,7 +229,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 options: --gpus all --shm-size=50g # Visual dataloader requires large memory steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -238,11 +238,10 @@ jobs: - name: Install the current repository run: | pip3 install --no-deps -e .[test,gpu,vllm,geo,trl] - pip install "transformers[hf_xet]<4.53.0" # Fix for transformers 4.53.0 + pip install "transformers[hf_xet]==4.54.0" 
# Geo3k - name: Prepare GEO3K dataset run: | - ray stop --force python3 examples/data_preprocess/geo3k.py - name: Running GEO3K VLM GRPO E2E training tests on 8 L20 GPUs with rmpad using function rm run: | @@ -285,7 +284,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -318,7 +317,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=50g # Visual dataloader requires large memory steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -326,7 +325,8 @@ jobs: fetch-depth: 0 - name: Install the current repository run: | - pip3 install -e .[test,geo,gpu,sglang] --no-deps && pip install transformers==4.52.3 + pip3 install -e .[test,geo,gpu,sglang] --no-deps + pip install "transformers[hf_xet]==4.54.0" # Geo3k - name: Prepare GEO3K dataset run: | diff --git a/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml b/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml index 4e635b3351b..f37866274a4 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml @@ -86,7 +86,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: diff --git a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml index ae12c110693..3fa0e51a2d9 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml @@ -85,7 +85,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: @@ -348,7 +348,6 @@ jobs: - name: Install the current repository run: | pip3 install --no-deps -e .[test] - pip3 install "transformers[hf_xet]<4.52.0" - name: Prepare Geo3k dataset run: | python3 examples/data_preprocess/geo3k.py diff --git a/.github/workflows/e2e_spin.yml b/.github/workflows/e2e_spin.yml index cb56fbeac7a..ad6a2bfd748 100644 --- a/.github/workflows/e2e_spin.yml +++ b/.github/workflows/e2e_spin.yml @@ -68,7 +68,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/e2e_sppo.yml 
b/.github/workflows/e2e_sppo.yml index cf85c296012..15edc4f9d00 100644 --- a/.github/workflows/e2e_sppo.yml +++ b/.github/workflows/e2e_sppo.yml @@ -66,7 +66,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/gpu_unit_tests.yml b/.github/workflows/gpu_unit_tests.yml index d86e7e64d86..25018594d48 100644 --- a/.github/workflows/gpu_unit_tests.yml +++ b/.github/workflows/gpu_unit_tests.yml @@ -80,7 +80,7 @@ jobs: NO_PROXY: "localhost,127.0.0.1" HF_HUB_ENABLE_HF_TRANSFER: 1 container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml index d484c2b9d51..280781f8c7c 100644 --- a/.github/workflows/model.yml +++ b/.github/workflows/model.yml @@ -73,7 +73,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.4-sglang0.4.6.post5-vllm0.8.5-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -82,7 +82,7 @@ jobs: - name: Install the current repository and upgrade to latest transformers(4.54.0)/flash_attn, transformers 4.55.0 has strange behavior with model backward run: | pip3 install --no-deps -e .[test] - pip3 install --upgrade transformers==4.54.0 + pip3 install --upgrade transformers - name: Running rmpad model tests on 8 L20 GPUs + flash_attn 2.5.8 run: | pytest -s tests/models/test_transformer.py @@ -95,6 +95,10 @@ jobs: - name: Running transformers ulysses tests on 8 L20 GPUs + latest transformers run: | torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py + - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.54.1 + run: | + pip3 install transformers==4.54.1 + torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.53.2 run: | pip3 install transformers==4.53.2 @@ -119,7 +123,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4 + image: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/sanity.yml b/.github/workflows/sanity.yml index ce759b82664..39eaf0e3156 100644 --- a/.github/workflows/sanity.yml +++ b/.github/workflows/sanity.yml @@ -12,7 +12,7 @@ # - `special_sanity`: a suite of quick sanity tests # - `special_standalone`: a set of test that are designed to run in dedicated environments -# Accelerators for tests +# Accelerators for tests # - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends 
with `on_cpu.py`. # - For test scripts with `on_cpu.py` name suffix would be tested on CPU resources in linux environment. @@ -78,7 +78,7 @@ jobs: pytest -s -x tests/special_sanity - name: Run license test run: | - python3 tests/special_sanity/check_license.py --directory . + python3 tests/special_sanity/check_license.py --directories . - name: Assert naming convention run: | if grep -rIn --exclude-dir=.git --exclude-dir=.github --exclude-dir=venv --exclude-dir=__pycache__ 'veRL' .; then diff --git a/.github/workflows/sgl.yml b/.github/workflows/sgl.yml index 5795f9c3e7c..9eb07e8faf8 100644 --- a/.github/workflows/sgl.yml +++ b/.github/workflows/sgl.yml @@ -90,7 +90,7 @@ jobs: NCCL_SHM_DISABLE: "1" NCCL_P2P_DISABLE: "1" container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index 2998c08f09f..181eb9be74e 100644 --- a/.github/workflows/vllm.yml +++ b/.github/workflows/vllm.yml @@ -84,7 +84,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4b4c7b8435c..bd77c362015 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,6 @@ repos: hooks: - id: check-license name: Check license - entry: python3 tests/special_sanity/check_license.py --directory . + entry: python3 tests/special_sanity/check_license.py --directories examples recipe scripts tests verl setup.py language: python pass_filenames: false diff --git a/README.md b/README.md index 423a3ff83e6..2345b46441e 100644 --- a/README.md +++ b/README.md @@ -232,6 +232,7 @@ verl is inspired by the design of Nemo-Aligner, Deepspeed-chat and OpenRLHF. The - [Agent Lightning](https://github.com/microsoft/agent-lightning): A flexible and extensible framework that enables seamless agent optimization for any existing agent framework. ![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/agent-lightning) - [VTool-R1](https://github.com/VTOOL-R1/vtool-r1): VLMs Learn to Think with Images via Reinforcement Learning on Multimodal Tool Use. ![GitHub Repo stars](https://img.shields.io/github/stars/VTOOL-R1/vtool-r1) - [Kimina-Prover-RL](https://github.com/project-numina/kimina-prover-rl/tree/main/recipe/kimina_prover_rl): Training pipeline for formal theorem proving, based on a paradigm inspired by DeepSeek-R1. +- [RL-PLUS](https://github.com/YihongDong/RL-PLUS): Countering Capability Boundary Collapse of LLMs in Reinforcement Learning with Hybrid-policy Optimization. and many more awesome work listed in [recipe](recipe/README.md).
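The `--directory` to `--directories` rename above lets the sanity workflow and the pre-commit hook pass one or several roots in a single invocation. Below is a minimal sketch of that CLI shape, assuming an argparse flag with `nargs="+"`; the internals of `tests/special_sanity/check_license.py` are not part of this diff, so the marker string and traversal are illustrative only.

```python
# Illustrative sketch only; check_license.py itself is not shown in this diff.
import argparse
from pathlib import Path

LICENSE_MARKER = "Apache License"  # assumed header marker, not the real one

def main() -> None:
    parser = argparse.ArgumentParser(description="Check license headers.")
    # nargs="+" accepts several roots at once, matching the new pre-commit
    # entry: --directories examples recipe scripts tests verl setup.py
    parser.add_argument("--directories", nargs="+", type=Path, required=True)
    args = parser.parse_args()

    missing = []
    for root in args.directories:
        # setup.py is passed as a plain file; the other arguments are directories
        files = [root] if root.is_file() else sorted(root.rglob("*.py"))
        for path in files:
            if LICENSE_MARKER not in path.read_text(encoding="utf-8", errors="ignore"):
                missing.append(str(path))
    if missing:
        raise SystemExit("Files missing license header:\n" + "\n".join(missing))

if __name__ == "__main__":
    main()
```

Listing the roots explicitly in the hook entry, as opposed to `--directory .`, limits the check to the project's own sources.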
diff --git a/docker/Dockerfile.sglang b/docker/Dockerfile.sglang index 11ad4a77da6..7ed9c43876c 100644 --- a/docker/Dockerfile.sglang +++ b/docker/Dockerfile.sglang @@ -36,11 +36,11 @@ RUN pip config set global.index-url "${PIP_INDEX}" && \ pip config set global.extra-index-url "${PIP_INDEX}" && \ python -m pip install --upgrade pip -# Install sglang-0.4.6.post5 and torch-memory-saver -RUN pip uninstall -y cuda-python && pip install "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir +# Install sglang-0.4.10.post2 and torch-memory-saver +RUN pip uninstall -y cuda-python && pip install "sglang[all]==0.4.10.post2" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.7/flashinfer-python && pip install torch-memory-saver --no-cache-dir -# Install torch-2.6.0 -RUN pip install --no-cache-dir torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata \ +# Install torch-2.7.1 +RUN pip install --no-cache-dir torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 tensordict torchdata \ transformers>=4.49.0 accelerate datasets peft hf_transfer \ ray[default] codetiming hydra-core pandas pyarrow>=15.0.0 pylatexenc qwen-vl-utils wandb liger-kernel \ pytest pre-commit py-spy pyext diff --git a/docker/README.md b/docker/README.md index 787843ec375..d988b0a2b2d 100644 --- a/docker/README.md +++ b/docker/README.md @@ -14,9 +14,7 @@ The first two types of images are hosted on dockerhub [verlai/verl](https://hub. ## Base Image -The stable base image is ``verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4``. The installed package versions can be found from tags, and the Dockerfile can be found in ``verl[version]-[packages]/Dockerfile.base``. - -The base images for preview are ``verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0`` and ``verlai/verl:base-verl0.5-preview-cu128-cudnn9.8-torch2.7.1-fa2.8.0`` with different CUDA versions. +The stable base image is ``verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4``. The update of base image is not frequent, and the app image can be built on top of it without reinstalling base packages. @@ -25,8 +23,8 @@ The update of base image is not frequent, and the app image can be built on top From this version, we divide images built for vLLM and SGLang as the divergence of dependent packages like FlashInfer. There are 2 types of application images available: -- **vLLM with FSDP and Megatron**: ``verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2`` -- **SGLang with FSDP and Megatron**: ``verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2`` +- **vLLM with FSDP and Megatron**: ``verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2`` +- **SGLang with FSDP and Megatron**: ``verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2`` Docker images with Megatron backends are runnable with large language model like ``Qwen/Qwen3-235B-A22B``, ``deepseek-ai/DeepSeek-V3-0324`` post-training. Refer to the :doc:`Large Language Model Post-Training documentation<../perf/dpsk>` for more details. @@ -54,7 +52,7 @@ docker start verl docker exec -it verl bash ``` -2. If you use the images provided, you only need to install verl itself without dependencies: +2.
If you use the images provided, you only need to install verl itself without dependencies: ```sh # install the nightly version (recommended) diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.12 b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.10.post2.mcore0.12 similarity index 94% rename from docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.12 rename to docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.10.post2.mcore0.12 index 292363f9056..64b0de34b49 100644 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.12 +++ b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.10.post2.mcore0.12 @@ -14,10 +14,10 @@ ENV HF_HUB_ENABLE_HF_TRANSFER="1" # Install FlashInfer Python package RUN pip install --upgrade pip setuptools packaging RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.9rc1 -RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation "sglang[all]==0.4.9.post6" +RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation "sglang[all]==0.4.10.post2" # Fix packages -RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]==4.54.0" accelerate datasets peft hf-transfer \ +RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]==4.54.1" accelerate datasets peft hf-transfer \ "numpy<2.0.0" "pyarrow>=19.0.1" pandas \ ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ pytest py-spy pyext pre-commit ruff diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.8.mcore0.12 b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.8.mcore0.12 deleted file mode 100644 index 0ac7904b7c6..00000000000 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.8.mcore0.12 +++ /dev/null @@ -1,39 +0,0 @@ -# Start from the verl base image -# Dockerfile.base -FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4 - -# Define environments -ENV MAX_JOBS=8 -ENV VLLM_WORKER_MULTIPROC_METHOD=spawn -ENV DEBIAN_FRONTEND=noninteractive -ENV NODE_OPTIONS="" -ENV PIP_ROOT_USER_ACTION=ignore -ENV HF_HUB_ENABLE_HF_TRANSFER="1" - -# Install sglang-0.4.8 and torch-memory-saver -# Install FlashInfer Python package -RUN pip install --upgrade pip setuptools packaging -RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.6.post1 -RUN pip install --resume-retries 999 --no-cache-dir "sglang[all]==0.4.8" && pip install torch-memory-saver --no-cache-dir - -# Fix packages -RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.52.3" accelerate datasets peft hf-transfer \ - "numpy<2.0.0" "pyarrow>=19.0.1" pandas \ - ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ - pytest py-spy pyext pre-commit ruff - -RUN pip uninstall -y pynvml nvidia-ml-py && \ - pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1" - -RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87 - -# Install TransformerEngine -RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1 - -# Install Megatron-LM -RUN pip3 
install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2 - -# Install mbridge -RUN pip3 install --no-cache-dir mbridge - -RUN pip3 install --no-deps --no-cache-dir --no-build-isolation --resume-retries 999 vllm==0.9.2 \ No newline at end of file diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.mcore0.12 b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.13 similarity index 83% rename from docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.mcore0.12 rename to docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.13 index 3f7bff3de1f..d79201a92ee 100644 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.mcore0.12 +++ b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.13 @@ -10,14 +10,14 @@ ENV NODE_OPTIONS="" ENV PIP_ROOT_USER_ACTION=ignore ENV HF_HUB_ENABLE_HF_TRANSFER="1" -# Install sglang-0.4.8 and torch-memory-saver +# Install sglang-0.4.9.post6 # Install FlashInfer Python package RUN pip install --upgrade pip setuptools packaging RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.9rc1 -RUN pip install --resume-retries 999 --no-cache-dir "sglang[all]==0.4.9.post4" && pip install torch-memory-saver --no-cache-dir +RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation "sglang[all]==0.4.9.post6" # Fix packages -RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]==4.53.2" accelerate datasets peft hf-transfer \ +RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]==4.55.4" accelerate datasets peft hf-transfer \ "numpy<2.0.0" "pyarrow>=19.0.1" pandas \ ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ pytest py-spy pyext pre-commit ruff @@ -31,7 +31,7 @@ RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87 RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1 # Install Megatron-LM -RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2 +RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.13.0 # Install mbridge RUN pip3 install --no-cache-dir mbridge \ No newline at end of file diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.12 b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.13 similarity index 79% rename from docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.12 rename to docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.13 index 9746301b471..9d73e0ffeeb 100644 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.12 +++ b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.13 @@ -1,6 +1,6 @@ # Start from the verl base image # Dockerfile.base -FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4 +FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4 # Define environments ENV MAX_JOBS=32 @@ -10,11 +10,12 @@ ENV NODE_OPTIONS="" ENV PIP_ROOT_USER_ACTION=ignore ENV HF_HUB_ENABLE_HF_TRANSFER="1" -# Install torch-2.7.0+cu126 + vllm-0.9.1 -RUN pip install --resume-retries 999 --no-cache-dir vllm==0.9.1 +# Install torch-2.7.1+cu126
+ vllm-0.10.0 +RUN pip install --resume-retries 999 --no-cache-dir vllm==0.10.0 # Fix packages -RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \ +# transformers 4.54.0 is still not supported; require transformers>=4.55.4 +RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.55.4" accelerate datasets peft hf-transfer \ "numpy<2.0.0" "pyarrow>=19.0.1" pandas \ ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ pytest py-spy pyext pre-commit ruff @@ -28,7 +29,7 @@ RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87 RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1 # Install Megatron-LM -RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2 +RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.13.0 # Install mbridge RUN pip3 install --no-cache-dir mbridge diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.base.torch2.7.0 b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.base.torch2.7.0 deleted file mode 100644 index 30251f578e9..00000000000 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.base.torch2.7.0 +++ /dev/null @@ -1,133 +0,0 @@ -# Base Docker Image of verl, with CUDA/Torch/FlashAttn/Apex/TransformerEngine, without other frameworks -# Target: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0-fi0.2.6 -# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10) -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html -FROM nvcr.io/nvidia/pytorch:24.08-py3 - -# Define environments -ENV MAX_JOBS=16 -ENV VLLM_WORKER_MULTIPROC_METHOD=spawn -ENV DEBIAN_FRONTEND=noninteractive -ENV NODE_OPTIONS="" -ENV PIP_ROOT_USER_ACTION=ignore -ENV HF_HUB_ENABLE_HF_TRANSFER="1" - -# Define installation arguments -ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ -ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - -# Set apt source -RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \ - { \ - echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \ - echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \ - echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \ - echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \ - } > /etc/apt/sources.list - -# Install systemctl -RUN apt-get update && \ - apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \ - apt-get clean - -# Install tini -RUN apt-get update && \ - apt-get install -y tini aria2 libfreeimage3 libfreeimage-dev zlib1g htop && \ - apt-get clean - -# Change pip source -RUN pip config set global.index-url "${PIP_INDEX}" && \ - pip config set global.extra-index-url "${PIP_INDEX}" && \ - python -m pip install --upgrade pip - -# Uninstall nv-pytorch fork -RUN pip uninstall -y torch torchvision torchaudio \ - pytorch-quantization pytorch-triton torch-tensorrt \ - xgboost transformer_engine flash_attn apex megatron-core grpcio - -RUN pip install --resume-retries 999 --no-cache-dir torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 - -# Install flash-attn-2.7.4.post1, although built with torch2.6, it is compatible with torch2.7 -#
https://github.com/Dao-AILab/flash-attention/issues/1644#issuecomment-2899396361 -RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \ - URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \ - FILE="flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \ - wget -nv "${URL}" && \ - pip install --no-cache-dir "${FILE}" - -# Fix packages -RUN pip uninstall -y pynvml nvidia-ml-py && \ - pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1" - -# Install cudnn -RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \ - dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \ - cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \ - apt-get update && \ - apt-get -y install cudnn-cuda-12 && \ - rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb - -# Install Apex -RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --resume-retries 999 git+https://github.com/NVIDIA/apex.git - -# Profiling tools -RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \ - apt-get update && apt-get install -y libxcb-cursor0 - -RUN apt-get install -y ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \ - rm -rf /usr/local/cuda/bin/nsys && \ - ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \ - rm -rf /usr/local/cuda/bin/nsys-ui && \ - ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \ - rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb - -RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \ - "numpy<2.0.0" "pyarrow>=19.0.1" pandas cuda-bindings \ - ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ - pytest py-spy pyext pre-commit ruff - -# Install DeepEP -## the dependency of IBGDA -RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so - -## Clone and build deepep and deepep-nvshmem -RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \ - git clone https://github.com/deepseek-ai/DeepEP.git && \ - cd DeepEP && git checkout a84a248 - -# Prepare nvshmem -RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \ - tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \ - cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch - -ENV CUDA_HOME=/usr/local/cuda -### Set MPI environment variables. Having errors when not set. 
-ENV CPATH=/usr/local/mpi/include:$CPATH -ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH -ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH -ENV GDRCOPY_HOME=/workspace/gdrcopy - -## Build deepep-nvshmem -RUN cd deepep-nvshmem && \ - NVSHMEM_SHMEM_SUPPORT=0 \ - NVSHMEM_UCX_SUPPORT=0 \ - NVSHMEM_USE_NCCL=0 \ - NVSHMEM_MPI_SUPPORT=0 \ - NVSHMEM_IBGDA_SUPPORT=1 \ - NVSHMEM_PMIX_SUPPORT=0 \ - NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ - NVSHMEM_USE_GDRCOPY=1 \ - cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install - -ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install -ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH -ENV PATH=$NVSHMEM_DIR/bin:$PATH - -## Build deepep -RUN cd DeepEP && \ - python setup.py install - -# Reset pip config -RUN pip config unset global.index-url && \ - pip config unset global.extra-index-url - diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/README.md b/docker/verl0.5-cu126-torch2.7-fa2.7.4/README.md index 2db8c58d51d..023e0eec0fa 100644 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/README.md +++ b/docker/verl0.5-cu126-torch2.7-fa2.7.4/README.md @@ -20,9 +20,7 @@ megatron.core==core_r0.13.0 ## Target - Base image: - - `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4`: We offer a base image with deep ep built in, for vllm - - `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4`: We offer a base image with deep ep built in, for sglang + - `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4`: We offer a base image with deep ep built in, for vllm/sglang - App image: - - `verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2` - - `verlai/verl:app-verl0.5-sglang0.4.8-mcore0.12.2-te2.2` - - `verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2` \ No newline at end of file + - `verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2` + - `verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2` diff --git a/docs/ascend_tutorial/ascend_quick_start.rst b/docs/ascend_tutorial/ascend_quick_start.rst index 390c864d899..90bf0aebaab 100644 --- a/docs/ascend_tutorial/ascend_quick_start.rst +++ b/docs/ascend_tutorial/ascend_quick_start.rst @@ -187,6 +187,8 @@ vllm & vllm-ascend +-----------+-------------------------+-------------+-------------------+-------------------+-------------------+--------------------------+ | DAPO | Qwen3-14B-base | 5.9% | pending | FSDP | vllm-ascend | Atlas 200T A2 Box16 | +-----------+-------------------------+-------------+-------------------+-------------------+-------------------+--------------------------+ +| DAPO | Qwen3-30B-base | 1.08% | pending | FSDP | vllm-ascend | Atlas 200T A2 Box16 | ++-----------+-------------------------+-------------+-------------------+-------------------+-------------------+--------------------------+ **表2** SFT类算法 diff --git a/docs/perf/nsight_profiling.md b/docs/perf/nsight_profiling.md index 452d5533232..490de5e7e4f 100644 --- a/docs/perf/nsight_profiling.md +++ b/docs/perf/nsight_profiling.md @@ -60,16 +60,16 @@ To enable profiling for specific components and steps, modify your ppo_trainer.y discrete: False actor_rollout_ref: actor: - profile: + profiler: enable: True all_ranks: True # rollout & ref follow actor settings critic: - profile: + profiler: enable: True all_ranks: True reward_model: - profile: + profiler: enable: True all_ranks: True ``` diff --git a/docs/start/install.rst b/docs/start/install.rst index a384a4dc3cf..44ab7b46cd3 100644 --- 
a/docs/start/install.rst +++ b/docs/start/install.rst @@ -52,7 +52,7 @@ The first two types of images are hosted on dockerhub `verlai/verl ` for more details. @@ -77,7 +77,7 @@ Community images are provided by the community, including the latest versions of For latest vLLM with FSDP, please refer to `hiyouga/verl `_ repository and the latest version is ``hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0``. -For latest SGLang with FSDP, please refer to `hebiaobuaa/verl `_ repository and the latest version is ``hebiaobuaa/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2`` which is provided by SGLang RL Group. +For latest SGLang with FSDP, please refer to `hebiaobuaa/verl `_ repository and the latest version is ``hebiaobuaa/verl:app-verl0.5-sglang0.4.10.post2-mcore0.12.2-te2.2`` which is provided by SGLang RL Group. See files under ``docker/`` for NGC-based image or if you want to build your own. diff --git a/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh b/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh index b876b19ba57..2f5a93e4466 100644 --- a/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh +++ b/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh @@ -6,7 +6,7 @@ set -x # huggingface-cli download deepseek-ai/DeepSeek-V3-0324 # no offline dist checkpoint needed, now with mbridge>=0.13.0, we can directly init model from huggingface downloaded fp8 weights -# tested on docker://verlai/verl:app-verl0.5-vllm0.10.0-mcore0.13.0-te2.2 +# tested on docker://verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 LLM="" diff --git a/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh b/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh index 70ea42f2da0..27ab478da28 100644 --- a/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh +++ b/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh @@ -48,6 +48,13 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.rollout.enable_chunked_prefill=False \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.ref.profiler.enable=True \ + actor_rollout_ref.ref.profiler.ranks=$PROFILE_RANKS \ + actor_rollout_ref.ref.profiler.all_ranks=$PROFILE_RANKS_ALL \ + actor_rollout_ref.ref.profiler.tool_config.npu.discrete=$DISCRETE \ + actor_rollout_ref.ref.profiler.tool_config.npu.contents=$CONTENTS \ + actor_rollout_ref.ref.profiler.tool_config.npu.level=$LEVEL \ + actor_rollout_ref.ref.profiler.tool_config.npu.analysis=$ANALYSIS \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger=console \ diff --git a/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh b/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh index a9fff3437e3..1ac6dfe9445 100644 --- a/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh +++ b/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh @@ -46,6 +46,12 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.rollout.enable_chunked_prefill=False \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.ref.profiler.enable=True \ + actor_rollout_ref.ref.profiler.all_ranks=$PROFILE_RANKS_ALL \ + actor_rollout_ref.ref.profiler.tool_config.npu.discrete=$DISCRETE \ + actor_rollout_ref.ref.profiler.tool_config.npu.contents=$CONTENTS \ + actor_rollout_ref.ref.profiler.tool_config.npu.level=$LEVEL \ + 
actor_rollout_ref.ref.profiler.tool_config.npu.analysis=$ANALYSIS \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger=console \ diff --git a/examples/grpo_trainer/run_qwen2_5_vl-7b-sglang.sh b/examples/grpo_trainer/run_qwen2_5_vl-7b-sglang.sh index 985bc59c04d..86267a5602a 100644 --- a/examples/grpo_trainer/run_qwen2_5_vl-7b-sglang.sh +++ b/examples/grpo_trainer/run_qwen2_5_vl-7b-sglang.sh @@ -40,6 +40,7 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.rollout.n=5 \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.rollout.mode=sync \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh b/examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh index 7bf06e3ada5..6937db5fcfa 100644 --- a/examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh +++ b/examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh @@ -168,7 +168,7 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat actor_rollout_ref.rollout.free_cache_engine=True \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${infer_ppo_micro_batch_size_per_gpu} \ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \ + actor_rollout_ref.ref.megatron.use_dist_checkpointing=${USE_DIST_CKPT} \ actor_rollout_ref.ref.megatron.param_offload=${offload} \ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${REF_TP} \ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${REF_PP} \ @@ -192,4 +192,4 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat trainer.save_freq=100 \ trainer.total_epochs=10 \ trainer.resume_mode=auto \ - trainer.log_val_generations=10 \ No newline at end of file + trainer.log_val_generations=10 diff --git a/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh b/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh index 9038813c864..3c3dd6a4515 100644 --- a/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh +++ b/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh @@ -48,7 +48,8 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.rollout.n=16 \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ - actor_rollout_ref.rollout.over_sample_rate=0 \ + actor_rollout_ref.rollout.over_sample_rate=0.1 \ + actor_rollout_ref.rollout.mode=sync \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/sglang_multiturn/run_qwen3-4b_gsm8k_multiturn.sh b/examples/sglang_multiturn/run_qwen3-4b_gsm8k_multiturn.sh index 56228f4b55e..6f1f99e4bd2 100755 --- a/examples/sglang_multiturn/run_qwen3-4b_gsm8k_multiturn.sh +++ b/examples/sglang_multiturn/run_qwen3-4b_gsm8k_multiturn.sh @@ -35,6 +35,8 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.rollout.name=sglang \ actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.rollout.over_sample_rate=0.1 \ + actor_rollout_ref.rollout.mode=sync \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ algorithm.use_kl_in_reward=False \ diff --git a/examples/sglang_multiturn/run_qwen3_4b_dapo_multiturn.sh 
b/examples/sglang_multiturn/run_qwen3_4b_dapo_multiturn.sh index 53f856cca27..39948693264 100644 --- a/examples/sglang_multiturn/run_qwen3_4b_dapo_multiturn.sh +++ b/examples/sglang_multiturn/run_qwen3_4b_dapo_multiturn.sh @@ -17,10 +17,30 @@ hf download \ --repo-type dataset \ --local-dir $HOME/data/Maxwell-Jia/AIME_2024 -# Note that this script is using AgentLoop instead of SGLang Multi-Turn -# We are concerned that the reward is not actually converge, since the -# reward of retool is encouraging the model to generate more turns to -# call more tools. The answers are not actually correct. + +# Note: +# 1. +# A sandbox fusion server is needed to run the code interpreter tool. +# docker run -it -p 8080:8080 volcengine/sandbox-fusion:server-20250609 + +# 2. +# The model located at font-info/qwen3-4b-sft-SGLang-RL (https://huggingface.co/font-info/qwen3-4b-sft-SGLang-RL) +# is a fine-tuned version provided by the SGLang RL team. Without supervised fine-tuning (SFT) +# on the ReTool dataset, DAPO training will not converge. + +# If you still wish to perform SFT from scratch, follow the steps below: + +# Step 1: Download the SFT dataset +#huggingface-cli download JoeYing/ReTool-SFT --repo-type dataset --local-dir ./ReTool-SFT + +# Step 2: Preprocess the data for SFT +#python3 recipe/retool/retool_sft_preprocess.py + +# Step 3: Run SFT training +#bash recipe/retool/run_qwen2-32b_sft.sh + +# Having trouble with setup? See https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/verl/multi-turn/release_log/latest_sglang.md for more details. + python3 -m verl.trainer.main_ppo \ algorithm.adv_estimator=grpo \ @@ -38,7 +58,7 @@ python3 -m verl.trainer.main_ppo \ data.custom_cls.name=CustomRLHFDataset \ custom_reward_function.path=$PROJECT_DIR/recipe/retool/retool.py \ custom_reward_function.name=compute_score \ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 \ + actor_rollout_ref.model.path=font-info/qwen3-4b-sft-SGLang-RL \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.model.enable_gradient_checkpointing=True \ actor_rollout_ref.actor.use_kl_loss=False \ @@ -47,16 +67,17 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.clip_ratio_high=0.28 \ actor_rollout_ref.actor.clip_ratio_c=10.0 \ actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.use_dynamic_bsz=False \ actor_rollout_ref.actor.ppo_mini_batch_size=32 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=32768 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ actor_rollout_ref.rollout.name=sglang \ actor_rollout_ref.rollout.mode=async \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ actor_rollout_ref.rollout.update_weights_bucket_megabytes=512 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \ actor_rollout_ref.rollout.multi_stage_wake_up=True \ actor_rollout_ref.rollout.multi_turn.enable=True \ actor_rollout_ref.rollout.multi_turn.max_user_turns=16 \ @@ -69,7 +90,7 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.rollout.val_kwargs.n=30 \ trainer.logger=['console','wandb'] \ trainer.project_name=sglang-dapo-multiturn \ - trainer.experiment_name=qwen3-4b_dapo_multiturn \ + trainer.experiment_name=qwen3_4b_sft_dapo_multiturn \
trainer.n_gpus_per_node=8 \ trainer.log_val_generations=20 \ trainer.val_before_train=True \ diff --git a/recipe/dapo/run_dapo_qwen3_moe_30b_base_npu_fsdp.sh b/recipe/dapo/run_dapo_qwen3_moe_30b_base_npu_fsdp.sh new file mode 100644 index 00000000000..36cf175a18f --- /dev/null +++ b/recipe/dapo/run_dapo_qwen3_moe_30b_base_npu_fsdp.sh @@ -0,0 +1,146 @@ +#!/usr/bin/env bash +set -euxo pipefail + +project_name='DAPO' +exp_name='DAPO-Qwen3-MOE-30B-FSDP-128rank-gbs512' + +NNODES=8 +NPUS_PER_NODE=16 + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 20)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 +loss_agg_mode="token-mean" +ppo_mini_batch_size=32 + +enable_filter_groups=True +filter_groups_metric=acc +max_num_gen_batches=10 +train_prompt_bsz=512 +gen_prompt_bsz=$((train_prompt_bsz * 3)) +n_resp_per_prompt=16 + +RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +WORKING_DIR=${WORKING_DIR:-"${PWD}"} +RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} + +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-30B-A3B-Base"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +sp_size=16 # For load-balance. For smaller clusters this can be set to as little as 2. +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) / 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / 2)) +offload=True +recompute=True +max_num_seqs=128 +gen_tp=2 +gen_world_size=$((NNODES * NPUS_PER_NODE)) # NNODES * NPUS_PER_NODE + + +ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ + -- python3 -m recipe.dapo.main_dapo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + actor_rollout_ref.rollout.max_num_seqs=${max_num_seqs} \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + algorithm.filter_groups.enable=${enable_filter_groups} \ + algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches} \ + algorithm.filter_groups.metric=${filter_groups_metric} \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ +
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + +actor_rollout_ref.model.override_config.attention_dropout=0. \ + +actor_rollout_ref.model.override_config.embd_pdrop=0. \ + +actor_rollout_ref.model.override_config.resid_pdrop=0. \ + actor_rollout_ref.model.enable_gradient_checkpointing=${recompute} \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${ppo_mini_batch_size} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.forward_prefetch=False \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + +actor_rollout_ref.rollout.rollout_world_size=${gen_world_size} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \ + actor_rollout_ref.ref.fsdp_config.forward_prefetch=False \ + actor_rollout_ref.rollout.enforce_eager=False \ + actor_rollout_ref.rollout.free_cache_engine=True \ + reward_model.reward_manager=dapo \ + reward_model.overlong_buffer.enable=${enable_overlong_buffer} \ + reward_model.overlong_buffer.len=${overlong_buffer_len} \ + reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} \ + trainer.logger=['console','wandb'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=False \ + trainer.test_freq=5 \ + trainer.save_freq=-1 \ + trainer.total_epochs=1 \ + trainer.device="npu" \ + actor_rollout_ref.actor.use_torch_compile=False \ + actor_rollout_ref.ref.use_torch_compile=False + diff --git a/recipe/one_step_off_policy/main_ppo.py b/recipe/one_step_off_policy/main_ppo.py index ea869b5489f..344fe4b9f0c 100644 --- a/recipe/one_step_off_policy/main_ppo.py +++ b/recipe/one_step_off_policy/main_ppo.py @@ -23,9 +23,12 @@ import ray from omegaconf import OmegaConf +from recipe.one_step_off_policy.utils import need_critic from verl.trainer.constants_ppo import get_ppo_ray_runtime_env from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler from verl.trainer.ppo.reward import load_reward_manager +from verl.trainer.ppo.utils import 
need_reference_policy +from verl.utils.config import validate_config from .ray_trainer import OneStepOffRayTrainer @@ -87,20 +90,6 @@ def run(self, config): OmegaConf.resolve(config) - # Download the checkpoint from HDFS to the local machine. - # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on - local_path = copy_to_local( - config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False) - ) - - # Instantiate the tokenizer and processor. - from verl.utils import hf_processor, hf_tokenizer - - trust_remote_code = config.data.get("trust_remote_code", False) - tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) - # Used for multimodal LLM, could be None - processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True) - # Define worker classes based on the actor strategy. if config.actor_rollout_ref.actor.strategy == "fsdp2": assert config.actor_rollout_ref.actor.strategy == config.critic.strategy @@ -190,6 +179,27 @@ def run(self, config): role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker) mapping[Role.RefPolicy] = global_pool_id + # validate config + validate_config( + config=config, + use_reference_policy=need_reference_policy(role_worker_mapping), + use_critic=need_critic(config), + ) + + # Download the checkpoint from HDFS to the local machine. + # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on + local_path = copy_to_local( + config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False) + ) + + # Instantiate the tokenizer and processor. + from verl.utils import hf_processor, hf_tokenizer + + trust_remote_code = config.data.get("trust_remote_code", False) + tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) + # Used for multimodal LLM, could be None + processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True) + # Load the reward manager for training and validation. 
reward_fn = load_reward_manager( config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {}) diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py index 70399f82f75..cf989d315d3 100644 --- a/recipe/one_step_off_policy/ray_trainer.py +++ b/recipe/one_step_off_policy/ray_trainer.py @@ -28,11 +28,12 @@ from torch.utils.data import Dataset, Sampler from tqdm import tqdm +from recipe.one_step_off_policy.utils import need_critic from verl import DataProto from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.single_controller.ray.base import create_colocated_worker_cls from verl.trainer.ppo import core_algos -from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss +from verl.trainer.ppo.core_algos import agg_loss from verl.trainer.ppo.metric_utils import ( compute_data_metrics, compute_throughout_metrics, @@ -41,13 +42,12 @@ from verl.trainer.ppo.ray_trainer import ( RayPPOTrainer, ResourcePoolManager, - Role, - WorkerType, apply_kl_penalty, compute_advantage, compute_response_mask, ) from verl.trainer.ppo.reward import compute_reward, compute_reward_async +from verl.trainer.ppo.utils import Role, WorkerType, need_reference_policy, need_reward_model from verl.utils.debug import marked_timer from verl.utils.metric import ( reduce_metrics, @@ -140,8 +140,9 @@ def __init__( self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager - self.use_reference_policy = Role.RefPolicy in role_worker_mapping - self.use_rm = Role.RewardModel in role_worker_mapping + self.use_reference_policy = need_reference_policy(self.role_worker_mapping) + self.use_rm = need_reward_model(self.role_worker_mapping) + self.use_critic = need_critic(config) self.ray_worker_group_cls = ray_worker_group_cls self.device_name = device_name self.validation_generations_logger = ValidationGenerationsLogger() @@ -154,23 +155,6 @@ def __init__( if config.algorithm.use_kl_in_reward: self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl) - if self.config.algorithm.adv_estimator == AdvantageEstimator.GAE: - self.use_critic = True - elif self.config.algorithm.adv_estimator in [ - AdvantageEstimator.GRPO, - AdvantageEstimator.GRPO_PASSK, - AdvantageEstimator.REINFORCE_PLUS_PLUS, - # AdvantageEstimator.REMAX, # TODO:REMAX advantage estimator is not yet supported in one_step_off_policy - AdvantageEstimator.RLOO, - AdvantageEstimator.OPO, - AdvantageEstimator.REINFORCE_PLUS_PLUS_BASELINE, - AdvantageEstimator.GPG, - ]: - self.use_critic = False - else: - raise NotImplementedError - - self._validate_config() self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) def _validate(self): diff --git a/recipe/one_step_off_policy/utils.py b/recipe/one_step_off_policy/utils.py new file mode 100644 index 00000000000..1879b0672fa --- /dev/null +++ b/recipe/one_step_off_policy/utils.py @@ -0,0 +1,38 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from omegaconf import DictConfig + +from verl.trainer.ppo.core_algos import AdvantageEstimator + + +def need_critic(config: DictConfig) -> bool: + """Given a config, do we need critic""" + if config.algorithm.adv_estimator == AdvantageEstimator.GAE: + return True + elif config.algorithm.adv_estimator in [ + AdvantageEstimator.GRPO, + AdvantageEstimator.GRPO_PASSK, + AdvantageEstimator.REINFORCE_PLUS_PLUS, + # AdvantageEstimator.REMAX, # TODO:REMAX advantage estimator is not yet supported in one_step_off_policy + AdvantageEstimator.RLOO, + AdvantageEstimator.OPO, + AdvantageEstimator.REINFORCE_PLUS_PLUS_BASELINE, + AdvantageEstimator.GPG, + ]: + return False + else: + raise NotImplementedError diff --git a/recipe/prime/main_prime.py b/recipe/prime/main_prime.py index 4c3ed6e6d9e..39d20de4326 100644 --- a/recipe/prime/main_prime.py +++ b/recipe/prime/main_prime.py @@ -33,6 +33,9 @@ import ray from omegaconf import OmegaConf +from verl.trainer.ppo.utils import need_reference_policy +from verl.utils.config import validate_config + from .prime_ray_trainer import RayPRIMETrainer @@ -67,14 +70,6 @@ def main_task(config, compute_score=None): pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values OmegaConf.resolve(config) - # download the checkpoint from hdfs - local_path = copy_local_path_from_hdfs(config.actor_rollout_ref.model.path) - - # instantiate tokenizer - from verl.utils import hf_tokenizer - - tokenizer = hf_tokenizer(local_path) - # define worker classes if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: assert config.critic.strategy in {"fsdp", "fsdp2"} @@ -118,6 +113,21 @@ def main_task(config, compute_score=None): role_worker_mapping[Role.RewardModel] = ray.remote(PRIMERewardModelWorker) mapping[Role.RewardModel] = global_pool_id + # validate config + # TODO: Additional config checks can be added with proper function under prime recipe + validate_config( + config=config, + use_reference_policy=need_reference_policy(role_worker_mapping), + use_critic=False, + ) + + # download the checkpoint from hdfs + local_path = copy_local_path_from_hdfs(config.actor_rollout_ref.model.path) + + # instantiate tokenizer + from verl.utils import hf_tokenizer + + tokenizer = hf_tokenizer(local_path) reward_manager_name = config.reward_model.get("reward_manager", "naive") if reward_manager_name == "naive": from verl.workers.reward_manager import NaiveRewardManager diff --git a/recipe/prime/prime_ray_trainer.py b/recipe/prime/prime_ray_trainer.py index a5ad96431a8..6782b32256a 100644 --- a/recipe/prime/prime_ray_trainer.py +++ b/recipe/prime/prime_ray_trainer.py @@ -30,7 +30,8 @@ from verl.single_controller.ray import RayWorkerGroup from verl.trainer.ppo.core_algos import agg_loss from verl.trainer.ppo.metric_utils import _compute_response_info -from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType +from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager +from verl.trainer.ppo.utils import Role, WorkerType from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn from verl.utils.metric import reduce_metrics @@ -176,10 +177,6 @@ def __init__( self.use_critic = False - def _validate_config(self): - super()._validate_config() - # TODO: Additional config checks can be added here - def _create_dataloader(self, 
*args, **kwargs): from torch.utils.data import DataLoader, RandomSampler, SequentialSampler diff --git a/recipe/retool/retool.py b/recipe/retool/retool.py index b4d6028ff8f..7bcc70453ee 100644 --- a/recipe/retool/retool.py +++ b/recipe/retool/retool.py @@ -112,7 +112,7 @@ def compute_score(data_source, solution_str, ground_truth, extra_info): num_turns = extra_info["num_turns"] if result["score"] < 0: tool_call_reward = (num_turns - 2) / 2 * 0.1 - result["score"] = min(0, result["score"] + tool_call_reward) + result["score"] = min(-0.6, result["score"] + tool_call_reward) if result["pred"] is None: result["pred"] = "" diff --git a/recipe/retool/sandbox_fusion_tool_config.yaml b/recipe/retool/sandbox_fusion_tool_config.yaml index 20345715525..71b10e50ec9 100644 --- a/recipe/retool/sandbox_fusion_tool_config.yaml +++ b/recipe/retool/sandbox_fusion_tool_config.yaml @@ -1,7 +1,7 @@ tools: - class_name: "recipe.retool.retool.CustomSandboxFusionTool" config: - sandbox_fusion_url: "https://***.apigateway-cn-beijing.volceapi.com/run_code" + sandbox_fusion_url: "http://localhost:8080/run_code" num_workers: 128 enable_global_rate_limit: true rate_limit: 128 diff --git a/recipe/spin/main_spin.py b/recipe/spin/main_spin.py index 782fe4cff17..e66ed07256e 100644 --- a/recipe/spin/main_spin.py +++ b/recipe/spin/main_spin.py @@ -19,7 +19,9 @@ import ray from recipe.spin.spin_trainer import RaySPINTrainer +from recipe.spin.utils import validate_config from verl.trainer.ppo.reward import get_custom_reward_fn +from verl.trainer.ppo.utils import need_reference_policy @hydra.main(config_path="config", config_name="spin_trainer", version_base=None) @@ -56,16 +58,6 @@ def run(self, config): pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values OmegaConf.resolve(config) - # download the checkpoint from hdfs - local_path = copy_to_local(config.actor_rollout_ref.model.path) - - # instantiate tokenizer - from verl.utils import hf_processor, hf_tokenizer - - trust_remote_code = config.data.get("trust_remote_code", False) - tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) - processor = hf_processor(local_path, use_fast=True) # used for multimodal LLM, could be none - # define worker classes if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: assert config.critic.strategy in {"fsdp", "fsdp2"} @@ -117,6 +109,23 @@ def run(self, config): role_worker_mapping[Role.RefPolicy] = ray.remote(SPINRolloutRefWorker) mapping[Role.RefPolicy] = global_pool_id + # validate config + validate_config( + config=config, + use_reference_policy=need_reference_policy(role_worker_mapping), + use_critic=False, + ) + + # download the checkpoint from hdfs + local_path = copy_to_local(config.actor_rollout_ref.model.path) + + # instantiate tokenizer + from verl.utils import hf_processor, hf_tokenizer + + trust_remote_code = config.data.get("trust_remote_code", False) + tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) + processor = hf_processor(local_path, use_fast=True) # used for multimodal LLM, could be none + from verl.workers.reward_manager import get_reward_manager_cls # Note(haibin.lin): please make sure custom reward managers are imported and diff --git a/recipe/spin/spin_trainer.py b/recipe/spin/spin_trainer.py index 43789218f57..bb6fe672634 100644 --- a/recipe/spin/spin_trainer.py +++ b/recipe/spin/spin_trainer.py @@ -19,7 +19,6 @@ from collections import defaultdict from contextlib import contextmanager from dataclasses import
dataclass, field -from enum import Enum from pprint import pprint from typing import Any, Optional @@ -35,7 +34,6 @@ from recipe.spin import core_algos from verl import DataProto from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto -from verl.single_controller.base import Worker from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup from verl.single_controller.ray.base import create_colocated_worker_cls from verl.trainer.ppo.metric_utils import ( @@ -44,27 +42,12 @@ process_validation_metrics, reduce_metrics, ) -from verl.trainer.ppo.ray_trainer import Role +from verl.trainer.ppo.utils import Role, WorkerType, need_reference_policy, need_reward_model from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path from verl.utils.seqlen_balancing import get_seqlen_balanced_partitions, log_seqlen_unbalance from verl.utils.torch_functional import masked_mean from verl.utils.tracking import ValidationGenerationsLogger -WorkerType = type[Worker] - - -class AdvantageEstimator(str, Enum): - """ - Using an enumeration class to avoid spelling errors in adv_estimator - """ - - GAE = "gae" - GRPO = "grpo" - REINFORCE_PLUS_PLUS = "reinforce_plus_plus" - REINFORCE_PLUS_PLUS_BASELINE = "reinforce_plus_plus_baseline" - REMAX = "remax" - RLOO = "rloo" - @dataclass class ResourcePoolManager: @@ -386,8 +369,9 @@ def __init__( self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager - self.use_reference_policy = Role.RefPolicy in role_worker_mapping - self.use_rm = Role.RewardModel in role_worker_mapping + self.use_reference_policy = need_reference_policy(role_worker_mapping) + self.use_rm = need_reward_model(role_worker_mapping) + self.use_critic = False self.ray_worker_group_cls = ray_worker_group_cls self.validation_generations_logger = ValidationGenerationsLogger() self.async_rollout_mode = False @@ -398,146 +382,8 @@ def __init__( if config.algorithm.use_kl_in_reward: self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl) - self.use_critic = False - self._validate_config() self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) - def _validate_config(self): - config = self.config - # number of GPUs total - n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes - - # 1. Check total batch size for data correctness - real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n - assert real_train_batch_size % n_gpus == 0, ( - f"real_train_batch_size ({real_train_batch_size}) must be divisible by total n_gpus ({n_gpus})." - ) - - # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu" - # We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu". - def check_mutually_exclusive(mbs, mbs_per_gpu, name: str): - settings = { - "actor_rollout_ref.actor": "micro_batch_size", - "critic": "micro_batch_size", - "reward_model": "micro_batch_size", - "actor_rollout_ref.ref": "log_prob_micro_batch_size", - "actor_rollout_ref.rollout": "log_prob_micro_batch_size", - } - - if name in settings: - param = settings[name] - param_per_gpu = f"{param}_per_gpu" - - if mbs is None and mbs_per_gpu is None: - raise ValueError( - f"[{name}] Please set at least one of '{name}.{param}' or '{name}.{param_per_gpu}'." - ) - - if mbs is not None and mbs_per_gpu is not None: - raise ValueError( - f"[{name}] You have set both '{name}.{param}' AND '{name}.{param_per_gpu}'. 
" - f"Please remove '{name}.{param}' because only '*_{param_per_gpu}' is supported " - f"(the former is deprecated)." - ) - - if not config.actor_rollout_ref.actor.use_dynamic_bsz: - # actor: ppo_micro_batch_size vs. ppo_micro_batch_size_per_gpu - check_mutually_exclusive( - config.actor_rollout_ref.actor.ppo_micro_batch_size, - config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu, - "actor_rollout_ref.actor", - ) - - if self.use_reference_policy: - # reference: log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu - check_mutually_exclusive( - config.actor_rollout_ref.ref.log_prob_micro_batch_size, - config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu, - "actor_rollout_ref.ref", - ) - - # The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu - check_mutually_exclusive( - config.actor_rollout_ref.rollout.log_prob_micro_batch_size, - config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu, - "actor_rollout_ref.rollout", - ) - - if self.use_critic and not config.critic.use_dynamic_bsz: - # Check for critic micro-batch size conflicts - check_mutually_exclusive( - config.critic.ppo_micro_batch_size, config.critic.ppo_micro_batch_size_per_gpu, "critic" - ) - - # Check for reward model micro-batch size conflicts - if config.reward_model.enable and not config.reward_model.use_dynamic_bsz: - check_mutually_exclusive( - config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model" - ) - - # Actor - # check if train_batch_size is larger than ppo_mini_batch_size - # if NOT dynamic_bsz, we must ensure: - # ppo_mini_batch_size is divisible by ppo_micro_batch_size - # ppo_micro_batch_size * sequence_parallel_size >= n_gpus - if not config.actor_rollout_ref.actor.use_dynamic_bsz: - assert config.data.train_batch_size >= config.actor_rollout_ref.actor.ppo_mini_batch_size - sp_size = config.actor_rollout_ref.actor.get("ulysses_sequence_parallel_size", 1) - if config.actor_rollout_ref.actor.ppo_micro_batch_size is not None: - assert ( - config.actor_rollout_ref.actor.ppo_mini_batch_size - % config.actor_rollout_ref.actor.ppo_micro_batch_size - == 0 - ) - assert config.actor_rollout_ref.actor.ppo_micro_batch_size * sp_size >= n_gpus - - assert config.actor_rollout_ref.actor.loss_agg_mode in [ - "token-mean", - "seq-mean-token-sum", - "seq-mean-token-mean", - ], f"Invalid loss_agg_mode: {config.actor_rollout_ref.actor.loss_agg_mode}" - - if config.algorithm.use_kl_in_reward and config.actor_rollout_ref.actor.use_kl_loss: - print("NOTICE: You have both enabled in-reward kl and kl loss.") - - # critic - if self.use_critic and not config.critic.use_dynamic_bsz: - assert config.data.train_batch_size >= config.critic.ppo_mini_batch_size - sp_size = config.critic.get("ulysses_sequence_parallel_size", 1) - if config.critic.ppo_micro_batch_size is not None: - assert config.critic.ppo_mini_batch_size % config.critic.ppo_micro_batch_size == 0 - assert config.critic.ppo_micro_batch_size * sp_size >= n_gpus - - # Check if use_remove_padding is enabled when using sequence parallelism for fsdp - if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: - if ( - config.actor_rollout_ref.actor.get("ulysses_sequence_parallel_size", 1) > 1 - or config.actor_rollout_ref.ref.get("ulysses_sequence_parallel_size", 1) > 1 - ): - assert config.actor_rollout_ref.model.use_remove_padding, ( - "When using sequence parallelism for actor/ref policy, you must enable `use_remove_padding`." 
- ) - - if self.use_critic and config.critic.strategy in {"fsdp", "fsdp2"}: - if config.critic.get("ulysses_sequence_parallel_size", 1) > 1: - assert config.critic.model.use_remove_padding, ( - "When using sequence parallelism for critic, you must enable `use_remove_padding`." - ) - - if config.data.get("val_batch_size", None) is not None: - print( - "WARNING: val_batch_size is deprecated. Validation datasets are sent to inference engines " - "as a whole batch, which will schedule the memory themselves." - ) - - # check eval config - if config.actor_rollout_ref.rollout.val_kwargs.do_sample: - assert config.actor_rollout_ref.rollout.temperature > 0, ( - "validation gen temperature should be greater than 0 when enabling do_sample" - ) - - print("[validate_config] All configuration checks passed successfully!") - def _create_dataloader(self, train_dataset, val_dataset, collate_fn, train_sampler): """ Creates the train and validation dataloaders. diff --git a/recipe/spin/utils.py b/recipe/spin/utils.py new file mode 100644 index 00000000000..571ad1e9154 --- /dev/null +++ b/recipe/spin/utils.py @@ -0,0 +1,160 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright 2023-2024 SGLang Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from omegaconf import DictConfig + + +def validate_config( + config: DictConfig, + use_reference_policy: bool, + use_critic: bool, +) -> None: + """ + Validate an OmegaConf DictConfig + + Args: + config: The OmegaConf DictConfig to validate. + use_reference_policy (bool): is ref policy needed + use_critic (bool): is critic needed + """ + # number of GPUs total + n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes + + # 1. Check total batch size for data correctness + real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n + assert real_train_batch_size % n_gpus == 0, ( + f"real_train_batch_size ({real_train_batch_size}) must be divisible by total n_gpus ({n_gpus})." + ) + + # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu" + # We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu". + def check_mutually_exclusive(mbs, mbs_per_gpu, name: str): + settings = { + "actor_rollout_ref.actor": "micro_batch_size", + "critic": "micro_batch_size", + "reward_model": "micro_batch_size", + "actor_rollout_ref.ref": "log_prob_micro_batch_size", + "actor_rollout_ref.rollout": "log_prob_micro_batch_size", + } + + if name in settings: + param = settings[name] + param_per_gpu = f"{param}_per_gpu" + + if mbs is None and mbs_per_gpu is None: + raise ValueError(f"[{name}] Please set at least one of '{name}.{param}' or '{name}.{param_per_gpu}'.") + + if mbs is not None and mbs_per_gpu is not None: + raise ValueError( + f"[{name}] You have set both '{name}.{param}' AND '{name}.{param_per_gpu}'. " + f"Please remove '{name}.{param}' because only '*_{param_per_gpu}' is supported " + f"(the former is deprecated)." 
+ ) + + if not config.actor_rollout_ref.actor.use_dynamic_bsz: + # actor: ppo_micro_batch_size vs. ppo_micro_batch_size_per_gpu + check_mutually_exclusive( + config.actor_rollout_ref.actor.ppo_micro_batch_size, + config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu, + "actor_rollout_ref.actor", + ) + + if use_reference_policy: + # reference: log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu + check_mutually_exclusive( + config.actor_rollout_ref.ref.log_prob_micro_batch_size, + config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu, + "actor_rollout_ref.ref", + ) + + # The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu + check_mutually_exclusive( + config.actor_rollout_ref.rollout.log_prob_micro_batch_size, + config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu, + "actor_rollout_ref.rollout", + ) + + if use_critic and not config.critic.use_dynamic_bsz: + # Check for critic micro-batch size conflicts + check_mutually_exclusive( + config.critic.ppo_micro_batch_size, config.critic.ppo_micro_batch_size_per_gpu, "critic" + ) + + # Check for reward model micro-batch size conflicts + if config.reward_model.enable and not config.reward_model.use_dynamic_bsz: + check_mutually_exclusive( + config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model" + ) + + # Actor + # check if train_batch_size is larger than ppo_mini_batch_size + # if NOT dynamic_bsz, we must ensure: + # ppo_mini_batch_size is divisible by ppo_micro_batch_size + # ppo_micro_batch_size * sequence_parallel_size >= n_gpus + if not config.actor_rollout_ref.actor.use_dynamic_bsz: + assert config.data.train_batch_size >= config.actor_rollout_ref.actor.ppo_mini_batch_size + sp_size = config.actor_rollout_ref.actor.get("ulysses_sequence_parallel_size", 1) + if config.actor_rollout_ref.actor.ppo_micro_batch_size is not None: + assert ( + config.actor_rollout_ref.actor.ppo_mini_batch_size % config.actor_rollout_ref.actor.ppo_micro_batch_size + == 0 + ) + assert config.actor_rollout_ref.actor.ppo_micro_batch_size * sp_size >= n_gpus + + assert config.actor_rollout_ref.actor.loss_agg_mode in [ + "token-mean", + "seq-mean-token-sum", + "seq-mean-token-mean", + ], f"Invalid loss_agg_mode: {config.actor_rollout_ref.actor.loss_agg_mode}" + + if config.algorithm.use_kl_in_reward and config.actor_rollout_ref.actor.use_kl_loss: + print("NOTICE: You have both enabled in-reward kl and kl loss.") + + # critic + if use_critic and not config.critic.use_dynamic_bsz: + assert config.data.train_batch_size >= config.critic.ppo_mini_batch_size + sp_size = config.critic.get("ulysses_sequence_parallel_size", 1) + if config.critic.ppo_micro_batch_size is not None: + assert config.critic.ppo_mini_batch_size % config.critic.ppo_micro_batch_size == 0 + assert config.critic.ppo_micro_batch_size * sp_size >= n_gpus + + # Check if use_remove_padding is enabled when using sequence parallelism for fsdp + if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: + if ( + config.actor_rollout_ref.actor.get("ulysses_sequence_parallel_size", 1) > 1 + or config.actor_rollout_ref.ref.get("ulysses_sequence_parallel_size", 1) > 1 + ): + assert config.actor_rollout_ref.model.use_remove_padding, ( + "When using sequence parallelism for actor/ref policy, you must enable `use_remove_padding`." 
+ ) + + if use_critic and config.critic.strategy in {"fsdp", "fsdp2"}: + if config.critic.get("ulysses_sequence_parallel_size", 1) > 1: + assert config.critic.model.use_remove_padding, ( + "When using sequence parallelism for critic, you must enable `use_remove_padding`." + ) + + if config.data.get("val_batch_size", None) is not None: + print( + "WARNING: val_batch_size is deprecated. Validation datasets are sent to inference engines " + "as a whole batch, which will schedule the memory themselves." + ) + + # check eval config + if config.actor_rollout_ref.rollout.val_kwargs.do_sample: + assert config.actor_rollout_ref.rollout.temperature > 0, ( + "validation gen temperature should be greater than 0 when enabling do_sample" + ) + + print("[validate_config] All configuration checks passed successfully!") diff --git a/recipe/sppo/main_sppo.py b/recipe/sppo/main_sppo.py index eb080eba06b..7f5a9e2c9ad 100644 --- a/recipe/sppo/main_sppo.py +++ b/recipe/sppo/main_sppo.py @@ -24,6 +24,8 @@ from omegaconf import OmegaConf from verl.trainer.ppo.reward import load_reward_manager +from verl.trainer.ppo.utils import need_reference_policy +from verl.utils.config import validate_config from .sppo_ray_trainer import RaySPPOTrainer @@ -66,16 +68,6 @@ def run(self, config): pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values OmegaConf.resolve(config) - # download the checkpoint from hdfs - local_path = copy_to_local(config.actor_rollout_ref.model.path) - - # instantiate tokenizer - from verl.utils import hf_processor, hf_tokenizer - - trust_remote_code = config.data.get("trust_remote_code", False) - tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) - processor = hf_processor(local_path, use_fast=True) # used for multimodal LLM, could be none - # define worker classes if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: assert config.critic.strategy in {"fsdp", "fsdp2"} @@ -133,6 +125,23 @@ def run(self, config): role_worker_mapping[Role.RefPolicy] = ray.remote(SPPOActorRolloutRefWorker) mapping[Role.RefPolicy] = global_pool_id + # validate config + validate_config( + config=config, + use_reference_policy=need_reference_policy(role_worker_mapping), + use_critic=False, + ) + + # download the checkpoint from hdfs + local_path = copy_to_local(config.actor_rollout_ref.model.path) + + # instantiate tokenizer + from verl.utils import hf_processor, hf_tokenizer + + trust_remote_code = config.data.get("trust_remote_code", False) + tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) + processor = hf_processor(local_path, use_fast=True) # used for multimodal LLM, could be none + reward_fn = load_reward_manager( config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {}) ) diff --git a/recipe/sppo/sppo_ray_trainer.py b/recipe/sppo/sppo_ray_trainer.py index 0725d293e2b..f11421f8511 100644 --- a/recipe/sppo/sppo_ray_trainer.py +++ b/recipe/sppo/sppo_ray_trainer.py @@ -38,12 +38,11 @@ AdvantageEstimator, RayPPOTrainer, ResourcePoolManager, - Role, - WorkerType, apply_kl_penalty, compute_response_mask, ) from verl.trainer.ppo.reward import compute_reward, compute_reward_async +from verl.trainer.ppo.utils import Role, WorkerType, need_reference_policy, need_reward_model from verl.utils.profiler.performance import simple_timer from verl.utils.tracking import ValidationGenerationsLogger @@ -111,8 +110,9 @@ def __init__( self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = 
resource_pool_manager - self.use_reference_policy = Role.RefPolicy in role_worker_mapping - self.use_rm = Role.RewardModel in role_worker_mapping + self.use_reference_policy = need_reference_policy(role_worker_mapping) + self.use_rm = need_reward_model(role_worker_mapping) + self.use_critic = False self.ray_worker_group_cls = ray_worker_group_cls self.validation_generations_logger = ValidationGenerationsLogger() self.device_name = device_name if device_name else self.config.trainer.device @@ -122,9 +122,6 @@ def __init__( if config.algorithm.use_kl_in_reward: self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl) - self.use_critic = False - - self._validate_config() self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) def fit(self): diff --git a/requirements_sglang.txt b/requirements_sglang.txt index aa3a30c2924..c366ace43ba 100644 --- a/requirements_sglang.txt +++ b/requirements_sglang.txt @@ -17,5 +17,5 @@ torchdata torchvision transformers wandb -sglang[all]==0.4.9.post6 +sglang[all]==0.4.10.post2 huggingface_hub diff --git a/setup.py b/setup.py index 7e97389f74c..5c10c1547cc 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ VLLM_REQUIRES = ["tensordict>=0.8.0,<=0.9.1,!=0.9.0", "vllm>=0.7.3,<=0.9.1"] SGLANG_REQUIRES = [ "tensordict>=0.8.0,<=0.9.1,!=0.9.0", - "sglang[srt,openai]==0.4.9.post6", + "sglang[srt,openai]==0.4.10.post2", "torch==2.7.1", ] TRL_REQUIRES = ["trl<=0.9.6"] diff --git a/tests/experimental/agent_loop/test_basic_agent_loop.py b/tests/experimental/agent_loop/test_basic_agent_loop.py index 553a9a72586..b41538296b3 100644 --- a/tests/experimental/agent_loop/test_basic_agent_loop.py +++ b/tests/experimental/agent_loop/test_basic_agent_loop.py @@ -26,6 +26,7 @@ from verl.protocol import DataProto from verl.tools.base_tool import BaseTool, OpenAIFunctionToolSchema from verl.tools.schemas import ToolResponse +from verl.trainer.ppo.reward import compute_reward, load_reward_manager from verl.utils import hf_tokenizer @@ -41,6 +42,10 @@ def init_config() -> DictConfig: # test sleep/wake_up with fsdp offload "actor_rollout_ref.actor.fsdp_config.param_offload=True", "actor_rollout_ref.actor.fsdp_config.optimizer_offload=True", + "reward_model.reward_manager=dapo", + "+reward_model.reward_kwargs.overlong_buffer_cfg.enable=False", + "+reward_model.reward_kwargs.overlong_buffer_cfg.len=3072", + "+reward_model.reward_kwargs.max_resp_len=4096", ], ) @@ -69,6 +74,10 @@ def test_single_turn(init_config): ) agent_loop_manager = init_agent_loop_manager(init_config) + tokenizer = hf_tokenizer(init_config.actor_rollout_ref.model.path) + reward_fn = load_reward_manager( + init_config, tokenizer, num_examine=0, **init_config.reward_model.get("reward_kwargs", {}) + ) raw_prompts = [ [ @@ -97,10 +106,17 @@ def test_single_turn(init_config): assert result.batch["input_ids"].size(1) == seq_len assert result.batch["attention_mask"].size(1) == seq_len assert result.batch["position_ids"].size(1) == seq_len - assert result.batch["rm_scores"].size(1) == result.batch["responses"].size(1) + if init_config.actor_rollout_ref.rollout.calculate_log_probs: assert result.batch["rollout_log_probs"].size(1) == result.batch["responses"].size(1) + # check compute score + assert result.batch["rm_scores"].shape == result.batch["responses"].shape + reward_tensor, reward_extra_info = compute_reward(result, reward_fn) + assert reward_tensor.shape == result.batch["responses"].shape + assert "acc" in reward_extra_info, f"reward_extra_info {reward_extra_info} should 
contain 'acc'" + assert reward_extra_info["acc"].shape == (len(result),), f"invalid acc: {reward_extra_info['acc']}" + # check turns num_turns = result.non_tensor_batch["__num_turns__"] assert np.all(num_turns == 2) diff --git a/tests/special_sanity/check_license.py b/tests/special_sanity/check_license.py index 1a2073e6b02..a4ade024433 100644 --- a/tests/special_sanity/check_license.py +++ b/tests/special_sanity/check_license.py @@ -13,6 +13,7 @@ # limitations under the License. from argparse import ArgumentParser from pathlib import Path +from typing import Iterable license_head_bytedance = "Copyright 2024 Bytedance Ltd. and/or its affiliates" license_head_bytedance_25 = "Copyright 2025 Bytedance Ltd. and/or its affiliates" @@ -35,13 +36,37 @@ ] +def get_py_files(path_arg: Path) -> Iterable[Path]: + """get py files under a dir. if already py file return it + + Args: + path_arg (Path): path to scan for py files + + Returns: + Iterable[Path]: list of py files + """ + if path_arg.is_dir(): + return path_arg.glob("**/*.py") + elif path_arg.is_file() and path_arg.suffix == ".py": + return [path_arg] + return [] + + if __name__ == "__main__": parser = ArgumentParser() - parser.add_argument("--directory", "-d", required=True, type=str) + parser.add_argument( + "--directories", + "-d", + required=True, + type=Path, + nargs="+", + help="List of directories to check for license headers", + ) args = parser.parse_args() - directory_in_str = args.directory - pathlist = Path(directory_in_str).glob("**/*.py") + # Collect all Python files from specified directories + pathlist = set(path for path_arg in args.directories for path in get_py_files(path_arg)) + for path in pathlist: # because path is object not string path_in_str = str(path.absolute()) diff --git a/tests/utils/dataset/test_rl_dataset_on_cpu.py b/tests/utils/dataset/test_rl_dataset_on_cpu.py index 2afc3ef49f6..391e89a94d5 100644 --- a/tests/utils/dataset/test_rl_dataset_on_cpu.py +++ b/tests/utils/dataset/test_rl_dataset_on_cpu.py @@ -77,7 +77,7 @@ def test_image_rl_data(): "prompt_key": "prompt", "max_prompt_length": 1024, "filter_overlong_prompts": True, - "filter_overlong_prompts_workers": 2, + "filter_overlong_prompts_workers": 1, } ) dataset = RLHFDataset( diff --git a/tests/utils/test_activation_offload.py b/tests/utils/test_activation_offload.py index 2393d7962ae..25bc23c40ac 100644 --- a/tests/utils/test_activation_offload.py +++ b/tests/utils/test_activation_offload.py @@ -29,6 +29,23 @@ from verl.utils.fsdp_utils import MixedPrecisionPolicy, apply_fsdp2, get_fsdp_wrap_policy +def create_random_input_ids(batch_size, seq_len, vocab_size): + from flash_attn.bert_padding import unpad_input + + from verl.utils.model import compute_position_id_with_mask, create_random_mask + + input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda") + + attention_mask = create_random_mask( + input_ids, max_ratio_of_left_padding=0.1, min_ratio_of_valid_token=0.5, max_ratio_of_valid_token=0.7 + ) + position_ids = compute_position_id_with_mask(attention_mask) + + input_ids = unpad_input(input_ids.unsqueeze(-1), attention_mask)[0].transpose(0, 1) + position_ids = unpad_input(position_ids.unsqueeze(-1), attention_mask)[0].transpose(0, 1) + return input_ids, position_ids + + def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy="fsdp"): torch.cuda.set_device(rank) torch.distributed.init_process_group( @@ -85,15 +102,13 @@ def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy seq_len = 32 
vocab_size = 32000 # First input for initial update - input_ids1 = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda") - attention_mask1 = torch.ones_like(input_ids1) + input_ids1, position_ids1 = create_random_input_ids(batch_size, seq_len, vocab_size) # Second input for verification - input_ids2 = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda") - attention_mask2 = torch.ones_like(input_ids2) + input_ids2, position_ids2 = create_random_input_ids(batch_size, seq_len, vocab_size) # Step 1: Initial update and save checkpoint - outputs1 = model(input_ids=input_ids1, attention_mask=attention_mask1) + outputs1 = model(input_ids=input_ids1, position_ids=position_ids1) loss1 = outputs1.logits.mean() loss1.backward() optimizer.step() @@ -106,7 +121,7 @@ def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy checkpoint_manager.save_checkpoint(local_path=checkpoint_path, hdfs_path=None, global_step=0) # Step 2: Second update and forward pass - outputs2 = model(input_ids=input_ids2, attention_mask=attention_mask2) + outputs2 = model(input_ids=input_ids2, position_ids=position_ids2) loss2 = outputs2.logits.mean() loss2.backward() optimizer.step() @@ -115,14 +130,14 @@ def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy # Record logits after second update with torch.no_grad(): - logits_without_offloading = model(input_ids=input_ids2, attention_mask=attention_mask2).logits + logits_without_offloading = model(input_ids=input_ids2, position_ids=position_ids2).logits # Step 3: wrap module with activation offloading and load checkpoint - enable_activation_offloading(model, "fsdp") + enable_activation_offloading(model, strategy=strategy) checkpoint_manager.load_checkpoint(checkpoint_path) # Step 4: Repeat the second update with same input - outputs3 = model(input_ids=input_ids2, attention_mask=attention_mask2) + outputs3 = model(input_ids=input_ids2, position_ids=position_ids2) loss3 = outputs3.logits.mean() loss3.backward() optimizer.step() @@ -131,7 +146,7 @@ def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy # Record logits after loaded checkpoint and update with torch.no_grad(): - logits_with_offloading = model(input_ids=input_ids2, attention_mask=attention_mask2).logits + logits_with_offloading = model(input_ids=input_ids2, position_ids=position_ids2).logits # Step 4: Verify outputs match torch.testing.assert_close(logits_without_offloading, logits_with_offloading, atol=0.0, rtol=0.0) diff --git a/tests/utils/test_nvtx_profile.py b/tests/utils/test_nvtx_profile.py index fea7675335a..645da153d0a 100644 --- a/tests/utils/test_nvtx_profile.py +++ b/tests/utils/test_nvtx_profile.py @@ -120,8 +120,9 @@ def test_annotate_decorator(self): mock_self = MagicMock() mock_self.profiler = self.profiler mock_self.profiler.this_step = True + decorator = mock_self.profiler.annotate(message="test") - @NsightSystemsProfiler.annotate(message="test") + @decorator def test_func(self, *args, **kwargs): return "result" diff --git a/tests/utils/test_special_mstx_profile.py b/tests/utils/test_special_mstx_profile.py index c723c240865..a80cabfa49c 100644 --- a/tests/utils/test_special_mstx_profile.py +++ b/tests/utils/test_special_mstx_profile.py @@ -149,8 +149,9 @@ def test_annotate_decorator_applied_correctly(self): mock_start_patch.return_value = mock_mark_range with patch("verl.utils.profiler.mstx_profile.get_npu_profiler") as mock_get_profiler: + decorator = mock_worker.profiler.annotate(message="test") - 
@NPUProfiler.annotate(message="test") + @decorator def test_func(self, *args, **kwargs): return "result" @@ -171,8 +172,9 @@ def test_annotate_when_profiler_disabled(self): patch("verl.utils.profiler.mstx_profile.mark_end_range") as mock_end_patch, patch("verl.utils.profiler.mstx_profile.get_npu_profiler") as mock_get_profiler, ): + decorator = mock_worker.profiler.annotate(message="test") - @NPUProfiler.annotate(message="test") + @decorator def test_func(self, *args, **kwargs): return "result" @@ -193,8 +195,9 @@ def test_annotate_when_this_step_disabled(self): patch("verl.utils.profiler.mstx_profile.mark_end_range") as mock_end_patch, patch("verl.utils.profiler.mstx_profile.get_npu_profiler") as mock_get_profiler, ): + decorator = mock_worker.profiler.annotate(message="test") - @NPUProfiler.annotate(message="test") + @decorator def test_func(self, *args, **kwargs): return "result" @@ -221,8 +224,9 @@ def test_annotate_discrete_mode_enabled(self): ): mock_start_patch.return_value = mock_mark_range mock_get_profiler.return_value = mock_profile_npu + decorator = mock_worker.profiler.annotate(message="test", role="test_role") - @NPUProfiler.annotate(message="test", role="test_role") + @decorator def test_func(self, *args, **kwargs): return "result" @@ -253,8 +257,9 @@ def test_annotate_with_default_message(self): patch("verl.utils.profiler.mstx_profile.mark_end_range") as mock_end_patch, ): mock_start_patch.return_value = mock_mark_range + decorator = mock_worker.profiler.annotate() - @NPUProfiler.annotate() + @decorator def test_func(self, *args, **kwargs): return "result" diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index f174ef6da4c..13526046a0d 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -134,6 +134,8 @@ class AgentLoopOutput(BaseModel): """Number of chat turns, including user, assistant, tool.""" metrics: AgentLoopMetrics """Auxiliary performance metrics""" + extra_fields: dict[str, Any] = {} + """Extra fields for dynamic addition.""" class _InternalAgentLoopOutput(AgentLoopOutput): @@ -252,7 +254,7 @@ def __init__(self, config: DictConfig, local_path: str) -> None: ) self.loop = asyncio.get_event_loop() - async def compute_score(self, output: AgentLoopOutput, kwargs: dict) -> float: + async def compute_score(self, output: AgentLoopOutput, kwargs: dict) -> dict: """Compute reward score for agent loop output. NOTE: Since `reward_manager.__call__` is blocking function, we run it in thread pool to @@ -263,7 +265,7 @@ async def compute_score(self, output: AgentLoopOutput, kwargs: dict) -> float: kwargs (dict): Dataset fields from `verl.utils.dataset.RLHFDataset`. Returns: - float: Reward score. + dict: Reward score and reward extra info. 
""" prompts = torch.tensor(output.prompt_ids, dtype=torch.long).unsqueeze(0) responses = torch.tensor(output.response_ids, dtype=torch.long).unsqueeze(0) @@ -284,12 +286,16 @@ async def compute_score(self, output: AgentLoopOutput, kwargs: dict) -> float: batch=batch, non_tensor_batch=non_tensor_batch, ) - reward_tensor = await self.loop.run_in_executor( + result = await self.loop.run_in_executor( None, self.reward_manager, data, + True, # return_dict ) - return reward_tensor.sum(dim=-1).item() + + reward_score = result["reward_tensor"].sum(dim=-1).item() + reward_extra_info = {k: v[0] for k, v in result.get("reward_extra_info", {}).items()} + return {"reward_score": reward_score, "reward_extra_info": reward_extra_info} @ray.remote @@ -424,7 +430,9 @@ async def _run_agent_loop( # Some AgentLoop may have already computed the reward score, e.g SWE-agent. if output.reward_score is None and not self.config.reward_model.enable: - output.reward_score = await self.reward_manager_worker.compute_score.remote(output, kwargs) + result = await self.reward_manager_worker.compute_score.remote(output, kwargs) + output.reward_score = result["reward_score"] + output.extra_fields["reward_extra_info"] = result["reward_extra_info"] # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py # prompt_ids: left padded with zeros (e.g., [0,0,0,0,1,2,3,4]) @@ -534,6 +542,7 @@ async def _run_agent_loop( reward_score=output.reward_score, num_turns=output.num_turns, metrics=output.metrics, + extra_fields=output.extra_fields, ) def _postprocess(self, inputs: list[_InternalAgentLoopOutput]) -> DataProto: @@ -575,13 +584,23 @@ def _postprocess(self, inputs: list[_InternalAgentLoopOutput]) -> DataProto: "__num_turns__": np.array([input.num_turns for input in inputs], dtype=np.int32), } + # add reward_extra_info to non_tensor_batch + reward_extra_infos = [input.extra_fields.get("reward_extra_info", {}) for input in inputs] + reward_extra_keys = list(reward_extra_infos[0].keys()) + for key in reward_extra_keys: + non_tensor_batch[key] = np.array([info[key] for info in reward_extra_infos]) + # Add multi_modal_inputs to non_tensor_batch if any samples have them multi_modal_inputs_list = [input.multi_modal_inputs for input in inputs] if any(mmi is not None for mmi in multi_modal_inputs_list): non_tensor_batch["multi_modal_inputs"] = np.array(multi_modal_inputs_list, dtype=object) metrics = [input.metrics.model_dump() for input in inputs] - return DataProto(batch=batch, non_tensor_batch=non_tensor_batch, meta_info={"metrics": metrics}) + return DataProto( + batch=batch, + non_tensor_batch=non_tensor_batch, + meta_info={"metrics": metrics, "reward_extra_keys": reward_extra_keys}, + ) async def get_trajectory_info(step, index, validate): @@ -717,10 +736,10 @@ def generate_sequences(self, prompts: DataProto) -> DataProto: self.sleep() # calculate performance metrics - metrics = [output.meta_info["metrics"] for output in outputs] # List[List[Dict[str, str]]] + metrics = [output.meta_info.pop("metrics") for output in outputs] # List[List[Dict[str, str]]] timing = self._performance_metrics(metrics, output) - output.meta_info = {"timing": timing} + output.meta_info = {"timing": timing, **outputs[0].meta_info} return output def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: DataProto) -> dict[str, float]: diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index 9daf550cdb8..9d3809f93c2 100644 --- a/verl/models/mcore/config_converter.py +++ 
b/verl/models/mcore/config_converter.py @@ -156,7 +156,8 @@ def check_and_construct_configs(original_config: dict, cls: type[T]) -> T: for key in removed_keys: original_config.pop(key) - print(f"Overridden {cls.__name__} init config: {original_config}") + if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: + print(f"Overridden {cls.__name__} init config: {original_config}") return cls(**original_config) diff --git a/verl/models/mcore/qwen2_5_vl/attention.py b/verl/models/mcore/qwen2_5_vl/attention.py index 91a27cc3edf..84e7ba8eda2 100644 --- a/verl/models/mcore/qwen2_5_vl/attention.py +++ b/verl/models/mcore/qwen2_5_vl/attention.py @@ -118,7 +118,8 @@ def forward( output, bias = self.linear_proj(context_layer) return output, bias - query, key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + # Use latest mcore 0.13 API and forward-compatible with previous versions. + outputs = self._adjust_key_value_for_inference( inference_context, query, key, @@ -129,6 +130,8 @@ def forward( sequence_len_offset, ) + query, key, value, rotary_pos_emb, attn_mask_type = outputs[:5] + if packed_seq_params is not None: query = query.squeeze(1) key = key.squeeze(1) diff --git a/verl/models/transformers/monkey_patch.py b/verl/models/transformers/monkey_patch.py index 7f0e10ab65e..d8d67d5ebaa 100644 --- a/verl/models/transformers/monkey_patch.py +++ b/verl/models/transformers/monkey_patch.py @@ -110,6 +110,65 @@ def _ulysses_flash_attention_forward( return attn_output +def _ulysses_flash_attention_forward_transformers_4_55( + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: Optional[torch.Tensor], + query_length: int, + *args, + position_ids: Optional[torch.Tensor] = None, + **kwargs, +): + """For transformers>=4.55, the flash attention api has changed, + we need to pass the query_length after doing ulysses alltoall. + + See https://github.com/huggingface/transformers/issues/40399 + """ + ulysses_sp_size = get_ulysses_sequence_parallel_world_size() + + ########## AlltoAll for Ulysses ########## + if ulysses_sp_size > 1: + assert position_ids is not None, "position_ids is required for Ulysses sequence parallelism" + + # NOTE: repeat kv heads to be divided by sequence parallel. Instead of repeating nheads_q//nheads_k, + # we choose to repeat sp_size//nheads_k, since flash_attention supports MQA/GQA. + # For example: + # - nheads_k=4, sp=8, repeats=2 + # - nheads_k=8, sp=8, repeats=1 + # - nheads_k=16, sp=8, repeats=1 + repeats = max(ulysses_sp_size // key_states.size(2), 1) + key_states = repeat_kv(key_states, repeats) + value_states = repeat_kv(value_states, repeats) + + # (bsz, seq_len/n, n_head, head_dim) -> (bsz, seq_len, n_head/n, head_dim) + query_states = gather_seq_scatter_heads(query_states, seq_dim=1, head_dim=2) + key_states = gather_seq_scatter_heads(key_states, seq_dim=1, head_dim=2) + value_states = gather_seq_scatter_heads(value_states, seq_dim=1, head_dim=2) + + # TODO: all_gather position_ids because `prepare_fa2_from_position_ids` needs it, we can eliminate + # this all_gather by passing cu_seq_lens_q, cu_seq_lens_k, max_length_k, max_length_q explicitly. 
+ # https://github.com/huggingface/transformers/pull/33932 + + # (bsz, seq_len/n) -> (bsz, seq_len) + position_ids_list = [torch.empty_like(position_ids) for _ in range(ulysses_sp_size)] + torch.distributed.all_gather(position_ids_list, position_ids, group=get_ulysses_sequence_parallel_group()) + position_ids = torch.concat(position_ids_list, dim=-1) + + # (bsz, seq_len, n_head/n, head_dim) + query_length = query_states.size(1) + attn_output = _flash_attention_forward( + query_states, key_states, value_states, attention_mask, query_length, *args, position_ids=position_ids, **kwargs + ) + + ########## AlltoAll for Ulysses ########## + if ulysses_sp_size > 1: + # (bsz, seq_len, n_head/n, head_dim) -> (bsz, seq_len/n, n_head, head_dim) + attn_output = gather_heads_scatter_seq(attn_output, seq_dim=1, head_dim=2) + + return attn_output + + def patch_vlm_for_ulysses_input_slicing(model_class: type): """ Applies a monkey patch to the forward method of a given model class @@ -304,11 +363,17 @@ def state_dict(self, *args, **kwargs): module._flash_attention_forward = _ulysses_flash_attention_forward print(f"Monkey patch _flash_attention_forward in {model.__module__}") else: - # transformers>=4.48.0 - from transformers.integrations import flash_attention + if is_transformers_version_in_range(min_version="4.55.0"): + from transformers.integrations import flash_attention + + flash_attention._flash_attention_forward = _ulysses_flash_attention_forward_transformers_4_55 + print(f"Monkey patch _flash_attention_forward in {model.__module__} for new api") + else: + # 4.48.0 <= transformers <= 4.54.1, Vision attention + from transformers.integrations import flash_attention - flash_attention._flash_attention_forward = _ulysses_flash_attention_forward - print(f"Monkey patch _flash_attention_forward in {flash_attention.__name__}") + flash_attention._flash_attention_forward = _ulysses_flash_attention_forward + print(f"Monkey patch _flash_attention_forward in {flash_attention.__name__}") patch_forward_with_backends(model, use_fused_kernels=use_fused_kernels, fused_kernels_backend=fused_kernels_backend) diff --git a/verl/models/transformers/qwen2_vl.py b/verl/models/transformers/qwen2_vl.py index 7c8214c0a8e..0cc7820d114 100644 --- a/verl/models/transformers/qwen2_vl.py +++ b/verl/models/transformers/qwen2_vl.py @@ -217,18 +217,36 @@ def flash_attention_forward( ) # remove channel dimension cu_seqlens_q, cu_seqlens_k = cu_seq_lens max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - attn_output = flash_attn_varlen_func( + + flash_attn_func = flash_attn_varlen_func + common_attn_kwargs = { + "cu_seqlens_q": cu_seqlens_q, + "cu_seqlens_k": cu_seqlens_k, + "max_seqlen_q": max_seqlen_in_batch_q, + "max_seqlen_k": max_seqlen_in_batch_k, + "dropout_p": kwargs.pop("dropout", 0.0), + "softmax_scale": kwargs.pop("softmax_scale", None), + **flash_kwargs, + } + + if flash_attn_func is None: + # Use transformers >= 4.54 + flash_attn_func = _flash_attention_forward + specific_attn_kwargs = { + "attention_mask": attention_mask, + "position_ids": position_ids, + "query_length": query_length, + "is_causal": causal, + } + else: + specific_attn_kwargs = {"causal": causal} + + attn_output = flash_attn_func( query_states, key_states, value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=kwargs.pop("dropout", 0.0), - softmax_scale=kwargs.pop("softmax_scale", None), - causal=causal, - **flash_kwargs, + **common_attn_kwargs, 
+ **specific_attn_kwargs, ) attn_output = attn_output.view(batch_size, -1, attn_output.size(-2), attn_output.size(-1)) else: diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py index f7d8825b57d..7ab01b456f7 100644 --- a/verl/trainer/main_ppo.py +++ b/verl/trainer/main_ppo.py @@ -26,6 +26,8 @@ from verl.trainer.constants_ppo import get_ppo_ray_runtime_env from verl.trainer.ppo.ray_trainer import RayPPOTrainer from verl.trainer.ppo.reward import load_reward_manager +from verl.trainer.ppo.utils import need_critic, need_reference_policy +from verl.utils.config import validate_config from verl.utils.device import is_cuda_available from verl.utils.import_utils import load_extern_type @@ -219,20 +221,6 @@ def run(self, config): pprint(OmegaConf.to_container(config, resolve=True)) OmegaConf.resolve(config) - # Download the checkpoint from HDFS to the local machine. - # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on - local_path = copy_to_local( - config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False) - ) - - # Instantiate the tokenizer and processor. - from verl.utils import hf_processor, hf_tokenizer - - trust_remote_code = config.data.get("trust_remote_code", False) - tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) - # Used for multimodal LLM, could be None - processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True) - actor_rollout_cls, ray_worker_group_cls = self.add_actor_rollout_worker(config) self.add_critic_worker(config) @@ -247,6 +235,27 @@ def run(self, config): # Add a reference policy worker if KL loss or KL reward is used. self.add_ref_policy_worker(config, actor_rollout_cls) + # validate config + validate_config( + config=config, + use_reference_policy=need_reference_policy(self.role_worker_mapping), + use_critic=need_critic(config), + ) + + # Download the checkpoint from HDFS to the local machine. + # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on + local_path = copy_to_local( + config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False) + ) + + # Instantiate the tokenizer and processor. + from verl.utils import hf_processor, hf_tokenizer + + trust_remote_code = config.data.get("trust_remote_code", False) + tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) + # Used for multimodal LLM, could be None + processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True) + # Load the reward manager for training and validation. 
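Aside: with this change every entry point touched by the patch (verl/trainer/main_ppo.py and the one_step_off_policy, prime, spin, and sppo recipes) follows the same order: build the role-worker mapping, validate the config, and only then pay for checkpoint download and tokenizer construction. A hypothetical condensation of that shared flow; prepare_run is not a verl API, while the other names are used exactly as in the hunks above:

from verl.trainer.ppo.utils import need_critic, need_reference_policy
from verl.utils import hf_processor, hf_tokenizer
from verl.utils.config import validate_config
from verl.utils.fs import copy_to_local


def prepare_run(config, role_worker_mapping):
    """Sketch of the shared entry-point flow; not part of verl itself."""
    # Fail fast on inconsistent configs before any expensive I/O.
    validate_config(
        config=config,
        use_reference_policy=need_reference_policy(role_worker_mapping),
        use_critic=need_critic(config),
    )
    # Only after validation: download the checkpoint and build tokenizer/processor.
    local_path = copy_to_local(
        config.actor_rollout_ref.model.path,
        use_shm=config.actor_rollout_ref.model.get("use_shm", False),
    )
    trust_remote_code = config.data.get("trust_remote_code", False)
    tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
    processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True)
    return local_path, tokenizer, processor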
reward_fn = load_reward_manager( config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {}) diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 5e6e48115e5..d2508a1259c 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -21,11 +21,9 @@ import json import os import uuid -import warnings from collections import defaultdict from copy import deepcopy from dataclasses import dataclass, field -from enum import Enum from pprint import pprint from typing import Optional @@ -40,7 +38,6 @@ from verl import DataProto from verl.experimental.dataset.sampler import AbstractCurriculumSampler from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto -from verl.single_controller.base import Worker from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup from verl.single_controller.ray.base import create_colocated_worker_cls from verl.trainer.config import AlgoConfig @@ -53,6 +50,7 @@ process_validation_metrics, ) from verl.trainer.ppo.reward import compute_reward, compute_reward_async +from verl.trainer.ppo.utils import Role, WorkerType, need_critic, need_reference_policy, need_reward_model from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path, should_save_ckpt_esi from verl.utils.config import omega_conf_to_dataclass from verl.utils.debug import marked_timer @@ -62,22 +60,6 @@ from verl.utils.torch_functional import masked_mean from verl.utils.tracking import ValidationGenerationsLogger -WorkerType = type[Worker] - - -class Role(Enum): - """ - To create more roles dynamically, you can subclass Role and add new members - """ - - Actor = 0 - Rollout = 1 - ActorRollout = 2 - Critic = 3 - RefPolicy = 4 - RewardModel = 5 - ActorRolloutRef = 6 - @dataclass class ResourcePoolManager: @@ -352,8 +334,9 @@ def __init__( self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager - self.use_reference_policy = Role.RefPolicy in role_worker_mapping - self.use_rm = Role.RewardModel in role_worker_mapping + self.use_reference_policy = need_reference_policy(self.role_worker_mapping) + self.use_rm = need_reward_model(self.role_worker_mapping) + self.use_critic = need_critic(self.config) self.ray_worker_group_cls = ray_worker_group_cls self.device_name = device_name if device_name else self.config.trainer.device self.validation_generations_logger = ValidationGenerationsLogger( @@ -369,138 +352,8 @@ def __init__( if self.config.algorithm.use_kl_in_reward: self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl) - if config.critic.enable is not None: - self.use_critic = bool(config.critic.enable) - elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE: - self.use_critic = True - else: - warnings.warn( - "Disabled critic as algorithm.adv_estimator != gae. 
" - "If it is not intended, please set critic.enable=True", - stacklevel=2, - ) - self.use_critic = False - - self._validate_config() self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) - def _validate_config(self): - config = self.config - # number of GPUs total - n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes - - if not config.actor_rollout_ref.actor.use_dynamic_bsz: - if config.actor_rollout_ref.actor.strategy == "megatron": - model_parallel_size = ( - config.actor_rollout_ref.actor.megatron.tensor_model_parallel_size - * config.actor_rollout_ref.actor.megatron.pipeline_model_parallel_size - ) - assert ( - n_gpus % (model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size) == 0 - ), ( - f"n_gpus ({n_gpus}) must be divisible by model_parallel_size ({model_parallel_size}) times " - f"context_parallel_size ({config.actor_rollout_ref.actor.megatron.context_parallel_size})" - ) - megatron_dp = n_gpus // ( - model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size - ) - minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu - else: - minimal_bsz = n_gpus - - # 1. Check total batch size for data correctness - real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n - assert real_train_batch_size % minimal_bsz == 0, ( - f"real_train_batch_size ({real_train_batch_size}) must be divisible by minimal possible batch size " - f"({minimal_bsz})" - ) - - # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu" - # We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu". - def check_mutually_exclusive(mbs, mbs_per_gpu, name: str): - """Validate mutually exclusive micro batch size configuration options. - - Ensures that users don't set both deprecated micro_batch_size and - the new micro_batch_size_per_gpu parameters simultaneously. - - Args: - mbs: Deprecated micro batch size parameter value. - mbs_per_gpu: New micro batch size per GPU parameter value. - name (str): Configuration section name for error messages. - - Raises: - ValueError: If both parameters are set or neither is set. - """ - settings = { - "reward_model": "micro_batch_size", - "actor_rollout_ref.ref": "log_prob_micro_batch_size", - "actor_rollout_ref.rollout": "log_prob_micro_batch_size", - } - - if name in settings: - param = settings[name] - param_per_gpu = f"{param}_per_gpu" - - if mbs is None and mbs_per_gpu is None: - raise ValueError( - f"[{name}] Please set at least one of '{name}.{param}' or '{name}.{param_per_gpu}'." - ) - - if mbs is not None and mbs_per_gpu is not None: - raise ValueError( - f"[{name}] You have set both '{name}.{param}' AND '{name}.{param_per_gpu}'. Please remove " - f"'{name}.{param}' because only '*_{param_per_gpu}' is supported (the former is deprecated)." - ) - - # Actor validation done in ActorConfig.__post_init__ and validate() - actor_config = omega_conf_to_dataclass(config.actor_rollout_ref.actor) - actor_config.validate(n_gpus, config.data.train_batch_size, config.actor_rollout_ref.model) - - if not config.actor_rollout_ref.actor.use_dynamic_bsz: - if self.use_reference_policy: - # reference: log_prob_micro_batch_size vs. 
log_prob_micro_batch_size_per_gpu - check_mutually_exclusive( - config.actor_rollout_ref.ref.log_prob_micro_batch_size, - config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu, - "actor_rollout_ref.ref", - ) - - # The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu - check_mutually_exclusive( - config.actor_rollout_ref.rollout.log_prob_micro_batch_size, - config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu, - "actor_rollout_ref.rollout", - ) - - # Check for reward model micro-batch size conflicts - if config.reward_model.enable and not config.reward_model.use_dynamic_bsz: - check_mutually_exclusive( - config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model" - ) - - if self.config.algorithm.use_kl_in_reward and config.actor_rollout_ref.actor.use_kl_loss: - print("NOTICE: You have both enabled in-reward kl and kl loss.") - - # critic - if self.use_critic: - critic_config = omega_conf_to_dataclass(config.critic) - critic_config.validate(n_gpus, config.data.train_batch_size) - - if config.data.get("val_batch_size", None) is not None: - print( - "WARNING: val_batch_size is deprecated." - + " Validation datasets are sent to inference engines as a whole batch," - + " which will schedule the memory themselves." - ) - - # check eval config - if config.actor_rollout_ref.rollout.val_kwargs.do_sample: - assert config.actor_rollout_ref.rollout.temperature > 0, ( - "validation gen temperature should be greater than 0 when enabling do_sample" - ) - - print("[validate_config] All configuration checks passed successfully!") - def _create_dataloader(self, train_dataset, val_dataset, collate_fn, train_sampler: Optional[Sampler]): """ Creates the train and validation dataloaders. diff --git a/verl/trainer/ppo/utils.py b/verl/trainer/ppo/utils.py new file mode 100644 index 00000000000..22d00a45052 --- /dev/null +++ b/verl/trainer/ppo/utils.py @@ -0,0 +1,65 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
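+#
+# This module hosts the Role enum and the need_reference_policy / need_reward_model /
+# need_critic predicates moved out of ray_trainer.py, so that main_ppo.py can validate
+# a config against the chosen roles before the trainer is constructed.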
+ +import warnings +from enum import Enum + +from omegaconf import DictConfig + +from verl.single_controller.base import Worker +from verl.trainer.ppo.core_algos import AdvantageEstimator + +WorkerType = type[Worker] + + +class Role(Enum): + """ + To create more roles dynamically, you can subclass Role and add new members + """ + + Actor = 0 + Rollout = 1 + ActorRollout = 2 + Critic = 3 + RefPolicy = 4 + RewardModel = 5 + ActorRolloutRef = 6 + + +def need_reference_policy( + role_worker_mapping: dict[Role, WorkerType], +) -> bool: + """Given a role worker mapping, do we need ref policy.""" + return Role.RefPolicy in role_worker_mapping + + +def need_reward_model( + role_worker_mapping: dict[Role, WorkerType], +) -> bool: + """Given a role worker mapping, do we need reward model.""" + return Role.RewardModel in role_worker_mapping + + +def need_critic(config: DictConfig) -> bool: + """Given a config, do we need critic.""" + if config.critic.enable is not None: + return bool(config.critic.enable) + elif config.algorithm.adv_estimator == AdvantageEstimator.GAE: + return True + else: + warnings.warn( + "Disabled critic as algorithm.adv_estimator != gae. If it is not intended, please set critic.enable=True", + stacklevel=2, + ) + return False diff --git a/verl/utils/config.py b/verl/utils/config.py index fabed0b2526..fa3630c654c 100644 --- a/verl/utils/config.py +++ b/verl/utils/config.py @@ -17,7 +17,7 @@ from omegaconf import DictConfig, ListConfig, OmegaConf -__all__ = ["omega_conf_to_dataclass"] +__all__ = ["omega_conf_to_dataclass", "validate_config"] def omega_conf_to_dataclass(config: DictConfig | dict, dataclass_type: Optional[type[Any]] = None) -> Any: @@ -69,3 +69,129 @@ def update_dict_with_config(dictionary: dict, config: DictConfig): for key in dictionary: if hasattr(config, key): dictionary[key] = getattr(config, key) + + +def validate_config( + config: DictConfig, + use_reference_policy: bool, + use_critic: bool, +) -> None: + """Validate an OmegaConf DictConfig. + + Args: + config (DictConfig): The OmegaConf DictConfig to validate. + use_reference_policy (bool): is ref policy needed + use_critic (bool): is critic needed + """ + # number of GPUs total + n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes + + if not config.actor_rollout_ref.actor.use_dynamic_bsz: + if config.actor_rollout_ref.actor.strategy == "megatron": + model_parallel_size = ( + config.actor_rollout_ref.actor.megatron.tensor_model_parallel_size + * config.actor_rollout_ref.actor.megatron.pipeline_model_parallel_size + ) + assert ( + n_gpus % (model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size) == 0 + ), ( + f"n_gpus ({n_gpus}) must be divisible by model_parallel_size ({model_parallel_size}) times " + f"context_parallel_size ({config.actor_rollout_ref.actor.megatron.context_parallel_size})" + ) + megatron_dp = n_gpus // ( + model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size + ) + minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu + else: + minimal_bsz = n_gpus + + # 1. 
Check total batch size for data correctness + real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n + assert real_train_batch_size % minimal_bsz == 0, ( + f"real_train_batch_size ({real_train_batch_size}) must be divisible by minimal possible batch size " + f"({minimal_bsz})" + ) + + # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu" + # We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu". + def check_mutually_exclusive(mbs, mbs_per_gpu, name: str): + """Validate mutually exclusive micro batch size configuration options. + + Ensures that users don't set both deprecated micro_batch_size and + the new micro_batch_size_per_gpu parameters simultaneously. + + Args: + mbs: Deprecated micro batch size parameter value. + mbs_per_gpu: New micro batch size per GPU parameter value. + name (str): Configuration section name for error messages. + + Raises: + ValueError: If both parameters are set or neither is set. + """ + settings = { + "reward_model": "micro_batch_size", + "actor_rollout_ref.ref": "log_prob_micro_batch_size", + "actor_rollout_ref.rollout": "log_prob_micro_batch_size", + } + + if name in settings: + param = settings[name] + param_per_gpu = f"{param}_per_gpu" + + if mbs is None and mbs_per_gpu is None: + raise ValueError(f"[{name}] Please set at least one of '{name}.{param}' or '{name}.{param_per_gpu}'.") + + if mbs is not None and mbs_per_gpu is not None: + raise ValueError( + f"[{name}] You have set both '{name}.{param}' AND '{name}.{param_per_gpu}'. Please remove " + f"'{name}.{param}' because only '*_{param_per_gpu}' is supported (the former is deprecated)." + ) + + # Actor validation done in ActorConfig.__post_init__ and validate() + actor_config = omega_conf_to_dataclass(config.actor_rollout_ref.actor) + actor_config.validate(n_gpus, config.data.train_batch_size, config.actor_rollout_ref.model) + + if not config.actor_rollout_ref.actor.use_dynamic_bsz: + if use_reference_policy: + # reference: log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu + check_mutually_exclusive( + config.actor_rollout_ref.ref.log_prob_micro_batch_size, + config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu, + "actor_rollout_ref.ref", + ) + + # The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu + check_mutually_exclusive( + config.actor_rollout_ref.rollout.log_prob_micro_batch_size, + config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu, + "actor_rollout_ref.rollout", + ) + + # Check for reward model micro-batch size conflicts + if config.reward_model.enable and not config.reward_model.use_dynamic_bsz: + check_mutually_exclusive( + config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model" + ) + + if config.algorithm.use_kl_in_reward and config.actor_rollout_ref.actor.use_kl_loss: + print("NOTICE: You have both enabled in-reward kl and kl loss.") + + # critic + if use_critic: + critic_config = omega_conf_to_dataclass(config.critic) + critic_config.validate(n_gpus, config.data.train_batch_size) + + if config.data.get("val_batch_size", None) is not None: + print( + "WARNING: val_batch_size is deprecated." + + " Validation datasets are sent to inference engines as a whole batch," + + " which will schedule the memory themselves." 
+ ) + + # check eval config + if config.actor_rollout_ref.rollout.val_kwargs.do_sample: + assert config.actor_rollout_ref.rollout.temperature > 0, ( + "validation gen temperature should be greater than 0 when enabling do_sample" + ) + + print("[validate_config] All configuration checks passed successfully!") diff --git a/verl/utils/dataset/multiturn_sft_dataset.py b/verl/utils/dataset/multiturn_sft_dataset.py index cd58c984359..0d38fdd850d 100644 --- a/verl/utils/dataset/multiturn_sft_dataset.py +++ b/verl/utils/dataset/multiturn_sft_dataset.py @@ -22,6 +22,7 @@ import numpy as np import pandas as pd import torch +from omegaconf import ListConfig from torch.utils.data import Dataset from transformers import PreTrainedTokenizer @@ -60,7 +61,7 @@ def __init__(self, parquet_files: str | list[str], tokenizer, config=None): self.apply_chat_template_kwargs = config.get("apply_chat_template_kwargs", {}) assert self.truncation in ["error", "left", "right"] - if not isinstance(parquet_files, list): + if not isinstance(parquet_files, list | ListConfig): parquet_files = [parquet_files] self.parquet_files = parquet_files diff --git a/verl/utils/profiler/mstx_profile.py b/verl/utils/profiler/mstx_profile.py index 33caedce412..b9576714248 100644 --- a/verl/utils/profiler/mstx_profile.py +++ b/verl/utils/profiler/mstx_profile.py @@ -214,8 +214,7 @@ def stop(self): self.profile_npu.stop() NPUProfiler._define_count -= 1 - @staticmethod - def annotate(message: Optional[str] = None, role: Optional[str] = None, **kwargs) -> Callable: + def annotate(self, message: Optional[str] = None, role: Optional[str] = None, **kwargs_outer) -> Callable: """Decorate a Worker member function to profile the current rank in the current training step. Requires the target function to be a member function of a Worker, @@ -230,32 +229,32 @@ def annotate(message: Optional[str] = None, role: Optional[str] = None, **kwargs def decorator(func): @functools.wraps(func) - def wrapper(self, *args, **kwargs): - if not self.profiler.enable: - return func(self, *args, **kwargs) + def wrapper(*args, **kwargs_inner): + if not self.enable: + return func(*args, **kwargs_inner) profile_name = message or func.__name__ - discrete_mode = self.profiler.discrete - profile_enable = self.profiler.this_step and self.profiler.enable + discrete_mode = self.discrete + profile_enable = self.this_step and self.enable if not profile_enable: - return func(self, *args, **kwargs) + return func(*args, **kwargs_inner) if profile_enable: if not discrete_mode: mark_range = mark_start_range(message=profile_name) else: profile_npu = get_npu_profiler( - contents=self.profiler.profile_contents, - profile_level=self.profiler.profile_level, - profile_save_path=self.profiler.profile_save_path, - analysis=self.profiler.analysis, + contents=self.profile_contents, + profile_level=self.profile_level, + profile_save_path=self.profile_save_path, + analysis=self.analysis, role=role, ) profile_npu.start() mark_range = mark_start_range(message=profile_name) - result = func(self, *args, **kwargs) + result = func(*args, **kwargs_inner) if profile_enable: if not discrete_mode: diff --git a/verl/utils/profiler/nvtx_profile.py b/verl/utils/profiler/nvtx_profile.py index b92e588032f..35857498c03 100644 --- a/verl/utils/profiler/nvtx_profile.py +++ b/verl/utils/profiler/nvtx_profile.py @@ -149,13 +149,13 @@ def stop(self): if not self.discrete: torch.cuda.profiler.stop() - @staticmethod def annotate( + self, message: Optional[str] = None, color: Optional[str] = None, domain: Optional[str] 
= None, category: Optional[str] = None, - **kwargs, + **kwargs_outer, ) -> Callable: """Decorate a Worker member function to profile the current rank in the current training step. @@ -175,22 +175,22 @@ def annotate( def decorator(func): @functools.wraps(func) - def wrapper(self, *args, **kwargs): - if not self.profiler.enable: - return func(self, *args, **kwargs) + def wrapper(*args, **kwargs_inner): + if not self.enable: + return func(*args, **kwargs_inner) profile_name = message or func.__name__ - if self.profiler.this_step: - if self.profiler.discrete: + if self.this_step: + if self.discrete: torch.cuda.profiler.start() mark_range = mark_start_range(message=profile_name, color=color, domain=domain, category=category) - result = func(self, *args, **kwargs) + result = func(*args, **kwargs_inner) - if self.profiler.this_step: + if self.this_step: mark_end_range(mark_range) - if self.profiler.discrete: + if self.discrete: torch.cuda.profiler.stop() return result diff --git a/verl/utils/profiler/profile.py b/verl/utils/profiler/profile.py index 92baf5ec978..7c9d2fe15ce 100644 --- a/verl/utils/profiler/profile.py +++ b/verl/utils/profiler/profile.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import functools import os from typing import Callable, Optional @@ -226,16 +227,35 @@ def start(self, **kwargs): def stop(self): return getattr(self._impl, "stop", lambda: None)() - @staticmethod + @classmethod def annotate( + cls, message: Optional[str] = None, color: Optional[str] = None, domain: Optional[str] = None, category: Optional[str] = None, - **kwargs, + **kwargs_outer, ) -> Callable: def decorator(func): - return func + @functools.wraps(func) + def wrapper(self_instance, *args, **kwargs_inner): + profiler = getattr(self_instance, "profiler", None) + if not profiler: + return func(self_instance, *args, **kwargs_inner) + + impl = profiler._impl + if hasattr(impl, "annotate"): + try: + actual_decorator = impl.annotate( + message=message, color=color, domain=domain, category=category, **kwargs_outer + ) + + return actual_decorator(func)(self_instance, *args, **kwargs_inner) + except Exception: + return func(self_instance, *args, **kwargs_inner) + return func(self_instance, *args, **kwargs_inner) + + return wrapper return decorator diff --git a/verl/workers/actor/megatron_actor.py b/verl/workers/actor/megatron_actor.py index 40e823a54f5..7f85a8955af 100644 --- a/verl/workers/actor/megatron_actor.py +++ b/verl/workers/actor/megatron_actor.py @@ -28,12 +28,10 @@ import torch import torch.distributed from megatron.core import parallel_state as mpu -from megatron.core.distributed import finalize_model_grads # from megatron.core.optimizer import DistributedOptimizer from megatron.core.optimizer import DistributedOptimizer from megatron.core.pipeline_parallel import get_forward_backward_func -from omegaconf import OmegaConf from torch import nn from verl import DataProto @@ -136,23 +134,9 @@ def __init__( for model in self.actor_module: patch_fused_forward(model) - self.optimizer_step_args = OmegaConf.create( - { - "skip_grad": None, - "overlap_dp_param_comm": False, - "overlap_dp_grad_comm": False, - "gradient_accumulation_steps": 1, - "sequence_parallel": self.tf_config.sequence_parallel, - "DDP_impl": "local", - "layernorm_allreduce_bucket_threshold": 0, - "pipeline_model_parallel_split_rank": None, - "reduce_grads_use_alltoall": False, - } - ) - config = get_model_config(self.actor_module[0]) - print(config) - 
config.finalize_model_grads_func = finalize_model_grads + if torch.distributed.get_rank() == 0: + print(config) def _validate_config(self, config) -> None: """Validate config options not implemented for Megatron backend""" @@ -194,85 +178,73 @@ def compute_log_prob(self, data: DataProto, calculate_entropy=False) -> torch.Te "micro batch size is needed for forward compute when use_dynamic_bsz is False" ) - def compute_logprobs_fn(output, data, use_dynamic_bsz=False, indices=None): - response = data["responses"] - response_length = response.size(1) - log_probs = output["log_probs"][:, -response_length - 1 : -1].contiguous() - return {"log_probs": log_probs} - # We make recompute_old_log_prob by default here. # TODO (zhangchi.usc1992): actually, this function should only return log_prob and this logic should be # handled by user outside - recompute_old_log_prob = self.config.get("recompute_old_log_prob", True) - entropys = torch.Tensor() - if recompute_old_log_prob: - select_keys = ["responses", "input_ids", "attention_mask", "position_ids"] - batch = data.select(batch_keys=select_keys).batch - input_ids = batch["input_ids"] - batch_size = input_ids.size(0) - response = batch["responses"] - response_length = response.size(1) - with torch.no_grad(): - output = self.forward_backward_batch( - data, - forward_only=True, - post_process_fn=compute_logprobs_fn, - calculate_entropy=calculate_entropy, - use_dynamic_bsz=use_dynamic_bsz, - micro_batch_size=micro_batch_size, - max_token_len=max_token_len, - ) - if mpu.is_pipeline_last_stage(ignore_virtual=True): - # only on last rank. It should be on every tp rank + + select_keys = ["responses", "input_ids", "attention_mask", "position_ids"] + batch = data.select(batch_keys=select_keys).batch + input_ids = batch["input_ids"] + batch_size = input_ids.size(0) + response = batch["responses"] + response_length = response.size(1) + with torch.no_grad(): + output = self.forward_backward_batch( + data, + forward_only=True, + calculate_entropy=calculate_entropy, + use_dynamic_bsz=use_dynamic_bsz, + micro_batch_size=micro_batch_size, + max_token_len=max_token_len, + ) + if mpu.is_pipeline_last_stage(ignore_virtual=True): + # only on last rank. It should be on every tp rank + log_probs = [o["log_probs"] for o in output["output"]] # (bs, seq_size) + log_probs = torch.cat(log_probs, dim=0).to(torch.float32) + + if calculate_entropy: + entropys = torch.cat([o["entropy"] for o in output["output"]], dim=0) + entropys = entropys.to(torch.float32) + + if use_dynamic_bsz: + indices = output["indices"] + indices = list(itertools.chain.from_iterable(indices)) + assert len(indices) == log_probs.size(0), f"{len(indices)} vs. {log_probs.size()}" + revert_indices = torch.tensor(get_reverse_idx(indices), dtype=torch.long) + log_probs = log_probs[revert_indices] if calculate_entropy: - log_probs = [o[0]["log_probs"] for o in output["output"]] # (bs, seq_size) - else: - log_probs = [o["log_probs"] for o in output["output"]] # (bs, seq_size) - log_probs = torch.cat(log_probs, dim=0).to(torch.float32) - if use_dynamic_bsz: - indices = output["indices"] - indices = list(itertools.chain.from_iterable(indices)) - assert len(indices) == log_probs.size(0), f"{len(indices)} vs. {log_probs.size()}" - revert_indices = torch.tensor(get_reverse_idx(indices), dtype=torch.long) - log_probs = log_probs[revert_indices] - else: - log_probs = torch.empty( + assert len(indices) == entropys.size(0), f"{len(indices)} vs. 
{entropys.size()}" + entropys = entropys[revert_indices] + else: + # other pp ranks + log_probs = torch.empty( + size=(batch_size, response_length), dtype=torch.float32, device=input_ids.device + ) + if calculate_entropy: + entropys = torch.empty( size=(batch_size, response_length), dtype=torch.float32, device=input_ids.device ) - log_probs = log_probs.to(get_device_id()) - # broadcast across pp ranks + + log_probs = log_probs.to(get_device_id()) + # broadcast across pp ranks + torch.distributed.broadcast( + tensor=log_probs, + src=mpu.get_pipeline_model_parallel_last_rank(), + group=mpu.get_pipeline_model_parallel_group(), + async_op=False, + ) + log_probs = log_probs.to("cpu") + + if calculate_entropy: + entropys = entropys.to(get_device_id()) torch.distributed.broadcast( - tensor=log_probs, + tensor=entropys, src=mpu.get_pipeline_model_parallel_last_rank(), group=mpu.get_pipeline_model_parallel_group(), async_op=False, ) - log_probs = log_probs.to("cpu") - if calculate_entropy: - # Note that o[0] is metrics, o[1] is entropy - if mpu.is_pipeline_last_stage(ignore_virtual=True): - entropys = torch.cat([o[1] for o in output["output"]], dim=0) - entropys = entropys.to(torch.float32) - if use_dynamic_bsz: - indices = output["indices"] - indices = list(itertools.chain.from_iterable(indices)) - assert len(indices) == entropys.size(0), f"{len(indices)} vs. {entropys.size()}" - revert_indices = torch.tensor(get_reverse_idx(indices), dtype=torch.long) - entropys = entropys[revert_indices] - else: - entropys = torch.empty( - size=(batch_size, response_length), dtype=torch.float32, device=input_ids.device - ) - # broadcast across pp ranks - entropys = entropys.to(get_device_id()) - torch.distributed.broadcast( - tensor=entropys, - src=mpu.get_pipeline_model_parallel_last_rank(), - group=mpu.get_pipeline_model_parallel_group(), - async_op=False, - ) - entropys = entropys.to("cpu") + entropys = entropys.to("cpu") # add empty cache after each compute get_torch_device().empty_cache() @@ -328,16 +300,68 @@ def make_minibatch_iterator(self, data: DataProto) -> Iterable[DataProto]: dataloader_kwargs={"shuffle": self.config.shuffle}, ) + def compute_ppo_loss(self, model_output, data): + log_prob = model_output["log_probs"] + entropy = model_output.get("entropy", None) + + metrics = {} + + response_mask = data["response_mask"].to(bool) + # compute policy loss + old_log_prob = data["old_log_probs"] + advantages = data["advantages"] + + loss_agg_mode = self.config.loss_agg_mode + + loss_mode = self.config.policy_loss.get("loss_mode", "vanilla") + + policy_loss_fn = get_policy_loss_fn(loss_mode) + pg_loss, pg_clipfrac, ppo_kl, pg_clipfrac_lower = policy_loss_fn( + old_log_prob=old_log_prob, + log_prob=log_prob, + advantages=advantages, + response_mask=response_mask, + loss_agg_mode=loss_agg_mode, + config=self.config, + ) + + metrics.update( + { + "actor/pg_loss": pg_loss.detach().item(), + "actor/pg_clipfrac": pg_clipfrac.detach().item(), + "actor/ppo_kl": ppo_kl.detach().item(), + "actor/pg_clipfrac_lower": pg_clipfrac_lower.detach().item(), + } + ) + policy_loss = pg_loss + + # add entropy loss + if entropy is not None: + entropy_loss = agg_loss(loss_mat=entropy, loss_mask=response_mask, loss_agg_mode=loss_agg_mode) + entropy_coeff = self.config.entropy_coeff + policy_loss -= entropy_coeff * entropy_loss + + # add kl loss + if self.config.use_kl_loss: + ref_log_prob = data["ref_log_prob"] + # compute kl loss + kld = kl_penalty(logprob=log_prob, ref_logprob=ref_log_prob, kl_penalty=self.config.kl_loss_type) 
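+ # reduce the per-token KL (kld) to a scalar with the same loss_agg_mode used for the policy loss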
+ kl_loss = agg_loss(loss_mat=kld, loss_mask=response_mask, loss_agg_mode=self.config.loss_agg_mode) + + policy_loss += kl_loss * self.config.kl_loss_coef + metrics["actor/kl_loss"] = kl_loss.detach().item() + metrics["actor/kl_coef"] = self.config.kl_loss_coef + + return policy_loss, metrics + def forward_backward_batch( self, data: DataProto, forward_only=False, - post_process_fn=None, calculate_entropy=False, use_dynamic_bsz=False, micro_batch_size=None, max_token_len=None, - mini_batch_size=None, ): """ We assume: @@ -387,98 +411,40 @@ def forward_backward_batch( ) else: micro_batches, indices = rearrange_micro_batches(batch=mini_batch.batch, max_token_len=max_token_len) - total_seqlen = max_token_len else: assert micro_batch_size is not None, ( "micro_batch_size is needed to be passed in when not using dynamic batch size" ) micro_batches = mini_batch.batch.split(micro_batch_size) - seq_len = micro_batches[0]["input_ids"].shape[1] - total_seqlen = micro_batch_size * seq_len # compute input shapes for pp stages n_micro_batch = len(micro_batches) forward_backward_func = get_forward_backward_func() - def loss_func(output, data, meta_info): + def loss_func(output, data): # For memory efficiency # We move calculation of entropy to compute_log_probs, forward_only == True device = output["log_probs"].device - metrics = {} - if forward_only: - if post_process_fn is None: - pass - # metrics["logits"] = output - else: - stats = post_process_fn(output, data) - metrics.update(stats) - if not calculate_entropy: - return torch.tensor(1.0, device=device), metrics responses = data["responses"] response_length = responses.size(1) - response_mask = data["response_mask"].to(bool) - loss_agg_mode = self.config.loss_agg_mode - # compute policy loss log_prob = output["log_probs"][:, -response_length - 1 : -1].contiguous() - ret_entropy = None - stats = {} - if not forward_only: - old_log_prob = data["old_log_probs"] - advantages = data["advantages"] - - entropy_coeff = self.config.entropy_coeff - loss_agg_mode = self.config.loss_agg_mode - - loss_mode = self.config.policy_loss.get("loss_mode", "vanilla") - - policy_loss_fn = get_policy_loss_fn(loss_mode) - pg_loss, pg_clipfrac, ppo_kl, pg_clipfrac_lower = policy_loss_fn( - old_log_prob=old_log_prob, - log_prob=log_prob, - advantages=advantages, - response_mask=response_mask, - loss_agg_mode=loss_agg_mode, - config=self.config, - ) - - stats.update( - { - "actor/pg_loss": pg_loss.detach().item(), - "actor/pg_clipfrac": pg_clipfrac.detach().item(), - "actor/ppo_kl": ppo_kl.detach().item(), - "actor/pg_clipfrac_lower": pg_clipfrac_lower.detach().item(), - } - ) - policy_loss = pg_loss - + model_output = {"log_probs": log_prob} if calculate_entropy: entropy = output["entropy"][:, -response_length - 1 : -1].contiguous() - if not forward_only: - entropy_loss = agg_loss(loss_mat=entropy, loss_mask=response_mask, loss_agg_mode=loss_agg_mode) - entropy_coeff = meta_info["entropy_coeff"] - policy_loss = pg_loss - entropy_coeff * entropy_loss - else: - ret_entropy = entropy + model_output["entropy"] = entropy if forward_only: - policy_loss = torch.tensor(1.0, device=device) - else: - if self.config.use_kl_loss: - ref_log_prob = data["ref_log_prob"] - # compute kl loss - kld = kl_penalty(logprob=log_prob, ref_logprob=ref_log_prob, kl_penalty=self.config.kl_loss_type) - kl_loss = agg_loss(loss_mat=kld, loss_mask=response_mask, loss_agg_mode=self.config.loss_agg_mode) + # for inference + return torch.tensor(1.0, device=device), model_output - policy_loss = policy_loss + 
kl_loss * self.config.kl_loss_coef - metrics["actor/kl_loss"] = kl_loss.detach().item() - metrics["actor/kl_coef"] = self.config.kl_loss_coef + # for training + # note that this loss function can be swapped with other loss functions such as SFT + policy_loss, metrics = self.compute_ppo_loss(model_output, data) - # return loss and stats - - append_to_dict(metrics, stats) - return policy_loss, [metrics, ret_entropy] + # return loss and stats + return policy_loss, metrics def forward_step(batch_iter, model): batch = next(batch_iter) @@ -531,11 +497,12 @@ def logits_processor(logits, label, label_mask): ret = {} if calculate_entropy: logits_bak = logits.clone() - logger.warning_once( - "For memory-efficient computation, enable fused kernels via " - "`actor_rollout_ref.model.use_fused_kernels=True`. " - "The current `clone()` operation ensures correctness but increases memory usage." - ) + if torch.distributed.get_rank() == 0: + logger.warning_once( + "For memory-efficient computation, enable fused kernels via " + "`actor_rollout_ref.model.use_fused_kernels=True`. " + "The current `clone()` operation ensures correctness but increases memory usage." + ) entropy = vocab_parallel_entropy(logits) ret["entropy"] = entropy else: @@ -557,42 +524,22 @@ def logits_processor(logits, label, label_mask): logits_processor_args=logits_processor_args, ) - if forward_only: - meta_info = None - else: - clip_ratio_c = self.config.get("clip_ratio_c", 3.0) - meta_info = { - "clip_ratio": self.config.clip_ratio, - "entropy_coeff": self.config.entropy_coeff, - "clip_ratio_c": clip_ratio_c, - } - return output, partial(loss_func, data=batch, meta_info=meta_info) + return output, partial(loss_func, data=batch) # batch should be a list of batches inside micro-batches batch_generator = make_batch_generator(micro_batches, vpp_size=len(self.actor_module)) # TODO: we may use the new schedule instead # for flash-attn: (seq_len, batch_size, hidden_size) = (mbs*seq_len, 1, hidden_size) - if mpu.get_pipeline_model_parallel_world_size() > 1: - losses_reduced = forward_backward_func( - forward_step_func=forward_step, - data_iterator=batch_generator, - model=self.actor_module, - num_microbatches=n_micro_batch, - seq_length=total_seqlen, # no use when input_shapes was set - micro_batch_size=1, # no use when input_shapes was set - forward_only=forward_only, - ) - else: - losses_reduced = forward_backward_func( - forward_step_func=forward_step, - data_iterator=batch_generator, - model=self.actor_module, - num_microbatches=n_micro_batch, - seq_length=total_seqlen, # in use for pp = 1 - micro_batch_size=1, # in use for pp = 1 - forward_only=forward_only, - ) + losses_reduced = forward_backward_func( + forward_step_func=forward_step, + data_iterator=batch_generator, + model=self.actor_module, + num_microbatches=n_micro_batch, + seq_length=1, # the communication shape is obtained via p2p comm + micro_batch_size=1, # the communication shape is obtained via p2p comm + forward_only=forward_only, + ) # loss_reduces contains the stats returned from loss_func if self.has_multi_modal_inputs: @@ -642,12 +589,11 @@ def update_policy(self, dataloader: Iterable[DataProto]) -> dict: use_dynamic_bsz=self.config.use_dynamic_bsz, micro_batch_size=micro_batch_size, max_token_len=max_token_len, - mini_batch_size=self.config.ppo_mini_batch_size, ) metric_micro_batch = metric_micro_batch["output"] for metric in metric_micro_batch: # Note that o[0] is metrics, o[1] is entropy, o[2] is response_mask - append_to_dict(metrics, metric[0]) # append the metric 
from this micro-batch to global metrics. + append_to_dict(metrics, metric) # append the metric from this micro-batch to global metrics. update_successful, grad_norm, num_zeros_in_grad = self.actor_optimizer.step() data = {"actor/grad_norm": grad_norm} diff --git a/verl/workers/config/model.py b/verl/workers/config/model.py index e6bd4120b07..06466977365 100644 --- a/verl/workers/config/model.py +++ b/verl/workers/config/model.py @@ -37,12 +37,16 @@ class HFModelConfig(BaseConfig): "tokenizer", "processor", "local_path", + "local_hf_config_path", + "local_tokenizer_path", } path: str = MISSING local_path: Optional[str] = None hf_config_path: Optional[str] = None + local_hf_config_path: Optional[str] = None tokenizer_path: Optional[str] = None + local_tokenizer_path: Optional[str] = None hf_config: Any = None generation_config: Any = None @@ -82,17 +86,22 @@ def __post_init__(self): if self.tokenizer_path is None: self.tokenizer_path = self.path - # constuct tokenizer self.local_path = copy_to_local(self.path, use_shm=self.use_shm) - self.tokenizer = hf_tokenizer(self.local_path, trust_remote_code=self.trust_remote_code) - self.processor = hf_processor(self.local_path, trust_remote_code=self.trust_remote_code) - self.generation_config = get_generation_config(self.hf_config_path, trust_remote_code=self.trust_remote_code) + # construct tokenizer + self.local_tokenizer_path = copy_to_local(self.tokenizer_path, use_shm=self.use_shm) + self.tokenizer = hf_tokenizer(self.local_tokenizer_path, trust_remote_code=self.trust_remote_code) + self.processor = hf_processor(self.local_tokenizer_path, trust_remote_code=self.trust_remote_code) + + self.local_hf_config_path = copy_to_local(self.hf_config_path, use_shm=self.use_shm) + self.generation_config = get_generation_config( + self.local_hf_config_path, trust_remote_code=self.trust_remote_code + ) # construct hf_config attn_implementation = self.override_config.get("attn_implementation", "flash_attention_2") self.hf_config = AutoConfig.from_pretrained( - self.hf_config_path, trust_remote_code=self.trust_remote_code, attn_implementation=attn_implementation + self.local_hf_config_path, trust_remote_code=self.trust_remote_code, attn_implementation=attn_implementation ) override_config_kwargs = { diff --git a/verl/workers/fsdp_workers.py b/verl/workers/fsdp_workers.py index df14d910db6..712568f2e8c 100644 --- a/verl/workers/fsdp_workers.py +++ b/verl/workers/fsdp_workers.py @@ -288,6 +288,11 @@ def _build_model_optimizer( actor_model_config = AutoConfig.from_pretrained( local_path, trust_remote_code=trust_remote_code, attn_implementation="flash_attention_2" ) + # TODO: VL models use VisionAttention, which directly uses flash_attention in transformers>=4.53 + # which will be patched by _ulysses_flash_attention_forward, but erroneously misses position_ids + # Maybe support Ulysses in VisionAttention in the future and remove this patch + if self.ulysses_sequence_parallel_size > 1 and hasattr(actor_model_config, "vision_config"): + actor_model_config.vision_config._attn_implementation = "eager" # patch for kimi-vl if getattr(actor_model_config, "model_type", None) == "kimi_vl": @@ -1072,6 +1077,12 @@ def _build_critic_model_optimizer(self, config): attn_implementation="flash_attention_2", trust_remote_code=config.model.get("trust_remote_code", False), ) + # TODO: VL models use VisionAttention, which directly uses flash_attention in transformers>=4.53 + # which will be patched by _ulysses_flash_attention_forward, but erroneously misses position_ids + # Maybe support Ulysses in VisionAttention in the future and remove this patch + if self.ulysses_sequence_parallel_size > 1 and hasattr(critic_model_config, "vision_config"): + critic_model_config.vision_config._attn_implementation = "eager" + + critic_model_config.num_labels = 1 # patch for kimi-vl if getattr(critic_model_config, "model_type", None) == "kimi_vl":
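The four reward-manager hunks below (batch, dapo, naive, prime) add the same early-return path for precomputed RM scores. A minimal sketch of the shared contract they implement — the function name is illustrative, the body mirrors the diff:

```python
# Illustrative consolidation of the early-return logic added to the four
# reward managers below; `data` is a verl DataProto.
def early_return_precomputed_scores(data, return_dict: bool):
    if "rm_scores" not in data.batch.keys():
        return None  # caller falls through to rule-based scoring
    if not return_dict:
        return data.batch["rm_scores"]
    # the RM worker advertises extra non-tensor columns via meta_info
    reward_extra_keys = data.meta_info.get("reward_extra_keys", [])
    reward_extra_info = {key: data.non_tensor_batch[key] for key in reward_extra_keys}
    return {"reward_tensor": data.batch["rm_scores"], "reward_extra_info": reward_extra_info}
```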
diff --git a/verl/workers/reward_manager/batch.py b/verl/workers/reward_manager/batch.py index 989ca14f466..d1a13cefac6 100644 --- a/verl/workers/reward_manager/batch.py +++ b/verl/workers/reward_manager/batch.py @@ -77,7 +77,9 @@ def __call__(self, data: DataProto, return_dict: bool = False) -> torch.Tensor | # If there is rm score, we directly return rm score. Otherwise, we compute via rm_score_fn if "rm_scores" in data.batch.keys(): if return_dict: - return {"reward_tensor": data.batch["rm_scores"]} + reward_extra_keys = data.meta_info.get("reward_extra_keys", []) + reward_extra_info = {key: data.non_tensor_batch[key] for key in reward_extra_keys} + return {"reward_tensor": data.batch["rm_scores"], "reward_extra_info": reward_extra_info} else: return data.batch["rm_scores"] diff --git a/verl/workers/reward_manager/dapo.py b/verl/workers/reward_manager/dapo.py index bb6e0895f40..d8b6b4742ef 100644 --- a/verl/workers/reward_manager/dapo.py +++ b/verl/workers/reward_manager/dapo.py @@ -56,7 +56,9 @@ def __call__(self, data: DataProto, return_dict: bool = False): # If there is rm score, we directly return rm score. Otherwise, we compute via rm_score_fn if "rm_scores" in data.batch.keys(): if return_dict: - return {"reward_tensor": data.batch["rm_scores"]} + reward_extra_keys = data.meta_info.get("reward_extra_keys", []) + reward_extra_info = {key: data.non_tensor_batch[key] for key in reward_extra_keys} + return {"reward_tensor": data.batch["rm_scores"], "reward_extra_info": reward_extra_info} else: return data.batch["rm_scores"] diff --git a/verl/workers/reward_manager/naive.py b/verl/workers/reward_manager/naive.py index f10bbc636ec..d21c423e3da 100644 --- a/verl/workers/reward_manager/naive.py +++ b/verl/workers/reward_manager/naive.py @@ -49,7 +49,9 @@ def __call__(self, data: DataProto, return_dict: bool = False) -> torch.Tensor | # If there is rm score, we directly return rm score. Otherwise, we compute via rm_score_fn if "rm_scores" in data.batch.keys(): if return_dict: - return {"reward_tensor": data.batch["rm_scores"]} + reward_extra_keys = data.meta_info.get("reward_extra_keys", []) + reward_extra_info = {key: data.non_tensor_batch[key] for key in reward_extra_keys} + return {"reward_tensor": data.batch["rm_scores"], "reward_extra_info": reward_extra_info} else: return data.batch["rm_scores"] diff --git a/verl/workers/reward_manager/prime.py b/verl/workers/reward_manager/prime.py index 98c094f2c6c..ab7e5f95e8c 100644 --- a/verl/workers/reward_manager/prime.py +++ b/verl/workers/reward_manager/prime.py @@ -153,7 +153,12 @@ def __call__(self, data: DataProto, return_dict: bool = False) -> torch.Tensor | # If there is rm score, we directly return rm score.
Otherwise, we compute via rm_score_fn if "rm_scores" in data.batch.keys(): - return data.batch["rm_scores"] + if return_dict: + reward_extra_keys = data.meta_info.get("reward_extra_keys", []) + reward_extra_info = {key: data.non_tensor_batch[key] for key in reward_extra_keys} + return {"reward_tensor": data.batch["rm_scores"], "reward_extra_info": reward_extra_info} + else: + return data.batch["rm_scores"] reward_tensor = torch.zeros_like(data.batch["responses"], dtype=torch.float32) diff --git a/verl/workers/rollout/sglang_rollout/sglang_rollout.py b/verl/workers/rollout/sglang_rollout/sglang_rollout.py index 25e0eb222c8..8f11c94051d 100644 --- a/verl/workers/rollout/sglang_rollout/sglang_rollout.py +++ b/verl/workers/rollout/sglang_rollout/sglang_rollout.py @@ -860,7 +860,7 @@ async def _async_rollout_a_request( self._tool_map[tool_call.function.name].execute( _req.request_id, tool_call.function.arguments, - **_req.tools_kwargs[tool_call.function.name].get("execute_kwargs", {}), + **_req.tools_kwargs.get(tool_call.function.name, {}).get("execute_kwargs", {}), ) for tool_call in parsed_tool_calls ] diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 2a50a11a7b7..20bf0bfad41 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -276,7 +276,7 @@ async def init_engine(self): skip_tokenizer_init=False, max_model_len=self.max_model_len, max_num_seqs=config.max_num_seqs, - load_format="auto", + load_format="dummy" if config.load_format.startswith("dummy") else config.load_format, disable_log_stats=config.disable_log_stats, max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=config.enable_chunked_prefill, diff --git a/verl/workers/sharding_manager/megatron_sglang.py b/verl/workers/sharding_manager/megatron_sglang.py index 2014ce9c6a8..fb1ffe078b8 100644 --- a/verl/workers/sharding_manager/megatron_sglang.py +++ b/verl/workers/sharding_manager/megatron_sglang.py @@ -28,7 +28,7 @@ from torch.distributed.device_mesh import DeviceMesh from verl.protocol import DataProto, all_gather_data_proto -from verl.utils.device import get_torch_device +from verl.utils.device import get_torch_device, set_expandable_segments from verl.utils.megatron_utils import ( load_megatron_model_to_gpu, offload_megatron_model_to_cpu, @@ -178,6 +178,9 @@ async def wake_up(self): self.transformer_config, self.layer_name_mapping, ) + + set_expandable_segments(False) + await self.update_weights(per_tensor_param) if self.offload_param: offload_megatron_model_to_cpu(self.actor_module) @@ -199,6 +202,8 @@ async def sleep(self): # add empty cache after each compute aggressive_empty_cache(force_sync=True) + set_expandable_segments(True) + # restore random states if self.device_mesh is not None: self.gen_random_states = get_torch_device().get_rng_state() diff --git a/verl/workers/sharding_manager/megatron_vllm.py b/verl/workers/sharding_manager/megatron_vllm.py index a6ddb065c67..1a1d809be7b 100644 --- a/verl/workers/sharding_manager/megatron_vllm.py +++ b/verl/workers/sharding_manager/megatron_vllm.py @@ -30,7 +30,7 @@ from verl.protocol import all_gather_data_proto from verl.third_party.vllm import LLM, VLLM_SLEEP_LEVEL from verl.third_party.vllm import parallel_state as vllm_ps -from verl.utils.device import get_torch_device +from verl.utils.device import get_torch_device, set_expandable_segments from verl.utils.megatron_utils import 
load_megatron_model_to_gpu, offload_megatron_model_to_cpu, per_tensor_generator from verl.utils.memory_utils import aggressive_empty_cache from verl.utils.profiler import GPUMemoryLogger, log_gpu_memory_usage @@ -149,6 +149,8 @@ def __enter__(self): if self.offload_param: load_megatron_model_to_gpu(self.actor_module, load_grad=False) + set_expandable_segments(False) + if self.rollout_config.free_cache_engine: if "tags" in inspect.signature(self.inference_engine.wake_up).parameters: self.inference_engine.wake_up(tags=["weights"]) @@ -196,6 +198,8 @@ def __exit__(self, exc_type, exc_value, traceback): aggressive_empty_cache(force_sync=True) + set_expandable_segments(True) + # restore random states if self.device_mesh is not None: self.gen_random_states = get_torch_device().get_rng_state()
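Both sharding managers now disable expandable segments while the inference engine owns the GPU and re-enable them for training. set_expandable_segments itself is not shown in this diff; below is a minimal sketch of what such a toggle can look like, assuming PyTorch's private allocator-settings hook — the actual implementation in verl.utils.device may differ:

```python
import torch


def set_expandable_segments(enable: bool) -> None:
    # Toggle the CUDA caching allocator's expandable_segments mode at runtime:
    # off while vLLM/SGLang hold device memory (weight sync, CUDA graph capture),
    # back on afterwards to reduce fragmentation during training.
    if torch.cuda.is_available():
        torch.cuda.memory._set_allocator_settings(f"expandable_segments:{enable}")
```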