diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 4f092f174f0..91d368abe69 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -6,7 +6,7 @@ - [ ] Search for similar PRs. Paste at least one query link here: ... - [ ] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI) - - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data` + - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward` - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]` - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test` - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title. diff --git a/.github/workflows/checkpoint_converter.yml b/.github/workflows/checkpoint_converter.yml deleted file mode 100644 index 4820497f79c..00000000000 --- a/.github/workflows/checkpoint_converter.yml +++ /dev/null @@ -1,175 +0,0 @@ -# # Tests layout - -# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance: -# - `tests/trainer` for testing functionality related to `verl/trainer` -# - `tests/models` for testing functionality related to `verl/models` -# - ... 
- -# There are a few folders with `special_` prefix, created for special purposes: -# - `special_distributed`: unit tests that must run with multiple GPUs -# - `special_e2e`: end-to-end tests with training/generation scripts -# - `special_npu`: tests for NPUs -# - `special_sanity`: a suite of quick sanity tests -# - `special_standalone`: a set of test that are designed to run in dedicated environments - -# Accelerators for tests -# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`. -# - For test scripts with `on_cpu.py` name suffix would be tested on CPU resources in linux environment. - -# # Workflow layout - -# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs: -# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `check-pr-title,yml`, `pre-commit.yml`, `doc.yml` -# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml` -# 3. End-to-end tests: `e2e_*.yml` -# 4. Unit tests -# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py` -# - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix. -# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when -# - new workflow yaml is added to `.github/workflows` -# - new tests are added to workflow mentioned in 2. 
- -name: checkpoint_converter -# latest version: Megatron-LM core_v0.14.0 https://github.com/NVIDIA/Megatron-LM/tree/core_v0.14.0 - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - - v0.* - pull_request: - branches: - - main - - v0.* - paths: - - "**/*.py" - # Other entrypoints - - "!examples/**" - - "!tests/**" - - "!verl/trainer/main_*.py" - - "!verl/trainer/fsdp_sft_trainer.py" - # Recipes - - "!recipe/**" - # FSDP - - "!verl/workers/**/*dp_*.py" - # Entrypoints - - ".github/workflows/checkpoint_converter.yml" - - ".github/workflows/e2e_ppo_trainer_megatron.yml" - - "examples/data_preprocess/gsm8k.py" - - "tests/special_e2e/run_ppo_trainer_megatron.sh" - - "verl/trainer/main_ppo.py" - - "verl/trainer/config/ppo_megatron_trainer.yaml" - -# Cancel jobs on the same ref if a new one is triggered -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -# Declare permissions just read content. 
-permissions: - contents: read - -env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:sgl055.dev2" - DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" - -jobs: - setup: - if: github.repository_owner == 'volcengine' - runs-on: ubuntu-latest - outputs: - runner-label: ${{ steps.create-runner.outputs.runner-label }} - mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }} - steps: - - uses: actions/checkout@v4 - - id: create-runner - uses: volcengine/vemlp-github-runner@v1 - with: - mode: "create" - faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}" - mlp-image: "${{ env.IMAGE }}" - - checkpoint_converter: - needs: setup - runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ] - timeout-minutes: 20 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install -e .[test] -# - name: Download Model to Use -# run: | -# huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir ${HOME}/models/Qwen/Qwen2.5-0.5B -# huggingface-cli download deepseek-ai/deepseek-coder-1.3b-instruct --local-dir ${HOME}/models/deepseek-ai/deepseek-coder-1.3b-instruct -# export HF_HUB_OFFLINE=1 - - name: Running Huggingface to Megatron dist_ckpt converter (Qwen/Qwen2.5-0.5B) - run: | - ray stop --force - python scripts/converter_hf_to_mcore.py --hf_model_path=${HOME}/models/Qwen/Qwen2.5-0.5B --output_path checkpoints/Qwen/Qwen2.5-0.5B --test - - name: Running Huggingface to Megatron dist_ckpt converter (deepseek-ai/deepseek-coder-1.3b-instruct) - run: | - ray stop --force - python scripts/converter_hf_to_mcore.py --hf_model_path=${HOME}/models/deepseek-ai/deepseek-coder-1.3b-instruct --output_path 
checkpoints/deepseek-ai/deepseek-coder-1.3b-instruct --test - - name: Clean up - run: | - rm -rf checkpoints - - checkpoint_converter_large_moe_models: - needs: setup - runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ] - timeout-minutes: 30 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - HF_ENDPOINT: "https://hf-mirror.com" - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install -e .[test] -# - name: Download Model to Use -# run: | -# huggingface-cli download Qwen/Qwen1.5-MoE-A2.7B-Chat --local-dir ${HOME}/models/Qwen/Qwen1.5-MoE-A2.7B-Chat -# export HF_HUB_OFFLINE=1 - - name: Running Huggingface to Megatron dist_ckpt CPU converter (Qwen/Qwen1.5-MoE-A2.7B-Chat) - run: | - ray stop --force - python scripts/converter_hf_to_mcore.py --hf_model_path=${HOME}/models/Qwen/Qwen1.5-MoE-A2.7B-Chat --output_path checkpoints/Qwen/Qwen1.5-MoE-A2.7B-Chat --use_cpu_initialization - - name: Running distributed Huggingface to Megatron dist_ckpt CPU converter (Qwen/Qwen1.5-MoE-A2.7B-Chat) - run: | - ray stop --force - torchrun --nproc_per_node 8 --nnodes 1 scripts/converter_hf_to_mcore.py --hf_model_path=${HOME}/models/Qwen/Qwen1.5-MoE-A2.7B-Chat --output_path checkpoints/Qwen/Qwen1.5-MoE-A2.7B-Chat_dist --use_cpu_initialization - - name: clean up - run: | - rm -rf checkpoints - - cleanup: - runs-on: ubuntu-latest - needs: - [ - setup, - checkpoint_converter, - checkpoint_converter_large_moe_models - ] - if: always() - steps: - - id: destroy-runner - uses: volcengine/vemlp-github-runner@v1 - with: - mode: "destroy" - faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}" - mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}" \ No newline at end of file diff --git 
a/.github/workflows/e2e_ascend.yml b/.github/workflows/e2e_ascend.yml index 32ed62e5838..41673db6541 100644 --- a/.github/workflows/e2e_ascend.yml +++ b/.github/workflows/e2e_ascend.yml @@ -65,22 +65,24 @@ permissions: contents: read jobs: - test: + non_rl_job: if: github.repository_owner == 'volcengine' - name: verl Ascend test (self-host) - runs-on: linux-aarch64-a2-8 - timeout-minutes: 60 # Increase this timeout value as needed + name: E2E Ascend testing for non-RL algorithm scenarios + runs-on: linux-aarch64-a2-2 + timeout-minutes: 60 container: image: swr.ap-southeast-1.myhuaweicloud.com/base_image/ascend-ci/verl/verl:verl-8.3.rc1-910b-ubuntu22.04-py3.11-latest options: >- --shm-size 16g env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable steps: + - name: Config third-party dependency download cache + run: | + sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list + pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple + pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local - name: Check npu and CANN info run: | cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info @@ -103,47 +105,126 @@ jobs: - name: Preprocess gsm8k dataset run: | python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k - - name: Preprocess geo3k dataset - run: | - python examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/.cache/datasets/hiyouga/geometry3k - - name: Running gsm8k e2e qwen3 training tests with PPO on ASCEND NPU - run: | - ray stop --force - bash tests/special_npu/run_qwen3_06b_ppo.sh - rm -rf $HOME/ckpts - name: Running gsm8k e2e training tests with peft sft on ASCEND NPU run: | 
ray stop --force bash tests/special_npu/run_qwen2_5_05b_sft_peft_sp2.sh rm -rf $HOME/ckpts - - name: Running gsm8k e2e training tests with GRPO on ASCEND NPU + - name: Running NPU profiling unit tests run: | ray stop --force - bash tests/special_npu/run_qwen2_5_05b_grpo.sh - rm -rf $HOME/ckpts - - name: Running geo3k e2e training tests with GRPO on ASCEND NPU + pytest -s -x tests/utils/test_special_mstx_profile.py + + llm_rl_job: + if: github.repository_owner == 'volcengine' + name: E2E Ascend testing for RL training scenarios of LLM models + runs-on: linux-aarch64-a2-8 + timeout-minutes: 60 + container: + image: swr.ap-southeast-1.myhuaweicloud.com/base_image/ascend-ci/verl/verl:verl-8.3.rc1-910b-ubuntu22.04-py3.11-latest + options: >- + --shm-size 16g + env: + HF_ENDPOINT: "https://hf-mirror.com" + HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable + steps: + - name: Config third-party dependency download cache + run: | + sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list + pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple + pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local + - name: Check npu and CANN info + run: | + cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info + npu-smi info + - name: Check initial pip list from image + run: | + pip list + - name: Checkout volcengine/verl repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + clean: true + - name: Install the current repository + run: | + pip install -r requirements-npu.txt + pip install -e . 
+ - name: Check final pip list + run: | + pip list + - name: Preprocess gsm8k dataset + run: | + python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k + - name: Running gsm8k e2e training tests with PPO on ASCEND NPU (FSDP backend) run: | ray stop --force - bash tests/special_npu/run_qwen2_5_vl_3b_npu.sh + bash tests/special_npu/run_qwen3_06b_ppo.sh rm -rf $HOME/ckpts - - name: Running gsm8k e2e training tests with DAPO on ASCEND NPU + - name: Running gsm8k e2e training tests with GRPO on ASCEND NPU (FSDP backend) run: | ray stop --force - bash tests/special_npu/run_qwen2_5_05b_dapo.sh + bash tests/special_npu/run_qwen2_5_05b_grpo.sh rm -rf $HOME/ckpts - - name: Running gsm8k e2e qwen3 MoE training tests with DAPO MindSpeed on ASCEND NPU + - name: Running gsm8k e2e training tests with DAPO on ASCEND NPU (FSDP backend) run: | ray stop --force - export PYTHONPATH=$PYTHONPATH:/Megatron-LM - USE_DIST_CKPT=True USE_DUMMY_MODEL=True DUMMY_MODEL_CONFIG_PATH=tests/special_e2e/ppo_trainer/expert_parallel/qwen3moe_minimal.json DUMMY_MODEL_PATH=$HOME/dist_ckpt/qwen3_30b_dapo_mindspeed bash tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh - - name: Running gsm8k e2e training tests with GRPO MindSpeed on ASCEND NPU + bash tests/special_npu/run_qwen2_5_05b_dapo.sh + rm -rf $HOME/ckpts + - name: Running gsm8k e2e training tests with GRPO on ASCEND NPU (MindSpeed backend) run: | ray stop --force export PYTHONPATH=$PYTHONPATH:/Megatron-LM USE_DIST_CKPT=True bash tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh rm -rf $HOME/dist_ckpt/qwen2_5_05b_grpo_mindspeed rm -rf $HOME/ckpts - - name: Running NPU profiling unit tests + - name: Running gsm8k e2e training tests with DAPO on ASCEND NPU (MindSpeed backend, MoE Model) run: | ray stop --force - pytest -s -x tests/utils/test_special_mstx_profile.py + export PYTHONPATH=$PYTHONPATH:/Megatron-LM + USE_DIST_CKPT=True USE_DUMMY_MODEL=True 
DUMMY_MODEL_CONFIG_PATH=tests/special_e2e/ppo_trainer/expert_parallel/qwen3moe_minimal.json DUMMY_MODEL_PATH=$HOME/dist_ckpt/qwen3_30b_dapo_mindspeed bash tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh + + vlm_rl_job: + if: github.repository_owner == 'volcengine' + name: E2E Ascend testing for RL training scenarios of VLM models + runs-on: linux-aarch64-a2-8 + timeout-minutes: 60 + container: + image: swr.ap-southeast-1.myhuaweicloud.com/base_image/ascend-ci/verl/verl:verl-8.3.rc1-910b-ubuntu22.04-py3.11-latest + options: >- + --shm-size 16g + env: + HF_ENDPOINT: "https://hf-mirror.com" + HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable + steps: + - name: Config third-party dependency download cache + run: | + sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list + pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple + pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local + - name: Check npu and CANN info + run: | + cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info + npu-smi info + - name: Check initial pip list from image + run: | + pip list + - name: Checkout volcengine/verl repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + clean: true + - name: Install the current repository + run: | + pip install -r requirements-npu.txt + pip install -e . 
+ - name: Check final pip list + run: | + pip list + - name: Preprocess geo3k dataset + run: | + python examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/.cache/datasets/hiyouga/geometry3k + - name: Running geo3k e2e training tests with GRPO on ASCEND NPU + run: | + ray stop --force + bash tests/special_npu/run_qwen2_5_vl_3b_npu.sh + rm -rf $HOME/ckpts diff --git a/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml b/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml index 5e81bc48e96..df049bb0871 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml @@ -115,6 +115,7 @@ jobs: NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable + ENGINE: sglang steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: @@ -135,11 +136,6 @@ jobs: export VLLM_USE_V1=1 ray start --head ENGINE=sglang MODE=async RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: Test Megatron checkpoints merging function (DeepSeek Actor and Critic) - run: | - exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal" - python -m verl.model_merger test --backend megatron --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface - python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface - name: Profiling GRPO GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Deepseek) run: | ray stop --force @@ -154,39 +150,8 @@ jobs: run: | rm -rf checkpoints - e2e_ppo_trainer_megatron-different-train-infer-tp-qwen-tie-embedding: - needs: setup - runs-on: 
["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - - name: Prepare GSM8K dataset - run: | - python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with tie-embedding Megatron (Qwen) with train tp > infer tp - run: | - ray stop --force - ENGINE=sglang VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=2 INFER_TP=1 MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) with train tp < infer tp - run: | - ray stop --force - ENGINE=sglang VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=1 INFER_TP=2 MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: clean up - run: | - rm -rf checkpoints - - e2e_ppo_trainer_megatron-qwen-override-transformer-config: + # Qwen3-0.6B: dense, tie_word_embeddings=True + e2e_ppo_trainer_megatron-qwen3: needs: setup runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 60 # Increase this timeout value as needed @@ -196,6 +161,7 @@ jobs: NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable + ENGINE: sglang steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: @@ -206,57 +172,15 @@ jobs: - name: Prepare GSM8K dataset run: | python3 
examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k -# - name: Download Model to Use -# run: | -# huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir ${HOME}/models/Qwen/Qwen2.5-0.5B -# export HF_HUB_OFFLINE=1 - - name: Prepare dist_ckpt of Qwen2.5-0.5B, uneven layer distribution only supports dist_ckpt - run: | - python3 scripts/converter_hf_to_mcore.py --hf_model_path ${HOME}/models/Qwen/Qwen2.5-0.5B --output_path checkpoints/verl-test/qwen2.5-0.5b-megatron - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) + - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen3) testing learning rate scheduler run: | ray stop --force - ENGINE=sglang SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 SKIP_SAVE_HF_MODEL=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage=8 +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=4 actor_rollout_ref.actor.megatron.use_dist_checkpointing=true actor_rollout_ref.actor.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron actor_rollout_ref.ref.megatron.use_dist_checkpointing=true actor_rollout_ref.ref.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron critic.megatron.use_dist_checkpointing=true critic.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron reward_model.megatron.use_dist_checkpointing=true reward_model.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron - cp -r checkpoints checkpoints-dut - ENGINE=sglang SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: Test Megatron checkpoints merging function (Qwen Actor and Critic) - run: | - exp_name="qwen2.5-0.5b-megatron-gsm8k-minimal" - python -m verl.model_merger 
test --backend megatron --tie-word-embedding --local_dir checkpoints-dut/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface - python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints-dut/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface - - name: clean up - run: | - rm -rf checkpoints - - e2e_ppo_trainer_megatron-deepseek-override-transformer-config: - needs: setup - runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - - name: Prepare GSM8K dataset - run: | - python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek) + ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh + - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with FP8 rollout run: | ray stop --force - ENGINE=sglang SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct COMMON_PP=2 COMMON_VPP=null bash tests/special_e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=true 
+actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=true - - name: Test Megatron checkpoints merging function (DeepSeek Actor and Critic) - run: | - exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal" - python -m verl.model_merger test --backend megatron --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface - python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface + export VLLM_USE_V1=1 + ROLLOUT_QUANTIZATION=fp8 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh - name: clean up run: | rm -rf checkpoints @@ -267,9 +191,7 @@ jobs: [ setup, e2e_ppo_trainer_megatron-deepseek, - e2e_ppo_trainer_megatron-different-train-infer-tp-qwen-tie-embedding, - e2e_ppo_trainer_megatron-qwen-override-transformer-config, - e2e_ppo_trainer_megatron-deepseek-override-transformer-config, + e2e_ppo_trainer_megatron-qwen3, ] if: always() steps: diff --git a/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml b/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml index d5e5efad222..e738fde2f8b 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml @@ -105,38 +105,7 @@ jobs: faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}" mlp-image: "${{ env.IMAGE }}" - e2e_ppo_trainer_megatron-qwen2_5vl-3b: - needs: setup - runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - 
steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - - name: Prepare Geo3k dataset - run: | - python3 examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/models/hf_data/hiyouga/geometry3k/ - - name: Prepare dist_ckpt of Qwen2.5-VL-3B, only supports dist_ckpt - run: | - python3 scripts/converter_hf_to_mcore.py --hf_model_path ${HOME}/models/Qwen/Qwen2.5-VL-3B-Instruct --output_path checkpoints/verl-test/qwen2.5-vl-3b-megatron - - name: Running Geo3k E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) - run: | - ray stop --force - ENGINE=sglang ROLLOUT_MODE=async TRAIN_FILES=${HOME}/data/geo3k/train.parquet VAL_FILES=${HOME}/data/geo3k/test.parquet MAX_PROMPT_LENGTH=1024 MAX_RESPONSE_LENGTH=2048 MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct ADV_ESTIMATOR=grpo USE_DYNAMIC_BSZ=False SKIP_SAVE_HF_MODEL=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 COMMON_TP=2 USE_DIST_CKPT=true DIST_CKPT_PATH=checkpoints/verl-test/qwen2.5-vl-3b-megatron bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: clean up - run: | - rm -rf checkpoints - - e2e_ppo_trainer_sglang: + e2e_ppo_trainer_fsdp_sglang: needs: setup runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ] timeout-minutes: 40 # Increase this timeout value as needed @@ -161,12 +130,8 @@ jobs: run: | ray stop --force ENGINE=sglang bash tests/special_e2e/ppo_trainer/run_function_reward.sh - - name: Running GSM8K E2E training tests on sglang async - run: | - ray stop --force - TOTAL_TRAIN_STEPS=2 ENGINE=sglang ROLLOUT_MODE=async bash tests/special_e2e/ppo_trainer/run_function_reward.sh - e2e_ppo_trainer_sglang_vlm: + e2e_ppo_trainer_fsdp-qwen2_5vl-3b: needs: setup runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ] timeout-minutes: 60 # Increase this timeout value as needed @@ -220,42 +185,13 @@ jobs: ACTOR_FSDP_OPTIMIZER_OFFLOAD=True 
REF_FSDP_PARAM_OFFLOAD=True \ bash tests/special_e2e/ppo_trainer/run_function_reward.sh - e2e_ppo_trainer_megatron-sglang-fp8: - needs: setup - runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - - name: Prepare GSM8K dataset - run: | - python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - - name: Running GSM8K E2E training tests on 8 L20 GPUs with SGLang (FP8) - run: | - ray stop --force - ENGINE=sglang ROLLOUT_QUANTIZATION=fp8 ROLLOUT_MODE=async TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: clean up - run: | - rm -rf checkpoints - cleanup: runs-on: ubuntu-latest needs: [ setup, - e2e_ppo_trainer_megatron-qwen2_5vl-3b, - e2e_ppo_trainer_sglang, - e2e_ppo_trainer_sglang_vlm + e2e_ppo_trainer_fsdp-qwen2_5vl-3b, + e2e_ppo_trainer_fsdp_sglang, ] if: always() steps: diff --git a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml index 58554246336..f329ae9b7aa 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml @@ -105,6 +105,7 @@ jobs: faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}" mlp-image: "${{ env.IMAGE }}" + # deepseek-ai/deepseek-coder-1.3b-instruct: dense, tie_word_embeddings=False e2e_ppo_trainer_megatron-deepseek: needs: setup runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] @@ -127,6 +128,7 @@ jobs: - name: Prepare GSM8K dataset 
run: | python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k + # Full training save&load - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron, use mbridge e2e to pre-load and save (Deepseek) run: | ray stop --force @@ -137,11 +139,12 @@ jobs: ray stop --force RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 USE_MBRIDGE=True USE_DIST_CKPT=False \ bash tests/special_e2e/run_ppo_trainer_megatron.sh + # LoRA training save&load - name: clean up and install Megatron-Bridge run: | rm -rf checkpoints - pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@af21db0 --no-deps --no-build-isolation - pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@3cbe5c6 --no-deps --no-build-isolation + pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@a489bed --no-deps --no-build-isolation + pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@2d398b4 --no-deps --no-build-isolation pip3 install "nvidia-modelopt[torch]>=0.37.0" transformers==4.57.1 - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron, use Megatron-Bridge LoRA e2e to pre-load and save (Deepseek) run: | @@ -156,28 +159,8 @@ jobs: - name: clean up run: | rm -rf checkpoints - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek) - run: | - ray stop --force - export VLLM_USE_V1=1 - ray start --head - MODE=async USE_FUSED_KERNELS=True MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 SAVE_FREQ=2 bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: Test Megatron checkpoints merging function (DeepSeek Actor and Critic) - run: | - exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal" - python -m verl.model_merger test --backend megatron --local_dir checkpoints/verl-test/${exp_name}/global_step_2/actor 
--test_hf_dir checkpoints/verl-test/${exp_name}/global_step_2/actor/huggingface - python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints/verl-test/${exp_name}/global_step_2/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_2/critic/huggingface - - name: Test Megatron distributed checkpoints merging function (DeepSeek) - run: | - exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal" - torchrun --nproc_per_node 4 --nnodes 1 -m verl.model_merger merge --backend megatron --local_dir checkpoints/verl-test/${exp_name}/global_step_2/actor --target_dir checkpoints/verl-test/${exp_name}/global_step_2/actor/hf_model - - name: Running GRPO GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Deepseek) - run: | - ray stop --force - ADV_ESTIMATOR=grpo USE_DYNAMIC_BSZ=False MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: clean up - run: | - rm -rf checkpoints + + # Qwen3-0.6B: dense, tie_word_embeddings=True e2e_ppo_trainer_megatron-qwen3: needs: setup runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] @@ -199,100 +182,15 @@ jobs: - name: Prepare GSM8K dataset run: | python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen3) with validation and saving - run: | - ray stop --force - ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen3) testing learning rate scheduler run: | ray stop --force - LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh - - - name: Test Megatron checkpoints merging function (Qwen3 Actor and Critic) - run: | - 
exp_name="qwen3-0.6b-megatron-gsm8k-minimal" - python -m verl.model_merger test --backend megatron --tie-word-embedding --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface - python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface + ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with FP8 rollout run: | ray stop --force export VLLM_USE_V1=1 - ROLLOUT_QUANTIZATION=fp8 ROLLOUT_MODE=async TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: clean up - run: | - rm -rf checkpoints - e2e_ppo_trainer_megatron-different-train-infer-tp-qwen-tie-embedding: - needs: setup - runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - pip3 install math-verify transformers==$TRANSFORMERS_VERSION - - name: Prepare GSM8K dataset - run: | - python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with tie-embedding Megatron (Qwen) with train tp > infer tp - run: | - ray stop --force - 
VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=2 INFER_TP=1 MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) with train tp < infer tp - run: | - ray stop --force - VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=1 INFER_TP=2 ALL_OFFLOAD=True MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: clean up - run: | - rm -rf checkpoints - e2e_ppo_trainer_megatron-qwen-override-transformer-config: - needs: setup - runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - pip3 install math-verify transformers==$TRANSFORMERS_VERSION - - name: Prepare GSM8K dataset - run: | - python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k -# - name: Download Model to Use -# run: | -# huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir ${HOME}/models/Qwen/Qwen2.5-0.5B -# export HF_HUB_OFFLINE=1 - - name: Prepare dist_ckpt of Qwen2.5-0.5B, uneven layer distribution only supports dist_ckpt - run: | - python3 scripts/converter_hf_to_mcore.py --hf_model_path ${HOME}/models/Qwen/Qwen2.5-0.5B --output_path checkpoints/verl-test/qwen2.5-0.5b-megatron - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) - run: | - ray stop --force - SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 SKIP_SAVE_HF_MODEL=1 USE_DIST_CKPT=True 
DIST_CKPT_PATH=checkpoints/verl-test/qwen2.5-0.5b-megatron \ - bash tests/special_e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage=8 +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=4 - cp -r checkpoints checkpoints-dut - SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: Test Megatron checkpoints merging function (Qwen Actor and Critic) - run: | - exp_name="qwen2.5-0.5b-megatron-gsm8k-minimal" - python -m verl.model_merger test --backend megatron --tie-word-embedding --local_dir checkpoints-dut/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface - python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints-dut/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface + ROLLOUT_QUANTIZATION=fp8 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh - name: clean up run: | rm -rf checkpoints @@ -304,8 +202,6 @@ jobs: setup, e2e_ppo_trainer_megatron-deepseek, e2e_ppo_trainer_megatron-qwen3, - e2e_ppo_trainer_megatron-different-train-infer-tp-qwen-tie-embedding, - e2e_ppo_trainer_megatron-qwen-override-transformer-config, ] if: always() steps: diff --git a/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml b/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml index 8908718f144..a35756dd224 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml @@ -105,39 +105,6 @@ jobs: faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}" mlp-image: "${{ env.IMAGE }}" - e2e_ppo_trainer_megatron-deepseek-override-transformer-config: - needs: setup - runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - 
timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - pip3 install transformers==$TRANSFORMERS_VERSION - - name: Prepare GSM8K dataset - run: | - python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek) - run: | - ray stop --force - SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct COMMON_PP=2 COMMON_VPP=null bash tests/special_e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=true +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=true - - name: Test Megatron checkpoints merging function (DeepSeek Actor and Critic) - run: | - exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal" - python -m verl.model_merger test --backend megatron --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface - python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface - - name: clean up - run: | - rm -rf checkpoints e2e_ppo_trainer_megatron-moe-expert-parallel: needs: setup runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] @@ -155,8 +122,8 @@ jobs: - name: Install the current repository run: | pip3 install --no-deps -e 
.[test] - pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@af21db0 --no-deps --no-build-isolation - pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@3cbe5c6 --no-deps --no-build-isolation + pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@a489bed --no-deps --no-build-isolation + pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@2d398b4 --no-deps --no-build-isolation pip3 install "nvidia-modelopt[torch]>=0.37.0" transformers==4.57.1 - name: Prepare GSM8K dataset run: | @@ -185,42 +152,8 @@ jobs: - name: clean up run: | rm -rf checkpoints - e2e_ppo_trainer_megatron-qwen2_5vl-3b: - needs: setup - runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - pip3 install transformers==$TRANSFORMERS_VERSION - - name: Prepare Geo3k dataset - run: | - python3 examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/models/hf_data/hiyouga/geometry3k/ - - name: Prepare dist_ckpt of Qwen2.5-VL-3B, only supports dist_ckpt - run: | - python3 scripts/converter_hf_to_mcore.py --hf_model_path ${HOME}/models/Qwen/Qwen2.5-VL-3B-Instruct --output_path checkpoints/verl-test/qwen2.5-vl-3b-megatron - - name: Running Geo3k E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) - run: | - ray stop --force - TRAIN_FILES=${HOME}/data/geo3k/train.parquet VAL_FILES=${HOME}/data/geo3k/test.parquet \ - MAX_PROMPT_LENGTH=1024 MAX_RESPONSE_LENGTH=2048 MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct ADV_ESTIMATOR=grpo \ - 
USE_DYNAMIC_BSZ=False USE_FUSED_KERNELS=True SKIP_SAVE_HF_MODEL=1 \ - COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 COMMON_TP=2 USE_DIST_CKPT=true \ - DIST_CKPT_PATH=checkpoints/verl-test/qwen2.5-vl-3b-megatron bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: clean up - run: | - rm -rf checkpoints - e2e_ppo_trainer_vllm: + + e2e_ppo_trainer_fsdp_vllm: needs: setup runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ] timeout-minutes: 60 # Increase this timeout value as needed @@ -242,16 +175,6 @@ jobs: run: | ray stop --force python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - # HF sanity -# - name: Running GSM8K E2E training tests on 1 L20 GPU with hf for sanity -# run: | -# ray stop --force -# bash tests/special_e2e/ppo_trainer/run_single_gpu.sh -# # HF sanity -# - name: Running GSM8K E2E training tests on 1 L20 GPU with engine interface for sanity. -# run: | -# ray stop --force -# bash tests/special_e2e/ppo_trainer/run_single_gpu_with_engine.sh # Function RM - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with validation and saving (FSDP_SIZE=8) run: | @@ -268,7 +191,7 @@ jobs: - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with validation and saving (DDP_SIZE=2, FSDP_SIZE=4) run: | ray stop --force - VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True FSDP_SIZE=4 VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-ddp-size2-fsdp-size4" bash tests/special_e2e/ppo_trainer/run_function_reward.sh + VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True FSDP_SIZE=4 USE_KL=True VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-ddp-size2-fsdp-size4" bash tests/special_e2e/ppo_trainer/run_function_reward.sh - name: Test merging DDP+FSDP checkpoints (Qwen Actor) run: | exp_name="qwen2.5-0.5b-function-reward-minimal-ddp-size2-fsdp-size4" @@ -288,19 +211,11 @@ jobs: - name: Running GSM8K E2E training tests on 8 L20 
GPUs with rmpad using function rm (GRPO) run: | ray stop --force - ADV_ESTIMATOR=grpo USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh + CUSTOM_REWARD_FN=True ADV_ESTIMATOR=grpo USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh # - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm (ReMax) # run: | # ray stop --force # ADV_ESTIMATOR=remax USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh - - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using customized reward function - run: | - ray stop --force - CUSTOM_REWARD_FN=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh - - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with in-reward kl and kl loss - run: | - ray stop --force - USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh # LoRA tests - name: Running GSM8K E2E training tests on 8 L20 GPUs with grpo lora using function rm with use_shm run: | @@ -320,47 +235,8 @@ jobs: run: | ray stop --force ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True STRATEGY=fsdp2 bash tests/special_e2e/ppo_trainer/run_function_reward.sh - # Model RM - - name: Running GRPO GSM8K E2E training tests with FSDP on 8 L20 GPUs (DeepSeek) - run: | - ray stop --force - MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/special_e2e/ppo_trainer/run_function_reward.sh - - name: Running GSM8K E2E with rmpad using model rm - run: | - ray stop --force - bash tests/special_e2e/ppo_trainer/run_model_reward.sh - - name: Running GSM8K E2E without rmpad using model rm - run: | - ray stop --force - RM_PAD=False bash tests/special_e2e/ppo_trainer/run_model_reward.sh - - name: Running GSM8K E2E with rmpad using model rm and ulysses sp=2 - run: | - ray stop --force - SP_SIZE=2 bash tests/special_e2e/ppo_trainer/run_model_reward.sh - - name: Running GSM8K E2E with rmpad using model rm and 
dynamic batch size - run: | - ray stop --force - SEQ_BALANCE=True bash tests/special_e2e/ppo_trainer/run_model_reward.sh - - name: Running GSM8K E2E with rmpad using model rm with Liger Kernel enabled - run: | - ray stop --force - LIGER=True bash tests/special_e2e/ppo_trainer/run_model_reward.sh - - name: Running GSM8K E2E with rmpad using model rm with Fused Kernel enabled - run: | - ray stop --force - FUSED_KERNELS=True bash tests/special_e2e/ppo_trainer/run_model_reward.sh - - name: Running GSM8K E2E with rmpad using model rm with Fused Kernel enabled - run: | - ray stop --force - FUSED_KERNEL=True FUSED_KERNEL_BACKEND=triton bash tests/special_e2e/ppo_trainer/run_model_reward.sh - - name: Running GSM8K E2E training tests on vllm async - run: | - ray stop --force - export VLLM_USE_V1=1 - ray start --head - TOTAL_TRAIN_STEPS=2 ENGINE=vllm ROLLOUT_MODE=async bash tests/special_e2e/ppo_trainer/run_function_reward.sh - e2e_ppo_trainer_vllm_vlm: + e2e_ppo_trainer_fsdp-qwen2_5vl-3b: needs: setup runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ] timeout-minutes: 40 # Increase this timeout value as needed @@ -417,10 +293,9 @@ jobs: needs: [ setup, - e2e_ppo_trainer_megatron-deepseek-override-transformer-config, - e2e_ppo_trainer_megatron-qwen2_5vl-3b, - e2e_ppo_trainer_vllm, - e2e_ppo_trainer_vllm_vlm + e2e_ppo_trainer_megatron-moe-expert-parallel, + e2e_ppo_trainer_fsdp-qwen2_5vl-3b, + e2e_ppo_trainer_fsdp_vllm, ] if: always() steps: diff --git a/.github/workflows/e2e_sft.yml b/.github/workflows/e2e_sft.yml index 80f73b76d8f..64d55a185a6 100644 --- a/.github/workflows/e2e_sft.yml +++ b/.github/workflows/e2e_sft.yml @@ -91,7 +91,7 @@ jobs: e2e_sft: needs: setup runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 30 # Increase this timeout value as needed + timeout-minutes: 40 # Increase this timeout value as needed env: HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} @@ -146,7 +146,13 @@ jobs: 
- name: Running GSM8K E2E training tests with multiturn and various configs and compare results run: | bash tests/special_e2e/sft/test_sft_engine_all.sh - + - name: Prepare pokemon-gpt4o-captions dataset + run: | + ray stop --force + python3 examples/data_preprocess/pokemon.py --local_dataset_path ${HOME}/models/hf_data/pokemon-gpt4o-captions + - name: Running Pokemon E2E training tests with multiturn and various configs and compare results + run: | + MODEL_ID=Qwen/Qwen3-VL-2B-Instruct DATASET_DIR=~/data/pokemon-gpt4o-captions VPP_SIZE=null bash tests/special_e2e/sft/test_sft_engine_all.sh cleanup: runs-on: ubuntu-latest diff --git a/.github/workflows/e2e_transferqueue.yml b/.github/workflows/e2e_transferqueue.yml index da5443f43aa..1abefc14be1 100644 --- a/.github/workflows/e2e_transferqueue.yml +++ b/.github/workflows/e2e_transferqueue.yml @@ -124,13 +124,14 @@ jobs: run: | pip3 install --no-deps -e .[test,gpu] pip3 install transformers==$TRANSFORMERS_VERSION - pip3 install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple TransferQueue==0.1.2.dev0 + pip3 install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple TransferQueue==0.1.4.dev1 - name: Prepare GSM8K dataset run: | python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - - name: Running the E2E test with TransferQueue (FSDP) + - name: Running the E2E test with TransferQueue (FSDP), enable zero copy serialization run: | ray stop --force + export TQ_ZERO_COPY_SERIALIZATION=True bash tests/special_e2e/run_transferqueue.sh # Test Megatron strategy @@ -153,13 +154,14 @@ jobs: run: | pip3 install --no-deps -e .[test,gpu] pip3 install transformers==$TRANSFORMERS_VERSION - pip3 install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple TransferQueue==0.1.2.dev0 + pip3 install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple TransferQueue==0.1.4.dev1 - name: Prepare GSM8K dataset run: | python3 examples/data_preprocess/gsm8k.py --local_dataset_path 
${HOME}/models/hf_data/gsm8k - - name: Running the E2E test with TransferQueue (Megatron) + - name: Running the E2E test with TransferQueue (Megatron), disable zero copy serialization run: | ray stop --force + export TQ_ZERO_COPY_SERIALIZATION=False bash tests/special_e2e/run_transferqueue.sh cleanup: diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml index cab35a68d96..c9f1f2deac2 100644 --- a/.github/workflows/model.yml +++ b/.github/workflows/model.yml @@ -48,7 +48,6 @@ on: # Entrypoints - ".github/workflows/model.yml" - "tests/special_distributed/test_fsdp_ckpt.py" - - "tests/special_distributed/test_mcore_config_converter.py" - "tests/special_distributed/test_tensor_dict.py" - "tests/models/**" - "tests/special_distributed/run_all.sh" @@ -144,34 +143,6 @@ jobs: run: | STRATEGY=fsdp2 torchrun --nproc_per_node=8 tests/special_distributed/test_fsdp_ckpt.py - mcore_config_converter: - needs: setup - runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ] - timeout-minutes: 20 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install -e .[test] -# - name: Download model config files -# run: | -# hf download Qwen/Qwen2.5-7B config.json --local-dir $HOME/configs/Qwen/Qwen2.5-7B -# hf download Qwen/Qwen3-8B config.json --local-dir $HOME/configs/Qwen/Qwen3-8B -# hf download deepseek-ai/deepseek-coder-1.3b-instruct config.json --local-dir $HOME/configs/deepseek-ai/deepseek-coder-1.3b-instruct -# hf download Qwen/Qwen2-57B-A14B config.json --local-dir $HOME/configs/Qwen/Qwen2-57B-A14B -# hf download Qwen/Qwen3-30B-A3B config.json --local-dir 
$HOME/configs/Qwen/Qwen3-30B-A3B -# hf download deepseek-ai/DeepSeek-V3-Base config.json --local-dir $HOME/configs/deepseek-ai/DeepSeek-V3-Base - - name: Running mcore config converter tests on 8 L20 GPUs - run: | - torchrun --nproc_per_node=8 tests/special_distributed/test_mcore_config_converter.py model_engine: needs: setup @@ -206,7 +177,6 @@ jobs: setup, model_rmpad, model_rmpad_fsdp2_unstable, - mcore_config_converter, model_engine ] if: always() diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index 2520968a318..fd9349683e8 100644 --- a/.github/workflows/vllm.yml +++ b/.github/workflows/vllm.yml @@ -124,6 +124,9 @@ jobs: - name: Test the latest vLLM Rollout async with agent loop run: | ROLLOUT_NAME=vllm pytest -svvv tests/experimental/agent_loop + - name: Test vllm server abort functionality + run: | + pytest tests/workers/rollout/rollout_vllm/test_vllm_abort.py -v -s # Note(haibin.lin): for any new test, please update gpu_unit_tests.yaml to avoid repeated tests cleanup: diff --git a/docker/ascend/Dockerfile.ascend_8.3.rc1_a2 b/docker/ascend/Dockerfile.ascend_8.3.rc1_a2 index cd9fa59502c..200e7a05f35 100644 --- a/docker/ascend/Dockerfile.ascend_8.3.rc1_a2 +++ b/docker/ascend/Dockerfile.ascend_8.3.rc1_a2 @@ -44,6 +44,8 @@ RUN ARCH=$(uname -m) && \ echo "export PYTHONPATH=\$PYTHONPATH:/Megatron-LM" >> ~/.bashrc && \ # Remove existing triton or triton-ascend installed by some third-party packages pip uninstall -y triton triton-ascend && \ + # Install mbridge + pip install mbridge && \ # Clear extra files rm -rf /tmp/* /var/tmp/* && \ pip cache purge diff --git a/docker/ascend/Dockerfile.ascend_8.3.rc1_a3 b/docker/ascend/Dockerfile.ascend_8.3.rc1_a3 index 7b62a48b9e2..bbf7de87bbe 100644 --- a/docker/ascend/Dockerfile.ascend_8.3.rc1_a3 +++ b/docker/ascend/Dockerfile.ascend_8.3.rc1_a3 @@ -44,6 +44,8 @@ RUN ARCH=$(uname -m) && \ echo "export PYTHONPATH=\$PYTHONPATH:/Megatron-LM" >> ~/.bashrc && \ # Remove existing triton or triton-ascend 
installed by some third-party packages pip uninstall -y triton triton-ascend && \ + # Install mbridge + pip install mbridge && \ # Clear extra files rm -rf /tmp/* /var/tmp/* && \ pip cache purge diff --git a/docs/advance/checkpoint.rst b/docs/advance/checkpoint.rst index 56bec4a75c3..9782af951d9 100644 --- a/docs/advance/checkpoint.rst +++ b/docs/advance/checkpoint.rst @@ -137,32 +137,8 @@ Current implementation use solution 2. HuggingFace to Megatron DistCheckpoint details ---------------------------------------------- -If your model is quite huge, we recommend you to use Megatron dist-checkpoint to load the model. -Megatron dist-checkpoint supports loading with different kinds of model parallelism, -and it is much faster than the original checkpoint loading. - -To convert original HuggingFace model to Megatron dist-checkpoint, -you can use the ``scripts/converter_hf_to_mcore.py`` script. Large MoE models are temporarily supported with CPU initialization, -which is a little slower. While we are working on a better solution to support large models. - -Example command to convert the model is as follows: - -.. code:: bash - - python scripts/converter_hf_to_mcore.py \ - --hf_model_path Qwen/Qwen1.5-MoE-A2.7B-Chat \ - --output_path /mnt/disk/Qwen/Qwen1.5-MoE-A2.7B-Chat \ - --use_cpu_initialization # Only work for MoE models - - -Example command to distributed convert the huge model like deepseekv3 671B is as follows: - -.. code:: bash - - torchrun --nproc_per_node 1 --nnodes 8 --node_rank ${RANK} scripts/converter_hf_to_mcore.py \ - --hf_model_path deepseek-ai/DeepSeek-V3 \ - --output_path /mnt/disk/deepseek-ai/DeepSeek-V3 \ - --use_cpu_initialization # Only work for MoE models +Through ``mbridge``, we can directly save the mcore model to huggingface format during training. +No need to convert the model to Megatron dist-checkpoint format. 
Original Checkpoint Utils ------------------------- diff --git a/docs/advance/one_step_off.md b/docs/advance/one_step_off.md index 9ab644be688..d8861534343 100644 --- a/docs/advance/one_step_off.md +++ b/docs/advance/one_step_off.md @@ -225,7 +225,7 @@ def sync_rollout_weights(self): ### PPO Correctness To ensure the correctness of the PPO algorithm, we use rollout log_probs for PPO importance sampling. For the related algorithm details, please refer to: https://verl.readthedocs.io/en/latest/algo/rollout_corr_math.html -The default mode is ppo_is_bypass, but other modification strategies can also be explored. +The default mode is `bypass_ppo_clip`, but other modification strategies can also be explored. ### AgentLoop In the current implementation, we no longer provide SPMD model rollout mode. diff --git a/docs/algo/rollout_corr.md b/docs/algo/rollout_corr.md index da9512d6706..a2421e238c3 100644 --- a/docs/algo/rollout_corr.md +++ b/docs/algo/rollout_corr.md @@ -130,14 +130,15 @@ config = RolloutCorrectionConfig.decoupled_geo_rs() # Geo-RS config = RolloutCorrectionConfig.geo_rs_seq_tis() # Geo-RS-Seq-TIS # === Bypass PPO mode (2 policies: π_rollout = π_old, π_θ) - fast === -# No IS correction needed since π_old = π_rollout -config = RolloutCorrectionConfig.ppo_is_bypass() # PPO with rollout as anchor +# PPO ratio handles IS, so no explicit IS weights needed +config = RolloutCorrectionConfig.bypass_ppo_clip() # PPO-clip only +config = RolloutCorrectionConfig.bypass_ppo_clip_geo_rs() # PPO-clip + Geo-RS # === Bypass PG mode (2 policies, no PPO clipping) - fast === # IS weights computed on-the-fly as π_θ / π_rollout -config = RolloutCorrectionConfig.pg_is() # Seq-TIS + PG -config = RolloutCorrectionConfig.pg_rs() # Geo-RS + PG -config = RolloutCorrectionConfig.pg_geo_rs_seq_tis() # Geo-RS-Seq-TIS + PG +config = RolloutCorrectionConfig.bypass_pg_is() # Seq-TIS + PG +config = RolloutCorrectionConfig.bypass_pg_rs() # Geo-RS + PG +config = 
RolloutCorrectionConfig.bypass_pg_geo_rs_seq_tis() # Geo-RS-Seq-TIS + PG # === Other === config = RolloutCorrectionConfig.disabled() # Metrics only (no correction) @@ -157,8 +158,8 @@ algorithm: rollout_rs_threshold: null # RS upper threshold (required if rollout_rs is enabled) rollout_rs_threshold_lower: null # RS lower threshold (auto-reciprocal if null) rollout_token_veto_threshold: null # Per-token veto threshold (null = disabled) - bypass_mode: false # Skip old_log_prob computation - use_policy_gradient: false # Use policy gradient loss (vs PPO loss) + bypass_mode: false # Skip old_log_prob computation (sets π_old = π_rollout) + loss_type: ppo_clip # Loss type in bypass mode: "ppo_clip" (default) or "reinforce" # REQUIRED: Enable log prob calculation actor_rollout_ref: @@ -171,7 +172,7 @@ actor_rollout_ref: ### **Core Implementation** - `verl/trainer/ppo/rollout_corr_helper.py` - Contains `compute_rollout_correction_and_rejection_mask()` and `compute_offpolicy_metrics()` -- `verl/trainer/ppo/core_algos.py` - Rollout Correction integration with PPO and pure IS mode (`compute_policy_loss_with_rollout_correction()`) +- `verl/trainer/ppo/core_algos.py` - Rollout Correction integration with PPO and REINFORCE modes (`compute_policy_loss_bypass_mode()`, `compute_policy_loss_reinforce()`) - `verl/trainer/ppo/ray_trainer.py` - Bypass mode implementation (skips `old_log_prob` computation) - `verl/workers/actor/dp_actor.py` - Mode selection logic and metrics collection @@ -266,9 +267,9 @@ The rollout correction framework is built from **orthogonal components** that ca - **Decoupled**: Three policies (π_rollout, π_old, π_θ) with separate π_old computation - **Bypass**: Two policies (π_rollout = π_old, π_θ), skips π_old computation -2. **Loss Function** - - **PPO**: With clipping (standard RL training) - - **Pure IS**: Policy gradient only (no clipping) +2. 
**Loss Function** (in bypass mode, controlled by `loss_type`) + - **PPO-clip** (`loss_type="ppo_clip"`, default): PPO clipped objective (IS handled by ratio) + - **REINFORCE** (`loss_type="reinforce"`): Policy gradient with explicit IS weights (no clipping) 3. **IS/RS Aggregation Level** - **Token**: Per-token IS weights/rejection @@ -298,19 +299,22 @@ This section provides detailed guidance on choosing and using the verified prese | `decoupled_seq_is_rs()` | Seq-MIS | Decoupled | sequence | sequence | Sequence IS + sequence RS | | `decoupled_geo_rs()` | Geo-RS | Decoupled | - | geometric + veto | Geometric RS + veto, no IS weights | | `geo_rs_seq_tis()` | Geo-RS-Seq-TIS | Decoupled | sequence | geometric + veto | Geometric filter + clipped weight | -| **Bypass PPO Mode** (2 policies: π_rollout = π_old, π_θ) | -| `ppo_is_bypass()` | - | Bypass PPO | - | - | PPO with rollout as anchor (no IS correction needed) | -| **Bypass PG Mode** (2 policies: π_rollout, π_θ; IS = π_θ/π_rollout) | -| `pg_is()` | Seq-TIS | Bypass PG | sequence | - | Policy gradient with IS | -| `pg_rs()` | Geo-RS | Bypass PG | - | geometric + veto | Policy gradient with Geo-RS | -| `pg_geo_rs_seq_tis()` | Geo-RS-Seq-TIS | Bypass PG | sequence | geometric + veto | PG + Geo filter + seq IS | +| **Bypass Mode (PPO-clip)** (2 policies; ratio handles IS, RS masks outliers) | +| `bypass_ppo_clip()` | - | Bypass (PPO-clip) | - | - | PPO-clip only | +| `bypass_ppo_clip_geo_rs()` | Geo-RS | Bypass (PPO-clip) | - | geometric + veto | PPO-clip + Geo-RS | +| **Bypass Mode (REINFORCE)** (2 policies; explicit IS weights, no PPO clipping) | +| `bypass_pg_is()` | Seq-TIS | Bypass (REINFORCE) | sequence | - | REINFORCE with explicit IS | +| `bypass_pg_rs()` | Geo-RS | Bypass (REINFORCE) | - | geometric + veto | REINFORCE with Geo-RS | +| `bypass_pg_geo_rs_seq_tis()` | Geo-RS-Seq-TIS | Bypass (REINFORCE) | sequence | geometric + veto | REINFORCE + Geo filter + seq IS | | **Other** | | `disabled()` | - | - | - | - | 
Metrics only, no correction | **Note:** -- **Bypass PPO mode** sets π_old = π_rollout, so IS correction is not applicable (the ratio would be 1.0). -- **Bypass PG mode** computes IS weights as π_θ / π_rollout on-the-fly - use this for fast execution with IS/RS correction. -- Estimators (Token-TIS, Seq-TIS, Seq-MIS, Geo-RS, Geo-RS-Seq-TIS) are compatible with Decoupled PPO and Bypass PG modes. +- **Bypass mode** sets π_old = π_rollout and uses `loss_type` to select the loss function: + - `"ppo_clip"` (default): PPO clipped objective where ratio = π_θ/π_rollout already handles IS + - `"reinforce"`: REINFORCE with explicit IS weights as π_θ / π_rollout +- Both loss types benefit from rejection sampling (RS) which masks out-of-distribution samples. +- Estimators (Token-TIS, Seq-TIS, Seq-MIS, Geo-RS, Geo-RS-Seq-TIS) are compatible with Decoupled and Bypass modes. #### Other Supported Combinations (Manual Configuration Required) @@ -325,7 +329,7 @@ See [detailed configuration examples below](#additional-useful-configurations-no - Any aggregation level (token/sequence/geometric) works in either decoupled or bypass mode - All combinations are fully supported by the implementation - Rejection sampling is independent of IS weighting -- Pure RS (`pg_rs`) uses bypass + geometric RS with `use_policy_gradient=True` (no IS weights) +- Pure RS (`bypass_pg_rs`) uses bypass + geometric RS with `loss_type="reinforce"` (no IS weights) --- @@ -521,35 +525,35 @@ algorithm: --- -### 6. PPO with Bypass Mode (`ppo_is_bypass`) +### 6. 
Bypass Mode with PPO-clip (`bypass_ppo_clip`) **Configuration:** ```python -config = RolloutCorrectionConfig.ppo_is_bypass(threshold=2.0) +config = RolloutCorrectionConfig.bypass_ppo_clip() ``` **Components:** - **Operating Mode**: Bypass (2 policies: π_rollout = π_old, π_θ) -- **Loss**: PPO with clipping -- **IS Aggregation**: None (not needed, π_old = π_rollout) +- **Loss**: PPO-clip (IS handled by ratio, no explicit IS weights) +- **IS Aggregation**: None (PPO ratio handles it) - **RS**: None +- **Veto**: None **Equivalent YAML:** ```yaml algorithm: rollout_correction: - rollout_is: token # Placeholder for metrics - rollout_is_threshold: 2.0 + rollout_is: null rollout_rs: null - bypass_mode: true # Bypass mode - use_policy_gradient: false + bypass_mode: true + loss_type: ppo_clip ``` **Properties:** -- Skips `actor.compute_log_prob()` forward pass -- PPO clips against π_rollout (behavior policy) -- Sets π_old = π_rollout (two-policy setup) -- Does not separate proximal from behavior policy +- PPO clipped objective in bypass mode +- The PPO ratio = π_θ/π_rollout already handles IS (no explicit IS weights needed) +- Skips `actor.compute_log_prob()` forward pass (2 policies instead of 3) +- No rejection sampling - use `bypass_ppo_clip_geo_rs()` for RS **Configuration requirement:** - Set `actor_rollout_ref.rollout.calculate_log_probs: true` @@ -558,16 +562,61 @@ algorithm: --- -### 7. Policy Gradient with IS (`pg_is`) +### 6b. 
Bypass Mode with PPO-clip + Geo-RS (`bypass_ppo_clip_geo_rs`) + +**Configuration:** +```python +config = RolloutCorrectionConfig.bypass_ppo_clip_geo_rs( + rs_threshold=1.001, + veto_threshold=1e-4 +) +``` + +**Components:** +- **Operating Mode**: Bypass (2 policies: π_rollout = π_old, π_θ) +- **Loss**: PPO-clip (IS handled by ratio, no explicit IS weights) +- **IS Aggregation**: None (PPO ratio handles it) +- **RS**: Geometric-level rejection +- **Veto**: Enabled + +**Equivalent YAML:** +```yaml +algorithm: + rollout_correction: + rollout_is: null + rollout_rs: geometric + rollout_rs_threshold: 1.001 + rollout_rs_threshold_lower: 0.999 + rollout_token_veto_threshold: 1e-4 + bypass_mode: true + loss_type: ppo_clip +``` + +**Properties:** +- PPO clipped objective in bypass mode with geometric RS +- The PPO ratio = π_θ/π_rollout already handles IS (no explicit IS weights needed) +- Skips `actor.compute_log_prob()` forward pass (2 policies instead of 3) +- Geometric RS masks outliers +- Veto mechanism enabled +- Solves Length Trap problem for CoT/agent workloads + +**Configuration requirement:** +- Set `actor_rollout_ref.rollout.calculate_log_probs: true` + +**Theory:** [§3.1.2 (Bypass)](rollout_corr_math.md#312-bypass-mode-two-policies) + [§3.3.3 (Geometric)](rollout_corr_math.md#333-geometric-aggregation-geo-rs) + +--- + +### 7. 
REINFORCE with IS (`bypass_pg_is`) **Configuration:** ```python -config = RolloutCorrectionConfig.pg_is(threshold=2.0) +config = RolloutCorrectionConfig.bypass_pg_is(threshold=2.0) ``` **Components:** - **Operating Mode**: Bypass (2 policies: π_rollout, π_θ) -- **Loss**: Pure IS (policy gradient only, no PPO clipping) +- **Loss**: REINFORCE (policy gradient with explicit IS weights, no PPO clipping) - **IS Aggregation**: Sequence-level - **RS**: None @@ -578,12 +627,12 @@ algorithm: rollout_is: sequence rollout_is_threshold: 2.0 rollout_rs: null - bypass_mode: true # Required - use_policy_gradient: true # Use policy gradient loss (no PPO clipping) + bypass_mode: true + loss_type: reinforce # REINFORCE with explicit IS weights ``` **Properties:** -- Policy gradient loss (no PPO clipping) +- REINFORCE loss with explicit IS weights (no PPO clipping) - Single forward pass (skips old_log_prob computation) - IS weights computed on-the-fly in loss function @@ -591,11 +640,11 @@ algorithm: --- -### 8. Policy Gradient with Rejection Sampling (`pg_rs`) +### 8. REINFORCE with Rejection Sampling (`bypass_pg_rs`) **Configuration:** ```python -config = RolloutCorrectionConfig.pg_rs( +config = RolloutCorrectionConfig.bypass_pg_rs( rs_threshold=1.001, veto_threshold=1e-4 ) @@ -603,7 +652,7 @@ config = RolloutCorrectionConfig.pg_rs( **Components:** - **Operating Mode**: Bypass (2 policies: π_rollout, π_θ) -- **Loss**: Pure policy gradient (no PPO clipping, via `use_policy_gradient=True`) +- **Loss**: REINFORCE (no PPO clipping) - **IS Aggregation**: None - **RS**: Geometric-level rejection - **Veto**: Enabled @@ -618,7 +667,7 @@ algorithm: rollout_rs_threshold_lower: 0.999 rollout_token_veto_threshold: 1e-4 bypass_mode: true - use_policy_gradient: true + loss_type: reinforce ``` **Properties:** @@ -631,13 +680,13 @@ algorithm: --- -### 9. Policy Gradient with Geo-RS-Seq-TIS (`pg_geo_rs_seq_tis`) +### 9. 
REINFORCE with Geo-RS-Seq-TIS (`bypass_pg_geo_rs_seq_tis`) **Also known as: Geo-RS-Seq-TIS in bypass mode** **Configuration:** ```python -config = RolloutCorrectionConfig.pg_geo_rs_seq_tis( +config = RolloutCorrectionConfig.bypass_pg_geo_rs_seq_tis( is_threshold=2.0, rs_threshold=1.001, veto_threshold=1e-4 @@ -646,7 +695,7 @@ config = RolloutCorrectionConfig.pg_geo_rs_seq_tis( **Components:** - **Operating Mode**: Bypass (2 policies: π_rollout, π_θ) -- **Loss**: Pure policy gradient (no PPO clipping) +- **Loss**: REINFORCE (no PPO clipping) - **IS Aggregation**: Sequence-level (Seq-TIS) - **RS**: Geometric-level rejection (Geo-RS) - **Veto**: Enabled @@ -662,11 +711,11 @@ algorithm: rollout_rs_threshold_lower: 0.999 rollout_token_veto_threshold: 1e-4 bypass_mode: true - use_policy_gradient: true + loss_type: reinforce ``` **Properties:** -- Combines geometric filter + clipped sequence weight with policy gradient loss +- Combines geometric filter + clipped sequence weight with REINFORCE loss - Skips `actor.compute_log_prob()` forward pass (bypass mode) - Suitable for reasoning models (CoT, o1-style) when you want bypass mode efficiency - No PPO clipping - relies on IS/RS for stability @@ -760,11 +809,11 @@ The framework provides **two operating modes** for computing π_old, which can b ### Operating Modes and Configuration -| Configuration | `bypass_mode` | `use_policy_gradient` | Operating Mode | Loss Function | Description | -|---------------|----------------------------------|------------------------------|----------------|---------------|-------------| -| **Decoupled** | `false` | `false` | Decoupled | PPO | Computes `old_log_prob` separately via `actor.compute_log_prob()` | -| **Bypass** | `true` | `false` | Bypass | PPO | Sets `old_log_prob = rollout_log_prob`, PPO clips against rollout policy | -| **Bypass + PG** | `true` | `true` | Bypass | Policy Gradient | Bypass mode with policy gradient loss (no PPO clipping) | +| Configuration | `bypass_mode` | 
`loss_type` | Operating Mode | Loss Function | Description | +|---------------|---------------|-------------|----------------|---------------|-------------| +| **Decoupled** | `false` | N/A | Decoupled | PPO | Computes `old_log_prob` separately via `actor.compute_log_prob()` | +| **Bypass + PPO-clip** | `true` | `"ppo_clip"` (default) | Bypass | PPO-clip | PPO clipped objective (IS handled by ratio) | +| **Bypass + REINFORCE** | `true` | `"reinforce"` | Bypass | REINFORCE | Policy gradient with explicit IS weights (no PPO clipping) | ### Operating Mode Details @@ -829,9 +878,9 @@ The aggregation level can be chosen **independently** of the operating mode. Any ### Example Workflow -**Recommended: Bypass + Policy Gradient Mode** +**Recommended: Bypass Mode** -This workflow uses bypass mode with pure policy gradient loss for efficiency. +This workflow uses bypass mode for efficiency. 1. **Start with metrics only** to understand the off-policy gap: ```yaml @@ -840,7 +889,7 @@ This workflow uses bypass mode with pure policy gradient loss for efficiency. rollout_is: null rollout_rs: null bypass_mode: true # Bypass mode (recommended) - use_policy_gradient: true # Pure policy gradient (recommended) + loss_type: ppo_clip # Default: PPO clipped objective ``` Monitor `rollout_corr/kl`, `rollout_corr/log_ppl_abs_diff`, `rollout_corr/chi2_token` to assess off-policy gap. @@ -852,11 +901,11 @@ This workflow uses bypass mode with pure policy gradient loss for efficiency. rollout_rs: sequence # or "geometric" for higher sensitivity rollout_rs_threshold: 2.0 bypass_mode: true # Bypass mode - use_policy_gradient: true # Pure policy gradient + loss_type: ppo_clip # or "reinforce" for explicit IS weights ``` This excludes outliers from training without modifying gradients. -3. **Enable full IS correction** once comfortable with metrics: +3. 
**Enable full IS correction** (with REINFORCE loss) once comfortable with metrics: ```yaml algorithm: rollout_correction: @@ -865,14 +914,15 @@ This workflow uses bypass mode with pure policy gradient loss for efficiency. rollout_rs: sequence # or "geometric" for more aggressive filtering rollout_rs_threshold: 2.0 bypass_mode: true # Bypass mode - use_policy_gradient: true # Pure policy gradient + loss_type: reinforce # REINFORCE with explicit IS weights ``` -**Benefits of bypass + policy gradient mode:** +**Benefits of bypass mode:** - ✅ Skips expensive `actor.compute_log_prob()` forward pass (faster) -- ✅ IS weights computed on-the-fly in loss function (π_θ / π_rollout) -- ✅ Simpler than PPO (no clipping, pure policy gradient with IS/RS) -- ✅ Works for all IS/RS combinations +- ✅ `loss_type` controls the loss function: "ppo_clip" (default) or "reinforce" +- ✅ PPO-clip: IS handled by ratio (no explicit weights), RS mask applied +- ✅ REINFORCE: Explicit IS weights computed on-the-fly (π_θ / π_rollout) +- ✅ Both loss types work with all IS/RS combinations ## Usage @@ -1249,7 +1299,7 @@ algorithm: rollout_token_veto_threshold: 1e-4 # Veto catastrophic tokens ``` -### Example 5: Bypass Mode +### Example 5: Bypass Mode with PPO-clip (Default) ```yaml algorithm: rollout_correction: @@ -1258,22 +1308,35 @@ algorithm: rollout_rs: token rollout_rs_threshold: 2.0 bypass_mode: true # Skip old_log_prob computation - use_policy_gradient: false # Use bypass mode: PPO with rollout_log_prob as old_log_prob + loss_type: ppo_clip # PPO clipped objective (default) ``` -**Skips expensive `actor.compute_log_prob()` forward pass** +**Skips expensive `actor.compute_log_prob()` forward pass. 
PPO ratio = π_θ/π_rollout handles IS.** -### Example 6: Pure Policy Gradient Mode +### Example 6: Bypass Mode with REINFORCE ```yaml algorithm: rollout_correction: - rollout_is: token # Explicit IS correction in loss + rollout_is: sequence # Explicit IS correction in loss rollout_is_threshold: 2.0 rollout_rs: null # Optional: can add rejection sampling - bypass_mode: true # Required for policy gradient mode - use_policy_gradient: true # Use policy gradient loss (no PPO clipping) + bypass_mode: true + loss_type: reinforce # REINFORCE with explicit IS weights ``` **No PPO clipping, pure policy gradient with IS correction** +### Example 7: Bypass Mode with PPO-clip + Rejection Sampling +```yaml +algorithm: + rollout_correction: + rollout_is: sequence # Computed for metrics + rollout_is_threshold: 2.0 + rollout_rs: geometric # Rejection sampling enabled + rollout_rs_threshold: 1.001 + bypass_mode: true + loss_type: ppo_clip # PPO clipped objective (IS handled by ratio) +``` +**PPO clipping with rejection sampling. IS handled by PPO ratio (no explicit IS weights).** + ## Troubleshooting ### Issue: High spread in IS weights diff --git a/docs/algo/rollout_corr_math.md b/docs/algo/rollout_corr_math.md index 91ca84ae711..5ac34336502 100644 --- a/docs/algo/rollout_corr_math.md +++ b/docs/algo/rollout_corr_math.md @@ -96,7 +96,7 @@ The transition dynamics $p(s_{t+1}|s_t, a_t)$ and initial state $p(s_0)$ cancel - **Off-policy capable**: Can learn from any behavior policy via importance sampling - **No trust region**: Policy updates not constrained -**Implementation in verl:** The `pg_is` method implements off-policy REINFORCE with truncated importance sampling. +**Implementation in verl:** The `bypass_pg_is` preset implements off-policy REINFORCE with truncated importance sampling. 
### 1.2 PPO: Adding Trust Region Control @@ -271,8 +271,8 @@ The operating mode determines how the proximal policy $\pi_{\text{old}}$ is comp - $\pi_{\theta}$: Current policy (being updated) **Ratios:** -- **With PPO loss** (`use_policy_gradient = false`): No separate IS computation; PPO ratio $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ clips against rollout policy -- **With policy gradient loss** (`use_policy_gradient = true`): IS ratio $\rho_t = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ computed on-the-fly in loss function +- **With PPO-clip loss** (`loss_type = "ppo_clip"`, default): PPO ratio $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ clips against rollout policy (IS handled by ratio) +- **With REINFORCE loss** (`loss_type = "reinforce"`): IS ratio $\rho_t = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ computed on-the-fly in loss function **Properties:** - ✅ Skips `actor.compute_log_prob()` call (faster) @@ -286,7 +286,7 @@ The operating mode determines how the proximal policy $\pi_{\text{old}}$ is comp #### 3.2.1 PPO Loss (with Clipping) -**Configuration:** `use_policy_gradient = false` +**Configuration:** `loss_type = "ppo_clip"` (default in bypass mode) **Loss function:** @@ -306,7 +306,7 @@ where: #### 3.2.2 Policy Gradient Loss (with IS/RS Correction) -**Configuration:** `use_policy_gradient = true` (requires `bypass_mode = true`) +**Configuration:** `loss_type = "reinforce"` (requires `bypass_mode = true`) **Loss function** (example with sequence-level IS): @@ -368,12 +368,17 @@ The stopgrad operator is **mathematically required** by importance sampling theo **Intuition**: The IS weight $w(\theta)$ tells us "how much to trust this sample" for estimating the gradient under $\pi_\theta$. 
We update $\theta$ to maximize the reweighted objective, but we don't update $\theta$ to maximize the weight itself—that would be circular reasoning (optimizing the correction factor instead of the actual objective). **Properties:** -- **Algorithm**: Off-policy REINFORCE + IS/RS correction -- **No PPO clipping**: Pure policy gradient +- **Algorithm**: Off-policy policy gradient with IS/RS correction +- **Loss types** (`loss_type` config option in bypass mode): + - `"ppo_clip"` (default): PPO clipped objective + - $L = -\mathbb{E}[\min(r \cdot A, \text{clip}(r) \cdot A)]$ where $r = \pi_\theta / \pi_{\text{rollout}}$ + - Note: IS weights NOT applied (PPO ratio already handles it; would be double-counting) + - `"reinforce"`: Pure policy gradient with explicit IS weights, no PPO clipping + - $L = -\mathbb{E}[w \cdot \log \pi_\theta(a|s) \cdot A]$ where $w = \pi_\theta / \pi_{\text{rollout}}$ - **Always uses bypass mode**: Direct $\pi_\theta$ to $\pi_{\text{rollout}}$ comparison - **Fast**: Single forward pass -**Implementation:** `compute_policy_loss_with_rollout_correction()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py#L1537-L1681) +**Implementation:** `compute_policy_loss_bypass_mode()` and `compute_policy_loss_reinforce()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py) --- @@ -613,7 +618,7 @@ where $\bar{w}_j = \frac{1}{T_j}\sum_{t=1}^{T_j} w_{j,t} \cdot m_{j,t}$ is the p | **Geo-RS** | `rollout_rs="geometric"` | Decoupled PPO, Bypass PG | | **Geo-RS-Seq-TIS** | `rollout_is="sequence"` + `rollout_rs="geometric"` | Decoupled PPO, Bypass PG | -**Note:** Bypass PPO mode (π_old = π_rollout) does not use IS correction since there's no gap to correct. Use Bypass PG mode for fast execution with IS/RS correction. +**Note:** In bypass mode, `loss_type` controls the loss function. Use "ppo_clip" (default) or "reinforce". 
#### Available Preset Methods @@ -625,16 +630,17 @@ where $\bar{w}_j = \frac{1}{T_j}\sum_{t=1}^{T_j} w_{j,t} \cdot m_{j,t}$ is the p | `decoupled_seq_is_rs()` | Seq-MIS | Decoupled PPO | Sequence IS + sequence RS | | `decoupled_geo_rs()` | Geo-RS | Decoupled PPO | Geometric RS + veto | | `geo_rs_seq_tis()` | Geo-RS-Seq-TIS | Decoupled PPO | Geometric filter + seq IS | -| **Bypass PPO Mode** (2 policies: π_rollout = π_old, π_θ) | -| `ppo_is_bypass()` | - | Bypass PPO | PPO with rollout as anchor (no IS correction needed) | -| **Bypass PG Mode** (2 policies: π_rollout, π_θ; IS = π_θ/π_rollout) | -| `pg_is()` | Seq-TIS | Bypass PG | Policy gradient + Seq IS | -| `pg_rs()` | Geo-RS | Bypass PG | Policy gradient + Geo-RS | -| `pg_geo_rs_seq_tis()` | Geo-RS-Seq-TIS | Bypass PG | PG + Geo filter + seq IS | +| **Bypass Mode (PPO-clip)** (ratio handles IS, RS masks outliers) | +| `bypass_ppo_clip()` | - | Bypass (PPO-clip) | PPO-clip only | +| `bypass_ppo_clip_geo_rs()` | Geo-RS | Bypass (PPO-clip) | PPO-clip + Geo-RS | +| **Bypass Mode (REINFORCE)** (explicit IS weights, no PPO clipping) | +| `bypass_pg_is()` | Seq-TIS | Bypass (REINFORCE) | REINFORCE + Seq IS | +| `bypass_pg_rs()` | Geo-RS | Bypass (REINFORCE) | REINFORCE + Geo-RS | +| `bypass_pg_geo_rs_seq_tis()` | Geo-RS-Seq-TIS | Bypass (REINFORCE) | REINFORCE + Geo filter + seq IS | | **Other** | | `disabled()` | - | - | Metrics only | -**Note:** Bypass PPO mode sets π_old = π_rollout, so IS correction is not applicable. Use Bypass PG mode for fast execution with IS/RS correction. +**Note:** Bypass mode sets π_old = π_rollout and uses `loss_type` to select the loss function. 
#### Additional Supported Combinations (Manual Configuration) @@ -676,7 +682,7 @@ config = RolloutCorrectionConfig( - Rejection sampling can be added to any combination - Veto is independent and can be added to any combination - Geometric aggregation is typically used for RS only (not IS weighting) -- Pure RS (`pg_rs`) uses bypass + geometric RS with `use_policy_gradient=True` for pure policy gradient (no IS weights) +- Pure RS (`bypass_pg_rs`) uses bypass + geometric RS with `loss_type="reinforce"` for REINFORCE (no IS weights) - All combinations in the table above are valid and supported by the implementation --- @@ -785,12 +791,16 @@ $$ | Method | Theory | Policies | PPO Clip | IS Correction | Correctness | Speed | |--------|--------|----------|----------|---------------|-------------|-------| -| **Bypass PG Mode** (IS weights = π_θ / π_rollout) | -| `pg_is` | Off-policy REINFORCE | 2 (rollout, θ) | ❌ | ✅ Seq-TIS | ✅ Correct | **Fast** | -| `pg_rs` | Pure PG + Geo RS | 2 (rollout, θ) | ❌ | Geo-RS only | ✅ Correct | **Fast** | -| `pg_geo_rs_seq_tis` | Pure PG + Geo RS + Seq IS | 2 (rollout, θ) | ❌ | ✅ Geo-RS-Seq-TIS | ✅ Correct | **Fast** | -| **Bypass PPO Mode** (π_old = π_rollout, no IS correction needed) | -| `ppo_is_bypass` | PPO (rollout as prox) | 2 (rollout, θ) | ✅ | ❌ (not needed) | ✅ Correct | **Fast** | +| **Bypass Mode** (π_old = π_rollout, `loss_type` selects algorithm) | +| `loss_type="ppo_clip"` (default) | PPO (ratio = π_θ/π_rollout) | 2 (rollout, θ) | ✅ | RS mask only (ratio handles IS) | ✅ Correct | **Fast** | +| `loss_type="reinforce"` | Off-policy REINFORCE | 2 (rollout, θ) | ❌ | ✅ (explicit IS weights) | ✅ Correct | **Fast** | +| **Bypass Mode Presets (PPO-clip)** | +| `bypass_ppo_clip` | PPO only | 2 (rollout, θ) | ✅ | - | ✅ Correct | **Fast** | +| `bypass_ppo_clip_geo_rs` | PPO + Geo-RS | 2 (rollout, θ) | ✅ | Geo-RS mask | ✅ Correct | **Fast** | +| **Bypass Mode Presets (REINFORCE)** | +| `bypass_pg_is` | REINFORCE + Seq-TIS | 2 (rollout, θ) 
| ❌ | ✅ Seq-TIS | ✅ Correct | **Fast** | +| `bypass_pg_rs` | REINFORCE + Geo RS | 2 (rollout, θ) | ❌ | Geo-RS only | ✅ Correct | **Fast** | +| `bypass_pg_geo_rs_seq_tis` | REINFORCE + Geo RS + Seq IS | 2 (rollout, θ) | ❌ | ✅ Geo-RS-Seq-TIS | ✅ Correct | **Fast** | | **Decoupled PPO Mode** (IS weights = π_old / π_rollout) | | `decoupled_token_is` | Decoupled PPO | 3 (rollout, old, θ) | ✅ | ✅ Token-TIS | ✅ Correct | Standard | | `decoupled_seq_is` | Decoupled PPO | 3 (rollout, old, θ) | ✅ | ✅ Seq-TIS | ✅ Correct | Standard | @@ -800,7 +810,11 @@ $$ | **Incorrect (for reference)** | | Naive LLM-RL | Incorrect PPO usage | 2 (old, θ) | ✅ | ❌ | ⚠️ Incorrect | Standard | -**Note:** Bypass PPO mode sets π_old = π_rollout, so IS correction is not applicable (the ratio would be 1.0). Use Bypass PG mode if you want IS/RS correction with fast execution. +**Notes:** +- **Bypass mode** sets π_old = π_rollout and uses `loss_type` to select the loss function: + - `"ppo_clip"` (default): PPO clipped ratio (IS handled by ratio = π_θ/π_rollout, no explicit IS weights to avoid double-counting) + - `"reinforce"`: Explicit IS weights applied as $w \cdot \log \pi \cdot A$ +- Both loss types benefit from rejection sampling (RS) which masks out-of-distribution samples ### 5.2 Estimator Hierarchy @@ -816,7 +830,9 @@ These estimators define **how IS weights and rejection masks are computed**. 
The **Note:** Each estimator can be used with either: - **Decoupled PPO** (`bypass_mode=false`): Three policies with PPO clipping -**Bypass Policy Gradient** (`bypass_mode=true`, `use_policy_gradient=true`): Two policies without PPO clipping +**Bypass Mode** (`bypass_mode=true`): Two policies with configurable loss type + - `loss_type="ppo_clip"` (default): PPO clipped objective (IS via ratio, RS mask applied) + - `loss_type="reinforce"`: REINFORCE with explicit IS weights ### 5.3 Method Characteristics by Scenario @@ -832,7 +848,7 @@ These estimators define **how IS weights and rejection masks are computed**. The **Choosing operating mode:** - **Batch size invariance needed**: Use decoupled mode (`bypass_mode=false`) - **Computational efficiency needed**: Use bypass mode (`bypass_mode=true`) to skip `old_log_prob` computation -- **No PPO clipping**: Use bypass + policy gradient (`bypass_mode=true`, `use_policy_gradient=true`) +- **No PPO clipping**: Use bypass mode with `loss_type="reinforce"` ### 5.4 Decoupled Mode vs Bypass Mode diff --git a/docs/ascend_tutorial/ascend_profiling_en.rst b/docs/ascend_tutorial/ascend_profiling_en.rst index 04a77e0cca4..bcd089e21dd 100644 --- a/docs/ascend_tutorial/ascend_profiling_en.rst +++ b/docs/ascend_tutorial/ascend_profiling_en.rst @@ -123,6 +123,13 @@ Visualization Collected data is stored in the user-defined save_path and can be visualized by using the `MindStudio Insight `_ tool. +Additionally, in a Linux environment, the MindStudio Insight tool is provided in the form of a `JupyterLab Plugin `_, offering a more intuitive and highly interactive user interface. The advantages of the JupyterLab plugin are as follows: + +- Seamless integration: Supports running the MindStudio Insight tool directly within the Jupyter environment, eliminating the need to switch platforms or copy data from the server, enabling data to be collected and used immediately.
+- Fast startup: Allows MindStudio Insight to be launched quickly via the JupyterLab command line or graphical interface. +- Smooth operation: In a Linux environment, launching MindStudio Insight through JupyterLab effectively alleviates performance lag compared to the full-package communication mode, significantly improving the user experience. +- Remote access: Supports remotely launching MindStudio Insight. Users can connect to the service via a local browser for direct visual analysis, reducing the difficulty of uploading and downloading data during large-model training or inference. + If the analysis parameter is set to False, offline parsing is required after data collection: .. code:: python diff --git a/docs/ascend_tutorial/ascend_profiling_zh.rst b/docs/ascend_tutorial/ascend_profiling_zh.rst index b4b4896f8b0..00e8565a7e1 100644 --- a/docs/ascend_tutorial/ascend_profiling_zh.rst +++ b/docs/ascend_tutorial/ascend_profiling_zh.rst @@ -110,6 +110,13 @@ Last updated: 08/14/2025. 采集后的数据存放在用户设置的save_path下,可通过 `MindStudio Insight `_ 工具进行可视化。 +另外在Linux环境下,MindStudio Insight工具提供了 `JupyterLab插件 `_ 形态,提供更直观和交互性强的操作界面。JupyterLab插件优势如下: + +- 无缝集成:支持在Jupyter环境中直接运行MindStudio Insight工具,无需切换平台,无需拷贝服务器上的数据,实现数据即采即用。 +- 快速启动:通过JupyterLab的命令行或图形界面,可快速启动MindStudio Insight工具。 +- 运行流畅:在Linux环境下,通过JupyterLab环境启动MindStudio Insight,相较于整包通信,有效解决了运行卡顿问题,操作体验显著提升。 +- 远程访问:支持远程启动MindStudio Insight,可通过本地浏览器远程连接服务直接进行可视化分析,缓解了大模型训练或推理数据上传和下载的困难。 + 如果analysis参数设置为False,采集之后需要进行离线解析: .. code:: python diff --git a/docs/ascend_tutorial/ascend_quick_start.rst b/docs/ascend_tutorial/ascend_quick_start.rst index 8e381e46cfa..bb335178214 100644 --- a/docs/ascend_tutorial/ascend_quick_start.rst +++ b/docs/ascend_tutorial/ascend_quick_start.rst @@ -1,10 +1,17 @@ Ascend Quickstart =================================== -Last updated: 12/4/2025. +Last updated: 12/11/2025.
我们在 verl 上增加对华为昇腾设备的支持。 + +关键更新 +---------------------------------- + +2025/12/11:verl 存量场景目前支持自动识别 NPU 设备类型, GPU 脚本在昇腾上运行,原则上不再需要显式设置 trainer.device=npu 参数,新增特性通过设置 trainer.device 仍可优先使用,逐步适配自动识别能力。 + + 硬件支持 ----------------------------------- @@ -122,6 +129,9 @@ MindSpeed 源码安装指令: # (可选)如希望 shell 关闭,或系统重启后,PYTHONPATH 环境变量仍然生效,建议将它添加到 .bashrc 配置文件中 echo "export PYTHONPATH=$PYTHONPATH:\"$(pwd)/Megatron-LM\"" >> ~/.bashrc + # 安装 mbridge + pip install mbridge + MindSpeed 对应 Megatron-LM 后端使用场景,使用方式如下: 1. 使能 verl worker 模型 ``strategy`` 配置为 ``megatron`` ,例如 ``actor_rollout_ref.actor.strategy=megatron``。 @@ -213,8 +223,7 @@ verl 中昇腾暂不支持生态库如下: trainer.nnodes=1 \ trainer.save_freq=-1 \ trainer.test_freq=5 \ - trainer.total_epochs=1 \ - trainer.device=npu $@ + trainer.total_epochs=1 $@ 算法支持现状 diff --git a/docs/ascend_tutorial/dockerfile_build_guidance.rst b/docs/ascend_tutorial/dockerfile_build_guidance.rst index ce4584aaed5..c27a3090b6b 100644 --- a/docs/ascend_tutorial/dockerfile_build_guidance.rst +++ b/docs/ascend_tutorial/dockerfile_build_guidance.rst @@ -33,6 +33,7 @@ vLLM-ascend 0.11.0rc1 Megatron-LM v0.12.1 MindSpeed (f2b0977e) triton-ascend 3.2.0rc4 +mbridge latest version ================= ============ @@ -57,7 +58,7 @@ A3 8.3.RC1 `Dockerfile.ascend_8.3.rc1_a3 &1 | tee ${LOG_PATH} \ No newline at end of file + trainer.val_before_train=False 2>&1 | tee ${LOG_PATH} \ No newline at end of file diff --git a/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh b/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh index 5d592410d5e..b2d259b4330 100644 --- a/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh +++ b/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh @@ -67,5 +67,4 @@ python3 -m verl.trainer.main_ppo \ trainer.total_epochs=5 \ trainer.default_local_dir="${CKPTS_DIR}" \ actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - 
trainer.device=npu $@ \ No newline at end of file + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} $@ \ No newline at end of file diff --git a/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh b/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh index 3684e8a2d48..9076360bb6d 100644 --- a/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh +++ b/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh @@ -67,5 +67,4 @@ python3 -m verl.trainer.main_ppo \ trainer.total_epochs=5 \ trainer.default_local_dir="${CKPTS_DIR}" \ actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - trainer.device=npu $@ \ No newline at end of file + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} $@ \ No newline at end of file diff --git a/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh b/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh index 77805cdfb3b..3a92171b6e2 100644 --- a/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh +++ b/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh @@ -3,9 +3,11 @@ set -xeuo pipefail # Need to install Megatron-Bridge # NOTE: Make sure you use Megatron-Bridge later than 0.2.0 -# (after https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/36302b7ca1305f0690e17cf4e4019ac822746872) -# for MoE LoRA When you want to set ETP and ETP!=TP. -# https://github.com/NVIDIA-NeMo/Megatron-Bridge/issues/1363 +# (Recommend https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/a489bed3a2410ed9b000ec13a3c90176fec7d99c or later) +# for proper MoE LoRA support. 
+ +# For Megatron communication/computation overlapping +export CUDA_DEVICE_MAX_CONNECTIONS=1 ########################### Quick Config ########################### @@ -41,9 +43,17 @@ DATA=( MODEL=( actor_rollout_ref.model.path=Qwen/Qwen3-30B-A3B-Instruct-2507 - actor_rollout_ref.model.lora.rank=16 - actor_rollout_ref.model.lora.alpha=32 actor_rollout_ref.model.use_fused_kernels=True + actor_rollout_ref.model.lora.rank=32 + actor_rollout_ref.model.lora.alpha=64 + actor_rollout_ref.model.lora.lora_A_init_method=kaiming + # # Optional: Use canonical LoRA + # actor_rollout_ref.model.lora.type="canonical_lora" + # actor_rollout_ref.model.lora.target_modules='["linear_q","linear_k","linear_v","linear_proj","linear_fc1_up","linear_fc1_gate","linear_fc2"]' + + # # Optional: Add dropout to LoRA layers + # actor_rollout_ref.model.lora.dropout=0.05 + # actor_rollout_ref.model.lora.dropout_position=pre ) ACTOR=( diff --git a/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh b/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh index 2944de647c4..aa2b3e4a118 100644 --- a/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh +++ b/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh @@ -25,10 +25,14 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ critic.ppo_micro_batch_size_per_gpu=4 \ reward_model.enable=True \ - reward_model.megatron.tensor_model_parallel_size=4 \ reward_model.model.path=deepseek-ai/deepseek-llm-7b-chat \ - reward_model.micro_batch_size_per_gpu=4 \ - reward_model.param_offload=False \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=4 \ + reward_model.rollout.prompt_length=256 \ + reward_model.rollout.response_length=128 \ + reward_model.num_workers=8 \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff 
--git a/examples/ppo_trainer/run_qwen2-7b_rm.sh b/examples/ppo_trainer/run_qwen2-7b_rm.sh index 57b7bd7524b..33caabf40d8 100644 --- a/examples/ppo_trainer/run_qwen2-7b_rm.sh +++ b/examples/ppo_trainer/run_qwen2-7b_rm.sh @@ -55,9 +55,13 @@ python3 -m verl.trainer.main_ppo \ critic.model.fsdp_config.optimizer_offload=False \ reward_model.enable=True \ reward_model.model.path="$HOME/models/FsfairX-LLaMA3-RM-v0.1" \ - reward_model.model.use_remove_padding=True \ - reward_model.model.fsdp_config.param_offload=True \ - reward_model.micro_batch_size_per_gpu=32 \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=2048 \ + reward_model.rollout.response_length=1024 \ + reward_model.num_workers=8 \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_legacy.sh b/examples/ppo_trainer/run_qwen2-7b_rm_legacy.sh new file mode 100644 index 00000000000..99574a33c96 --- /dev/null +++ b/examples/ppo_trainer/run_qwen2-7b_rm_legacy.sh @@ -0,0 +1,63 @@ +# download datasets and models +# python3 examples/data_preprocess/gsm8k.py +# python3 examples/data_preprocess/math_dataset.py +# huggingface-cli download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B +# huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ 
+ data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.optim.lr_warmup_steps_ratio=0.05 \ + critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + reward_model.enable=True \ + reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \ + reward_model.use_reward_loop=False \ + reward_model.model.use_remove_padding=True \ + reward_model.model.fsdp_config.param_offload=True \ + reward_model.micro_batch_size_per_gpu=32 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_test_qwen25_rm' \ + trainer.val_before_train=True \ + trainer.experiment_name='legacy_fsdp_reward_model' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + 
trainer.total_epochs=15 $@ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh b/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh new file mode 100644 index 00000000000..9641fdcb907 --- /dev/null +++ b/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh @@ -0,0 +1,69 @@ +# download datasets and models +# python3 examples/data_preprocess/gsm8k.py +# python3 examples/data_preprocess/math_dataset.py +# huggingface-cli download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B +# huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + 
actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.optim.lr_warmup_steps_ratio=0.05 \ + critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + reward_model.enable=True \ + reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.prompt_length=4096 \ + reward_model.rollout.response_length=4096 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.num_workers=8 \ + reward_model.model.use_remove_padding=True \ + reward_model.model.fsdp_config.param_offload=True \ + reward_model.micro_batch_size_per_gpu=32 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_test_qwen25_rm' \ + trainer.val_before_train=False \ + trainer.experiment_name='reward_loop_colocate_reward_model' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 $@ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh index e0ddc01e75e..902bcb8ede2 100644 --- a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh +++ b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh @@ -42,11 +42,13 @@ python3 -m verl.trainer.main_ppo \ critic.model.fsdp_config.optimizer_offload=False \ reward_model.enable=True \ reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1\ - reward_model.model.use_remove_padding=True \ - reward_model.model.fsdp_config.param_offload=True \ - reward_model.micro_batch_size_per_gpu=32 \ - 
reward_model.use_dynamic_bsz=True \ - reward_model.forward_max_token_len_per_gpu=98304 \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=8192 \ + reward_model.rollout.response_length=4096 \ + reward_model.num_workers=8 \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh index 7e0a335efe2..fa2c154f3a1 100644 --- a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh +++ b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh @@ -45,12 +45,14 @@ python3 -m verl.trainer.main_ppo \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ reward_model.enable=True \ - reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1\ - reward_model.model.use_remove_padding=True \ - reward_model.model.fsdp_config.param_offload=True \ - reward_model.micro_batch_size_per_gpu=32 \ - reward_model.use_dynamic_bsz=True \ - reward_model.forward_max_token_len_per_gpu=98304 \ + reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=8192 \ + reward_model.rollout.response_length=4096 \ + reward_model.num_workers=8 \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh index 0acfe43e862..5ccfe1b3cd5 100644 --- a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh +++ 
b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh @@ -55,14 +55,13 @@ python3 -m verl.trainer.main_ppo \ critic.profiler.all_ranks=$PROFILE_RANKS_ALL \ reward_model.enable=True \ reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1\ - reward_model.model.use_remove_padding=True \ - reward_model.model.fsdp_config.param_offload=True \ - reward_model.micro_batch_size_per_gpu=32 \ - reward_model.use_dynamic_bsz=True \ - reward_model.forward_max_token_len_per_gpu=98304 \ - reward_model.profiler.enable=True \ - reward_model.profiler.ranks=$PROFILE_RANKS \ - reward_model.profiler.all_ranks=$PROFILE_RANKS_ALL \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=8192 \ + reward_model.rollout.response_length=4096 \ + reward_model.num_workers=8 \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_qwen2.5-3b_rm_legacy.sh b/examples/ppo_trainer/run_qwen2.5-3b_rm_legacy.sh new file mode 100644 index 00000000000..99574a33c96 --- /dev/null +++ b/examples/ppo_trainer/run_qwen2.5-3b_rm_legacy.sh @@ -0,0 +1,63 @@ +# download datasets and models +# python3 examples/data_preprocess/gsm8k.py +# python3 examples/data_preprocess/math_dataset.py +# huggingface-cli download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B +# huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + 
data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.optim.lr_warmup_steps_ratio=0.05 \ + critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + reward_model.enable=True \ + reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \ + reward_model.use_reward_loop=False \ + reward_model.model.use_remove_padding=True \ + reward_model.model.fsdp_config.param_offload=True \ + reward_model.micro_batch_size_per_gpu=32 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_test_qwen25_rm' \ + trainer.val_before_train=True \ + trainer.experiment_name='legacy_fsdp_reward_model' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + 
trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 $@ diff --git a/examples/ppo_trainer/run_qwen2.5-3b_rm_reward_loop_colocate.sh b/examples/ppo_trainer/run_qwen2.5-3b_rm_reward_loop_colocate.sh new file mode 100644 index 00000000000..d9d66f6f695 --- /dev/null +++ b/examples/ppo_trainer/run_qwen2.5-3b_rm_reward_loop_colocate.sh @@ -0,0 +1,66 @@ +# download datasets and models +# python3 examples/data_preprocess/gsm8k.py +# python3 examples/data_preprocess/math_dataset.py +# huggingface-cli download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B +# huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + 
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.optim.lr_warmup_steps_ratio=0.05 \ + critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + reward_model.enable=True \ + reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=4096 \ + reward_model.rollout.response_length=4096 \ + reward_model.num_workers=8 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_test_qwen25_rm' \ + trainer.val_before_train=False \ + trainer.experiment_name='reward_loop_colocate_reward_model' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 $@ diff --git a/examples/ppo_trainer/run_qwen3-8b_npu.sh b/examples/ppo_trainer/run_qwen3-8b_npu.sh index 40fb751e62c..a0ada0eb388 100644 --- a/examples/ppo_trainer/run_qwen3-8b_npu.sh +++ b/examples/ppo_trainer/run_qwen3-8b_npu.sh @@ -49,7 +49,6 @@ python3 -m verl.trainer.main_ppo \ trainer.save_freq=20 \ trainer.test_freq=-1 \ trainer.val_before_train=False \ - trainer.device=npu \ trainer.max_actor_ckpt_to_keep=1 \ trainer.max_critic_ckpt_to_keep=1 \ trainer.total_training_steps=100 $@ \ No newline at end of file diff --git a/examples/rollout_correction/run_with_rollout_corr.sh b/examples/rollout_correction/run_with_rollout_corr.sh index 15a004eefa8..e6b1061a2b3 100755 --- 
a/examples/rollout_correction/run_with_rollout_corr.sh +++ b/examples/rollout_correction/run_with_rollout_corr.sh @@ -25,9 +25,9 @@ rollout_rs_threshold_lower="null" # RS lower threshold # Veto mechanism (optional, independent of IS/RS) rollout_token_veto_threshold="null" # Per-token veto threshold (null to disable) -# Policy Gradient loss mode (bypass mode with policy gradient loss, no PPO clipping) -bypass_mode="true" # Required for policy gradient mode -use_policy_gradient="true" # Use policy gradient loss (works with IS/RS/both) +# Bypass mode with REINFORCE loss (no PPO clipping) +bypass_mode="true" # Skip old_log_prob computation +loss_type="reinforce" # REINFORCE with explicit IS weights (alternative: "ppo_clip") # ============================================================================== # Model and Data Configuration @@ -76,7 +76,7 @@ python3 -m verl.trainer.main_ppo \ algorithm.rollout_correction.rollout_rs_threshold_lower=${rollout_rs_threshold_lower} \ algorithm.rollout_correction.rollout_token_veto_threshold=${rollout_token_veto_threshold} \ algorithm.rollout_correction.bypass_mode=${bypass_mode} \ - algorithm.rollout_correction.use_policy_gradient=${use_policy_gradient} \ + algorithm.rollout_correction.loss_type=${loss_type} \ actor_rollout_ref.model.path="${MODEL_PATH}" \ actor_rollout_ref.actor.optim.lr=${learning_rate} \ actor_rollout_ref.actor.ppo_mini_batch_size=${ppo_mini_batch_size} \ @@ -95,7 +95,7 @@ echo " - Algorithm: RLOO (REINFORCE Leave-One-Out)" echo " - Advantage estimator: ${adv_estimator}" echo " - IS mode: ${rollout_is} (self-normalized: ${rollout_is_batch_normalize})" echo " - IS threshold: ${rollout_is_threshold}" -echo " - Policy gradient mode: ${use_policy_gradient} (bypass: ${bypass_mode})" +echo " - Bypass mode: ${bypass_mode}, loss_type: ${loss_type}" echo "" echo "Monitor these key metrics in wandb:" echo " - rollout_corr/rollout_is_mean (should be ~1.0 before batch norm)" diff --git 
a/examples/sft/gsm8k/run_qwen3_8b_sft_peft_sp2_npu.sh b/examples/sft/gsm8k/run_qwen3_8b_sft_peft_sp2_npu.sh index 720e2340838..7de7ebd67e4 100644 --- a/examples/sft/gsm8k/run_qwen3_8b_sft_peft_sp2_npu.sh +++ b/examples/sft/gsm8k/run_qwen3_8b_sft_peft_sp2_npu.sh @@ -32,5 +32,4 @@ torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ model.target_modules=all-linear \ model.strategy=fsdp \ ulysses_sequence_parallel_size=2 \ - use_remove_padding=true \ - trainer.device=npu + use_remove_padding=true diff --git a/examples/sft/vlm/run_qwen3_vl_2b.sh b/examples/sft/vlm/run_qwen3_vl_2b.sh new file mode 100644 index 00000000000..28c21ffa049 --- /dev/null +++ b/examples/sft/vlm/run_qwen3_vl_2b.sh @@ -0,0 +1,100 @@ +#!/usr/bin/env bash +# python examples/data_preprocess/pokemon.py +set -xeuo pipefail + +HDFS_ROOT=${HDFS_ROOT:-$PWD} +DATA_ROOT=${DATA_ROOT:-$PWD} + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"} + +TRAIN_FILES=${HOME}/data/pokemon-gpt4o-captions/train.parquet + +backend=${BACKEND:-fsdp} + +project_name=verl_sft_test + +RESUME_MODE=auto +MODEL_ID=${HDFS_ROOT}/model/Qwen3-VL-2B-Instruct +# MODEL_ID=${HDFS_ROOT}/model/Qwen3-VL-30B-A3B-Instruct + +SP_SIZE=${SP_SIZE:-2} +FSDP_SIZE=${FSDP_SIZE:--1} +FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"} + +TP_SIZE=${TP_SIZE:-2} +PP_SIZE=${PP_SIZE:-2} +VPP_SIZE=${VPP_SIZE:-null} +CP_SIZE=${CP_SIZE:-1} + +PAD_MODE=${PAD_MODE:-no_padding} + +USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True} + +FSDP_ENGINE_CONFIG="\ + engine=${backend} \ + optim=${backend} \ + optim.lr=2e-5 \ + optim.lr_warmup_steps_ratio=0.01 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.min_lr_ratio=0.1 \ + optim.warmup_style=cosine \ + engine.ulysses_sequence_parallel_size=${SP_SIZE} \ + engine.strategy=${FSDP_STRATEGY} \ + engine.fsdp_size=${FSDP_SIZE}" + + +MEGATRON_ENGINE_CONFIG="\ + engine=${backend} \ + optim=${backend} \ + optim.lr=2e-5 \ + optim.lr_warmup_steps_ratio=0.01 \ + 
optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.lr_warmup_init=0 \ + optim.lr_decay_style=cosine \ + optim.min_lr=2e-6 \ + engine.tensor_model_parallel_size=${TP_SIZE} \ + engine.pipeline_model_parallel_size=${PP_SIZE} \ + engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \ + engine.context_parallel_size=${CP_SIZE} \ + engine.use_mbridge=True \ + engine.vanilla_mbridge=True" + +if [ "$backend" = "fsdp" ]; then + ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" + echo "Using fsdp engine" + exp_name=pokemon-qwen3-2b-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE}-fsdp-1202a1 +else + ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG" + echo "Using megatron engine" + exp_name=pokemon-qwen3-2b-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE}-megatron-1202a1 +fi + +CKPT_HOME=${CKPT_HOME:-$HOME/open_verl/sft/${project_name}/${exp_name}} +mkdir -p "${CKPT_HOME}" + +torchrun --standalone --nnodes=1 --nproc-per-node=${NUM_TRAINERS:-8} \ + ${ENTRYPOINT} \ + data.train_files="${TRAIN_FILES}" \ + data.train_batch_size=96 \ + data.max_length=2048 \ + data.pad_mode=${PAD_MODE} \ + data.truncation=error \ + data.use_dynamic_bsz=True \ + data.max_token_len_per_gpu=65536 \ + model.path=$MODEL_ID \ + model.use_remove_padding=${USE_REMOVE_PADDING} \ + ${ENGINE_CONFIG} \ + trainer.test_freq=-1 \ + trainer.save_freq=4000 \ + trainer.logger=['console','wandb'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.total_epochs=10 \ + trainer.default_local_dir="${CKPT_HOME}" \ + trainer.resume_mode=${RESUME_MODE} \ + trainer.max_ckpt_to_keep=5 \ + checkpoint.save_contents=[model,optimizer,extra] \ No newline at end of file diff --git a/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_vllm_fsdp.sh b/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_vllm_fsdp.sh index b1be3bf56cb..cf5e065097f 100644 --- a/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_vllm_fsdp.sh +++ 
b/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_vllm_fsdp.sh @@ -42,7 +42,6 @@ python3 -m verl.trainer.main_ppo \ trainer.critic_warmup=0 \ trainer.project_name='gsm8k_async_rl' \ trainer.experiment_name='qwen2.5-3b_function_rm-gsm8k-sgl-multi-w-tool-verify-n16' \ - trainer.device=npu \ trainer.n_gpus_per_node=16 \ trainer.nnodes=1 \ trainer.save_freq=-1 \ diff --git a/recipe/dapo/main_dapo.py b/recipe/dapo/main_dapo.py index 303c8edbf4b..870ce30b54f 100644 --- a/recipe/dapo/main_dapo.py +++ b/recipe/dapo/main_dapo.py @@ -22,23 +22,25 @@ import ray from omegaconf import OmegaConf +from verl.trainer.constants_ppo import get_ppo_ray_runtime_env from verl.trainer.ppo.reward import load_reward_manager -from verl.utils.device import is_cuda_available +from verl.utils.device import auto_set_ascend_device_name, is_cuda_available from .dapo_ray_trainer import RayDAPOTrainer @hydra.main(config_path="config", config_name="dapo_trainer", version_base=None) def main(config): + # Automatically set `config.trainer.device = npu` when running on Ascend NPU. 
+ auto_set_ascend_device_name(config) + run_ppo(config) def run_ppo(config) -> None: if not ray.is_initialized(): # this is for local ray cluster - default_runtime_env = { - "env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN", "VLLM_LOGGING_LEVEL": "WARN"} - } + default_runtime_env = get_ppo_ray_runtime_env() ray_init_kwargs = config.ray_kwargs.get("ray_init", {}) runtime_env_kwargs = ray_init_kwargs.get("runtime_env", {}) runtime_env = OmegaConf.merge(default_runtime_env, runtime_env_kwargs) diff --git a/recipe/dapo/run_dapo_qwen2.5_32b_npu.sh b/recipe/dapo/run_dapo_qwen2.5_32b_npu.sh index 0e99b5fa6fd..bce3ab8eca6 100644 --- a/recipe/dapo/run_dapo_qwen2.5_32b_npu.sh +++ b/recipe/dapo/run_dapo_qwen2.5_32b_npu.sh @@ -135,7 +135,6 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ trainer.save_freq=20 \ trainer.total_epochs=1 \ trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.device=npu \ trainer.resume_mode=auto \ actor_rollout_ref.actor.fsdp_config.forward_prefetch=True \ actor_rollout_ref.ref.fsdp_config.forward_prefetch=True \ diff --git a/recipe/dapo/run_dapo_qwen2.5_7b_npu.sh b/recipe/dapo/run_dapo_qwen2.5_7b_npu.sh index bd6b3689b23..834ab21fa6d 100644 --- a/recipe/dapo/run_dapo_qwen2.5_7b_npu.sh +++ b/recipe/dapo/run_dapo_qwen2.5_7b_npu.sh @@ -133,7 +133,6 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ trainer.save_freq=20 \ trainer.total_epochs=1 \ trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.device=npu \ trainer.resume_mode=auto \ actor_rollout_ref.actor.entropy_checkpointing=True \ actor_rollout_ref.ref.entropy_checkpointing=True \ diff --git a/recipe/dapo/run_dapo_qwen3_14b_base_npu.sh b/recipe/dapo/run_dapo_qwen3_14b_base_npu.sh index 3c8a9e9d5a6..9e0fdae374c 100644 --- a/recipe/dapo/run_dapo_qwen3_14b_base_npu.sh +++ b/recipe/dapo/run_dapo_qwen3_14b_base_npu.sh @@ -136,5 +136,4 @@ ray job submit --runtime-env="${RUNTIME_ENV}" \ actor_rollout_ref.actor.entropy_checkpointing=True \ 
actor_rollout_ref.ref.entropy_checkpointing=True \ actor_rollout_ref.actor.fsdp_config.forward_prefetch=True \ - actor_rollout_ref.ref.fsdp_config.forward_prefetch=True \ - trainer.device=npu + actor_rollout_ref.ref.fsdp_config.forward_prefetch=True diff --git a/recipe/dapo/run_dapo_qwen3_8b_base_npu.sh b/recipe/dapo/run_dapo_qwen3_8b_base_npu.sh index 29c7fd66b6c..9c34fa7e06c 100644 --- a/recipe/dapo/run_dapo_qwen3_8b_base_npu.sh +++ b/recipe/dapo/run_dapo_qwen3_8b_base_npu.sh @@ -135,5 +135,4 @@ ray job submit --runtime-env="${RUNTIME_ENV}" \ actor_rollout_ref.actor.entropy_checkpointing=True \ actor_rollout_ref.ref.entropy_checkpointing=True \ actor_rollout_ref.actor.fsdp_config.forward_prefetch=True \ - actor_rollout_ref.ref.fsdp_config.forward_prefetch=True \ - trainer.device=npu + actor_rollout_ref.ref.fsdp_config.forward_prefetch=True diff --git a/recipe/dapo/run_dapo_qwen3_moe_30b_base_fsdp_npu.sh b/recipe/dapo/run_dapo_qwen3_moe_30b_base_fsdp_npu.sh index d399ddfe8b7..52fb0b4e6a5 100644 --- a/recipe/dapo/run_dapo_qwen3_moe_30b_base_fsdp_npu.sh +++ b/recipe/dapo/run_dapo_qwen3_moe_30b_base_fsdp_npu.sh @@ -138,7 +138,6 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ trainer.test_freq=5 \ trainer.save_freq=-1 \ trainer.total_epochs=1 \ - trainer.device="npu" \ actor_rollout_ref.actor.use_torch_compile=False \ actor_rollout_ref.ref.use_torch_compile=False diff --git a/recipe/dapo/run_dapo_qwen3_moe_30b_megatron_npu.sh b/recipe/dapo/run_dapo_qwen3_moe_30b_megatron_npu.sh index 9e8d21d8890..24624275929 100644 --- a/recipe/dapo/run_dapo_qwen3_moe_30b_megatron_npu.sh +++ b/recipe/dapo/run_dapo_qwen3_moe_30b_megatron_npu.sh @@ -160,7 +160,6 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ trainer.save_freq=-1 \ trainer.total_epochs=1 \ trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.device="npu" \ actor_rollout_ref.nccl_timeout=14400 \ actor_rollout_ref.actor.use_torch_compile=False \ actor_rollout_ref.ref.use_torch_compile=False \ 
diff --git a/recipe/fapo/README.md b/recipe/fapo/README.md index 485072c409f..4401bbc4f7a 100644 --- a/recipe/fapo/README.md +++ b/recipe/fapo/README.md @@ -78,3 +78,12 @@ bash recipe/fapo/run_fapo_32b.sh # 32b fapo model We implement RewardLoop to enable efficient and flexible reward computation. The core implementation can be found in `verl/experimental/reward/`. Refer to [this official document](https://verl.readthedocs.io/en/latest/advance/reward_loop.html) for more implementation details. + +```bibtex +@article{ding2025fapo, + title={FAPO: Flawed-Aware Policy Optimization for Efficient and Reliable Reasoning}, + author={Ding, Yuyang and Zhang, Chi and Li, Juntao and Lin, Haibin and Liu, Xin and Zhang, Min}, + journal={arXiv preprint arXiv:2510.22543}, + year={2025} +} +``` \ No newline at end of file diff --git a/recipe/fapo/run_baseline_32b.sh b/recipe/fapo/run_baseline_32b.sh index 3bb14bed7e3..f788066b5c5 100644 --- a/recipe/fapo/run_baseline_32b.sh +++ b/recipe/fapo/run_baseline_32b.sh @@ -53,15 +53,10 @@ offload=True gen_tp=4 fsdp_size=32 -PROJECT_DIR="$(pwd)" -CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config" - ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --address "${RAY_ADDRESS}" \ --working-dir "${WORKING_DIR}" \ -- python3 -m verl.trainer.main_ppo \ - --config-path $CONFIG_PATH \ - --config-name rm_config.yaml \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git a/recipe/fapo/run_baseline_7b.sh b/recipe/fapo/run_baseline_7b.sh index b39014f1e19..77605b1bbac 100644 --- a/recipe/fapo/run_baseline_7b.sh +++ b/recipe/fapo/run_baseline_7b.sh @@ -54,15 +54,10 @@ offload=True gen_tp=1 fsdp_size=8 -PROJECT_DIR="$(pwd)" -CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config" - ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --address "${RAY_ADDRESS}" \ --working-dir "${WORKING_DIR}" \ -- python3 -m verl.trainer.main_ppo \ - --config-path $CONFIG_PATH \ - --config-name rm_config.yaml \ 
data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git a/recipe/fapo/run_fapo_32b.sh b/recipe/fapo/run_fapo_32b.sh index bb442b76e4e..f458070c4a7 100644 --- a/recipe/fapo/run_fapo_32b.sh +++ b/recipe/fapo/run_fapo_32b.sh @@ -55,15 +55,10 @@ offload=True gen_tp=4 fsdp_size=32 -PROJECT_DIR="$(pwd)" -CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config" - ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --address "${RAY_ADDRESS}" \ --working-dir "${WORKING_DIR}" \ -- python3 -m verl.trainer.main_ppo \ - --config-path $CONFIG_PATH \ - --config-name rm_config.yaml \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git a/recipe/fapo/run_fapo_32b_remote.sh b/recipe/fapo/run_fapo_32b_remote.sh index 748d1bbf0ed..8833f109138 100644 --- a/recipe/fapo/run_fapo_32b_remote.sh +++ b/recipe/fapo/run_fapo_32b_remote.sh @@ -53,15 +53,10 @@ offload=True gen_tp=4 fsdp_size=32 -PROJECT_DIR="$(pwd)" -CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config" - ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --address "${RAY_ADDRESS}" \ --working-dir "${WORKING_DIR}" \ -- python3 -m verl.trainer.main_ppo \ - --config-path $CONFIG_PATH \ - --config-name rm_config.yaml \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git a/recipe/fapo/run_fapo_7b.sh b/recipe/fapo/run_fapo_7b.sh index 046b2b26873..96884d94e9b 100644 --- a/recipe/fapo/run_fapo_7b.sh +++ b/recipe/fapo/run_fapo_7b.sh @@ -56,15 +56,10 @@ offload=True gen_tp=1 fsdp_size=8 -PROJECT_DIR="$(pwd)" -CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config" - ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --address "${RAY_ADDRESS}" \ --working-dir "${WORKING_DIR}" \ -- python3 -m verl.trainer.main_ppo \ - --config-path $CONFIG_PATH \ - --config-name rm_config.yaml \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git 
a/recipe/fapo/run_fapo_7b_remote.sh b/recipe/fapo/run_fapo_7b_remote.sh index 1bd757bfc99..663e10c385b 100644 --- a/recipe/fapo/run_fapo_7b_remote.sh +++ b/recipe/fapo/run_fapo_7b_remote.sh @@ -54,15 +54,10 @@ offload=True gen_tp=1 fsdp_size=8 -PROJECT_DIR="$(pwd)" -CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config" - ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --address "${RAY_ADDRESS}" \ --working-dir "${WORKING_DIR}" \ -- python3 -m verl.trainer.main_ppo \ - --config-path $CONFIG_PATH \ - --config-name rm_config.yaml \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index 676aa524c3d..d486579596f 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -27,8 +27,8 @@ AgentLoopOutput, AgentLoopWorkerBase, AsyncLLMServerManager, + DictConfigWrap, _agent_loop_registry, - _DummyConfig, get_trajectory_info, ) from verl.experimental.agent_loop.prometheus_utils import update_prometheus_config @@ -182,7 +182,7 @@ async def _partial_run_agent_loop( agent_loop_config = _agent_loop_registry[agent_name] agent_loop = hydra.utils.instantiate( config=agent_loop_config, - trainer_config=_DummyConfig(config=self.config), + trainer_config=DictConfigWrap(config=self.config), server_manager=self.server_manager, tokenizer=self.tokenizer, processor=self.processor, diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index c185e34c795..95fff9c478f 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ import asyncio import os import time diff --git a/recipe/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh b/recipe/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh new file mode 100644 index 00000000000..c061ad2b1ee --- /dev/null +++ b/recipe/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh @@ -0,0 +1,191 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO-Qwen3-30B-A3B-Base-Async' +exp_name='Fsdp2-tp4sp4' + +# Ray +RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +WORKING_DIR=${WORKING_DIR:-"${PWD}"} +RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +DATA_PATH=${RAY_DATA_HOME:-"${HOME}/verl"} +DATA_PATH=${DATA_PATH:-"/mnt/bn/${BYTENAS}"} +# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${DATA_PATH}/shared/models/Qwen3-30B-A3B-Base"} +CKPTS_DIR=${CKPTS_DIR:-"${DATA_PATH}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${DATA_PATH}/shared/data/dapo-math/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${DATA_PATH}/shared/data/dapo-math/aime-2024.parquet"} + + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 20)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" +enable_filter_groups=True +filter_groups_metric=acc +max_num_gen_batches=10 + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + + +NNODES=${NNODES:-4} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +# Fully async specific 
parameters +n_gpus_rollout=8 +n_gpus_training=8 +n_nodes_rollout=2 +n_nodes_train=2 # $((NNODES - n_nodes_rollout)) + +train_bsz=512 +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((train_bsz * 400))) +test_freq=25 +staleness_threshold=0.6 # 0 0.3 1 +require_batches=1 +total_train_gpus=$((n_gpus_training * n_nodes_train)) +total_rollout_gpus=$((n_gpus_rollout * n_nodes_rollout)) +trigger_parameter_sync_step=$((train_bsz / ( train_prompt_mini_bsz * require_batches))) # 8 16 32 +partial_rollout=True +enforce_eager=False +nccl_timeout=72000 +enable_sleep_mode=False + +# Performance Related Parameter +sp_size=4 +use_dynamic_bsz=True +actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) +infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) +ref_offload=True +actor_offload=False +gen_tp=4 +fsdp_size=-1 + + +ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ + --working-dir "${WORKING_DIR}" \ + --address "${RAY_ADDRESS}" \ + -- python3 -m recipe.fully_async_policy.fully_async_main \ + --config-path=config \ + --config-name='fully_async_dapo_trainer.yaml' \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + actor_rollout_ref.actor.strategy=fsdp \ + critic.strategy=fsdp \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + actor_rollout_ref.nccl_timeout=${nccl_timeout} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ 
+ actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.50 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + +actor_rollout_ref.rollout.enable_sleep_mode=${enable_sleep_mode} \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.enforce_eager=${enforce_eager} \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + 
actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + reward_model.reward_manager=dapo \ + reward_model.overlong_buffer.enable=${enable_overlong_buffer} \ + reward_model.overlong_buffer.len=${overlong_buffer_len} \ + reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','wandb'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}-i${total_rollout_gpus}_t${total_train_gpus}_s${staleness_threshold}" \ + trainer.val_before_train=True \ + trainer.test_freq="${test_freq}" \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${n_nodes_train}" \ + trainer.n_gpus_per_node="${n_gpus_training}" \ + rollout.nnodes="${n_nodes_rollout}" \ + rollout.n_gpus_per_node="${n_gpus_rollout}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.test_freq=${test_freq} \ + rollout.total_epochs=10 \ + async_training.require_batches=${require_batches} \ + async_training.staleness_threshold="${staleness_threshold}" \ + 
async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" \ + async_training.use_rollout_log_probs=True diff --git a/recipe/one_step_off_policy/README.md b/recipe/one_step_off_policy/README.md index c698e2cf178..2cb0b9b85e8 100644 --- a/recipe/one_step_off_policy/README.md +++ b/recipe/one_step_off_policy/README.md @@ -215,7 +215,7 @@ def sync_rollout_weights(self): ### PPO Correctness To ensure the correctness of the PPO algorithm, we use rollout log_probs for PPO importance sampling. For the related algorithm details, please refer to: https://verl.readthedocs.io/en/latest/algo/rollout_corr_math.html -The default mode is ppo_is_bypass, but other modification strategies can also be explored. +The default mode is `bypass_ppo_clip`, but other modification strategies can also be explored. ### AgentLoop In the current implementation, we no longer provide SPMD model rollout mode. @@ -297,9 +297,6 @@ python3 -m recipe.one_step_off_policy.async_main_ppo \ > - When `trainer.n_gpus_per_node + rollout.n_gpus_per_node > physical_gpus_per_node`, > the required node count is `trainer.nnodes + rollout.nnodes` -3. 
**Ascend NPU Configuration** - If you are using Ascend NPU devices, add the following parameter: - - `trainer.device=npu` ## Functional Support diff --git a/recipe/one_step_off_policy/main_ppo.py b/recipe/one_step_off_policy/main_ppo.py index fa62f7c7500..c24b4d01774 100644 --- a/recipe/one_step_off_policy/main_ppo.py +++ b/recipe/one_step_off_policy/main_ppo.py @@ -30,6 +30,7 @@ from verl.trainer.ppo.reward import load_reward_manager from verl.trainer.ppo.utils import Role, need_reference_policy from verl.utils.config import validate_config +from verl.utils.device import auto_set_ascend_device_name def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager: @@ -222,6 +223,10 @@ def main(config): from verl.trainer.main_ppo import run_ppo start_time = time() + + # Automatically set `config.trainer.device = npu` when running on Ascend NPU. + auto_set_ascend_device_name(config) + run_ppo(config, task_runner_class=OneStepTaskRunner) print(f"total time: {time() - start_time:.2f} seconds") diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py index 76415016743..c3890f61bb9 100644 --- a/recipe/one_step_off_policy/ray_trainer.py +++ b/recipe/one_step_off_policy/ray_trainer.py @@ -531,9 +531,9 @@ async def fit(self): rollout_corr_config = self.config.algorithm.get("rollout_correction", None) bypass_recomputing_logprobs = rollout_corr_config and rollout_corr_config.get("bypass_mode", False) if bypass_recomputing_logprobs: # Use `rollout_log_probs` - from verl.trainer.ppo.rollout_corr_helper import apply_rollout_correction + from verl.trainer.ppo.rollout_corr_helper import apply_bypass_mode - apply_rollout_correction( + apply_bypass_mode( batch=batch, rollout_corr_config=rollout_corr_config, policy_loss_config=self.config.actor_rollout_ref.actor.policy_loss, diff --git a/recipe/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh b/recipe/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh 
index e06f65d82e4..9fcddfe246e 100644 --- a/recipe/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh +++ b/recipe/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh @@ -87,7 +87,6 @@ python3 -m recipe.one_step_off_policy.main_ppo \ trainer.save_freq=10 \ trainer.test_freq=-1 \ trainer.total_epochs=15 \ - trainer.device=npu \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ diff --git a/recipe/open_math_reasoning/run_sft_qwen3_8b.sh b/recipe/open_math_reasoning/run_sft_qwen3_8b.sh index 3b7e9bb5c6c..ec564a1d602 100644 --- a/recipe/open_math_reasoning/run_sft_qwen3_8b.sh +++ b/recipe/open_math_reasoning/run_sft_qwen3_8b.sh @@ -55,7 +55,7 @@ MEGATRON_ENGINE_CONFIG="\ engine.pipeline_model_parallel_size=${PP_SIZE} \ engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \ engine.context_parallel_size=${CP_SIZE} \ - engine.use_mbridge=False" + engine.use_mbridge=True" if [ "$backend" = "fsdp" ]; then ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" diff --git a/recipe/r1_ascend/main_ppo.py b/recipe/r1_ascend/main_ppo.py index 57d9b9796a8..2c3614de460 100644 --- a/recipe/r1_ascend/main_ppo.py +++ b/recipe/r1_ascend/main_ppo.py @@ -27,7 +27,7 @@ from verl.trainer.constants_ppo import get_ppo_ray_runtime_env from verl.trainer.main_ppo import TaskRunner as TaskRunnerBase -from verl.utils.device import is_cuda_available +from verl.utils.device import auto_set_ascend_device_name, is_cuda_available logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -40,6 +40,9 @@ def main(config): Args: config_dict: Hydra configuration dictionary containing training parameters. """ + # Automatically set `config.trainer.device = npu` when running on Ascend NPU. 
+ auto_set_ascend_device_name(config) + run_ppo(config) diff --git a/recipe/r1_ascend/run_deepseekv3_671b_grpo_megatron_npu.sh b/recipe/r1_ascend/run_deepseekv3_671b_grpo_megatron_npu.sh index 2bade83d4c1..44ac1e2e57a 100644 --- a/recipe/r1_ascend/run_deepseekv3_671b_grpo_megatron_npu.sh +++ b/recipe/r1_ascend/run_deepseekv3_671b_grpo_megatron_npu.sh @@ -105,7 +105,6 @@ python3 -m recipe.r1_ascend.main_ppo \ trainer.test_freq=5 \ trainer.save_freq=-1 \ trainer.total_epochs=1 \ - trainer.device="npu" \ +actor_rollout_ref.actor.megatron.override_transformer_config.multi_head_latent_attention=True \ +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True \ +actor_rollout_ref.actor.megatron.override_transformer_config.pipeline_num_transformer_layers=[[6],[8],[8],[8],[8],[8],[8],[7]] \ diff --git a/recipe/spin/utils.py b/recipe/spin/utils.py index 571ad1e9154..e3855f64541 100644 --- a/recipe/spin/utils.py +++ b/recipe/spin/utils.py @@ -92,7 +92,11 @@ def check_mutually_exclusive(mbs, mbs_per_gpu, name: str): ) # Check for reward model micro-batch size conflicts - if config.reward_model.enable and not config.reward_model.use_dynamic_bsz: + if ( + config.reward_model.enable + and not config.reward_model.use_dynamic_bsz + and not config.reward_model.use_reward_loop + ): check_mutually_exclusive( config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model" ) diff --git a/recipe/transfer_queue/main_ppo.py b/recipe/transfer_queue/main_ppo.py index 4f982795eeb..236d59d51fb 100644 --- a/recipe/transfer_queue/main_ppo.py +++ b/recipe/transfer_queue/main_ppo.py @@ -33,7 +33,7 @@ from verl.trainer.ppo.reward import load_reward_manager from verl.trainer.ppo.utils import need_critic, need_reference_policy from verl.utils.config import validate_config -from verl.utils.device import is_cuda_available +from verl.utils.device import auto_set_ascend_device_name, is_cuda_available from .ray_trainer import RayPPOTrainer @@ 
-45,6 +45,9 @@ def main(config): Args: config_dict: Hydra configuration dictionary containing training parameters. """ + # Automatically set `config.trainer.device = npu` when running on Ascend NPU. + auto_set_ascend_device_name(config) + run_ppo(config) diff --git a/recipe/transfer_queue/ray_trainer.py b/recipe/transfer_queue/ray_trainer.py index 2acef1f84af..b3e7597cf4b 100644 --- a/recipe/transfer_queue/ray_trainer.py +++ b/recipe/transfer_queue/ray_trainer.py @@ -1315,15 +1315,10 @@ def fit(self): batch_dict, repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True ) batch: TensorDict = self.dict_to_tensordict(repeated_batch_dict) - asyncio.run(self.tq_client.async_put(data=batch, partition_id=f"train_{self.global_steps - 1}")) - gen_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=list(batch.keys()), # TODO (TQ): Get metadata by specified fields - task_name="generate_sequences", - **base_get_meta_kwargs, - ) + self.tq_client.async_put(data=batch, partition_id=f"train_{self.global_steps - 1}") ) + # pass global_steps to trace gen_meta.set_extra_info("global_steps", self.global_steps) @@ -1411,14 +1406,9 @@ def fit(self): ] if "rm_scores" in batch_meta.field_names: compute_reward_fields.append("rm_scores") - compute_reward_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=compute_reward_fields, - task_name="compute_reward", - **base_get_meta_kwargs, - ) - ) - compute_reward_meta.reorder(balanced_idx) + + compute_reward_meta = batch_meta.select_fields(compute_reward_fields) + if self.config.reward_model.launch_reward_fn_async: future_reward = compute_reward_async_decorated( data=compute_reward_meta, @@ -1432,31 +1422,26 @@ def fit(self): # recompute old_log_probs with marked_timer("old_log_prob", timing_raw, color="blue"): - old_log_prob_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=[ - "input_ids", - "attention_mask", - "position_ids", - "prompts", - "responses", - "response_mask", - 
"data_source", - "reward_model", - "extra_info", - "uid", - "index", - "tools_kwargs", - "interaction_kwargs", - "ability", - ], - task_name="compute_log_prob", - **base_get_meta_kwargs, - ) - ) - old_log_prob_meta.reorder(balanced_idx) - + old_log_prob_meta_fields = [ + "input_ids", + "attention_mask", + "position_ids", + "prompts", + "responses", + "response_mask", + "data_source", + "reward_model", + "extra_info", + "uid", + "index", + "tools_kwargs", + "interaction_kwargs", + "ability", + ] + old_log_prob_meta = batch_meta.select_fields(old_log_prob_meta_fields) old_log_prob_output_meta = self.actor_rollout_wg.compute_log_prob(old_log_prob_meta) + batch_meta = batch_meta.union(old_log_prob_output_meta) + data = asyncio.run(self.tq_client.async_get_data(old_log_prob_output_meta)) entropys = data["entropys"] response_masks = data["response_mask"] @@ -1470,52 +1455,39 @@ def fit(self): old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()} metrics.update(old_log_prob_metrics) - batch_meta = batch_meta.union(old_log_prob_output_meta) - if "rollout_log_probs" in batch_meta.field_names: # TODO: we may want to add diff of probs too. 
- data_fields = ["rollout_log_probs", "old_log_probs", "responses"] + calculate_debug_metrics_fields = ["rollout_log_probs", "old_log_probs", "responses"] + if "response_mask" in batch_meta.field_names: - data_fields.append("response_mask") + calculate_debug_metrics_fields.append("response_mask") if "attention_mask" in batch_meta.field_names: - data_fields.append("attention_mask") - calculate_debug_metrics_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=data_fields, - task_name="calculate_debug_metrics", - **base_get_meta_kwargs, - ) - ) - calculate_debug_metrics_meta.reorder(balanced_idx) + calculate_debug_metrics_fields.append("attention_mask") + calculate_debug_metrics_meta = batch_meta.select_fields(calculate_debug_metrics_fields) metrics.update(calculate_debug_metrics_decorated(calculate_debug_metrics_meta)) if self.use_reference_policy: # compute reference log_prob - ref_log_prob_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=[ - "input_ids", - "attention_mask", - "position_ids", - "prompts", - "responses", - "response_mask", - "old_log_probs", - "data_source", - "reward_model", - "extra_info", - "uid", - "index", - "tools_kwargs", - "interaction_kwargs", - "ability", - ], - task_name="compute_ref_log_prob", - **base_get_meta_kwargs, - ) - ) - ref_log_prob_meta.reorder(balanced_idx) + ref_log_prob_fields = [ + "input_ids", + "attention_mask", + "position_ids", + "prompts", + "responses", + "response_mask", + "old_log_probs", + "data_source", + "reward_model", + "extra_info", + "uid", + "index", + "tools_kwargs", + "interaction_kwargs", + "ability", + ] + ref_log_prob_meta = batch_meta.select_fields(ref_log_prob_fields) + with marked_timer("ref", timing_raw, color="olive"): if not self.ref_in_actor: ref_log_prob_output_meta = self.ref_policy_wg.compute_ref_log_prob(ref_log_prob_meta) @@ -1535,14 +1507,14 @@ def fit(self): if self.config.reward_model.launch_reward_fn_async: reward_tensor, reward_extra_infos_dict = 
ray.get(future_reward) reward_td = TensorDict({"token_level_scores": reward_tensor}, batch_size=reward_tensor.size(0)) - asyncio.run(self.tq_client.async_put(data=reward_td, metadata=batch_meta)) - batch_meta.add_fields(reward_td) + batch_meta = asyncio.run(self.tq_client.async_put(data=reward_td, metadata=batch_meta)) if reward_extra_infos_dict: reward_extra_infos_dict_new = {k: np.array(v) for k, v in reward_extra_infos_dict.items()} reward_extra_infos_td = self.dict_to_tensordict(reward_extra_infos_dict_new) - asyncio.run(self.tq_client.async_put(data=reward_extra_infos_td, metadata=batch_meta)) - batch_meta.add_fields(reward_extra_infos_td) + batch_meta = asyncio.run( + self.tq_client.async_put(data=reward_extra_infos_td, metadata=batch_meta) + ) # compute rewards. apply_kl_penalty if available if self.config.algorithm.use_kl_in_reward: @@ -1552,14 +1524,9 @@ def fit(self): "old_log_probs", "ref_log_prob", ] - apply_kl_penalty_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=apply_kl_penalty_fields, - task_name="apply_kl_penalty", - **base_get_meta_kwargs, - ) - ) - apply_kl_penalty_meta.reorder(balanced_idx) + + apply_kl_penalty_meta = batch_meta.select_fields(apply_kl_penalty_fields) + token_level_rewards, kl_metrics = apply_kl_penalty( apply_kl_penalty_meta, kl_ctrl=self.kl_ctrl_in_reward, @@ -1568,31 +1535,24 @@ def fit(self): token_level_rewards_td = TensorDict( {"token_level_rewards": token_level_rewards}, batch_size=token_level_rewards.size(0) ) - asyncio.run( + apply_kl_penalty_meta = asyncio.run( self.tq_client.async_put(data=token_level_rewards_td, metadata=apply_kl_penalty_meta) ) - apply_kl_penalty_meta.add_fields(token_level_rewards_td) metrics.update(kl_metrics) batch_meta = batch_meta.union(apply_kl_penalty_meta) else: - token_level_scores_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=["token_level_scores"], - task_name="token_level_scores", - **base_get_meta_kwargs, - ) - ) - 
token_level_scores_meta.reorder(balanced_idx) + token_level_scores_meta = batch_meta.select_fields(["token_level_scores"]) + data = asyncio.run(self.tq_client.async_get_data(token_level_scores_meta)) token_level_rewards_td = TensorDict( {"token_level_rewards": data["token_level_scores"]}, batch_size=data["token_level_scores"].size(0), ) - asyncio.run( + token_level_scores_meta = asyncio.run( self.tq_client.async_put(data=token_level_rewards_td, metadata=token_level_scores_meta) ) - batch_meta.add_fields(token_level_rewards_td) + batch_meta = batch_meta.union(token_level_scores_meta) # compute advantages, executed on the driver process @@ -1617,14 +1577,7 @@ def fit(self): if "reward_baselines" in batch_meta.field_names: compute_advantage_fields.append("reward_baselines") - compute_advantage_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=compute_advantage_fields, - task_name="compute_advantage", - **base_get_meta_kwargs, - ) - ) - compute_advantage_meta.reorder(balanced_idx) + compute_advantage_meta = batch_meta.select_fields(compute_advantage_fields) advantages, returns = compute_advantage( compute_advantage_meta, @@ -1639,9 +1592,9 @@ def fit(self): advantages_td = TensorDict( {"advantages": advantages, "returns": returns}, batch_size=advantages.size(0) ) - asyncio.run(self.tq_client.async_put(data=advantages_td, metadata=compute_advantage_meta)) - compute_advantage_meta.add_fields(advantages_td) - + compute_advantage_meta = asyncio.run( + self.tq_client.async_put(data=advantages_td, metadata=compute_advantage_meta) + ) batch_meta = batch_meta.union(compute_advantage_meta) # update critic @@ -1660,37 +1613,30 @@ def fit(self): self.config.actor_rollout_ref.rollout.multi_turn.enable ) - update_actor_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=[ - "input_ids", - "attention_mask", - "position_ids", - "prompts", - "responses", - "response_mask", - "old_log_probs", - "ref_log_prob", - "advantages", - "returns", - 
"token_level_rewards", - "token_level_scores", - "data_source", - "reward_model", - "extra_info", - "uid", - "index", - "tools_kwargs", - "interaction_kwargs", - "ability", - ], - batch_size=self.config.data.train_batch_size - * self.config.actor_rollout_ref.rollout.n, - partition_id=f"train_{self.global_steps - 1}", - task_name="update_actor", - ) - ) - update_actor_meta.reorder(balanced_idx) + update_actor_fields = [ + "input_ids", + "attention_mask", + "position_ids", + "prompts", + "responses", + "response_mask", + "old_log_probs", + "ref_log_prob", + "advantages", + "returns", + "token_level_rewards", + "token_level_scores", + "data_source", + "reward_model", + "extra_info", + "uid", + "index", + "tools_kwargs", + "interaction_kwargs", + "ability", + ] + update_actor_meta = batch_meta.select_fields(update_actor_fields) + update_actor_meta.set_extra_info( "global_token_num", batch_meta.get_extra_info("global_token_num") ) @@ -1704,22 +1650,12 @@ def fit(self): # Log rollout generations if enabled rollout_data_dir = self.config.trainer.get("rollout_data_dir", None) if rollout_data_dir: - data_fields = ["prompts", "responses", "token_level_scores", "reward_model"] + log_rollout_fields = ["prompts", "responses", "token_level_scores", "reward_model"] if "request_id" in batch_meta.field_names: - data_fields.append("request_id") - log_rollout_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=data_fields, - batch_size=self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n, - partition_id=f"train_{self.global_steps - 1}", - task_name="log_rollout", - ) - ) - log_rollout_meta.reorder(balanced_idx) + log_rollout_fields.append("request_id") + log_rollout_meta = batch_meta.select_fields(log_rollout_fields) self._log_rollout_data(log_rollout_meta, reward_extra_infos_dict, timing_raw, rollout_data_dir) - # TODO: clear meta after iteration - # TODO: validate if ( self.val_reward_fn is not None diff --git 
a/recipe/transfer_queue/run_qwen3-8b_transferqueue.sh b/recipe/transfer_queue/run_qwen3-8b_transferqueue.sh index a6a013903b8..573e71a1f1b 100644 --- a/recipe/transfer_queue/run_qwen3-8b_transferqueue.sh +++ b/recipe/transfer_queue/run_qwen3-8b_transferqueue.sh @@ -9,6 +9,9 @@ mkdir -p ${log_dir} timestamp=$(date +"%Y%m%d%H%M%S") log_file="${log_dir}/qwen3-8b_tq_${timestamp}.log" +# You may try to enable zero-copy serialization for TransferQueue when using SimpleStorageUnit backend. +export TQ_ZERO_COPY_SERIALIZATION=False + rollout_mode="async" rollout_name="vllm" # sglang or vllm if [ "$rollout_mode" = "async" ]; then diff --git a/requirements-npu.txt b/requirements-npu.txt index 90304a94561..ea197c98f31 100644 --- a/requirements-npu.txt +++ b/requirements-npu.txt @@ -11,7 +11,7 @@ pyarrow>=15.0.0 pybind11 pylatexenc tensordict>=0.8.0,<=0.10.0,!=0.9.0 -ray==2.46.0 +ray[default] wandb mathruler torchdata diff --git a/requirements_transferqueue.txt b/requirements_transferqueue.txt deleted file mode 100644 index b4a1034f42d..00000000000 --- a/requirements_transferqueue.txt +++ /dev/null @@ -1,2 +0,0 @@ -# requirements.txt records the full set of dependencies for development -TransferQueue==0.1.2.dev0 diff --git a/setup.py b/setup.py index 8c9f5e1026d..9f5fbb03b67 100644 --- a/setup.py +++ b/setup.py @@ -57,7 +57,7 @@ ] TRL_REQUIRES = ["trl<=0.9.6"] MCORE_REQUIRES = ["mbridge"] -TRANSFERQUEUE_REQUIRES = ["TransferQueue==0.1.2.dev0"] +TRANSFERQUEUE_REQUIRES = ["TransferQueue==0.1.4.dev1"] extras_require = { "test": TEST_REQUIRES, diff --git a/tests/experimental/reward/test_agent_loop_reward_manager.py b/tests/experimental/reward/test_agent_loop_reward_manager.py index a7e3043835a..05ccc71c3e8 100644 --- a/tests/experimental/reward/test_agent_loop_reward_manager.py +++ b/tests/experimental/reward/test_agent_loop_reward_manager.py @@ -35,8 +35,8 @@ def test_agent_loop_reward_manager(): } } ) - with initialize_config_dir(config_dir=os.path.abspath("recipe/fapo/config")): 
- config = compose("rm_config") + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): + config = compose(config_name="ppo_trainer") rollout_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct") reward_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct") diff --git a/tests/experimental/reward/test_agent_reward_loop_colocate.py b/tests/experimental/reward/test_agent_reward_loop_colocate.py index 9363944b510..5f76bab25dd 100644 --- a/tests/experimental/reward/test_agent_reward_loop_colocate.py +++ b/tests/experimental/reward/test_agent_reward_loop_colocate.py @@ -39,8 +39,8 @@ def test_agent_loop_reward_manager(): } } ) - with initialize_config_dir(config_dir=os.path.abspath("recipe/fapo/config")): - config = compose("rm_config") + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): + config = compose(config_name="ppo_trainer") rollout_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct") reward_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct") diff --git a/tests/experimental/reward/test_async_token_bucket_on_cpu.py b/tests/experimental/reward/test_async_token_bucket_on_cpu.py index ceef232c5b0..0a044190bd0 100644 --- a/tests/experimental/reward/test_async_token_bucket_on_cpu.py +++ b/tests/experimental/reward/test_async_token_bucket_on_cpu.py @@ -17,7 +17,7 @@ import pytest -from verl.experimental.reward.reward_loop.limited import AsyncTokenBucket +from verl.experimental.reward.reward_manager.limited import AsyncTokenBucket class TestAsyncTokenBucket: diff --git a/tests/experimental/reward/test_rate_limited_reward_manager_on_cpu.py b/tests/experimental/reward/test_rate_limited_reward_manager_on_cpu.py index f91b23aae23..446dee56438 100644 --- a/tests/experimental/reward/test_rate_limited_reward_manager_on_cpu.py +++ b/tests/experimental/reward/test_rate_limited_reward_manager_on_cpu.py @@ -21,7 +21,7 @@ from transformers import AutoTokenizer 
from verl import DataProto -from verl.experimental.reward.reward_loop.limited import RateLimitedRewardLoopManager +from verl.experimental.reward.reward_manager.limited import RateLimitedRewardLoopManager # Mock API reward functions for testing diff --git a/tests/experimental/reward/test_reward_model_disrm.py b/tests/experimental/reward/test_reward_model_disrm.py index 5e0cfa0e553..1e79f57e32a 100644 --- a/tests/experimental/reward/test_reward_model_disrm.py +++ b/tests/experimental/reward/test_reward_model_disrm.py @@ -114,8 +114,8 @@ def test_reward_model_manager(): } } ) - with initialize_config_dir(config_dir=os.path.abspath("recipe/fapo/config")): - config = compose("rm_config") + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): + config = compose(config_name="ppo_trainer") rollout_model_name = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct") reward_model_name = os.path.expanduser("~/models/Skywork/Skywork-Reward-V2-Llama-3.2-1B") diff --git a/tests/experimental/reward/test_reward_model_genrm.py b/tests/experimental/reward/test_reward_model_genrm.py index c505267ab9e..ed853fbd811 100644 --- a/tests/experimental/reward/test_reward_model_genrm.py +++ b/tests/experimental/reward/test_reward_model_genrm.py @@ -115,8 +115,8 @@ def test_reward_model_manager(): } } ) - with initialize_config_dir(config_dir=os.path.abspath("recipe/fapo/config")): - config = compose("rm_config") + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): + config = compose(config_name="ppo_trainer") rollout_model_name = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct") reward_model_name = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct") diff --git a/tests/models/test_engine.py b/tests/models/test_engine.py index ae413996b9f..9878ece4d06 100644 --- a/tests/models/test_engine.py +++ b/tests/models/test_engine.py @@ -24,11 +24,19 @@ import torch import torch.distributed as dist import torch.multiprocessing as mp 
-from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForTokenClassification, Qwen3Config, Qwen3MoeConfig +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoModelForTokenClassification, + AutoTokenizer, + Qwen3Config, + Qwen3MoeConfig, +) from verl import DataProto from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup from verl.trainer.config import CheckpointConfig +from verl.utils import tensordict_utils as tu from verl.utils.model import compute_position_id_with_mask, create_random_mask from verl.utils.torch_functional import logprobs_from_logits_naive from verl.workers.config import ( @@ -40,49 +48,87 @@ McoreEngineConfig, McoreOptimizerConfig, ) -from verl.workers.engine_workers import ActorWorker, CriticWorker -from verl.workers.utils.losses import ppo_loss +from verl.workers.engine_workers import TrainingWorker, TrainingWorkerConfig +from verl.workers.utils.losses import ppo_loss, sft_loss, value_loss +from verl.workers.utils.padding import left_right_2_no_padding, no_padding_2_padding -@pytest.mark.parametrize("strategy", ["megatron", "fsdp", "fsdp2"]) -def test_actor_engine(strategy): - ray.init() +def get_test_language_model(device_count): + if device_count == 1: + model = "~/models/HuggingFaceTB/SmolLM2-135M-Instruct" + else: + model = "~/models/Qwen/Qwen2.5-0.5B" + model = os.path.expanduser(model) + return model - path = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B") - model_config = HFModelConfig(path=path) + +def create_training_config(model_type, strategy, device_count, model): + if device_count == 1: + tp = pp = cp = fsdp_size = 1 + else: + tp = pp = cp = 2 + fsdp_size = 4 + + path = os.path.expanduser(model) + model_config = HFModelConfig(path=path, use_remove_padding=True) + + kwargs = dict( + param_offload=True, + optimizer_offload=True, + grad_offload=True, + use_dynamic_bsz=True, + use_remove_padding=True, + max_token_len_per_gpu=500, + 
infer_max_token_len_per_gpu=1000, + ) if strategy == "megatron": engine_config = McoreEngineConfig( forward_only=False, - use_mbridge=False, - tensor_model_parallel_size=2, - pipeline_model_parallel_size=2, - context_parallel_size=2, + use_mbridge=True, + tensor_model_parallel_size=tp, + pipeline_model_parallel_size=pp, + context_parallel_size=cp, + **kwargs, ) optimizer_config = McoreOptimizerConfig(lr_decay_steps=10) elif strategy in ["fsdp", "fsdp2"]: engine_config = FSDPEngineConfig( - forward_only=False, fsdp_size=4, strategy=strategy, ulysses_sequence_parallel_size=2 + forward_only=False, fsdp_size=fsdp_size, strategy=strategy, ulysses_sequence_parallel_size=cp, **kwargs ) optimizer_config = FSDPOptimizerConfig() else: raise NotImplementedError(f"strategy {strategy} is not supported") - config = ActorConfig( + config = TrainingWorkerConfig( + model_type=model_type, model_config=model_config, - engine=engine_config, + engine_config=engine_config, + optimizer_config=optimizer_config, + checkpoint_config=None, + ) + return config + + +@pytest.mark.parametrize("strategy", ["fsdp", "fsdp2", "megatron"]) +def test_actor_engine(strategy): + ray.init() + device_count = torch.cuda.device_count() + config = create_training_config( + model_type="language_model", strategy=strategy, - ppo_micro_batch_size_per_gpu=256, - ppo_mini_batch_size=4, - optim=optimizer_config, - use_dynamic_bsz=True, - rollout_n=1, + device_count=device_count, + model=get_test_language_model(device_count), ) - ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(ActorWorker), config=config) - resource_pool = RayResourcePool(process_on_nodes=[8]) + ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(TrainingWorker), config=config) + resource_pool = RayResourcePool(process_on_nodes=[device_count]) wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init) # init model - wg.init_model() + wg.reset() + + sft_loss_ = partial(sft_loss, config=config) + + 
wg.set_loss_fn(sft_loss_) batch_size = 8 seqlen = 32 @@ -92,7 +138,7 @@ def test_actor_engine(strategy): torch.manual_seed(1) np.random.seed(1) - input_ids = torch.randint(0, model_config.hf_config.vocab_size, (batch_size, seqlen)) + input_ids = torch.randint(0, config.model_config.hf_config.vocab_size, (batch_size, seqlen)) attention_mask = create_random_mask( input_ids=input_ids, max_ratio_of_valid_token=0.8, max_ratio_of_left_padding=0.2, min_ratio_of_valid_token=0.6 ) @@ -116,15 +162,22 @@ def test_actor_engine(strategy): "responses": responses, "response_mask": response_mask, }, - meta_info={"temperature": 1.0, "global_token_num": global_token_num}, + meta_info={"temperature": 1.0, "global_token_num": global_token_num, "compute_loss": False}, ) - # sft_loss_ = partial(sft_loss, config=config) + data_td = data.to_tensordict() + data_td = left_right_2_no_padding(data_td) # eval - output = wg.compute_log_prob(data) + output = wg.infer_batch(data_td) + output = output.get() + logprobs_unpad = tu.get(output, "log_probs").cpu() + logprobs = no_padding_2_padding(logprobs_unpad, data_td) + + output = DataProto.from_single_dict({"old_log_probs": logprobs}) # load hf model and compare results with hf model + path = config.model_config.path hf_model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16) hf_output = hf_model(input_ids, attention_mask=attention_mask) hf_logprobs = logprobs_from_logits_naive( @@ -148,78 +201,77 @@ def test_actor_engine(strategy): data.batch["advantages"] = torch.rand_like(responses, dtype=torch.float32) data.batch["ref_log_prob"] = torch.rand_like(responses, dtype=torch.float32) + # construct actor config + actor_config = ActorConfig(strategy=strategy, rollout_n=1, ppo_micro_batch_size_per_gpu=-1) + # set ppo loss - ppo_loss_ = partial(ppo_loss, config=config) + ppo_loss_ = partial(ppo_loss, config=actor_config) wg.set_loss_fn(ppo_loss_) # update again - ppo_metrics = wg.update_actor(data) + data_td = data.to_tensordict() 
+ data_td = left_right_2_no_padding(data_td) + + # auto load/offload + tu.assign_non_tensor(data_td, global_batch_size=data_td.shape[0]) + ppo_metrics = wg.train_batch(data_td) + ppo_metrics = ppo_metrics.get() + ppo_metrics = tu.get(ppo_metrics, "metrics") print(ppo_metrics) - ray.shutdown() + # test manual load/offload + tu.assign_non_tensor(data_td, disable_auto_offload=True) + wg.to("device") + ppo_metrics = wg.train_batch(data_td) + ppo_metrics = ppo_metrics.get() + ppo_metrics = tu.get(ppo_metrics, "metrics") + print(ppo_metrics) + wg.to("cpu") + ray.shutdown() -def create_model(): - from transformers import Qwen3Config - config = Qwen3Config(num_hidden_layers=2, num_labels=1) +def create_value_model(language_model_path, output_path): + config = AutoConfig.from_pretrained(language_model_path) + config.num_labels = 1 + config.classifier_dropout = 0 + config.tie_word_embeddings = False model = AutoModelForTokenClassification.from_config(config) + tokenizer = AutoTokenizer.from_pretrained(os.path.expanduser(language_model_path)) assert model.config.num_labels == 1 - path = os.path.expanduser("~/models/test_model") + path = os.path.expanduser(output_path) model.save_pretrained(path) + tokenizer.save_pretrained(path) config.save_pretrained(path) return path -@pytest.mark.parametrize("strategy", ["megatron", "fsdp", "fsdp2"]) +@pytest.mark.parametrize("strategy", ["fsdp", "fsdp2"]) def test_critic_engine(strategy): - ray.init() + device_count = torch.cuda.device_count() + value_model_path = os.path.expanduser("~/models/test_model") + language_model_path = get_test_language_model(device_count=device_count) + create_value_model(language_model_path, value_model_path) - path = create_model() - model_config = HFModelConfig(path=path, load_tokenizer=False) + torch.manual_seed(1) + np.random.seed(1) - if strategy == "megatron": - engine_config = McoreEngineConfig( - forward_only=False, - use_mbridge=False, - tensor_model_parallel_size=2, - pipeline_model_parallel_size=2, 
- context_parallel_size=2, - ) - optimizer_config = McoreOptimizerConfig(lr_decay_steps=10) - elif strategy in ["fsdp", "fsdp2"]: - engine_config = FSDPEngineConfig( - forward_only=False, fsdp_size=4, strategy=strategy, ulysses_sequence_parallel_size=2 - ) - optimizer_config = FSDPOptimizerConfig() - else: - raise NotImplementedError(f"strategy {strategy} is not supported") + ray.init() - config = CriticConfig( - model_config=model_config, - engine=engine_config, - strategy=strategy, - ppo_micro_batch_size_per_gpu=256, - ppo_mini_batch_size=4, - optim=optimizer_config, - use_dynamic_bsz=True, - rollout_n=1, + config = create_training_config( + model_type="value_model", strategy=strategy, device_count=device_count, model=value_model_path ) - ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(CriticWorker), config=config) - resource_pool = RayResourcePool(process_on_nodes=[8]) + ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(TrainingWorker), config=config) + resource_pool = RayResourcePool(process_on_nodes=[device_count]) wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init) # init model - wg.init_model() + wg.reset() batch_size = 8 seqlen = 32 response_length = seqlen // 2 - - torch.manual_seed(1) - np.random.seed(1) - - input_ids = torch.randint(0, model_config.hf_config.vocab_size, (batch_size, seqlen)) + input_ids = torch.randint(0, config.model_config.hf_config.vocab_size, (batch_size, seqlen)) attention_mask = create_random_mask( input_ids=input_ids, max_ratio_of_valid_token=0.8, max_ratio_of_left_padding=0.2, min_ratio_of_valid_token=0.6 ) @@ -243,21 +295,30 @@ def test_critic_engine(strategy): "responses": responses, "response_mask": response_mask, }, - meta_info={"temperature": 1.0, "global_token_num": global_token_num}, + meta_info={"temperature": 1.0, "global_token_num": global_token_num, "compute_loss": False}, ) + data_td = data.to_tensordict() + data_td = left_right_2_no_padding(data_td) + # eval - output = 
wg.compute_values(data) + output = wg.infer_batch(data_td) + output = output.get() + + values_unpad = tu.get(output, "values").float().cpu() + values = no_padding_2_padding(values_unpad, data_td) + + output = DataProto.from_single_dict({"values": values}) # load hf model and compare results with hf model - with torch.device("cuda"): + with torch.device("cuda"), torch.autocast(device_type="cuda", dtype=torch.bfloat16): hf_model = AutoModelForTokenClassification.from_pretrained( - path, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + value_model_path, torch_dtype=torch.float32, attn_implementation="flash_attention_2" ) hf_output = hf_model(input_ids.cuda(), attention_mask=attention_mask.cuda()) hf_values = hf_output.logits[:, -response_length - 1 : -1, :].float().squeeze(-1).cpu() - hf_values_mean = torch.mean(hf_values * response_mask) + hf_values_mean = torch.mean(hf_values * response_mask) engine_values = torch.mean(output.batch["values"] * response_mask) torch.testing.assert_close(hf_values_mean, engine_values, atol=1e-2, rtol=1e-2) @@ -265,11 +326,25 @@ def test_critic_engine(strategy): data = data.union(output) # add ppo data - data.batch["values"] = torch.rand_like(responses, dtype=torch.float32) data.batch["returns"] = torch.rand_like(responses, dtype=torch.float32) # update again - ppo_metrics = wg.update_critic(data) + # create critic config + critic_config = CriticConfig( + strategy=strategy, rollout_n=1, ppo_micro_batch_size_per_gpu=-1, model_config=config.model_config + ) + value_loss_ = partial(value_loss, config=critic_config) + wg.set_loss_fn(value_loss_) + + # update again + data_td = data.to_tensordict() + data_td = left_right_2_no_padding(data_td) + + # auto load/offload + tu.assign_non_tensor(data_td, global_batch_size=data_td.shape[0]) + ppo_metrics = wg.train_batch(data_td) + ppo_metrics = ppo_metrics.get() + ppo_metrics = tu.get(ppo_metrics, "metrics") print(ppo_metrics) ray.shutdown() diff --git 
a/tests/single_controller/test_decorator_on_cpu.py b/tests/single_controller/test_decorator_on_cpu.py index 1178d256cf5..8dc74670410 100644 --- a/tests/single_controller/test_decorator_on_cpu.py +++ b/tests/single_controller/test_decorator_on_cpu.py @@ -66,6 +66,9 @@ async def async_dp_compute(self, data: DataProto) -> DataProto: def dp_compute_td(self, data: TensorDict) -> TensorDict: rank_value = torch.tensor(self.rank, device=data["input"].device, dtype=data["input"].dtype) data["output"] = data["input"] + self.value + rank_value + position_ids = data.pop("position_ids") + for i, position_id in enumerate(position_ids.unbind(dim=0)): + assert (position_id == torch.arange(4 + rank_value * 2 + i).expand(position_id.shape)).all() return data @@ -159,7 +162,16 @@ def test_decorator_dp_compute_td(ray_init_shutdown): # Prepare input data (size 4, for 2 workers) input_tensor = torch.arange(4, dtype=torch.float32) - data = TensorDict({"input": input_tensor}, batch_size=[4]) + position_ids = torch.nested.as_nested_tensor( + [ + torch.arange(4).expand(4, 4), + torch.arange(5).expand(4, 5), + torch.arange(6).expand(4, 6), + torch.arange(7).expand(4, 7), + ], + layout=torch.jagged, + ) + data = TensorDict({"input": input_tensor, "position_ids": position_ids}, batch_size=[4]) # Call the decorated method output = worker_group.dp_compute_td(data) diff --git a/tests/special_e2e/ppo_trainer/run_model_reward.sh b/tests/special_e2e/ppo_trainer/run_model_reward.sh index 09d6757b511..46fb7c64e38 100644 --- a/tests/special_e2e/ppo_trainer/run_model_reward.sh +++ b/tests/special_e2e/ppo_trainer/run_model_reward.sh @@ -79,13 +79,13 @@ python3 -m verl.trainer.main_ppo \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ reward_model.enable=True \ - reward_model.ulysses_sequence_parallel_size="${SP_SIZE}" \ reward_model.model.path="${MODEL_PATH}" \ - reward_model.model.use_remove_padding="${RM_PAD}" \ - 
reward_model.model.fsdp_config.param_offload=True \ - reward_model.use_dynamic_bsz="${SEQ_BALANCE}" \ - reward_model.forward_max_token_len_per_gpu=${infer_max_token_num_per_gpu} \ - reward_model.micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \ + reward_model.use_reward_loop=True \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=1024 \ + reward_model.rollout.response_length=512 \ + reward_model.num_workers=8 \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger=console \ diff --git a/tests/special_e2e/run_ppo_trainer_megatron.sh b/tests/special_e2e/run_ppo_trainer_megatron.sh index a88500aba40..cd8033f132e 100644 --- a/tests/special_e2e/run_ppo_trainer_megatron.sh +++ b/tests/special_e2e/run_ppo_trainer_megatron.sh @@ -9,6 +9,7 @@ NUM_GPUS=${NUM_GPUS:-8} MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B} MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} +RM_MODEL_PATH=${RM_MODEL_PATH:-${HOME}/models/Skywork/Skywork-Reward-V2-Llama-3.2-1B} #huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}" USE_DUMMY_MODEL=${USE_DUMMY_MODEL:-False} @@ -57,6 +58,7 @@ LORA_TARGET_MODULES=${LORA_TARGET_MODULES:-"['linear_qkv','linear_proj','linear_ MAX_PROMPT_LENGTH=${MAX_PROMPT_LENGTH:-512} MAX_RESPONSE_LENGTH=${MAX_RESPONSE_LENGTH:-512} +MAX_RM_LENGTH=$((MAX_PROMPT_LENGTH + MAX_RESPONSE_LENGTH)) COMMON_PP=${COMMON_PP:-2} COMMON_VPP=${COMMON_VPP:-2} @@ -87,12 +89,6 @@ CRITIC_CP=${CRITIC_CP:-$COMMON_CP} CRITIC_TP=${CRITIC_TP:-$TRAIN_TP} CRITIC_EP=${CRITIC_EP:-$COMMON_EP} CRITIC_ETP=${CRITIC_ETP:-$COMMON_ETP} -RM_PP=${RM_PP:-$COMMON_PP} -RM_VPP=${RM_VPP:-$COMMON_VPP} -RM_CP=${RM_CP:-$COMMON_CP} -RM_TP=${RM_TP:-$TRAIN_TP} -RM_EP=${RM_EP:-$COMMON_EP} -RM_ETP=${RM_ETP:-$COMMON_ETP} ALL_OFFLOAD=${ALL_OFFLOAD:-False} COMMON_PARAM_OFFLOAD=${COMMON_PARAM_OFFLOAD:-$ALL_OFFLOAD} @@ -244,22 +240,14 @@ python3 -m verl.trainer.main_ppo --config-path=config \ 
critic.profiler.ranks=$PROFILE_RANKS \ critic.profiler.all_ranks=$PROFILE_RANKS_ALL \ reward_model.enable=True \ - reward_model.model.path="${MODEL_PATH}" \ - reward_model.micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \ - reward_model.megatron.use_mbridge=${USE_MBRIDGE} \ - reward_model.megatron.vanilla_mbridge=${VALUE_VANILLA_MBRIDGE} \ - reward_model.megatron.pipeline_model_parallel_size=$RM_PP \ - reward_model.megatron.virtual_pipeline_model_parallel_size=$RM_VPP \ - reward_model.megatron.context_parallel_size=$RM_CP \ - reward_model.megatron.tensor_model_parallel_size=$RM_TP \ - reward_model.megatron.expert_model_parallel_size=$RM_EP \ - reward_model.megatron.expert_tensor_parallel_size=$RM_ETP \ - reward_model.megatron.param_offload=${RM_PARAM_OFFLOAD} \ - reward_model.megatron.use_dist_checkpointing=${USE_DIST_CKPT} \ - reward_model.megatron.dist_checkpointing_path=${DIST_CKPT_PATH} \ - reward_model.profiler.enable=$PROFILE_ENABLE \ - reward_model.profiler.ranks=$PROFILE_RANKS \ - reward_model.profiler.all_ranks=$PROFILE_RANKS_ALL \ + reward_model.model.path="${RM_MODEL_PATH}" \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=${ENGINE} \ + reward_model.rollout.gpu_memory_utilization=0.6 \ + reward_model.rollout.tensor_model_parallel_size=${INFER_TP} \ + reward_model.rollout.prompt_length=${MAX_RM_LENGTH} \ + reward_model.rollout.response_length=${MAX_RESPONSE_LENGTH} \ + reward_model.num_workers=8 \ algorithm.use_kl_in_reward=False \ algorithm.kl_penalty=kl \ algorithm.kl_ctrl.kl_coef=0.001 \ diff --git a/tests/special_e2e/run_transferqueue.sh b/tests/special_e2e/run_transferqueue.sh index 541742bf765..9e1c34acde1 100644 --- a/tests/special_e2e/run_transferqueue.sh +++ b/tests/special_e2e/run_transferqueue.sh @@ -63,8 +63,6 @@ echo "Running transferqueue with ${ACTOR_STRATEGY} strategy" echo "Total GPUs: ${NUM_GPUS}" # Common parameters for both FSDP and Megatron -# For Ascend NPU, please add -# trainer.device=npu common_params=( 
data.train_files="${HOME}/data/gsm8k/train.parquet" data.val_files="${HOME}/data/gsm8k/test.parquet" diff --git a/tests/special_e2e/sft/compare_sft_engine_results.py b/tests/special_e2e/sft/compare_sft_engine_results.py index b39e133ee5e..322f5353c06 100644 --- a/tests/special_e2e/sft/compare_sft_engine_results.py +++ b/tests/special_e2e/sft/compare_sft_engine_results.py @@ -36,7 +36,7 @@ def compare_results(golden_results, other_result): grad_norm = other_result[0]["data"]["train/grad_norm"] torch.testing.assert_close(golden_loss, loss, atol=1e-2, rtol=1e-2) - torch.testing.assert_close(golden_grad_norm, grad_norm, atol=1e-4, rtol=1e-2) + torch.testing.assert_close(golden_grad_norm, grad_norm, atol=1e-4, rtol=3e-2) if __name__ == "__main__": @@ -53,5 +53,6 @@ def compare_results(golden_results, other_result): for file, other_result in other_results.items(): print(f"compare results {file}") compare_results(golden_results, other_result) + print(f"compare results {file} done") print("All results are close to golden results") diff --git a/tests/special_e2e/sft/run_sft_engine_gsm8k.sh b/tests/special_e2e/sft/run_sft_engine.sh similarity index 91% rename from tests/special_e2e/sft/run_sft_engine_gsm8k.sh rename to tests/special_e2e/sft/run_sft_engine.sh index ead86f1747b..16b7631473d 100644 --- a/tests/special_e2e/sft/run_sft_engine_gsm8k.sh +++ b/tests/special_e2e/sft/run_sft_engine.sh @@ -13,9 +13,9 @@ else COMMAND="python ${ENTRYPOINT} trainer.nnodes=${NNODES:-1} trainer.n_gpus_per_node=${NUM_GPUS:-1}" fi - -TRAIN_FILES=~/data/gsm8k_sft/train.parquet -VAL_FILES=~/data/gsm8k_sft/test.parquet +DATASET_DIR=${DATASET_DIR:-~/data/gsm8k_sft} +TRAIN_FILES=${DATASET_DIR}/train.parquet +VAL_FILES=${DATASET_DIR}/test.parquet backend=${BACKEND:-fsdp} @@ -25,7 +25,7 @@ RESUME_MODE=disable ckpts_home=${ckpts_home:-~/verl/test/gsm8k-sft-${backend}} -MODEL_ID=${MODEL_ID:-Qwen/Qwen3-0.6B} +MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B} MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} 
#huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}" @@ -71,7 +71,8 @@ MEGATRON_ENGINE_CONFIG="\ engine.tensor_model_parallel_size=${TP_SIZE} \ engine.pipeline_model_parallel_size=${PP_SIZE} \ engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \ - engine.context_parallel_size=${CP_SIZE}" + engine.context_parallel_size=${CP_SIZE} + engine.use_mbridge=True" if [ "$backend" = "fsdp" ]; then ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" @@ -88,11 +89,11 @@ mkdir -p "${ckpts_home}" $COMMAND \ data.train_files="${TRAIN_FILES}" \ data.val_files="${VAL_FILES}" \ - data.train_batch_size=256 \ + data.train_batch_size=128 \ data.pad_mode=${PAD_MODE} \ data.truncation=error \ data.use_dynamic_bsz=True \ - data.max_token_len_per_gpu=8192 \ + data.max_token_len_per_gpu=2048 \ data.messages_key=messages \ model.path=$MODEL_PATH \ model.use_remove_padding=${USE_REMOVE_PADDING} \ diff --git a/tests/special_e2e/sft/test_sft_engine_all.sh b/tests/special_e2e/sft/test_sft_engine_all.sh index 1548ea9d588..0fba9606587 100644 --- a/tests/special_e2e/sft/test_sft_engine_all.sh +++ b/tests/special_e2e/sft/test_sft_engine_all.sh @@ -5,51 +5,32 @@ rm -rf ~/verl/test/log mkdir -p ~/verl/test/log export VERL_FILE_LOGGER_ROOT=~/verl/test/log +VPP_SIZE=${VPP_SIZE:-2} # test with single gpu as golden echo "run with single gpu as golden" -BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=1 NUM_GPUS=1 FSDP_STRATEGY=fsdp VERL_FILE_LOGGER_PATH=~/verl/test/log/golden.jsonl bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh +BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=1 NUM_GPUS=1 FSDP_STRATEGY=fsdp VERL_FILE_LOGGER_PATH=~/verl/test/log/golden.jsonl bash tests/special_e2e/sft/run_sft_engine.sh # test with fsdp 1 -echo "run with sp1 fsdp_size2 num_gpus8 fsdp_strategy fsdp pad_mode no_padding" -BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=2 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh -echo "run with sp1 fsdp_size-1 num_gpus8 fsdp_strategy fsdp pad_mode no_padding" -BACKEND=fsdp 
SP_SIZE=1 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh -echo "run with sp2 fsdp_size-1 num_gpus8 fsdp_strategy fsdp pad_mode no_padding" -BACKEND=fsdp SP_SIZE=2 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh -echo "run with sp4 fsdp_size4 num_gpus8 fsdp_strategy fsdp pad_mode no_padding" -BACKEND=fsdp SP_SIZE=4 FSDP_SIZE=4 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh - -# test use_remove_padding and pad_mode no_padding +echo "run with sp2 fsdp_size2 num_gpus8 fsdp_strategy fsdp pad_mode no_padding" +BACKEND=fsdp SP_SIZE=2 FSDP_SIZE=2 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding bash tests/special_e2e/sft/run_sft_engine.sh + +# test with fsdp 1 use_remove_padding and pad_mode no_padding echo "run with sp4 fsdp_size4 num_gpus8 fsdp_strategy fsdp pad_mode no_padding use_remove_padding False" -BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding USE_REMOVE_PADDING=False bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh +BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding USE_REMOVE_PADDING=False bash tests/special_e2e/sft/run_sft_engine.sh # test with fsdp 2 -echo "run with sp1 fsdp_size1 num_gpus1 fsdp_strategy fsdp2" -BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=1 NUM_GPUS=1 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh - -echo "run with sp1 fsdp_size-1 num_gpus8 fsdp_strategy fsdp2" -BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh -echo "run with sp2 fsdp_size-1 num_gpus8 fsdp_strategy fsdp2" -BACKEND=fsdp SP_SIZE=2 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh -BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=2 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh 
-BACKEND=fsdp SP_SIZE=4 FSDP_SIZE=4 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh +echo "run with sp2 fsdp_size2 num_gpus8 fsdp_strategy fsdp2" +BACKEND=fsdp SP_SIZE=2 FSDP_SIZE=2 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine.sh # test with megatron -echo "run with tp1 pp1 cp1 num_gpus1" -BACKEND=megatron TP_SIZE=1 PP_SIZE=1 CP_SIZE=1 NUM_GPUS=1 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh - -echo "run with tp2 pp2 vpp2 cp1 num_gpus8" -BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=2 CP_SIZE=1 NUM_GPUS=8 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh - -# test with cp echo "run with tp2 pp2 vpp2 cp2 num_gpus8" -BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=2 CP_SIZE=2 NUM_GPUS=8 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh +BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=${VPP_SIZE} CP_SIZE=2 NUM_GPUS=8 bash tests/special_e2e/sft/run_sft_engine.sh # test with cp in ray echo "run with tp2 pp2 vpp2 cp2 num_gpus8 mode=ray" -BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=2 CP_SIZE=2 NUM_GPUS=8 mode=ray bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh +BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=${VPP_SIZE} CP_SIZE=2 NUM_GPUS=8 mode=ray bash tests/special_e2e/sft/run_sft_engine.sh python3 tests/special_e2e/sft/compare_sft_engine_results.py diff --git a/tests/special_npu/run_qwen2_5_05b_dapo.sh b/tests/special_npu/run_qwen2_5_05b_dapo.sh index d90b63cb277..b27c7876f80 100644 --- a/tests/special_npu/run_qwen2_5_05b_dapo.sh +++ b/tests/special_npu/run_qwen2_5_05b_dapo.sh @@ -91,5 +91,4 @@ python3 -m recipe.dapo.main_dapo \ trainer.total_epochs=1 \ trainer.resume_mode=disable \ trainer.val_before_train=False \ - trainer.total_training_steps=1 \ - trainer.device=npu $@ + trainer.total_training_steps=1 $@ diff --git a/tests/special_npu/run_qwen2_5_05b_grpo.sh b/tests/special_npu/run_qwen2_5_05b_grpo.sh index cd3edc1e30e..352b4738948 100644 --- a/tests/special_npu/run_qwen2_5_05b_grpo.sh +++ 
b/tests/special_npu/run_qwen2_5_05b_grpo.sh @@ -44,5 +44,4 @@ python3 -m verl.trainer.main_ppo \ trainer.save_freq=-1 \ trainer.test_freq=-1 \ trainer.total_epochs=1 \ - trainer.total_training_steps=1 \ - trainer.device=npu $@ + trainer.total_training_steps=1 $@ diff --git a/tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh b/tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh index bdf225dc3a1..a821433790a 100644 --- a/tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh +++ b/tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh @@ -65,5 +65,4 @@ python3 -m verl.trainer.main_ppo --config-path=config \ trainer.test_freq=-1 \ trainer.total_epochs=1 \ trainer.total_training_steps=1 \ - trainer.device=npu \ +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True $@ diff --git a/tests/special_npu/run_qwen2_5_05b_sft_peft_sp2.sh b/tests/special_npu/run_qwen2_5_05b_sft_peft_sp2.sh index 5af44c9907a..cb3aacf7190 100644 --- a/tests/special_npu/run_qwen2_5_05b_sft_peft_sp2.sh +++ b/tests/special_npu/run_qwen2_5_05b_sft_peft_sp2.sh @@ -5,7 +5,7 @@ mkdir -p ./save_ckpts MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct} MODEL_PATH=${MODEL_PATH:-${HOME}/.cache/models/${MODEL_ID}} -torchrun --standalone --nnodes=1 --nproc_per_node=8 \ +torchrun --standalone --nnodes=1 --nproc_per_node=2 \ -m verl.trainer.fsdp_sft_trainer \ data.train_files=$HOME/data/gsm8k/train.parquet \ data.val_files=$HOME/data/gsm8k/test.parquet \ @@ -27,7 +27,6 @@ torchrun --standalone --nnodes=1 --nproc_per_node=8 \ model.target_modules=all-linear \ model.strategy=fsdp \ ulysses_sequence_parallel_size=2 \ - use_remove_padding=true \ - trainer.device=npu + use_remove_padding=true rm -rf ./outputs ./save_ckpts diff --git a/tests/special_npu/run_qwen2_5_vl_3b_npu.sh b/tests/special_npu/run_qwen2_5_vl_3b_npu.sh index 10ffdf3747f..aca2dd6e5a4 100644 --- a/tests/special_npu/run_qwen2_5_vl_3b_npu.sh +++ b/tests/special_npu/run_qwen2_5_vl_3b_npu.sh @@ -54,5 +54,4 @@ python3 -m 
verl.trainer.main_ppo \ trainer.save_freq=-1 \ trainer.test_freq=-1 \ trainer.total_epochs=1 \ - trainer.total_training_steps=1 \ - trainer.device=npu $@ \ No newline at end of file + trainer.total_training_steps=1 $@ \ No newline at end of file diff --git a/tests/special_npu/run_qwen3_06b_ppo.sh b/tests/special_npu/run_qwen3_06b_ppo.sh index 284ad091e84..2c446379b9a 100644 --- a/tests/special_npu/run_qwen3_06b_ppo.sh +++ b/tests/special_npu/run_qwen3_06b_ppo.sh @@ -49,5 +49,4 @@ python3 -m verl.trainer.main_ppo \ trainer.save_freq=-1 \ trainer.test_freq=-1 \ trainer.total_epochs=1 \ - trainer.total_training_steps=1 \ - trainer.device=npu $@ + trainer.total_training_steps=1 $@ diff --git a/tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh b/tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh index aece3d11471..cd06eba18ce 100644 --- a/tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh +++ b/tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh @@ -125,7 +125,6 @@ python3 -m recipe.dapo.main_dapo \ trainer.test_freq=-1 \ trainer.total_epochs=1 \ trainer.total_training_steps=1 \ - trainer.device=npu \ actor_rollout_ref.actor.use_torch_compile=False \ actor_rollout_ref.ref.use_torch_compile=False \ +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True $@ diff --git a/tests/special_sanity/check_pr_title.py b/tests/special_sanity/check_pr_title.py index cabc2f50d85..601484dc4bd 100644 --- a/tests/special_sanity/check_pr_title.py +++ b/tests/special_sanity/check_pr_title.py @@ -22,7 +22,7 @@ allowed_modules = ["fsdp", "megatron", "sglang", "vllm", "rollout", "trainer"] allowed_modules += ["tests", "training_utils", "recipe", "hardware", "deployment"] allowed_modules += ["ray", "worker", "single_controller", "misc", "docker", "ci"] -allowed_modules += ["perf", "model", "algo", "env", "tool", "ckpt", "doc", "data", "cfg"] +allowed_modules += ["perf", "model", "algo", "env", "tool", "ckpt", "doc", "data", "cfg", "reward"] allowed_types = ["feat", "fix", 
"refactor", "chore", "test"] # Check for [1/N] prefix and extract the rest of the title diff --git a/tests/test_protocol_v2_on_cpu.py b/tests/test_protocol_v2_on_cpu.py index 831a89935c0..bc6788387ab 100644 --- a/tests/test_protocol_v2_on_cpu.py +++ b/tests/test_protocol_v2_on_cpu.py @@ -247,15 +247,35 @@ def test_tensordict_eq(): def test_tensor_dict_make_iterator(): obs = torch.tensor([1, 2, 3, 4, 5, 6]) + input_ids = torch.nested.as_nested_tensor( + [ + torch.tensor([0, 1]), + torch.tensor([2]), + torch.tensor([3, 4]), + torch.tensor([5]), + torch.tensor([6, 7, 8]), + torch.tensor([9]), + ], + layout=torch.jagged, + ) data_sources = ["abc", "def", "abc", "def", "pol", "klj"] non_tensor_dict = {"train_sample_kwargs": {"top_p": 1.0}, "val_sample_kwargs": {"top_p": 0.7}} - dataset = tu.get_tensordict({"obs": obs, "data_sources": data_sources}, non_tensor_dict=non_tensor_dict) + dataset = tu.get_tensordict( + {"obs": obs, "data_sources": data_sources, "input_ids": input_ids}, non_tensor_dict=non_tensor_dict + ) dataloader = tu.make_iterator( dataset, mini_batch_size=2, epochs=2, seed=0, dataloader_kwargs={"shuffle": False, "drop_last": False} ) - expected_tensor_dict = [dataset[0:2], dataset[2:4], dataset[4:6], dataset[0:2], dataset[2:4], dataset[4:6]] + expected_tensor_dict = [ + tu.index_select_tensor_dict(dataset, indices=list(range(0, 2))), + tu.index_select_tensor_dict(dataset, indices=list(range(2, 4))), + tu.index_select_tensor_dict(dataset, indices=list(range(4, 6))), + tu.index_select_tensor_dict(dataset, indices=list(range(0, 2))), + tu.index_select_tensor_dict(dataset, indices=list(range(2, 4))), + tu.index_select_tensor_dict(dataset, indices=list(range(4, 6))), + ] i = 0 @@ -721,6 +741,55 @@ def test_concat_tensordict(): assert output["temp"] == 1.0 +def test_chunk_tensordict(): + # Qwen-VL 3d position_ids + position_ids = torch.nested.as_nested_tensor( + [ + torch.arange(4).expand(4, 4), + torch.arange(5).expand(4, 5), + torch.arange(6).expand(4, 6), + 
torch.arange(7).expand(4, 7), + ], + layout=torch.jagged, + ) + input_ids = torch.nested.as_nested_tensor( + [torch.arange(4), torch.arange(5), torch.arange(6), torch.arange(7)], layout=torch.jagged + ) + + multi_modal_inputs = torch.stack( + [ + NonTensorData({"pixel_values": torch.randn(3, 224, 224)}), + NonTensorData(None), + NonTensorData({"pixel_values": torch.randn(3, 128, 128)}), + NonTensorData({"pixel_values": torch.randn(3, 128, 128)}), + ] + ) + td = tu.get_tensordict( + { + "input_ids": input_ids, + "position_ids": position_ids, + "multi_modal_inputs": multi_modal_inputs, + }, + ) + assert len(td) == 4 + chunks = tu.chunk_tensordict(td, chunks=2) + + for i, chunk in enumerate(chunks): + assert len(chunk) == 2 + for key, val in chunk.items(): + if isinstance(val, torch.Tensor) and val.is_nested: + tensors = td[key].unbind(dim=0) + expected = torch.nested.as_nested_tensor(tensors[i * 2 : (i + 1) * 2], layout=torch.jagged) + assert torch.all(torch.eq(val.values(), expected.values())).item() + else: + expected = td[key][i * 2 : (i + 1) * 2] + for tensor, expect in zip(val, expected, strict=False): + if tensor.data is None: + assert expect is None + else: + assert torch.all(torch.eq(tensor.data["pixel_values"], expect["pixel_values"])).item() + + def test_assign_non_tensor_stack_with_nested_lists(): """Test assign_non_tensor_stack with lists of lists.""" td = tu.get_tensordict({"obs": torch.randn(3, 4)}, non_tensor_dict={}) diff --git a/tests/trainer/config/legacy_ppo_megatron_trainer.yaml b/tests/trainer/config/legacy_ppo_megatron_trainer.yaml index ea2a15d685e..06e2e94a662 100644 --- a/tests/trainer/config/legacy_ppo_megatron_trainer.yaml +++ b/tests/trainer/config/legacy_ppo_megatron_trainer.yaml @@ -111,7 +111,7 @@ actor_rollout_ref: dist_checkpointing_path: null seed: 42 override_transformer_config: {} # additional transformer config like: num_layers_in_first(/last)_pipeline_stage - use_mbridge: False + use_mbridge: True vanilla_mbridge: True profile: # 
profile the actor model in `update_policy` use_profile: False # open it when you want to profile the actor model diff --git a/tests/trainer/config/legacy_ppo_trainer.yaml b/tests/trainer/config/legacy_ppo_trainer.yaml index 3139e8a39db..c09e06e978d 100644 --- a/tests/trainer/config/legacy_ppo_trainer.yaml +++ b/tests/trainer/config/legacy_ppo_trainer.yaml @@ -165,7 +165,7 @@ actor_rollout_ref: enable_activation_offload: false # Whether to remove padding tokens in inputs during training - use_remove_padding: false + use_remove_padding: true # Set to positive value to enable LoRA (e.g., 32) lora_rank: 0 diff --git a/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py b/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py index 0c5bbb65084..6962e124000 100644 --- a/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py +++ b/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py @@ -16,28 +16,46 @@ """ import os +from io import BytesIO import pandas as pd +import pytest import torch -from transformers import AutoTokenizer - +from PIL import Image +from tensordict import TensorDict +from torch.utils.data import DistributedSampler +from torchdata.stateful_dataloader import StatefulDataLoader +from transformers import AutoProcessor, AutoTokenizer +from transformers.utils import get_json_schema + +from verl.utils.dataset.dataset_utils import DatasetPadMode, SFTTensorCollator from verl.utils.dataset.multiturn_sft_dataset import MultiTurnSFTDataset - - -def test_multiturn_sft_dataset(): - print("Starting test...") +from verl.utils.model import extract_multi_modal_inputs + + +@pytest.mark.parametrize( + "model_path", + [ + "Qwen/Qwen2.5-0.5B", + "Qwen/Qwen2.5-Coder-7B-Instruct", + "Qwen/Qwen3-30B-A3B-Instruct-2507", + # "Qwen/Qwen3-30B-A3B-Thinking-2507" # Thinking series models add tags to last turn. 
+ ], +) +@pytest.mark.parametrize("enable_thinking", [False, True]) +def test_multiturn_sft_dataset(model_path: str, enable_thinking: bool): + print(f"Starting test... model_path={model_path}, enable_thinking={enable_thinking}") # Create a temporary parquet file with test data test_data = { "messages": [ [ - {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What is 2+2?"}, {"role": "assistant", "content": "2+2 equals 4."}, {"role": "user", "content": "And what is 4+4?"}, {"role": "assistant", "content": "4+4 equals 8."}, ], [ - {"role": "system", "content": "You are a helpful assistant."}, + {"role": "system", "content": "You are a powerful assistant."}, {"role": "user", "content": "Tell me a joke."}, {"role": "assistant", "content": "Why did the chicken cross the road?"}, {"role": "user", "content": "Why?"}, @@ -55,8 +73,13 @@ def test_multiturn_sft_dataset(): df.to_parquet(test_file) # Initialize tokenizer and dataset - tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-7B-Instruct") - config = {"max_length": 512, "truncation": "error", "multiturn": {"messages_key": "messages"}} + tokenizer = AutoTokenizer.from_pretrained(model_path) + config = { + "max_length": 512, + "truncation": "error", + "multiturn": {"messages_key": "messages"}, + "apply_chat_template_kwargs": {"enable_thinking": enable_thinking}, + } dataset = MultiTurnSFTDataset(parquet_files=test_file, tokenizer=tokenizer, config=config) # Test 1: Dataset Length @@ -200,3 +223,220 @@ def test_multiturn_sft_dataset(): print("All tests passed!") print("Starting test...") + + +def generate_image(description: str, size: str = "256x256"): + """Generate a simple image based on description. + + Args: + description: The description of the image to generate. + size: The size of the image. Defaults to "256x256". (choices: ["256x256", "512x512"]) + + Returns: + A generated image + """ + ... 
+ + +@pytest.fixture +def vlm_data_file(): + test_data = [ + # sample 0: single turn with image input + { + "messages": [ + { + "role": "user", + "content": "Describe this image.", + }, + { + "role": "assistant", + "content": "The image is a red square.", + }, + ], + "images": [Image.new("RGB", (300, 300), color="red")], + "tools": [], + }, + # sample 1: single turn with multiple images input + { + "messages": [ + { + "role": "user", + "content": "Compare these images.", + }, + { + "role": "assistant", + "content": "The first image is a red square and the second image is a green square.", + }, + ], + "images": [Image.new("RGB", (100, 100), color="red"), Image.new("RGB", (100, 300), color="green")], + "tools": [], + }, + # sample 2: multi turn with image input and tool generated image + { + "messages": [ + { + "role": "user", + "content": "Describe this image.", + }, + { + "role": "assistant", + "content": "Let's generate a zoom-in image.", + "tool_calls": [ + { + "function": {"arguments": '{"bbox_2d": "[0, 1, 2, 4]"}', "name": "image_zoom_in_tool"}, + "type": "function", + } + ], + }, + { + "role": "tool", + "content": "Generated image.", + }, + {"role": "assistant", "content": "The zoom-in image is a red square."}, + ], + "images": [Image.new("RGB", (300, 500), color="red"), Image.new("RGB", (100, 100), color="red")], + "tools": [get_json_schema(generate_image)], + }, + # sample 3: single turn without image input + { + "messages": [ + {"role": "user", "content": "How is the weather today?"}, + {"role": "assistant", "content": "The weather is sunny."}, + ], + "images": [], + "tools": [], + }, + ] + + # Create test directory if it doesn't exist + os.makedirs("test_data", exist_ok=True) + test_file = "test_data/test_vlm.parquet" + + # Save test data to parquet + df = pd.DataFrame(test_data) + + def serialize_image(img): + if isinstance(img, Image.Image): + img_byte_arr = BytesIO() + img.save(img_byte_arr, format="PNG") + return {"bytes": img_byte_arr.getvalue()} + 
return img + + df["images"] = df["images"].apply(lambda x: [serialize_image(img) for img in x]) + + df.to_parquet(test_file) + return test_file + + +def test_multiturn_sft_vlm_dataset_on_cpu(vlm_data_file): + df = pd.read_parquet(vlm_data_file) + model_path = "Qwen/Qwen3-VL-2B-Instruct" + tokenizer = AutoTokenizer.from_pretrained(model_path) + processor = AutoProcessor.from_pretrained(model_path) + config = {"max_length": 512, "pad_mode": "no_padding", "truncation": "error", "messages_key": "messages"} + dataset = MultiTurnSFTDataset(parquet_files=vlm_data_file, tokenizer=tokenizer, config=config, processor=processor) + assert dataset.pad_mode == DatasetPadMode.NO_PADDING + + for i in range(len(dataset)): + item = dataset[i] + input_ids = item["input_ids"] + loss_mask = item["loss_mask"] + position_ids = item["position_ids"] + pixel_values = item.get("multi_modal_inputs", {}).get("pixel_values") + image_grid_thw = item.get("multi_modal_inputs", {}).get("image_grid_thw") + + assert input_ids.shape == loss_mask.shape, "Shapes of input_ids and loss_mask must be equal" + assert position_ids.dim() == 2, "position_ids must be 2-dimensional" + assert position_ids.shape[0] == 4, f"position_ids[0] should be 4: {position_ids[0]}" + assert position_ids.shape[1] == input_ids.shape[0] + + # 1. verify input_ids without assistant text + text = tokenizer.decode(input_ids[loss_mask == 0], skip_special_tokens=True) + print(f"Text without assistant: {repr(text)}") + for message in df["messages"][i]: + if message["role"] != "assistant": + content = message["content"].replace("", "") + assert content in text, f"user/tool text should be in the input_ids: {text}" + + # 2. 
verify input_ids with assistant text + text = tokenizer.decode(input_ids[loss_mask == 1], skip_special_tokens=True) + print(f"Text with assistant: {repr(text)}") + for message in df["messages"][i]: + if message["role"] == "assistant": + assert message["content"] in text, f"Assistant text should be in the input_ids: {text}" + assert "assistant" not in text, f"Assistant token should not be in the input_ids: {text}" + + # 3. verify image token match with image_grid_thw + if len(df["images"][i]) > 0: + patch_size = processor.image_processor.patch_size + temporal_patch_size = processor.image_processor.temporal_patch_size + merge_size = processor.image_processor.merge_size + num_patches = image_grid_thw.prod(dim=1).sum() + assert image_grid_thw.shape == (len(df["images"][i]), 3), ( + f"image_grid_thw: {image_grid_thw.shape} should have shape ({len(df['images'][i])}, 3)" + ) + assert pixel_values.shape == (num_patches, 3 * temporal_patch_size * patch_size * patch_size), ( + f"pixel_values: {pixel_values.shape} should have shape ({num_patches}, {3 * patch_size * patch_size})" + ) + assert (input_ids == processor.image_token_id).sum() == num_patches // (merge_size**2) + else: + assert pixel_values is None, "pixel_values should be None when no image is provided" + assert image_grid_thw is None, "image_grid_thw should be None when no image is provided" + + +def test_multiturn_sft_vlm_dataloader_on_cpu(vlm_data_file): + df = pd.read_parquet(vlm_data_file) + model_path = "Qwen/Qwen3-VL-2B-Instruct" + tokenizer = AutoTokenizer.from_pretrained(model_path) + processor = AutoProcessor.from_pretrained(model_path) + config = {"max_length": 512, "pad_mode": "no_padding", "truncation": "error", "messages_key": "messages"} + dataset = MultiTurnSFTDataset(parquet_files=vlm_data_file, tokenizer=tokenizer, config=config, processor=processor) + assert dataset.pad_mode == DatasetPadMode.NO_PADDING + + collate_fn = SFTTensorCollator(DatasetPadMode.NO_PADDING) + sampler = 
DistributedSampler(dataset, shuffle=False, num_replicas=1, rank=0, drop_last=True) + batch_size = 2 + dataloader = StatefulDataLoader( + dataset=dataset, + batch_size=batch_size, + sampler=sampler, + collate_fn=collate_fn, + num_workers=0, + pin_memory=False, + drop_last=True, + ) + + for i, batch in enumerate(dataloader): + # 1. verify input_ids, loss_mask + input_ids = batch["input_ids"] + loss_mask = batch["loss_mask"] + assert input_ids.is_nested, "input_ids should be a nested tensor" + assert loss_mask.is_nested, "loss_mask should be a nested tensor" + assert input_ids.shape[0] == loss_mask.shape[0] == batch_size, "Shapes of input_ids, loss_mask must be equal" + + # 2. verify position_ids: (bs, 4, seq_len) + position_ids = batch["position_ids"] + assert position_ids.is_nested, "position_ids should be a nested tensor" + assert position_ids.dim() == 3, "position_ids must be 3-dimensional" + assert position_ids.shape[0] == batch_size + assert position_ids.shape[1] == 4 + values = position_ids.values() + assert values.shape == (4, len(input_ids.values())) + + # 3. 
verify multi-modal data + td = TensorDict(**batch, batch_size=batch_size) + multi_modal_inputs = extract_multi_modal_inputs(td["multi_modal_inputs"]) + pixel_values = multi_modal_inputs["pixel_values"] + image_grid_thw = multi_modal_inputs["image_grid_thw"] + + num_images = sum([len(images) for images in df["images"][i * batch_size : (i + 1) * batch_size]]) + assert image_grid_thw.shape == (num_images, 3), ( + f"image_grid_thw: {image_grid_thw.shape} should have shape ({num_images}, 3)" + ) + patch_size = processor.image_processor.patch_size + temporal_patch_size = processor.image_processor.temporal_patch_size + num_patches = image_grid_thw.prod(dim=1).sum() + assert pixel_values.shape[0] == num_patches, ( + f"pixel_values: {pixel_values.shape} should have shape " + f"({num_patches}, 3 * {temporal_patch_size} * {patch_size} * {patch_size})" + ) diff --git a/tests/utils/test_mlflow_key_sanitization.py b/tests/utils/test_mlflow_key_sanitization.py index 54605db241d..daf457869e3 100644 --- a/tests/utils/test_mlflow_key_sanitization.py +++ b/tests/utils/test_mlflow_key_sanitization.py @@ -20,24 +20,44 @@ class TestMlflowLoggingAdapter(unittest.TestCase): def test_sanitize_key_and_warning(self): + """Test key sanitization for invalid characters and consecutive slashes with warnings.""" adapter = _MlflowLoggingAdapter() - data = {"valid_key": 1.0, "invalid@key!": 2.0, "another/valid-key": 3.0, "bad key#": 4.0} + data = { + "valid_key": 1.0, + "invalid@key!": 2.0, + "another/valid-key": 3.0, + "bad key#": 4.0, + "val-aux//reward/mean_at_1": 5.0, + "val-core///acc/best_at_5": 6.0, + "metric////with/many////slashes": 7.0, + } # Patch mlflow.log_metrics to capture the metrics actually sent with ( patch("mlflow.log_metrics") as mock_log_metrics, patch.object(adapter, "logger") as mock_logger, ): adapter.log(data, step=5) - # Check that keys are sanitized + # Check that invalid characters are sanitized sent_metrics = mock_log_metrics.call_args[1]["metrics"] 
self.assertIn("invalid_at_key_", sent_metrics) # @ becomes _at_, ! becomes _ self.assertIn("bad key_", sent_metrics) # # becomes _, space remains self.assertNotIn("invalid@key!", sent_metrics) self.assertNotIn("bad key#", sent_metrics) - # Check that a warning was logged for each sanitized key + # Check that consecutive slashes are collapsed to single slashes + self.assertIn("val-aux/reward/mean_at_1", sent_metrics) + self.assertIn("val-core/acc/best_at_5", sent_metrics) + self.assertIn("metric/with/many/slashes", sent_metrics) + self.assertNotIn("val-aux//reward/mean_at_1", sent_metrics) + self.assertNotIn("val-core///acc/best_at_5", sent_metrics) + # Check that warnings were logged for all sanitized keys warning_msgs = [str(call) for call in mock_logger.warning.call_args_list] + # Warnings for invalid characters self.assertTrue(any("invalid@key!" in msg and "invalid_at_key_" in msg for msg in warning_msgs)) self.assertTrue(any("bad key#" in msg and "bad key_" in msg for msg in warning_msgs)) + # Warnings for consecutive slashes + self.assertTrue(any("val-aux//reward/mean_at_1" in msg for msg in warning_msgs)) + self.assertTrue(any("val-core///acc/best_at_5" in msg for msg in warning_msgs)) + self.assertTrue(any("metric////with/many////slashes" in msg for msg in warning_msgs)) if __name__ == "__main__": diff --git a/tests/workers/config/test_critic_config_on_cpu.py b/tests/workers/config/test_critic_config_on_cpu.py index d762763e0f1..fb03560e0f4 100644 --- a/tests/workers/config/test_critic_config_on_cpu.py +++ b/tests/workers/config/test_critic_config_on_cpu.py @@ -30,6 +30,7 @@ ) +@pytest.mark.skip(reason="This test is flaky when we actively load model config") class TestCriticConfig: """Test suite for critic configuration dataclasses.""" diff --git a/tests/workers/rollout/rollout_vllm/test_vllm_abort.py b/tests/workers/rollout/rollout_vllm/test_vllm_abort.py new file mode 100644 index 00000000000..82034f1e905 --- /dev/null +++ 
b/tests/workers/rollout/rollout_vllm/test_vllm_abort.py @@ -0,0 +1,217 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Test vLLM abort functionality. + +Usage: + pytest tests/workers/rollout/rollout_vllm/test_vllm_abort.py -v -s + or + python tests/workers/rollout/rollout_vllm/test_vllm_abort.py +""" + +import asyncio +import os +import time +from uuid import uuid4 + + +def test_vllm_abort(): + # ==================== Configuration ==================== + MODEL_PATH = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct") # /root/models/Qwen/Qwen2.5-1.5B-Instruct + GPUS_PER_NODE = 2 + TP_SIZE = 1 + ROLLOUT_NAME = "vllm" + ABORT_DELAY = 0.5 # seconds to wait before aborting + + print("=" * 60) + print("vLLM Abort Test") + print("=" * 60) + print(f"Model: {MODEL_PATH}") + print(f"GPUs: {GPUS_PER_NODE}, TP Size: {TP_SIZE}") + print(f"Abort Delay: {ABORT_DELAY}s") + print("=" * 60) + + # ==================== Initialize Ray ==================== + print("\n[1] Initializing Ray...") + import ray + + ray.init( + runtime_env={ + "env_vars": { + "TOKENIZERS_PARALLELISM": "true", + "NCCL_DEBUG": "WARN", + "VLLM_LOGGING_LEVEL": "INFO", + "VLLM_USE_V1": "1", + } + }, + ignore_reinit_error=True, + ) + + try: + # ==================== Create Config ==================== + print("\n[2] Creating config...") + from hydra import compose, initialize_config_dir + + config_dir = os.path.abspath("verl/verl/trainer/config") + if 
not os.path.exists(config_dir): + config_dir = os.path.abspath("verl/trainer/config") + + with initialize_config_dir(config_dir=config_dir, version_base=None): + config = compose(config_name="ppo_trainer") + + config.trainer.n_gpus_per_node = GPUS_PER_NODE + config.trainer.nnodes = 1 + config.actor_rollout_ref.model.path = MODEL_PATH + config.actor_rollout_ref.rollout.name = ROLLOUT_NAME + config.actor_rollout_ref.rollout.mode = "async" + config.actor_rollout_ref.rollout.tensor_model_parallel_size = TP_SIZE + config.actor_rollout_ref.rollout.prompt_length = 512 + config.actor_rollout_ref.rollout.response_length = 512 # Longer for abort test + + # ==================== Create Rollout Server ==================== + print("\n[3] Creating rollout server (this may take a while)...") + from verl.workers.rollout.replica import get_rollout_replica_class + + rollout_config = config.actor_rollout_ref.rollout + model_config = config.actor_rollout_ref.model + + rollout_server_class = get_rollout_replica_class(ROLLOUT_NAME) + server = rollout_server_class( + replica_rank=0, + config=rollout_config, + model_config=model_config, + gpus_per_node=GPUS_PER_NODE, + ) + + asyncio.run(server.init_standalone()) + server_handle = server._server_handle + print(f"Server address: {server._server_address}") + + # ==================== Load Tokenizer ==================== + print("\n[4] Loading tokenizer...") + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True) + + # ==================== Prepare Prompts ==================== + print("\n[5] Preparing prompts (to ensure generation takes time)...") + NUM_PROMPTS = 8 + prompts = [ + "Write a very long story about a brave knight and dragon.", + "Explain the history of the Roman Empire in great detail.", + "Describe quantum computing and its applications thoroughly.", + "Write an essay about climate change and its global effects.", + "Who won the Champions League in 2019?", + "Write a 
detailed analysis of Shakespeare's Hamlet.", + "Describe the process of photosynthesis in plants.", + "Write about the French Revolution and its consequences.", + ] + + all_prompt_ids = [] + for prompt in prompts[:NUM_PROMPTS]: + messages = [{"role": "user", "content": prompt}] + prompt_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) + all_prompt_ids.append(prompt_ids) + print(f"Prepared {NUM_PROMPTS} prompts") + + # ==================== Start Generations and Abort ==================== + print("\n[6] Starting generations and then aborting...") + + sampling_params = { + "temperature": 1.0, + "top_p": 1.0, + "logprobs": False, + } + + # Start all generations concurrently + print(f"\n Starting {NUM_PROMPTS} generations...") + generate_refs = [] + for i, prompt_ids in enumerate(all_prompt_ids): + request_id = f"abort_test_{i}_{uuid4().hex[:8]}" + ref = server_handle.generate.remote( + request_id=request_id, + prompt_ids=prompt_ids, + sampling_params=sampling_params, + image_data=None, + ) + generate_refs.append((i, request_id, ref)) + print(f" Started request {i}: {request_id}") + + # Wait before aborting + print(f"\n Waiting {ABORT_DELAY}s before abort...") + time.sleep(ABORT_DELAY) + + # Call abort + print(" Calling abort_all_requests...") + abort_start = time.perf_counter() + abort_result = ray.get(server_handle.abort_all_requests.remote()) + abort_time = time.perf_counter() - abort_start + + print(f" Abort took: {abort_time * 1000:.2f}ms") + print(f" Abort result: {abort_result}") + + # Wait for all generations to finish + print("\n Waiting for all generations to complete...") + outputs = [] + for i, request_id, ref in generate_refs: + try: + output = ray.get(ref, timeout=10.0) + outputs.append((i, request_id, output)) + except ray.exceptions.GetTimeoutError: + print(f" Request {i} timed out!") + outputs.append((i, request_id, None)) + + # ==================== Print Results ==================== + print("\n" + "=" * 60) + 
print("RESULTS") + print("=" * 60) + + aborted_count = 0 + completed_count = 0 + timeout_count = 0 + + for i, request_id, output in outputs: + if output is None: + timeout_count += 1 + print(f"[{i}] {request_id}: TIMEOUT") + elif output.stop_reason == "aborted": + aborted_count += 1 + print(f"[{i}] {request_id}: ABORTED ({len(output.token_ids)} tokens)") + print(f"Partial Output: {tokenizer.decode(output.token_ids)}") + else: + completed_count += 1 + print(f"[{i}] {request_id}: COMPLETED ({output.stop_reason}, {len(output.token_ids)} tokens)") + print(f"Full Output: {tokenizer.decode(output.token_ids)}") + + print(f"\nSummary: {aborted_count} aborted, {completed_count} completed, {timeout_count} timeout") + + print("\n" + "=" * 60) + print(f"Abort result: {abort_result}") + print("=" * 60) + print("Abort test completed!") + + # Assertions for pytest + assert timeout_count == 0, "No requests should timeout" + assert aborted_count + completed_count == NUM_PROMPTS, "All requests should finish" + assert "aborted_count" in abort_result, "Abort result should contain aborted_count" + assert abort_time < 1.0, "Abort should be fast (< 1 second)" + + finally: + print("\nShutting down Ray...") + ray.shutdown() + + +if __name__ == "__main__": + # Can still run as standalone script + test_vllm_abort() diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 42e456acf6f..ec0ed25afb5 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -70,7 +70,7 @@ def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandl random.shuffle(self.server_handles) # Least requests load balancing - self.weighted_serveres = [[0, (hash(server), server)] for server in server_handles] + self.weighted_serveres = [[0, idx, server] for idx, server in enumerate(self.server_handles)] heapq.heapify(self.weighted_serveres) # LRU cache to map request_id to server @@ -81,7 +81,7 @@ def 
_choose_server(self, request_id: str) -> ray.actor.ActorHandle: if request_id in self.request_id_to_server: return self.request_id_to_server[request_id] - server = self.weighted_serveres[0][1][1] + _, _, server = self.weighted_serveres[0] self.weighted_serveres[0][0] += 1 heapq.heapreplace(self.weighted_serveres, self.weighted_serveres[0]) self.request_id_to_server[request_id] = server @@ -175,9 +175,10 @@ class _InternalAgentLoopOutput(AgentLoopOutput): """Extra fields for dynamic addition.""" -# make hydra.utils.instantiate happy -class _DummyConfig: - def __init__(self, config: DictConfig) -> None: +class DictConfigWrap: + """Wrapper for DictConfig to avoid hydra.utils.instantiate recursive resolve.""" + + def __init__(self, config: DictConfig): self.config = config @@ -185,11 +186,9 @@ class AgentLoopBase(ABC): """An agent loop takes an input message, chat with OpenAI compatible LLM server and interact with various environments.""" - _class_initialized = False - def __init__( self, - trainer_config: _DummyConfig, + trainer_config: DictConfigWrap, server_manager: AsyncLLMServerManager, tokenizer: AutoTokenizer, processor: AutoProcessor, @@ -198,32 +197,17 @@ def __init__( """Initialize agent loop, each sample will have its own loop instance. Args: - trainer_config (_DummyConfig): trainer config. + trainer_config (DictConfigWrap): trainer config. server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager. tokenizer (AutoTokenizer): Tokenizer for tokenize messages. processor (AutoProcessor): Processor for process messages. 
""" - self.init_class(config=trainer_config.config, tokenizer=tokenizer, processor=processor, **kwargs) self.config = trainer_config.config self.server_manager = server_manager self.tokenizer = tokenizer self.processor = processor self.loop = asyncio.get_running_loop() - @classmethod - def init_class(cls, config: DictConfig, tokenizer: AutoTokenizer, processor: AutoProcessor, **kwargs): - """This is used to do heavy initialization work that should shared across all instances. It's only called once. - - Args: - config (DictConfig): trainer config. - tokenizer (AutoTokenizer): Tokenizer for tokenize messages. - processor (AutoProcessor): Processor for process multi_modal data. - **kwargs: extra kwargs from config file passed in by `hydra.utils.instantiate`. - """ - if cls._class_initialized: - return - cls._class_initialized = True - @abstractmethod async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput: """Run agent loop to interact with LLM server and environment. 
@@ -297,12 +281,15 @@ def __init__( self.processor.chat_template = self.config.actor_rollout_ref.model.custom_chat_template self.tokenizer.chat_template = self.config.actor_rollout_ref.model.custom_chat_template - self.reward_manager_worker = RewardLoopWorker.options( - scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( - node_id=ray.get_runtime_context().get_node_id(), - soft=False, - ), - ).remote(self.config, self.reward_router_address) + use_reward_loop = True if self.config.reward_model.use_reward_loop else None + self.use_reward_loop = use_reward_loop + if use_reward_loop and not hasattr(self, "reward_loop_worker"): + self.reward_loop_worker = RewardLoopWorker.options( + scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( + node_id=ray.get_runtime_context().get_node_id(), + soft=False, + ), + ).remote(self.config, self.reward_router_address) trace_config = self.config.actor_rollout_ref.rollout.get("trace", {}) RolloutTraceConfig.init( @@ -417,7 +404,7 @@ async def _run_agent_loop( agent_loop_config = _agent_loop_registry[agent_name] agent_loop = hydra.utils.instantiate( config=agent_loop_config, - trainer_config=_DummyConfig(config=self.config), + trainer_config=DictConfigWrap(config=self.config), server_manager=self.server_manager, tokenizer=self.tokenizer, processor=self.processor, @@ -551,7 +538,7 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO enable_async_reward = ( self.reward_router_address is not None and self.config.reward_model.enable_resource_pool ) or not self.config.reward_model.enable - if output.reward_score is None and enable_async_reward: + if output.reward_score is None and enable_async_reward and self.use_reward_loop: batch = TensorDict( { "prompts": prompt_output["input_ids"], # [1, prompt_length] @@ -572,7 +559,7 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO batch=batch, 
non_tensor_batch=non_tensor_batch, ) - result = await self.reward_manager_worker.compute_score.remote(data) + result = await self.reward_loop_worker.compute_score.remote(data) output.reward_score = result["reward_score"] output.extra_fields["reward_extra_info"] = result["reward_extra_info"] diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py index e107bb37b51..ef08376df75 100644 --- a/verl/experimental/agent_loop/tool_agent_loop.py +++ b/verl/experimental/agent_loop/tool_agent_loop.py @@ -20,9 +20,17 @@ from typing import Any, Optional from uuid import uuid4 -from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register +from transformers import AutoProcessor, AutoTokenizer + +from verl.experimental.agent_loop.agent_loop import ( + AgentLoopBase, + AgentLoopOutput, + AsyncLLMServerManager, + DictConfigWrap, + register, +) from verl.experimental.agent_loop.tool_parser import FunctionCall, ToolParser -from verl.experimental.agent_loop.utils import add_generation_prompt_for_gpt_oss, format_gpt_oss_tool_response_manually +from verl.experimental.agent_loop.utils import build_gpt_oss_tool_response_text from verl.interactions.base import BaseInteraction from verl.interactions.utils.interaction_registry import initialize_interactions_from_config from verl.tools.schemas import ToolResponse @@ -44,7 +52,8 @@ class AgentState(Enum): class AgentData: - """Encapsulates all state variables for the agent loop.""" + """Encapsulates all state variables for the agent loop. AgentData is passed to tool calling in case that + tool may need to access full history state. 
User can store any tool session data in `extra_fields`.""" def __init__( self, @@ -77,44 +86,49 @@ def __init__( # Temporary state for tool calls self.tool_calls: list[FunctionCall] = [] - # Extra fields for dynamic addition + # Extra fields for dynamic addition, e.g., tool session data self.extra_fields: dict[str, Any] = {} @register("tool_agent") class ToolAgentLoop(AgentLoopBase): - @classmethod - def init_class(cls, config, tokenizer, processor, **kwargs): - if cls._class_initialized: - return - cls._class_initialized = True - print("Performing class-level ToolAgentLoop initialization") + def __init__( + self, + trainer_config: DictConfigWrap, + server_manager: AsyncLLMServerManager, + tokenizer: AutoTokenizer, + processor: AutoProcessor, + **kwargs, + ): + super().__init__(trainer_config, server_manager, tokenizer, processor, **kwargs) + config = trainer_config.config # Initialize tools from config file - cls.tokenizer = tokenizer - cls.processor = processor - cls.max_user_turns = config.actor_rollout_ref.rollout.multi_turn.max_user_turns - cls.max_assistant_turns = config.actor_rollout_ref.rollout.multi_turn.max_assistant_turns - cls.max_parallel_calls = config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls - cls.max_tool_response_length = config.actor_rollout_ref.rollout.multi_turn.max_tool_response_length - cls.tool_response_truncate_side = config.actor_rollout_ref.rollout.multi_turn.tool_response_truncate_side + self.max_user_turns = config.actor_rollout_ref.rollout.multi_turn.max_user_turns + self.max_assistant_turns = config.actor_rollout_ref.rollout.multi_turn.max_assistant_turns + self.max_parallel_calls = config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls + self.max_tool_response_length = config.actor_rollout_ref.rollout.multi_turn.max_tool_response_length + self.tool_response_truncate_side = config.actor_rollout_ref.rollout.multi_turn.tool_response_truncate_side tool_config_path = 
config.actor_rollout_ref.rollout.multi_turn.tool_config_path tool_list = initialize_tools_from_config(tool_config_path) if tool_config_path else [] - cls.tools = {tool.name: tool for tool in tool_list} - cls.tool_schemas = [tool.tool_schema.model_dump(exclude_unset=True, exclude_none=True) for tool in tool_list] - cls.tool_parser = ToolParser.get_tool_parser(config.actor_rollout_ref.rollout.multi_turn.format, cls.tokenizer) - cls.tool_parser_name = config.actor_rollout_ref.rollout.multi_turn.format - print(f"Initialized tools: {cls.tools}") + self.tools = {tool.name: tool for tool in tool_list} + self.tool_schemas = [tool.tool_schema.model_dump(exclude_unset=True, exclude_none=True) for tool in tool_list] + self.tool_parser = ToolParser.get_tool_parser( + config.actor_rollout_ref.rollout.multi_turn.format, self.tokenizer + ) + self.tool_parser_name = config.actor_rollout_ref.rollout.multi_turn.format - cls.apply_chat_template_kwargs = config.data.get("apply_chat_template_kwargs", {}) - cls.prompt_length = config.actor_rollout_ref.rollout.prompt_length - cls.response_length = config.actor_rollout_ref.rollout.response_length - cls.system_prompt = initialize_system_prompt(cls.tokenizer, **cls.apply_chat_template_kwargs) + self.apply_chat_template_kwargs = config.data.get("apply_chat_template_kwargs", {}) + self.prompt_length = config.actor_rollout_ref.rollout.prompt_length + self.response_length = config.actor_rollout_ref.rollout.response_length + self.system_prompt = initialize_system_prompt(self.tokenizer, **self.apply_chat_template_kwargs) # Initialize interactions from config file - cls.interaction_config_file = config.actor_rollout_ref.rollout.multi_turn.interaction_config_path - if cls.interaction_config_file: - cls.interaction_map: dict[str, BaseInteraction] = cls._initialize_interactions(cls.interaction_config_file) + self.interaction_config_file = config.actor_rollout_ref.rollout.multi_turn.interaction_config_path + if self.interaction_config_file: + 
self.interaction_map: dict[str, BaseInteraction] = self._initialize_interactions( + self.interaction_config_file + ) @rollout_trace_op async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput: @@ -271,7 +285,7 @@ async def _handle_processing_tools_state(self, agent_data: AgentData) -> AgentSt tasks = [] tool_call_names = [] for tool_call in agent_data.tool_calls[: self.max_parallel_calls]: - tasks.append(self._call_tool(tool_call, agent_data.tools_kwargs)) + tasks.append(self._call_tool(tool_call, agent_data.tools_kwargs, agent_data)) tool_call_names.append(tool_call.name) with simple_timer("tool_calls", agent_data.metrics): @@ -346,14 +360,7 @@ async def _handle_processing_tools_state(self, agent_data: AgentData) -> AgentSt else: if self.tool_parser_name == "gpt-oss": logger.info("manually format tool responses for gpt-oss") - # Format tool responses manually - tool_response_texts = [] - for i, tool_msg in enumerate(add_messages): - actual_tool_name = tool_call_names[i] - formatted = format_gpt_oss_tool_response_manually(tool_msg["content"], actual_tool_name) - tool_response_texts.append(formatted) - - tool_response_text = add_generation_prompt_for_gpt_oss("".join(tool_response_texts)) + tool_response_text = build_gpt_oss_tool_response_text(add_messages, tool_call_names) response_ids = await self.loop.run_in_executor( None, lambda: self.tokenizer.encode(tool_response_text, add_special_tokens=False) ) @@ -434,7 +441,7 @@ async def _handle_interacting_state(self, agent_data: AgentData) -> AgentState: return AgentState.GENERATING async def _call_tool( - self, tool_call: FunctionCall, tools_kwargs: dict[str, Any] + self, tool_call: FunctionCall, tools_kwargs: dict[str, Any], agent_data: AgentData ) -> tuple[ToolResponse, float, dict]: """Call tool and return tool response.""" tool, instance_id = None, None @@ -445,7 +452,9 @@ async def _call_tool( tool = self.tools[tool_name] kwargs = tools_kwargs.get(tool_name, {}) instance_id, _ = await 
tool.create(create_kwargs=kwargs.get("create_kwargs", {})) - tool_execution_response, tool_reward, res = await tool.execute(instance_id, tool_args) + tool_execution_response, tool_reward, res = await tool.execute( + instance_id, tool_args, agent_data=agent_data + ) except Exception as e: logger.warning(f"Error when executing tool: {e}") return ( @@ -481,8 +490,7 @@ async def _call_tool( return ToolResponse(**tool_response_kwargs), tool_reward, res - @classmethod - def _initialize_interactions(cls, interaction_config_file): + def _initialize_interactions(self, interaction_config_file): """Initialize interactions from configuration. Returns: dict[str, BaseInteraction]: A dictionary mapping interaction names to interaction instances. @@ -491,5 +499,4 @@ def _initialize_interactions(cls, interaction_config_file): return {} interaction_map = initialize_interactions_from_config(interaction_config_file) - logger.info(f"Initialize interactions from configuration: interaction_map: {list(interaction_map.keys())}") return interaction_map diff --git a/verl/experimental/agent_loop/utils.py b/verl/experimental/agent_loop/utils.py index 39ffbd0335a..68cb57d870f 100644 --- a/verl/experimental/agent_loop/utils.py +++ b/verl/experimental/agent_loop/utils.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os +from typing import Any def resolve_config_path(config_path: str) -> str: @@ -95,3 +96,13 @@ def add_generation_prompt_for_gpt_oss(message_content: str) -> str: Message content string with generation prompt """ return message_content + "<|start|>assistant" + + +def build_gpt_oss_tool_response_text(messages: list[dict[str, Any]], tool_call_names: list[str]) -> str: + """Build gpt-oss tool response text (manual formatting + generation prompt).""" + tool_response_texts: list[str] = [] + for i, tool_msg in enumerate(messages): + actual_tool_name = tool_call_names[i] + formatted = format_gpt_oss_tool_response_manually(tool_msg["content"], actual_tool_name) + tool_response_texts.append(formatted) + return add_generation_prompt_for_gpt_oss("".join(tool_response_texts)) diff --git a/verl/experimental/reward/__init__.py b/verl/experimental/reward/__init__.py index 1c8c72a423d..03807f0277b 100644 --- a/verl/experimental/reward/__init__.py +++ b/verl/experimental/reward/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .reward_manager import RewardLoopManager, RewardLoopWorker +from .reward_loop import RewardLoopManager, RewardLoopWorker from .reward_model import RewardModelManager __all__ = ["RewardModelManager", "RewardLoopWorker", "RewardLoopManager"] diff --git a/verl/experimental/reward/reward_manager.py b/verl/experimental/reward/reward_loop.py similarity index 94% rename from verl/experimental/reward/reward_manager.py rename to verl/experimental/reward/reward_loop.py index 52e7403ab6e..8e33ef86fec 100644 --- a/verl/experimental/reward/reward_manager.py +++ b/verl/experimental/reward/reward_loop.py @@ -30,7 +30,7 @@ from verl.utils import hf_tokenizer from verl.utils.fs import copy_to_local -from .reward_loop import get_reward_loop_manager_cls +from .reward_manager import get_reward_loop_manager_cls from .reward_model import RewardModelManager logger = logging.getLogger(__file__) @@ -136,6 +136,14 @@ async def _preprocess_reward_inputs(self, data: DataProto) -> str: add_generation_prompt=False, tokenize=False, ) + + # llama tokenizer will add bos token by default + # will be removed in vllm >= 0.11.2, where we can add "add_special_tokens" = False + if self.reward_model_tokenizer.bos_token is not None and rm_prompt.startswith( + self.reward_model_tokenizer.bos_token + ): + rm_prompt = rm_prompt[len(self.reward_model_tokenizer.bos_token) :] + return rm_prompt async def compute_score_disrm(self, data: DataProto) -> dict: @@ -148,7 +156,7 @@ async def compute_score_disrm(self, data: DataProto) -> dict: "model": model_name, "input": disrm_prompt, "activation": False, - "add_special_tokens": False, + # "add_special_tokens": False, # vllm >= 0.11.2 } output = await self._post_request(payloads, "classify") rm_score = output["data"][-1]["probs"][-1] @@ -187,7 +195,7 @@ def __init__(self, config: DictConfig, rm_resource_pool: RayResourcePool = None) def _init_reward_loop_workers(self): self.reward_loop_workers = [] - num_workers = self.config.reward_model.get("num_workers", 1) 
+ num_workers = self.config.reward_model.num_workers node_ids = [node["NodeID"] for node in ray.nodes() if node["Alive"] and node["Resources"].get("CPU", 0) > 0] for i in range(num_workers): diff --git a/verl/experimental/reward/reward_loop/__init__.py b/verl/experimental/reward/reward_manager/__init__.py similarity index 100% rename from verl/experimental/reward/reward_loop/__init__.py rename to verl/experimental/reward/reward_manager/__init__.py diff --git a/verl/experimental/reward/reward_loop/base.py b/verl/experimental/reward/reward_manager/base.py similarity index 100% rename from verl/experimental/reward/reward_loop/base.py rename to verl/experimental/reward/reward_manager/base.py diff --git a/verl/experimental/reward/reward_loop/dapo.py b/verl/experimental/reward/reward_manager/dapo.py similarity index 97% rename from verl/experimental/reward/reward_loop/dapo.py rename to verl/experimental/reward/reward_manager/dapo.py index 5bd032c0827..d9c9307c6d7 100644 --- a/verl/experimental/reward/reward_loop/dapo.py +++ b/verl/experimental/reward/reward_manager/dapo.py @@ -15,8 +15,8 @@ import inspect from verl import DataProto -from verl.experimental.reward.reward_loop import register -from verl.experimental.reward.reward_loop.base import RewardLoopManagerBase +from verl.experimental.reward.reward_manager import register +from verl.experimental.reward.reward_manager.base import RewardLoopManagerBase from verl.utils.reward_score import default_compute_score diff --git a/verl/experimental/reward/reward_loop/limited.py b/verl/experimental/reward/reward_manager/limited.py similarity index 99% rename from verl/experimental/reward/reward_loop/limited.py rename to verl/experimental/reward/reward_manager/limited.py index 71bdcf95bee..180896915ce 100644 --- a/verl/experimental/reward/reward_loop/limited.py +++ b/verl/experimental/reward/reward_manager/limited.py @@ -20,8 +20,8 @@ from transformers import AutoTokenizer from verl import DataProto -from 
verl.experimental.reward.reward_loop import register as register_loop -from verl.experimental.reward.reward_loop.base import RewardLoopManagerBase +from verl.experimental.reward.reward_manager import register as register_loop +from verl.experimental.reward.reward_manager.base import RewardLoopManagerBase from verl.utils.reward_score import default_compute_score from verl.workers.reward_manager import register as register_manager diff --git a/verl/experimental/reward/reward_loop/naive.py b/verl/experimental/reward/reward_manager/naive.py similarity index 96% rename from verl/experimental/reward/reward_loop/naive.py rename to verl/experimental/reward/reward_manager/naive.py index d607cef7ff3..cbe5c474bff 100644 --- a/verl/experimental/reward/reward_loop/naive.py +++ b/verl/experimental/reward/reward_manager/naive.py @@ -15,8 +15,8 @@ import inspect from verl import DataProto -from verl.experimental.reward.reward_loop import register -from verl.experimental.reward.reward_loop.base import RewardLoopManagerBase +from verl.experimental.reward.reward_manager import register +from verl.experimental.reward.reward_manager.base import RewardLoopManagerBase from verl.utils.reward_score import default_compute_score diff --git a/verl/experimental/reward/reward_loop/registry.py b/verl/experimental/reward/reward_manager/registry.py similarity index 96% rename from verl/experimental/reward/reward_loop/registry.py rename to verl/experimental/reward/reward_manager/registry.py index 099e5eb200d..f31a1762e05 100644 --- a/verl/experimental/reward/reward_loop/registry.py +++ b/verl/experimental/reward/reward_manager/registry.py @@ -14,7 +14,7 @@ from typing import Callable -from verl.experimental.reward.reward_loop.base import RewardLoopManagerBase +from verl.experimental.reward.reward_manager.base import RewardLoopManagerBase __all__ = ["register", "get_reward_loop_manager_cls"] diff --git a/verl/experimental/reward/reward_model.py b/verl/experimental/reward/reward_model.py index 
957ef05bf49..2bc05e1eea1 100644 --- a/verl/experimental/reward/reward_model.py +++ b/verl/experimental/reward/reward_model.py @@ -91,10 +91,13 @@ def _initialize_llm_servers(self): def _initialize_router(self): worker_urls = [f"http://{server_address}" for server_address in self.server_addresses] - if self.config.rollout.name == "sglang": - from .router.inner_sglang_router import launch_router_process - else: - from .router.naive_router import launch_router_process + # TODO (dyy): sglang router is not ready yet. + # if self.config.rollout.name == "sglang": + # from .router.inner_sglang_router import launch_router_process + # else: + # from .router.naive_router import launch_router_process + + from .router.naive_router import launch_router_process self.router_address, _ = launch_router_process(worker_urls=worker_urls) diff --git a/verl/models/llama/megatron/layers/parallel_rmsnorm.py b/verl/models/llama/megatron/layers/parallel_rmsnorm.py index bc2e9ae36f0..23a4a847ff8 100644 --- a/verl/models/llama/megatron/layers/parallel_rmsnorm.py +++ b/verl/models/llama/megatron/layers/parallel_rmsnorm.py @@ -15,7 +15,6 @@ import numbers import torch -from apex.normalization.fused_layer_norm import fused_rms_norm_affine from megatron.core import ModelParallelConfig from torch import nn from transformers import LlamaConfig @@ -39,6 +38,8 @@ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig): sp_utils.mark_parameter_as_sequence_parallel(self.weight) def forward(self, hidden_states): + from apex.normalization.fused_layer_norm import fused_rms_norm_affine + return fused_rms_norm_affine( input=hidden_states, weight=self.weight, diff --git a/verl/models/mcore/model_forward.py b/verl/models/mcore/model_forward.py index 3a9d6bb4aba..5b1d4dc4391 100644 --- a/verl/models/mcore/model_forward.py +++ b/verl/models/mcore/model_forward.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import torch from verl.utils.megatron_utils import unwrap_model @@ -152,6 +153,8 @@ def gptmodel_forward_no_padding( logits_processor=None, logits_processor_args: dict = None, value_model=False, + vision_model=False, + pad_token_id=None, data_format: str = "thd", ): """Default forward pass for GPT models with optional sequence packing.""" @@ -174,9 +177,19 @@ def gptmodel_forward_no_padding( if data_format == "thd": input_ids_rmpad, packed_seq_params = preprocess_thd_no_padding(input_ids, pre_process=pre_process) input_ids_rmpad = input_ids_rmpad.contiguous() + + # For VLM model, need to pass bshd format `input_ids` and `attention_mask`. + attention_mask = None + if vision_model: + input_ids_rmpad = input_ids.to_padded_tensor(pad_token_id) + seqlens_in_batch = input_ids.offsets().diff() + attention_mask = torch.zeros_like(input_ids_rmpad, dtype=torch.bool) + for i, seqlen in enumerate(seqlens_in_batch): + attention_mask[i, :seqlen] = True + output_orig = model( input_ids=input_ids_rmpad, - attention_mask=None, + attention_mask=attention_mask, position_ids=None, packed_seq_params=packed_seq_params, **model_kwargs, diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py index d8c7b2cfa86..fd0f0ce1a5f 100644 --- a/verl/models/mcore/registry.py +++ b/verl/models/mcore/registry.py @@ -22,6 +22,54 @@ import torch import torch.nn as nn +from .model_forward import gptmodel_forward_no_padding, model_forward_gen +from .model_forward_fused import fused_forward_model_gen + + +class SupportedVLM(Enum): + QWEN2_5_VL = "Qwen2_5_VLForConditionalGeneration" + QWEN3_MOE_VL = "Qwen3VLMoeForConditionalGeneration" + QWEN3_VL = "Qwen3VLForConditionalGeneration" + + +def get_mcore_forward_fn(hf_config) -> Callable: + """ + Get the forward function for given model architecture. 
+ """ + assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" + if hf_config.architectures[0] in SupportedVLM: + return model_forward_gen(True) + else: + # default to language model + return model_forward_gen(False) + + +def get_mcore_forward_no_padding_fn(hf_config) -> Callable: + """ + Get the forward function for given model architecture. + """ + assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" + return gptmodel_forward_no_padding + + +def get_mcore_forward_fused_fn(hf_config) -> Callable: + """ + Get the forward function for given model architecture. + """ + assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" + if hf_config.architectures[0] in SupportedVLM: + return fused_forward_model_gen(True) + else: + # default to language model + return fused_forward_model_gen(False) + + +# ruff: noqa + +######################################################## +# below is the deprecated code +######################################################## + from .config_converter import ( PretrainedConfig, TransformerConfig, @@ -33,8 +81,6 @@ hf_to_mcore_config_qwen2moe, hf_to_mcore_config_qwen3moe, ) -from .model_forward import gptmodel_forward_no_padding, model_forward_gen -from .model_forward_fused import fused_forward_model_gen from .model_initializer import ( BaseModelInitializer, DeepseekV3Model, @@ -67,6 +113,7 @@ class SupportedModel(Enum): GLM4_MOE = "Glm4MoeForCausalLM" QWEN3_TOKEN_CLASSIFICATION = "Qwen3ForTokenClassification" + LLAMA_TOKEN_CLASSIFICATION = "LlamaForTokenClassification" QWEN3_MOE_VL = "Qwen3VLMoeForConditionalGeneration" QWEN3_VL = "Qwen3VLForConditionalGeneration" GPT_OSS = "GptOssForCausalLM" @@ -84,6 +131,7 @@ class SupportedModel(Enum): SupportedModel.QWEN3: hf_to_mcore_config_dense, SupportedModel.QWEN3_MOE: hf_to_mcore_config_qwen3moe, SupportedModel.QWEN3_TOKEN_CLASSIFICATION: hf_to_mcore_config_dense, + 
SupportedModel.LLAMA_TOKEN_CLASSIFICATION: hf_to_mcore_config_dense, } # Registry for model initializers @@ -98,6 +146,7 @@ class SupportedModel(Enum): SupportedModel.QWEN3: DenseModel, SupportedModel.QWEN3_MOE: Qwen3MoEModel, SupportedModel.QWEN3_TOKEN_CLASSIFICATION: DenseModel, + SupportedModel.LLAMA_TOKEN_CLASSIFICATION: DenseModel, } # Registry for model forward functions @@ -113,9 +162,9 @@ class SupportedModel(Enum): SupportedModel.QWEN2_5_VL: model_forward_gen(True), SupportedModel.QWEN3_MOE_VL: model_forward_gen(True), SupportedModel.QWEN3_VL: model_forward_gen(True), - SupportedModel.DEEPSEEK_V3: model_forward_gen(), SupportedModel.GLM4_MOE: model_forward_gen(), SupportedModel.QWEN3_TOKEN_CLASSIFICATION: model_forward_gen(), + SupportedModel.LLAMA_TOKEN_CLASSIFICATION: model_forward_gen(), SupportedModel.GPT_OSS: model_forward_gen(), } @@ -132,9 +181,9 @@ class SupportedModel(Enum): SupportedModel.LLAMA4: gptmodel_forward_no_padding, SupportedModel.QWEN3: gptmodel_forward_no_padding, SupportedModel.QWEN3_MOE: gptmodel_forward_no_padding, - SupportedModel.DEEPSEEK_V3: gptmodel_forward_no_padding, SupportedModel.GLM4_MOE: gptmodel_forward_no_padding, SupportedModel.QWEN3_TOKEN_CLASSIFICATION: gptmodel_forward_no_padding, + SupportedModel.LLAMA_TOKEN_CLASSIFICATION: gptmodel_forward_no_padding, SupportedModel.GPT_OSS: gptmodel_forward_no_padding, } @@ -144,7 +193,6 @@ class SupportedModel(Enum): SupportedModel.QWEN2: fused_forward_model_gen(), SupportedModel.QWEN2_MOE: fused_forward_model_gen(), SupportedModel.MIXTRAL: fused_forward_model_gen(), - SupportedModel.DEEPSEEK_V3: fused_forward_model_gen(), SupportedModel.QWEN2_5_VL: fused_forward_model_gen(True), SupportedModel.QWEN3_MOE_VL: fused_forward_model_gen(True), SupportedModel.QWEN3_VL: fused_forward_model_gen(True), @@ -167,6 +215,7 @@ class SupportedModel(Enum): SupportedModel.QWEN3_MOE: McoreToHFWeightConverterQwen3Moe, SupportedModel.QWEN2_5_VL: McoreToHFWeightConverterQwen2_5_VL, 
SupportedModel.QWEN3_TOKEN_CLASSIFICATION: McoreToHFWeightConverterDense, + SupportedModel.LLAMA_TOKEN_CLASSIFICATION: McoreToHFWeightConverterDense, } @@ -236,33 +285,6 @@ def init_mcore_model( ) -def get_mcore_forward_fn(hf_config: PretrainedConfig) -> Callable: - """ - Get the forward function for given model architecture. - """ - assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" - model = get_supported_model(hf_config.architectures[0]) - return MODEL_FORWARD_REGISTRY[model] - - -def get_mcore_forward_no_padding_fn(hf_config: PretrainedConfig) -> Callable: - """ - Get the forward function for given model architecture. - """ - assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" - model = get_supported_model(hf_config.architectures[0]) - return MODEL_FORWARD_NOPAD_REGISTRY[model] - - -def get_mcore_forward_fused_fn(hf_config: PretrainedConfig) -> Callable: - """ - Get the forward function for given model architecture. - """ - assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" - model = get_supported_model(hf_config.architectures[0]) - return MODEL_FORWARD_FUSED_REGISTRY[model] - - def get_mcore_weight_converter(hf_config: PretrainedConfig, dtype: torch.dtype) -> Callable: """ Get the weight converter for given model architecture. diff --git a/verl/models/qwen2/megatron/modeling_qwen2_megatron.py b/verl/models/qwen2/megatron/modeling_qwen2_megatron.py index c536a0fba27..b3512f8afa5 100644 --- a/verl/models/qwen2/megatron/modeling_qwen2_megatron.py +++ b/verl/models/qwen2/megatron/modeling_qwen2_megatron.py @@ -583,7 +583,7 @@ def _init_head(self, config): def setup_embeddings_and_output_layer(self) -> None: """Sets up embedding layer in first stage and output layer in last stage. 
- This function initalizes word embeddings in the final stage when we are + This function initializes word embeddings in the final stage when we are using pipeline parallelism and sharing word embeddings, and sets up param attributes on the embedding and output layers. """ diff --git a/verl/models/transformers/monkey_patch.py b/verl/models/transformers/monkey_patch.py index 59b342f879f..c4ec20c1362 100644 --- a/verl/models/transformers/monkey_patch.py +++ b/verl/models/transformers/monkey_patch.py @@ -356,7 +356,11 @@ def state_dict(self, *args, **kwargs): Qwen3VLMoeTextModel, ) - from verl.models.transformers.qwen3_vl import forward_with_normal_backend, qwen3_vl_base_forward + from verl.models.transformers.qwen3_vl import ( + forward_with_normal_backend, + patch_qwen3_vl_moe_sparse_moe_block_forward, + qwen3_vl_base_forward, + ) Qwen3VLModel.forward = qwen3_vl_base_forward Qwen3VLMoeModel.forward = qwen3_vl_base_forward @@ -364,6 +368,10 @@ def state_dict(self, *args, **kwargs): Qwen3VLMoeForConditionalGeneration.forward = forward_with_normal_backend print(f"Monkey patch {model.__class__.__name__} model forward") + # Step 1.5: patch Qwen3VLMoeTextSparseMoeBlock to fix transformers 4.57.3 bug + if model.config.model_type == "qwen3_vl_moe" and is_transformers_version_in_range(max_version="4.57.3"): + patch_qwen3_vl_moe_sparse_moe_block_forward() + # Step 2: patch input for multimodal sequence parallelism if ulysses_sp_size > 1: patch_vlm_for_ulysses_input_slicing(Qwen3VLTextModel) diff --git a/verl/models/transformers/qwen3_vl.py b/verl/models/transformers/qwen3_vl.py index d0512172f46..38aa9cbfd4b 100644 --- a/verl/models/transformers/qwen3_vl.py +++ b/verl/models/transformers/qwen3_vl.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import functools import logging import os from dataclasses import dataclass @@ -334,3 +335,41 @@ def forward_with_triton_backend( entropy=entropy, hidden_states=outputs.hidden_states, ) + + +def patch_qwen3_vl_moe_sparse_moe_block_forward(): + """ + Monkey patch to fix a bug in transformers 4.57.3 where Qwen3VLMoeTextSparseMoeBlock.forward + incorrectly uses torch.zeros_like(hidden_states) instead of torch.zeros_like(router_logits) + when creating router_weights (line 148 in modeling_qwen3_vl_moe.py). + + This is a minimal fix that only changes the problematic line while keeping the rest of the + original implementation intact. + """ + try: + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextSparseMoeBlock + except ImportError: + # Model not available, skip patching + return + + # Store the original forward method for reference + original_forward = Qwen3VLMoeTextSparseMoeBlock.forward + + @functools.wraps(original_forward) + def patched_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size = hidden_states.shape[0] + hidden_states = hidden_states.reshape(-1, self.hidden_size) + router_logits = self.gate(hidden_states) + routing_weights = torch.nn.functional.softmax(router_logits, dim=-1, dtype=torch.float) + routing_weights, router_indices = torch.topk(routing_weights, self.top_k, dim=-1) + routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True) + # BUG FIX: Original code incorrectly uses hidden_states here, should use router_logits + routing_weights = routing_weights.to(router_logits.dtype) + router_weights = torch.zeros_like(router_logits).scatter_(1, router_indices, routing_weights) + hidden_states = hidden_states.reshape(batch_size, -1, self.hidden_size) + routed_out = self.experts(hidden_states, router_weights, router_indices) + return routed_out + + # Apply the patch + Qwen3VLMoeTextSparseMoeBlock.forward = patched_forward + logger.info("Monkey patched Qwen3VLMoeTextSparseMoeBlock.forward 
to fix router_weights bug") diff --git a/verl/models/weight_loader_registry.py b/verl/models/weight_loader_registry.py index 0904f14fad4..ee60ea71f0e 100644 --- a/verl/models/weight_loader_registry.py +++ b/verl/models/weight_loader_registry.py @@ -48,6 +48,7 @@ def get_weight_saver(arch: str): "Qwen3ForCausalLM": merge_megatron_ckpt_gptmodel, "Qwen3ForTokenClassification": merge_megatron_ckpt_gptmodel, "Qwen3MoeForCausalLM": merge_megatron_ckpt_gptmodel_qwen_moe, + "LlamaForTokenClassification": merge_megatron_ckpt_gptmodel, } if arch in _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY: return _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY[arch] diff --git a/verl/single_controller/base/decorator.py b/verl/single_controller/base/decorator.py index 1fa0496eaaa..cfcd793045d 100644 --- a/verl/single_controller/base/decorator.py +++ b/verl/single_controller/base/decorator.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import inspect from functools import partial, wraps from types import FunctionType @@ -20,7 +19,7 @@ from verl.protocol import DataProtoFuture, _padding_size_key from verl.utils.py_functional import DynamicEnum -from verl.utils.tensordict_utils import concat_tensordict +from verl.utils.tensordict_utils import chunk_tensordict, concat_tensordict from verl.utils.transferqueue_utils import BatchMeta # here we add a magic number of avoid user-defined function already have this attribute @@ -78,14 +77,20 @@ def _split_args_kwargs_data_proto(chunks, *args, **kwargs): splitted_args = [] for arg in args: assert isinstance(arg, DataProto | DataProtoFuture | BatchMeta | TensorDict) - chunked_arg = arg.chunk(chunks=chunks) + if isinstance(arg, TensorDict): + chunked_arg = chunk_tensordict(arg, chunks) + else: + chunked_arg = arg.chunk(chunks=chunks) assert len(chunked_arg) == chunks splitted_args.append(chunked_arg) splitted_kwargs = {} for key, val in kwargs.items(): assert isinstance(val, DataProto | DataProtoFuture | BatchMeta | TensorDict) - chunked_kwarg = val.chunk(chunks=chunks) + if isinstance(val, TensorDict): + chunked_kwarg = chunk_tensordict(val, chunks) + else: + chunked_kwarg = val.chunk(chunks=chunks) assert len(chunked_kwarg) == chunks splitted_kwargs[key] = chunked_kwarg diff --git a/verl/single_controller/ray/base.py b/verl/single_controller/ray/base.py index feb73a5430e..f1bdb553d5f 100644 --- a/verl/single_controller/ray/base.py +++ b/verl/single_controller/ray/base.py @@ -220,13 +220,15 @@ def split_resource_pool( else: start_bundle_idx_list = np.cumsum([0] + split_size_list[:-1]) + # ensure resource_pool.pgs has been initialized + placement_groups = resource_pool.get_placement_groups() split_resource_pools = [ SubRayResourcePool( process_on_nodes=resource_pool.store, use_gpu=resource_pool.use_gpu, name_prefix=f"{resource_pool.name_prefix}_split_{split_idx}", max_colocate_count=resource_pool.max_colocate_count, - placement_groups=resource_pool.pgs, + 
placement_groups=placement_groups, start_bundle_index=start_bundle_idx_list[split_idx], subgroup_world_size=split_size_list[split_idx], ) diff --git a/verl/third_party/sglang/__init__.py b/verl/third_party/sglang/__init__.py deleted file mode 100644 index 15593caaf36..00000000000 --- a/verl/third_party/sglang/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2023-2024 SGLang Team -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# Copyright 2024 Bytedance Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/verl/third_party/sglang/parallel_state.py b/verl/third_party/sglang/parallel_state.py deleted file mode 100644 index cdec743d13f..00000000000 --- a/verl/third_party/sglang/parallel_state.py +++ /dev/null @@ -1,328 +0,0 @@ -# Copyright 2024 Bytedance Ltd. and/or its affiliates -# Copyright 2023 The SGlang team. 
-# Adapted from -# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -"""Model and data parallel groups.""" - -import os -from typing import Optional - -import sglang.srt.distributed.parallel_state as ps -import torch -import torch.distributed -from sglang.srt.distributed.parallel_state import ( - get_pp_group, - get_world_group, - init_distributed_environment, - init_model_parallel_group, -) - -""" -This version is strongly tied with Megatron to implement HybridEngine and weight sharing between vllm and Megatron. -- We assume the Megatron tp+dp+pp world is already established before calling this function. - -""" - -# Device mesh for using DTensor -_DEVICE_MESH = None - -# Tensor model parallel group that the current rank belongs to. -_TP = None -# Pipeline model parallel group that the current rank belongs to. -_PP = None - - -# This method is for initializing the ParallelGroup when using HybridEngine -# NOTE(linjunrong): this function is for megatron -def initialize_parallel_state( - distributed_init_method: str = "env://", - backend: str = "nccl", - tensor_model_parallel_size: int = 1, - num_tp_per_train_tp: int = 1, - pipeline_model_parallel_size: int = 1, -): - # torch.distributed.all_reduce does not free the input tensor until - # the synchronization point. This causes the memory usage to grow - # as the number of all_reduce calls increases. This env var disables - # this behavior. - # Related issue: - # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573 - os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" - - # NOTE(sgm): Modify for verl, Env vars will be set by TORCHRUN. 
- rank = int(os.getenv("RANK", "-1")) - local_rank = int(os.getenv("LOCAL_RANK", "0")) - - # Use the world_size set by TORCHRUN - world_size = int(os.getenv("WORLD_SIZE", "-1")) - assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN" - init_distributed_environment(world_size, rank, distributed_init_method, local_rank, backend) - if torch.distributed.get_world_size() > 1: - # NOTE: build a separate inference group with infer tp & micro dp - initialize_model_parallel_for_sglang( - tensor_model_parallel_size=tensor_model_parallel_size, - num_tensor_model_parallel_groups_per_train_tp=num_tp_per_train_tp, - ) - else: - initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend) - - -# NOTE(linjunrong): After init SGLang rollout using class EngineFragment, user should always remember to call -# this function to sync the _TP, _PP define at the beginning of this file. Otherwise, only the conterparts -# inside sglang.srt.distributed are init as ProcessGroup, the symbols defined in this file remain as None. -# It could be weird to maintain two _TP and _PP, I follow the same way to maintain an extra ones for -# verl itself as how it was done in verl.third_party.vllm.parallel_state. Note that the process is a little -# bit different -def ensure_model_parallel_initialized( - tensor_model_parallel_size: int, - pipeline_model_parallel_size: int = 1, - backend: Optional[str] = None, -) -> None: - """Helper to initialize model parallel groups if they are not initialized, - or ensure tensor-parallel and pipeline-parallel sizes are equal to expected - values if the model parallel groups are initialized. 
- """ - # get the backend of _DEVICE_WORLD_GROUP - backend = backend or torch.distributed.get_backend(get_world_group().device_group) - if not model_parallel_is_initialized(): - initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend) - return - - assert get_tensor_model_parallel_world_size() == tensor_model_parallel_size, ( - f"tensor parallel group already initialized, but of unexpected size: " - f"{get_tensor_model_parallel_world_size()=} vs. {tensor_model_parallel_size=}" - ) - pp_world_size = get_pp_group().world_size - assert pp_world_size == pipeline_model_parallel_size, ( - f"pipeline parallel group already initialized, but of unexpected size: {pp_world_size=} vs. " - f"{pipeline_model_parallel_size=}" - ) - - -# TODO(sgm): deviate from the v0.5.4, not pp now -# NOTE(linjunrong): the SGLang version using _TP instead of ps._TP -def model_parallel_is_initialized(): - """Check if tensor and pipeline parallel groups are initialized.""" - return _TP is not None - # and _PIPELINE_MODEL_PARALLEL_GROUP is not None) - - -def initialize_model_parallel_for_sglang( - tensor_model_parallel_size: int, - num_tensor_model_parallel_groups_per_train_tp: int = 1, - pipeline_model_parallel_size: int = 1, -) -> None: - pass - - # Get world size and rank. Ensure some consistencies. - assert torch.distributed.is_initialized() - - assert isinstance(tensor_model_parallel_size, int) - - # assert num_tensor_model_parallel_groups_per_train_tp == 1 and not different_tp_group - # assert num_tensor_model_parallel_groups_per_train_tp > 1 and different_tp_group - - # Build the tensor model-parallel groups. 
- assert ps._TP is None, "tensor model parallel group is already initialized" - - global _TP - - world_size: int = torch.distributed.get_world_size() - - backend = torch.distributed.get_backend() - - num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size - - if num_tensor_model_parallel_groups_per_train_tp == 1: - # if tensor_model_parallel_size == train_tensor_parallel_size: - # using the same tp group as Megatron/vllm - assert _TP is None, "tensor model parallel group is already initialized" - group_ranks = [] - for i in range(num_tensor_model_parallel_groups): - ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size) - group_ranks.append(ranks) - _TP = init_model_parallel_group( - group_ranks=group_ranks, - local_rank=get_world_group().local_rank, - backend=backend, - use_custom_allreduce=False, # TODO: check why True is not work in Ray trainer - use_message_queue_broadcaster=True, - ) - ps._TP = _TP - # _MICRO_DATA_PARALLEL_GROUP is move to hybrid engine - else: - # initialize a micro_dp group and a tp group - # assume training tp=4, infer tp=2, then, weight is partitioned as - # [1], [2], [3], [4] for training and [1,2], [1,2], [3,4], [3,4] for inference - - # Build the inference tp groups - # train_tp = train_tensor_parallel_size - train_tp = num_tensor_model_parallel_groups_per_train_tp * tensor_model_parallel_size - # num_tensor_model_parallel_groups_per_train_tp = train_tp // tensor_model_parallel_size - assert _TP is None, "tensor model parallel group is already initialized" - group_ranks = [] - for i in range(num_tensor_model_parallel_groups // num_tensor_model_parallel_groups_per_train_tp): - start = train_tp * i - end = train_tp * (i + 1) - for j in range(num_tensor_model_parallel_groups_per_train_tp): - ranks = list(range(start, end, num_tensor_model_parallel_groups_per_train_tp)) - for i in range(len(ranks)): - ranks[i] += j - group_ranks.append(ranks) - _TP = init_model_parallel_group( - 
group_ranks=group_ranks, - local_rank=get_world_group().local_rank, - backend=backend, - use_custom_allreduce=False, # TODO: check why True is not work in Ray trainer - use_message_queue_broadcaster=True, - ) - ps._TP = _TP - - # Build the pipeline model-parallel groups. - # global _PIPELINE_MODEL_PARALLEL_GROUP - # global _PIPELINE_GLOBAL_RANKS - # assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, ("pipeline model parallel group is already initialized") - - # ps._PIPELINE_MODEL_PARALLEL_GROUP = mpu.get_pipeline_model_parallel_group() - # ps._PIPELINE_GLOBAL_RANKS = mpu.get_pipeline_model_parallel_ranks() - - # TODO: init using device mesh (not support hybrid engine now) - # Build the pipeline model-parallel groups. - num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size - global _PP - assert _PP is None, "pipeline model parallel group is already initialized" - group_ranks = [] - for i in range(num_pipeline_model_parallel_groups): - ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) - group_ranks.append(ranks) - # pipeline parallel does not need custom allreduce - _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, use_custom_allreduce=False) - ps._PP = _PP # for verl - - -def initialize_model_parallel( - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 1, - backend: Optional[str] = None, -) -> None: - """ - NOTE: This method is a hack from the open-sourced version without - asertion of world_size = tp * pp - - Initialize model parallel groups. - - Arguments: - tensor_model_parallel_size: number of GPUs used for tensor model - parallelism. - pipeline_model_parallel_size: number of GPUs used for pipeline model - parallelism. - - Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we - use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize - the model pipeline. 
The present function will - create 4 tensor model-parallel groups and 2 pipeline model-parallel groups: - 4 tensor model-parallel groups: - [g0, g1], [g2, g3], [g4, g5], [g6, g7] - 2 pipeline model-parallel groups: - [g0, g2, g4, g6], [g1, g3, g5, g7] - Note that for efficiency, the caller should make sure adjacent ranks - are on the same DGX box. For example if we are using 2 DGX-1 boxes - with a total of 16 GPUs, rank 0 to 7 belong to the first box and - ranks 8 to 15 belong to the second box. - """ - # Get world size and rank. Ensure some consistencies. - assert torch.distributed.is_initialized() - world_size: int = torch.distributed.get_world_size() - backend = backend or torch.distributed.get_backend(ps.get_world_group().device_group) - - # NOTE(sgm) we don't assert world_size == tp * pp - # DP is not managed by vllm but by the VeRL WorkerGroup - # if (world_size != - # tensor_model_parallel_size * pipeline_model_parallel_size): - # raise RuntimeError( - # f"world_size ({world_size}) is not equal to " - # f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " - # f"pipeline_model_parallel_size ({pipeline_model_parallel_size})") - - num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size - - global _TP - assert _TP is None, "tensor model parallel group is already initialized" - group_ranks = [] - for i in range(num_tensor_model_parallel_groups): - ranks = list(range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)) - group_ranks.append(ranks) - - # message queue broadcaster is only used in tensor model parallel group - if ps._TP is not None: - _TP = ps._TP - else: - _TP = init_model_parallel_group( - group_ranks, - get_world_group().local_rank, - backend, - use_custom_allreduce=False, # TODO: check why True is not work in Ray trainer - use_message_queue_broadcaster=True, - ) - ps._TP = _TP - - # TODO: init using device mesh (not support hybrid engine now) - # Build the pipeline model-parallel groups. 
- num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size - global _PP - assert _PP is None, "pipeline model parallel group is already initialized" - group_ranks = [] - for i in range(num_pipeline_model_parallel_groups): - ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) - group_ranks.append(ranks) - # pipeline parallel does not need custom allreduce - if ps._TP is not None: - _PP = ps._TP - else: - _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, use_custom_allreduce=False) - ps._PP = _PP - - -""" -Device mesh utilities -""" - - -def get_device_mesh(): - assert _DEVICE_MESH is not None, "device mesh is not initialized" - return _DEVICE_MESH - - -""" -Tensor model parallel utilities -""" - - -# NOTE(linjunrong): In the vllm version parallel_state.py. verl created its own _TP and _PP as verl want to use -# the process group for some extra purpose. Under the hood, there is no difference between them and the original -# one in vllm.distributed.parallel_state. However, the implementation need to hack the init process of inference -# engine, as we do not maintain another SGLang here, I just use the original _TP and _PP directly. 
-def get_tensor_model_parallel_group(): - """Get the tensor model parallel group the caller rank belongs to.""" - - assert _TP is not None, "tensor model parallel group is not initialized" - return _TP.device_group - - -def get_tensor_model_parallel_world_size(): - """Return world size for the tensor model parallel group.""" - return torch.distributed.get_world_size(group=get_tensor_model_parallel_group()) - - -def get_tensor_model_parallel_rank(): - """Return my rank for the tensor model parallel group.""" - return torch.distributed.get_rank(group=get_tensor_model_parallel_group()) - - -def get_tensor_model_parallel_src_rank(): - """Calculate the global rank corresponding to the first local rank - in the tensor model parallel group.""" - global_rank = torch.distributed.get_rank() - local_world_size = get_tensor_model_parallel_world_size() - return (global_rank // local_world_size) * local_world_size diff --git a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml index b40d462d4f6..a117c0f332f 100644 --- a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml @@ -52,7 +52,7 @@ actor_rollout_ref: recompute_num_layers: null attention_backend: flash override_mcore_model_config: {} - use_mbridge: false + use_mbridge: true vanilla_mbridge: true use_remove_padding: true forward_only: false @@ -88,6 +88,7 @@ actor_rollout_ref: kl_loss_type: low_var_kl ppo_epochs: 1 shuffle: false + data_loader_seed: 42 checkpoint: _target_: verl.trainer.config.CheckpointConfig save_contents: @@ -127,7 +128,6 @@ actor_rollout_ref: mode: disabled record_file: null replay_file: null - data_loader_seed: 42 load_weight: true ref: rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} @@ -433,7 +433,7 @@ critic: recompute_num_layers: null attention_backend: flash override_mcore_model_config: {} - use_mbridge: false + use_mbridge: true vanilla_mbridge: true 
use_remove_padding: true forward_only: false @@ -480,6 +480,7 @@ critic: forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: ${oc.select:actor_rollout_ref.actor.data_loader_seed,null} cliprange_value: 0.5 loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} checkpoint: @@ -517,11 +518,10 @@ critic: stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} nccl_timeout: 600 load_weight: true - data_loader_seed: ${oc.select:actor_rollout_ref.actor.data_loader_seed,null} reward_model: enable: false enable_resource_pool: false - n_gpus_per_node: 0 + n_gpus_per_node: 8 nnodes: 0 strategy: megatron model: @@ -571,6 +571,31 @@ reward_model: use_remove_padding: ${oc.select:actor_rollout_ref.actor.megatron.use_remove_padding,True} dtype: bfloat16 load_weight: true + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 algorithm: rollout_correction: rollout_is: null @@ -580,7 +605,7 @@ algorithm: rollout_rs_threshold_lower: null rollout_token_veto_threshold: null bypass_mode: false - use_policy_gradient: false + loss_type: ppo_clip rollout_is_batch_normalize: false _target_: verl.trainer.config.AlgoConfig gamma: 1.0 diff --git a/verl/trainer/config/_generated_ppo_trainer.yaml b/verl/trainer/config/_generated_ppo_trainer.yaml index d37965dbc50..833ebb70d5b 100644 --- a/verl/trainer/config/_generated_ppo_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_trainer.yaml @@ -75,6 +75,7 @@ actor_rollout_ref: kl_loss_type: low_var_kl ppo_epochs: 1 shuffle: false + data_loader_seed: 42 checkpoint: _target_: verl.trainer.config.CheckpointConfig save_contents: @@ -287,7 +288,7 @@ actor_rollout_ref: override_config: {} enable_gradient_checkpointing: true enable_activation_offload: false - use_remove_padding: false + use_remove_padding: true lora_rank: 0 lora_alpha: 16 target_modules: all-linear @@ -413,6 +414,7 @@ critic: forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 cliprange_value: 0.5 loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} checkpoint: @@ -455,7 +457,7 @@ critic: reward_model: enable: false enable_resource_pool: false - n_gpus_per_node: 0 + n_gpus_per_node: 8 nnodes: 0 strategy: fsdp model: @@ 
-495,6 +497,31 @@ reward_model: save_path: ${oc.select:global_profiler.save_path,null} tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 algorithm: rollout_correction: rollout_is: null @@ -504,7 +531,7 @@ algorithm: rollout_rs_threshold_lower: null rollout_token_veto_threshold: null bypass_mode: false - use_policy_gradient: false + loss_type: ppo_clip rollout_is_batch_normalize: false _target_: verl.trainer.config.AlgoConfig gamma: 1.0 diff --git a/verl/trainer/config/actor/actor.yaml b/verl/trainer/config/actor/actor.yaml index f5f1d15eee5..283095a1527 100644 --- a/verl/trainer/config/actor/actor.yaml +++ b/verl/trainer/config/actor/actor.yaml @@ -103,6 +103,9 @@ ppo_epochs: 1 # Shuffle training data across PPO epochs shuffle: false +# The seed used to construct mini-batch +data_loader_seed: 42 + # checkpoint configs checkpoint: diff --git a/verl/trainer/config/actor/megatron_actor.yaml b/verl/trainer/config/actor/megatron_actor.yaml index a632fe4380b..fde70c363c4 100644 --- a/verl/trainer/config/actor/megatron_actor.yaml +++ b/verl/trainer/config/actor/megatron_actor.yaml @@ -15,6 +15,4 @@ _target_: verl.workers.config.McoreActorConfig strategy: megatron -data_loader_seed: 42 - load_weight: True diff --git a/verl/trainer/config/algorithm.py b/verl/trainer/config/algorithm.py index 
a7c86da0297..a40973e669f 100644 --- a/verl/trainer/config/algorithm.py +++ b/verl/trainer/config/algorithm.py @@ -114,14 +114,17 @@ class RolloutCorrectionConfig(BaseConfig): bypass_mode (bool): Operating mode - bypass or decoupled. - True: Bypass mode - reuse rollout_log_prob as old_log_prob (2 policies) + Uses compute_policy_loss_bypass_mode() with loss_type selection - False: Decoupled mode - compute old_log_prob separately (3 policies) + Uses standard PPO loss with IS weight correction Default: False (decoupled mode) - use_policy_gradient (bool): Loss function type. - - Requires bypass_mode=True - - True: Policy gradient loss (no PPO clipping) - - False: PPO loss (with clipping) - Default: False (PPO loss) + loss_type (str): Loss function type in bypass mode (bypass_mode=True). + - "reinforce": REINFORCE-style policy gradient with explicit IS weights + L = -E[w * log π(a|s) * A] where w = π_current / π_rollout + - "ppo_clip": PPO clipped objective (IS handled by ratio, no explicit weights) + L = -E[min(r*A, clip(r)*A)] where r = π_current / π_rollout + Default: "ppo_clip" rollout_is_batch_normalize (bool): Apply batch normalization to IS weights. 
- True: Normalize IS weights to have mean=1.0 within each batch @@ -142,15 +145,15 @@ class RolloutCorrectionConfig(BaseConfig): config = RolloutCorrectionConfig.decoupled_geo_rs() # Geo-RS config = RolloutCorrectionConfig.geo_rs_seq_tis() # Geo-RS-Seq-TIS - # Bypass PPO mode (2 policies: π_rollout = π_old, π_θ) - # No IS correction needed since π_old = π_rollout - config = RolloutCorrectionConfig.ppo_is_bypass() # PPO with rollout as anchor - - # Bypass PG mode presets (2 policies, no PPO clipping) - # IS weights computed on-the-fly as π_θ / π_rollout - config = RolloutCorrectionConfig.pg_is() # Seq-TIS + PG - config = RolloutCorrectionConfig.pg_rs() # Geo-RS + PG - config = RolloutCorrectionConfig.pg_geo_rs_seq_tis() # Geo-RS-Seq-TIS + PG + # Bypass mode presets (2 policies: π_rollout = π_old, π_θ) + # loss_type controls the loss function + # PPO-clip presets (ratio handles IS, so no separate IS weights needed): + config = RolloutCorrectionConfig.bypass_ppo_clip() # PPO-clip only + config = RolloutCorrectionConfig.bypass_ppo_clip_geo_rs() # PPO-clip + Geo-RS + # REINFORCE presets (explicit IS weights): + config = RolloutCorrectionConfig.bypass_pg_is() # REINFORCE + Seq-TIS + config = RolloutCorrectionConfig.bypass_pg_rs() # REINFORCE + Geo-RS + config = RolloutCorrectionConfig.bypass_pg_geo_rs_seq_tis() # REINFORCE + Geo-RS + Seq-TIS Reference: Liu, Li, Fu, Wang, Liu, Shen (2025) @@ -165,7 +168,7 @@ class RolloutCorrectionConfig(BaseConfig): rollout_rs_threshold_lower: Optional[float] = None rollout_token_veto_threshold: Optional[float] = None bypass_mode: bool = False - use_policy_gradient: bool = False + loss_type: str = "ppo_clip" rollout_is_batch_normalize: bool = False @classmethod @@ -256,57 +259,89 @@ def decoupled_geo_rs( ) @classmethod - def ppo_is_bypass(cls, threshold: float = 2.0) -> "RolloutCorrectionConfig": - """PPO with IS Correction in Bypass Mode. + def bypass_ppo_clip(cls) -> "RolloutCorrectionConfig": + """Bypass mode with PPO-clip loss. 
- Skips old_log_prob computation by reusing rollout_log_prob. - PPO clips against rollout policy instead of true old policy. + PPO clipped objective in bypass mode. The PPO ratio = π_θ/π_rollout + already handles IS correction, so no explicit IS weights are applied. - Args: - threshold (float): Upper threshold for IS weights. Default: 2.0 + Skips old_log_prob computation for faster execution (2 policies instead of 3). Returns: - RolloutCorrectionConfig configured for PPO_IS bypass mode + RolloutCorrectionConfig configured for bypass mode with PPO-clip """ return cls( - rollout_is="token", - rollout_is_threshold=threshold, + rollout_is=None, rollout_rs=None, bypass_mode=True, - use_policy_gradient=False, + loss_type="ppo_clip", + ) + + @classmethod + def bypass_ppo_clip_geo_rs( + cls, + rs_threshold: float = 1.001, + rs_threshold_lower: Optional[float] = None, + veto_threshold: float = 1e-4, + ) -> "RolloutCorrectionConfig": + """Bypass mode with PPO-clip loss and Geometric Rejection Sampling. + + PPO clipped objective in bypass mode with geometric RS to mask outliers. + The PPO ratio = π_θ/π_rollout already handles IS correction. + + Skips old_log_prob computation for faster execution (2 policies instead of 3). + Solves the "Length Trap" problem for CoT/agent workloads. + + Args: + rs_threshold (float): Geometric RS threshold (upper). Default: 1.001 (±0.1%) + rs_threshold_lower (Optional[float]): Geometric RS threshold (lower). + If None, auto-computed as reciprocal of rs_threshold. Default: None + veto_threshold (float): Per-token veto threshold. 
Default: 1e-4 + + Returns: + RolloutCorrectionConfig configured for bypass mode with PPO-clip + Geo-RS + """ + return cls( + rollout_is=None, + rollout_rs="geometric", + rollout_rs_threshold=rs_threshold, + rollout_rs_threshold_lower=rs_threshold_lower, + rollout_token_veto_threshold=veto_threshold, + bypass_mode=True, + loss_type="ppo_clip", ) @classmethod - def pg_is(cls, threshold: float = 2.0) -> "RolloutCorrectionConfig": - """Policy Gradient with IS Correction. + def bypass_pg_is(cls, threshold: float = 2.0) -> "RolloutCorrectionConfig": + """Bypass mode with REINFORCE loss and IS Correction. - Uses policy gradient loss with explicit IS correction. + Uses REINFORCE loss with explicit IS correction in bypass mode. No PPO clipping. Args: threshold (float): Upper threshold for IS weights. Default: 2.0 Returns: - RolloutCorrectionConfig configured for PG with IS + RolloutCorrectionConfig configured for bypass mode with REINFORCE + IS """ return cls( rollout_is="sequence", rollout_is_threshold=threshold, rollout_rs=None, bypass_mode=True, - use_policy_gradient=True, + loss_type="reinforce", ) @classmethod - def pg_rs( + def bypass_pg_rs( cls, rs_threshold: float = 1.001, rs_threshold_lower: Optional[float] = None, veto_threshold: float = 1e-4, ) -> "RolloutCorrectionConfig": - """Policy Gradient with Rejection Sampling (Geo-RS). + """Bypass mode with REINFORCE loss and Geometric Rejection Sampling. - Policy gradient with geometric rejection sampling (no IS weights) in bypass mode. + REINFORCE with geometric rejection sampling (no IS weights) in bypass mode. Skips old_log_prob computation for faster execution. Solves the "Length Trap" problem where standard IS estimators penalize long sequences. @@ -319,7 +354,7 @@ def pg_rs( veto_threshold (float): Per-token veto threshold. 
Default: 1e-4 Returns: - RolloutCorrectionConfig configured for PG with Geo-RS + RolloutCorrectionConfig configured for bypass mode with REINFORCE + Geo-RS """ return cls( rollout_is=None, @@ -328,7 +363,7 @@ def pg_rs( rollout_rs_threshold_lower=rs_threshold_lower, rollout_token_veto_threshold=veto_threshold, bypass_mode=True, - use_policy_gradient=True, + loss_type="reinforce", ) @classmethod @@ -367,17 +402,17 @@ def geo_rs_seq_tis( ) @classmethod - def pg_geo_rs_seq_tis( + def bypass_pg_geo_rs_seq_tis( cls, is_threshold: float = 2.0, rs_threshold: float = 1.001, rs_threshold_lower: Optional[float] = None, veto_threshold: Optional[float] = 1e-4, ) -> "RolloutCorrectionConfig": - """Policy Gradient with Geo-RS-Seq-TIS (Bypass mode). + """Bypass mode with REINFORCE loss, Geo-RS, and Sequence-level IS. Combines geometric rejection with sequence-level IS - in bypass mode with policy gradient loss (no PPO clipping). + in bypass mode with REINFORCE loss (no PPO clipping). Suitable for reasoning models (CoT, o1-style) and agents when you want bypass mode efficiency. @@ -390,7 +425,7 @@ def pg_geo_rs_seq_tis( veto_threshold (Optional[float]): Per-token veto threshold. 
Default: 1e-4 Returns: - RolloutCorrectionConfig configured for PG with Geo-RS-Seq-TIS + RolloutCorrectionConfig configured for bypass mode with REINFORCE + Geo-RS + Seq-TIS """ return cls( rollout_is="sequence", @@ -400,7 +435,7 @@ def pg_geo_rs_seq_tis( rollout_rs_threshold_lower=rs_threshold_lower, rollout_token_veto_threshold=veto_threshold, bypass_mode=True, - use_policy_gradient=True, + loss_type="reinforce", ) @classmethod @@ -440,9 +475,9 @@ class AlgoConfig(BaseConfig): - RolloutCorrectionConfig.decoupled_seq_is() - Decoupled mode with sequence-level IS - RolloutCorrectionConfig.decoupled_seq_is_rs() - Decoupled mode with sequence IS + RS - RolloutCorrectionConfig.decoupled_geo_rs() - Decoupled mode with geometric RS + veto - - RolloutCorrectionConfig.ppo_is_bypass() - Bypass mode (skips old_log_prob) - - RolloutCorrectionConfig.pg_is() - Policy gradient with IS - - RolloutCorrectionConfig.pg_rs() - Policy gradient with RS + - RolloutCorrectionConfig.bypass_ppo_clip() - Bypass mode with PPO-clip + - RolloutCorrectionConfig.bypass_pg_is() - Bypass mode with REINFORCE + IS + - RolloutCorrectionConfig.bypass_pg_rs() - Bypass mode with REINFORCE + RS For backward compatibility, you can still pass a dict, which will be converted to RolloutCorrectionConfig automatically. diff --git a/verl/trainer/config/algorithm/rollout_correction.yaml b/verl/trainer/config/algorithm/rollout_correction.yaml index 7c958c5ee76..cfd74316f4f 100644 --- a/verl/trainer/config/algorithm/rollout_correction.yaml +++ b/verl/trainer/config/algorithm/rollout_correction.yaml @@ -1,6 +1,6 @@ # Rollout Correction: corrects off-policy distribution shifts # See documentation: docs/algo/rollout_corr.md -# Use presets: RolloutCorrectionConfig.decoupled_seq_is(), .pg_is(), etc. +# Use presets: RolloutCorrectionConfig.decoupled_seq_is(), .bypass_pg_is(), etc. 
# IS aggregation level: null (disabled), "token" (per-token), "sequence" (per-sequence) rollout_is: null @@ -23,8 +23,10 @@ rollout_token_veto_threshold: null # Operating mode: false = Decoupled (3 policies), true = Bypass (2 policies) bypass_mode: false -# Loss function: false = PPO with clipping, true = Policy gradient (no clipping) -use_policy_gradient: false +# Loss type in bypass mode (bypass_mode=true): +# - "ppo_clip": PPO clipped objective (IS handled by ratio, default) +# - "reinforce": REINFORCE with explicit IS weights (no PPO clipping) +loss_type: ppo_clip # Batch normalize IS weights: false = raw weights, true = normalize to mean=1.0 rollout_is_batch_normalize: false diff --git a/verl/trainer/config/critic/critic.yaml b/verl/trainer/config/critic/critic.yaml index f201a34b40c..95cbeaf92bc 100644 --- a/verl/trainer/config/critic/critic.yaml +++ b/verl/trainer/config/critic/critic.yaml @@ -73,6 +73,9 @@ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} # Shuffle training data across PPO epochs shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} +# The seed used to construct mini-batch +data_loader_seed: 42 + # PPO value function clipping range cliprange_value: 0.5 diff --git a/verl/trainer/config/engine/megatron.yaml b/verl/trainer/config/engine/megatron.yaml index 84601f5a3f5..b588a96c1b3 100644 --- a/verl/trainer/config/engine/megatron.yaml +++ b/verl/trainer/config/engine/megatron.yaml @@ -75,7 +75,7 @@ override_transformer_config: override_mcore_model_config: {} # oc.select: default val for ref.megatron.use_mbridge -use_mbridge: False +use_mbridge: True # oc.select: default val for ref.megatron.vanilla_mbridge vanilla_mbridge: True diff --git a/verl/trainer/config/model/hf_model.yaml b/verl/trainer/config/model/hf_model.yaml index 6d02b8eac89..1aefa6984ea 100644 --- a/verl/trainer/config/model/hf_model.yaml +++ b/verl/trainer/config/model/hf_model.yaml @@ -37,7 +37,7 @@ enable_gradient_checkpointing: True 
enable_activation_offload: False # whether to use remove padding. Only valid when we use hf model definition -use_remove_padding: False +use_remove_padding: True # Set to positive value to enable LoRA (e.g., 32) lora_rank: 0 diff --git a/verl/trainer/config/ppo_megatron_trainer.yaml b/verl/trainer/config/ppo_megatron_trainer.yaml index 9d9959aeabd..5050d3d8890 100644 --- a/verl/trainer/config/ppo_megatron_trainer.yaml +++ b/verl/trainer/config/ppo_megatron_trainer.yaml @@ -18,7 +18,7 @@ defaults: # Critic model config. - critic@critic: megatron_critic # Reward model config. - - reward_model@reward_model: megatron_reward_model + - reward_model@reward_model: megatron_reward_loop # Rollout correction config. - algorithm@algorithm.rollout_correction: rollout_correction - _self_ diff --git a/verl/trainer/config/ppo_trainer.yaml b/verl/trainer/config/ppo_trainer.yaml index c226d2d06d9..7489b522fa2 100644 --- a/verl/trainer/config/ppo_trainer.yaml +++ b/verl/trainer/config/ppo_trainer.yaml @@ -31,7 +31,7 @@ defaults: - critic@critic: dp_critic # Reward model config. - - reward_model@reward_model: dp_reward_model + - reward_model@reward_model: dp_reward_loop # Rollout correction config. - algorithm@algorithm.rollout_correction: rollout_correction diff --git a/verl/trainer/config/reward_model/dp_reward_loop.yaml b/verl/trainer/config/reward_model/dp_reward_loop.yaml new file mode 100644 index 00000000000..04fb106df1c --- /dev/null +++ b/verl/trainer/config/reward_model/dp_reward_loop.yaml @@ -0,0 +1,43 @@ +defaults: + - dp_reward_model + - _self_ + +use_reward_loop: True +reward_manager: naive +enable: False + +# Whether to deploy the model to a separate resource pool. +enable_resource_pool: False +n_gpus_per_node: 8 +num_workers: 1 +nnodes: 0 + +model: + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: False + +rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + + prompt_length: 2048 + response_length: 2048 \ No newline at end of file diff --git a/verl/trainer/config/reward_model/megatron_reward_loop.yaml b/verl/trainer/config/reward_model/megatron_reward_loop.yaml new file mode 100644 index 00000000000..f99b94abcc4 --- /dev/null +++ b/verl/trainer/config/reward_model/megatron_reward_loop.yaml @@ -0,0 +1,43 @@ +defaults: + - megatron_reward_model + - _self_ + +use_reward_loop: True +reward_manager: naive +enable: False + +# Whether to deploy the model to a separate resource pool. +enable_resource_pool: False +n_gpus_per_node: 8 +num_workers: 1 +nnodes: 0 + +model: + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: False + +rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + + prompt_length: 2048 + response_length: 2048 \ No newline at end of file diff --git a/verl/trainer/config/sft_trainer_engine.yaml b/verl/trainer/config/sft_trainer_engine.yaml index dd70640353b..f11b3bf8f1f 100644 --- a/verl/trainer/config/sft_trainer_engine.yaml +++ b/verl/trainer/config/sft_trainer_engine.yaml @@ -36,6 +36,13 @@ data: use_shm: False apply_chat_template_kwargs: {} + # MultiTurnSFTDataset apply_chat_template to each turn separately and concat `input_ids` + # as a whole sequence, which may not equal to apply_chat_template to whole messages at once. + # For example, Qwen Thinking series models add tags to last turn, please check + # your tokenizer chat template settings. + # Set to True to ignore input_ids mismatch and use the concatenated input_ids as the final input_ids. 
+ ignore_input_ids_mismatch: False + # Checkpoint configuration checkpoint: _target_: verl.trainer.config.CheckpointConfig diff --git a/verl/trainer/fsdp_sft_trainer.py b/verl/trainer/fsdp_sft_trainer.py index 4088fafaf03..27ff801b362 100644 --- a/verl/trainer/fsdp_sft_trainer.py +++ b/verl/trainer/fsdp_sft_trainer.py @@ -49,7 +49,13 @@ from verl.utils.checkpoint.fsdp_checkpoint_manager import FSDPCheckpointManager from verl.utils.dataset import SFTDataset from verl.utils.dataset.multiturn_sft_dataset import MultiTurnSFTDataset -from verl.utils.device import get_device_id, get_device_name, is_cuda_available, is_npu_available +from verl.utils.device import ( + auto_set_ascend_device_name, + get_device_id, + get_device_name, + is_cuda_available, + is_npu_available, +) from verl.utils.distributed import destroy_global_process_group, initialize_global_process_group from verl.utils.fs import copy_to_local from verl.utils.fsdp_utils import ( @@ -132,6 +138,7 @@ def __init__( if self.device_mesh.get_rank() == 0: print(self.config) + self.device_name = self.config.trainer.device def _normalize_config_bsz(self): @@ -835,6 +842,9 @@ def run_sft(config): @hydra.main(config_path="config", config_name="sft_trainer", version_base=None) def main(config): + # Automatically set `config.trainer.device = npu` when running on Ascend NPU. 
+ auto_set_ascend_device_name(config) + run_sft(config) diff --git a/verl/trainer/main_generation.py b/verl/trainer/main_generation.py index 791c17af7ef..18aaa8cdbd0 100644 --- a/verl/trainer/main_generation.py +++ b/verl/trainer/main_generation.py @@ -84,6 +84,7 @@ def main_task(config): ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(ActorRolloutRefWorker), config=config, role="rollout") resource_pool = RayResourcePool(process_on_nodes=[config.trainer.n_gpus_per_node] * config.trainer.nnodes) + wg = RayWorkerGroup( resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init, diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py index f738c98150f..0f3935b5bfb 100644 --- a/verl/trainer/main_ppo.py +++ b/verl/trainer/main_ppo.py @@ -28,7 +28,7 @@ from verl.trainer.ppo.reward import load_reward_manager from verl.trainer.ppo.utils import need_critic, need_reference_policy from verl.utils.config import validate_config -from verl.utils.device import is_cuda_available +from verl.utils.device import auto_set_ascend_device_name, is_cuda_available from verl.utils.import_utils import load_extern_object @@ -39,6 +39,9 @@ def main(config): Args: config_dict: Hydra configuration dictionary containing training parameters. """ + # Automatically set `config.trainer.device = npu` when running on Ascend NPU. + auto_set_ascend_device_name(config) + run_ppo(config) @@ -175,18 +178,21 @@ def add_actor_rollout_worker(self, config): def add_critic_worker(self, config): """Add critic worker to role mapping.""" + use_legacy_worker_impl = config.trainer.get("use_legacy_worker_impl", "auto") if config.critic.strategy in {"fsdp", "fsdp2"}: - use_legacy_worker_impl = config.trainer.get("use_legacy_worker_impl", "auto") if use_legacy_worker_impl in ["auto", "enable"]: from verl.workers.fsdp_workers import CriticWorker elif use_legacy_worker_impl == "disable": - from verl.workers.engine_workers import CriticWorker + # we don't need to specialize critic worker. 
Just use TrainingWorker
+                from verl.workers.engine_workers import TrainingWorker
+
+                CriticWorker = TrainingWorker
                 print("Using new worker implementation")
             else:
                 raise ValueError(f"Invalid use_legacy_worker_impl: {use_legacy_worker_impl}")
         elif config.critic.strategy == "megatron":
+            # TODO: switch this to TrainingWorker as well
             from verl.workers.megatron_workers import CriticWorker
         else:
diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py
index dc30adfc343..7849bfbae9f 100644
--- a/verl/trainer/ppo/core_algos.py
+++ b/verl/trainer/ppo/core_algos.py
@@ -781,15 +781,21 @@ def agg_loss(
     """
     Aggregate the loss across global batch to ensure the loss is invariant to fsdp/megatron parallelism.
 
+    NOTE: ``dp_size``, ``batch_num_tokens``, and ``global_batch_size`` are only compatible with the new model engine
+    for now, while the legacy model engines conduct the aggregation outside ``agg_loss``.
+
     NOTE: The returned loss has different behaviors for different backend:
     - FSDP: the loss is directly used for backward.
     - Megatron: the loss should be scaled by `num_microbatches` and `cp_size` for pp schedule.
 
+    # TODO: Consider the numerical stability?
+
     Args:
         loss_mat: micro batch loss matrix, (bs, response_length)
         loss_mask: micro batch loss mask, (bs, response_length)
         loss_agg_mode: method to aggregate the loss matrix into a scalar
-        dp_size: data parallel size
+        dp_size: data parallel size. When applying manual aggregation,
+            scaling up the ``loss`` by ``dp_size`` can cancel out FSDP averaging.
         batch_num_tokens: number of valid tokens in global batch
         global_batch_size: global batch size
         loss_scale_factor: scale factor for "seq-mean-token-sum-norm" mode. If None, uses loss_mask.shape[-1].
@@ -799,30 +805,39 @@ def agg_loss(
         loss: `a scalar torch.Tensor` aggregated loss
     """
 
+    # NOTE: `masked_sum` is more robust than multiplying the `mask`.
if loss_agg_mode == "token-mean":
         if batch_num_tokens is None:
             batch_num_tokens = loss_mask.sum()
         loss = verl_F.masked_sum(loss_mat, loss_mask) / batch_num_tokens * dp_size
-    elif loss_agg_mode == "seq-mean-token-sum":
-        seq_losses = torch.sum(loss_mat * loss_mask, dim=-1)  # token-sum
-        seq_mask = (torch.sum(loss_mask, dim=-1) > 0).float()  # exclude fully masked sequences
-        if global_batch_size is None:
-            global_batch_size = seq_mask.sum()
-        loss = verl_F.masked_sum(seq_losses, seq_mask) / global_batch_size * dp_size  # seq-mean
-    elif loss_agg_mode == "seq-mean-token-mean":
-        seq_mask = torch.sum(loss_mask, dim=-1)  # per-sequence token count
-        seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) / (seq_mask + 1e-8)  # token-mean
-        seq_mask = (seq_mask > 0).float()  # exclude fully masked sequences
-        if global_batch_size is None:
-            global_batch_size = seq_mask.sum()
-        loss = verl_F.masked_sum(seq_losses, seq_mask) / global_batch_size * dp_size  # seq-mean
-    elif loss_agg_mode == "seq-mean-token-sum-norm":
-        seq_losses = torch.sum(loss_mat * loss_mask, dim=-1)
-        if loss_scale_factor is None:
-            loss_scale_factor = loss_mask.shape[-1]
-        loss = torch.sum(seq_losses) / loss_scale_factor
+    elif loss_agg_mode.startswith("seq-mean"):
+        # TODO: Correct and unify the denominator logic.
+        if global_batch_size is not None:
+            seq_denominator = global_batch_size * dp_size
+        else:  # The default logic which is only correct when the batch sizes are even.
+            local_bsz = loss_mat.shape[0]
+            seq_denominator = local_bsz
+
+        if loss_agg_mode.startswith("seq-mean-token-sum"):
+            seq_losses = verl_F.masked_sum(loss_mat, loss_mask, axis=-1)  # token-sum per sequence
+
+            if loss_agg_mode == "seq-mean-token-sum":
+                pass  # TODO: Add assertion.
+ elif loss_agg_mode == "seq-mean-token-sum-norm": + if loss_scale_factor is None: + loss_scale_factor = loss_mask.shape[-1] + seq_losses = seq_losses / loss_scale_factor + else: + raise ValueError(f"Invalid {loss_agg_mode=}") + elif loss_agg_mode == "seq-mean-token-mean": + token_counts = torch.sum(loss_mask, dim=-1) # per-sequence token count + # token-mean per sequence + seq_losses = verl_F.masked_sum(loss_mat, loss_mask, axis=-1) / (token_counts + 1e-8) + else: + raise ValueError(f"Invalid {loss_agg_mode=}") + loss = torch.sum(seq_losses) / seq_denominator # seq-mean else: - raise ValueError(f"Invalid loss_agg_mode: {loss_agg_mode}") + raise ValueError(f"Invalid {loss_agg_mode=}") return loss @@ -1582,115 +1597,60 @@ def compute_weights(scores: torch.Tensor, reweight_method: str, weight_pow: floa return resampled_data -def compute_policy_loss_with_rollout_correction( - rollout_log_prob, - log_prob, - advantages, - eos_mask, - loss_agg_mode="seq-mean-token-sum", +def compute_policy_loss_reinforce( + rollout_log_prob: torch.Tensor, + log_prob: torch.Tensor, + advantages: torch.Tensor, + response_mask: torch.Tensor, + loss_agg_mode: str = "seq-mean-token-sum", config: Optional[ActorConfig] = None, - loss_scale_factor=1.0, - rollout_is: Optional[str] = None, - rollout_is_threshold: float = 2.0, - rollout_rs: Optional[str] = None, - rollout_rs_threshold: Optional[float] = None, - rollout_rs_threshold_lower: Optional[float] = None, - rollout_token_veto_threshold: Optional[float] = None, - rollout_is_batch_normalize: bool = False, -): - """Compute policy loss with pure rollout correction (no PPO clipping). + rollout_is_weights: Optional[torch.Tensor] = None, +) -> tuple[torch.Tensor, dict[str, Any]]: + """Compute REINFORCE-style policy gradient loss with optional IS correction. - This function implements policy gradient with importance sampling correction - for rollout-training policy mismatch, without PPO's clipping mechanism. 
+ This function implements policy gradient (REINFORCE) with optional importance + sampling correction for rollout-training policy mismatch. Mathematical formulation: - Without IS (rollout_is=None): + Without IS (rollout_is_weights=None): L = -E[log π(a|s) * A(s,a)] Gradient: ∇_θ L = -E[∇log π(a|s) * A] (standard REINFORCE) - With IS (rollout_is enabled): + With IS (rollout_is_weights provided): L = -E_π_rollout[w * log π(a|s) * A(s,a)] where w = π_current / π_rollout (truncated IS weight) Gradient: ∇_θ L = -E[w * ∇log π(a|s) * A] (IS-corrected policy gradient) Args: rollout_log_prob: Log probabilities from rollout policy (e.g., vLLM BF16). - Shape: (batch_size, seq_length) + Shape: (batch_size, seq_length). Used for KL computation. log_prob: Log probabilities from current training policy. Shape: (batch_size, seq_length) advantages: Advantage estimates for each token. Shape: (batch_size, seq_length) - eos_mask: Mask indicating valid tokens (1 for valid, 0 for padding). - Shape: (batch_size, seq_length) + response_mask: Mask indicating valid tokens (1 for valid, 0 for padding). + Shape: (batch_size, seq_length). Should already include rejection sampling. loss_agg_mode: Loss aggregation strategy (see agg_loss for details). - loss_scale_factor: Multiplicative scaling factor applied to final loss. - rollout_is: IS aggregation level ("token", "sequence", or None). - rollout_is_threshold: Upper threshold for truncating IS weights. - rollout_rs: Rejection sampling aggregation level (or None to disable). - rollout_rs_threshold: Upper threshold for rejection sampling. - rollout_rs_threshold_lower: Lower threshold for rejection sampling. - rollout_token_veto_threshold: Per-token veto threshold for catastrophic outliers. - rollout_is_batch_normalize: Whether to normalize IS weights to have mean=1.0 per batch. + config: Actor config (required for global_batch_info). + rollout_is_weights: Pre-computed IS weights (π_current / π_rollout). + Shape: (batch_size, seq_length). 
None to disable IS correction. - Note: - Unlike compute_policy_loss (PPO), this function: - - Does NOT use PPO clipping (no old_log_prob needed) - - Directly applies IS correction computed from current vs rollout - - Computes IS/RS on-the-fly during training - - Usage: - This function is called by the actor when: - - bypass_mode=True (trainer uses rollout_log_prob as old_log_prob) - - use_policy_gradient=True (actor uses this function instead of compute_policy_loss) - - Example config: - algorithm: - rollout_correction: - bypass_mode: true - use_policy_gradient: true - rollout_is: "token" - rollout_is_threshold: 2.0 - rollout_rs: "token" - rollout_rs_threshold: 2.0 - rollout_rs_threshold_lower: 0.5 + Returns: + Tuple of (loss, metrics): + loss: Scalar policy gradient loss + metrics: Dictionary with "actor/ppo_kl" + Note: + Unlike PPO (compute_policy_loss_vanilla), this function: + - Does NOT use PPO clipping + - Uses log π(a|s) directly (not ratio) + - IS weights are applied as multiplicative factor """ - # Import rollout correction helper - from verl.trainer.ppo.rollout_corr_helper import compute_rollout_correction_and_rejection_mask - - assert config is not None, "ActorConfig must be provided for rollout correction" - - # Compute IS weights and rejection mask on-the-fly - # Use no_grad since weights are detached inside and metrics don't need gradients - with torch.no_grad(): - rollout_is_weights_proto, modified_response_mask, rollout_metrics = ( - compute_rollout_correction_and_rejection_mask( - old_log_prob=log_prob, # Current policy - rollout_log_prob=rollout_log_prob, # Rollout policy - response_mask=eos_mask, - rollout_is=rollout_is, - rollout_is_threshold=rollout_is_threshold, - rollout_rs=rollout_rs, - rollout_rs_threshold=rollout_rs_threshold, - rollout_rs_threshold_lower=rollout_rs_threshold_lower, - rollout_token_veto_threshold=rollout_token_veto_threshold, - rollout_is_batch_normalize=rollout_is_batch_normalize, - ) - ) - - # Extract weights tensor from 
DataProto (or None if disabled) - rollout_is_weights = rollout_is_weights_proto.batch["rollout_is_weights"] if rollout_is_weights_proto else None + assert config is not None, "ActorConfig must be provided for REINFORCE loss" - # Apply rejection mask (if RS is enabled) - effective_mask = modified_response_mask if rollout_rs is not None else eos_mask - - # Compute pure policy gradient loss with IS correction + # Compute pure policy gradient loss with optional IS correction # Standard REINFORCE: L = -E[log π(a|s) * A] # With IS: L = -E[w * log π(a|s) * A] where w = π_current / π_rollout - # - # Note: rollout_is_weights already contains w = π_current / π_rollout - # So we apply it to the standard log-prob trick formula - if rollout_is_weights is not None: # IS-corrected policy gradient: L = -E[stopgrad(w) · log π · A] pg_losses = -advantages * log_prob * rollout_is_weights @@ -1698,33 +1658,27 @@ def compute_policy_loss_with_rollout_correction( # Standard REINFORCE: L = -E[log π · A] pg_losses = -advantages * log_prob - # Aggregate loss (apply scale factor manually) - pg_loss = ( - agg_loss( - loss_mat=pg_losses, - loss_mask=effective_mask, - loss_agg_mode=loss_agg_mode, - **config.global_batch_info, - ) - * loss_scale_factor + # Aggregate loss + pg_loss = agg_loss( + loss_mat=pg_losses, + loss_mask=response_mask, + loss_agg_mode=loss_agg_mode, + **config.global_batch_info, ) # Compute KL divergence between current and rollout policy negative_approx_kl = log_prob - rollout_log_prob - kl_divergence = verl_F.masked_mean(-negative_approx_kl, effective_mask) + kl_divergence = verl_F.masked_mean(-negative_approx_kl, response_mask) - pg_metrics = rollout_metrics - pg_metrics.update( - { - "actor/ppo_kl": kl_divergence.detach().item(), - } - ) + pg_metrics = { + "actor/ppo_kl": kl_divergence.detach().item(), + } return pg_loss, pg_metrics -@register_policy_loss("rollout_correction") -def compute_policy_loss_rollout_correction_wrapper( +@register_policy_loss("bypass_mode") 
+def compute_policy_loss_bypass_mode( old_log_prob: torch.Tensor, log_prob: torch.Tensor, advantages: torch.Tensor, @@ -1733,34 +1687,70 @@ def compute_policy_loss_rollout_correction_wrapper( config: Optional[ActorConfig] = None, rollout_is_weights: torch.Tensor | None = None, ) -> tuple[torch.Tensor, dict[str, Any]]: - """Wrapper for compute_policy_loss_with_rollout_correction to match PolicyLossFn interface. - - This function is used when algorithm.rollout_correction.use_policy_gradient=True. - In this mode, the trainer has already set old_log_prob=rollout_log_prob (bypass mode). + """Bypass mode policy loss supporting both REINFORCE and PPO-clip. + + This function is the entry point for bypass mode, where old_log_prob = rollout_log_prob. + It computes IS weights and rejection masks, then dispatches to either REINFORCE or + PPO-clip loss based on the loss_type configuration. + + IMPORTANT - Bypass mode semantics: + In bypass mode, the trainer sets old_log_prob = rollout_log_prob. + This means: + - For REINFORCE: We use IS weights w = π_current / π_rollout explicitly + - For PPO-clip: The PPO ratio π_current / π_old = π_current / π_rollout + already incorporates the IS correction through clipping, so we do NOT + apply additional IS weights (would be double-counting) + + Loss types: + - "ppo_clip" (default): PPO clipped objective (compute_policy_loss_vanilla) + L = -E[min(r*A, clip(r)*A)] where r = π_current / π_rollout + Note: IS weights are NOT applied (clipping handles the ratio) + - "reinforce": REINFORCE-style policy gradient with IS correction + L = -E[w * log π(a|s) * A] where w = π_current / π_rollout Args: - old_log_prob: In bypass mode, this is actually rollout_log_prob - log_prob: Current policy log probabilities - advantages: Advantage estimates - response_mask: Valid token mask - loss_agg_mode: Loss aggregation mode - config: Actor config containing rollout_correction settings - rollout_is_weights: Pre-computed IS weights (ignored, computed internally) 
+ old_log_prob: In bypass mode, this is actually rollout_log_prob. + Shape: (batch_size, seq_length) + log_prob: Current policy log probabilities. + Shape: (batch_size, seq_length) + advantages: Advantage estimates. + Shape: (batch_size, seq_length) + response_mask: Valid token mask (1=valid, 0=padding). + Shape: (batch_size, seq_length) + loss_agg_mode: Loss aggregation mode (passed to underlying loss function). + config: Actor config containing rollout_correction settings in policy_loss. + rollout_is_weights: Pre-computed IS weights (ignored, computed internally). + + Config options (in config.policy_loss.rollout_correction): + loss_type: "ppo_clip" (default) or "reinforce" + rollout_is: IS aggregation level ("token", "sequence", or None) + rollout_is_threshold: Upper threshold for truncating IS weights (default: 2.0) + rollout_rs: Rejection sampling level ("token", "sequence", "geometric", or None) + rollout_rs_threshold: Upper threshold for rejection sampling + rollout_rs_threshold_lower: Lower threshold for rejection sampling + rollout_token_veto_threshold: Per-token veto threshold for catastrophic outliers + rollout_is_batch_normalize: Whether to normalize IS weights to mean=1.0 + + Returns: + Tuple of (loss, metrics): + loss: Scalar policy loss + metrics: Dictionary with rollout correction metrics and actor/ppo_kl """ - assert config is not None, "config is required for rollout_correction loss mode" + from verl.trainer.ppo.rollout_corr_helper import compute_rollout_correction_and_rejection_mask - # Extract rollout_correction config - # In ray_trainer, when use_policy_gradient=True, the rollout_correction config - # is embedded in actor config's policy_loss field + assert config is not None, "config is required for bypass_mode loss" + + # Extract rollout_correction config from policy_loss rollout_corr_config = config.policy_loss.get("rollout_correction", None) if hasattr(config, "policy_loss") else None if rollout_corr_config is None: raise ValueError( 
"rollout_correction config not found in policy_loss. " - "When using loss_mode='rollout_correction', ensure rollout_correction config is passed." + "When using loss_mode='bypass_mode', ensure rollout_correction config is passed." ) # Extract parameters + loss_type = rollout_corr_config.get("loss_type", "ppo_clip") rollout_is = rollout_corr_config.get("rollout_is", None) rollout_is_threshold = rollout_corr_config.get("rollout_is_threshold", 2.0) rollout_rs = rollout_corr_config.get("rollout_rs", None) @@ -1769,21 +1759,64 @@ def compute_policy_loss_rollout_correction_wrapper( rollout_token_veto_threshold = rollout_corr_config.get("rollout_token_veto_threshold", None) rollout_is_batch_normalize = rollout_corr_config.get("rollout_is_batch_normalize", False) - # Call the actual implementation - # In bypass mode, old_log_prob IS rollout_log_prob - return compute_policy_loss_with_rollout_correction( - rollout_log_prob=old_log_prob, # This is rollout_log_prob in bypass mode - log_prob=log_prob, - advantages=advantages, - eos_mask=response_mask, - loss_agg_mode=loss_agg_mode, - config=config, - loss_scale_factor=1.0, - rollout_is=rollout_is, - rollout_is_threshold=rollout_is_threshold, - rollout_rs=rollout_rs, - rollout_rs_threshold=rollout_rs_threshold, - rollout_rs_threshold_lower=rollout_rs_threshold_lower, - rollout_token_veto_threshold=rollout_token_veto_threshold, - rollout_is_batch_normalize=rollout_is_batch_normalize, - ) + # In bypass mode: old_log_prob IS rollout_log_prob + rollout_log_prob = old_log_prob + + # Compute IS weights and rejection mask + # Note: For PPO-clip, we still compute IS weights for metrics, but don't apply them + with torch.no_grad(): + rollout_is_weights_proto, modified_response_mask, rollout_metrics = ( + compute_rollout_correction_and_rejection_mask( + old_log_prob=log_prob, # Current policy (for IS ratio: π_current / π_rollout) + rollout_log_prob=rollout_log_prob, # Rollout policy + response_mask=response_mask, + rollout_is=rollout_is, + 
rollout_is_threshold=rollout_is_threshold, + rollout_rs=rollout_rs, + rollout_rs_threshold=rollout_rs_threshold, + rollout_rs_threshold_lower=rollout_rs_threshold_lower, + rollout_token_veto_threshold=rollout_token_veto_threshold, + rollout_is_batch_normalize=rollout_is_batch_normalize, + ) + ) + + # Extract IS weights tensor (or None if disabled) + computed_is_weights = rollout_is_weights_proto.batch["rollout_is_weights"] if rollout_is_weights_proto else None + + # Apply rejection mask (RS + veto) + effective_mask = modified_response_mask + + # Dispatch to appropriate loss function based on loss_type + if loss_type == "reinforce": + # REINFORCE: Apply IS weights explicitly + pg_loss, pg_metrics = compute_policy_loss_reinforce( + rollout_log_prob=rollout_log_prob, + log_prob=log_prob, + advantages=advantages, + response_mask=effective_mask, + loss_agg_mode=loss_agg_mode, + config=config, + rollout_is_weights=computed_is_weights, + ) + + elif loss_type == "ppo_clip": + # PPO-clip: The ratio π_current/π_old = π_current/π_rollout already handles IS + # DO NOT apply IS weights - would be double-counting! + # The clipping mechanism constrains the effective IS ratio + pg_loss, pg_metrics = compute_policy_loss_vanilla( # type: ignore[call-arg] + old_log_prob=rollout_log_prob, # = old_log_prob in bypass mode + log_prob=log_prob, + advantages=advantages, + response_mask=effective_mask, + loss_agg_mode=loss_agg_mode, + config=config, + rollout_is_weights=None, # Explicitly None - no IS weights for PPO-clip + ) + + else: + raise ValueError(f"Invalid loss_type: {loss_type}. 
Must be 'reinforce' or 'ppo_clip'.") + + # Merge rollout correction metrics + pg_metrics.update(rollout_metrics) + + return pg_loss, pg_metrics diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index e439a76d361..4558e750cc3 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -51,14 +51,19 @@ ) from verl.trainer.ppo.reward import compute_reward, compute_reward_async from verl.trainer.ppo.utils import Role, WorkerType, need_critic, need_reference_policy, need_reward_model +from verl.utils import tensordict_utils as tu from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path, should_save_ckpt_esi from verl.utils.config import omega_conf_to_dataclass from verl.utils.debug import marked_timer +from verl.utils.import_utils import load_class_from_fqn from verl.utils.metric import reduce_metrics +from verl.utils.py_functional import rename_dict from verl.utils.rollout_skip import RolloutSkip from verl.utils.seqlen_balancing import calculate_workload, get_seqlen_balanced_partitions, log_seqlen_unbalance from verl.utils.torch_functional import masked_mean from verl.utils.tracking import ValidationGenerationsLogger +from verl.workers.config import FSDPEngineConfig +from verl.workers.utils.padding import left_right_2_no_padding, no_padding_2_padding @dataclass @@ -323,7 +328,10 @@ def __init__( self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager self.use_reference_policy = need_reference_policy(self.role_worker_mapping) + # legacy reward model implementation self.use_rm = need_reward_model(self.role_worker_mapping) + self.use_reward_loop = self.config.reward_model.use_reward_loop + self.use_critic = need_critic(self.config) self.ray_worker_group_cls = ray_worker_group_cls self.device_name = device_name if device_name else self.config.trainer.device @@ -343,6 +351,8 @@ def __init__( if self.config.algorithm.use_kl_in_reward: self.kl_ctrl_in_reward = 
core_algos.get_kl_controller(self.config.algorithm.kl_ctrl) + self.use_legacy_worker_impl = config.trainer.get("use_legacy_worker_impl", "auto") + self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) def _create_dataloader(self, train_dataset, val_dataset, collate_fn, train_sampler: Optional[Sampler]): @@ -696,7 +706,31 @@ def init_workers(self): # create critic if self.use_critic: resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic) - critic_cfg = omega_conf_to_dataclass(self.config.critic) + + from verl.workers.config import CriticConfig + + critic_cfg: CriticConfig = omega_conf_to_dataclass(self.config.critic) + + if self.use_legacy_worker_impl == "disable": + # convert critic_cfg into TrainingWorkerConfig + from verl.workers.engine_workers import TrainingWorkerConfig + + orig_critic_cfg = critic_cfg + if orig_critic_cfg.strategy == "fsdp": + engine_config: FSDPEngineConfig = orig_critic_cfg.model.fsdp_config + engine_config.infer_max_token_len_per_gpu = critic_cfg.ppo_infer_max_token_len_per_gpu + engine_config.max_token_len_per_gpu = critic_cfg.ppo_max_token_len_per_gpu + else: + raise NotImplementedError(f"Unknown strategy {orig_critic_cfg.strategy=}") + + critic_cfg = TrainingWorkerConfig( + model_type="value_model", + model_config=orig_critic_cfg.model_config, + engine_config=engine_config, + optimizer_config=orig_critic_cfg.optim, + checkpoint_config=orig_critic_cfg.checkpoint, + ) + critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=critic_cfg) self.resource_pool_to_cls[resource_pool][str(Role.Critic)] = critic_cls @@ -711,11 +745,37 @@ def init_workers(self): self.resource_pool_to_cls[resource_pool][str(Role.RefPolicy)] = ref_policy_cls # create a reward model if reward_fn is None - if self.use_rm: - # we create a RM here - resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel) - rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], 
config=self.config.reward_model) - self.resource_pool_to_cls[resource_pool][str(Role.RewardModel)] = rm_cls + # for legacy discriminative reward model, we create a reward model worker here + # for reward loop discriminative reward model, we create a reward loop manager here + if not self.use_reward_loop: + # legacy reward model only handle reward-model based scenario + if self.use_rm: + # we create a RM here + resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel) + rm_cls = RayClassWithInitArgs( + self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model + ) + self.resource_pool_to_cls[resource_pool][str(Role.RewardModel)] = rm_cls + else: + # reward loop handle hybrid reward scenario (rule, disrm, genrm, ...) + can_reward_loop_parallelize = self.config.actor_rollout_ref.rollout.mode == "async" and ( + not self.use_rm or self.config.reward_model.enable_resource_pool + ) + # judge if we can asynchronously parallelize reward model with actor rollout + # two condition that we can parallelize reward model with actor rollout: + # 1. reward model is not enabled (rule-based reward can parallelize) + # 2. 
reward model is enabled but extra resource pool is enabled + # If we cannot parallelize, we should enable synchronous mode here, and launch a reward loop manager here + # else for parallelize mode, we launch a reward worker for each rollout worker (in agent loop, not here) + if not can_reward_loop_parallelize: + from verl.experimental.reward import RewardLoopManager + + self.config.reward_model.n_gpus_per_node = self.config.trainer.n_gpus_per_node + resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel) + self.reward_loop_manager = RewardLoopManager( + config=self.config, + rm_resource_pool=resource_pool, + ) # initialize WorkerGroup # NOTE: if you want to use a different resource pool for each role, which can support different parallel size, @@ -751,7 +811,17 @@ def init_workers(self): if self.use_critic: self.critic_wg = all_wg[str(Role.Critic)] - self.critic_wg.init_model() + if self.use_legacy_worker_impl == "disable": + self.critic_wg.reset() + # assign critic loss + from functools import partial + + from verl.workers.utils.losses import value_loss + + value_loss_ = partial(value_loss, config=orig_critic_cfg) + self.critic_wg.set_loss_fn(value_loss_) + else: + self.critic_wg.init_model() if self.use_reference_policy and not self.ref_in_actor: if str(Role.RefPolicy) in all_wg: @@ -764,7 +834,7 @@ def init_workers(self): self.rm_wg = None # initalization of rm_wg will be deprecated in the future - if self.use_rm: + if self.use_rm and not self.use_reward_loop: self.rm_wg = all_wg[str(Role.RewardModel)] self.rm_wg.init_model() @@ -772,10 +842,18 @@ def init_workers(self): self.actor_rollout_wg = all_wg[str(actor_role)] self.actor_rollout_wg.init_model() + if self.ref_in_actor: + self.ref_policy_wg = self.actor_rollout_wg + # create async rollout manager and request scheduler self.async_rollout_mode = False if self.config.actor_rollout_ref.rollout.mode == "async": - from verl.experimental.agent_loop import AgentLoopManager + # Support custom 
AgentLoopManager via config + manager_class_fqn = self.config.actor_rollout_ref.rollout.get("agent", {}).get("agent_loop_manager_class") + if manager_class_fqn: + AgentLoopManager = load_class_from_fqn(manager_class_fqn, "AgentLoopManager") + else: + from verl.experimental.agent_loop import AgentLoopManager self.async_rollout_mode = True if self.config.reward_model.enable and self.config.reward_model.enable_resource_pool: @@ -923,7 +1001,7 @@ def _start_profiling(self, do_profile: bool) -> None: self.ref_policy_wg.start_profile(profile_step=self.global_steps) if self.use_critic: self.critic_wg.start_profile(profile_step=self.global_steps) - if self.use_rm: + if self.use_rm and not self.use_reward_loop: self.rm_wg.start_profile(profile_step=self.global_steps) def _stop_profiling(self, do_profile: bool) -> None: @@ -934,7 +1012,7 @@ def _stop_profiling(self, do_profile: bool) -> None: self.ref_policy_wg.stop_profile() if self.use_critic: self.critic_wg.stop_profile() - if self.use_rm: + if self.use_rm and not self.use_reward_loop: self.rm_wg.stop_profile() def _balance_batch(self, batch: DataProto, metrics, logging_prefix="global_seqlen", keep_minibatch=False): @@ -974,6 +1052,135 @@ def _balance_batch(self, batch: DataProto, metrics, logging_prefix="global_seqle ) metrics.update(global_balance_stats) + def _compute_values(self, batch: DataProto) -> DataProto: + if self.use_legacy_worker_impl == "disable": + batch_td = batch.to_tensordict() + # step 2: convert from padding to nopadding + batch_td = left_right_2_no_padding(batch_td) + # step 3: add meta info + tu.assign_non_tensor(batch_td, compute_loss=False) + output = self.critic_wg.infer_batch(batch_td) + output = output.get() + values = tu.get(output, "values") + values = no_padding_2_padding(values, batch_td) + values = tu.get_tensordict({"values": values.float()}) + values = DataProto.from_tensordict(values) + else: + values = self.critic_wg.compute_values(batch) + return values + + def 
_compute_ref_log_prob(self, batch: DataProto) -> DataProto: + if self.use_legacy_worker_impl == "disable": + # step 1: convert dataproto to tensordict. + batch_td = batch.to_tensordict() + # step 2: convert from padding to nopadding + batch_td = left_right_2_no_padding(batch_td) + # step 3: add meta info + tu.assign_non_tensor(batch_td, calculate_entropy=False, compute_loss=False) + output = self.ref_policy_wg.compute_ref_log_prob(batch_td) + # gather output + log_probs = tu.get(output, "log_probs") + # step 4. No padding to padding + log_probs = no_padding_2_padding(log_probs, batch_td) + # step 5: rebuild a tensordict and convert to dataproto + ref_log_prob = tu.get_tensordict({"ref_log_prob": log_probs.float()}) + ref_log_prob = DataProto.from_tensordict(ref_log_prob) + else: + ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) + + return ref_log_prob + + def _compute_old_log_prob(self, batch: DataProto): + if self.use_legacy_worker_impl == "disable": + # TODO: remove step 1, 2, 4 after we make the whole training tensordict and padding free + # step 1: convert dataproto to tensordict. + batch_td = batch.to_tensordict() + # step 2: convert from padding to nopadding + batch_td = left_right_2_no_padding(batch_td) + # step 3: add meta info + tu.assign_non_tensor(batch_td, calculate_entropy=True, compute_loss=False) + output = self.actor_rollout_wg.compute_log_prob(batch_td) + # gather output + entropy = tu.get(output, "entropy") + log_probs = tu.get(output, "log_probs") + old_log_prob_mfu = tu.get(output, "metrics")["mfu"] + # step 4. 
No padding to padding + entropy = no_padding_2_padding(entropy, batch_td) + log_probs = no_padding_2_padding(log_probs, batch_td) + # step 5: rebuild a tensordict and convert to dataproto + old_log_prob = tu.get_tensordict({"old_log_probs": log_probs.float(), "entropys": entropy.float()}) + old_log_prob = DataProto.from_tensordict(old_log_prob) + else: + old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) + old_log_prob_mfu = 0 + return old_log_prob, old_log_prob_mfu + + def _update_actor(self, batch: DataProto) -> DataProto: + rollout_config = self.config.actor_rollout_ref.rollout + batch.meta_info["multi_turn"] = rollout_config.multi_turn.enable + # TODO: Make "temperature" single source of truth from generation. + batch.meta_info["temperature"] = rollout_config.temperature + # update actor + if self.use_legacy_worker_impl == "disable": + batch_td = batch.to_tensordict() + # step 2: convert from padding to no-padding + batch_td = left_right_2_no_padding(batch_td) + calculate_entropy = self.config.actor_rollout_ref.actor.entropy_coeff != 0.0 + ppo_mini_batch_size = self.config.actor_rollout_ref.actor.ppo_mini_batch_size + ppo_mini_batch_size = ppo_mini_batch_size * self.config.actor_rollout_ref.rollout.n + ppo_epochs = self.config.actor_rollout_ref.actor.ppo_epochs + seed = self.config.actor_rollout_ref.actor.data_loader_seed + shuffle = self.config.actor_rollout_ref.actor.shuffle + tu.assign_non_tensor( + batch_td, + calculate_entropy=calculate_entropy, + global_batch_size=ppo_mini_batch_size, + mini_batch_size=ppo_mini_batch_size, + epochs=ppo_epochs, + seed=seed, + dataloader_kwargs={"shuffle": shuffle}, + ) + + actor_output = self.actor_rollout_wg.update_actor(batch_td) + actor_output = tu.get(actor_output, "metrics") + actor_output = rename_dict(actor_output, "actor/") + # modify key name + actor_output["perf/mfu/actor"] = actor_output.pop("actor/mfu") + actor_output = DataProto.from_single_dict(data={}, meta_info={"metrics": actor_output}) + else: + 
actor_output = self.actor_rollout_wg.update_actor(batch) + return actor_output + + def _update_critic(self, batch: DataProto) -> DataProto: + if self.use_legacy_worker_impl == "disable": + batch_td = batch.to_tensordict() + # step 2: convert from padding to no-padding + batch_td = left_right_2_no_padding(batch_td) + ppo_mini_batch_size = self.config.critic.ppo_mini_batch_size + ppo_mini_batch_size = ppo_mini_batch_size * self.config.actor_rollout_ref.rollout.n + ppo_epochs = self.config.critic.ppo_epochs + seed = self.config.critic.data_loader_seed + shuffle = self.config.critic.shuffle + tu.assign_non_tensor( + batch_td, + global_batch_size=ppo_mini_batch_size, + mini_batch_size=ppo_mini_batch_size, + epochs=ppo_epochs, + seed=seed, + dataloader_kwargs={"shuffle": shuffle}, + ) + + output = self.critic_wg.train_mini_batch(batch_td) + output = output.get() + output = tu.get(output, "metrics") + output = rename_dict(output, "critic/") + # modify key name + output["perf/mfu/critic"] = output.pop("critic/mfu") + critic_output = DataProto.from_single_dict(data={}, meta_info={"metrics": output}) + else: + critic_output = self.critic_wg.update_critic(batch) + return critic_output + def fit(self): """ The training loop of PPO. 
@@ -1085,7 +1292,11 @@ def fit(self): # compute reward model score on batch rm_scores = None if self.use_rm and "rm_scores" not in batch.batch.keys(): - rm_scores = self.rm_wg.compute_rm_score(batch) + if not self.use_reward_loop: + rm_scores = self.rm_wg.compute_rm_score(batch) + else: + assert self.reward_loop_manager is not None, "RewardLoopManager is None" + rm_scores = self.reward_loop_manager.compute_rm_score(batch) batch = batch.union(rm_scores) reward_baseline_tensor, _ = compute_reward(batch, self.reward_fn) reward_baseline_tensor = reward_baseline_tensor.sum(dim=-1) @@ -1117,7 +1328,11 @@ def fit(self): with marked_timer("reward", timing_raw, color="yellow"): # compute reward model score if self.use_rm and "rm_scores" not in batch.batch.keys(): - reward_tensor = self.rm_wg.compute_rm_score(batch) + if not self.use_reward_loop: + reward_tensor = self.rm_wg.compute_rm_score(batch) + else: + assert self.reward_loop_manager is not None, "RewardLoopManager is None" + reward_tensor = self.reward_loop_manager.compute_rm_score(batch) batch = batch.union(reward_tensor) if self.config.reward_model.launch_reward_fn_async: @@ -1134,16 +1349,16 @@ def fit(self): rollout_corr_config = self.config.algorithm.get("rollout_correction", None) bypass_recomputing_logprobs = rollout_corr_config and rollout_corr_config.get("bypass_mode", False) if bypass_recomputing_logprobs: # Use `rollout_log_probs` - from verl.trainer.ppo.rollout_corr_helper import apply_rollout_correction + from verl.trainer.ppo.rollout_corr_helper import apply_bypass_mode - apply_rollout_correction( + apply_bypass_mode( batch=batch, rollout_corr_config=rollout_corr_config, policy_loss_config=self.config.actor_rollout_ref.actor.policy_loss, ) else: # Recompute old_log_probs with marked_timer("old_log_prob", timing_raw, color="blue"): - old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) + old_log_prob, old_log_prob_mfu = self._compute_old_log_prob(batch) entropys = old_log_prob.batch["entropys"] 
response_masks = batch.batch["response_mask"] actor_config = self.config.actor_rollout_ref.actor @@ -1153,7 +1368,10 @@ def fit(self): loss_agg_mode=actor_config.loss_agg_mode, loss_scale_factor=actor_config.loss_scale_factor, ) - old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()} + old_log_prob_metrics = { + "actor/entropy": entropy_agg.detach().item(), + "perf/mfu/actor_infer": old_log_prob_mfu, + } metrics.update(old_log_prob_metrics) old_log_prob.batch.pop("entropys") batch = batch.union(old_log_prob) @@ -1168,16 +1386,13 @@ def fit(self): if self.use_reference_policy: # compute reference log_prob with marked_timer(str(Role.RefPolicy), timing_raw, color="olive"): - if not self.ref_in_actor: - ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) - else: - ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch) + ref_log_prob = self._compute_ref_log_prob(batch) batch = batch.union(ref_log_prob) # compute values if self.use_critic: with marked_timer("values", timing_raw, color="cyan"): - values = self.critic_wg.compute_values(batch) + values = self._compute_values(batch) batch = batch.union(values) with marked_timer("adv", timing_raw, color="brown"): @@ -1232,7 +1447,7 @@ def fit(self): # update critic if self.use_critic: with marked_timer("update_critic", timing_raw, color="pink"): - critic_output = self.critic_wg.update_critic(batch) + critic_output = self._update_critic(batch) critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"]) metrics.update(critic_output_metrics) @@ -1240,11 +1455,7 @@ def fit(self): if self.config.trainer.critic_warmup <= self.global_steps: # update actor with marked_timer("update_actor", timing_raw, color="red"): - rollout_config = self.config.actor_rollout_ref.rollout - batch.meta_info["multi_turn"] = rollout_config.multi_turn.enable - # TODO: Make "temperature" single source of truth from generation. 
- batch.meta_info["temperature"] = rollout_config.temperature - actor_output = self.actor_rollout_wg.update_actor(batch) + actor_output = self._update_actor(batch) actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"]) metrics.update(actor_output_metrics) diff --git a/verl/trainer/ppo/reward.py b/verl/trainer/ppo/reward.py index d9e2872c405..892610243f4 100644 --- a/verl/trainer/ppo/reward.py +++ b/verl/trainer/ppo/reward.py @@ -32,12 +32,12 @@ from omegaconf import DictConfig from verl import DataProto - from verl.experimental.reward.reward_loop.base import RewardLoopManagerBase + from verl.experimental.reward.reward_manager.base import RewardLoopManagerBase from verl.trainer.config.config import ModuleConfig, RewardManagerConfig from verl.workers.reward_manager.abstract import AbstractRewardManager, RawRewardFn else: try: - from verl.experimental.reward.reward_loop.base import RewardLoopManagerBase + from verl.experimental.reward.reward_manager.base import RewardLoopManagerBase except ImportError: RewardLoopManagerBase = None # type: ignore[assignment,misc] diff --git a/verl/trainer/ppo/rollout_corr_helper.py b/verl/trainer/ppo/rollout_corr_helper.py index c09d08f4505..dfff5dff08b 100644 --- a/verl/trainer/ppo/rollout_corr_helper.py +++ b/verl/trainer/ppo/rollout_corr_helper.py @@ -913,24 +913,22 @@ def compute_rollout_corr_metrics_from_logprobs( return metrics_with_prefix -def apply_rollout_correction( +def apply_bypass_mode( batch: DataProto, rollout_corr_config: Optional[RolloutCorrectionConfig] = None, policy_loss_config: PolicyLossConfig = None, ) -> None: """ - BYPASS MODE: Use rollout_log_probs as old_log_probs - Skips expensive actor forward pass for old_log_prob computation + Setup bypass mode: Use rollout_log_probs as old_log_probs. - Two sub-modes (controlled by use_policy_gradient): - 1. 
Bypass + PPO loss (use_policy_gradient=False, default): - - Uses standard PPO loss function with old_log_prob=rollout_log_prob - - PPO clips ratio π_θ/π_rollout instead of π_θ/π_old + Bypass mode skips expensive actor forward pass for old_log_prob computation + by setting old_log_probs = rollout_log_probs (2 policies instead of 3). - 2. Bypass + Policy Gradient loss (use_policy_gradient=True): - - Uses compute_policy_loss_with_rollout_correction() - - Policy gradient (REINFORCE-style) with IS/RS correction applied - - No PPO clipping + Uses compute_policy_loss_bypass_mode() which supports: + - loss_type="ppo_clip" (default): PPO clipped objective (IS handled by ratio) + - loss_type="reinforce": REINFORCE with explicit IS weights + + Both loss types benefit from rejection sampling (RS) which masks out-of-distribution samples. Note: The implementation is copied from szrlee . @@ -947,13 +945,7 @@ def apply_rollout_correction( batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"] with open_dict(policy_loss_config): - # Always pass rollout_correction config to actor for metrics computation + # Pass rollout_correction config to actor for loss computation and metrics policy_loss_config["rollout_correction"] = rollout_corr_config - - # Check if policy gradient loss mode is enabled - use_policy_gradient = rollout_corr_config.get("use_policy_gradient", False) - - if use_policy_gradient: - # Policy gradient mode: Configure actor to use rollout_correction loss function - # This will use compute_policy_loss_with_rollout_correction (no PPO clipping) - policy_loss_config["loss_mode"] = "rollout_correction" + # Always use bypass_mode loss function which handles both loss_types + policy_loss_config["loss_mode"] = "bypass_mode" diff --git a/verl/trainer/sft_trainer.py b/verl/trainer/sft_trainer.py index ef02077d7c9..d498e9153af 100644 --- a/verl/trainer/sft_trainer.py +++ b/verl/trainer/sft_trainer.py @@ -140,12 +140,21 @@ def _init_engine(self): def _build_dataset(self): 
config = self.config tokenizer = self.model_config.tokenizer + processor = self.model_config.processor train_dataset = create_sft_dataset( - config.data.train_files, config.data, tokenizer, max_samples=config.data.get("train_max_samples", -1) + config.data.train_files, + config.data, + tokenizer, + processor, + max_samples=config.data.get("train_max_samples", -1), ) if config.data.val_files: val_dataset = create_sft_dataset( - config.data.val_files, config.data, tokenizer, max_samples=config.data.get("val_max_samples", -1) + config.data.val_files, + config.data, + tokenizer, + processor, + max_samples=config.data.get("val_max_samples", -1), ) else: val_dataset = None @@ -178,7 +187,7 @@ def _build_dataloader(self): sampler=self.train_sampler, collate_fn=self.collate_fn, num_workers=8, - pin_memory=True, + pin_memory=False, drop_last=True, pin_memory_device=device_name, ) @@ -193,7 +202,7 @@ def _build_dataloader(self): sampler=self.val_sampler, collate_fn=self.collate_fn, num_workers=8, - pin_memory=True, + pin_memory=False, drop_last=True, pin_memory_device=device_name, ) @@ -367,7 +376,7 @@ def main(config): run_sft(config) -def create_sft_dataset(data_paths, data_config, tokenizer, max_samples=-1): +def create_sft_dataset(data_paths, data_config, tokenizer, processor, max_samples=-1): """Create a dataset.""" # build dataset # First check if a custom dataset class is specified @@ -380,7 +389,9 @@ def create_sft_dataset(data_paths, data_config, tokenizer, max_samples=-1): dataset_cls = MultiTurnSFTDataset # Create datasets based on the selected class - dataset = dataset_cls(parquet_files=data_paths, tokenizer=tokenizer, config=data_config, max_samples=max_samples) + dataset = dataset_cls( + parquet_files=data_paths, tokenizer=tokenizer, config=data_config, processor=processor, max_samples=max_samples + ) return dataset diff --git a/verl/trainer/sft_trainer_ray.py b/verl/trainer/sft_trainer_ray.py index 759514710b6..9178168c32b 100644 --- 
a/verl/trainer/sft_trainer_ray.py +++ b/verl/trainer/sft_trainer_ray.py @@ -119,12 +119,21 @@ def _build_engine(self): def _build_dataset(self): config = self.config tokenizer = self.model_config.tokenizer + processor = self.model_config.processor train_dataset = create_sft_dataset( - config.data.train_files, config.data, tokenizer, max_samples=config.data.get("train_max_samples", -1) + config.data.train_files, + config.data, + tokenizer, + processor=processor, + max_samples=config.data.get("train_max_samples", -1), ) if config.data.val_files: val_dataset = create_sft_dataset( - config.data.val_files, config.data, tokenizer, max_samples=config.data.get("val_max_samples", -1) + config.data.val_files, + config.data, + tokenizer, + processor=processor, + max_samples=config.data.get("val_max_samples", -1), ) else: val_dataset = None @@ -157,7 +166,7 @@ def _build_dataloader(self): sampler=self.train_sampler, collate_fn=self.collate_fn, num_workers=8, - pin_memory=True, + pin_memory=False, drop_last=True, pin_memory_device=device_name, ) @@ -172,7 +181,7 @@ def _build_dataloader(self): sampler=self.val_sampler, collate_fn=self.collate_fn, num_workers=8, - pin_memory=True, + pin_memory=False, drop_last=True, pin_memory_device=device_name, ) @@ -327,7 +336,7 @@ def main(config): run_sft(config) -def create_sft_dataset(data_paths, data_config, tokenizer, max_samples=-1): +def create_sft_dataset(data_paths, data_config, tokenizer, processor, max_samples=-1): """Create a dataset.""" # build dataset # First check if a custom dataset class is specified @@ -340,7 +349,9 @@ def create_sft_dataset(data_paths, data_config, tokenizer, max_samples=-1): dataset_cls = MultiTurnSFTDataset # Create datasets based on the selected class - dataset = dataset_cls(parquet_files=data_paths, tokenizer=tokenizer, config=data_config, max_samples=max_samples) + dataset = dataset_cls( + parquet_files=data_paths, tokenizer=tokenizer, config=data_config, processor=processor, max_samples=max_samples + 
) return dataset diff --git a/verl/utils/attention_utils.py b/verl/utils/attention_utils.py index 8340155e761..ea9884307fc 100644 --- a/verl/utils/attention_utils.py +++ b/verl/utils/attention_utils.py @@ -20,14 +20,14 @@ def _get_attention_functions() -> tuple[Callable, Callable, Callable, Callable]: """Dynamically import attention functions based on available hardware.""" - from verl.utils.device import is_cuda_available, is_npu_available + from verl.utils.device import is_npu_available global _index_first_axis, _pad_input, _rearrange, _unpad_input - if is_cuda_available: - from flash_attn.bert_padding import index_first_axis, pad_input, rearrange, unpad_input - elif is_npu_available: + if is_npu_available: from verl.utils.npu_flash_attn_utils import index_first_axis, pad_input, rearrange, unpad_input + else: + from flash_attn.bert_padding import index_first_axis, pad_input, rearrange, unpad_input _index_first_axis, _pad_input, _rearrange, _unpad_input = index_first_axis, pad_input, rearrange, unpad_input diff --git a/verl/utils/chat_template.py b/verl/utils/chat_template.py index 70b30452c01..6bda790641f 100644 --- a/verl/utils/chat_template.py +++ b/verl/utils/chat_template.py @@ -20,9 +20,23 @@ def initialize_system_prompt(tokenizer, **apply_chat_template_kwargs) -> list[in List of token IDs for the system prompt, or empty list if not supported """ try: - return tokenizer.apply_chat_template( - [{}], add_generation_prompt=False, tokenize=True, **apply_chat_template_kwargs - ) + return tokenizer.apply_chat_template([{}], tokenize=True, **apply_chat_template_kwargs) except TemplateError as e: logger.warning(f"Chat template does not support system prompt: {e}") return [] + + +def extract_system_prompt_and_generation(tokenizer): + token1 = tokenizer.apply_chat_template( + [{"role": "user", "content": ""}], add_generation_prompt=False, tokenize=True + ) + token2 = tokenizer.apply_chat_template( + [{"role": "user", "content": ""}] * 2, add_generation_prompt=False, 
tokenize=True + ) + # get system prompt tokens + system_prompt = token1[: -(len(token2) - len(token1))] + # get generate prompt tokens + token3 = tokenizer.apply_chat_template([{"role": "user", "content": ""}], add_generation_prompt=True, tokenize=True) + generate_prompt = token3[len(token1) :] + + return system_prompt, generate_prompt diff --git a/verl/utils/config.py b/verl/utils/config.py index 14b16538c25..094024a224e 100644 --- a/verl/utils/config.py +++ b/verl/utils/config.py @@ -168,7 +168,11 @@ def check_mutually_exclusive(mbs, mbs_per_gpu, name: str): ) # Check for reward model micro-batch size conflicts - if config.reward_model.enable and not config.reward_model.use_dynamic_bsz: + if ( + config.reward_model.enable + and not config.reward_model.use_dynamic_bsz + and not config.reward_model.use_reward_loop + ): check_mutually_exclusive( config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model" ) diff --git a/verl/utils/dataset/dataset_utils.py b/verl/utils/dataset/dataset_utils.py index 7354a0c896d..03bde7b01d2 100644 --- a/verl/utils/dataset/dataset_utils.py +++ b/verl/utils/dataset/dataset_utils.py @@ -16,6 +16,7 @@ from enum import Enum import torch +from tensordict.tensorclass import NonTensorData class DatasetPadMode(str, Enum): @@ -60,11 +61,15 @@ def collate_variable_batch(self, batch: list[dict[str, any]]) -> dict[str, any]: final_batch = {} - tensor_keys = [key for key in batch[0].keys() if isinstance(batch[0][key], torch.Tensor)] + tensor_keys = set().union(*(d.keys() for d in batch)) # Handle tensor values by creating a NestedTensor. 
for key in tensor_keys: - tensors = [item[key] for item in batch] - final_batch[key] = torch.nested.as_nested_tensor(tensors, layout=torch.jagged) + if isinstance(batch[0][key], torch.Tensor): + tensors = [item[key] for item in batch] + final_batch[key] = torch.nested.as_nested_tensor(tensors, layout=torch.jagged) + else: + tensors = [NonTensorData(item.get(key)) for item in batch] + final_batch[key] = torch.stack(tensors, dim=0) return final_batch diff --git a/verl/utils/dataset/multiturn_sft_dataset.py b/verl/utils/dataset/multiturn_sft_dataset.py index 73450530c7a..0eab4701458 100644 --- a/verl/utils/dataset/multiturn_sft_dataset.py +++ b/verl/utils/dataset/multiturn_sft_dataset.py @@ -17,19 +17,28 @@ """ import logging +import os +import re from typing import Any, Optional import numpy as np import pandas as pd import torch -from omegaconf import ListConfig +import torch.nn.functional as F +from omegaconf import DictConfig, ListConfig from torch.utils.data import Dataset -from transformers import PreTrainedTokenizer +from transformers import PreTrainedTokenizer, ProcessorMixin +from verl.models.transformers.qwen2_vl import get_rope_index from verl.utils import hf_tokenizer +from verl.utils.chat_template import extract_system_prompt_and_generation from verl.utils.dataset.dataset_utils import DatasetPadMode +from verl.utils.dataset.vision_utils import process_image, process_video from verl.utils.fs import copy_local_path_from_hdfs +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + def convert_nested_value_to_list_recursive(data_item): if isinstance(data_item, dict): @@ -47,9 +56,23 @@ def convert_nested_value_to_list_recursive(data_item): class MultiTurnSFTDataset(Dataset): """ Dataset for multi-turn conversations where each assistant response should be trained + + Args: + data_files (str or list): Path(s) to Parquet file(s). + tokenizer (PreTrainedTokenizer): For the tokenization of text to token IDs. 
+ config (DictConfig): Options like cache_dir, prompt_key, max_prompt_length, truncation, etc. + processor (ProcessorMixin, optional): Multimodal preprocessor for images/videos. + max_samples (int, optional): Limit the number of samples. Defaults to -1 (use all). """ - def __init__(self, parquet_files: str | list[str], tokenizer, config=None, max_samples: int = -1): + def __init__( + self, + parquet_files: str | list[str], + tokenizer: PreTrainedTokenizer, + config: DictConfig, + processor: Optional[ProcessorMixin] = None, + max_samples: int = -1, + ): # Set defaults and extract parameters from config if provided config = config or {} self.pad_mode = config.get("pad_mode", "right") @@ -60,14 +83,19 @@ def __init__(self, parquet_files: str | list[str], tokenizer, config=None, max_s # for right padding self.max_length = config.get("max_length", 1024) # Get messages_key from the new multiturn config structure - multiturn_config = config.get("multiturn", {}) - self.messages_key = multiturn_config.get("messages_key", "messages") - self.tools_key = multiturn_config.get("tools_key", "tools") - self.enable_thinking_key = multiturn_config.get("enable_thinking_key", "enable_thinking") + self.messages_key = config.get("messages_key", "messages") + self.image_key = config.get("image_key", "images") + self.video_key = config.get("video_key", "videos") + self.image_patch_size = config.get( + "image_patch_size", processor.image_processor.patch_size if processor else None + ) + self.tools_key = config.get("tools_key", "tools") + self.enable_thinking_key = config.get("enable_thinking_key", "enable_thinking") self.apply_chat_template_kwargs = config.get("apply_chat_template_kwargs", {}) self.shuffle = config.get("shuffle", False) self.seed = config.get("seed") self.max_samples = max_samples + self.ignore_input_ids_mismatch = config.get("ignore_input_ids_mismatch", False) assert self.truncation in ["error", "left", "right"] if not isinstance(parquet_files, list | ListConfig): @@ 
-77,6 +105,7 @@ def __init__(self, parquet_files: str | list[str], tokenizer, config=None, max_s if isinstance(tokenizer, str): tokenizer = hf_tokenizer(tokenizer) self.tokenizer: PreTrainedTokenizer = tokenizer + self.processor = processor self._download() self._read_files_and_process() @@ -127,215 +156,165 @@ def series_to_item(ls): else: self.enable_thinking = None + # system prompt: <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n + # generation prompt: <|im_start|>assistant\n + self.system_prompt, self.generation_prompt = extract_system_prompt_and_generation(self.tokenizer) + def __len__(self): return len(self.messages) - def _process_message_tokens( + def _process_single_message( self, - messages: list[dict[str, Any]], - start_idx: int, - end_idx: int, - is_assistant: bool = False, - enable_thinking: Optional[bool] = None, + index: int, + message: dict[str, Any], tools: Optional[list[dict[str, Any]]] = None, + enable_thinking: Optional[bool] = None, ) -> tuple[list[int], list[int], list[int]]: """ - Process tokens for a single message or a group of messages. + Process a single message and return its tokenized representation. 
Args: - messages: List of message dictionaries - start_idx: Start index in messages list - end_idx: End index in messages list - is_assistant: Whether this is an assistant message + index: turn index in the conversation + message: A single message dictionary + images: List of images to be used + videos: List of videos to be used + tools: List of tools to be used enable_thinking: Whether to enable thinking mode Returns: - Tuple of (tokens, loss_mask, attention_mask) + Tuple of (input_ids, loss_mask, attention_mask, dict[str, torch.Tensor]) """ - if start_idx > 0: - prev_applied_text = self.tokenizer.apply_chat_template( - messages[:start_idx], - tokenize=False, - add_generation_prompt=False, - enable_thinking=enable_thinking, - tools=tools, - **self.apply_chat_template_kwargs, - ) - if is_assistant: - prev_applied_text_w_generation_prompt = self.tokenizer.apply_chat_template( - messages[:start_idx], - tokenize=False, - add_generation_prompt=True, - enable_thinking=enable_thinking, - tools=tools, - **self.apply_chat_template_kwargs, - ) - - else: - prev_applied_text = "" + processor = self.processor if self.processor is not None else self.tokenizer + apply_chat_template_kwargs = {**self.apply_chat_template_kwargs} + if enable_thinking is not None: + apply_chat_template_kwargs["enable_thinking"] = enable_thinking - cur_applied_text = self.tokenizer.apply_chat_template( - messages[:end_idx], - tokenize=False, - add_generation_prompt=False, - enable_thinking=enable_thinking, + inputs = processor.apply_chat_template( + [message], tools=tools, - **self.apply_chat_template_kwargs, + add_generation_prompt=False, + tokenize=True, + return_dict=True, + return_tensors="pt", + **apply_chat_template_kwargs, ) - # Get tokens for the current message only - if is_assistant: - generation_prompt_text = prev_applied_text_w_generation_prompt[len(prev_applied_text) :] - generation_prompt_tokens = self.tokenizer.encode( - generation_prompt_text, - add_special_tokens=False, - ) - 
_message_tokens = self.tokenizer.encode( - cur_applied_text[len(prev_applied_text_w_generation_prompt) :], - add_special_tokens=False, - ) - message_tokens = generation_prompt_tokens + _message_tokens - loss_mask = [0] * (len(generation_prompt_tokens)) + [1] * ( - len(message_tokens) - len(generation_prompt_tokens) - ) - else: - message_tokens = self.tokenizer.encode( - cur_applied_text[len(prev_applied_text) :], - add_special_tokens=False, - ) - loss_mask = [0] * len(message_tokens) - attention_mask = [1] * len(message_tokens) + inputs = dict(inputs) + input_ids = inputs.pop("input_ids")[0] + attention_mask = inputs.pop("attention_mask")[0] - return message_tokens, loss_mask, attention_mask + # remove system prompt if exists + if index != 0 and message["role"] != "system": + input_ids = input_ids[len(self.system_prompt) :] + attention_mask = attention_mask[len(self.system_prompt) :] - def _validate_and_convert_tokens( - self, - full_tokens: torch.Tensor, - concat_tokens: list[int], - concat_loss_mask: list[int], - concat_attention_mask: list[int], - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Validate tokenization and convert to tensors. + if message["role"] == "assistant": + loss_mask = torch.ones_like(attention_mask) + # mask out generation prompt if assistant message + loss_mask[: len(self.generation_prompt)] = 0 + else: + loss_mask = torch.zeros_like(attention_mask) + + return input_ids, loss_mask, attention_mask, inputs + + def _build_messages(self, example: dict): + """Replace and