From 361ad071b8fa38fec3b71c155f08d48b62798e13 Mon Sep 17 00:00:00 2001
From: Haibin Lin <haibin.lin@bytedance.com>
Date: Sun, 4 May 2025 13:28:20 -0700
Subject: [PATCH 1/7] [tests] refactor: move test files to their own namespaces
 (verl/xxx -> tests/xxx)

---
 .github/workflows/dataset.yml                          |  8 ++++----
 .github/workflows/sgl.yml                              |  6 +++---
 .github/workflows/verl_unit_test.yml                   |  4 ++--
 .github/workflows/vllm.yml                             | 10 +++++-----
 .gitignore                                             |  3 ---
 recipe/dapo/{src => }/config/dapo_trainer.yaml         |  0
 recipe/dapo/{src => }/dapo_ray_trainer.py              |  0
 recipe/dapo/{src => }/main_dapo.py                     |  0
 recipe/dapo/run_dapo_early_qwen2.5_32b.sh              |  4 ++--
 recipe/dapo/run_dapo_qwen2.5_32b.sh                    |  4 ++--
 recipe/dapo/run_dapo_wo_ds_qwen2.5_32b.sh              |  4 ++--
 recipe/dapo/test_dapo_7b.sh                            |  2 +-
 tests/{ => e2e}/generation/run_gen_qwen05.sh           |  0
 tests/{model => models}/test_transformer.py            |  0
 tests/{model => models}/test_transformers_ulysses.py   |  0
 tests/{verl => }/test_protocol.py                      |  0
 tests/{ => utils}/checkpoint/test_fsdp_ckpt.py         |  0
 .../utils/dataset/test_multiturn_sft_dataset.py        |  0
 tests/{verl => }/utils/dataset/test_rl_dataset.py      |  0
 tests/{verl => }/utils/dataset/test_rm_dataset.py      |  0
 tests/{verl => }/utils/dataset/test_sft_dataset.py     |  0
 tests/{verl => }/utils/test_import_utils.py            |  0
 tests/{verl => }/utils/test_model.py                   |  0
 tests/{verl => }/utils/test_module.py                  |  0
 tests/{ => workers}/rollout/async_rollout_utils.py     |  0
 tests/{ => workers}/rollout/run_fsdp_vllm.py           |  0
 tests/{ => workers}/rollout/test_hf_rollout.py         |  0
 .../rollout/test_sglang_async_rollout_w_tools.py       |  0
 tests/{ => workers}/rollout/test_sglang_async_spmd.py  |  0
 tests/{ => workers}/rollout/test_sglang_spmd.py        |  0
 tests/{ => workers}/rollout/test_vllm_hf_loader.py     |  0
 tests/{ => workers}/rollout/test_vllm_multi_turn.py    |  0
 tests/{ => workers}/rollout/test_vllm_spmd.py          |  0
 tests/{ => workers}/rollout/test_vllm_tool_calling.py  |  0
 tests/{ => workers}/rollout/utils_sglang.py            |  0
 35 files changed, 21 insertions(+), 24 deletions(-)
 rename recipe/dapo/{src => }/config/dapo_trainer.yaml (100%)
 rename recipe/dapo/{src => }/dapo_ray_trainer.py (100%)
 rename recipe/dapo/{src => }/main_dapo.py (100%)
 rename tests/{ => e2e}/generation/run_gen_qwen05.sh (100%)
 rename tests/{model => models}/test_transformer.py (100%)
 rename tests/{model => models}/test_transformers_ulysses.py (100%)
 rename tests/{verl => }/test_protocol.py (100%)
 rename tests/{ => utils}/checkpoint/test_fsdp_ckpt.py (100%)
 rename tests/{verl => }/utils/dataset/test_multiturn_sft_dataset.py (100%)
 rename tests/{verl => }/utils/dataset/test_rl_dataset.py (100%)
 rename tests/{verl => }/utils/dataset/test_rm_dataset.py (100%)
 rename tests/{verl => }/utils/dataset/test_sft_dataset.py (100%)
 rename tests/{verl => }/utils/test_import_utils.py (100%)
 rename tests/{verl => }/utils/test_model.py (100%)
 rename tests/{verl => }/utils/test_module.py (100%)
 rename tests/{ => workers}/rollout/async_rollout_utils.py (100%)
 rename tests/{ => workers}/rollout/run_fsdp_vllm.py (100%)
 rename tests/{ => workers}/rollout/test_hf_rollout.py (100%)
 rename tests/{ => workers}/rollout/test_sglang_async_rollout_w_tools.py (100%)
 rename tests/{ => workers}/rollout/test_sglang_async_spmd.py (100%)
 rename tests/{ => workers}/rollout/test_sglang_spmd.py (100%)
 rename tests/{ => workers}/rollout/test_vllm_hf_loader.py (100%)
 rename tests/{ => workers}/rollout/test_vllm_multi_turn.py (100%)
 rename tests/{ => workers}/rollout/test_vllm_spmd.py (100%)
 rename tests/{ => workers}/rollout/test_vllm_tool_calling.py (100%)
 rename tests/{ => workers}/rollout/utils_sglang.py (100%)

diff --git a/.github/workflows/dataset.yml b/.github/workflows/dataset.yml
index 2535dd81fd3..445796e4a36 100644
--- a/.github/workflows/dataset.yml
+++ b/.github/workflows/dataset.yml
@@ -52,10 +52,10 @@ jobs:
         run: |
           [ ! -d "$HOME/verl-data" ] && git clone --depth 1 https://github.com/eric-haibin-lin/verl-data ~/verl-data
           python3 examples/data_preprocess/geo3k.py
-          pytest -s -x tests/verl/utils/dataset/test_rl_dataset.py
-          pytest -s -x tests/verl/utils/dataset/test_sft_dataset.py
-          pytest -s -x tests/verl/utils/test_import_utils.py
-      #          pytest -s -x tests/verl/utils/dataset/test_rm_dataset.py
+          pytest -s -x tests/utils/dataset/test_rl_dataset.py
+          pytest -s -x tests/utils/dataset/test_sft_dataset.py
+          pytest -s -x tests/utils/test_import_utils.py
+          # pytest -s -x tests/utils/dataset/test_rm_dataset.py
       - name: Running ray test using cupy (move it to L20 when dockerfile ready)
         run: |
           cd tests/ray_gpu
diff --git a/.github/workflows/sgl.yml b/.github/workflows/sgl.yml
index 4a3c192a11c..111565dbc45 100644
--- a/.github/workflows/sgl.yml
+++ b/.github/workflows/sgl.yml
@@ -53,13 +53,13 @@ jobs:
           pip3 install -e .[test,gpu,sglang] --no-deps
       - name: Test the latest SGLang
         run: |
-          cd tests/rollout
+          cd tests/workers/rollout
           torchrun --nnodes=1 --nproc_per_node=4 $(which pytest) -s test_sglang_spmd.py
       - name: Test the latest SGLang async
         run: |
-          cd tests/rollout
+          cd tests/workers/rollout
           torchrun --nnodes=1 --nproc_per_node=2 $(which pytest) -s test_sglang_async_spmd.py
       - name: Test the latest SGLang Rollout async with tool
         run: |
-          cd tests/rollout
+          cd tests/workers/rollout
           torchrun --nnodes=1 --nproc_per_node=2 $(which pytest) -s test_sglang_async_rollout_w_tools.py
diff --git a/.github/workflows/verl_unit_test.yml b/.github/workflows/verl_unit_test.yml
index c55caca0ded..ef1ee78ab33 100644
--- a/.github/workflows/verl_unit_test.yml
+++ b/.github/workflows/verl_unit_test.yml
@@ -43,9 +43,9 @@ jobs:
           pip install -e .[test]
       - name: Running test protocol.py 
         run: |
-          cd tests/verl
+          cd tests
           pytest -s -x test_protocol.py
       - name: Running utils tests 
         run: |
-          cd tests/verl/utils
+          cd tests/utils
           pytest -s -x --ignore=dataset/ .
diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml
index abe8bc78284..8b086b4bf87 100644
--- a/.github/workflows/vllm.yml
+++ b/.github/workflows/vllm.yml
@@ -15,7 +15,7 @@ on:
       - "**/*.py"
       # Entrypoints
       - ".github/workflows/vllm.yml"
-      - "tests/generation"
+      - "tests/e2e/generation"
       - "verl/trainer/main_generation.py"
       - "verl/trainer/config/generation.yaml"
       - "!examples"
@@ -69,22 +69,22 @@ jobs:
         # Disable requests to avoid network errors
       - name: Running vllm tests on 8 L20 GPUs
         run: |
-          cd tests/rollout
+          cd tests/workers/rollout
           torchrun --standalone --nnodes=1 --nproc_per_node=8 $(which pytest) -s test_vllm_hf_loader.py
       - name: Test the latest vLLM
         run: |
           pip3 install --upgrade vllm==0.7.3
-          cd tests/rollout
+          cd tests/workers/rollout
           torchrun --standalone --nnodes=1 --nproc_per_node=4 $(which pytest) -s test_vllm_spmd.py
       - name: Run Qwen 0.5B generation test
         run: |
-          cd tests/generation
+          cd tests/e2e/generation
           export OUTPUT_PATH="${HOME}/data/gen/qwen_05_gen_test.parquet"
           MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct NGPUS_PER_NODE=4 GEN_TP=2 bash ./run_gen_qwen05.sh
           rm -rf "${OUTPUT_PATH}"
       - name: Run Qwen 0.5B generation test when world_size == 1
         run: |
-          cd tests/generation
+          cd tests/e2e/generation
           export OUTPUT_PATH="${HOME}/data/gen/qwen_05_gen_test.parquet"
           MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct NGPUS_PER_NODE=1 GEN_TP=1 bash ./run_gen_qwen05.sh
           rm -rf "${OUTPUT_PATH}"
diff --git a/.gitignore b/.gitignore
index 98df3516ee6..f444f0e4c9d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -109,9 +109,6 @@ ENV/
 # Mac
 .DS_Store
 
-# output logs
-tests/e2e/toy_examples/deepspeed/synchronous/output.txt
-
 # vim
 *.swp
 
diff --git a/recipe/dapo/src/config/dapo_trainer.yaml b/recipe/dapo/config/dapo_trainer.yaml
similarity index 100%
rename from recipe/dapo/src/config/dapo_trainer.yaml
rename to recipe/dapo/config/dapo_trainer.yaml
diff --git a/recipe/dapo/src/dapo_ray_trainer.py b/recipe/dapo/dapo_ray_trainer.py
similarity index 100%
rename from recipe/dapo/src/dapo_ray_trainer.py
rename to recipe/dapo/dapo_ray_trainer.py
diff --git a/recipe/dapo/src/main_dapo.py b/recipe/dapo/main_dapo.py
similarity index 100%
rename from recipe/dapo/src/main_dapo.py
rename to recipe/dapo/main_dapo.py
diff --git a/recipe/dapo/run_dapo_early_qwen2.5_32b.sh b/recipe/dapo/run_dapo_early_qwen2.5_32b.sh
index 8ddfcb9445f..c7bd5c189e5 100644
--- a/recipe/dapo/run_dapo_early_qwen2.5_32b.sh
+++ b/recipe/dapo/run_dapo_early_qwen2.5_32b.sh
@@ -58,7 +58,7 @@ gen_tp=4
 
 ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
     --working-dir "${WORKING_DIR}" \
-    -- python3 -m recipe.dapo.src.main_dapo \
+    -- python3 -m recipe.dapo.main_dapo \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
     data.prompt_key=prompt \
@@ -125,4 +125,4 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
     trainer.save_freq=5 \
     trainer.total_epochs=1 \
     trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto
\ No newline at end of file
+    trainer.resume_mode=auto
diff --git a/recipe/dapo/run_dapo_qwen2.5_32b.sh b/recipe/dapo/run_dapo_qwen2.5_32b.sh
index 4a85e0f2fcf..6eec26c80e0 100644
--- a/recipe/dapo/run_dapo_qwen2.5_32b.sh
+++ b/recipe/dapo/run_dapo_qwen2.5_32b.sh
@@ -58,7 +58,7 @@ gen_tp=4
 
 ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
     --working-dir "${WORKING_DIR}" \
-    -- python3 -m recipe.dapo.src.main_dapo \
+    -- python3 -m recipe.dapo.main_dapo \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
     data.prompt_key=prompt \
@@ -127,4 +127,4 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
     trainer.save_freq=5 \
     trainer.total_epochs=1 \
     trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto
\ No newline at end of file
+    trainer.resume_mode=auto
diff --git a/recipe/dapo/run_dapo_wo_ds_qwen2.5_32b.sh b/recipe/dapo/run_dapo_wo_ds_qwen2.5_32b.sh
index e1699695061..6064b5be6c1 100644
--- a/recipe/dapo/run_dapo_wo_ds_qwen2.5_32b.sh
+++ b/recipe/dapo/run_dapo_wo_ds_qwen2.5_32b.sh
@@ -56,7 +56,7 @@ gen_tp=4
 
 ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
     --working-dir "${WORKING_DIR}" \
-    -- python3 -m recipe.dapo.src.main_dapo \
+    -- python3 -m recipe.dapo.main_dapo \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
     data.prompt_key=prompt \
@@ -122,4 +122,4 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
     trainer.save_freq=5 \
     trainer.total_epochs=1 \
     trainer.default_local_dir="${CKPTS_DIR}" \
-    trainer.resume_mode=auto
\ No newline at end of file
+    trainer.resume_mode=auto
diff --git a/recipe/dapo/test_dapo_7b.sh b/recipe/dapo/test_dapo_7b.sh
index 12a60cbabb2..fe5cb297bc1 100644
--- a/recipe/dapo/test_dapo_7b.sh
+++ b/recipe/dapo/test_dapo_7b.sh
@@ -55,7 +55,7 @@ offload=False
 
 ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
     --working-dir "${WORKING_DIR}" \
-    -- python3 -m recipe.dapo.src.main_dapo \
+    -- python3 -m recipe.dapo.main_dapo \
     data.train_files="${TRAIN_FILE}" \
     data.val_files="${TEST_FILE}" \
     data.prompt_key=prompt \
diff --git a/tests/generation/run_gen_qwen05.sh b/tests/e2e/generation/run_gen_qwen05.sh
similarity index 100%
rename from tests/generation/run_gen_qwen05.sh
rename to tests/e2e/generation/run_gen_qwen05.sh
diff --git a/tests/model/test_transformer.py b/tests/models/test_transformer.py
similarity index 100%
rename from tests/model/test_transformer.py
rename to tests/models/test_transformer.py
diff --git a/tests/model/test_transformers_ulysses.py b/tests/models/test_transformers_ulysses.py
similarity index 100%
rename from tests/model/test_transformers_ulysses.py
rename to tests/models/test_transformers_ulysses.py
diff --git a/tests/verl/test_protocol.py b/tests/test_protocol.py
similarity index 100%
rename from tests/verl/test_protocol.py
rename to tests/test_protocol.py
diff --git a/tests/checkpoint/test_fsdp_ckpt.py b/tests/utils/checkpoint/test_fsdp_ckpt.py
similarity index 100%
rename from tests/checkpoint/test_fsdp_ckpt.py
rename to tests/utils/checkpoint/test_fsdp_ckpt.py
diff --git a/tests/verl/utils/dataset/test_multiturn_sft_dataset.py b/tests/utils/dataset/test_multiturn_sft_dataset.py
similarity index 100%
rename from tests/verl/utils/dataset/test_multiturn_sft_dataset.py
rename to tests/utils/dataset/test_multiturn_sft_dataset.py
diff --git a/tests/verl/utils/dataset/test_rl_dataset.py b/tests/utils/dataset/test_rl_dataset.py
similarity index 100%
rename from tests/verl/utils/dataset/test_rl_dataset.py
rename to tests/utils/dataset/test_rl_dataset.py
diff --git a/tests/verl/utils/dataset/test_rm_dataset.py b/tests/utils/dataset/test_rm_dataset.py
similarity index 100%
rename from tests/verl/utils/dataset/test_rm_dataset.py
rename to tests/utils/dataset/test_rm_dataset.py
diff --git a/tests/verl/utils/dataset/test_sft_dataset.py b/tests/utils/dataset/test_sft_dataset.py
similarity index 100%
rename from tests/verl/utils/dataset/test_sft_dataset.py
rename to tests/utils/dataset/test_sft_dataset.py
diff --git a/tests/verl/utils/test_import_utils.py b/tests/utils/test_import_utils.py
similarity index 100%
rename from tests/verl/utils/test_import_utils.py
rename to tests/utils/test_import_utils.py
diff --git a/tests/verl/utils/test_model.py b/tests/utils/test_model.py
similarity index 100%
rename from tests/verl/utils/test_model.py
rename to tests/utils/test_model.py
diff --git a/tests/verl/utils/test_module.py b/tests/utils/test_module.py
similarity index 100%
rename from tests/verl/utils/test_module.py
rename to tests/utils/test_module.py
diff --git a/tests/rollout/async_rollout_utils.py b/tests/workers/rollout/async_rollout_utils.py
similarity index 100%
rename from tests/rollout/async_rollout_utils.py
rename to tests/workers/rollout/async_rollout_utils.py
diff --git a/tests/rollout/run_fsdp_vllm.py b/tests/workers/rollout/run_fsdp_vllm.py
similarity index 100%
rename from tests/rollout/run_fsdp_vllm.py
rename to tests/workers/rollout/run_fsdp_vllm.py
diff --git a/tests/rollout/test_hf_rollout.py b/tests/workers/rollout/test_hf_rollout.py
similarity index 100%
rename from tests/rollout/test_hf_rollout.py
rename to tests/workers/rollout/test_hf_rollout.py
diff --git a/tests/rollout/test_sglang_async_rollout_w_tools.py b/tests/workers/rollout/test_sglang_async_rollout_w_tools.py
similarity index 100%
rename from tests/rollout/test_sglang_async_rollout_w_tools.py
rename to tests/workers/rollout/test_sglang_async_rollout_w_tools.py
diff --git a/tests/rollout/test_sglang_async_spmd.py b/tests/workers/rollout/test_sglang_async_spmd.py
similarity index 100%
rename from tests/rollout/test_sglang_async_spmd.py
rename to tests/workers/rollout/test_sglang_async_spmd.py
diff --git a/tests/rollout/test_sglang_spmd.py b/tests/workers/rollout/test_sglang_spmd.py
similarity index 100%
rename from tests/rollout/test_sglang_spmd.py
rename to tests/workers/rollout/test_sglang_spmd.py
diff --git a/tests/rollout/test_vllm_hf_loader.py b/tests/workers/rollout/test_vllm_hf_loader.py
similarity index 100%
rename from tests/rollout/test_vllm_hf_loader.py
rename to tests/workers/rollout/test_vllm_hf_loader.py
diff --git a/tests/rollout/test_vllm_multi_turn.py b/tests/workers/rollout/test_vllm_multi_turn.py
similarity index 100%
rename from tests/rollout/test_vllm_multi_turn.py
rename to tests/workers/rollout/test_vllm_multi_turn.py
diff --git a/tests/rollout/test_vllm_spmd.py b/tests/workers/rollout/test_vllm_spmd.py
similarity index 100%
rename from tests/rollout/test_vllm_spmd.py
rename to tests/workers/rollout/test_vllm_spmd.py
diff --git a/tests/rollout/test_vllm_tool_calling.py b/tests/workers/rollout/test_vllm_tool_calling.py
similarity index 100%
rename from tests/rollout/test_vllm_tool_calling.py
rename to tests/workers/rollout/test_vllm_tool_calling.py
diff --git a/tests/rollout/utils_sglang.py b/tests/workers/rollout/utils_sglang.py
similarity index 100%
rename from tests/rollout/utils_sglang.py
rename to tests/workers/rollout/utils_sglang.py

From 0d55acb640fb8be4eb7261658cdcba4701184ccd Mon Sep 17 00:00:00 2001
From: Haibin Lin <haibin.lin@bytedance.com>
Date: Sun, 4 May 2025 14:29:23 -0700
Subject: [PATCH 2/7] fix paths

---
 .github/workflows/model.yml | 20 ++++++------
 .github/workflows/vllm.yml  |  2 +-
 docs/api/trainer.rst        | 22 +++++++++++++
 docs/api/workers.rst        | 61 +++++++++++++++++++++++++++++++++++++
 docs/perf/perf_tuning.rst   |  4 +--
 5 files changed, 96 insertions(+), 13 deletions(-)
 create mode 100644 docs/api/trainer.rst
 create mode 100644 docs/api/workers.rst

diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml
index 61751cdb73a..332db307694 100644
--- a/.github/workflows/model.yml
+++ b/.github/workflows/model.yml
@@ -14,7 +14,7 @@ on:
     paths:
       - "verl/**/*.py"
       - "tests/**/*.sh"
-      - "tests/model/*"
+      - "tests/models/*"
       - .github/workflows/model.yml
       - "!recipe/**"
 
@@ -45,37 +45,37 @@ jobs:
           pip3 install --upgrade transformers
       - name: Running rmpad model tests on 8 L20 GPUs + flash_attn 2.5.8
         run: |
-          pytest -s tests/model/test_transformer.py
+          pytest -s tests/models/test_transformer.py
       - name: Running rmpad model tests on 8 L20 GPUs + latest flash_attn
         run: |
           pip3 install --upgrade flash_attn --no-build-isolation
-          pytest -s tests/model/test_transformer.py
+          pytest -s tests/models/test_transformer.py
       - name: Running FSDP rmpad model tests on 8 L20 GPUs + latest flash_attn
         run: |
-          torchrun --nproc_per_node=8 tests/checkpoint/test_fsdp_ckpt.py
+          torchrun --nproc_per_node=8 tests/utils/checkpoint/test_fsdp_ckpt.py
       - name: Running transformers ulysses tests on 8 L20 GPUs + latest transformers
         run: |
-          torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py
+          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
       - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.49.0
         run: |
           pip3 install transformers==4.49.0
-          torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py
+          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
       - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.48.0
         run: |
           pip3 install transformers==4.48.0
-          torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py
+          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
       - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.47.0
         run: |
           pip3 install transformers==4.47.0
-          torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py
+          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
       - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.46.0
         run: |
           pip3 install transformers==4.46.0
-          torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py
+          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
       - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.45.0
         run: |
           pip3 install transformers==4.45.0
-          torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py
+          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
       - name: Run distributed test
         run: |
           bash tests/distributed/run_all.sh
diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml
index 8b086b4bf87..96b50cc2164 100644
--- a/.github/workflows/vllm.yml
+++ b/.github/workflows/vllm.yml
@@ -91,4 +91,4 @@ jobs:
       - name: Running multi-turn rollout tests on 8 L20 GPUs
         run: |
           pip3 install --upgrade vllm==0.8.3 tensordict==0.7.2
-          python3 tests/rollout/test_vllm_multi_turn.py
+          python3 tests/workers/rollout/test_vllm_multi_turn.py
diff --git a/docs/api/trainer.rst b/docs/api/trainer.rst
new file mode 100644
index 00000000000..791f56af6cf
--- /dev/null
+++ b/docs/api/trainer.rst
@@ -0,0 +1,22 @@
+Trainers
+=========================
+
+Trainers drive the training loop. Introducing new trainer classes for new training paradigms is encouraged.
+
+.. autosummary::
+   :nosignatures:
+
+   verl.trainer.fsdp_sft_trainer.FSDPSFTTrainer
+   verl.trainer.ppo.ray_trainer.RayPPOTrainer
+
+
+Core APIs
+~~~~~~~~~~~~~~~~~
+
+.. autoclass::  verl.trainer.ppo.ray_trainer.RayPPOTrainer
+
+.. automodule:: verl.utils.tokenizer
+   :members: hf_tokenizer
+
+.. automodule:: verl.single_controller
+   :members: Worker, WorkerGroup, ClassWithInitArgs, ResourcePool
diff --git a/docs/api/workers.rst b/docs/api/workers.rst
new file mode 100644
index 00000000000..34889851475
--- /dev/null
+++ b/docs/api/workers.rst
@@ -0,0 +1,61 @@
+Data interface
+=========================
+
+DataProto is the interface for data exchange.
+
+The :class:`verl.DataProto` class contains two key members:
+
+- batch: a :class:`tensordict.TensorDict` object for the actual data
+- meta_info: a :class:`Dict` with additional meta information
+
+TensorDict
+~~~~~~~~~~~~
+
+:attr:`DataProto.batch` is built on top of :class:`tensordict`, a project in the PyTorch ecosystem.
+A TensorDict is a dict-like container for tensors. To instantiate a TensorDict, you must specify key-value pairs as well as the batch size.
+
+.. code-block:: python
+
+    >>> import torch
+    >>> from tensordict import TensorDict
+    >>> tensordict = TensorDict({"zeros": torch.zeros(2, 3, 4), "ones": torch.ones(2, 3, 5)}, batch_size=[2,])
+    >>> tensordict["twos"] = 2 * torch.ones(2, 5, 6)
+    >>> zeros = tensordict["zeros"]
+    >>> tensordict
+    TensorDict(
+    fields={
+        ones: Tensor(shape=torch.Size([2, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
+        twos: Tensor(shape=torch.Size([2, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
+        zeros: Tensor(shape=torch.Size([2, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
+    batch_size=torch.Size([2]),
+    device=None,
+    is_shared=False)
+
+One can also index a tensordict along its batch_size. The contents of the TensorDict can be manipulated collectively as well.
+
+.. code-block:: python
+
+    >>> tensordict[..., :1]
+    TensorDict(
+    fields={
+        ones: Tensor(shape=torch.Size([1, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
+        twos: Tensor(shape=torch.Size([1, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
+        zeros: Tensor(shape=torch.Size([1, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
+    batch_size=torch.Size([1]),
+    device=None,
+    is_shared=False)
+    >>> tensordict = tensordict.to("cuda:0")
+    >>> tensordict = tensordict.reshape(6)
+
+For more about :class:`tensordict.TensorDict` usage, see the official tensordict_ documentation.
+
+.. _tensordict: https://pytorch.org/tensordict/overview.html
+
+
+Core APIs
+~~~~~~~~~~~~~~~~~
+
+
+.. autoclass::  verl.workers.actor.DataParallelPPOActor
+
+.. autoclass::  verl.workers.actor.DataParallelPPOActor
diff --git a/docs/perf/perf_tuning.rst b/docs/perf/perf_tuning.rst
index fa2d416d69f..9a509ce1102 100644
--- a/docs/perf/perf_tuning.rst
+++ b/docs/perf/perf_tuning.rst
@@ -51,12 +51,12 @@ Currently, for llama, mistral, gemma1 and qwen based models, users can enable `u
 sequence packing implementation provided by transformers library.
 
 For other models, transformers library may also support it but we haven't tested it yet.
-Users can add the desired model config to the  `test_transformer.py <https://github.com/volcengine/verl/blob/main/tests/model/test_transformer.py#L24>`_ file.
+Users can add the desired model config to the  `test_transformer.py <https://github.com/volcengine/verl/blob/main/tests/models/test_transformer.py#L24>`_ file.
 And test its functionaility by running the following command:
 
 .. code-block:: bash
 
-  pytest -s tests/model/test_transformer.py
+  pytest -s tests/models/test_transformer.py
 
 If the test passes, you can add your desired model into the model `registry.py <https://github.com/volcengine/verl/blob/main/verl/models/registry.py#L24>`_ file.
 Then, you can enjoy the performance boost of sequence packing

From df2a2b68c07b86acec1c859e2e6670bf06fdba7c Mon Sep 17 00:00:00 2001
From: Haibin Lin <haibin.lin@bytedance.com>
Date: Sun, 4 May 2025 14:31:10 -0700
Subject: [PATCH 3/7] remove unintended files

---
 docs/api/trainer.rst |  1 -
 docs/api/workers.rst | 61 --------------------------------------------
 2 files changed, 62 deletions(-)
 delete mode 100644 docs/api/workers.rst

diff --git a/docs/api/trainer.rst b/docs/api/trainer.rst
index 791f56af6cf..d890b7341c6 100644
--- a/docs/api/trainer.rst
+++ b/docs/api/trainer.rst
@@ -6,7 +6,6 @@ Trainers drive the training loop. Introducing new trainer classes in case of new
 .. autosummary::
    :nosignatures:
 
-   verl.trainer.fsdp_sft_trainer.FSDPSFTTrainer
    verl.trainer.ppo.ray_trainer.RayPPOTrainer
 
 
diff --git a/docs/api/workers.rst b/docs/api/workers.rst
deleted file mode 100644
index 34889851475..00000000000
--- a/docs/api/workers.rst
+++ /dev/null
@@ -1,61 +0,0 @@
-Data interface
-=========================
-
-DataProto is the interface for data exchange.
-
-The :class:`verl.DataProto` class contains two key members:
-
-- batch: a :class:`tensordict.TensorDict` object for the actual data
-- meta_info: a :class:`Dict` with additional meta information
-
-TensorDict
-~~~~~~~~~~~~
-
-:attr:`DataProto.batch` is built on top of :class:`tensordict`, a project in the PyTorch ecosystem.
-A TensorDict is a dict-like container for tensors. To instantiate a TensorDict, you must specify key-value pairs as well as the batch size.
-
-.. code-block:: python
-
-    >>> import torch
-    >>> from tensordict import TensorDict
-    >>> tensordict = TensorDict({"zeros": torch.zeros(2, 3, 4), "ones": torch.ones(2, 3, 5)}, batch_size=[2,])
-    >>> tensordict["twos"] = 2 * torch.ones(2, 5, 6)
-    >>> zeros = tensordict["zeros"]
-    >>> tensordict
-    TensorDict(
-    fields={
-        ones: Tensor(shape=torch.Size([2, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
-        twos: Tensor(shape=torch.Size([2, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
-        zeros: Tensor(shape=torch.Size([2, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
-    batch_size=torch.Size([2]),
-    device=None,
-    is_shared=False)
-
-One can also index a tensordict along its batch_size. The contents of the TensorDict can be manipulated collectively as well.
-
-.. code-block:: python
-
-    >>> tensordict[..., :1]
-    TensorDict(
-    fields={
-        ones: Tensor(shape=torch.Size([1, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
-        twos: Tensor(shape=torch.Size([1, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
-        zeros: Tensor(shape=torch.Size([1, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
-    batch_size=torch.Size([1]),
-    device=None,
-    is_shared=False)
-    >>> tensordict = tensordict.to("cuda:0")
-    >>> tensordict = tensordict.reshape(6)
-
-For more about :class:`tensordict.TensorDict` usage, see the official tensordict_ documentation.
-
-.. _tensordict: https://pytorch.org/tensordict/overview.html
-
-
-Core APIs
-~~~~~~~~~~~~~~~~~
-
-
-.. autoclass::  verl.workers.actor.DataParallelPPOActor
-
-.. autoclass::  verl.workers.actor.DataParallelPPOActor

From 6ed42a3293d3069dbe2b3f8ffa45fbdada05b0ac Mon Sep 17 00:00:00 2001
From: Haibin Lin <haibin.lin@bytedance.com>
Date: Sun, 4 May 2025 14:45:11 -0700
Subject: [PATCH 4/7] ignore ckpt gpu tests

---
 .github/workflows/verl_unit_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/verl_unit_test.yml b/.github/workflows/verl_unit_test.yml
index ef1ee78ab33..43a6e4ed22d 100644
--- a/.github/workflows/verl_unit_test.yml
+++ b/.github/workflows/verl_unit_test.yml
@@ -48,4 +48,4 @@ jobs:
       - name: Running utils tests 
         run: |
           cd tests/utils
-          pytest -s -x --ignore=dataset/ .
+          pytest -s -x --ignore=dataset/ --ignore=checkpoint/ .

From fbf7524cdbd1dc79bd167a8f77bd461ad7825b4e Mon Sep 17 00:00:00 2001
From: Haibin Lin <haibin.lin@bytedance.com>
Date: Sun, 4 May 2025 18:31:34 -0700
Subject: [PATCH 5/7] fix tests

---
 tests/e2e/run_dapo.sh                         | 2 +-
 tests/workers/rollout/test_vllm_multi_turn.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/e2e/run_dapo.sh b/tests/e2e/run_dapo.sh
index 34f520ab0c5..ef748dd92fb 100644
--- a/tests/e2e/run_dapo.sh
+++ b/tests/e2e/run_dapo.sh
@@ -41,7 +41,7 @@ gen_prompt_bsz=$((train_prompt_bsz * 4))
 
 exp_name="$(basename "${MODEL_ID,,}")-dapo-minimal"
 
-python3 -m recipe.dapo.src.main_dapo \
+python3 -m recipe.dapo.main_dapo \
     data.train_files="${HOME}/data/gsm8k/train.parquet" \
     data.val_files="${HOME}/data/gsm8k/test.parquet" \
     reward_model.reward_manager=dapo \
diff --git a/tests/workers/rollout/test_vllm_multi_turn.py b/tests/workers/rollout/test_vllm_multi_turn.py
index ea683b83024..f691cfe9dd4 100644
--- a/tests/workers/rollout/test_vllm_multi_turn.py
+++ b/tests/workers/rollout/test_vllm_multi_turn.py
@@ -21,7 +21,7 @@
 from openai.types.chat.chat_completion import ChatCompletion
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ChatCompletionResponse, ChatCompletionStreamResponse, ErrorResponse
 
-from tests.rollout.async_rollout_utils import init_async_rollout_manager
+from tests.workers.rollout.async_rollout_utils import init_async_rollout_manager
 from verl.protocol import DataProto
 
 

From b1a51534e7781f96a60ab67ca954ae6080b05182 Mon Sep 17 00:00:00 2001
From: Haibin Lin <haibin.lin@bytedance.com>
Date: Fri, 9 May 2025 11:19:24 -0700
Subject: [PATCH 6/7] fix lint

---
 tests/utility/test_timeout_decorator.py       | 11 ++++-----
 tests/utils/test_flops_counter.py             |  5 ++--
 verl/single_controller/base/worker.py         |  1 +
 verl/utils/megatron_utils.py                  |  4 ++--
 verl/utils/reward_score/math_dapo.py          |  2 +-
 .../utils/reward_score/prime_math/__init__.py |  3 ++-
 verl/utils/reward_score/prime_math/grader.py  |  1 -
 verl/workers/megatron_workers.py              |  1 +
 .../sharding_manager/megatron_sglang.py       | 23 ++++---------------
 9 files changed, 18 insertions(+), 33 deletions(-)

diff --git a/tests/utility/test_timeout_decorator.py b/tests/utility/test_timeout_decorator.py
index e9f78a7c6c9..57b563bce69 100644
--- a/tests/utility/test_timeout_decorator.py
+++ b/tests/utility/test_timeout_decorator.py
@@ -12,17 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import time
-import os
-import sys
 import multiprocessing
-import queue
-import pytest # Import pytest
-from functools import wraps
+import sys
+import threading
+import time
 
+import pytest  # Import pytest
 
 from verl.utils.py_functional import timeout_limit as timeout
-import threading
 
 # --- Test Task Functions ---
 TEST_TIMEOUT_SECONDS = 1.5 # Timeout duration for tests
diff --git a/tests/utils/test_flops_counter.py b/tests/utils/test_flops_counter.py
index c0420989783..c8d3589e9d3 100644
--- a/tests/utils/test_flops_counter.py
+++ b/tests/utils/test_flops_counter.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pytest
 import math
-import json
+
+import pytest
+
 from verl.utils.flops_counter import FlopsCounter
 
 VALID_CONFIG_TYPE = {"llama", "qwen2", "qwen3", "qwen3_moe", "deepseek_v3"}
diff --git a/verl/single_controller/base/worker.py b/verl/single_controller/base/worker.py
index 8acb4b6a040..7e7a3f2d9ae 100644
--- a/verl/single_controller/base/worker.py
+++ b/verl/single_controller/base/worker.py
@@ -136,6 +136,7 @@ def _configure_before_init(self, register_center_name: str, rank: int):
     def __init__(self, cuda_visible_devices=None) -> None:
         # construct a meta from environment variable. Note that the import must be inside the class because it is executed remotely
         import os
+
         import torch
         from packaging import version
 
diff --git a/verl/utils/megatron_utils.py b/verl/utils/megatron_utils.py
index 60b220b52c0..a16b32ca4d3 100644
--- a/verl/utils/megatron_utils.py
+++ b/verl/utils/megatron_utils.py
@@ -31,9 +31,9 @@
 from megatron.core.utils import get_attr_wrapped_model
 from transformers import PretrainedConfig
 
-from verl.utils.torch_dtypes import PrecisionType
-from verl.utils.model import normalize_model_name
 import verl.utils.megatron.tensor_parallel as tp_utils
+from verl.utils.model import normalize_model_name
+from verl.utils.torch_dtypes import PrecisionType
 
 
 def get_model_config(model):
diff --git a/verl/utils/reward_score/math_dapo.py b/verl/utils/reward_score/math_dapo.py
index d48f68bf4ec..33a699e5641 100644
--- a/verl/utils/reward_score/math_dapo.py
+++ b/verl/utils/reward_score/math_dapo.py
@@ -14,9 +14,9 @@
 # Adapted from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/utils.py
 
 import re
-import signal
 from typing import Optional
 
+
 def last_boxed_only_string(string: str) -> Optional[str]:
     """Extract the last LaTeX boxed expression from a string.
 
diff --git a/verl/utils/reward_score/prime_math/__init__.py b/verl/utils/reward_score/prime_math/__init__.py
index b29a6dc971c..f24c78b917a 100644
--- a/verl/utils/reward_score/prime_math/__init__.py
+++ b/verl/utils/reward_score/prime_math/__init__.py
@@ -28,9 +28,10 @@
 from pylatexenc import latex2text
 from sympy.parsing import sympy_parser
 
+from verl.utils.py_functional import timeout_limit
+
 from . import math_normalize
 from .grader import math_equal
-from verl.utils.py_functional import timeout_limit
 
 # import math_normalize
 # from grader import math_equal
diff --git a/verl/utils/reward_score/prime_math/grader.py b/verl/utils/reward_score/prime_math/grader.py
index 55048ac8dbe..e2d5fe4862c 100644
--- a/verl/utils/reward_score/prime_math/grader.py
+++ b/verl/utils/reward_score/prime_math/grader.py
@@ -95,7 +95,6 @@
 import contextlib
 import math
 import re
-import signal
 from math import isclose
 from typing import Union
 
diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py
index 3cbcacc6c76..2828448cf18 100644
--- a/verl/workers/megatron_workers.py
+++ b/verl/workers/megatron_workers.py
@@ -262,6 +262,7 @@ def _build_rollout(self, trust_remote_code=False):
             log_gpu_memory_usage("After building sharding manager", logger=logger)
         elif self.config.rollout.name == 'sglang':
             from verl.workers.rollout.sglang_rollout import SGLangRollout
+
             # NOTE(linjunrong): Due to recent fp8 support in SGLang. Now importing any symbol relate to SGLang's model_runner would check CUDA device capability.
             # However, due to veRL's setting, the main process of ray can not find any CUDA device, which would potentially lead to:
             # "RuntimeError: No CUDA GPUs are available".
diff --git a/verl/workers/sharding_manager/megatron_sglang.py b/verl/workers/sharding_manager/megatron_sglang.py
index 5d4167916ff..817867a5a49 100644
--- a/verl/workers/sharding_manager/megatron_sglang.py
+++ b/verl/workers/sharding_manager/megatron_sglang.py
@@ -15,19 +15,13 @@
 This file contains a Megatron style Hybrid Engine that shares the weights of the actor with the inference engine.
 """
 
-import importlib
 import logging
 import os
+
 import torch
-import torch.distributed as dist
 from torch import nn
 
-from verl.utils.model import normalize_model_name
-from verl.utils.megatron_utils import broadcast_from_megatron_pp, broadcast_str_from_megatron_pp
-
-from verl.utils.megatron_utils import get_model, unwrap_model
 from verl.utils.debug import log_gpu_memory_usage
-from verl.utils.megatron_utils import convert_megatron_model_to_transformers_model
 
 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv('VERL_PPO_LOGGING_LEVEL', 'WARN'))
@@ -40,23 +34,14 @@
 - After inference, all the parameters that doesn't belong to this pp rank is freed.
 """
 
-from .base import BaseShardingManager
-
-import torch
-from torch import nn
 import torch.distributed
+from sglang.srt.entrypoints.verl_engine import VerlEngine
 from torch.distributed import new_group
-from torch.distributed._tensor import DTensor
-from typing import Dict, Iterable, Union, Tuple
 
-from verl import DataProto
-from verl.protocol import all_gather_data_proto
-from verl.utils.torch_functional import (broadcast_dict_tensor, allgather_dict_tensors)
-from sglang.srt.entrypoints.verl_engine import VerlEngine
 from verl.utils.debug import GPUMemoryLogger
+from verl.utils.megatron_utils import per_tensor_generator
 
-import verl.utils.megatron.tensor_parallel as tp_utils
-from verl.utils.megatron_utils import per_tensor_generator, default_tp_concat_fn
+from .base import BaseShardingManager
 
 _MICRO_DATA_PARALLEL_GROUP = None
 

From 537de291ab70dc7be9e2492b8551c87dc1b0bf9a Mon Sep 17 00:00:00 2001
From: Haibin Lin <haibin.lin@bytedance.com>
Date: Fri, 9 May 2025 13:17:03 -0700
Subject: [PATCH 7/7] fix tests

---
 .github/workflows/model.yml          | 2 +-
 .github/workflows/verl_unit_test.yml | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml
index 15eb79e2083..6193321a3e7 100644
--- a/.github/workflows/model.yml
+++ b/.github/workflows/model.yml
@@ -106,4 +106,4 @@ jobs:
       - name: Running FSDP2 rmpad model tests on 8 L20 GPUs + latest flash_attn
         run: |
           pip3 install --upgrade flash_attn --no-build-isolation
-          STRATEGY=fsdp2 torchrun --nproc_per_node=8 tests/checkpoint/test_fsdp_ckpt.py
+          STRATEGY=fsdp2 torchrun --nproc_per_node=8 tests/utils/checkpoint/test_fsdp_ckpt.py
diff --git a/.github/workflows/verl_unit_test.yml b/.github/workflows/verl_unit_test.yml
index a84a52f42e4..1d4028fb3b4 100644
--- a/.github/workflows/verl_unit_test.yml
+++ b/.github/workflows/verl_unit_test.yml
@@ -45,7 +45,7 @@ jobs:
         run: |
           cd tests
           pytest -s -x test_protocol.py
-      - name: Running utils tests
+      - name: Running utils tests
         run: |
           cd tests/utils
-          pytest -s -x --ignore=dataset/ --ignore=checkpoint/ .
+          pytest -s -x --ignore=dataset/ --ignore=checkpoint/ --ignore=test_flops_counter.py .