From 361ad071b8fa38fec3b71c155f08d48b62798e13 Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Sun, 4 May 2025 13:28:20 -0700 Subject: [PATCH 1/7] [tests] refactor: move test files to their own namespaces (verl/xxx -> tests/xxx) --- .github/workflows/dataset.yml | 8 ++++---- .github/workflows/sgl.yml | 6 +++--- .github/workflows/verl_unit_test.yml | 4 ++-- .github/workflows/vllm.yml | 10 +++++----- .gitignore | 3 --- recipe/dapo/{src => }/config/dapo_trainer.yaml | 0 recipe/dapo/{src => }/dapo_ray_trainer.py | 0 recipe/dapo/{src => }/main_dapo.py | 0 recipe/dapo/run_dapo_early_qwen2.5_32b.sh | 4 ++-- recipe/dapo/run_dapo_qwen2.5_32b.sh | 4 ++-- recipe/dapo/run_dapo_wo_ds_qwen2.5_32b.sh | 4 ++-- recipe/dapo/test_dapo_7b.sh | 2 +- tests/{ => e2e}/generation/run_gen_qwen05.sh | 0 tests/{model => models}/test_transformer.py | 0 tests/{model => models}/test_transformers_ulysses.py | 0 tests/{verl => }/test_protocol.py | 0 tests/{ => utils}/checkpoint/test_fsdp_ckpt.py | 0 .../utils/dataset/test_multiturn_sft_dataset.py | 0 tests/{verl => }/utils/dataset/test_rl_dataset.py | 0 tests/{verl => }/utils/dataset/test_rm_dataset.py | 0 tests/{verl => }/utils/dataset/test_sft_dataset.py | 0 tests/{verl => }/utils/test_import_utils.py | 0 tests/{verl => }/utils/test_model.py | 0 tests/{verl => }/utils/test_module.py | 0 tests/{ => workers}/rollout/async_rollout_utils.py | 0 tests/{ => workers}/rollout/run_fsdp_vllm.py | 0 tests/{ => workers}/rollout/test_hf_rollout.py | 0 .../rollout/test_sglang_async_rollout_w_tools.py | 0 tests/{ => workers}/rollout/test_sglang_async_spmd.py | 0 tests/{ => workers}/rollout/test_sglang_spmd.py | 0 tests/{ => workers}/rollout/test_vllm_hf_loader.py | 0 tests/{ => workers}/rollout/test_vllm_multi_turn.py | 0 tests/{ => workers}/rollout/test_vllm_spmd.py | 0 tests/{ => workers}/rollout/test_vllm_tool_calling.py | 0 tests/{ => workers}/rollout/utils_sglang.py | 0 35 files changed, 21 insertions(+), 24 deletions(-) rename recipe/dapo/{src => }/config/dapo_trainer.yaml (100%) rename recipe/dapo/{src => }/dapo_ray_trainer.py (100%) rename recipe/dapo/{src => }/main_dapo.py (100%) rename tests/{ => e2e}/generation/run_gen_qwen05.sh (100%) rename tests/{model => models}/test_transformer.py (100%) rename tests/{model => models}/test_transformers_ulysses.py (100%) rename tests/{verl => }/test_protocol.py (100%) rename tests/{ => utils}/checkpoint/test_fsdp_ckpt.py (100%) rename tests/{verl => }/utils/dataset/test_multiturn_sft_dataset.py (100%) rename tests/{verl => }/utils/dataset/test_rl_dataset.py (100%) rename tests/{verl => }/utils/dataset/test_rm_dataset.py (100%) rename tests/{verl => }/utils/dataset/test_sft_dataset.py (100%) rename tests/{verl => }/utils/test_import_utils.py (100%) rename tests/{verl => }/utils/test_model.py (100%) rename tests/{verl => }/utils/test_module.py (100%) rename tests/{ => workers}/rollout/async_rollout_utils.py (100%) rename tests/{ => workers}/rollout/run_fsdp_vllm.py (100%) rename tests/{ => workers}/rollout/test_hf_rollout.py (100%) rename tests/{ => workers}/rollout/test_sglang_async_rollout_w_tools.py (100%) rename tests/{ => workers}/rollout/test_sglang_async_spmd.py (100%) rename tests/{ => workers}/rollout/test_sglang_spmd.py (100%) rename tests/{ => workers}/rollout/test_vllm_hf_loader.py (100%) rename tests/{ => workers}/rollout/test_vllm_multi_turn.py (100%) rename tests/{ => workers}/rollout/test_vllm_spmd.py (100%) rename tests/{ => workers}/rollout/test_vllm_tool_calling.py (100%) rename tests/{ => workers}/rollout/utils_sglang.py (100%) diff --git a/.github/workflows/dataset.yml b/.github/workflows/dataset.yml index 2535dd81fd3..445796e4a36 100644 --- a/.github/workflows/dataset.yml +++ b/.github/workflows/dataset.yml @@ -52,10 +52,10 @@ jobs: run: | [ ! -d "$HOME/verl-data" ] && git clone --depth 1 https://github.com/eric-haibin-lin/verl-data ~/verl-data python3 examples/data_preprocess/geo3k.py - pytest -s -x tests/verl/utils/dataset/test_rl_dataset.py - pytest -s -x tests/verl/utils/dataset/test_sft_dataset.py - pytest -s -x tests/verl/utils/test_import_utils.py - # pytest -s -x tests/verl/utils/dataset/test_rm_dataset.py + pytest -s -x tests/utils/dataset/test_rl_dataset.py + pytest -s -x tests/utils/dataset/test_sft_dataset.py + pytest -s -x tests/utils/test_import_utils.py + # pytest -s -x tests/utils/dataset/test_rm_dataset.py - name: Running ray test using cupy (move it to L20 when dockerfile ready) run: | cd tests/ray_gpu diff --git a/.github/workflows/sgl.yml b/.github/workflows/sgl.yml index 4a3c192a11c..111565dbc45 100644 --- a/.github/workflows/sgl.yml +++ b/.github/workflows/sgl.yml @@ -53,13 +53,13 @@ jobs: pip3 install -e .[test,gpu,sglang] --no-deps - name: Test the latest SGLang run: | - cd tests/rollout + cd tests/workers/rollout torchrun --nnodes=1 --nproc_per_node=4 $(which pytest) -s test_sglang_spmd.py - name: Test the latest SGLang async run: | - cd tests/rollout + cd tests/workers/rollout torchrun --nnodes=1 --nproc_per_node=2 $(which pytest) -s test_sglang_async_spmd.py - name: Test the latest SGLang Rollout async with tool run: | - cd tests/rollout + cd tests/workers/rollout torchrun --nnodes=1 --nproc_per_node=2 $(which pytest) -s test_sglang_async_rollout_w_tools.py diff --git a/.github/workflows/verl_unit_test.yml b/.github/workflows/verl_unit_test.yml index c55caca0ded..ef1ee78ab33 100644 --- a/.github/workflows/verl_unit_test.yml +++ b/.github/workflows/verl_unit_test.yml @@ -43,9 +43,9 @@ jobs: pip install -e .[test] - name: Running test protocol.py run: | - cd tests/verl + cd tests pytest -s -x test_protocol.py - name: Running utils tests run: | - cd tests/verl/utils + cd tests/utils pytest -s -x --ignore=dataset/ . diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index abe8bc78284..8b086b4bf87 100644 --- a/.github/workflows/vllm.yml +++ b/.github/workflows/vllm.yml @@ -15,7 +15,7 @@ on: - "**/*.py" # Entrypoints - ".github/workflows/vllm.yml" - - "tests/generation" + - "tests/e2e/generation" - "verl/trainer/main_generation.py" - "verl/trainer/config/generation.yaml" - "!examples" @@ -69,22 +69,22 @@ jobs: # Disable requests to avoid network errors - name: Running vllm tests on 8 L20 GPUs run: | - cd tests/rollout + cd tests/workers/rollout torchrun --standalone --nnodes=1 --nproc_per_node=8 $(which pytest) -s test_vllm_hf_loader.py - name: Test the latest vLLM run: | pip3 install --upgrade vllm==0.7.3 - cd tests/rollout + cd tests/workers/rollout torchrun --standalone --nnodes=1 --nproc_per_node=4 $(which pytest) -s test_vllm_spmd.py - name: Run Qwen 0.5B generation test run: | - cd tests/generation + cd tests/e2e/generation export OUTPUT_PATH="${HOME}/data/gen/qwen_05_gen_test.parquet" MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct NGPUS_PER_NODE=4 GEN_TP=2 bash ./run_gen_qwen05.sh rm -rf "${OUTPUT_PATH}" - name: Run Qwen 0.5B generation test when world_size == 1 run: | - cd tests/generation + cd tests/e2e/generation export OUTPUT_PATH="${HOME}/data/gen/qwen_05_gen_test.parquet" MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct NGPUS_PER_NODE=1 GEN_TP=1 bash ./run_gen_qwen05.sh rm -rf "${OUTPUT_PATH}" diff --git a/.gitignore b/.gitignore index 98df3516ee6..f444f0e4c9d 100644 --- a/.gitignore +++ b/.gitignore @@ -109,9 +109,6 @@ ENV/ # Mac .DS_Store -# output logs -tests/e2e/toy_examples/deepspeed/synchronous/output.txt - # vim *.swp diff --git a/recipe/dapo/src/config/dapo_trainer.yaml b/recipe/dapo/config/dapo_trainer.yaml similarity index 100% rename from recipe/dapo/src/config/dapo_trainer.yaml rename to recipe/dapo/config/dapo_trainer.yaml diff --git a/recipe/dapo/src/dapo_ray_trainer.py b/recipe/dapo/dapo_ray_trainer.py similarity index 100% rename from recipe/dapo/src/dapo_ray_trainer.py rename to recipe/dapo/dapo_ray_trainer.py diff --git a/recipe/dapo/src/main_dapo.py b/recipe/dapo/main_dapo.py similarity index 100% rename from recipe/dapo/src/main_dapo.py rename to recipe/dapo/main_dapo.py diff --git a/recipe/dapo/run_dapo_early_qwen2.5_32b.sh b/recipe/dapo/run_dapo_early_qwen2.5_32b.sh index 8ddfcb9445f..c7bd5c189e5 100644 --- a/recipe/dapo/run_dapo_early_qwen2.5_32b.sh +++ b/recipe/dapo/run_dapo_early_qwen2.5_32b.sh @@ -58,7 +58,7 @@ gen_tp=4 ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --working-dir "${WORKING_DIR}" \ - -- python3 -m recipe.dapo.src.main_dapo \ + -- python3 -m recipe.dapo.main_dapo \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ @@ -125,4 +125,4 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ trainer.save_freq=5 \ trainer.total_epochs=1 \ trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ No newline at end of file + trainer.resume_mode=auto diff --git a/recipe/dapo/run_dapo_qwen2.5_32b.sh b/recipe/dapo/run_dapo_qwen2.5_32b.sh index 4a85e0f2fcf..6eec26c80e0 100644 --- a/recipe/dapo/run_dapo_qwen2.5_32b.sh +++ b/recipe/dapo/run_dapo_qwen2.5_32b.sh @@ -58,7 +58,7 @@ gen_tp=4 ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --working-dir "${WORKING_DIR}" \ - -- python3 -m recipe.dapo.src.main_dapo \ + -- python3 -m recipe.dapo.main_dapo \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ @@ -127,4 +127,4 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ trainer.save_freq=5 \ trainer.total_epochs=1 \ trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ No newline at end of file + trainer.resume_mode=auto diff --git a/recipe/dapo/run_dapo_wo_ds_qwen2.5_32b.sh b/recipe/dapo/run_dapo_wo_ds_qwen2.5_32b.sh index e1699695061..6064b5be6c1 100644 --- a/recipe/dapo/run_dapo_wo_ds_qwen2.5_32b.sh +++ b/recipe/dapo/run_dapo_wo_ds_qwen2.5_32b.sh @@ -56,7 +56,7 @@ gen_tp=4 ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --working-dir "${WORKING_DIR}" \ - -- python3 -m recipe.dapo.src.main_dapo \ + -- python3 -m recipe.dapo.main_dapo \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ @@ -122,4 +122,4 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ trainer.save_freq=5 \ trainer.total_epochs=1 \ trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.resume_mode=auto \ No newline at end of file + trainer.resume_mode=auto diff --git a/recipe/dapo/test_dapo_7b.sh b/recipe/dapo/test_dapo_7b.sh index 12a60cbabb2..fe5cb297bc1 100644 --- a/recipe/dapo/test_dapo_7b.sh +++ b/recipe/dapo/test_dapo_7b.sh @@ -55,7 +55,7 @@ offload=False ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --working-dir "${WORKING_DIR}" \ - -- python3 -m recipe.dapo.src.main_dapo \ + -- python3 -m recipe.dapo.main_dapo \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git a/tests/generation/run_gen_qwen05.sh b/tests/e2e/generation/run_gen_qwen05.sh similarity index 100% rename from tests/generation/run_gen_qwen05.sh rename to tests/e2e/generation/run_gen_qwen05.sh diff --git a/tests/model/test_transformer.py b/tests/models/test_transformer.py similarity index 100% rename from tests/model/test_transformer.py rename to tests/models/test_transformer.py diff --git a/tests/model/test_transformers_ulysses.py b/tests/models/test_transformers_ulysses.py similarity index 100% rename from tests/model/test_transformers_ulysses.py rename to tests/models/test_transformers_ulysses.py diff --git a/tests/verl/test_protocol.py b/tests/test_protocol.py similarity index 100% rename from tests/verl/test_protocol.py rename to tests/test_protocol.py diff --git a/tests/checkpoint/test_fsdp_ckpt.py b/tests/utils/checkpoint/test_fsdp_ckpt.py similarity index 100% rename from tests/checkpoint/test_fsdp_ckpt.py rename to tests/utils/checkpoint/test_fsdp_ckpt.py diff --git a/tests/verl/utils/dataset/test_multiturn_sft_dataset.py b/tests/utils/dataset/test_multiturn_sft_dataset.py similarity index 100% rename from tests/verl/utils/dataset/test_multiturn_sft_dataset.py rename to tests/utils/dataset/test_multiturn_sft_dataset.py diff --git a/tests/verl/utils/dataset/test_rl_dataset.py b/tests/utils/dataset/test_rl_dataset.py similarity index 100% rename from tests/verl/utils/dataset/test_rl_dataset.py rename to tests/utils/dataset/test_rl_dataset.py diff --git a/tests/verl/utils/dataset/test_rm_dataset.py b/tests/utils/dataset/test_rm_dataset.py similarity index 100% rename from tests/verl/utils/dataset/test_rm_dataset.py rename to tests/utils/dataset/test_rm_dataset.py diff --git a/tests/verl/utils/dataset/test_sft_dataset.py b/tests/utils/dataset/test_sft_dataset.py similarity index 100% rename from tests/verl/utils/dataset/test_sft_dataset.py rename to tests/utils/dataset/test_sft_dataset.py diff --git a/tests/verl/utils/test_import_utils.py b/tests/utils/test_import_utils.py similarity index 100% rename from tests/verl/utils/test_import_utils.py rename to tests/utils/test_import_utils.py diff --git a/tests/verl/utils/test_model.py b/tests/utils/test_model.py similarity index 100% rename from tests/verl/utils/test_model.py rename to tests/utils/test_model.py diff --git a/tests/verl/utils/test_module.py b/tests/utils/test_module.py similarity index 100% rename from tests/verl/utils/test_module.py rename to tests/utils/test_module.py diff --git a/tests/rollout/async_rollout_utils.py b/tests/workers/rollout/async_rollout_utils.py similarity index 100% rename from tests/rollout/async_rollout_utils.py rename to tests/workers/rollout/async_rollout_utils.py diff --git a/tests/rollout/run_fsdp_vllm.py b/tests/workers/rollout/run_fsdp_vllm.py similarity index 100% rename from tests/rollout/run_fsdp_vllm.py rename to tests/workers/rollout/run_fsdp_vllm.py diff --git a/tests/rollout/test_hf_rollout.py b/tests/workers/rollout/test_hf_rollout.py similarity index 100% rename from tests/rollout/test_hf_rollout.py rename to tests/workers/rollout/test_hf_rollout.py diff --git a/tests/rollout/test_sglang_async_rollout_w_tools.py b/tests/workers/rollout/test_sglang_async_rollout_w_tools.py similarity index 100% rename from tests/rollout/test_sglang_async_rollout_w_tools.py rename to tests/workers/rollout/test_sglang_async_rollout_w_tools.py diff --git a/tests/rollout/test_sglang_async_spmd.py b/tests/workers/rollout/test_sglang_async_spmd.py similarity index 100% rename from tests/rollout/test_sglang_async_spmd.py rename to tests/workers/rollout/test_sglang_async_spmd.py diff --git a/tests/rollout/test_sglang_spmd.py b/tests/workers/rollout/test_sglang_spmd.py similarity index 100% rename from tests/rollout/test_sglang_spmd.py rename to tests/workers/rollout/test_sglang_spmd.py diff --git a/tests/rollout/test_vllm_hf_loader.py b/tests/workers/rollout/test_vllm_hf_loader.py similarity index 100% rename from tests/rollout/test_vllm_hf_loader.py rename to tests/workers/rollout/test_vllm_hf_loader.py diff --git a/tests/rollout/test_vllm_multi_turn.py b/tests/workers/rollout/test_vllm_multi_turn.py similarity index 100% rename from tests/rollout/test_vllm_multi_turn.py rename to tests/workers/rollout/test_vllm_multi_turn.py diff --git a/tests/rollout/test_vllm_spmd.py b/tests/workers/rollout/test_vllm_spmd.py similarity index 100% rename from tests/rollout/test_vllm_spmd.py rename to tests/workers/rollout/test_vllm_spmd.py diff --git a/tests/rollout/test_vllm_tool_calling.py b/tests/workers/rollout/test_vllm_tool_calling.py similarity index 100% rename from tests/rollout/test_vllm_tool_calling.py rename to tests/workers/rollout/test_vllm_tool_calling.py diff --git a/tests/rollout/utils_sglang.py b/tests/workers/rollout/utils_sglang.py similarity index 100% rename from tests/rollout/utils_sglang.py rename to tests/workers/rollout/utils_sglang.py From 0d55acb640fb8be4eb7261658cdcba4701184ccd Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Sun, 4 May 2025 14:29:23 -0700 Subject: [PATCH 2/7] fix paths --- .github/workflows/model.yml | 20 ++++++------ .github/workflows/vllm.yml | 2 +- docs/api/trainer.rst | 22 +++++++++++++ docs/api/workers.rst | 61 +++++++++++++++++++++++++++++++++++++ docs/perf/perf_tuning.rst | 4 +-- 5 files changed, 96 insertions(+), 13 deletions(-) create mode 100644 docs/api/trainer.rst create mode 100644 docs/api/workers.rst diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml index 61751cdb73a..332db307694 100644 --- a/.github/workflows/model.yml +++ b/.github/workflows/model.yml @@ -14,7 +14,7 @@ on: paths: - "verl/**/*.py" - "tests/**/*.sh" - - "tests/model/*" + - "tests/models/*" - .github/workflows/model.yml - "!recipe/**" @@ -45,37 +45,37 @@ jobs: pip3 install --upgrade transformers - name: Running rmpad model tests on 8 L20 GPUs + flash_attn 2.5.8 run: | - pytest -s tests/model/test_transformer.py + pytest -s tests/models/test_transformer.py - name: Running rmpad model tests on 8 L20 GPUs + latest flash_attn run: | pip3 install --upgrade flash_attn --no-build-isolation - pytest -s tests/model/test_transformer.py + pytest -s tests/models/test_transformer.py - name: Running FSDP rmpad model tests on 8 L20 GPUs + latest flash_attn run: | - torchrun --nproc_per_node=8 tests/checkpoint/test_fsdp_ckpt.py + torchrun --nproc_per_node=8 tests/utils/checkpoint/test_fsdp_ckpt.py - name: Running transformers ulysses tests on 8 L20 GPUs + latest transformers run: | - torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py + torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.49.0 run: | pip3 install transformers==4.49.0 - torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py + torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.48.0 run: | pip3 install transformers==4.48.0 - torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py + torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.47.0 run: | pip3 install transformers==4.47.0 - torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py + torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.46.0 run: | pip3 install transformers==4.46.0 - torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py + torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.45.0 run: | pip3 install transformers==4.45.0 - torchrun --nproc_per_node=8 -m pytest tests/model/test_transformers_ulysses.py + torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py - name: Run distributed test run: | bash tests/distributed/run_all.sh diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index 8b086b4bf87..96b50cc2164 100644 --- a/.github/workflows/vllm.yml +++ b/.github/workflows/vllm.yml @@ -91,4 +91,4 @@ jobs: - name: Running multi-turn rollout tests on 8 L20 GPUs run: | pip3 install --upgrade vllm==0.8.3 tensordict==0.7.2 - python3 tests/rollout/test_vllm_multi_turn.py + python3 tests/workers/rollout/test_vllm_multi_turn.py diff --git a/docs/api/trainer.rst b/docs/api/trainer.rst new file mode 100644 index 00000000000..791f56af6cf --- /dev/null +++ b/docs/api/trainer.rst @@ -0,0 +1,22 @@ +Trainers +========================= + +Trainers drive the training loop. Introducing new trainer classes in case of new training paradiam is encouraged. + +.. autosummary:: + :nosignatures: + + verl.trainer.fsdp_sft_trainer.FSDPSFTTrainer + verl.trainer.ppo.ray_trainer.RayPPOTrainer + + +Core APIs +~~~~~~~~~~~~~~~~~ + +.. autoclass:: verl.trainer.ppo.ray_trainer.RayPPOTrainer + +.. automodule:: verl.utils.tokenizer + :members: hf_tokenizer + +.. automodule:: verl.single_controller + :members: Worker, WorkerGroup, ClassWithInitArgs, ResourcePool diff --git a/docs/api/workers.rst b/docs/api/workers.rst new file mode 100644 index 00000000000..34889851475 --- /dev/null +++ b/docs/api/workers.rst @@ -0,0 +1,61 @@ +Data interface +========================= + +DataProto is the interface for data exchange. + +The :class:`verl.DataProto` class contains two key members: + +- batch: a :class:`tensordict.TensorDict` object for the actual data +- meta_info: a :class:`Dict` with additional meta information + +TensorDict +~~~~~~~~~~~~ + +:attr:`DataProto.batch` is built on top of :class:`tensordict`, a project in the PyTorch ecosystem. +A TensorDict is a dict-like container for tensors. To instantiate a TensorDict, you must specify key-value pairs as well as the batch size. + +.. code-block:: python + + >>> import torch + >>> from tensordict import TensorDict + >>> tensordict = TensorDict({"zeros": torch.zeros(2, 3, 4), "ones": torch.ones(2, 3, 5)}, batch_size=[2,]) + >>> tensordict["twos"] = 2 * torch.ones(2, 5, 6) + >>> zeros = tensordict["zeros"] + >>> tensordict + TensorDict( + fields={ + ones: Tensor(shape=torch.Size([2, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False), + twos: Tensor(shape=torch.Size([2, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False), + zeros: Tensor(shape=torch.Size([2, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)}, + batch_size=torch.Size([2]), + device=None, + is_shared=False) + +One can also index a tensordict along its batch_size. The contents of the TensorDict can be manipulated collectively as well. + +.. code-block:: python + + >>> tensordict[..., :1] + TensorDict( + fields={ + ones: Tensor(shape=torch.Size([1, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False), + twos: Tensor(shape=torch.Size([1, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False), + zeros: Tensor(shape=torch.Size([1, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)}, + batch_size=torch.Size([1]), + device=None, + is_shared=False) + >>> tensordict = tensordict.to("cuda:0") + >>> tensordict = tensordict.reshape(6) + +For more about :class:`tensordict.TensorDict` usage, see the official tensordict_ documentation. + +.. _tensordict: https://pytorch.org/tensordict/overview.html + + +Core APIs +~~~~~~~~~~~~~~~~~ + + +.. autoclass:: verl.workers.actor.DataParallelPPOActor + +.. autoclass:: verl.workers.actor.DataParallelPPOActor diff --git a/docs/perf/perf_tuning.rst b/docs/perf/perf_tuning.rst index fa2d416d69f..9a509ce1102 100644 --- a/docs/perf/perf_tuning.rst +++ b/docs/perf/perf_tuning.rst @@ -51,12 +51,12 @@ Currently, for llama, mistral, gemma1 and qwen based models, users can enable `u sequence packing implementation provided by transformers library. For other models, transformers library may also support it but we haven't tested it yet. -Users can add the desired model config to the `test_transformer.py `_ file. +Users can add the desired model config to the `test_transformer.py `_ file. And test its functionaility by running the following command: .. code-block:: bash - pytest -s tests/model/test_transformer.py + pytest -s tests/models/test_transformer.py If the test passes, you can add your desired model into the model `registry.py `_ file. Then, you can enjoy the performance boost of sequence packing From df2a2b68c07b86acec1c859e2e6670bf06fdba7c Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Sun, 4 May 2025 14:31:10 -0700 Subject: [PATCH 3/7] remove unintended files --- docs/api/trainer.rst | 1 - docs/api/workers.rst | 61 -------------------------------------------- 2 files changed, 62 deletions(-) delete mode 100644 docs/api/workers.rst diff --git a/docs/api/trainer.rst b/docs/api/trainer.rst index 791f56af6cf..d890b7341c6 100644 --- a/docs/api/trainer.rst +++ b/docs/api/trainer.rst @@ -6,7 +6,6 @@ Trainers drive the training loop. Introducing new trainer classes in case of new .. autosummary:: :nosignatures: - verl.trainer.fsdp_sft_trainer.FSDPSFTTrainer verl.trainer.ppo.ray_trainer.RayPPOTrainer diff --git a/docs/api/workers.rst b/docs/api/workers.rst deleted file mode 100644 index 34889851475..00000000000 --- a/docs/api/workers.rst +++ /dev/null @@ -1,61 +0,0 @@ -Data interface -========================= - -DataProto is the interface for data exchange. - -The :class:`verl.DataProto` class contains two key members: - -- batch: a :class:`tensordict.TensorDict` object for the actual data -- meta_info: a :class:`Dict` with additional meta information - -TensorDict -~~~~~~~~~~~~ - -:attr:`DataProto.batch` is built on top of :class:`tensordict`, a project in the PyTorch ecosystem. -A TensorDict is a dict-like container for tensors. To instantiate a TensorDict, you must specify key-value pairs as well as the batch size. - -.. code-block:: python - - >>> import torch - >>> from tensordict import TensorDict - >>> tensordict = TensorDict({"zeros": torch.zeros(2, 3, 4), "ones": torch.ones(2, 3, 5)}, batch_size=[2,]) - >>> tensordict["twos"] = 2 * torch.ones(2, 5, 6) - >>> zeros = tensordict["zeros"] - >>> tensordict - TensorDict( - fields={ - ones: Tensor(shape=torch.Size([2, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False), - twos: Tensor(shape=torch.Size([2, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False), - zeros: Tensor(shape=torch.Size([2, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)}, - batch_size=torch.Size([2]), - device=None, - is_shared=False) - -One can also index a tensordict along its batch_size. The contents of the TensorDict can be manipulated collectively as well. - -.. code-block:: python - - >>> tensordict[..., :1] - TensorDict( - fields={ - ones: Tensor(shape=torch.Size([1, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False), - twos: Tensor(shape=torch.Size([1, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False), - zeros: Tensor(shape=torch.Size([1, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)}, - batch_size=torch.Size([1]), - device=None, - is_shared=False) - >>> tensordict = tensordict.to("cuda:0") - >>> tensordict = tensordict.reshape(6) - -For more about :class:`tensordict.TensorDict` usage, see the official tensordict_ documentation. - -.. _tensordict: https://pytorch.org/tensordict/overview.html - - -Core APIs -~~~~~~~~~~~~~~~~~ - - -.. autoclass:: verl.workers.actor.DataParallelPPOActor - -.. autoclass:: verl.workers.actor.DataParallelPPOActor From 6ed42a3293d3069dbe2b3f8ffa45fbdada05b0ac Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Sun, 4 May 2025 14:45:11 -0700 Subject: [PATCH 4/7] ignore ckpt gpu tests --- .github/workflows/verl_unit_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/verl_unit_test.yml b/.github/workflows/verl_unit_test.yml index ef1ee78ab33..43a6e4ed22d 100644 --- a/.github/workflows/verl_unit_test.yml +++ b/.github/workflows/verl_unit_test.yml @@ -48,4 +48,4 @@ jobs: - name: Running utils tests run: | cd tests/utils - pytest -s -x --ignore=dataset/ . + pytest -s -x --ignore=dataset/ --ignore=checkpoint/ . From fbf7524cdbd1dc79bd167a8f77bd461ad7825b4e Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Sun, 4 May 2025 18:31:34 -0700 Subject: [PATCH 5/7] fix tests --- tests/e2e/run_dapo.sh | 2 +- tests/workers/rollout/test_vllm_multi_turn.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/e2e/run_dapo.sh b/tests/e2e/run_dapo.sh index 34f520ab0c5..ef748dd92fb 100644 --- a/tests/e2e/run_dapo.sh +++ b/tests/e2e/run_dapo.sh @@ -41,7 +41,7 @@ gen_prompt_bsz=$((train_prompt_bsz * 4)) exp_name="$(basename "${MODEL_ID,,}")-dapo-minimal" -python3 -m recipe.dapo.src.main_dapo \ +python3 -m recipe.dapo.main_dapo \ data.train_files="${HOME}/data/gsm8k/train.parquet" \ data.val_files="${HOME}/data/gsm8k/test.parquet" \ reward_model.reward_manager=dapo \ diff --git a/tests/workers/rollout/test_vllm_multi_turn.py b/tests/workers/rollout/test_vllm_multi_turn.py index ea683b83024..f691cfe9dd4 100644 --- a/tests/workers/rollout/test_vllm_multi_turn.py +++ b/tests/workers/rollout/test_vllm_multi_turn.py @@ -21,7 +21,7 @@ from openai.types.chat.chat_completion import ChatCompletion from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ChatCompletionResponse, ChatCompletionStreamResponse, ErrorResponse -from tests.rollout.async_rollout_utils import init_async_rollout_manager +from tests.workers.rollout.async_rollout_utils import init_async_rollout_manager from verl.protocol import DataProto From b1a51534e7781f96a60ab67ca954ae6080b05182 Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Fri, 9 May 2025 11:19:24 -0700 Subject: [PATCH 6/7] fix lint --- tests/utility/test_timeout_decorator.py | 11 ++++----- tests/utils/test_flops_counter.py | 5 ++-- verl/single_controller/base/worker.py | 1 + verl/utils/megatron_utils.py | 4 ++-- verl/utils/reward_score/math_dapo.py | 2 +- .../utils/reward_score/prime_math/__init__.py | 3 ++- verl/utils/reward_score/prime_math/grader.py | 1 - verl/workers/megatron_workers.py | 1 + .../sharding_manager/megatron_sglang.py | 23 ++++--------------- 9 files changed, 18 insertions(+), 33 deletions(-) diff --git a/tests/utility/test_timeout_decorator.py b/tests/utility/test_timeout_decorator.py index e9f78a7c6c9..57b563bce69 100644 --- a/tests/utility/test_timeout_decorator.py +++ b/tests/utility/test_timeout_decorator.py @@ -12,17 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import time -import os -import sys import multiprocessing -import queue -import pytest # Import pytest -from functools import wraps +import sys +import threading +import time +import pytest # Import pytest from verl.utils.py_functional import timeout_limit as timeout -import threading # --- Test Task Functions --- TEST_TIMEOUT_SECONDS = 1.5 # Timeout duration for tests diff --git a/tests/utils/test_flops_counter.py b/tests/utils/test_flops_counter.py index c0420989783..c8d3589e9d3 100644 --- a/tests/utils/test_flops_counter.py +++ b/tests/utils/test_flops_counter.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest import math -import json + +import pytest + from verl.utils.flops_counter import FlopsCounter VALID_CONFIG_TYPE = {"llama", "qwen2", "qwen3", "qwen3_moe", "deepseek_v3"} diff --git a/verl/single_controller/base/worker.py b/verl/single_controller/base/worker.py index 8acb4b6a040..7e7a3f2d9ae 100644 --- a/verl/single_controller/base/worker.py +++ b/verl/single_controller/base/worker.py @@ -136,6 +136,7 @@ def _configure_before_init(self, register_center_name: str, rank: int): def __init__(self, cuda_visible_devices=None) -> None: # construct a meta from environment variable. Note that the import must be inside the class because it is executed remotely import os + import torch from packaging import version diff --git a/verl/utils/megatron_utils.py b/verl/utils/megatron_utils.py index 60b220b52c0..a16b32ca4d3 100644 --- a/verl/utils/megatron_utils.py +++ b/verl/utils/megatron_utils.py @@ -31,9 +31,9 @@ from megatron.core.utils import get_attr_wrapped_model from transformers import PretrainedConfig -from verl.utils.torch_dtypes import PrecisionType -from verl.utils.model import normalize_model_name import verl.utils.megatron.tensor_parallel as tp_utils +from verl.utils.model import normalize_model_name +from verl.utils.torch_dtypes import PrecisionType def get_model_config(model): diff --git a/verl/utils/reward_score/math_dapo.py b/verl/utils/reward_score/math_dapo.py index d48f68bf4ec..33a699e5641 100644 --- a/verl/utils/reward_score/math_dapo.py +++ b/verl/utils/reward_score/math_dapo.py @@ -14,9 +14,9 @@ # Adapted from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/utils.py import re -import signal from typing import Optional + def last_boxed_only_string(string: str) -> Optional[str]: """Extract the last LaTeX boxed expression from a string. diff --git a/verl/utils/reward_score/prime_math/__init__.py b/verl/utils/reward_score/prime_math/__init__.py index b29a6dc971c..f24c78b917a 100644 --- a/verl/utils/reward_score/prime_math/__init__.py +++ b/verl/utils/reward_score/prime_math/__init__.py @@ -28,9 +28,10 @@ from pylatexenc import latex2text from sympy.parsing import sympy_parser +from verl.utils.py_functional import timeout_limit + from . import math_normalize from .grader import math_equal -from verl.utils.py_functional import timeout_limit # import math_normalize # from grader import math_equal diff --git a/verl/utils/reward_score/prime_math/grader.py b/verl/utils/reward_score/prime_math/grader.py index 55048ac8dbe..e2d5fe4862c 100644 --- a/verl/utils/reward_score/prime_math/grader.py +++ b/verl/utils/reward_score/prime_math/grader.py @@ -95,7 +95,6 @@ import contextlib import math import re -import signal from math import isclose from typing import Union diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index 3cbcacc6c76..2828448cf18 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -262,6 +262,7 @@ def _build_rollout(self, trust_remote_code=False): log_gpu_memory_usage("After building sharding manager", logger=logger) elif self.config.rollout.name == 'sglang': from verl.workers.rollout.sglang_rollout import SGLangRollout + # NOTE(linjunrong): Due to recent fp8 support in SGLang. Now importing any symbol relate to SGLang's model_runner would check CUDA device capability. # However, due to veRL's setting, the main process of ray can not find any CUDA device, which would potentially lead to: # "RuntimeError: No CUDA GPUs are available". diff --git a/verl/workers/sharding_manager/megatron_sglang.py b/verl/workers/sharding_manager/megatron_sglang.py index 5d4167916ff..817867a5a49 100644 --- a/verl/workers/sharding_manager/megatron_sglang.py +++ b/verl/workers/sharding_manager/megatron_sglang.py @@ -15,19 +15,13 @@ This file contains a Megatron style Hybrid Engine that shares the weights of the actor with the inference engine. """ -import importlib import logging import os + import torch -import torch.distributed as dist from torch import nn -from verl.utils.model import normalize_model_name -from verl.utils.megatron_utils import broadcast_from_megatron_pp, broadcast_str_from_megatron_pp - -from verl.utils.megatron_utils import get_model, unwrap_model from verl.utils.debug import log_gpu_memory_usage -from verl.utils.megatron_utils import convert_megatron_model_to_transformers_model logger = logging.getLogger(__file__) logger.setLevel(os.getenv('VERL_PPO_LOGGING_LEVEL', 'WARN')) @@ -40,23 +34,14 @@ - After inference, all the parameters that doesn't belong to this pp rank is freed. """ -from .base import BaseShardingManager - -import torch -from torch import nn import torch.distributed +from sglang.srt.entrypoints.verl_engine import VerlEngine from torch.distributed import new_group -from torch.distributed._tensor import DTensor -from typing import Dict, Iterable, Union, Tuple -from verl import DataProto -from verl.protocol import all_gather_data_proto -from verl.utils.torch_functional import (broadcast_dict_tensor, allgather_dict_tensors) -from sglang.srt.entrypoints.verl_engine import VerlEngine from verl.utils.debug import GPUMemoryLogger +from verl.utils.megatron_utils import per_tensor_generator -import verl.utils.megatron.tensor_parallel as tp_utils -from verl.utils.megatron_utils import per_tensor_generator, default_tp_concat_fn +from .base import BaseShardingManager _MICRO_DATA_PARALLEL_GROUP = None From 537de291ab70dc7be9e2492b8551c87dc1b0bf9a Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Fri, 9 May 2025 13:17:03 -0700 Subject: [PATCH 7/7] fix tests --- .github/workflows/model.yml | 2 +- .github/workflows/verl_unit_test.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml index 15eb79e2083..6193321a3e7 100644 --- a/.github/workflows/model.yml +++ b/.github/workflows/model.yml @@ -106,4 +106,4 @@ jobs: - name: Running FSDP2 rmpad model tests on 8 L20 GPUs + latest flash_attn run: | pip3 install --upgrade flash_attn --no-build-isolation - STRATEGY=fsdp2 torchrun --nproc_per_node=8 tests/checkpoint/test_fsdp_ckpt.py + STRATEGY=fsdp2 torchrun --nproc_per_node=8 tests/utils/checkpoint/test_fsdp_ckpt.py diff --git a/.github/workflows/verl_unit_test.yml b/.github/workflows/verl_unit_test.yml index a84a52f42e4..1d4028fb3b4 100644 --- a/.github/workflows/verl_unit_test.yml +++ b/.github/workflows/verl_unit_test.yml @@ -45,7 +45,7 @@ jobs: run: | cd tests pytest -s -x test_protocol.py - - name: Running utils tests + - name: running utils tests run: | cd tests/utils - pytest -s -x --ignore=dataset/ --ignore=checkpoint/ . + pytest -s -x --ignore=dataset/ --ignore=checkpoint/ --ignore=test_flops_counter.py .