diff --git a/.github/workflows/.deprecate/e2e_ppo_trainer.yml b/.github/workflows/.deprecate/e2e_ppo_trainer.yml index 00ecd79152b..94ef83db3d3 100644 --- a/.github/workflows/.deprecate/e2e_ppo_trainer.yml +++ b/.github/workflows/.deprecate/e2e_ppo_trainer.yml @@ -77,7 +77,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -110,7 +110,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml b/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml index 177af9ec8b9..0e5f7a487a1 100644 --- a/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml +++ b/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml @@ -75,7 +75,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: diff --git a/.github/workflows/checkpoint_converter.yml b/.github/workflows/checkpoint_converter.yml index 90ad640dff4..65baa00e956 100644 --- a/.github/workflows/checkpoint_converter.yml +++ b/.github/workflows/checkpoint_converter.yml @@ -81,7 +81,7 @@ jobs: NO_PROXY: "localhost,127.0.0.1" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -116,7 +116,7 @@ jobs: HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable HF_ENDPOINT: "https://hf-mirror.com" container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/cpu_unit_tests.yml
b/.github/workflows/cpu_unit_tests.yml index 698816ce11e..95cbc8be370 100644 --- a/.github/workflows/cpu_unit_tests.yml +++ b/.github/workflows/cpu_unit_tests.yml @@ -68,7 +68,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: diff --git a/.github/workflows/e2e_dapo.yml b/.github/workflows/e2e_dapo.yml index 60119e8436f..b4e28286069 100644 --- a/.github/workflows/e2e_dapo.yml +++ b/.github/workflows/e2e_dapo.yml @@ -94,7 +94,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/e2e_eval_aime24.yml b/.github/workflows/e2e_eval_aime24.yml index f5718603b9e..b0da8f2acc2 100644 --- a/.github/workflows/e2e_eval_aime24.yml +++ b/.github/workflows/e2e_eval_aime24.yml @@ -88,7 +88,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: diff --git a/.github/workflows/e2e_genrm_remote.yml b/.github/workflows/e2e_genrm_remote.yml index 8c7bc690718..6574ef3b61e 100644 --- a/.github/workflows/e2e_genrm_remote.yml +++ b/.github/workflows/e2e_genrm_remote.yml @@ -87,7 +87,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/e2e_ppo_trainer.yml b/.github/workflows/e2e_ppo_trainer.yml index f27da026aaf..27fa3ba5448 100644 --- a/.github/workflows/e2e_ppo_trainer.yml +++ b/.github/workflows/e2e_ppo_trainer.yml @@ -87,7 +87,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -229,7 +229,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 options: --gpus all --shm-size=50g # Visual dataloader requires large memory steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -238,11 +238,10 @@ jobs: - name: Install the current repository run: | pip3 install --no-deps -e .[test,gpu,vllm,geo,trl] - pip install "transformers[hf_xet]<4.53.0" # Fix for transformers 4.53.0 + pip install "transformers[hf_xet]==4.54.0" 
# Geo3k - name: Prepare GEO3K dataset run: | - ray stop --force python3 examples/data_preprocess/geo3k.py - name: Running GEO3K VLM GRPO E2E training tests on 8 L20 GPUs with rmpad using function rm run: | @@ -285,7 +284,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -318,7 +317,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=50g # Visual dataloader requires large memory steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -326,7 +325,8 @@ jobs: fetch-depth: 0 - name: Install the current repository run: | - pip3 install -e .[test,geo,gpu,sglang] --no-deps && pip install transformers==4.52.3 + pip3 install -e .[test,geo,gpu,sglang] --no-deps + pip install "transformers[hf_xet]==4.54.0" # Geo3k - name: Prepare GEO3K dataset run: | diff --git a/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml b/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml index 4e635b3351b..f37866274a4 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml @@ -86,7 +86,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: diff --git a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml index ae12c110693..3fa0e51a2d9 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml @@ -85,7 +85,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: @@ -348,7 +348,6 @@ jobs: - name: Install the current repository run: | pip3 install --no-deps -e .[test] - pip3 install "transformers[hf_xet]<4.52.0" - name: Prepare Geo3k dataset run: | python3 examples/data_preprocess/geo3k.py diff --git a/.github/workflows/e2e_spin.yml b/.github/workflows/e2e_spin.yml index cb56fbeac7a..ad6a2bfd748 100644 --- a/.github/workflows/e2e_spin.yml +++ b/.github/workflows/e2e_spin.yml @@ -68,7 +68,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/e2e_sppo.yml 
b/.github/workflows/e2e_sppo.yml index cf85c296012..15edc4f9d00 100644 --- a/.github/workflows/e2e_sppo.yml +++ b/.github/workflows/e2e_sppo.yml @@ -66,7 +66,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/gpu_unit_tests.yml b/.github/workflows/gpu_unit_tests.yml index d86e7e64d86..25018594d48 100644 --- a/.github/workflows/gpu_unit_tests.yml +++ b/.github/workflows/gpu_unit_tests.yml @@ -80,7 +80,7 @@ jobs: NO_PROXY: "localhost,127.0.0.1" HF_HUB_ENABLE_HF_TRANSFER: 1 container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml index d484c2b9d51..280781f8c7c 100644 --- a/.github/workflows/model.yml +++ b/.github/workflows/model.yml @@ -73,7 +73,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.4-sglang0.4.6.post5-vllm0.8.5-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -82,7 +82,7 @@ jobs: - name: Install the current repository and upgrade to latest transformers(4.54.0)/flash_attn, transformers 4.55.0 has strange behavior with model backward run: | pip3 install --no-deps -e .[test] - pip3 install --upgrade transformers==4.54.0 + pip3 install --upgrade transformers - name: Running rmpad model tests on 8 L20 GPUs + flash_attn 2.5.8 run: | pytest -s tests/models/test_transformer.py @@ -95,6 +95,10 @@ jobs: - name: Running transformers ulysses tests on 8 L20 GPUs + latest transformers run: | torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py + - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.54.1 + run: | + pip3 install transformers==4.54.1 + torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.53.2 run: | pip3 install transformers==4.53.2 @@ -119,7 +123,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4 + image: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/sanity.yml b/.github/workflows/sanity.yml index ce759b82664..39eaf0e3156 100644 --- a/.github/workflows/sanity.yml +++ b/.github/workflows/sanity.yml @@ -12,7 +12,7 @@ # - `special_sanity`: a suite of quick sanity tests # - `special_standalone`: a set of test that are designed to run in dedicated environments -# Accelerators for tests +# Accelerators for tests # - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends 
with `on_cpu.py`. # - For test scripts with `on_cpu.py` name suffix would be tested on CPU resources in linux environment. @@ -78,7 +78,7 @@ jobs: pytest -s -x tests/special_sanity - name: Run license test run: | - python3 tests/special_sanity/check_license.py --directory . + python3 tests/special_sanity/check_license.py --directories . - name: Assert naming convention run: | if grep -rIn --exclude-dir=.git --exclude-dir=.github --exclude-dir=venv --exclude-dir=__pycache__ 'veRL' .; then diff --git a/.github/workflows/sgl.yml b/.github/workflows/sgl.yml index 5795f9c3e7c..9eb07e8faf8 100644 --- a/.github/workflows/sgl.yml +++ b/.github/workflows/sgl.yml @@ -90,7 +90,7 @@ jobs: NCCL_SHM_DISABLE: "1" NCCL_P2P_DISABLE: "1" container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index 2998c08f09f..181eb9be74e 100644 --- a/.github/workflows/vllm.yml +++ b/.github/workflows/vllm.yml @@ -84,7 +84,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4b4c7b8435c..bd77c362015 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,6 +32,6 @@ repos: hooks: - id: check-license name: Check license - entry: python3 tests/special_sanity/check_license.py --directory . + entry: python3 tests/special_sanity/check_license.py --directories examples recipe scripts tests verl setup.py language: python pass_filenames: false diff --git a/README.md b/README.md index 423a3ff83e6..2345b46441e 100644 --- a/README.md +++ b/README.md @@ -232,6 +232,7 @@ verl is inspired by the design of Nemo-Aligner, Deepspeed-chat and OpenRLHF. The - [Agent Lightning](https://github.com/microsoft/agent-lightning): A flexible and extensible framework that enables seamless agent optimization for any existing agent framework. ![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/agent-lightning) - [VTool-R1](https://github.com/VTOOL-R1/vtool-r1): VLMs Learn to Think with Images via Reinforcement Learning on Multimodal Tool Use. ![GitHub Repo stars](https://img.shields.io/github/stars/VTOOL-R1/vtool-r1) - [Kimina-Prover-RL](https://github.com/project-numina/kimina-prover-rl/tree/main/recipe/kimina_prover_rl): Training pipeline for formal theorem proving, based on a paradigm inspired by DeepSeek-R1. +- [RL-PLUS](https://github.com/YihongDong/RL-PLUS): Countering Capability Boundary Collapse of LLMs in Reinforcement Learning with Hybrid-policy Optimization. and many more awesome work listed in [recipe](recipe/README.md).
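The `--directory` to `--directories` rename above lets the sanity workflow and the pre-commit hook pass one or several roots in a single invocation. Below is a minimal sketch of that CLI shape, assuming an argparse flag with `nargs="+"`; the internals of `tests/special_sanity/check_license.py` are not part of this diff, so the marker string and traversal are illustrative only.

```python
# Illustrative sketch only; check_license.py itself is not shown in this diff.
import argparse
from pathlib import Path

LICENSE_MARKER = "Apache License"  # assumed header marker, not the real one

def main() -> None:
    parser = argparse.ArgumentParser(description="Check license headers.")
    # nargs="+" accepts several roots at once, matching the new pre-commit
    # entry: --directories examples recipe scripts tests verl setup.py
    parser.add_argument("--directories", nargs="+", type=Path, required=True)
    args = parser.parse_args()

    missing = []
    for root in args.directories:
        # setup.py is passed as a plain file; the other arguments are directories
        files = [root] if root.is_file() else sorted(root.rglob("*.py"))
        for path in files:
            if LICENSE_MARKER not in path.read_text(encoding="utf-8", errors="ignore"):
                missing.append(str(path))
    if missing:
        raise SystemExit("Files missing license header:\n" + "\n".join(missing))

if __name__ == "__main__":
    main()
```

Listing the roots explicitly in the hook entry, as opposed to `--directory .`, limits the check to the project's own sources.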
diff --git a/docker/Dockerfile.sglang b/docker/Dockerfile.sglang index 11ad4a77da6..7ed9c43876c 100644 --- a/docker/Dockerfile.sglang +++ b/docker/Dockerfile.sglang @@ -36,11 +36,11 @@ RUN pip config set global.index-url "${PIP_INDEX}" && \ pip config set global.extra-index-url "${PIP_INDEX}" && \ python -m pip install --upgrade pip -# Install sglang-0.4.6.post5 and torch-memory-saver -RUN pip uninstall -y cuda-python && pip install "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir +# Install sglang-0.4.10.post2 and torch-memory-saver +RUN pip uninstall -y cuda-python && pip install "sglang[all]==0.4.10.post2" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.7/flashinfer-python && pip install torch-memory-saver --no-cache-dir -# Install torch-2.6.0 -RUN pip install --no-cache-dir torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata \ +# Install torch-2.7.1 +RUN pip install --no-cache-dir torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 tensordict torchdata \ transformers>=4.49.0 accelerate datasets peft hf_transfer \ ray[default] codetiming hydra-core pandas pyarrow>=15.0.0 pylatexenc qwen-vl-utils wandb liger-kernel \ pytest pre-commit py-spy pyext diff --git a/docker/README.md b/docker/README.md index 787843ec375..d988b0a2b2d 100644 --- a/docker/README.md +++ b/docker/README.md @@ -14,9 +14,7 @@ The first two types of images are hosted on dockerhub [verlai/verl](https://hub. ## Base Image -The stable base image is ``verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4``. The installed package versions can be found from tags, and the Dockerfile can be found in ``verl[version]-[packages]/Dockerfile.base``. - -The base images for preview are ``verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0`` and ``verlai/verl:base-verl0.5-preview-cu128-cudnn9.8-torch2.7.1-fa2.8.0`` with different CUDA versions. +The stable base image is ``verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4``. The update of base image is not frequent, and the app image can be built on top of it without reinstalling base packages. @@ -25,8 +23,8 @@ The update of base image is not frequent, and the app image can be built on top From this version, we divide images built for vLLM and SGLang as the divergence of dependent packages like FlashInfer. There are 2 types of application images available: -- **vLLM with FSDP and Megatron**: ``verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2`` -- **SGLang with FSDP and Megatron**: ``verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2`` +- **vLLM with FSDP and Megatron**: ``verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2`` +- **SGLang with FSDP and Megatron**: ``verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2`` Docker images with Megatron backends are runnable with large language model like ``Qwen/Qwen3-235B-A22B``, ``deepseek-ai/DeepSeek-V3-0324`` post-training. Refer to the :doc:`Large Language Model Post-Training documentation<../perf/dpsk>` for more details. @@ -54,7 +52,7 @@ docker start verl docker exec -it verl bash ``` -2. If you use the images provided, you only need to install verl itself without dependencies: +2.
If you use the images provided, you only need to install verl itself without dependencies: ```sh # install the nightly version (recommended) diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.12 b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.10.post2.mcore0.12 similarity index 94% rename from docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.12 rename to docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.10.post2.mcore0.12 index 292363f9056..64b0de34b49 100644 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.12 +++ b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.10.post2.mcore0.12 @@ -14,10 +14,10 @@ ENV HF_HUB_ENABLE_HF_TRANSFER="1" # Install FlashInfer Python package RUN pip install --upgrade pip setuptools packaging RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.9rc1 -RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation "sglang[all]==0.4.9.post6" +RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation "sglang[all]==0.4.10.post2" # Fix packages -RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]==4.54.0" accelerate datasets peft hf-transfer \ +RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]==4.54.1" accelerate datasets peft hf-transfer \ "numpy<2.0.0" "pyarrow>=19.0.1" pandas \ ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ pytest py-spy pyext pre-commit ruff diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.8.mcore0.12 b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.8.mcore0.12 deleted file mode 100644 index 0ac7904b7c6..00000000000 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.8.mcore0.12 +++ /dev/null @@ -1,39 +0,0 @@ -# Start from the verl base image -# Dockerfile.base -FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4 - -# Define environments -ENV MAX_JOBS=8 -ENV VLLM_WORKER_MULTIPROC_METHOD=spawn -ENV DEBIAN_FRONTEND=noninteractive -ENV NODE_OPTIONS="" -ENV PIP_ROOT_USER_ACTION=ignore -ENV HF_HUB_ENABLE_HF_TRANSFER="1" - -# Install sglang-0.4.8 and torch-memory-saver -# Install FlashInfer Python package -RUN pip install --upgrade pip setuptools packaging -RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.6.post1 -RUN pip install --resume-retries 999 --no-cache-dir "sglang[all]==0.4.8" && pip install torch-memory-saver --no-cache-dir - -# Fix packages -RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.52.3" accelerate datasets peft hf-transfer \ - "numpy<2.0.0" "pyarrow>=19.0.1" pandas \ - ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ - pytest py-spy pyext pre-commit ruff - -RUN pip uninstall -y pynvml nvidia-ml-py && \ - pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1" - -RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87 - -# Install TransformerEngine -RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1 - -# Install Megatron-LM -RUN pip3 
install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2 - -# Install mbridge -RUN pip3 install --no-cache-dir mbridge - -RUN pip3 install --no-deps --no-cache-dir --no-build-isolation --resume-retries 999 vllm==0.9.2 \ No newline at end of file diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.mcore0.12 b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.13 similarity index 83% rename from docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.mcore0.12 rename to docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.13 index 3f7bff3de1f..d79201a92ee 100644 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.mcore0.12 +++ b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.13 @@ -10,14 +10,14 @@ ENV NODE_OPTIONS="" ENV PIP_ROOT_USER_ACTION=ignore ENV HF_HUB_ENABLE_HF_TRANSFER="1" -# Install sglang-0.4.8 and torch-memory-saver +# Install sglang-0.4.9.post6 # Install FlashInfer Python package RUN pip install --upgrade pip setuptools packaging RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.9rc1 -RUN pip install --resume-retries 999 --no-cache-dir "sglang[all]==0.4.9.post4" && pip install torch-memory-saver --no-cache-dir +RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation "sglang[all]==0.4.9.post6" # Fix packages -RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]==4.53.2" accelerate datasets peft hf-transfer \ +RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]==4.55.4" accelerate datasets peft hf-transfer \ "numpy<2.0.0" "pyarrow>=19.0.1" pandas \ ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ pytest py-spy pyext pre-commit ruff @@ -31,7 +31,7 @@ RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87 RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1 # Install Megatron-LM -RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2 +RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.13.0 # Install mbridge RUN pip3 install --no-cache-dir mbridge \ No newline at end of file diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.12 b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.13 similarity index 79% rename from docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.12 rename to docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.13 index 9746301b471..9d73e0ffeeb 100644 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.12 +++ b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.13 @@ -1,6 +1,6 @@ # Start from the verl base image # Dockerfile.base -FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4 +FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4 # Define environments ENV MAX_JOBS=32 @@ -10,11 +10,12 @@ ENV NODE_OPTIONS="" ENV PIP_ROOT_USER_ACTION=ignore ENV HF_HUB_ENABLE_HF_TRANSFER="1" -# Install torch-2.7.0+cu126 + vllm-0.9.1 -RUN pip install --resume-retries 999 --no-cache-dir vllm==0.9.1 +# Install torch-2.7.1+cu126
+ vllm-0.10.0 +RUN pip install --resume-retries 999 --no-cache-dir vllm==0.10.0 # Fix packages -RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \ +# transformers 4.54.0 is still not supported; require transformers>=4.55.4 +RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.55.4" accelerate datasets peft hf-transfer \ "numpy<2.0.0" "pyarrow>=19.0.1" pandas \ ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ pytest py-spy pyext pre-commit ruff @@ -28,7 +29,7 @@ RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87 RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1 # Install Megatron-LM -RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2 +RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.13.0 # Install mbridge RUN pip3 install --no-cache-dir mbridge diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.base.torch2.7.0 b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.base.torch2.7.0 deleted file mode 100644 index 30251f578e9..00000000000 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.base.torch2.7.0 +++ /dev/null @@ -1,133 +0,0 @@ -# Base Docker Image of verl, with CUDA/Torch/FlashAttn/Apex/TransformerEngine, without other frameworks -# Target: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0-fi0.2.6 -# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10) -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html -FROM nvcr.io/nvidia/pytorch:24.08-py3 - -# Define environments -ENV MAX_JOBS=16 -ENV VLLM_WORKER_MULTIPROC_METHOD=spawn -ENV DEBIAN_FRONTEND=noninteractive -ENV NODE_OPTIONS="" -ENV PIP_ROOT_USER_ACTION=ignore -ENV HF_HUB_ENABLE_HF_TRANSFER="1" - -# Define installation arguments -ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ -ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - -# Set apt source -RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \ - { \ - echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \ - echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \ - echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \ - echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \ - } > /etc/apt/sources.list - -# Install systemctl -RUN apt-get update && \ - apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \ - apt-get clean - -# Install tini -RUN apt-get update && \ - apt-get install -y tini aria2 libfreeimage3 libfreeimage-dev zlib1g htop && \ - apt-get clean - -# Change pip source -RUN pip config set global.index-url "${PIP_INDEX}" && \ - pip config set global.extra-index-url "${PIP_INDEX}" && \ - python -m pip install --upgrade pip - -# Uninstall nv-pytorch fork -RUN pip uninstall -y torch torchvision torchaudio \ - pytorch-quantization pytorch-triton torch-tensorrt \ - xgboost transformer_engine flash_attn apex megatron-core grpcio - -RUN pip install --resume-retries 999 --no-cache-dir torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 - -# Install flash-attn-2.7.4.post1, although built with torch2.6, it is compatible with torch2.7 -#
https://github.com/Dao-AILab/flash-attention/issues/1644#issuecomment-2899396361 -RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \ - URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \ - FILE="flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \ - wget -nv "${URL}" && \ - pip install --no-cache-dir "${FILE}" - -# Fix packages -RUN pip uninstall -y pynvml nvidia-ml-py && \ - pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1" - -# Install cudnn -RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \ - dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \ - cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \ - apt-get update && \ - apt-get -y install cudnn-cuda-12 && \ - rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb - -# Install Apex -RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --resume-retries 999 git+https://github.com/NVIDIA/apex.git - -# Profiling tools -RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \ - apt-get update && apt-get install -y libxcb-cursor0 - -RUN apt-get install -y ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \ - rm -rf /usr/local/cuda/bin/nsys && \ - ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \ - rm -rf /usr/local/cuda/bin/nsys-ui && \ - ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \ - rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb - -RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \ - "numpy<2.0.0" "pyarrow>=19.0.1" pandas cuda-bindings \ - ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ - pytest py-spy pyext pre-commit ruff - -# Install DeepEP -## the dependency of IBGDA -RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so - -## Clone and build deepep and deepep-nvshmem -RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \ - git clone https://github.com/deepseek-ai/DeepEP.git && \ - cd DeepEP && git checkout a84a248 - -# Prepare nvshmem -RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \ - tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \ - cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch - -ENV CUDA_HOME=/usr/local/cuda -### Set MPI environment variables. Having errors when not set. 
-ENV CPATH=/usr/local/mpi/include:$CPATH -ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH -ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH -ENV GDRCOPY_HOME=/workspace/gdrcopy - -## Build deepep-nvshmem -RUN cd deepep-nvshmem && \ - NVSHMEM_SHMEM_SUPPORT=0 \ - NVSHMEM_UCX_SUPPORT=0 \ - NVSHMEM_USE_NCCL=0 \ - NVSHMEM_MPI_SUPPORT=0 \ - NVSHMEM_IBGDA_SUPPORT=1 \ - NVSHMEM_PMIX_SUPPORT=0 \ - NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ - NVSHMEM_USE_GDRCOPY=1 \ - cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install - -ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install -ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH -ENV PATH=$NVSHMEM_DIR/bin:$PATH - -## Build deepep -RUN cd DeepEP && \ - python setup.py install - -# Reset pip config -RUN pip config unset global.index-url && \ - pip config unset global.extra-index-url - diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/README.md b/docker/verl0.5-cu126-torch2.7-fa2.7.4/README.md index 2db8c58d51d..023e0eec0fa 100644 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/README.md +++ b/docker/verl0.5-cu126-torch2.7-fa2.7.4/README.md @@ -20,9 +20,7 @@ megatron.core==core_r0.13.0 ## Target - Base image: - - `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4`: We offer a base image with deep ep built in, for vllm - - `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4`: We offer a base image with deep ep built in, for sglang + - `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4`: We offer a base image with deep ep built in, for vllm/sglang - App image: - - `verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2` - - `verlai/verl:app-verl0.5-sglang0.4.8-mcore0.12.2-te2.2` - - `verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2` \ No newline at end of file + - `verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2` + - `verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2` diff --git a/docs/ascend_tutorial/ascend_quick_start.rst b/docs/ascend_tutorial/ascend_quick_start.rst index 390c864d899..90bf0aebaab 100644 --- a/docs/ascend_tutorial/ascend_quick_start.rst +++ b/docs/ascend_tutorial/ascend_quick_start.rst @@ -187,6 +187,8 @@ vllm & vllm-ascend +-----------+-------------------------+-------------+-------------------+-------------------+-------------------+--------------------------+ | DAPO | Qwen3-14B-base | 5.9% | pending | FSDP | vllm-ascend | Atlas 200T A2 Box16 | +-----------+-------------------------+-------------+-------------------+-------------------+-------------------+--------------------------+ +| DAPO | Qwen3-30B-base | 1.08% | pending | FSDP | vllm-ascend | Atlas 200T A2 Box16 | ++-----------+-------------------------+-------------+-------------------+-------------------+-------------------+--------------------------+ **表2** SFT类算法 diff --git a/docs/perf/nsight_profiling.md b/docs/perf/nsight_profiling.md index 452d5533232..490de5e7e4f 100644 --- a/docs/perf/nsight_profiling.md +++ b/docs/perf/nsight_profiling.md @@ -60,16 +60,16 @@ To enable profiling for specific components and steps, modify your ppo_trainer.y discrete: False actor_rollout_ref: actor: - profile: + profiler: enable: True all_ranks: True # rollout & ref follow actor settings critic: - profile: + profiler: enable: True all_ranks: True reward_model: - profile: + profiler: enable: True all_ranks: True ``` diff --git a/docs/start/install.rst b/docs/start/install.rst index a384a4dc3cf..44ab7b46cd3 100644 --- 
a/docs/start/install.rst +++ b/docs/start/install.rst @@ -52,7 +52,7 @@ The first two types of images are hosted on dockerhub `verlai/verl ` for more details. @@ -77,7 +77,7 @@ Community images are provided by the community, including the latest versions of For latest vLLM with FSDP, please refer to `hiyouga/verl `_ repository and the latest version is ``hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0``. -For latest SGLang with FSDP, please refer to `hebiaobuaa/verl `_ repository and the latest version is ``hebiaobuaa/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2`` which is provided by SGLang RL Group. +For latest SGLang with FSDP, please refer to `hebiaobuaa/verl `_ repository and the latest version is ``hebiaobuaa/verl:app-verl0.5-sglang0.4.10.post2-mcore0.12.2-te2.2`` which is provided by SGLang RL Group. See files under ``docker/`` for NGC-based image or if you want to build your own. diff --git a/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh b/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh index b876b19ba57..2f5a93e4466 100644 --- a/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh +++ b/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh @@ -6,7 +6,7 @@ set -x # huggingface-cli download deepseek-ai/DeepSeek-V3-0324 # no offline dist checkpoint needed, now with mbridge>=0.13.0, we can directly init model from huggingface downloaded fp8 weights -# tested on docker://verlai/verl:app-verl0.5-vllm0.10.0-mcore0.13.0-te2.2 +# tested on docker://verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 LLM="" diff --git a/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh b/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh index 70ea42f2da0..27ab478da28 100644 --- a/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh +++ b/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh @@ -48,6 +48,13 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.rollout.enable_chunked_prefill=False \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.ref.profiler.enable=True \ + actor_rollout_ref.ref.profiler.ranks=$PROFILE_RANKS \ + actor_rollout_ref.ref.profiler.all_ranks=$PROFILE_RANKS_ALL \ + actor_rollout_ref.ref.profiler.tool_config.npu.discrete=$DISCRETE \ + actor_rollout_ref.ref.profiler.tool_config.npu.contents=$CONTENTS \ + actor_rollout_ref.ref.profiler.tool_config.npu.level=$LEVEL \ + actor_rollout_ref.ref.profiler.tool_config.npu.analysis=$ANALYSIS \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger=console \ diff --git a/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh b/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh index a9fff3437e3..1ac6dfe9445 100644 --- a/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh +++ b/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh @@ -46,6 +46,12 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.rollout.enable_chunked_prefill=False \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.ref.profiler.enable=True \ + actor_rollout_ref.ref.profiler.all_ranks=$PROFILE_RANKS_ALL \ + actor_rollout_ref.ref.profiler.tool_config.npu.discrete=$DISCRETE \ + actor_rollout_ref.ref.profiler.tool_config.npu.contents=$CONTENTS \ + actor_rollout_ref.ref.profiler.tool_config.npu.level=$LEVEL \ + 
actor_rollout_ref.ref.profiler.tool_config.npu.analysis=$ANALYSIS \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger=console \ diff --git a/examples/grpo_trainer/run_qwen2_5_vl-7b-sglang.sh b/examples/grpo_trainer/run_qwen2_5_vl-7b-sglang.sh index 985bc59c04d..86267a5602a 100644 --- a/examples/grpo_trainer/run_qwen2_5_vl-7b-sglang.sh +++ b/examples/grpo_trainer/run_qwen2_5_vl-7b-sglang.sh @@ -40,6 +40,7 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.rollout.n=5 \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.rollout.mode=sync \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh b/examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh index 7bf06e3ada5..6937db5fcfa 100644 --- a/examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh +++ b/examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh @@ -168,7 +168,7 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat actor_rollout_ref.rollout.free_cache_engine=True \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${infer_ppo_micro_batch_size_per_gpu} \ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ - actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \ + actor_rollout_ref.ref.megatron.use_dist_checkpointing=${USE_DIST_CKPT} \ actor_rollout_ref.ref.megatron.param_offload=${offload} \ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${REF_TP} \ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${REF_PP} \ @@ -192,4 +192,4 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat trainer.save_freq=100 \ trainer.total_epochs=10 \ trainer.resume_mode=auto \ - trainer.log_val_generations=10 \ No newline at end of file + trainer.log_val_generations=10 diff --git a/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh b/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh index 9038813c864..3c3dd6a4515 100644 --- a/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh +++ b/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn.sh @@ -48,7 +48,8 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.rollout.n=16 \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ - actor_rollout_ref.rollout.over_sample_rate=0 \ + actor_rollout_ref.rollout.over_sample_rate=0.1 \ + actor_rollout_ref.rollout.mode=sync \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/sglang_multiturn/run_qwen3-4b_gsm8k_multiturn.sh b/examples/sglang_multiturn/run_qwen3-4b_gsm8k_multiturn.sh index 56228f4b55e..6f1f99e4bd2 100755 --- a/examples/sglang_multiturn/run_qwen3-4b_gsm8k_multiturn.sh +++ b/examples/sglang_multiturn/run_qwen3-4b_gsm8k_multiturn.sh @@ -35,6 +35,8 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.rollout.name=sglang \ actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ actor_rollout_ref.rollout.n=16 \ + actor_rollout_ref.rollout.over_sample_rate=0.1 \ + actor_rollout_ref.rollout.mode=sync \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ actor_rollout_ref.ref.fsdp_config.param_offload=True \ algorithm.use_kl_in_reward=False \ diff --git a/examples/sglang_multiturn/run_qwen3_4b_dapo_multiturn.sh 
b/examples/sglang_multiturn/run_qwen3_4b_dapo_multiturn.sh index 53f856cca27..39948693264 100644 --- a/examples/sglang_multiturn/run_qwen3_4b_dapo_multiturn.sh +++ b/examples/sglang_multiturn/run_qwen3_4b_dapo_multiturn.sh @@ -17,10 +17,30 @@ hf download \ --repo-type dataset \ --local-dir $HOME/data/Maxwell-Jia/AIME_2024 -# Note that this script is using AgentLoop instead of SGLang Multi-Turn -# We are concerned that the reward is not actually converge, since the -# reward of retool is encouraging the model to generate more turns to -# call more tools. The answers are not actually correct. + +# Note: +# 1. +# A sandbox fusion server is needed to run the code interpreter tool. +# docker run -it -p 8080:8080 volcengine/sandbox-fusion:server-20250609 + +# 2. +# The model located at font-info/qwen3-4b-sft-SGLang-RL (https://huggingface.co/font-info/qwen3-4b-sft-SGLang-RL) +# is a fine-tuned version provided by the SGLang RL team. Without supervised fine-tuning (SFT) +# on the ReTool dataset, DAPO training will not converge. + +# If you still wish to perform SFT from scratch, follow the steps below: + +# Step 1: Download the SFT dataset +#huggingface-cli download JoeYing/ReTool-SFT --repo-type dataset --local-dir ./ReTool-SFT + +# Step 2: Preprocess the data for SFT +#python3 recipe/retool/retool_sft_preprocess.py + +# Step 3: Run SFT training +#bash recipe/retool/run_qwen2-32b_sft.sh + +# Having trouble with setup? See https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/verl/multi-turn/release_log/latest_sglang.md for more details. + python3 -m verl.trainer.main_ppo \ algorithm.adv_estimator=grpo \ @@ -38,7 +58,7 @@ python3 -m verl.trainer.main_ppo \ data.custom_cls.name=CustomRLHFDataset \ custom_reward_function.path=$PROJECT_DIR/recipe/retool/retool.py \ custom_reward_function.name=compute_score \ - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 \ + actor_rollout_ref.model.path=font-info/qwen3-4b-sft-SGLang-RL \ actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.model.enable_gradient_checkpointing=True \ actor_rollout_ref.actor.use_kl_loss=False \ @@ -47,16 +67,17 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.clip_ratio_high=0.28 \ actor_rollout_ref.actor.clip_ratio_c=10.0 \ actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.use_dynamic_bsz=False \ actor_rollout_ref.actor.ppo_mini_batch_size=32 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=32768 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ actor_rollout_ref.rollout.name=sglang \ actor_rollout_ref.rollout.mode=async \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \ actor_rollout_ref.rollout.update_weights_bucket_megabytes=512 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \ actor_rollout_ref.rollout.multi_stage_wake_up=True \ actor_rollout_ref.rollout.multi_turn.enable=True \ actor_rollout_ref.rollout.multi_turn.max_user_turns=16 \ @@ -69,7 +90,7 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.rollout.val_kwargs.n=30 \ trainer.logger=['console','wandb'] \ trainer.project_name=sglang-dapo-multiturn \ - trainer.experiment_name=qwen3-4b_dapo_multiturn \ + trainer.experiment_name=qwen3_4b_sft_dapo_multiturn \
trainer.n_gpus_per_node=8 \ trainer.log_val_generations=20 \ trainer.val_before_train=True \ diff --git a/recipe/dapo/run_dapo_qwen3_moe_30b_base_npu_fsdp.sh b/recipe/dapo/run_dapo_qwen3_moe_30b_base_npu_fsdp.sh new file mode 100644 index 00000000000..36cf175a18f --- /dev/null +++ b/recipe/dapo/run_dapo_qwen3_moe_30b_base_npu_fsdp.sh @@ -0,0 +1,146 @@ +#!/usr/bin/env bash +set -euxo pipefail + +project_name='DAPO' +exp_name='DAPO-Qwen3-MOE-30B-FSDP-128rank-gbs512' + +NNODES=8 +NPUS_PER_NODE=16 + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 20)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 +loss_agg_mode="token-mean" +ppo_mini_batch_size=32 + +enable_filter_groups=True +filter_groups_metric=acc +max_num_gen_batches=10 +train_prompt_bsz=512 +gen_prompt_bsz=$((train_prompt_bsz * 3)) +n_resp_per_prompt=16 + +RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +WORKING_DIR=${WORKING_DIR:-"${PWD}"} +RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} + +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-30B-A3B-Base"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"} + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +sp_size=16 # For load-balance. For smaller clusters this can be set to as little as 2. +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) / 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / 2)) +offload=True +recompute=True +max_num_seqs=128 +gen_tp=2 +gen_world_size=$((NNODES * NPUS_PER_NODE)) # NNODES * NPUS_PER_NODE + + +ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ + -- python3 -m recipe.dapo.main_dapo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.train_batch_size=${train_prompt_bsz} \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + actor_rollout_ref.rollout.max_num_seqs=${max_num_seqs} \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + algorithm.filter_groups.enable=${enable_filter_groups} \ + algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches} \ + algorithm.filter_groups.metric=${filter_groups_metric} \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ +
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + +actor_rollout_ref.model.override_config.attention_dropout=0. \ + +actor_rollout_ref.model.override_config.embd_pdrop=0. \ + +actor_rollout_ref.model.override_config.resid_pdrop=0. \ + actor_rollout_ref.model.enable_gradient_checkpointing=${recompute} \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${ppo_mini_batch_size} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \ + actor_rollout_ref.actor.fsdp_config.forward_prefetch=False \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + +actor_rollout_ref.rollout.rollout_world_size=${gen_world_size} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \ + actor_rollout_ref.ref.fsdp_config.forward_prefetch=False \ + actor_rollout_ref.rollout.enforce_eager=False \ + actor_rollout_ref.rollout.free_cache_engine=True \ + reward_model.reward_manager=dapo \ + reward_model.overlong_buffer.enable=${enable_overlong_buffer} \ + reward_model.overlong_buffer.len=${overlong_buffer_len} \ + reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} \ + trainer.logger=['console','wandb'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node="${NPUS_PER_NODE}" \ + trainer.nnodes="${NNODES}" \ + trainer.val_before_train=False \ + trainer.test_freq=5 \ + trainer.save_freq=-1 \ + trainer.total_epochs=1 \ + trainer.device="npu" \ + actor_rollout_ref.actor.use_torch_compile=False \ + actor_rollout_ref.ref.use_torch_compile=False + diff --git a/recipe/one_step_off_policy/main_ppo.py b/recipe/one_step_off_policy/main_ppo.py index ea869b5489f..344fe4b9f0c 100644 --- a/recipe/one_step_off_policy/main_ppo.py +++ b/recipe/one_step_off_policy/main_ppo.py @@ -23,9 +23,12 @@ import ray from omegaconf import OmegaConf +from recipe.one_step_off_policy.utils import need_critic from verl.trainer.constants_ppo import get_ppo_ray_runtime_env from verl.trainer.main_ppo import create_rl_dataset, create_rl_sampler from verl.trainer.ppo.reward import load_reward_manager +from verl.trainer.ppo.utils import 
need_reference_policy +from verl.utils.config import validate_config from .ray_trainer import OneStepOffRayTrainer @@ -87,20 +90,6 @@ def run(self, config): OmegaConf.resolve(config) - # Download the checkpoint from HDFS to the local machine. - # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on - local_path = copy_to_local( - config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False) - ) - - # Instantiate the tokenizer and processor. - from verl.utils import hf_processor, hf_tokenizer - - trust_remote_code = config.data.get("trust_remote_code", False) - tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) - # Used for multimodal LLM, could be None - processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True) - # Define worker classes based on the actor strategy. if config.actor_rollout_ref.actor.strategy == "fsdp2": assert config.actor_rollout_ref.actor.strategy == config.critic.strategy @@ -190,6 +179,27 @@ def run(self, config): role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker) mapping[Role.RefPolicy] = global_pool_id + # validate config + validate_config( + config=config, + use_reference_policy=need_reference_policy(role_worker_mapping), + use_critic=need_critic(config), + ) + + # Download the checkpoint from HDFS to the local machine. + # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on + local_path = copy_to_local( + config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False) + ) + + # Instantiate the tokenizer and processor. + from verl.utils import hf_processor, hf_tokenizer + + trust_remote_code = config.data.get("trust_remote_code", False) + tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) + # Used for multimodal LLM, could be None + processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True) + # Load the reward manager for training and validation. 
reward_fn = load_reward_manager( config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {}) diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py index 70399f82f75..cf989d315d3 100644 --- a/recipe/one_step_off_policy/ray_trainer.py +++ b/recipe/one_step_off_policy/ray_trainer.py @@ -28,11 +28,12 @@ from torch.utils.data import Dataset, Sampler from tqdm import tqdm +from recipe.one_step_off_policy.utils import need_critic from verl import DataProto from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.single_controller.ray.base import create_colocated_worker_cls from verl.trainer.ppo import core_algos -from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss +from verl.trainer.ppo.core_algos import agg_loss from verl.trainer.ppo.metric_utils import ( compute_data_metrics, compute_throughout_metrics, @@ -41,13 +42,12 @@ from verl.trainer.ppo.ray_trainer import ( RayPPOTrainer, ResourcePoolManager, - Role, - WorkerType, apply_kl_penalty, compute_advantage, compute_response_mask, ) from verl.trainer.ppo.reward import compute_reward, compute_reward_async +from verl.trainer.ppo.utils import Role, WorkerType, need_reference_policy, need_reward_model from verl.utils.debug import marked_timer from verl.utils.metric import ( reduce_metrics, @@ -140,8 +140,9 @@ def __init__( self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager - self.use_reference_policy = Role.RefPolicy in role_worker_mapping - self.use_rm = Role.RewardModel in role_worker_mapping + self.use_reference_policy = need_reference_policy(self.role_worker_mapping) + self.use_rm = need_reward_model(self.role_worker_mapping) + self.use_critic = need_critic(config) self.ray_worker_group_cls = ray_worker_group_cls self.device_name = device_name self.validation_generations_logger = ValidationGenerationsLogger() @@ -154,23 +155,6 @@ def __init__( if config.algorithm.use_kl_in_reward: self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl) - if self.config.algorithm.adv_estimator == AdvantageEstimator.GAE: - self.use_critic = True - elif self.config.algorithm.adv_estimator in [ - AdvantageEstimator.GRPO, - AdvantageEstimator.GRPO_PASSK, - AdvantageEstimator.REINFORCE_PLUS_PLUS, - # AdvantageEstimator.REMAX, # TODO:REMAX advantage estimator is not yet supported in one_step_off_policy - AdvantageEstimator.RLOO, - AdvantageEstimator.OPO, - AdvantageEstimator.REINFORCE_PLUS_PLUS_BASELINE, - AdvantageEstimator.GPG, - ]: - self.use_critic = False - else: - raise NotImplementedError - - self._validate_config() self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) def _validate(self): diff --git a/recipe/one_step_off_policy/utils.py b/recipe/one_step_off_policy/utils.py new file mode 100644 index 00000000000..1879b0672fa --- /dev/null +++ b/recipe/one_step_off_policy/utils.py @@ -0,0 +1,38 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright 2025 Meituan Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from omegaconf import DictConfig + +from verl.trainer.ppo.core_algos import AdvantageEstimator + + +def need_critic(config: DictConfig) -> bool: + """Given a config, do we need critic""" + if config.algorithm.adv_estimator == AdvantageEstimator.GAE: + return True + elif config.algorithm.adv_estimator in [ + AdvantageEstimator.GRPO, + AdvantageEstimator.GRPO_PASSK, + AdvantageEstimator.REINFORCE_PLUS_PLUS, + # AdvantageEstimator.REMAX, # TODO:REMAX advantage estimator is not yet supported in one_step_off_policy + AdvantageEstimator.RLOO, + AdvantageEstimator.OPO, + AdvantageEstimator.REINFORCE_PLUS_PLUS_BASELINE, + AdvantageEstimator.GPG, + ]: + return False + else: + raise NotImplementedError diff --git a/recipe/prime/main_prime.py b/recipe/prime/main_prime.py index 4c3ed6e6d9e..39d20de4326 100644 --- a/recipe/prime/main_prime.py +++ b/recipe/prime/main_prime.py @@ -33,6 +33,9 @@ import ray from omegaconf import OmegaConf +from verl.trainer.ppo.utils import need_reference_policy +from verl.utils.config import validate_config + from .prime_ray_trainer import RayPRIMETrainer @@ -67,14 +70,6 @@ def main_task(config, compute_score=None): pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values OmegaConf.resolve(config) - # download the checkpoint from hdfs - local_path = copy_local_path_from_hdfs(config.actor_rollout_ref.model.path) - - # instantiate tokenizer - from verl.utils import hf_tokenizer - - tokenizer = hf_tokenizer(local_path) - # define worker classes if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: assert config.critic.strategy in {"fsdp", "fsdp2"} @@ -118,6 +113,21 @@ def main_task(config, compute_score=None): role_worker_mapping[Role.RewardModel] = ray.remote(PRIMERewardModelWorker) mapping[Role.RewardModel] = global_pool_id + # validate config + # TODO: Additional config checks can be added with proper function under prime recipe + validate_config( + config=config, + use_reference_policy=need_reference_policy(role_worker_mapping), + use_critic=False, + ) + + # download the checkpoint from hdfs + local_path = copy_local_path_from_hdfs(config.actor_rollout_ref.model.path) + + # instantiate tokenizer + from verl.utils import hf_tokenizer + + tokenizer = hf_tokenizer(local_path) reward_manager_name = config.reward_model.get("reward_manager", "naive") if reward_manager_name == "naive": from verl.workers.reward_manager import NaiveRewardManager diff --git a/recipe/prime/prime_ray_trainer.py b/recipe/prime/prime_ray_trainer.py index a5ad96431a8..6782b32256a 100644 --- a/recipe/prime/prime_ray_trainer.py +++ b/recipe/prime/prime_ray_trainer.py @@ -30,7 +30,8 @@ from verl.single_controller.ray import RayWorkerGroup from verl.trainer.ppo.core_algos import agg_loss from verl.trainer.ppo.metric_utils import _compute_response_info -from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager, Role, WorkerType +from verl.trainer.ppo.ray_trainer import RayPPOTrainer, ResourcePoolManager +from verl.trainer.ppo.utils import Role, WorkerType from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn from verl.utils.metric import reduce_metrics @@ -176,10 +177,6 @@ def __init__( self.use_critic = False - def _validate_config(self): - super()._validate_config() - # TODO: Additional config checks can be added here - def _create_dataloader(self, 
*args, **kwargs): from torch.utils.data import DataLoader, RandomSampler, SequentialSampler diff --git a/recipe/retool/retool.py b/recipe/retool/retool.py index b4d6028ff8f..7bcc70453ee 100644 --- a/recipe/retool/retool.py +++ b/recipe/retool/retool.py @@ -112,7 +112,7 @@ def compute_score(data_source, solution_str, ground_truth, extra_info): num_turns = extra_info["num_turns"] if result["score"] < 0: tool_call_reward = (num_turns - 2) / 2 * 0.1 - result["score"] = min(0, result["score"] + tool_call_reward) + result["score"] = min(-0.6, result["score"] + tool_call_reward) if result["pred"] is None: result["pred"] = "" diff --git a/recipe/retool/sandbox_fusion_tool_config.yaml b/recipe/retool/sandbox_fusion_tool_config.yaml index 20345715525..71b10e50ec9 100644 --- a/recipe/retool/sandbox_fusion_tool_config.yaml +++ b/recipe/retool/sandbox_fusion_tool_config.yaml @@ -1,7 +1,7 @@ tools: - class_name: "recipe.retool.retool.CustomSandboxFusionTool" config: - sandbox_fusion_url: "https://***.apigateway-cn-beijing.volceapi.com/run_code" + sandbox_fusion_url: "http://localhost:8080/run_code" num_workers: 128 enable_global_rate_limit: true rate_limit: 128 diff --git a/recipe/spin/main_spin.py b/recipe/spin/main_spin.py index 782fe4cff17..e66ed07256e 100644 --- a/recipe/spin/main_spin.py +++ b/recipe/spin/main_spin.py @@ -19,7 +19,9 @@ import ray from recipe.spin.spin_trainer import RaySPINTrainer +from recipe.spin.utils import validate_config from verl.trainer.ppo.reward import get_custom_reward_fn +from verl.trainer.ppo.utils import need_reference_policy @hydra.main(config_path="config", config_name="spin_trainer", version_base=None) @@ -56,16 +58,6 @@ def run(self, config): pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values OmegaConf.resolve(config) - # download the checkpoint from hdfs - local_path = copy_to_local(config.actor_rollout_ref.model.path) - - # instantiate tokenizer - from verl.utils import hf_processor, hf_tokenizer - - trust_remote_code = config.data.get("trust_remote_code", False) - tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) - processor = hf_processor(local_path, use_fast=True) # used for multimodal LLM, could be none - # define worker classes if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: assert config.critic.strategy in {"fsdp", "fsdp2"} @@ -117,6 +109,23 @@ def run(self, config): role_worker_mapping[Role.RefPolicy] = ray.remote(SPINRolloutRefWorker) mapping[Role.RefPolicy] = global_pool_id + # validate config + validate_config( + config=config, + use_reference_policy=need_reference_policy(role_worker_mapping), + use_critic=False, + ) + + # download the checkpoint from hdfs + local_path = copy_to_local(config.actor_rollout_ref.model.path) + + # instantiate tokenizer + from verl.utils import hf_processor, hf_tokenizer + + trust_remote_code = config.data.get("trust_remote_code", False) + tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) + processor = hf_processor(local_path, use_fast=True) # used for multimodal LLM, could be none + from verl.workers.reward_manager import get_reward_manager_cls # Note(haibin.lin): please make sure custom reward managers are imported and diff --git a/recipe/spin/spin_trainer.py b/recipe/spin/spin_trainer.py index 43789218f57..bb6fe672634 100644 --- a/recipe/spin/spin_trainer.py +++ b/recipe/spin/spin_trainer.py @@ -19,7 +19,6 @@ from collections import defaultdict from contextlib import contextmanager from dataclasses import
dataclass, field -from enum import Enum from pprint import pprint from typing import Any, Optional @@ -35,7 +34,6 @@ from recipe.spin import core_algos from verl import DataProto from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto -from verl.single_controller.base import Worker from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup from verl.single_controller.ray.base import create_colocated_worker_cls from verl.trainer.ppo.metric_utils import ( @@ -44,27 +42,12 @@ process_validation_metrics, reduce_metrics, ) -from verl.trainer.ppo.ray_trainer import Role +from verl.trainer.ppo.utils import Role, WorkerType, need_reference_policy, need_reward_model from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path from verl.utils.seqlen_balancing import get_seqlen_balanced_partitions, log_seqlen_unbalance from verl.utils.torch_functional import masked_mean from verl.utils.tracking import ValidationGenerationsLogger -WorkerType = type[Worker] - - -class AdvantageEstimator(str, Enum): - """ - Using an enumeration class to avoid spelling errors in adv_estimator - """ - - GAE = "gae" - GRPO = "grpo" - REINFORCE_PLUS_PLUS = "reinforce_plus_plus" - REINFORCE_PLUS_PLUS_BASELINE = "reinforce_plus_plus_baseline" - REMAX = "remax" - RLOO = "rloo" - @dataclass class ResourcePoolManager: @@ -386,8 +369,9 @@ def __init__( self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager - self.use_reference_policy = Role.RefPolicy in role_worker_mapping - self.use_rm = Role.RewardModel in role_worker_mapping + self.use_reference_policy = need_reference_policy(role_worker_mapping) + self.use_rm = need_reward_model(role_worker_mapping) + self.use_critic = False self.ray_worker_group_cls = ray_worker_group_cls self.validation_generations_logger = ValidationGenerationsLogger() self.async_rollout_mode = False @@ -398,146 +382,8 @@ def __init__( if config.algorithm.use_kl_in_reward: self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl) - self.use_critic = False - self._validate_config() self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) - def _validate_config(self): - config = self.config - # number of GPUs total - n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes - - # 1. Check total batch size for data correctness - real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n - assert real_train_batch_size % n_gpus == 0, ( - f"real_train_batch_size ({real_train_batch_size}) must be divisible by total n_gpus ({n_gpus})." - ) - - # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu" - # We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu". - def check_mutually_exclusive(mbs, mbs_per_gpu, name: str): - settings = { - "actor_rollout_ref.actor": "micro_batch_size", - "critic": "micro_batch_size", - "reward_model": "micro_batch_size", - "actor_rollout_ref.ref": "log_prob_micro_batch_size", - "actor_rollout_ref.rollout": "log_prob_micro_batch_size", - } - - if name in settings: - param = settings[name] - param_per_gpu = f"{param}_per_gpu" - - if mbs is None and mbs_per_gpu is None: - raise ValueError( - f"[{name}] Please set at least one of '{name}.{param}' or '{name}.{param_per_gpu}'." - ) - - if mbs is not None and mbs_per_gpu is not None: - raise ValueError( - f"[{name}] You have set both '{name}.{param}' AND '{name}.{param_per_gpu}'. 
" - f"Please remove '{name}.{param}' because only '*_{param_per_gpu}' is supported " - f"(the former is deprecated)." - ) - - if not config.actor_rollout_ref.actor.use_dynamic_bsz: - # actor: ppo_micro_batch_size vs. ppo_micro_batch_size_per_gpu - check_mutually_exclusive( - config.actor_rollout_ref.actor.ppo_micro_batch_size, - config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu, - "actor_rollout_ref.actor", - ) - - if self.use_reference_policy: - # reference: log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu - check_mutually_exclusive( - config.actor_rollout_ref.ref.log_prob_micro_batch_size, - config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu, - "actor_rollout_ref.ref", - ) - - # The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu - check_mutually_exclusive( - config.actor_rollout_ref.rollout.log_prob_micro_batch_size, - config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu, - "actor_rollout_ref.rollout", - ) - - if self.use_critic and not config.critic.use_dynamic_bsz: - # Check for critic micro-batch size conflicts - check_mutually_exclusive( - config.critic.ppo_micro_batch_size, config.critic.ppo_micro_batch_size_per_gpu, "critic" - ) - - # Check for reward model micro-batch size conflicts - if config.reward_model.enable and not config.reward_model.use_dynamic_bsz: - check_mutually_exclusive( - config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model" - ) - - # Actor - # check if train_batch_size is larger than ppo_mini_batch_size - # if NOT dynamic_bsz, we must ensure: - # ppo_mini_batch_size is divisible by ppo_micro_batch_size - # ppo_micro_batch_size * sequence_parallel_size >= n_gpus - if not config.actor_rollout_ref.actor.use_dynamic_bsz: - assert config.data.train_batch_size >= config.actor_rollout_ref.actor.ppo_mini_batch_size - sp_size = config.actor_rollout_ref.actor.get("ulysses_sequence_parallel_size", 1) - if config.actor_rollout_ref.actor.ppo_micro_batch_size is not None: - assert ( - config.actor_rollout_ref.actor.ppo_mini_batch_size - % config.actor_rollout_ref.actor.ppo_micro_batch_size - == 0 - ) - assert config.actor_rollout_ref.actor.ppo_micro_batch_size * sp_size >= n_gpus - - assert config.actor_rollout_ref.actor.loss_agg_mode in [ - "token-mean", - "seq-mean-token-sum", - "seq-mean-token-mean", - ], f"Invalid loss_agg_mode: {config.actor_rollout_ref.actor.loss_agg_mode}" - - if config.algorithm.use_kl_in_reward and config.actor_rollout_ref.actor.use_kl_loss: - print("NOTICE: You have both enabled in-reward kl and kl loss.") - - # critic - if self.use_critic and not config.critic.use_dynamic_bsz: - assert config.data.train_batch_size >= config.critic.ppo_mini_batch_size - sp_size = config.critic.get("ulysses_sequence_parallel_size", 1) - if config.critic.ppo_micro_batch_size is not None: - assert config.critic.ppo_mini_batch_size % config.critic.ppo_micro_batch_size == 0 - assert config.critic.ppo_micro_batch_size * sp_size >= n_gpus - - # Check if use_remove_padding is enabled when using sequence parallelism for fsdp - if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: - if ( - config.actor_rollout_ref.actor.get("ulysses_sequence_parallel_size", 1) > 1 - or config.actor_rollout_ref.ref.get("ulysses_sequence_parallel_size", 1) > 1 - ): - assert config.actor_rollout_ref.model.use_remove_padding, ( - "When using sequence parallelism for actor/ref policy, you must enable `use_remove_padding`." 
- ) - - if self.use_critic and config.critic.strategy in {"fsdp", "fsdp2"}: - if config.critic.get("ulysses_sequence_parallel_size", 1) > 1: - assert config.critic.model.use_remove_padding, ( - "When using sequence parallelism for critic, you must enable `use_remove_padding`." - ) - - if config.data.get("val_batch_size", None) is not None: - print( - "WARNING: val_batch_size is deprecated. Validation datasets are sent to inference engines " - "as a whole batch, which will schedule the memory themselves." - ) - - # check eval config - if config.actor_rollout_ref.rollout.val_kwargs.do_sample: - assert config.actor_rollout_ref.rollout.temperature > 0, ( - "validation gen temperature should be greater than 0 when enabling do_sample" - ) - - print("[validate_config] All configuration checks passed successfully!") - def _create_dataloader(self, train_dataset, val_dataset, collate_fn, train_sampler): """ Creates the train and validation dataloaders. diff --git a/recipe/spin/utils.py b/recipe/spin/utils.py new file mode 100644 index 00000000000..571ad1e9154 --- /dev/null +++ b/recipe/spin/utils.py @@ -0,0 +1,160 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright 2023-2024 SGLang Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from omegaconf import DictConfig + + +def validate_config( + config: DictConfig, + use_reference_policy: bool, + use_critic: bool, +) -> None: + """ + Validate an OmegaConf DictConfig + + Args: + config: The OmegaConf DictConfig to validate. + use_reference_policy (bool): is ref policy needed + use_critic (bool): is critic needed + """ + # number of GPUs total + n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes + + # 1. Check total batch size for data correctness + real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n + assert real_train_batch_size % n_gpus == 0, ( + f"real_train_batch_size ({real_train_batch_size}) must be divisible by total n_gpus ({n_gpus})." + ) + + # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu" + # We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu". + def check_mutually_exclusive(mbs, mbs_per_gpu, name: str): + settings = { + "actor_rollout_ref.actor": "micro_batch_size", + "critic": "micro_batch_size", + "reward_model": "micro_batch_size", + "actor_rollout_ref.ref": "log_prob_micro_batch_size", + "actor_rollout_ref.rollout": "log_prob_micro_batch_size", + } + + if name in settings: + param = settings[name] + param_per_gpu = f"{param}_per_gpu" + + if mbs is None and mbs_per_gpu is None: + raise ValueError(f"[{name}] Please set at least one of '{name}.{param}' or '{name}.{param_per_gpu}'.") + + if mbs is not None and mbs_per_gpu is not None: + raise ValueError( + f"[{name}] You have set both '{name}.{param}' AND '{name}.{param_per_gpu}'. " + f"Please remove '{name}.{param}' because only '*_{param_per_gpu}' is supported " + f"(the former is deprecated)." 
+ ) + + if not config.actor_rollout_ref.actor.use_dynamic_bsz: + # actor: ppo_micro_batch_size vs. ppo_micro_batch_size_per_gpu + check_mutually_exclusive( + config.actor_rollout_ref.actor.ppo_micro_batch_size, + config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu, + "actor_rollout_ref.actor", + ) + + if use_reference_policy: + # reference: log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu + check_mutually_exclusive( + config.actor_rollout_ref.ref.log_prob_micro_batch_size, + config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu, + "actor_rollout_ref.ref", + ) + + # The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu + check_mutually_exclusive( + config.actor_rollout_ref.rollout.log_prob_micro_batch_size, + config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu, + "actor_rollout_ref.rollout", + ) + + if use_critic and not config.critic.use_dynamic_bsz: + # Check for critic micro-batch size conflicts + check_mutually_exclusive( + config.critic.ppo_micro_batch_size, config.critic.ppo_micro_batch_size_per_gpu, "critic" + ) + + # Check for reward model micro-batch size conflicts + if config.reward_model.enable and not config.reward_model.use_dynamic_bsz: + check_mutually_exclusive( + config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model" + ) + + # Actor + # check if train_batch_size is larger than ppo_mini_batch_size + # if NOT dynamic_bsz, we must ensure: + # ppo_mini_batch_size is divisible by ppo_micro_batch_size + # ppo_micro_batch_size * sequence_parallel_size >= n_gpus + if not config.actor_rollout_ref.actor.use_dynamic_bsz: + assert config.data.train_batch_size >= config.actor_rollout_ref.actor.ppo_mini_batch_size + sp_size = config.actor_rollout_ref.actor.get("ulysses_sequence_parallel_size", 1) + if config.actor_rollout_ref.actor.ppo_micro_batch_size is not None: + assert ( + config.actor_rollout_ref.actor.ppo_mini_batch_size % config.actor_rollout_ref.actor.ppo_micro_batch_size + == 0 + ) + assert config.actor_rollout_ref.actor.ppo_micro_batch_size * sp_size >= n_gpus + + assert config.actor_rollout_ref.actor.loss_agg_mode in [ + "token-mean", + "seq-mean-token-sum", + "seq-mean-token-mean", + ], f"Invalid loss_agg_mode: {config.actor_rollout_ref.actor.loss_agg_mode}" + + if config.algorithm.use_kl_in_reward and config.actor_rollout_ref.actor.use_kl_loss: + print("NOTICE: You have both enabled in-reward kl and kl loss.") + + # critic + if use_critic and not config.critic.use_dynamic_bsz: + assert config.data.train_batch_size >= config.critic.ppo_mini_batch_size + sp_size = config.critic.get("ulysses_sequence_parallel_size", 1) + if config.critic.ppo_micro_batch_size is not None: + assert config.critic.ppo_mini_batch_size % config.critic.ppo_micro_batch_size == 0 + assert config.critic.ppo_micro_batch_size * sp_size >= n_gpus + + # Check if use_remove_padding is enabled when using sequence parallelism for fsdp + if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: + if ( + config.actor_rollout_ref.actor.get("ulysses_sequence_parallel_size", 1) > 1 + or config.actor_rollout_ref.ref.get("ulysses_sequence_parallel_size", 1) > 1 + ): + assert config.actor_rollout_ref.model.use_remove_padding, ( + "When using sequence parallelism for actor/ref policy, you must enable `use_remove_padding`." 
+ ) + + if use_critic and config.critic.strategy in {"fsdp", "fsdp2"}: + if config.critic.get("ulysses_sequence_parallel_size", 1) > 1: + assert config.critic.model.use_remove_padding, ( + "When using sequence parallelism for critic, you must enable `use_remove_padding`." + ) + + if config.data.get("val_batch_size", None) is not None: + print( + "WARNING: val_batch_size is deprecated. Validation datasets are sent to inference engines " + "as a whole batch, which will schedule the memory themselves." + ) + + # check eval config + if config.actor_rollout_ref.rollout.val_kwargs.do_sample: + assert config.actor_rollout_ref.rollout.temperature > 0, ( + "validation gen temperature should be greater than 0 when enabling do_sample" + ) + + print("[validate_config] All configuration checks passed successfully!") diff --git a/recipe/sppo/main_sppo.py b/recipe/sppo/main_sppo.py index eb080eba06b..7f5a9e2c9ad 100644 --- a/recipe/sppo/main_sppo.py +++ b/recipe/sppo/main_sppo.py @@ -24,6 +24,8 @@ from omegaconf import OmegaConf from verl.trainer.ppo.reward import load_reward_manager +from verl.trainer.ppo.utils import need_reference_policy +from verl.utils.config import validate_config from .sppo_ray_trainer import RaySPPOTrainer @@ -66,16 +68,6 @@ def run(self, config): pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values OmegaConf.resolve(config) - # download the checkpoint from hdfs - local_path = copy_to_local(config.actor_rollout_ref.model.path) - - # instantiate tokenizer - from verl.utils import hf_processor, hf_tokenizer - - trust_remote_code = config.data.get("trust_remote_code", False) - tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) - processor = hf_processor(local_path, use_fast=True) # used for multimodal LLM, could be none - # define worker classes if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: assert config.critic.strategy in {"fsdp", "fsdp2"} @@ -133,6 +125,23 @@ def run(self, config): role_worker_mapping[Role.RefPolicy] = ray.remote(SPPOActorRolloutRefWorker) mapping[Role.RefPolicy] = global_pool_id + # validate config + validate_config( + config=config, + use_reference_policy=need_reference_policy(role_worker_mapping), + use_critic=False, + ) + + # download the checkpoint from hdfs + local_path = copy_to_local(config.actor_rollout_ref.model.path) + + # instantiate tokenizer + from verl.utils import hf_processor, hf_tokenizer + + trust_remote_code = config.data.get("trust_remote_code", False) + tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) + processor = hf_processor(local_path, use_fast=True) # used for multimodal LLM, could be none + reward_fn = load_reward_manager( config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {}) ) diff --git a/recipe/sppo/sppo_ray_trainer.py b/recipe/sppo/sppo_ray_trainer.py index 0725d293e2b..f11421f8511 100644 --- a/recipe/sppo/sppo_ray_trainer.py +++ b/recipe/sppo/sppo_ray_trainer.py @@ -38,12 +38,11 @@ AdvantageEstimator, RayPPOTrainer, ResourcePoolManager, - Role, - WorkerType, apply_kl_penalty, compute_response_mask, ) from verl.trainer.ppo.reward import compute_reward, compute_reward_async +from verl.trainer.ppo.utils import Role, WorkerType, need_reference_policy, need_reward_model from verl.utils.profiler.performance import simple_timer from verl.utils.tracking import ValidationGenerationsLogger @@ -111,8 +110,9 @@ def __init__( self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = 
resource_pool_manager - self.use_reference_policy = Role.RefPolicy in role_worker_mapping - self.use_rm = Role.RewardModel in role_worker_mapping + self.use_reference_policy = need_reference_policy(role_worker_mapping) + self.use_rm = need_reward_model(role_worker_mapping) + self.use_critic = False self.ray_worker_group_cls = ray_worker_group_cls self.validation_generations_logger = ValidationGenerationsLogger() self.device_name = device_name if device_name else self.config.trainer.device @@ -122,9 +122,6 @@ def __init__( if config.algorithm.use_kl_in_reward: self.kl_ctrl_in_reward = core_algos.get_kl_controller(config.algorithm.kl_ctrl) - self.use_critic = False - - self._validate_config() self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) def fit(self): diff --git a/requirements_sglang.txt b/requirements_sglang.txt index aa3a30c2924..c366ace43ba 100644 --- a/requirements_sglang.txt +++ b/requirements_sglang.txt @@ -17,5 +17,5 @@ torchdata torchvision transformers wandb -sglang[all]==0.4.9.post6 +sglang[all]==0.4.10.post2 huggingface_hub diff --git a/setup.py b/setup.py index 7e97389f74c..5c10c1547cc 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ VLLM_REQUIRES = ["tensordict>=0.8.0,<=0.9.1,!=0.9.0", "vllm>=0.7.3,<=0.9.1"] SGLANG_REQUIRES = [ "tensordict>=0.8.0,<=0.9.1,!=0.9.0", - "sglang[srt,openai]==0.4.9.post6", + "sglang[srt,openai]==0.4.10.post2", "torch==2.7.1", ] TRL_REQUIRES = ["trl<=0.9.6"] diff --git a/tests/experimental/agent_loop/test_basic_agent_loop.py b/tests/experimental/agent_loop/test_basic_agent_loop.py index 553a9a72586..b41538296b3 100644 --- a/tests/experimental/agent_loop/test_basic_agent_loop.py +++ b/tests/experimental/agent_loop/test_basic_agent_loop.py @@ -26,6 +26,7 @@ from verl.protocol import DataProto from verl.tools.base_tool import BaseTool, OpenAIFunctionToolSchema from verl.tools.schemas import ToolResponse +from verl.trainer.ppo.reward import compute_reward, load_reward_manager from verl.utils import hf_tokenizer @@ -41,6 +42,10 @@ def init_config() -> DictConfig: # test sleep/wake_up with fsdp offload "actor_rollout_ref.actor.fsdp_config.param_offload=True", "actor_rollout_ref.actor.fsdp_config.optimizer_offload=True", + "reward_model.reward_manager=dapo", + "+reward_model.reward_kwargs.overlong_buffer_cfg.enable=False", + "+reward_model.reward_kwargs.overlong_buffer_cfg.len=3072", + "+reward_model.reward_kwargs.max_resp_len=4096", ], ) @@ -69,6 +74,10 @@ def test_single_turn(init_config): ) agent_loop_manager = init_agent_loop_manager(init_config) + tokenizer = hf_tokenizer(init_config.actor_rollout_ref.model.path) + reward_fn = load_reward_manager( + init_config, tokenizer, num_examine=0, **init_config.reward_model.get("reward_kwargs", {}) + ) raw_prompts = [ [ @@ -97,10 +106,17 @@ def test_single_turn(init_config): assert result.batch["input_ids"].size(1) == seq_len assert result.batch["attention_mask"].size(1) == seq_len assert result.batch["position_ids"].size(1) == seq_len - assert result.batch["rm_scores"].size(1) == result.batch["responses"].size(1) + if init_config.actor_rollout_ref.rollout.calculate_log_probs: assert result.batch["rollout_log_probs"].size(1) == result.batch["responses"].size(1) + # check compute score + assert result.batch["rm_scores"].shape == result.batch["responses"].shape + reward_tensor, reward_extra_info = compute_reward(result, reward_fn) + assert reward_tensor.shape == result.batch["responses"].shape + assert "acc" in reward_extra_info, f"reward_extra_info {reward_extra_info} should 
contain 'acc'" + assert reward_extra_info["acc"].shape == (len(result),), f"invalid acc: {reward_extra_info['acc']}" + # check turns num_turns = result.non_tensor_batch["__num_turns__"] assert np.all(num_turns == 2) diff --git a/tests/special_sanity/check_license.py b/tests/special_sanity/check_license.py index 1a2073e6b02..a4ade024433 100644 --- a/tests/special_sanity/check_license.py +++ b/tests/special_sanity/check_license.py @@ -13,6 +13,7 @@ # limitations under the License. from argparse import ArgumentParser from pathlib import Path +from typing import Iterable license_head_bytedance = "Copyright 2024 Bytedance Ltd. and/or its affiliates" license_head_bytedance_25 = "Copyright 2025 Bytedance Ltd. and/or its affiliates" @@ -35,13 +36,37 @@ ] +def get_py_files(path_arg: Path) -> Iterable[Path]: + """get py files under a dir. if already py file return it + + Args: + path_arg (Path): path to scan for py files + + Returns: + Iterable[Path]: list of py files + """ + if path_arg.is_dir(): + return path_arg.glob("**/*.py") + elif path_arg.is_file() and path_arg.suffix == ".py": + return [path_arg] + return [] + + if __name__ == "__main__": parser = ArgumentParser() - parser.add_argument("--directory", "-d", required=True, type=str) + parser.add_argument( + "--directories", + "-d", + required=True, + type=Path, + nargs="+", + help="List of directories to check for license headers", + ) args = parser.parse_args() - directory_in_str = args.directory - pathlist = Path(directory_in_str).glob("**/*.py") + # Collect all Python files from specified directories + pathlist = set(path for path_arg in args.directories for path in get_py_files(path_arg)) + for path in pathlist: # because path is object not string path_in_str = str(path.absolute()) diff --git a/tests/utils/dataset/test_rl_dataset_on_cpu.py b/tests/utils/dataset/test_rl_dataset_on_cpu.py index 2afc3ef49f6..391e89a94d5 100644 --- a/tests/utils/dataset/test_rl_dataset_on_cpu.py +++ b/tests/utils/dataset/test_rl_dataset_on_cpu.py @@ -77,7 +77,7 @@ def test_image_rl_data(): "prompt_key": "prompt", "max_prompt_length": 1024, "filter_overlong_prompts": True, - "filter_overlong_prompts_workers": 2, + "filter_overlong_prompts_workers": 1, } ) dataset = RLHFDataset( diff --git a/tests/utils/test_activation_offload.py b/tests/utils/test_activation_offload.py index 2393d7962ae..25bc23c40ac 100644 --- a/tests/utils/test_activation_offload.py +++ b/tests/utils/test_activation_offload.py @@ -29,6 +29,23 @@ from verl.utils.fsdp_utils import MixedPrecisionPolicy, apply_fsdp2, get_fsdp_wrap_policy +def create_random_input_ids(batch_size, seq_len, vocab_size): + from flash_attn.bert_padding import unpad_input + + from verl.utils.model import compute_position_id_with_mask, create_random_mask + + input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda") + + attention_mask = create_random_mask( + input_ids, max_ratio_of_left_padding=0.1, min_ratio_of_valid_token=0.5, max_ratio_of_valid_token=0.7 + ) + position_ids = compute_position_id_with_mask(attention_mask) + + input_ids = unpad_input(input_ids.unsqueeze(-1), attention_mask)[0].transpose(0, 1) + position_ids = unpad_input(position_ids.unsqueeze(-1), attention_mask)[0].transpose(0, 1) + return input_ids, position_ids + + def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy="fsdp"): torch.cuda.set_device(rank) torch.distributed.init_process_group( @@ -85,15 +102,13 @@ def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy seq_len = 32 
vocab_size = 32000 # First input for initial update - input_ids1 = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda") - attention_mask1 = torch.ones_like(input_ids1) + input_ids1, position_ids1 = create_random_input_ids(batch_size, seq_len, vocab_size) # Second input for verification - input_ids2 = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda") - attention_mask2 = torch.ones_like(input_ids2) + input_ids2, position_ids2 = create_random_input_ids(batch_size, seq_len, vocab_size) # Step 1: Initial update and save checkpoint - outputs1 = model(input_ids=input_ids1, attention_mask=attention_mask1) + outputs1 = model(input_ids=input_ids1, position_ids=position_ids1) loss1 = outputs1.logits.mean() loss1.backward() optimizer.step() @@ -106,7 +121,7 @@ def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy checkpoint_manager.save_checkpoint(local_path=checkpoint_path, hdfs_path=None, global_step=0) # Step 2: Second update and forward pass - outputs2 = model(input_ids=input_ids2, attention_mask=attention_mask2) + outputs2 = model(input_ids=input_ids2, position_ids=position_ids2) loss2 = outputs2.logits.mean() loss2.backward() optimizer.step() @@ -115,14 +130,14 @@ def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy # Record logits after second update with torch.no_grad(): - logits_without_offloading = model(input_ids=input_ids2, attention_mask=attention_mask2).logits + logits_without_offloading = model(input_ids=input_ids2, position_ids=position_ids2).logits # Step 3: wrap module with activation offloading and load checkpoint - enable_activation_offloading(model, "fsdp") + enable_activation_offloading(model, strategy=strategy) checkpoint_manager.load_checkpoint(checkpoint_path) # Step 4: Repeat the second update with same input - outputs3 = model(input_ids=input_ids2, attention_mask=attention_mask2) + outputs3 = model(input_ids=input_ids2, position_ids=position_ids2) loss3 = outputs3.logits.mean() loss3.backward() optimizer.step() @@ -131,7 +146,7 @@ def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy # Record logits after loaded checkpoint and update with torch.no_grad(): - logits_with_offloading = model(input_ids=input_ids2, attention_mask=attention_mask2).logits + logits_with_offloading = model(input_ids=input_ids2, position_ids=position_ids2).logits # Step 4: Verify outputs match torch.testing.assert_close(logits_without_offloading, logits_with_offloading, atol=0.0, rtol=0.0) diff --git a/tests/utils/test_nvtx_profile.py b/tests/utils/test_nvtx_profile.py index fea7675335a..645da153d0a 100644 --- a/tests/utils/test_nvtx_profile.py +++ b/tests/utils/test_nvtx_profile.py @@ -120,8 +120,9 @@ def test_annotate_decorator(self): mock_self = MagicMock() mock_self.profiler = self.profiler mock_self.profiler.this_step = True + decorator = mock_self.profiler.annotate(message="test") - @NsightSystemsProfiler.annotate(message="test") + @decorator def test_func(self, *args, **kwargs): return "result" diff --git a/tests/utils/test_special_mstx_profile.py b/tests/utils/test_special_mstx_profile.py index c723c240865..a80cabfa49c 100644 --- a/tests/utils/test_special_mstx_profile.py +++ b/tests/utils/test_special_mstx_profile.py @@ -149,8 +149,9 @@ def test_annotate_decorator_applied_correctly(self): mock_start_patch.return_value = mock_mark_range with patch("verl.utils.profiler.mstx_profile.get_npu_profiler") as mock_get_profiler: + decorator = mock_worker.profiler.annotate(message="test") - 
@NPUProfiler.annotate(message="test") + @decorator def test_func(self, *args, **kwargs): return "result" @@ -171,8 +172,9 @@ def test_annotate_when_profiler_disabled(self): patch("verl.utils.profiler.mstx_profile.mark_end_range") as mock_end_patch, patch("verl.utils.profiler.mstx_profile.get_npu_profiler") as mock_get_profiler, ): + decorator = mock_worker.profiler.annotate(message="test") - @NPUProfiler.annotate(message="test") + @decorator def test_func(self, *args, **kwargs): return "result" @@ -193,8 +195,9 @@ def test_annotate_when_this_step_disabled(self): patch("verl.utils.profiler.mstx_profile.mark_end_range") as mock_end_patch, patch("verl.utils.profiler.mstx_profile.get_npu_profiler") as mock_get_profiler, ): + decorator = mock_worker.profiler.annotate(message="test") - @NPUProfiler.annotate(message="test") + @decorator def test_func(self, *args, **kwargs): return "result" @@ -221,8 +224,9 @@ def test_annotate_discrete_mode_enabled(self): ): mock_start_patch.return_value = mock_mark_range mock_get_profiler.return_value = mock_profile_npu + decorator = mock_worker.profiler.annotate(message="test", role="test_role") - @NPUProfiler.annotate(message="test", role="test_role") + @decorator def test_func(self, *args, **kwargs): return "result" @@ -253,8 +257,9 @@ def test_annotate_with_default_message(self): patch("verl.utils.profiler.mstx_profile.mark_end_range") as mock_end_patch, ): mock_start_patch.return_value = mock_mark_range + decorator = mock_worker.profiler.annotate() - @NPUProfiler.annotate() + @decorator def test_func(self, *args, **kwargs): return "result" diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index f174ef6da4c..13526046a0d 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -134,6 +134,8 @@ class AgentLoopOutput(BaseModel): """Number of chat turns, including user, assistant, tool.""" metrics: AgentLoopMetrics """Auxiliary performance metrics""" + extra_fields: dict[str, Any] = {} + """Extra fields for dynamic addition.""" class _InternalAgentLoopOutput(AgentLoopOutput): @@ -252,7 +254,7 @@ def __init__(self, config: DictConfig, local_path: str) -> None: ) self.loop = asyncio.get_event_loop() - async def compute_score(self, output: AgentLoopOutput, kwargs: dict) -> float: + async def compute_score(self, output: AgentLoopOutput, kwargs: dict) -> dict: """Compute reward score for agent loop output. NOTE: Since `reward_manager.__call__` is blocking function, we run it in thread pool to @@ -263,7 +265,7 @@ async def compute_score(self, output: AgentLoopOutput, kwargs: dict) -> float: kwargs (dict): Dataset fields from `verl.utils.dataset.RLHFDataset`. Returns: - float: Reward score. + dict: Reward score and reward extra info. 
""" prompts = torch.tensor(output.prompt_ids, dtype=torch.long).unsqueeze(0) responses = torch.tensor(output.response_ids, dtype=torch.long).unsqueeze(0) @@ -284,12 +286,16 @@ async def compute_score(self, output: AgentLoopOutput, kwargs: dict) -> float: batch=batch, non_tensor_batch=non_tensor_batch, ) - reward_tensor = await self.loop.run_in_executor( + result = await self.loop.run_in_executor( None, self.reward_manager, data, + True, # return_dict ) - return reward_tensor.sum(dim=-1).item() + + reward_score = result["reward_tensor"].sum(dim=-1).item() + reward_extra_info = {k: v[0] for k, v in result.get("reward_extra_info", {}).items()} + return {"reward_score": reward_score, "reward_extra_info": reward_extra_info} @ray.remote @@ -424,7 +430,9 @@ async def _run_agent_loop( # Some AgentLoop may have already computed the reward score, e.g SWE-agent. if output.reward_score is None and not self.config.reward_model.enable: - output.reward_score = await self.reward_manager_worker.compute_score.remote(output, kwargs) + result = await self.reward_manager_worker.compute_score.remote(output, kwargs) + output.reward_score = result["reward_score"] + output.extra_fields["reward_extra_info"] = result["reward_extra_info"] # NOTE: consistent with batch version of generate_sequences in vllm_rollout_spmd.py # prompt_ids: left padded with zeros (e.g., [0,0,0,0,1,2,3,4]) @@ -534,6 +542,7 @@ async def _run_agent_loop( reward_score=output.reward_score, num_turns=output.num_turns, metrics=output.metrics, + extra_fields=output.extra_fields, ) def _postprocess(self, inputs: list[_InternalAgentLoopOutput]) -> DataProto: @@ -575,13 +584,23 @@ def _postprocess(self, inputs: list[_InternalAgentLoopOutput]) -> DataProto: "__num_turns__": np.array([input.num_turns for input in inputs], dtype=np.int32), } + # add reward_extra_info to non_tensor_batch + reward_extra_infos = [input.extra_fields.get("reward_extra_info", {}) for input in inputs] + reward_extra_keys = list(reward_extra_infos[0].keys()) + for key in reward_extra_keys: + non_tensor_batch[key] = np.array([info[key] for info in reward_extra_infos]) + # Add multi_modal_inputs to non_tensor_batch if any samples have them multi_modal_inputs_list = [input.multi_modal_inputs for input in inputs] if any(mmi is not None for mmi in multi_modal_inputs_list): non_tensor_batch["multi_modal_inputs"] = np.array(multi_modal_inputs_list, dtype=object) metrics = [input.metrics.model_dump() for input in inputs] - return DataProto(batch=batch, non_tensor_batch=non_tensor_batch, meta_info={"metrics": metrics}) + return DataProto( + batch=batch, + non_tensor_batch=non_tensor_batch, + meta_info={"metrics": metrics, "reward_extra_keys": reward_extra_keys}, + ) async def get_trajectory_info(step, index, validate): @@ -717,10 +736,10 @@ def generate_sequences(self, prompts: DataProto) -> DataProto: self.sleep() # calculate performance metrics - metrics = [output.meta_info["metrics"] for output in outputs] # List[List[Dict[str, str]]] + metrics = [output.meta_info.pop("metrics") for output in outputs] # List[List[Dict[str, str]]] timing = self._performance_metrics(metrics, output) - output.meta_info = {"timing": timing} + output.meta_info = {"timing": timing, **outputs[0].meta_info} return output def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: DataProto) -> dict[str, float]: diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index 9daf550cdb8..9d3809f93c2 100644 --- a/verl/models/mcore/config_converter.py +++ 
b/verl/models/mcore/config_converter.py @@ -156,7 +156,8 @@ def check_and_construct_configs(original_config: dict, cls: type[T]) -> T: for key in removed_keys: original_config.pop(key) - print(f"Overridden {cls.__name__} init config: {original_config}") + if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: + print(f"Overridden {cls.__name__} init config: {original_config}") return cls(**original_config) diff --git a/verl/models/mcore/qwen2_5_vl/attention.py b/verl/models/mcore/qwen2_5_vl/attention.py index 91a27cc3edf..84e7ba8eda2 100644 --- a/verl/models/mcore/qwen2_5_vl/attention.py +++ b/verl/models/mcore/qwen2_5_vl/attention.py @@ -118,7 +118,8 @@ def forward( output, bias = self.linear_proj(context_layer) return output, bias - query, key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + # Use latest mcore 0.13 API and forward-compatible with previous versions. + outputs = self._adjust_key_value_for_inference( inference_context, query, key, @@ -129,6 +130,8 @@ def forward( sequence_len_offset, ) + query, key, value, rotary_pos_emb, attn_mask_type = outputs[:5] + if packed_seq_params is not None: query = query.squeeze(1) key = key.squeeze(1) diff --git a/verl/models/transformers/monkey_patch.py b/verl/models/transformers/monkey_patch.py index 7f0e10ab65e..d8d67d5ebaa 100644 --- a/verl/models/transformers/monkey_patch.py +++ b/verl/models/transformers/monkey_patch.py @@ -110,6 +110,65 @@ def _ulysses_flash_attention_forward( return attn_output +def _ulysses_flash_attention_forward_transformers_4_55( + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: Optional[torch.Tensor], + query_length: int, + *args, + position_ids: Optional[torch.Tensor] = None, + **kwargs, +): + """For transformers>=4.55, the flash attention api has changed, + we need to pass the query_length after doing ulysses alltoall. + + See https://github.com/huggingface/transformers/issues/40399 + """ + ulysses_sp_size = get_ulysses_sequence_parallel_world_size() + + ########## AlltoAll for Ulysses ########## + if ulysses_sp_size > 1: + assert position_ids is not None, "position_ids is required for Ulysses sequence parallelism" + + # NOTE: repeat kv heads to be divided by sequence parallel. Instead of repeating nheads_q//nheads_k, + # we choose to repeat sp_size//nheads_k, since flash_attention supports MQA/GQA. + # For example: + # - nheads_k=4, sp=8, repeats=2 + # - nheads_k=8, sp=8, repeats=1 + # - nheads_k=16, sp=8, repeats=1 + repeats = max(ulysses_sp_size // key_states.size(2), 1) + key_states = repeat_kv(key_states, repeats) + value_states = repeat_kv(value_states, repeats) + + # (bsz, seq_len/n, n_head, head_dim) -> (bsz, seq_len, n_head/n, head_dim) + query_states = gather_seq_scatter_heads(query_states, seq_dim=1, head_dim=2) + key_states = gather_seq_scatter_heads(key_states, seq_dim=1, head_dim=2) + value_states = gather_seq_scatter_heads(value_states, seq_dim=1, head_dim=2) + + # TODO: all_gather position_ids because `prepare_fa2_from_position_ids` needs it, we can eliminate + # this all_gather by passing cu_seq_lens_q, cu_seq_lens_k, max_length_k, max_length_q explicitly. 
+ # https://github.com/huggingface/transformers/pull/33932 + + # (bsz, seq_len/n) -> (bsz, seq_len) + position_ids_list = [torch.empty_like(position_ids) for _ in range(ulysses_sp_size)] + torch.distributed.all_gather(position_ids_list, position_ids, group=get_ulysses_sequence_parallel_group()) + position_ids = torch.concat(position_ids_list, dim=-1) + + # (bsz, seq_len, n_head/n, head_dim) + query_length = query_states.size(1) + attn_output = _flash_attention_forward( + query_states, key_states, value_states, attention_mask, query_length, *args, position_ids=position_ids, **kwargs + ) + + ########## AlltoAll for Ulysses ########## + if ulysses_sp_size > 1: + # (bsz, seq_len, n_head/n, head_dim) -> (bsz, seq_len/n, n_head, head_dim) + attn_output = gather_heads_scatter_seq(attn_output, seq_dim=1, head_dim=2) + + return attn_output + + def patch_vlm_for_ulysses_input_slicing(model_class: type): """ Applies a monkey patch to the forward method of a given model class @@ -304,11 +363,17 @@ def state_dict(self, *args, **kwargs): module._flash_attention_forward = _ulysses_flash_attention_forward print(f"Monkey patch _flash_attention_forward in {model.__module__}") else: - # transformers>=4.48.0 - from transformers.integrations import flash_attention + if is_transformers_version_in_range(min_version="4.55.0"): + from transformers.integrations import flash_attention + + flash_attention._flash_attention_forward = _ulysses_flash_attention_forward_transformers_4_55 + print(f"Monkey patch _flash_attention_forward in {model.__module__} for new api") + else: + # 4.48.0 <= transformers <= 4.54.1, Vision attention + from transformers.integrations import flash_attention - flash_attention._flash_attention_forward = _ulysses_flash_attention_forward - print(f"Monkey patch _flash_attention_forward in {flash_attention.__name__}") + flash_attention._flash_attention_forward = _ulysses_flash_attention_forward + print(f"Monkey patch _flash_attention_forward in {flash_attention.__name__}") patch_forward_with_backends(model, use_fused_kernels=use_fused_kernels, fused_kernels_backend=fused_kernels_backend) diff --git a/verl/models/transformers/qwen2_vl.py b/verl/models/transformers/qwen2_vl.py index 7c8214c0a8e..0cc7820d114 100644 --- a/verl/models/transformers/qwen2_vl.py +++ b/verl/models/transformers/qwen2_vl.py @@ -217,18 +217,36 @@ def flash_attention_forward( ) # remove channel dimension cu_seqlens_q, cu_seqlens_k = cu_seq_lens max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - attn_output = flash_attn_varlen_func( + + flash_attn_func = flash_attn_varlen_func + common_attn_kwargs = { + "cu_seqlens_q": cu_seqlens_q, + "cu_seqlens_k": cu_seqlens_k, + "max_seqlen_q": max_seqlen_in_batch_q, + "max_seqlen_k": max_seqlen_in_batch_k, + "dropout_p": kwargs.pop("dropout", 0.0), + "softmax_scale": kwargs.pop("softmax_scale", None), + **flash_kwargs, + } + + if flash_attn_func is None: + # Use transformers >= 4.54 + flash_attn_func = _flash_attention_forward + specific_attn_kwargs = { + "attention_mask": attention_mask, + "position_ids": position_ids, + "query_length": query_length, + "is_causal": causal, + } + else: + specific_attn_kwargs = {"causal": causal} + + attn_output = flash_attn_func( query_states, key_states, value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=kwargs.pop("dropout", 0.0), - softmax_scale=kwargs.pop("softmax_scale", None), - causal=causal, - **flash_kwargs, + **common_attn_kwargs, 
+ **specific_attn_kwargs, ) attn_output = attn_output.view(batch_size, -1, attn_output.size(-2), attn_output.size(-1)) else: diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py index f7d8825b57d..7ab01b456f7 100644 --- a/verl/trainer/main_ppo.py +++ b/verl/trainer/main_ppo.py @@ -26,6 +26,8 @@ from verl.trainer.constants_ppo import get_ppo_ray_runtime_env from verl.trainer.ppo.ray_trainer import RayPPOTrainer from verl.trainer.ppo.reward import load_reward_manager +from verl.trainer.ppo.utils import need_critic, need_reference_policy +from verl.utils.config import validate_config from verl.utils.device import is_cuda_available from verl.utils.import_utils import load_extern_type @@ -219,20 +221,6 @@ def run(self, config): pprint(OmegaConf.to_container(config, resolve=True)) OmegaConf.resolve(config) - # Download the checkpoint from HDFS to the local machine. - # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on - local_path = copy_to_local( - config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False) - ) - - # Instantiate the tokenizer and processor. - from verl.utils import hf_processor, hf_tokenizer - - trust_remote_code = config.data.get("trust_remote_code", False) - tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) - # Used for multimodal LLM, could be None - processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True) - actor_rollout_cls, ray_worker_group_cls = self.add_actor_rollout_worker(config) self.add_critic_worker(config) @@ -247,6 +235,27 @@ def run(self, config): # Add a reference policy worker if KL loss or KL reward is used. self.add_ref_policy_worker(config, actor_rollout_cls) + # validate config + validate_config( + config=config, + use_reference_policy=need_reference_policy(self.role_worker_mapping), + use_critic=need_critic(config), + ) + + # Download the checkpoint from HDFS to the local machine. + # `use_shm` determines whether to use shared memory, which could lead to faster model loading if turned on + local_path = copy_to_local( + config.actor_rollout_ref.model.path, use_shm=config.actor_rollout_ref.model.get("use_shm", False) + ) + + # Instantiate the tokenizer and processor. + from verl.utils import hf_processor, hf_tokenizer + + trust_remote_code = config.data.get("trust_remote_code", False) + tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code) + # Used for multimodal LLM, could be None + processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True) + # Load the reward manager for training and validation. 
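Aside: with this change every entry point touched by the patch (verl/trainer/main_ppo.py and the one_step_off_policy, prime, spin, and sppo recipes) follows the same order: build the role-worker mapping, validate the config, and only then pay for checkpoint download and tokenizer construction. A hypothetical condensation of that shared flow; prepare_run is not a verl API, while the other names are used exactly as in the hunks above:

from verl.trainer.ppo.utils import need_critic, need_reference_policy
from verl.utils import hf_processor, hf_tokenizer
from verl.utils.config import validate_config
from verl.utils.fs import copy_to_local


def prepare_run(config, role_worker_mapping):
    """Sketch of the shared entry-point flow; not part of verl itself."""
    # Fail fast on inconsistent configs before any expensive I/O.
    validate_config(
        config=config,
        use_reference_policy=need_reference_policy(role_worker_mapping),
        use_critic=need_critic(config),
    )
    # Only after validation: download the checkpoint and build tokenizer/processor.
    local_path = copy_to_local(
        config.actor_rollout_ref.model.path,
        use_shm=config.actor_rollout_ref.model.get("use_shm", False),
    )
    trust_remote_code = config.data.get("trust_remote_code", False)
    tokenizer = hf_tokenizer(local_path, trust_remote_code=trust_remote_code)
    processor = hf_processor(local_path, trust_remote_code=trust_remote_code, use_fast=True)
    return local_path, tokenizer, processor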
reward_fn = load_reward_manager( config, tokenizer, num_examine=0, **config.reward_model.get("reward_kwargs", {}) diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 5e6e48115e5..d2508a1259c 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -21,11 +21,9 @@ import json import os import uuid -import warnings from collections import defaultdict from copy import deepcopy from dataclasses import dataclass, field -from enum import Enum from pprint import pprint from typing import Optional @@ -40,7 +38,6 @@ from verl import DataProto from verl.experimental.dataset.sampler import AbstractCurriculumSampler from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto -from verl.single_controller.base import Worker from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup from verl.single_controller.ray.base import create_colocated_worker_cls from verl.trainer.config import AlgoConfig @@ -53,6 +50,7 @@ process_validation_metrics, ) from verl.trainer.ppo.reward import compute_reward, compute_reward_async +from verl.trainer.ppo.utils import Role, WorkerType, need_critic, need_reference_policy, need_reward_model from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path, should_save_ckpt_esi from verl.utils.config import omega_conf_to_dataclass from verl.utils.debug import marked_timer @@ -62,22 +60,6 @@ from verl.utils.torch_functional import masked_mean from verl.utils.tracking import ValidationGenerationsLogger -WorkerType = type[Worker] - - -class Role(Enum): - """ - To create more roles dynamically, you can subclass Role and add new members - """ - - Actor = 0 - Rollout = 1 - ActorRollout = 2 - Critic = 3 - RefPolicy = 4 - RewardModel = 5 - ActorRolloutRef = 6 - @dataclass class ResourcePoolManager: @@ -352,8 +334,9 @@ def __init__( self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager - self.use_reference_policy = Role.RefPolicy in role_worker_mapping - self.use_rm = Role.RewardModel in role_worker_mapping + self.use_reference_policy = need_reference_policy(self.role_worker_mapping) + self.use_rm = need_reward_model(self.role_worker_mapping) + self.use_critic = need_critic(self.config) self.ray_worker_group_cls = ray_worker_group_cls self.device_name = device_name if device_name else self.config.trainer.device self.validation_generations_logger = ValidationGenerationsLogger( @@ -369,138 +352,8 @@ def __init__( if self.config.algorithm.use_kl_in_reward: self.kl_ctrl_in_reward = core_algos.get_kl_controller(self.config.algorithm.kl_ctrl) - if config.critic.enable is not None: - self.use_critic = bool(config.critic.enable) - elif self.config.algorithm.adv_estimator == AdvantageEstimator.GAE: - self.use_critic = True - else: - warnings.warn( - "Disabled critic as algorithm.adv_estimator != gae. 
" - "If it is not intended, please set critic.enable=True", - stacklevel=2, - ) - self.use_critic = False - - self._validate_config() self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) - def _validate_config(self): - config = self.config - # number of GPUs total - n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes - - if not config.actor_rollout_ref.actor.use_dynamic_bsz: - if config.actor_rollout_ref.actor.strategy == "megatron": - model_parallel_size = ( - config.actor_rollout_ref.actor.megatron.tensor_model_parallel_size - * config.actor_rollout_ref.actor.megatron.pipeline_model_parallel_size - ) - assert ( - n_gpus % (model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size) == 0 - ), ( - f"n_gpus ({n_gpus}) must be divisible by model_parallel_size ({model_parallel_size}) times " - f"context_parallel_size ({config.actor_rollout_ref.actor.megatron.context_parallel_size})" - ) - megatron_dp = n_gpus // ( - model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size - ) - minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu - else: - minimal_bsz = n_gpus - - # 1. Check total batch size for data correctness - real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n - assert real_train_batch_size % minimal_bsz == 0, ( - f"real_train_batch_size ({real_train_batch_size}) must be divisible by minimal possible batch size " - f"({minimal_bsz})" - ) - - # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu" - # We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu". - def check_mutually_exclusive(mbs, mbs_per_gpu, name: str): - """Validate mutually exclusive micro batch size configuration options. - - Ensures that users don't set both deprecated micro_batch_size and - the new micro_batch_size_per_gpu parameters simultaneously. - - Args: - mbs: Deprecated micro batch size parameter value. - mbs_per_gpu: New micro batch size per GPU parameter value. - name (str): Configuration section name for error messages. - - Raises: - ValueError: If both parameters are set or neither is set. - """ - settings = { - "reward_model": "micro_batch_size", - "actor_rollout_ref.ref": "log_prob_micro_batch_size", - "actor_rollout_ref.rollout": "log_prob_micro_batch_size", - } - - if name in settings: - param = settings[name] - param_per_gpu = f"{param}_per_gpu" - - if mbs is None and mbs_per_gpu is None: - raise ValueError( - f"[{name}] Please set at least one of '{name}.{param}' or '{name}.{param_per_gpu}'." - ) - - if mbs is not None and mbs_per_gpu is not None: - raise ValueError( - f"[{name}] You have set both '{name}.{param}' AND '{name}.{param_per_gpu}'. Please remove " - f"'{name}.{param}' because only '*_{param_per_gpu}' is supported (the former is deprecated)." - ) - - # Actor validation done in ActorConfig.__post_init__ and validate() - actor_config = omega_conf_to_dataclass(config.actor_rollout_ref.actor) - actor_config.validate(n_gpus, config.data.train_batch_size, config.actor_rollout_ref.model) - - if not config.actor_rollout_ref.actor.use_dynamic_bsz: - if self.use_reference_policy: - # reference: log_prob_micro_batch_size vs. 
log_prob_micro_batch_size_per_gpu - check_mutually_exclusive( - config.actor_rollout_ref.ref.log_prob_micro_batch_size, - config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu, - "actor_rollout_ref.ref", - ) - - # The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu - check_mutually_exclusive( - config.actor_rollout_ref.rollout.log_prob_micro_batch_size, - config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu, - "actor_rollout_ref.rollout", - ) - - # Check for reward model micro-batch size conflicts - if config.reward_model.enable and not config.reward_model.use_dynamic_bsz: - check_mutually_exclusive( - config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model" - ) - - if self.config.algorithm.use_kl_in_reward and config.actor_rollout_ref.actor.use_kl_loss: - print("NOTICE: You have both enabled in-reward kl and kl loss.") - - # critic - if self.use_critic: - critic_config = omega_conf_to_dataclass(config.critic) - critic_config.validate(n_gpus, config.data.train_batch_size) - - if config.data.get("val_batch_size", None) is not None: - print( - "WARNING: val_batch_size is deprecated." - + " Validation datasets are sent to inference engines as a whole batch," - + " which will schedule the memory themselves." - ) - - # check eval config - if config.actor_rollout_ref.rollout.val_kwargs.do_sample: - assert config.actor_rollout_ref.rollout.temperature > 0, ( - "validation gen temperature should be greater than 0 when enabling do_sample" - ) - - print("[validate_config] All configuration checks passed successfully!") - def _create_dataloader(self, train_dataset, val_dataset, collate_fn, train_sampler: Optional[Sampler]): """ Creates the train and validation dataloaders. diff --git a/verl/trainer/ppo/utils.py b/verl/trainer/ppo/utils.py new file mode 100644 index 00000000000..22d00a45052 --- /dev/null +++ b/verl/trainer/ppo/utils.py @@ -0,0 +1,65 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
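+#
+# This module hosts the Role enum and the need_reference_policy / need_reward_model /
+# need_critic predicates moved out of ray_trainer.py, so that main_ppo.py can validate
+# a config against the chosen roles before the trainer is constructed.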
+ +import warnings +from enum import Enum + +from omegaconf import DictConfig + +from verl.single_controller.base import Worker +from verl.trainer.ppo.core_algos import AdvantageEstimator + +WorkerType = type[Worker] + + +class Role(Enum): + """ + To create more roles dynamically, you can subclass Role and add new members + """ + + Actor = 0 + Rollout = 1 + ActorRollout = 2 + Critic = 3 + RefPolicy = 4 + RewardModel = 5 + ActorRolloutRef = 6 + + +def need_reference_policy( + role_worker_mapping: dict[Role, WorkerType], +) -> bool: + """Given a role worker mapping, do we need ref policy.""" + return Role.RefPolicy in role_worker_mapping + + +def need_reward_model( + role_worker_mapping: dict[Role, WorkerType], +) -> bool: + """Given a role worker mapping, do we need reward model.""" + return Role.RewardModel in role_worker_mapping + + +def need_critic(config: DictConfig) -> bool: + """Given a config, do we need critic.""" + if config.critic.enable is not None: + return bool(config.critic.enable) + elif config.algorithm.adv_estimator == AdvantageEstimator.GAE: + return True + else: + warnings.warn( + "Disabled critic as algorithm.adv_estimator != gae. If it is not intended, please set critic.enable=True", + stacklevel=2, + ) + return False diff --git a/verl/utils/config.py b/verl/utils/config.py index fabed0b2526..fa3630c654c 100644 --- a/verl/utils/config.py +++ b/verl/utils/config.py @@ -17,7 +17,7 @@ from omegaconf import DictConfig, ListConfig, OmegaConf -__all__ = ["omega_conf_to_dataclass"] +__all__ = ["omega_conf_to_dataclass", "validate_config"] def omega_conf_to_dataclass(config: DictConfig | dict, dataclass_type: Optional[type[Any]] = None) -> Any: @@ -69,3 +69,129 @@ def update_dict_with_config(dictionary: dict, config: DictConfig): for key in dictionary: if hasattr(config, key): dictionary[key] = getattr(config, key) + + +def validate_config( + config: DictConfig, + use_reference_policy: bool, + use_critic: bool, +) -> None: + """Validate an OmegaConf DictConfig. + + Args: + config (DictConfig): The OmegaConf DictConfig to validate. + use_reference_policy (bool): is ref policy needed + use_critic (bool): is critic needed + """ + # number of GPUs total + n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes + + if not config.actor_rollout_ref.actor.use_dynamic_bsz: + if config.actor_rollout_ref.actor.strategy == "megatron": + model_parallel_size = ( + config.actor_rollout_ref.actor.megatron.tensor_model_parallel_size + * config.actor_rollout_ref.actor.megatron.pipeline_model_parallel_size + ) + assert ( + n_gpus % (model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size) == 0 + ), ( + f"n_gpus ({n_gpus}) must be divisible by model_parallel_size ({model_parallel_size}) times " + f"context_parallel_size ({config.actor_rollout_ref.actor.megatron.context_parallel_size})" + ) + megatron_dp = n_gpus // ( + model_parallel_size * config.actor_rollout_ref.actor.megatron.context_parallel_size + ) + minimal_bsz = megatron_dp * config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu + else: + minimal_bsz = n_gpus + + # 1. 
Check total batch size for data correctness + real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n + assert real_train_batch_size % minimal_bsz == 0, ( + f"real_train_batch_size ({real_train_batch_size}) must be divisible by minimal possible batch size " + f"({minimal_bsz})" + ) + + # A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu" + # We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu". + def check_mutually_exclusive(mbs, mbs_per_gpu, name: str): + """Validate mutually exclusive micro batch size configuration options. + + Ensures that users don't set both deprecated micro_batch_size and + the new micro_batch_size_per_gpu parameters simultaneously. + + Args: + mbs: Deprecated micro batch size parameter value. + mbs_per_gpu: New micro batch size per GPU parameter value. + name (str): Configuration section name for error messages. + + Raises: + ValueError: If both parameters are set or neither is set. + """ + settings = { + "reward_model": "micro_batch_size", + "actor_rollout_ref.ref": "log_prob_micro_batch_size", + "actor_rollout_ref.rollout": "log_prob_micro_batch_size", + } + + if name in settings: + param = settings[name] + param_per_gpu = f"{param}_per_gpu" + + if mbs is None and mbs_per_gpu is None: + raise ValueError(f"[{name}] Please set at least one of '{name}.{param}' or '{name}.{param_per_gpu}'.") + + if mbs is not None and mbs_per_gpu is not None: + raise ValueError( + f"[{name}] You have set both '{name}.{param}' AND '{name}.{param_per_gpu}'. Please remove " + f"'{name}.{param}' because only '*_{param_per_gpu}' is supported (the former is deprecated)." + ) + + # Actor validation done in ActorConfig.__post_init__ and validate() + actor_config = omega_conf_to_dataclass(config.actor_rollout_ref.actor) + actor_config.validate(n_gpus, config.data.train_batch_size, config.actor_rollout_ref.model) + + if not config.actor_rollout_ref.actor.use_dynamic_bsz: + if use_reference_policy: + # reference: log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu + check_mutually_exclusive( + config.actor_rollout_ref.ref.log_prob_micro_batch_size, + config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu, + "actor_rollout_ref.ref", + ) + + # The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu + check_mutually_exclusive( + config.actor_rollout_ref.rollout.log_prob_micro_batch_size, + config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu, + "actor_rollout_ref.rollout", + ) + + # Check for reward model micro-batch size conflicts + if config.reward_model.enable and not config.reward_model.use_dynamic_bsz: + check_mutually_exclusive( + config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model" + ) + + if config.algorithm.use_kl_in_reward and config.actor_rollout_ref.actor.use_kl_loss: + print("NOTICE: You have both enabled in-reward kl and kl loss.") + + # critic + if use_critic: + critic_config = omega_conf_to_dataclass(config.critic) + critic_config.validate(n_gpus, config.data.train_batch_size) + + if config.data.get("val_batch_size", None) is not None: + print( + "WARNING: val_batch_size is deprecated." + + " Validation datasets are sent to inference engines as a whole batch," + + " which will schedule the memory themselves." 
+ ) + + # check eval config + if config.actor_rollout_ref.rollout.val_kwargs.do_sample: + assert config.actor_rollout_ref.rollout.temperature > 0, ( + "validation gen temperature should be greater than 0 when enabling do_sample" + ) + + print("[validate_config] All configuration checks passed successfully!") diff --git a/verl/utils/dataset/multiturn_sft_dataset.py b/verl/utils/dataset/multiturn_sft_dataset.py index cd58c984359..0d38fdd850d 100644 --- a/verl/utils/dataset/multiturn_sft_dataset.py +++ b/verl/utils/dataset/multiturn_sft_dataset.py @@ -22,6 +22,7 @@ import numpy as np import pandas as pd import torch +from omegaconf import ListConfig from torch.utils.data import Dataset from transformers import PreTrainedTokenizer @@ -60,7 +61,7 @@ def __init__(self, parquet_files: str | list[str], tokenizer, config=None): self.apply_chat_template_kwargs = config.get("apply_chat_template_kwargs", {}) assert self.truncation in ["error", "left", "right"] - if not isinstance(parquet_files, list): + if not isinstance(parquet_files, list | ListConfig): parquet_files = [parquet_files] self.parquet_files = parquet_files diff --git a/verl/utils/profiler/mstx_profile.py b/verl/utils/profiler/mstx_profile.py index 33caedce412..b9576714248 100644 --- a/verl/utils/profiler/mstx_profile.py +++ b/verl/utils/profiler/mstx_profile.py @@ -214,8 +214,7 @@ def stop(self): self.profile_npu.stop() NPUProfiler._define_count -= 1 - @staticmethod - def annotate(message: Optional[str] = None, role: Optional[str] = None, **kwargs) -> Callable: + def annotate(self, message: Optional[str] = None, role: Optional[str] = None, **kwargs_outer) -> Callable: """Decorate a Worker member function to profile the current rank in the current training step. Requires the target function to be a member function of a Worker, @@ -230,32 +229,32 @@ def annotate(message: Optional[str] = None, role: Optional[str] = None, **kwargs def decorator(func): @functools.wraps(func) - def wrapper(self, *args, **kwargs): - if not self.profiler.enable: - return func(self, *args, **kwargs) + def wrapper(*args, **kwargs_inner): + if not self.enable: + return func(*args, **kwargs_inner) profile_name = message or func.__name__ - discrete_mode = self.profiler.discrete - profile_enable = self.profiler.this_step and self.profiler.enable + discrete_mode = self.discrete + profile_enable = self.this_step and self.enable if not profile_enable: - return func(self, *args, **kwargs) + return func(*args, **kwargs_inner) if profile_enable: if not discrete_mode: mark_range = mark_start_range(message=profile_name) else: profile_npu = get_npu_profiler( - contents=self.profiler.profile_contents, - profile_level=self.profiler.profile_level, - profile_save_path=self.profiler.profile_save_path, - analysis=self.profiler.analysis, + contents=self.profile_contents, + profile_level=self.profile_level, + profile_save_path=self.profile_save_path, + analysis=self.analysis, role=role, ) profile_npu.start() mark_range = mark_start_range(message=profile_name) - result = func(self, *args, **kwargs) + result = func(*args, **kwargs_inner) if profile_enable: if not discrete_mode: diff --git a/verl/utils/profiler/nvtx_profile.py b/verl/utils/profiler/nvtx_profile.py index b92e588032f..35857498c03 100644 --- a/verl/utils/profiler/nvtx_profile.py +++ b/verl/utils/profiler/nvtx_profile.py @@ -149,13 +149,13 @@ def stop(self): if not self.discrete: torch.cuda.profiler.stop() - @staticmethod def annotate( + self, message: Optional[str] = None, color: Optional[str] = None, domain: Optional[str] 
= None, category: Optional[str] = None, - **kwargs, + **kwargs_outer, ) -> Callable: """Decorate a Worker member function to profile the current rank in the current training step. @@ -175,22 +175,22 @@ def annotate( def decorator(func): @functools.wraps(func) - def wrapper(self, *args, **kwargs): - if not self.profiler.enable: - return func(self, *args, **kwargs) + def wrapper(*args, **kwargs_inner): + if not self.enable: + return func(*args, **kwargs_inner) profile_name = message or func.__name__ - if self.profiler.this_step: - if self.profiler.discrete: + if self.this_step: + if self.discrete: torch.cuda.profiler.start() mark_range = mark_start_range(message=profile_name, color=color, domain=domain, category=category) - result = func(self, *args, **kwargs) + result = func(*args, **kwargs_inner) - if self.profiler.this_step: + if self.this_step: mark_end_range(mark_range) - if self.profiler.discrete: + if self.discrete: torch.cuda.profiler.stop() return result diff --git a/verl/utils/profiler/profile.py b/verl/utils/profiler/profile.py index 92baf5ec978..7c9d2fe15ce 100644 --- a/verl/utils/profiler/profile.py +++ b/verl/utils/profiler/profile.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import functools import os from typing import Callable, Optional @@ -226,16 +227,35 @@ def start(self, **kwargs): def stop(self): return getattr(self._impl, "stop", lambda: None)() - @staticmethod + @classmethod def annotate( + cls, message: Optional[str] = None, color: Optional[str] = None, domain: Optional[str] = None, category: Optional[str] = None, - **kwargs, + **kwargs_outer, ) -> Callable: def decorator(func): - return func + @functools.wraps(func) + def wrapper(self_instance, *args, **kwargs_inner): + profiler = getattr(self_instance, "profiler", None) + if not profiler: + return func(self_instance, *args, **kwargs_inner) + + impl = profiler._impl + if hasattr(impl, "annotate"): + try: + actual_decorator = impl.annotate( + message=message, color=color, domain=domain, category=category, **kwargs_outer + ) + + return actual_decorator(func)(self_instance, *args, **kwargs_inner) + except Exception: + return func(self_instance, *args, **kwargs_inner) + return func(self_instance, *args, **kwargs_inner) + + return wrapper return decorator diff --git a/verl/workers/actor/megatron_actor.py b/verl/workers/actor/megatron_actor.py index 40e823a54f5..7f85a8955af 100644 --- a/verl/workers/actor/megatron_actor.py +++ b/verl/workers/actor/megatron_actor.py @@ -28,12 +28,10 @@ import torch import torch.distributed from megatron.core import parallel_state as mpu -from megatron.core.distributed import finalize_model_grads # from megatron.core.optimizer import DistributedOptimizer from megatron.core.optimizer import DistributedOptimizer from megatron.core.pipeline_parallel import get_forward_backward_func -from omegaconf import OmegaConf from torch import nn from verl import DataProto @@ -136,23 +134,9 @@ def __init__( for model in self.actor_module: patch_fused_forward(model) - self.optimizer_step_args = OmegaConf.create( - { - "skip_grad": None, - "overlap_dp_param_comm": False, - "overlap_dp_grad_comm": False, - "gradient_accumulation_steps": 1, - "sequence_parallel": self.tf_config.sequence_parallel, - "DDP_impl": "local", - "layernorm_allreduce_bucket_threshold": 0, - "pipeline_model_parallel_split_rank": None, - "reduce_grads_use_alltoall": False, - } - ) - config = get_model_config(self.actor_module[0]) - print(config) - 
config.finalize_model_grads_func = finalize_model_grads + if torch.distributed.get_rank() == 0: + print(config) def _validate_config(self, config) -> None: """Validate config options not implemented for Megatron backend""" @@ -194,85 +178,73 @@ def compute_log_prob(self, data: DataProto, calculate_entropy=False) -> torch.Te "micro batch size is needed for forward compute when use_dynamic_bsz is False" ) - def compute_logprobs_fn(output, data, use_dynamic_bsz=False, indices=None): - response = data["responses"] - response_length = response.size(1) - log_probs = output["log_probs"][:, -response_length - 1 : -1].contiguous() - return {"log_probs": log_probs} - # We make recompute_old_log_prob by default here. # TODO (zhangchi.usc1992): actually, this function should only return log_prob and this logic should be # handled by user outside - recompute_old_log_prob = self.config.get("recompute_old_log_prob", True) - entropys = torch.Tensor() - if recompute_old_log_prob: - select_keys = ["responses", "input_ids", "attention_mask", "position_ids"] - batch = data.select(batch_keys=select_keys).batch - input_ids = batch["input_ids"] - batch_size = input_ids.size(0) - response = batch["responses"] - response_length = response.size(1) - with torch.no_grad(): - output = self.forward_backward_batch( - data, - forward_only=True, - post_process_fn=compute_logprobs_fn, - calculate_entropy=calculate_entropy, - use_dynamic_bsz=use_dynamic_bsz, - micro_batch_size=micro_batch_size, - max_token_len=max_token_len, - ) - if mpu.is_pipeline_last_stage(ignore_virtual=True): - # only on last rank. It should be on every tp rank + + select_keys = ["responses", "input_ids", "attention_mask", "position_ids"] + batch = data.select(batch_keys=select_keys).batch + input_ids = batch["input_ids"] + batch_size = input_ids.size(0) + response = batch["responses"] + response_length = response.size(1) + with torch.no_grad(): + output = self.forward_backward_batch( + data, + forward_only=True, + calculate_entropy=calculate_entropy, + use_dynamic_bsz=use_dynamic_bsz, + micro_batch_size=micro_batch_size, + max_token_len=max_token_len, + ) + if mpu.is_pipeline_last_stage(ignore_virtual=True): + # only on last rank. It should be on every tp rank + log_probs = [o["log_probs"] for o in output["output"]] # (bs, seq_size) + log_probs = torch.cat(log_probs, dim=0).to(torch.float32) + + if calculate_entropy: + entropys = torch.cat([o["entropy"] for o in output["output"]], dim=0) + entropys = entropys.to(torch.float32) + + if use_dynamic_bsz: + indices = output["indices"] + indices = list(itertools.chain.from_iterable(indices)) + assert len(indices) == log_probs.size(0), f"{len(indices)} vs. {log_probs.size()}" + revert_indices = torch.tensor(get_reverse_idx(indices), dtype=torch.long) + log_probs = log_probs[revert_indices] if calculate_entropy: - log_probs = [o[0]["log_probs"] for o in output["output"]] # (bs, seq_size) - else: - log_probs = [o["log_probs"] for o in output["output"]] # (bs, seq_size) - log_probs = torch.cat(log_probs, dim=0).to(torch.float32) - if use_dynamic_bsz: - indices = output["indices"] - indices = list(itertools.chain.from_iterable(indices)) - assert len(indices) == log_probs.size(0), f"{len(indices)} vs. {log_probs.size()}" - revert_indices = torch.tensor(get_reverse_idx(indices), dtype=torch.long) - log_probs = log_probs[revert_indices] - else: - log_probs = torch.empty( + assert len(indices) == entropys.size(0), f"{len(indices)} vs. 
{entropys.size()}" + entropys = entropys[revert_indices] + else: + # other pp ranks + log_probs = torch.empty( + size=(batch_size, response_length), dtype=torch.float32, device=input_ids.device + ) + if calculate_entropy: + entropys = torch.empty( size=(batch_size, response_length), dtype=torch.float32, device=input_ids.device ) - log_probs = log_probs.to(get_device_id()) - # broadcast across pp ranks + + log_probs = log_probs.to(get_device_id()) + # broadcast across pp ranks + torch.distributed.broadcast( + tensor=log_probs, + src=mpu.get_pipeline_model_parallel_last_rank(), + group=mpu.get_pipeline_model_parallel_group(), + async_op=False, + ) + log_probs = log_probs.to("cpu") + + if calculate_entropy: + entropys = entropys.to(get_device_id()) torch.distributed.broadcast( - tensor=log_probs, + tensor=entropys, src=mpu.get_pipeline_model_parallel_last_rank(), group=mpu.get_pipeline_model_parallel_group(), async_op=False, ) - log_probs = log_probs.to("cpu") - if calculate_entropy: - # Note that o[0] is metrics, o[1] is entropy - if mpu.is_pipeline_last_stage(ignore_virtual=True): - entropys = torch.cat([o[1] for o in output["output"]], dim=0) - entropys = entropys.to(torch.float32) - if use_dynamic_bsz: - indices = output["indices"] - indices = list(itertools.chain.from_iterable(indices)) - assert len(indices) == entropys.size(0), f"{len(indices)} vs. {entropys.size()}" - revert_indices = torch.tensor(get_reverse_idx(indices), dtype=torch.long) - entropys = entropys[revert_indices] - else: - entropys = torch.empty( - size=(batch_size, response_length), dtype=torch.float32, device=input_ids.device - ) - # broadcast across pp ranks - entropys = entropys.to(get_device_id()) - torch.distributed.broadcast( - tensor=entropys, - src=mpu.get_pipeline_model_parallel_last_rank(), - group=mpu.get_pipeline_model_parallel_group(), - async_op=False, - ) - entropys = entropys.to("cpu") + entropys = entropys.to("cpu") # add empty cache after each compute get_torch_device().empty_cache() @@ -328,16 +300,68 @@ def make_minibatch_iterator(self, data: DataProto) -> Iterable[DataProto]: dataloader_kwargs={"shuffle": self.config.shuffle}, ) + def compute_ppo_loss(self, model_output, data): + log_prob = model_output["log_probs"] + entropy = model_output.get("entropy", None) + + metrics = {} + + response_mask = data["response_mask"].to(bool) + # compute policy loss + old_log_prob = data["old_log_probs"] + advantages = data["advantages"] + + loss_agg_mode = self.config.loss_agg_mode + + loss_mode = self.config.policy_loss.get("loss_mode", "vanilla") + + policy_loss_fn = get_policy_loss_fn(loss_mode) + pg_loss, pg_clipfrac, ppo_kl, pg_clipfrac_lower = policy_loss_fn( + old_log_prob=old_log_prob, + log_prob=log_prob, + advantages=advantages, + response_mask=response_mask, + loss_agg_mode=loss_agg_mode, + config=self.config, + ) + + metrics.update( + { + "actor/pg_loss": pg_loss.detach().item(), + "actor/pg_clipfrac": pg_clipfrac.detach().item(), + "actor/ppo_kl": ppo_kl.detach().item(), + "actor/pg_clipfrac_lower": pg_clipfrac_lower.detach().item(), + } + ) + policy_loss = pg_loss + + # add entropy loss + if entropy is not None: + entropy_loss = agg_loss(loss_mat=entropy, loss_mask=response_mask, loss_agg_mode=loss_agg_mode) + entropy_coeff = self.config.entropy_coeff + policy_loss -= entropy_coeff * entropy_loss + + # add kl loss + if self.config.use_kl_loss: + ref_log_prob = data["ref_log_prob"] + # compute kl loss + kld = kl_penalty(logprob=log_prob, ref_logprob=ref_log_prob, kl_penalty=self.config.kl_loss_type) 
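+ # reduce the per-token KL (kld) to a scalar with the same loss_agg_mode used for the policy loss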
+ kl_loss = agg_loss(loss_mat=kld, loss_mask=response_mask, loss_agg_mode=self.config.loss_agg_mode) + + policy_loss += kl_loss * self.config.kl_loss_coef + metrics["actor/kl_loss"] = kl_loss.detach().item() + metrics["actor/kl_coef"] = self.config.kl_loss_coef + + return policy_loss, metrics + def forward_backward_batch( self, data: DataProto, forward_only=False, - post_process_fn=None, calculate_entropy=False, use_dynamic_bsz=False, micro_batch_size=None, max_token_len=None, - mini_batch_size=None, ): """ We assume: @@ -387,98 +411,40 @@ def forward_backward_batch( ) else: micro_batches, indices = rearrange_micro_batches(batch=mini_batch.batch, max_token_len=max_token_len) - total_seqlen = max_token_len else: assert micro_batch_size is not None, ( "micro_batch_size is needed to be passed in when not using dynamic batch size" ) micro_batches = mini_batch.batch.split(micro_batch_size) - seq_len = micro_batches[0]["input_ids"].shape[1] - total_seqlen = micro_batch_size * seq_len # compute input shapes for pp stages n_micro_batch = len(micro_batches) forward_backward_func = get_forward_backward_func() - def loss_func(output, data, meta_info): + def loss_func(output, data): # For memory efficiency # We move calculation of entropy to compute_log_probs, forward_only == True device = output["log_probs"].device - metrics = {} - if forward_only: - if post_process_fn is None: - pass - # metrics["logits"] = output - else: - stats = post_process_fn(output, data) - metrics.update(stats) - if not calculate_entropy: - return torch.tensor(1.0, device=device), metrics responses = data["responses"] response_length = responses.size(1) - response_mask = data["response_mask"].to(bool) - loss_agg_mode = self.config.loss_agg_mode - # compute policy loss log_prob = output["log_probs"][:, -response_length - 1 : -1].contiguous() - ret_entropy = None - stats = {} - if not forward_only: - old_log_prob = data["old_log_probs"] - advantages = data["advantages"] - - entropy_coeff = self.config.entropy_coeff - loss_agg_mode = self.config.loss_agg_mode - - loss_mode = self.config.policy_loss.get("loss_mode", "vanilla") - - policy_loss_fn = get_policy_loss_fn(loss_mode) - pg_loss, pg_clipfrac, ppo_kl, pg_clipfrac_lower = policy_loss_fn( - old_log_prob=old_log_prob, - log_prob=log_prob, - advantages=advantages, - response_mask=response_mask, - loss_agg_mode=loss_agg_mode, - config=self.config, - ) - - stats.update( - { - "actor/pg_loss": pg_loss.detach().item(), - "actor/pg_clipfrac": pg_clipfrac.detach().item(), - "actor/ppo_kl": ppo_kl.detach().item(), - "actor/pg_clipfrac_lower": pg_clipfrac_lower.detach().item(), - } - ) - policy_loss = pg_loss - + model_output = {"log_probs": log_prob} if calculate_entropy: entropy = output["entropy"][:, -response_length - 1 : -1].contiguous() - if not forward_only: - entropy_loss = agg_loss(loss_mat=entropy, loss_mask=response_mask, loss_agg_mode=loss_agg_mode) - entropy_coeff = meta_info["entropy_coeff"] - policy_loss = pg_loss - entropy_coeff * entropy_loss - else: - ret_entropy = entropy + model_output["entropy"] = entropy if forward_only: - policy_loss = torch.tensor(1.0, device=device) - else: - if self.config.use_kl_loss: - ref_log_prob = data["ref_log_prob"] - # compute kl loss - kld = kl_penalty(logprob=log_prob, ref_logprob=ref_log_prob, kl_penalty=self.config.kl_loss_type) - kl_loss = agg_loss(loss_mat=kld, loss_mask=response_mask, loss_agg_mode=self.config.loss_agg_mode) + # for inference + return torch.tensor(1.0, device=device), model_output - policy_loss = policy_loss + 
kl_loss * self.config.kl_loss_coef - metrics["actor/kl_loss"] = kl_loss.detach().item() - metrics["actor/kl_coef"] = self.config.kl_loss_coef + # for training + # note that this loss function can be swapped with other loss functions such as SFT + policy_loss, metrics = self.compute_ppo_loss(model_output, data) - # return loss and stats - - append_to_dict(metrics, stats) - return policy_loss, [metrics, ret_entropy] + # return loss and stats + return policy_loss, metrics def forward_step(batch_iter, model): batch = next(batch_iter) @@ -531,11 +497,12 @@ def logits_processor(logits, label, label_mask): ret = {} if calculate_entropy: logits_bak = logits.clone() - logger.warning_once( - "For memory-efficient computation, enable fused kernels via " - "`actor_rollout_ref.model.use_fused_kernels=True`. " - "The current `clone()` operation ensures correctness but increases memory usage." - ) + if torch.distributed.get_rank() == 0: + logger.warning_once( + "For memory-efficient computation, enable fused kernels via " + "`actor_rollout_ref.model.use_fused_kernels=True`. " + "The current `clone()` operation ensures correctness but increases memory usage." + ) entropy = vocab_parallel_entropy(logits) ret["entropy"] = entropy else: @@ -557,42 +524,22 @@ def logits_processor(logits, label, label_mask): logits_processor_args=logits_processor_args, ) - if forward_only: - meta_info = None - else: - clip_ratio_c = self.config.get("clip_ratio_c", 3.0) - meta_info = { - "clip_ratio": self.config.clip_ratio, - "entropy_coeff": self.config.entropy_coeff, - "clip_ratio_c": clip_ratio_c, - } - return output, partial(loss_func, data=batch, meta_info=meta_info) + return output, partial(loss_func, data=batch) # batch should be a list of batches inside micro-batches batch_generator = make_batch_generator(micro_batches, vpp_size=len(self.actor_module)) # TODO: we may use the new schedule instead # for flash-attn: (seq_len, batch_size, hidden_size) = (mbs*seq_len, 1, hidden_size) - if mpu.get_pipeline_model_parallel_world_size() > 1: - losses_reduced = forward_backward_func( - forward_step_func=forward_step, - data_iterator=batch_generator, - model=self.actor_module, - num_microbatches=n_micro_batch, - seq_length=total_seqlen, # no use when input_shapes was set - micro_batch_size=1, # no use when input_shapes was set - forward_only=forward_only, - ) - else: - losses_reduced = forward_backward_func( - forward_step_func=forward_step, - data_iterator=batch_generator, - model=self.actor_module, - num_microbatches=n_micro_batch, - seq_length=total_seqlen, # in use for pp = 1 - micro_batch_size=1, # in use for pp = 1 - forward_only=forward_only, - ) + losses_reduced = forward_backward_func( + forward_step_func=forward_step, + data_iterator=batch_generator, + model=self.actor_module, + num_microbatches=n_micro_batch, + seq_length=1, # the communication shape is obtained via p2p comm + micro_batch_size=1, # the communication shape is obtained via p2p comm + forward_only=forward_only, + ) # loss_reduces contains the stats returned from loss_func if self.has_multi_modal_inputs: @@ -642,12 +589,11 @@ def update_policy(self, dataloader: Iterable[DataProto]) -> dict: use_dynamic_bsz=self.config.use_dynamic_bsz, micro_batch_size=micro_batch_size, max_token_len=max_token_len, - mini_batch_size=self.config.ppo_mini_batch_size, ) metric_micro_batch = metric_micro_batch["output"] for metric in metric_micro_batch: # Note that o[0] is metrics, o[1] is entropy, o[2] is response_mask - append_to_dict(metrics, metric[0]) # append the metric 
from this micro-batch to global metrics. + append_to_dict(metrics, metric) # append the metric from this micro-batch to global metrics. update_successful, grad_norm, num_zeros_in_grad = self.actor_optimizer.step() data = {"actor/grad_norm": grad_norm} diff --git a/verl/workers/config/model.py b/verl/workers/config/model.py index e6bd4120b07..06466977365 100644 --- a/verl/workers/config/model.py +++ b/verl/workers/config/model.py @@ -37,12 +37,16 @@ class HFModelConfig(BaseConfig): "tokenizer", "processor", "local_path", + "local_hf_config_path", + "local_tokenizer_path", } path: str = MISSING local_path: Optional[str] = None hf_config_path: Optional[str] = None + local_hf_config_path: Optional[str] = None tokenizer_path: Optional[str] = None + local_tokenizer_path: Optional[str] = None hf_config: Any = None generation_config: Any = None @@ -82,17 +86,22 @@ def __post_init__(self): if self.tokenizer_path is None: self.tokenizer_path = self.path - # constuct tokenizer self.local_path = copy_to_local(self.path, use_shm=self.use_shm) - self.tokenizer = hf_tokenizer(self.local_path, trust_remote_code=self.trust_remote_code) - self.processor = hf_processor(self.local_path, trust_remote_code=self.trust_remote_code) - self.generation_config = get_generation_config(self.hf_config_path, trust_remote_code=self.trust_remote_code) + # construct tokenizer + self.local_tokenizer_path = copy_to_local(self.tokenizer_path, use_shm=self.use_shm) + self.tokenizer = hf_tokenizer(self.local_tokenizer_path, trust_remote_code=self.trust_remote_code) + self.processor = hf_processor(self.local_tokenizer_path, trust_remote_code=self.trust_remote_code) + + self.local_hf_config_path = copy_to_local(self.hf_config_path, use_shm=self.use_shm) + self.generation_config = get_generation_config( + self.local_hf_config_path, trust_remote_code=self.trust_remote_code + ) # construct hf_config attn_implementation = self.override_config.get("attn_implementation", "flash_attention_2") self.hf_config = AutoConfig.from_pretrained( - self.hf_config_path, trust_remote_code=self.trust_remote_code, attn_implementation=attn_implementation + self.local_hf_config_path, trust_remote_code=self.trust_remote_code, attn_implementation=attn_implementation ) override_config_kwargs = { diff --git a/verl/workers/fsdp_workers.py b/verl/workers/fsdp_workers.py index df14d910db6..712568f2e8c 100644 --- a/verl/workers/fsdp_workers.py +++ b/verl/workers/fsdp_workers.py @@ -288,6 +288,11 @@ def _build_model_optimizer( actor_model_config = AutoConfig.from_pretrained( local_path, trust_remote_code=trust_remote_code, attn_implementation="flash_attention_2" ) + # TODO: VL models use VisionAttention, which directly uses flash_attention in transformers>=4.53 + # which will be patched by _ulysses_flash_attention_forward, but erroneously misses position_ids + # Maybe support Ulysses in VisionAttention in the future and remove this patch + if self.ulysses_sequence_parallel_size > 1 and hasattr(actor_model_config, "vision_config"): + actor_model_config.vision_config._attn_implementation = "eager" # patch for kimi-vl if getattr(actor_model_config, "model_type", None) == "kimi_vl": @@ -1072,6 +1077,12 @@ def _build_critic_model_optimizer(self, config): attn_implementation="flash_attention_2", trust_remote_code=config.model.get("trust_remote_code", False), ) + # TODO: VL models use VisionAttention, which directly uses flash_attention in transformers>=4.53 + # which will be patched by _ulysses_flash_attention_forward, but erroneously misses position_ids + # Maybe support Ulysses in VisionAttention in the future and remove this patch + if self.ulysses_sequence_parallel_size > 1 and hasattr(critic_model_config, "vision_config"): + critic_model_config.vision_config._attn_implementation = "eager" + + critic_model_config.num_labels = 1 # patch for kimi-vl if getattr(critic_model_config, "model_type", None) == "kimi_vl":
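The four reward-manager hunks below (batch, dapo, naive, prime) add the same early-return path for precomputed RM scores. A minimal sketch of the shared contract they implement — the function name is illustrative, the body mirrors the diff:

```python
# Illustrative consolidation of the early-return logic added to the four
# reward managers below; `data` is a verl DataProto.
def early_return_precomputed_scores(data, return_dict: bool):
    if "rm_scores" not in data.batch.keys():
        return None  # caller falls through to rule-based scoring
    if not return_dict:
        return data.batch["rm_scores"]
    # the RM worker advertises extra non-tensor columns via meta_info
    reward_extra_keys = data.meta_info.get("reward_extra_keys", [])
    reward_extra_info = {key: data.non_tensor_batch[key] for key in reward_extra_keys}
    return {"reward_tensor": data.batch["rm_scores"], "reward_extra_info": reward_extra_info}
```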
diff --git a/verl/workers/reward_manager/batch.py b/verl/workers/reward_manager/batch.py index 989ca14f466..d1a13cefac6 100644 --- a/verl/workers/reward_manager/batch.py +++ b/verl/workers/reward_manager/batch.py @@ -77,7 +77,9 @@ def __call__(self, data: DataProto, return_dict: bool = False) -> torch.Tensor | # If there is rm score, we directly return rm score. Otherwise, we compute via rm_score_fn if "rm_scores" in data.batch.keys(): if return_dict: - return {"reward_tensor": data.batch["rm_scores"]} + reward_extra_keys = data.meta_info.get("reward_extra_keys", []) + reward_extra_info = {key: data.non_tensor_batch[key] for key in reward_extra_keys} + return {"reward_tensor": data.batch["rm_scores"], "reward_extra_info": reward_extra_info} else: return data.batch["rm_scores"] diff --git a/verl/workers/reward_manager/dapo.py b/verl/workers/reward_manager/dapo.py index bb6e0895f40..d8b6b4742ef 100644 --- a/verl/workers/reward_manager/dapo.py +++ b/verl/workers/reward_manager/dapo.py @@ -56,7 +56,9 @@ def __call__(self, data: DataProto, return_dict: bool = False): # If there is rm score, we directly return rm score. Otherwise, we compute via rm_score_fn if "rm_scores" in data.batch.keys(): if return_dict: - return {"reward_tensor": data.batch["rm_scores"]} + reward_extra_keys = data.meta_info.get("reward_extra_keys", []) + reward_extra_info = {key: data.non_tensor_batch[key] for key in reward_extra_keys} + return {"reward_tensor": data.batch["rm_scores"], "reward_extra_info": reward_extra_info} else: return data.batch["rm_scores"] diff --git a/verl/workers/reward_manager/naive.py b/verl/workers/reward_manager/naive.py index f10bbc636ec..d21c423e3da 100644 --- a/verl/workers/reward_manager/naive.py +++ b/verl/workers/reward_manager/naive.py @@ -49,7 +49,9 @@ def __call__(self, data: DataProto, return_dict: bool = False) -> torch.Tensor | # If there is rm score, we directly return rm score. Otherwise, we compute via rm_score_fn if "rm_scores" in data.batch.keys(): if return_dict: - return {"reward_tensor": data.batch["rm_scores"]} + reward_extra_keys = data.meta_info.get("reward_extra_keys", []) + reward_extra_info = {key: data.non_tensor_batch[key] for key in reward_extra_keys} + return {"reward_tensor": data.batch["rm_scores"], "reward_extra_info": reward_extra_info} else: return data.batch["rm_scores"] diff --git a/verl/workers/reward_manager/prime.py b/verl/workers/reward_manager/prime.py index 98c094f2c6c..ab7e5f95e8c 100644 --- a/verl/workers/reward_manager/prime.py +++ b/verl/workers/reward_manager/prime.py @@ -153,7 +153,12 @@ def __call__(self, data: DataProto, return_dict: bool = False) -> torch.Tensor | # If there is rm score, we directly return rm score.
Otherwise, we compute via rm_score_fn if "rm_scores" in data.batch.keys(): - return data.batch["rm_scores"] + if return_dict: + reward_extra_keys = data.meta_info.get("reward_extra_keys", []) + reward_extra_info = {key: data.non_tensor_batch[key] for key in reward_extra_keys} + return {"reward_tensor": data.batch["rm_scores"], "reward_extra_info": reward_extra_info} + else: + return data.batch["rm_scores"] reward_tensor = torch.zeros_like(data.batch["responses"], dtype=torch.float32) diff --git a/verl/workers/rollout/sglang_rollout/sglang_rollout.py b/verl/workers/rollout/sglang_rollout/sglang_rollout.py index 25e0eb222c8..8f11c94051d 100644 --- a/verl/workers/rollout/sglang_rollout/sglang_rollout.py +++ b/verl/workers/rollout/sglang_rollout/sglang_rollout.py @@ -860,7 +860,7 @@ async def _async_rollout_a_request( self._tool_map[tool_call.function.name].execute( _req.request_id, tool_call.function.arguments, - **_req.tools_kwargs[tool_call.function.name].get("execute_kwargs", {}), + **_req.tools_kwargs.get(tool_call.function.name, {}).get("execute_kwargs", {}), ) for tool_call in parsed_tool_calls ] diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 2a50a11a7b7..20bf0bfad41 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -276,7 +276,7 @@ async def init_engine(self): skip_tokenizer_init=False, max_model_len=self.max_model_len, max_num_seqs=config.max_num_seqs, - load_format="auto", + load_format="dummy" if config.load_format.startswith("dummy") else config.load_format, disable_log_stats=config.disable_log_stats, max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=config.enable_chunked_prefill, diff --git a/verl/workers/sharding_manager/megatron_sglang.py b/verl/workers/sharding_manager/megatron_sglang.py index 2014ce9c6a8..fb1ffe078b8 100644 --- a/verl/workers/sharding_manager/megatron_sglang.py +++ b/verl/workers/sharding_manager/megatron_sglang.py @@ -28,7 +28,7 @@ from torch.distributed.device_mesh import DeviceMesh from verl.protocol import DataProto, all_gather_data_proto -from verl.utils.device import get_torch_device +from verl.utils.device import get_torch_device, set_expandable_segments from verl.utils.megatron_utils import ( load_megatron_model_to_gpu, offload_megatron_model_to_cpu, @@ -178,6 +178,9 @@ async def wake_up(self): self.transformer_config, self.layer_name_mapping, ) + + set_expandable_segments(False) + await self.update_weights(per_tensor_param) if self.offload_param: offload_megatron_model_to_cpu(self.actor_module) @@ -199,6 +202,8 @@ async def sleep(self): # add empty cache after each compute aggressive_empty_cache(force_sync=True) + set_expandable_segments(True) + # restore random states if self.device_mesh is not None: self.gen_random_states = get_torch_device().get_rng_state() diff --git a/verl/workers/sharding_manager/megatron_vllm.py b/verl/workers/sharding_manager/megatron_vllm.py index a6ddb065c67..1a1d809be7b 100644 --- a/verl/workers/sharding_manager/megatron_vllm.py +++ b/verl/workers/sharding_manager/megatron_vllm.py @@ -30,7 +30,7 @@ from verl.protocol import all_gather_data_proto from verl.third_party.vllm import LLM, VLLM_SLEEP_LEVEL from verl.third_party.vllm import parallel_state as vllm_ps -from verl.utils.device import get_torch_device +from verl.utils.device import get_torch_device, set_expandable_segments from verl.utils.megatron_utils import 
load_megatron_model_to_gpu, offload_megatron_model_to_cpu, per_tensor_generator from verl.utils.memory_utils import aggressive_empty_cache from verl.utils.profiler import GPUMemoryLogger, log_gpu_memory_usage @@ -149,6 +149,8 @@ def __enter__(self): if self.offload_param: load_megatron_model_to_gpu(self.actor_module, load_grad=False) + set_expandable_segments(False) + if self.rollout_config.free_cache_engine: if "tags" in inspect.signature(self.inference_engine.wake_up).parameters: self.inference_engine.wake_up(tags=["weights"]) @@ -196,6 +198,8 @@ def __exit__(self, exc_type, exc_value, traceback): aggressive_empty_cache(force_sync=True) + set_expandable_segments(True) + # restore random states if self.device_mesh is not None: self.gen_random_states = get_torch_device().get_rng_state()
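Both sharding managers now disable expandable segments while the inference engine owns the GPU and re-enable them for training. set_expandable_segments itself is not shown in this diff; below is a minimal sketch of what such a toggle can look like, assuming PyTorch's private allocator-settings hook — the actual implementation in verl.utils.device may differ:

```python
import torch


def set_expandable_segments(enable: bool) -> None:
    # Toggle the CUDA caching allocator's expandable_segments mode at runtime:
    # off while vLLM/SGLang hold device memory (weight sync, CUDA graph capture),
    # back on afterwards to reduce fragmentation during training.
    if torch.cuda.is_available():
        torch.cuda.memory._set_allocator_settings(f"expandable_segments:{enable}")
```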