diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 4f092f174f0..91d368abe69 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -6,7 +6,7 @@ - [ ] Search for similar PRs. Paste at least one query link here: ... - [ ] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI) - - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data` + - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward` - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]` - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test` - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title. diff --git a/.github/workflows/checkpoint_converter.yml b/.github/workflows/checkpoint_converter.yml deleted file mode 100644 index 4820497f79c..00000000000 --- a/.github/workflows/checkpoint_converter.yml +++ /dev/null @@ -1,175 +0,0 @@ -# # Tests layout - -# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance: -# - `tests/trainer` for testing functionality related to `verl/trainer` -# - `tests/models` for testing functionality related to `verl/models` -# - ... 
- -# There are a few folders with `special_` prefix, created for special purposes: -# - `special_distributed`: unit tests that must run with multiple GPUs -# - `special_e2e`: end-to-end tests with training/generation scripts -# - `special_npu`: tests for NPUs -# - `special_sanity`: a suite of quick sanity tests -# - `special_standalone`: a set of test that are designed to run in dedicated environments - -# Accelerators for tests -# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`. -# - For test scripts with `on_cpu.py` name suffix would be tested on CPU resources in linux environment. - -# # Workflow layout - -# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs: -# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `check-pr-title,yml`, `pre-commit.yml`, `doc.yml` -# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml` -# 3. End-to-end tests: `e2e_*.yml` -# 4. Unit tests -# - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py` -# - `gpu_unit_tests.yml`, run pytest on all scripts with file without the `on_cpu.py` suffix. -# - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when -# - new workflow yaml is added to `.github/workflows` -# - new tests are added to workflow mentioned in 2. 
- -name: checkpoint_converter -# latest version: Megatron-LM core_v0.14.0 https://github.com/NVIDIA/Megatron-LM/tree/core_v0.14.0 - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - - v0.* - pull_request: - branches: - - main - - v0.* - paths: - - "**/*.py" - # Other entrypoints - - "!examples/**" - - "!tests/**" - - "!verl/trainer/main_*.py" - - "!verl/trainer/fsdp_sft_trainer.py" - # Recipes - - "!recipe/**" - # FSDP - - "!verl/workers/**/*dp_*.py" - # Entrypoints - - ".github/workflows/checkpoint_converter.yml" - - ".github/workflows/e2e_ppo_trainer_megatron.yml" - - "examples/data_preprocess/gsm8k.py" - - "tests/special_e2e/run_ppo_trainer_megatron.sh" - - "verl/trainer/main_ppo.py" - - "verl/trainer/config/ppo_megatron_trainer.yaml" - -# Cancel jobs on the same ref if a new one is triggered -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -# Declare permissions just read content. 
-permissions: - contents: read - -env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:sgl055.dev2" - DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" - -jobs: - setup: - if: github.repository_owner == 'volcengine' - runs-on: ubuntu-latest - outputs: - runner-label: ${{ steps.create-runner.outputs.runner-label }} - mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }} - steps: - - uses: actions/checkout@v4 - - id: create-runner - uses: volcengine/vemlp-github-runner@v1 - with: - mode: "create" - faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}" - mlp-image: "${{ env.IMAGE }}" - - checkpoint_converter: - needs: setup - runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ] - timeout-minutes: 20 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install -e .[test] -# - name: Download Model to Use -# run: | -# huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir ${HOME}/models/Qwen/Qwen2.5-0.5B -# huggingface-cli download deepseek-ai/deepseek-coder-1.3b-instruct --local-dir ${HOME}/models/deepseek-ai/deepseek-coder-1.3b-instruct -# export HF_HUB_OFFLINE=1 - - name: Running Huggingface to Megatron dist_ckpt converter (Qwen/Qwen2.5-0.5B) - run: | - ray stop --force - python scripts/converter_hf_to_mcore.py --hf_model_path=${HOME}/models/Qwen/Qwen2.5-0.5B --output_path checkpoints/Qwen/Qwen2.5-0.5B --test - - name: Running Huggingface to Megatron dist_ckpt converter (deepseek-ai/deepseek-coder-1.3b-instruct) - run: | - ray stop --force - python scripts/converter_hf_to_mcore.py --hf_model_path=${HOME}/models/deepseek-ai/deepseek-coder-1.3b-instruct --output_path 
checkpoints/deepseek-ai/deepseek-coder-1.3b-instruct --test - - name: Clean up - run: | - rm -rf checkpoints - - checkpoint_converter_large_moe_models: - needs: setup - runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ] - timeout-minutes: 30 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - HF_ENDPOINT: "https://hf-mirror.com" - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install -e .[test] -# - name: Download Model to Use -# run: | -# huggingface-cli download Qwen/Qwen1.5-MoE-A2.7B-Chat --local-dir ${HOME}/models/Qwen/Qwen1.5-MoE-A2.7B-Chat -# export HF_HUB_OFFLINE=1 - - name: Running Huggingface to Megatron dist_ckpt CPU converter (Qwen/Qwen1.5-MoE-A2.7B-Chat) - run: | - ray stop --force - python scripts/converter_hf_to_mcore.py --hf_model_path=${HOME}/models/Qwen/Qwen1.5-MoE-A2.7B-Chat --output_path checkpoints/Qwen/Qwen1.5-MoE-A2.7B-Chat --use_cpu_initialization - - name: Running distributed Huggingface to Megatron dist_ckpt CPU converter (Qwen/Qwen1.5-MoE-A2.7B-Chat) - run: | - ray stop --force - torchrun --nproc_per_node 8 --nnodes 1 scripts/converter_hf_to_mcore.py --hf_model_path=${HOME}/models/Qwen/Qwen1.5-MoE-A2.7B-Chat --output_path checkpoints/Qwen/Qwen1.5-MoE-A2.7B-Chat_dist --use_cpu_initialization - - name: clean up - run: | - rm -rf checkpoints - - cleanup: - runs-on: ubuntu-latest - needs: - [ - setup, - checkpoint_converter, - checkpoint_converter_large_moe_models - ] - if: always() - steps: - - id: destroy-runner - uses: volcengine/vemlp-github-runner@v1 - with: - mode: "destroy" - faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}" - mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}" \ No newline at end of file diff --git 
a/.github/workflows/e2e_ascend.yml b/.github/workflows/e2e_ascend.yml index 32ed62e5838..41673db6541 100644 --- a/.github/workflows/e2e_ascend.yml +++ b/.github/workflows/e2e_ascend.yml @@ -65,22 +65,24 @@ permissions: contents: read jobs: - test: + non_rl_job: if: github.repository_owner == 'volcengine' - name: verl Ascend test (self-host) - runs-on: linux-aarch64-a2-8 - timeout-minutes: 60 # Increase this timeout value as needed + name: E2E Ascend testing for non-RL algorithm scenarios + runs-on: linux-aarch64-a2-2 + timeout-minutes: 60 container: image: swr.ap-southeast-1.myhuaweicloud.com/base_image/ascend-ci/verl/verl:verl-8.3.rc1-910b-ubuntu22.04-py3.11-latest options: >- --shm-size 16g env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable steps: + - name: Config third-party dependency download cache + run: | + sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list + pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple + pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local - name: Check npu and CANN info run: | cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info @@ -103,47 +105,126 @@ jobs: - name: Preprocess gsm8k dataset run: | python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k - - name: Preprocess geo3k dataset - run: | - python examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/.cache/datasets/hiyouga/geometry3k - - name: Running gsm8k e2e qwen3 training tests with PPO on ASCEND NPU - run: | - ray stop --force - bash tests/special_npu/run_qwen3_06b_ppo.sh - rm -rf $HOME/ckpts - name: Running gsm8k e2e training tests with peft sft on ASCEND NPU run: | 
ray stop --force bash tests/special_npu/run_qwen2_5_05b_sft_peft_sp2.sh rm -rf $HOME/ckpts - - name: Running gsm8k e2e training tests with GRPO on ASCEND NPU + - name: Running NPU profiling unit tests run: | ray stop --force - bash tests/special_npu/run_qwen2_5_05b_grpo.sh - rm -rf $HOME/ckpts - - name: Running geo3k e2e training tests with GRPO on ASCEND NPU + pytest -s -x tests/utils/test_special_mstx_profile.py + + llm_rl_job: + if: github.repository_owner == 'volcengine' + name: E2E Ascend testing for RL training scenarios of LLM models + runs-on: linux-aarch64-a2-8 + timeout-minutes: 60 + container: + image: swr.ap-southeast-1.myhuaweicloud.com/base_image/ascend-ci/verl/verl:verl-8.3.rc1-910b-ubuntu22.04-py3.11-latest + options: >- + --shm-size 16g + env: + HF_ENDPOINT: "https://hf-mirror.com" + HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable + steps: + - name: Config third-party dependency download cache + run: | + sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list + pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple + pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local + - name: Check npu and CANN info + run: | + cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info + npu-smi info + - name: Check initial pip list from image + run: | + pip list + - name: Checkout volcengine/verl repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + clean: true + - name: Install the current repository + run: | + pip install -r requirements-npu.txt + pip install -e . 
+ - name: Check final pip list + run: | + pip list + - name: Preprocess gsm8k dataset + run: | + python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k + - name: Running gsm8k e2e training tests with PPO on ASCEND NPU (FSDP backend) run: | ray stop --force - bash tests/special_npu/run_qwen2_5_vl_3b_npu.sh + bash tests/special_npu/run_qwen3_06b_ppo.sh rm -rf $HOME/ckpts - - name: Running gsm8k e2e training tests with DAPO on ASCEND NPU + - name: Running gsm8k e2e training tests with GRPO on ASCEND NPU (FSDP backend) run: | ray stop --force - bash tests/special_npu/run_qwen2_5_05b_dapo.sh + bash tests/special_npu/run_qwen2_5_05b_grpo.sh rm -rf $HOME/ckpts - - name: Running gsm8k e2e qwen3 MoE training tests with DAPO MindSpeed on ASCEND NPU + - name: Running gsm8k e2e training tests with DAPO on ASCEND NPU (FSDP backend) run: | ray stop --force - export PYTHONPATH=$PYTHONPATH:/Megatron-LM - USE_DIST_CKPT=True USE_DUMMY_MODEL=True DUMMY_MODEL_CONFIG_PATH=tests/special_e2e/ppo_trainer/expert_parallel/qwen3moe_minimal.json DUMMY_MODEL_PATH=$HOME/dist_ckpt/qwen3_30b_dapo_mindspeed bash tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh - - name: Running gsm8k e2e training tests with GRPO MindSpeed on ASCEND NPU + bash tests/special_npu/run_qwen2_5_05b_dapo.sh + rm -rf $HOME/ckpts + - name: Running gsm8k e2e training tests with GRPO on ASCEND NPU (MindSpeed backend) run: | ray stop --force export PYTHONPATH=$PYTHONPATH:/Megatron-LM USE_DIST_CKPT=True bash tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh rm -rf $HOME/dist_ckpt/qwen2_5_05b_grpo_mindspeed rm -rf $HOME/ckpts - - name: Running NPU profiling unit tests + - name: Running gsm8k e2e training tests with DAPO on ASCEND NPU (MindSpeed backend, MoE Model) run: | ray stop --force - pytest -s -x tests/utils/test_special_mstx_profile.py + export PYTHONPATH=$PYTHONPATH:/Megatron-LM + USE_DIST_CKPT=True USE_DUMMY_MODEL=True 
DUMMY_MODEL_CONFIG_PATH=tests/special_e2e/ppo_trainer/expert_parallel/qwen3moe_minimal.json DUMMY_MODEL_PATH=$HOME/dist_ckpt/qwen3_30b_dapo_mindspeed bash tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh + + vlm_rl_job: + if: github.repository_owner == 'volcengine' + name: E2E Ascend testing for RL training scenarios of VLM models + runs-on: linux-aarch64-a2-8 + timeout-minutes: 60 + container: + image: swr.ap-southeast-1.myhuaweicloud.com/base_image/ascend-ci/verl/verl:verl-8.3.rc1-910b-ubuntu22.04-py3.11-latest + options: >- + --shm-size 16g + env: + HF_ENDPOINT: "https://hf-mirror.com" + HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable + steps: + - name: Config third-party dependency download cache + run: | + sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list + pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple + pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local + - name: Check npu and CANN info + run: | + cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info + npu-smi info + - name: Check initial pip list from image + run: | + pip list + - name: Checkout volcengine/verl repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + clean: true + - name: Install the current repository + run: | + pip install -r requirements-npu.txt + pip install -e . 
+ - name: Check final pip list + run: | + pip list + - name: Preprocess geo3k dataset + run: | + python examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/.cache/datasets/hiyouga/geometry3k + - name: Running geo3k e2e training tests with GRPO on ASCEND NPU + run: | + ray stop --force + bash tests/special_npu/run_qwen2_5_vl_3b_npu.sh + rm -rf $HOME/ckpts diff --git a/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml b/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml index 5e81bc48e96..df049bb0871 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml @@ -115,6 +115,7 @@ jobs: NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable + ENGINE: sglang steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: @@ -135,11 +136,6 @@ jobs: export VLLM_USE_V1=1 ray start --head ENGINE=sglang MODE=async RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: Test Megatron checkpoints merging function (DeepSeek Actor and Critic) - run: | - exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal" - python -m verl.model_merger test --backend megatron --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface - python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface - name: Profiling GRPO GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Deepseek) run: | ray stop --force @@ -154,39 +150,8 @@ jobs: run: | rm -rf checkpoints - e2e_ppo_trainer_megatron-different-train-infer-tp-qwen-tie-embedding: - needs: setup - runs-on: 
["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - - name: Prepare GSM8K dataset - run: | - python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with tie-embedding Megatron (Qwen) with train tp > infer tp - run: | - ray stop --force - ENGINE=sglang VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=2 INFER_TP=1 MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) with train tp < infer tp - run: | - ray stop --force - ENGINE=sglang VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=1 INFER_TP=2 MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: clean up - run: | - rm -rf checkpoints - - e2e_ppo_trainer_megatron-qwen-override-transformer-config: + # Qwen3-0.6B: dense, tie_word_embeddings=True + e2e_ppo_trainer_megatron-qwen3: needs: setup runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 60 # Increase this timeout value as needed @@ -196,6 +161,7 @@ jobs: NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable + ENGINE: sglang steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: @@ -206,57 +172,15 @@ jobs: - name: Prepare GSM8K dataset run: | python3 
examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k -# - name: Download Model to Use -# run: | -# huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir ${HOME}/models/Qwen/Qwen2.5-0.5B -# export HF_HUB_OFFLINE=1 - - name: Prepare dist_ckpt of Qwen2.5-0.5B, uneven layer distribution only supports dist_ckpt - run: | - python3 scripts/converter_hf_to_mcore.py --hf_model_path ${HOME}/models/Qwen/Qwen2.5-0.5B --output_path checkpoints/verl-test/qwen2.5-0.5b-megatron - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) + - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen3) testing learning rate scheduler run: | ray stop --force - ENGINE=sglang SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 SKIP_SAVE_HF_MODEL=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage=8 +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=4 actor_rollout_ref.actor.megatron.use_dist_checkpointing=true actor_rollout_ref.actor.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron actor_rollout_ref.ref.megatron.use_dist_checkpointing=true actor_rollout_ref.ref.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron critic.megatron.use_dist_checkpointing=true critic.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron reward_model.megatron.use_dist_checkpointing=true reward_model.megatron.dist_checkpointing_path=checkpoints/verl-test/qwen2.5-0.5b-megatron - cp -r checkpoints checkpoints-dut - ENGINE=sglang SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: Test Megatron checkpoints merging function (Qwen Actor and Critic) - run: | - exp_name="qwen2.5-0.5b-megatron-gsm8k-minimal" - python -m verl.model_merger 
test --backend megatron --tie-word-embedding --local_dir checkpoints-dut/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface - python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints-dut/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface - - name: clean up - run: | - rm -rf checkpoints - - e2e_ppo_trainer_megatron-deepseek-override-transformer-config: - needs: setup - runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - - name: Prepare GSM8K dataset - run: | - python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek) + ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh + - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with FP8 rollout run: | ray stop --force - ENGINE=sglang SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct COMMON_PP=2 COMMON_VPP=null bash tests/special_e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=true 
+actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=true - - name: Test Megatron checkpoints merging function (DeepSeek Actor and Critic) - run: | - exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal" - python -m verl.model_merger test --backend megatron --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface - python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface + export VLLM_USE_V1=1 + ROLLOUT_QUANTIZATION=fp8 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh - name: clean up run: | rm -rf checkpoints @@ -267,9 +191,7 @@ jobs: [ setup, e2e_ppo_trainer_megatron-deepseek, - e2e_ppo_trainer_megatron-different-train-infer-tp-qwen-tie-embedding, - e2e_ppo_trainer_megatron-qwen-override-transformer-config, - e2e_ppo_trainer_megatron-deepseek-override-transformer-config, + e2e_ppo_trainer_megatron-qwen3, ] if: always() steps: diff --git a/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml b/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml index d5e5efad222..e738fde2f8b 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml @@ -105,38 +105,7 @@ jobs: faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}" mlp-image: "${{ env.IMAGE }}" - e2e_ppo_trainer_megatron-qwen2_5vl-3b: - needs: setup - runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - 
steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - - name: Prepare Geo3k dataset - run: | - python3 examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/models/hf_data/hiyouga/geometry3k/ - - name: Prepare dist_ckpt of Qwen2.5-VL-3B, only supports dist_ckpt - run: | - python3 scripts/converter_hf_to_mcore.py --hf_model_path ${HOME}/models/Qwen/Qwen2.5-VL-3B-Instruct --output_path checkpoints/verl-test/qwen2.5-vl-3b-megatron - - name: Running Geo3k E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) - run: | - ray stop --force - ENGINE=sglang ROLLOUT_MODE=async TRAIN_FILES=${HOME}/data/geo3k/train.parquet VAL_FILES=${HOME}/data/geo3k/test.parquet MAX_PROMPT_LENGTH=1024 MAX_RESPONSE_LENGTH=2048 MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct ADV_ESTIMATOR=grpo USE_DYNAMIC_BSZ=False SKIP_SAVE_HF_MODEL=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 COMMON_TP=2 USE_DIST_CKPT=true DIST_CKPT_PATH=checkpoints/verl-test/qwen2.5-vl-3b-megatron bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: clean up - run: | - rm -rf checkpoints - - e2e_ppo_trainer_sglang: + e2e_ppo_trainer_fsdp_sglang: needs: setup runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ] timeout-minutes: 40 # Increase this timeout value as needed @@ -161,12 +130,8 @@ jobs: run: | ray stop --force ENGINE=sglang bash tests/special_e2e/ppo_trainer/run_function_reward.sh - - name: Running GSM8K E2E training tests on sglang async - run: | - ray stop --force - TOTAL_TRAIN_STEPS=2 ENGINE=sglang ROLLOUT_MODE=async bash tests/special_e2e/ppo_trainer/run_function_reward.sh - e2e_ppo_trainer_sglang_vlm: + e2e_ppo_trainer_fsdp-qwen2_5vl-3b: needs: setup runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ] timeout-minutes: 60 # Increase this timeout value as needed @@ -220,42 +185,13 @@ jobs: ACTOR_FSDP_OPTIMIZER_OFFLOAD=True 
REF_FSDP_PARAM_OFFLOAD=True \ bash tests/special_e2e/ppo_trainer/run_function_reward.sh - e2e_ppo_trainer_megatron-sglang-fp8: - needs: setup - runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - - name: Prepare GSM8K dataset - run: | - python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - - name: Running GSM8K E2E training tests on 8 L20 GPUs with SGLang (FP8) - run: | - ray stop --force - ENGINE=sglang ROLLOUT_QUANTIZATION=fp8 ROLLOUT_MODE=async TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: clean up - run: | - rm -rf checkpoints - cleanup: runs-on: ubuntu-latest needs: [ setup, - e2e_ppo_trainer_megatron-qwen2_5vl-3b, - e2e_ppo_trainer_sglang, - e2e_ppo_trainer_sglang_vlm + e2e_ppo_trainer_fsdp-qwen2_5vl-3b, + e2e_ppo_trainer_fsdp_sglang, ] if: always() steps: diff --git a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml index 58554246336..f329ae9b7aa 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml @@ -105,6 +105,7 @@ jobs: faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}" mlp-image: "${{ env.IMAGE }}" + # deepseek-ai/deepseek-coder-1.3b-instruct: dense, tie_word_embeddings=False e2e_ppo_trainer_megatron-deepseek: needs: setup runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] @@ -127,6 +128,7 @@ jobs: - name: Prepare GSM8K dataset 
run: | python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k + # Full training save&load - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron, use mbridge e2e to pre-load and save (Deepseek) run: | ray stop --force @@ -137,11 +139,12 @@ jobs: ray stop --force RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 USE_MBRIDGE=True USE_DIST_CKPT=False \ bash tests/special_e2e/run_ppo_trainer_megatron.sh + # LoRA training save&load - name: clean up and install Megatron-Bridge run: | rm -rf checkpoints - pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@af21db0 --no-deps --no-build-isolation - pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@3cbe5c6 --no-deps --no-build-isolation + pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@a489bed --no-deps --no-build-isolation + pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@2d398b4 --no-deps --no-build-isolation pip3 install "nvidia-modelopt[torch]>=0.37.0" transformers==4.57.1 - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron, use Megatron-Bridge LoRA e2e to pre-load and save (Deepseek) run: | @@ -156,28 +159,8 @@ jobs: - name: clean up run: | rm -rf checkpoints - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek) - run: | - ray stop --force - export VLLM_USE_V1=1 - ray start --head - MODE=async USE_FUSED_KERNELS=True MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 SAVE_FREQ=2 bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: Test Megatron checkpoints merging function (DeepSeek Actor and Critic) - run: | - exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal" - python -m verl.model_merger test --backend megatron --local_dir checkpoints/verl-test/${exp_name}/global_step_2/actor 
--test_hf_dir checkpoints/verl-test/${exp_name}/global_step_2/actor/huggingface - python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints/verl-test/${exp_name}/global_step_2/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_2/critic/huggingface - - name: Test Megatron distributed checkpoints merging function (DeepSeek) - run: | - exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal" - torchrun --nproc_per_node 4 --nnodes 1 -m verl.model_merger merge --backend megatron --local_dir checkpoints/verl-test/${exp_name}/global_step_2/actor --target_dir checkpoints/verl-test/${exp_name}/global_step_2/actor/hf_model - - name: Running GRPO GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Deepseek) - run: | - ray stop --force - ADV_ESTIMATOR=grpo USE_DYNAMIC_BSZ=False MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: clean up - run: | - rm -rf checkpoints + + # Qwen3-0.6B: dense, tie_word_embeddings=True e2e_ppo_trainer_megatron-qwen3: needs: setup runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] @@ -199,100 +182,15 @@ jobs: - name: Prepare GSM8K dataset run: | python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen3) with validation and saving - run: | - ray stop --force - ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen3) testing learning rate scheduler run: | ray stop --force - LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh - - - name: Test Megatron checkpoints merging function (Qwen3 Actor and Critic) - run: | - 
exp_name="qwen3-0.6b-megatron-gsm8k-minimal" - python -m verl.model_merger test --backend megatron --tie-word-embedding --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface - python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface + ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 LR_WARMUP_STEPS=1 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with FP8 rollout run: | ray stop --force export VLLM_USE_V1=1 - ROLLOUT_QUANTIZATION=fp8 ROLLOUT_MODE=async TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: clean up - run: | - rm -rf checkpoints - e2e_ppo_trainer_megatron-different-train-infer-tp-qwen-tie-embedding: - needs: setup - runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - pip3 install math-verify transformers==$TRANSFORMERS_VERSION - - name: Prepare GSM8K dataset - run: | - python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with tie-embedding Megatron (Qwen) with train tp > infer tp - run: | - ray stop --force - 
VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=2 INFER_TP=1 MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) with train tp < infer tp - run: | - ray stop --force - VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=1 INFER_TP=2 ALL_OFFLOAD=True MODEL_ID=Qwen/Qwen2.5-1.5B bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: clean up - run: | - rm -rf checkpoints - e2e_ppo_trainer_megatron-qwen-override-transformer-config: - needs: setup - runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - pip3 install math-verify transformers==$TRANSFORMERS_VERSION - - name: Prepare GSM8K dataset - run: | - python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k -# - name: Download Model to Use -# run: | -# huggingface-cli download Qwen/Qwen2.5-0.5B --local-dir ${HOME}/models/Qwen/Qwen2.5-0.5B -# export HF_HUB_OFFLINE=1 - - name: Prepare dist_ckpt of Qwen2.5-0.5B, uneven layer distribution only supports dist_ckpt - run: | - python3 scripts/converter_hf_to_mcore.py --hf_model_path ${HOME}/models/Qwen/Qwen2.5-0.5B --output_path checkpoints/verl-test/qwen2.5-0.5b-megatron - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) - run: | - ray stop --force - SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 SKIP_SAVE_HF_MODEL=1 USE_DIST_CKPT=True 
DIST_CKPT_PATH=checkpoints/verl-test/qwen2.5-0.5b-megatron \ - bash tests/special_e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage=8 +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=4 - cp -r checkpoints checkpoints-dut - SAVE_FREQ=1 COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: Test Megatron checkpoints merging function (Qwen Actor and Critic) - run: | - exp_name="qwen2.5-0.5b-megatron-gsm8k-minimal" - python -m verl.model_merger test --backend megatron --tie-word-embedding --local_dir checkpoints-dut/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface - python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints-dut/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface + ROLLOUT_QUANTIZATION=fp8 TOTAL_TRAIN_STEPS=2 MODEL_ID=Qwen/Qwen3-0.6B bash tests/special_e2e/run_ppo_trainer_megatron.sh - name: clean up run: | rm -rf checkpoints @@ -304,8 +202,6 @@ jobs: setup, e2e_ppo_trainer_megatron-deepseek, e2e_ppo_trainer_megatron-qwen3, - e2e_ppo_trainer_megatron-different-train-infer-tp-qwen-tie-embedding, - e2e_ppo_trainer_megatron-qwen-override-transformer-config, ] if: always() steps: diff --git a/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml b/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml index 8908718f144..a35756dd224 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml @@ -105,39 +105,6 @@ jobs: faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}" mlp-image: "${{ env.IMAGE }}" - e2e_ppo_trainer_megatron-deepseek-override-transformer-config: - needs: setup - runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - 
timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - pip3 install transformers==$TRANSFORMERS_VERSION - - name: Prepare GSM8K dataset - run: | - python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek) - run: | - ray stop --force - SAVE_FREQ=1 MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct COMMON_PP=2 COMMON_VPP=null bash tests/special_e2e/run_ppo_trainer_megatron.sh +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=true +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=true - - name: Test Megatron checkpoints merging function (DeepSeek Actor and Critic) - run: | - exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal" - python -m verl.model_merger test --backend megatron --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface - python -m verl.model_merger test --backend megatron --is-value-model --local_dir checkpoints/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface - - name: clean up - run: | - rm -rf checkpoints e2e_ppo_trainer_megatron-moe-expert-parallel: needs: setup runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] @@ -155,8 +122,8 @@ jobs: - name: Install the current repository run: | pip3 install --no-deps -e 
.[test] - pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@af21db0 --no-deps --no-build-isolation - pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@3cbe5c6 --no-deps --no-build-isolation + pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@a489bed --no-deps --no-build-isolation + pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@2d398b4 --no-deps --no-build-isolation pip3 install "nvidia-modelopt[torch]>=0.37.0" transformers==4.57.1 - name: Prepare GSM8K dataset run: | @@ -185,42 +152,8 @@ jobs: - name: clean up run: | rm -rf checkpoints - e2e_ppo_trainer_megatron-qwen2_5vl-3b: - needs: setup - runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 60 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install --no-deps -e .[test] - pip3 install transformers==$TRANSFORMERS_VERSION - - name: Prepare Geo3k dataset - run: | - python3 examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/models/hf_data/hiyouga/geometry3k/ - - name: Prepare dist_ckpt of Qwen2.5-VL-3B, only supports dist_ckpt - run: | - python3 scripts/converter_hf_to_mcore.py --hf_model_path ${HOME}/models/Qwen/Qwen2.5-VL-3B-Instruct --output_path checkpoints/verl-test/qwen2.5-vl-3b-megatron - - name: Running Geo3k E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) - run: | - ray stop --force - TRAIN_FILES=${HOME}/data/geo3k/train.parquet VAL_FILES=${HOME}/data/geo3k/test.parquet \ - MAX_PROMPT_LENGTH=1024 MAX_RESPONSE_LENGTH=2048 MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct ADV_ESTIMATOR=grpo \ - 
USE_DYNAMIC_BSZ=False USE_FUSED_KERNELS=True SKIP_SAVE_HF_MODEL=1 \ - COMMON_PP=4 COMMON_VPP=null COMMON_CP=1 COMMON_TP=2 USE_DIST_CKPT=true \ - DIST_CKPT_PATH=checkpoints/verl-test/qwen2.5-vl-3b-megatron bash tests/special_e2e/run_ppo_trainer_megatron.sh - - name: clean up - run: | - rm -rf checkpoints - e2e_ppo_trainer_vllm: + + e2e_ppo_trainer_fsdp_vllm: needs: setup runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ] timeout-minutes: 60 # Increase this timeout value as needed @@ -242,16 +175,6 @@ jobs: run: | ray stop --force python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - # HF sanity -# - name: Running GSM8K E2E training tests on 1 L20 GPU with hf for sanity -# run: | -# ray stop --force -# bash tests/special_e2e/ppo_trainer/run_single_gpu.sh -# # HF sanity -# - name: Running GSM8K E2E training tests on 1 L20 GPU with engine interface for sanity. -# run: | -# ray stop --force -# bash tests/special_e2e/ppo_trainer/run_single_gpu_with_engine.sh # Function RM - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with validation and saving (FSDP_SIZE=8) run: | @@ -268,7 +191,7 @@ jobs: - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with validation and saving (DDP_SIZE=2, FSDP_SIZE=4) run: | ray stop --force - VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True FSDP_SIZE=4 VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-ddp-size2-fsdp-size4" bash tests/special_e2e/ppo_trainer/run_function_reward.sh + VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True FSDP_SIZE=4 USE_KL=True VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-ddp-size2-fsdp-size4" bash tests/special_e2e/ppo_trainer/run_function_reward.sh - name: Test merging DDP+FSDP checkpoints (Qwen Actor) run: | exp_name="qwen2.5-0.5b-function-reward-minimal-ddp-size2-fsdp-size4" @@ -288,19 +211,11 @@ jobs: - name: Running GSM8K E2E training tests on 8 L20 
GPUs with rmpad using function rm (GRPO) run: | ray stop --force - ADV_ESTIMATOR=grpo USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh + CUSTOM_REWARD_FN=True ADV_ESTIMATOR=grpo USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh # - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm (ReMax) # run: | # ray stop --force # ADV_ESTIMATOR=remax USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh - - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using customized reward function - run: | - ray stop --force - CUSTOM_REWARD_FN=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh - - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with in-reward kl and kl loss - run: | - ray stop --force - USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh # LoRA tests - name: Running GSM8K E2E training tests on 8 L20 GPUs with grpo lora using function rm with use_shm run: | @@ -320,47 +235,8 @@ jobs: run: | ray stop --force ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True STRATEGY=fsdp2 bash tests/special_e2e/ppo_trainer/run_function_reward.sh - # Model RM - - name: Running GRPO GSM8K E2E training tests with FSDP on 8 L20 GPUs (DeepSeek) - run: | - ray stop --force - MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/special_e2e/ppo_trainer/run_function_reward.sh - - name: Running GSM8K E2E with rmpad using model rm - run: | - ray stop --force - bash tests/special_e2e/ppo_trainer/run_model_reward.sh - - name: Running GSM8K E2E without rmpad using model rm - run: | - ray stop --force - RM_PAD=False bash tests/special_e2e/ppo_trainer/run_model_reward.sh - - name: Running GSM8K E2E with rmpad using model rm and ulysses sp=2 - run: | - ray stop --force - SP_SIZE=2 bash tests/special_e2e/ppo_trainer/run_model_reward.sh - - name: Running GSM8K E2E with rmpad using model rm and 
dynamic batch size - run: | - ray stop --force - SEQ_BALANCE=True bash tests/special_e2e/ppo_trainer/run_model_reward.sh - - name: Running GSM8K E2E with rmpad using model rm with Liger Kernel enabled - run: | - ray stop --force - LIGER=True bash tests/special_e2e/ppo_trainer/run_model_reward.sh - - name: Running GSM8K E2E with rmpad using model rm with Fused Kernel enabled - run: | - ray stop --force - FUSED_KERNELS=True bash tests/special_e2e/ppo_trainer/run_model_reward.sh - - name: Running GSM8K E2E with rmpad using model rm with Fused Kernel enabled - run: | - ray stop --force - FUSED_KERNEL=True FUSED_KERNEL_BACKEND=triton bash tests/special_e2e/ppo_trainer/run_model_reward.sh - - name: Running GSM8K E2E training tests on vllm async - run: | - ray stop --force - export VLLM_USE_V1=1 - ray start --head - TOTAL_TRAIN_STEPS=2 ENGINE=vllm ROLLOUT_MODE=async bash tests/special_e2e/ppo_trainer/run_function_reward.sh - e2e_ppo_trainer_vllm_vlm: + e2e_ppo_trainer_fsdp-qwen2_5vl-3b: needs: setup runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ] timeout-minutes: 40 # Increase this timeout value as needed @@ -417,10 +293,9 @@ jobs: needs: [ setup, - e2e_ppo_trainer_megatron-deepseek-override-transformer-config, - e2e_ppo_trainer_megatron-qwen2_5vl-3b, - e2e_ppo_trainer_vllm, - e2e_ppo_trainer_vllm_vlm + e2e_ppo_trainer_megatron-moe-expert-parallel, + e2e_ppo_trainer_fsdp-qwen2_5vl-3b, + e2e_ppo_trainer_fsdp_vllm, ] if: always() steps: diff --git a/.github/workflows/e2e_sft.yml b/.github/workflows/e2e_sft.yml index 80f73b76d8f..64d55a185a6 100644 --- a/.github/workflows/e2e_sft.yml +++ b/.github/workflows/e2e_sft.yml @@ -91,7 +91,7 @@ jobs: e2e_sft: needs: setup runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] - timeout-minutes: 30 # Increase this timeout value as needed + timeout-minutes: 40 # Increase this timeout value as needed env: HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} @@ -146,7 +146,13 @@ jobs: 
- name: Running GSM8K E2E training tests with multiturn and various configs and compare results run: | bash tests/special_e2e/sft/test_sft_engine_all.sh - + - name: Prepare pokemon-gpt4o-captions dataset + run: | + ray stop --force + python3 examples/data_preprocess/pokemon.py --local_dataset_path ${HOME}/models/hf_data/pokemon-gpt4o-captions + - name: Running Pokemon E2E training tests with multiturn and various configs and compare results + run: | + MODEL_ID=Qwen/Qwen3-VL-2B-Instruct DATASET_DIR=~/data/pokemon-gpt4o-captions VPP_SIZE=null bash tests/special_e2e/sft/test_sft_engine_all.sh cleanup: runs-on: ubuntu-latest diff --git a/.github/workflows/e2e_transferqueue.yml b/.github/workflows/e2e_transferqueue.yml index da5443f43aa..1abefc14be1 100644 --- a/.github/workflows/e2e_transferqueue.yml +++ b/.github/workflows/e2e_transferqueue.yml @@ -124,13 +124,14 @@ jobs: run: | pip3 install --no-deps -e .[test,gpu] pip3 install transformers==$TRANSFORMERS_VERSION - pip3 install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple TransferQueue==0.1.2.dev0 + pip3 install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple TransferQueue==0.1.4.dev1 - name: Prepare GSM8K dataset run: | python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - - name: Running the E2E test with TransferQueue (FSDP) + - name: Running the E2E test with TransferQueue (FSDP), enable zero copy serialization run: | ray stop --force + export TQ_ZERO_COPY_SERIALIZATION=True bash tests/special_e2e/run_transferqueue.sh # Test Megatron strategy @@ -153,13 +154,14 @@ jobs: run: | pip3 install --no-deps -e .[test,gpu] pip3 install transformers==$TRANSFORMERS_VERSION - pip3 install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple TransferQueue==0.1.2.dev0 + pip3 install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple TransferQueue==0.1.4.dev1 - name: Prepare GSM8K dataset run: | python3 examples/data_preprocess/gsm8k.py --local_dataset_path 
${HOME}/models/hf_data/gsm8k - - name: Running the E2E test with TransferQueue (Megatron) + - name: Running the E2E test with TransferQueue (Megatron), disable zero copy serialization run: | ray stop --force + export TQ_ZERO_COPY_SERIALIZATION=False bash tests/special_e2e/run_transferqueue.sh cleanup: diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml index cab35a68d96..c9f1f2deac2 100644 --- a/.github/workflows/model.yml +++ b/.github/workflows/model.yml @@ -48,7 +48,6 @@ on: # Entrypoints - ".github/workflows/model.yml" - "tests/special_distributed/test_fsdp_ckpt.py" - - "tests/special_distributed/test_mcore_config_converter.py" - "tests/special_distributed/test_tensor_dict.py" - "tests/models/**" - "tests/special_distributed/run_all.sh" @@ -144,34 +143,6 @@ jobs: run: | STRATEGY=fsdp2 torchrun --nproc_per_node=8 tests/special_distributed/test_fsdp_ckpt.py - mcore_config_converter: - needs: setup - runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ] - timeout-minutes: 20 # Increase this timeout value as needed - env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} - HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} - NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" - HF_ENDPOINT: "https://hf-mirror.com" - HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - name: Install the current repository - run: | - pip3 install -e .[test] -# - name: Download model config files -# run: | -# hf download Qwen/Qwen2.5-7B config.json --local-dir $HOME/configs/Qwen/Qwen2.5-7B -# hf download Qwen/Qwen3-8B config.json --local-dir $HOME/configs/Qwen/Qwen3-8B -# hf download deepseek-ai/deepseek-coder-1.3b-instruct config.json --local-dir $HOME/configs/deepseek-ai/deepseek-coder-1.3b-instruct -# hf download Qwen/Qwen2-57B-A14B config.json --local-dir $HOME/configs/Qwen/Qwen2-57B-A14B -# hf download Qwen/Qwen3-30B-A3B config.json --local-dir 
$HOME/configs/Qwen/Qwen3-30B-A3B -# hf download deepseek-ai/DeepSeek-V3-Base config.json --local-dir $HOME/configs/deepseek-ai/DeepSeek-V3-Base - - name: Running mcore config converter tests on 8 L20 GPUs - run: | - torchrun --nproc_per_node=8 tests/special_distributed/test_mcore_config_converter.py model_engine: needs: setup @@ -206,7 +177,6 @@ jobs: setup, model_rmpad, model_rmpad_fsdp2_unstable, - mcore_config_converter, model_engine ] if: always() diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index 2520968a318..fd9349683e8 100644 --- a/.github/workflows/vllm.yml +++ b/.github/workflows/vllm.yml @@ -124,6 +124,9 @@ jobs: - name: Test the latest vLLM Rollout async with agent loop run: | ROLLOUT_NAME=vllm pytest -svvv tests/experimental/agent_loop + - name: Test vllm server abort functionality + run: | + pytest tests/workers/rollout/rollout_vllm/test_vllm_abort.py -v -s # Note(haibin.lin): for any new test, please update gpu_unit_tests.yaml to avoid repeated tests cleanup: diff --git a/docker/ascend/Dockerfile.ascend_8.3.rc1_a2 b/docker/ascend/Dockerfile.ascend_8.3.rc1_a2 index cd9fa59502c..200e7a05f35 100644 --- a/docker/ascend/Dockerfile.ascend_8.3.rc1_a2 +++ b/docker/ascend/Dockerfile.ascend_8.3.rc1_a2 @@ -44,6 +44,8 @@ RUN ARCH=$(uname -m) && \ echo "export PYTHONPATH=\$PYTHONPATH:/Megatron-LM" >> ~/.bashrc && \ # Remove existing triton or triton-ascend installed by some third-party packages pip uninstall -y triton triton-ascend && \ + # Install mbridge + pip install mbridge && \ # Clear extra files rm -rf /tmp/* /var/tmp/* && \ pip cache purge diff --git a/docker/ascend/Dockerfile.ascend_8.3.rc1_a3 b/docker/ascend/Dockerfile.ascend_8.3.rc1_a3 index 7b62a48b9e2..bbf7de87bbe 100644 --- a/docker/ascend/Dockerfile.ascend_8.3.rc1_a3 +++ b/docker/ascend/Dockerfile.ascend_8.3.rc1_a3 @@ -44,6 +44,8 @@ RUN ARCH=$(uname -m) && \ echo "export PYTHONPATH=\$PYTHONPATH:/Megatron-LM" >> ~/.bashrc && \ # Remove existing triton or triton-ascend 
installed by some third-party packages pip uninstall -y triton triton-ascend && \ + # Install mbridge + pip install mbridge && \ # Clear extra files rm -rf /tmp/* /var/tmp/* && \ pip cache purge diff --git a/docs/advance/checkpoint.rst b/docs/advance/checkpoint.rst index 56bec4a75c3..9782af951d9 100644 --- a/docs/advance/checkpoint.rst +++ b/docs/advance/checkpoint.rst @@ -137,32 +137,8 @@ Current implementation use solution 2. HuggingFace to Megatron DistCheckpoint details ---------------------------------------------- -If your model is quite huge, we recommend you to use Megatron dist-checkpoint to load the model. -Megatron dist-checkpoint supports loading with different kinds of model parallelism, -and it is much faster than the original checkpoint loading. - -To convert original HuggingFace model to Megatron dist-checkpoint, -you can use the ``scripts/converter_hf_to_mcore.py`` script. Large MoE models are temporarily supported with CPU initialization, -which is a little slower. While we are working on a better solution to support large models. - -Example command to convert the model is as follows: - -.. code:: bash - - python scripts/converter_hf_to_mcore.py \ - --hf_model_path Qwen/Qwen1.5-MoE-A2.7B-Chat \ - --output_path /mnt/disk/Qwen/Qwen1.5-MoE-A2.7B-Chat \ - --use_cpu_initialization # Only work for MoE models - - -Example command to distributed convert the huge model like deepseekv3 671B is as follows: - -.. code:: bash - - torchrun --nproc_per_node 1 --nnodes 8 --node_rank ${RANK} scripts/converter_hf_to_mcore.py \ - --hf_model_path deepseek-ai/DeepSeek-V3 \ - --output_path /mnt/disk/deepseek-ai/DeepSeek-V3 \ - --use_cpu_initialization # Only work for MoE models +Through ``mbridge``, we can directly save the mcore model to huggingface format during training. +No need to convert the model to Megatron dist-checkpoint format. 
Original Checkpoint Utils ------------------------- diff --git a/docs/advance/one_step_off.md b/docs/advance/one_step_off.md index 9ab644be688..d8861534343 100644 --- a/docs/advance/one_step_off.md +++ b/docs/advance/one_step_off.md @@ -225,7 +225,7 @@ def sync_rollout_weights(self): ### PPO Correctness To ensure the correctness of the PPO algorithm, we use rollout log_probs for PPO importance sampling. For the related algorithm details, please refer to: https://verl.readthedocs.io/en/latest/algo/rollout_corr_math.html -The default mode is ppo_is_bypass, but other modification strategies can also be explored. +The default mode is `bypass_ppo_clip`, but other modification strategies can also be explored. ### AgentLoop In the current implementation, we no longer provide SPMD model rollout mode. diff --git a/docs/algo/rollout_corr.md b/docs/algo/rollout_corr.md index da9512d6706..a2421e238c3 100644 --- a/docs/algo/rollout_corr.md +++ b/docs/algo/rollout_corr.md @@ -130,14 +130,15 @@ config = RolloutCorrectionConfig.decoupled_geo_rs() # Geo-RS config = RolloutCorrectionConfig.geo_rs_seq_tis() # Geo-RS-Seq-TIS # === Bypass PPO mode (2 policies: π_rollout = π_old, π_θ) - fast === -# No IS correction needed since π_old = π_rollout -config = RolloutCorrectionConfig.ppo_is_bypass() # PPO with rollout as anchor +# PPO ratio handles IS, so no explicit IS weights needed +config = RolloutCorrectionConfig.bypass_ppo_clip() # PPO-clip only +config = RolloutCorrectionConfig.bypass_ppo_clip_geo_rs() # PPO-clip + Geo-RS # === Bypass PG mode (2 policies, no PPO clipping) - fast === # IS weights computed on-the-fly as π_θ / π_rollout -config = RolloutCorrectionConfig.pg_is() # Seq-TIS + PG -config = RolloutCorrectionConfig.pg_rs() # Geo-RS + PG -config = RolloutCorrectionConfig.pg_geo_rs_seq_tis() # Geo-RS-Seq-TIS + PG +config = RolloutCorrectionConfig.bypass_pg_is() # Seq-TIS + PG +config = RolloutCorrectionConfig.bypass_pg_rs() # Geo-RS + PG +config = 
RolloutCorrectionConfig.bypass_pg_geo_rs_seq_tis() # Geo-RS-Seq-TIS + PG # === Other === config = RolloutCorrectionConfig.disabled() # Metrics only (no correction) @@ -157,8 +158,8 @@ algorithm: rollout_rs_threshold: null # RS upper threshold (required if rollout_rs is enabled) rollout_rs_threshold_lower: null # RS lower threshold (auto-reciprocal if null) rollout_token_veto_threshold: null # Per-token veto threshold (null = disabled) - bypass_mode: false # Skip old_log_prob computation - use_policy_gradient: false # Use policy gradient loss (vs PPO loss) + bypass_mode: false # Skip old_log_prob computation (sets π_old = π_rollout) + loss_type: ppo_clip # Loss type in bypass mode: "ppo_clip" (default) or "reinforce" # REQUIRED: Enable log prob calculation actor_rollout_ref: @@ -171,7 +172,7 @@ actor_rollout_ref: ### **Core Implementation** - `verl/trainer/ppo/rollout_corr_helper.py` - Contains `compute_rollout_correction_and_rejection_mask()` and `compute_offpolicy_metrics()` -- `verl/trainer/ppo/core_algos.py` - Rollout Correction integration with PPO and pure IS mode (`compute_policy_loss_with_rollout_correction()`) +- `verl/trainer/ppo/core_algos.py` - Rollout Correction integration with PPO and REINFORCE modes (`compute_policy_loss_bypass_mode()`, `compute_policy_loss_reinforce()`) - `verl/trainer/ppo/ray_trainer.py` - Bypass mode implementation (skips `old_log_prob` computation) - `verl/workers/actor/dp_actor.py` - Mode selection logic and metrics collection @@ -266,9 +267,9 @@ The rollout correction framework is built from **orthogonal components** that ca - **Decoupled**: Three policies (π_rollout, π_old, π_θ) with separate π_old computation - **Bypass**: Two policies (π_rollout = π_old, π_θ), skips π_old computation -2. **Loss Function** - - **PPO**: With clipping (standard RL training) - - **Pure IS**: Policy gradient only (no clipping) +2. 
**Loss Function** (in bypass mode, controlled by `loss_type`) + - **PPO-clip** (`loss_type="ppo_clip"`, default): PPO clipped objective (IS handled by ratio) + - **REINFORCE** (`loss_type="reinforce"`): Policy gradient with explicit IS weights (no clipping) 3. **IS/RS Aggregation Level** - **Token**: Per-token IS weights/rejection @@ -298,19 +299,22 @@ This section provides detailed guidance on choosing and using the verified prese | `decoupled_seq_is_rs()` | Seq-MIS | Decoupled | sequence | sequence | Sequence IS + sequence RS | | `decoupled_geo_rs()` | Geo-RS | Decoupled | - | geometric + veto | Geometric RS + veto, no IS weights | | `geo_rs_seq_tis()` | Geo-RS-Seq-TIS | Decoupled | sequence | geometric + veto | Geometric filter + clipped weight | -| **Bypass PPO Mode** (2 policies: π_rollout = π_old, π_θ) | -| `ppo_is_bypass()` | - | Bypass PPO | - | - | PPO with rollout as anchor (no IS correction needed) | -| **Bypass PG Mode** (2 policies: π_rollout, π_θ; IS = π_θ/π_rollout) | -| `pg_is()` | Seq-TIS | Bypass PG | sequence | - | Policy gradient with IS | -| `pg_rs()` | Geo-RS | Bypass PG | - | geometric + veto | Policy gradient with Geo-RS | -| `pg_geo_rs_seq_tis()` | Geo-RS-Seq-TIS | Bypass PG | sequence | geometric + veto | PG + Geo filter + seq IS | +| **Bypass Mode (PPO-clip)** (2 policies; ratio handles IS, RS masks outliers) | +| `bypass_ppo_clip()` | - | Bypass (PPO-clip) | - | - | PPO-clip only | +| `bypass_ppo_clip_geo_rs()` | Geo-RS | Bypass (PPO-clip) | - | geometric + veto | PPO-clip + Geo-RS | +| **Bypass Mode (REINFORCE)** (2 policies; explicit IS weights, no PPO clipping) | +| `bypass_pg_is()` | Seq-TIS | Bypass (REINFORCE) | sequence | - | REINFORCE with explicit IS | +| `bypass_pg_rs()` | Geo-RS | Bypass (REINFORCE) | - | geometric + veto | REINFORCE with Geo-RS | +| `bypass_pg_geo_rs_seq_tis()` | Geo-RS-Seq-TIS | Bypass (REINFORCE) | sequence | geometric + veto | REINFORCE + Geo filter + seq IS | | **Other** | | `disabled()` | - | - | - | - | 
Metrics only, no correction | **Note:** -- **Bypass PPO mode** sets π_old = π_rollout, so IS correction is not applicable (the ratio would be 1.0). -- **Bypass PG mode** computes IS weights as π_θ / π_rollout on-the-fly - use this for fast execution with IS/RS correction. -- Estimators (Token-TIS, Seq-TIS, Seq-MIS, Geo-RS, Geo-RS-Seq-TIS) are compatible with Decoupled PPO and Bypass PG modes. +- **Bypass mode** sets π_old = π_rollout and uses `loss_type` to select the loss function: + - `"ppo_clip"` (default): PPO clipped objective where ratio = π_θ/π_rollout already handles IS + - `"reinforce"`: REINFORCE with explicit IS weights as π_θ / π_rollout +- Both loss types benefit from rejection sampling (RS) which masks out-of-distribution samples. +- Estimators (Token-TIS, Seq-TIS, Seq-MIS, Geo-RS, Geo-RS-Seq-TIS) are compatible with Decoupled and Bypass modes. #### Other Supported Combinations (Manual Configuration Required) @@ -325,7 +329,7 @@ See [detailed configuration examples below](#additional-useful-configurations-no - Any aggregation level (token/sequence/geometric) works in either decoupled or bypass mode - All combinations are fully supported by the implementation - Rejection sampling is independent of IS weighting -- Pure RS (`pg_rs`) uses bypass + geometric RS with `use_policy_gradient=True` (no IS weights) +- Pure RS (`bypass_pg_rs`) uses bypass + geometric RS with `loss_type="reinforce"` (no IS weights) --- @@ -521,35 +525,35 @@ algorithm: --- -### 6. PPO with Bypass Mode (`ppo_is_bypass`) +### 6. 
Bypass Mode with PPO-clip (`bypass_ppo_clip`) **Configuration:** ```python -config = RolloutCorrectionConfig.ppo_is_bypass(threshold=2.0) +config = RolloutCorrectionConfig.bypass_ppo_clip() ``` **Components:** - **Operating Mode**: Bypass (2 policies: π_rollout = π_old, π_θ) -- **Loss**: PPO with clipping -- **IS Aggregation**: None (not needed, π_old = π_rollout) +- **Loss**: PPO-clip (IS handled by ratio, no explicit IS weights) +- **IS Aggregation**: None (PPO ratio handles it) - **RS**: None +- **Veto**: None **Equivalent YAML:** ```yaml algorithm: rollout_correction: - rollout_is: token # Placeholder for metrics - rollout_is_threshold: 2.0 + rollout_is: null rollout_rs: null - bypass_mode: true # Bypass mode - use_policy_gradient: false + bypass_mode: true + loss_type: ppo_clip ``` **Properties:** -- Skips `actor.compute_log_prob()` forward pass -- PPO clips against π_rollout (behavior policy) -- Sets π_old = π_rollout (two-policy setup) -- Does not separate proximal from behavior policy +- PPO clipped objective in bypass mode +- The PPO ratio = π_θ/π_rollout already handles IS (no explicit IS weights needed) +- Skips `actor.compute_log_prob()` forward pass (2 policies instead of 3) +- No rejection sampling - use `bypass_ppo_clip_geo_rs()` for RS **Configuration requirement:** - Set `actor_rollout_ref.rollout.calculate_log_probs: true` @@ -558,16 +562,61 @@ algorithm: --- -### 7. Policy Gradient with IS (`pg_is`) +### 6b. 
Bypass Mode with PPO-clip + Geo-RS (`bypass_ppo_clip_geo_rs`) + +**Configuration:** +```python +config = RolloutCorrectionConfig.bypass_ppo_clip_geo_rs( + rs_threshold=1.001, + veto_threshold=1e-4 +) +``` + +**Components:** +- **Operating Mode**: Bypass (2 policies: π_rollout = π_old, π_θ) +- **Loss**: PPO-clip (IS handled by ratio, no explicit IS weights) +- **IS Aggregation**: None (PPO ratio handles it) +- **RS**: Geometric-level rejection +- **Veto**: Enabled + +**Equivalent YAML:** +```yaml +algorithm: + rollout_correction: + rollout_is: null + rollout_rs: geometric + rollout_rs_threshold: 1.001 + rollout_rs_threshold_lower: 0.999 + rollout_token_veto_threshold: 1e-4 + bypass_mode: true + loss_type: ppo_clip +``` + +**Properties:** +- PPO clipped objective in bypass mode with geometric RS +- The PPO ratio = π_θ/π_rollout already handles IS (no explicit IS weights needed) +- Skips `actor.compute_log_prob()` forward pass (2 policies instead of 3) +- Geometric RS masks outliers +- Veto mechanism enabled +- Solves Length Trap problem for CoT/agent workloads + +**Configuration requirement:** +- Set `actor_rollout_ref.rollout.calculate_log_probs: true` + +**Theory:** [§3.1.2 (Bypass)](rollout_corr_math.md#312-bypass-mode-two-policies) + [§3.3.3 (Geometric)](rollout_corr_math.md#333-geometric-aggregation-geo-rs) + +--- + +### 7. 
REINFORCE with IS (`bypass_pg_is`) **Configuration:** ```python -config = RolloutCorrectionConfig.pg_is(threshold=2.0) +config = RolloutCorrectionConfig.bypass_pg_is(threshold=2.0) ``` **Components:** - **Operating Mode**: Bypass (2 policies: π_rollout, π_θ) -- **Loss**: Pure IS (policy gradient only, no PPO clipping) +- **Loss**: REINFORCE (policy gradient with explicit IS weights, no PPO clipping) - **IS Aggregation**: Sequence-level - **RS**: None @@ -578,12 +627,12 @@ algorithm: rollout_is: sequence rollout_is_threshold: 2.0 rollout_rs: null - bypass_mode: true # Required - use_policy_gradient: true # Use policy gradient loss (no PPO clipping) + bypass_mode: true + loss_type: reinforce # REINFORCE with explicit IS weights ``` **Properties:** -- Policy gradient loss (no PPO clipping) +- REINFORCE loss with explicit IS weights (no PPO clipping) - Single forward pass (skips old_log_prob computation) - IS weights computed on-the-fly in loss function @@ -591,11 +640,11 @@ algorithm: --- -### 8. Policy Gradient with Rejection Sampling (`pg_rs`) +### 8. REINFORCE with Rejection Sampling (`bypass_pg_rs`) **Configuration:** ```python -config = RolloutCorrectionConfig.pg_rs( +config = RolloutCorrectionConfig.bypass_pg_rs( rs_threshold=1.001, veto_threshold=1e-4 ) @@ -603,7 +652,7 @@ config = RolloutCorrectionConfig.pg_rs( **Components:** - **Operating Mode**: Bypass (2 policies: π_rollout, π_θ) -- **Loss**: Pure policy gradient (no PPO clipping, via `use_policy_gradient=True`) +- **Loss**: REINFORCE (no PPO clipping) - **IS Aggregation**: None - **RS**: Geometric-level rejection - **Veto**: Enabled @@ -618,7 +667,7 @@ algorithm: rollout_rs_threshold_lower: 0.999 rollout_token_veto_threshold: 1e-4 bypass_mode: true - use_policy_gradient: true + loss_type: reinforce ``` **Properties:** @@ -631,13 +680,13 @@ algorithm: --- -### 9. Policy Gradient with Geo-RS-Seq-TIS (`pg_geo_rs_seq_tis`) +### 9. 
REINFORCE with Geo-RS-Seq-TIS (`bypass_pg_geo_rs_seq_tis`) **Also known as: Geo-RS-Seq-TIS in bypass mode** **Configuration:** ```python -config = RolloutCorrectionConfig.pg_geo_rs_seq_tis( +config = RolloutCorrectionConfig.bypass_pg_geo_rs_seq_tis( is_threshold=2.0, rs_threshold=1.001, veto_threshold=1e-4 @@ -646,7 +695,7 @@ config = RolloutCorrectionConfig.pg_geo_rs_seq_tis( **Components:** - **Operating Mode**: Bypass (2 policies: π_rollout, π_θ) -- **Loss**: Pure policy gradient (no PPO clipping) +- **Loss**: REINFORCE (no PPO clipping) - **IS Aggregation**: Sequence-level (Seq-TIS) - **RS**: Geometric-level rejection (Geo-RS) - **Veto**: Enabled @@ -662,11 +711,11 @@ algorithm: rollout_rs_threshold_lower: 0.999 rollout_token_veto_threshold: 1e-4 bypass_mode: true - use_policy_gradient: true + loss_type: reinforce ``` **Properties:** -- Combines geometric filter + clipped sequence weight with policy gradient loss +- Combines geometric filter + clipped sequence weight with REINFORCE loss - Skips `actor.compute_log_prob()` forward pass (bypass mode) - Suitable for reasoning models (CoT, o1-style) when you want bypass mode efficiency - No PPO clipping - relies on IS/RS for stability @@ -760,11 +809,11 @@ The framework provides **two operating modes** for computing π_old, which can b ### Operating Modes and Configuration -| Configuration | `bypass_mode` | `use_policy_gradient` | Operating Mode | Loss Function | Description | -|---------------|----------------------------------|------------------------------|----------------|---------------|-------------| -| **Decoupled** | `false` | `false` | Decoupled | PPO | Computes `old_log_prob` separately via `actor.compute_log_prob()` | -| **Bypass** | `true` | `false` | Bypass | PPO | Sets `old_log_prob = rollout_log_prob`, PPO clips against rollout policy | -| **Bypass + PG** | `true` | `true` | Bypass | Policy Gradient | Bypass mode with policy gradient loss (no PPO clipping) | +| Configuration | `bypass_mode` | 
`loss_type` | Operating Mode | Loss Function | Description | +|---------------|---------------|-------------|----------------|---------------|-------------| +| **Decoupled** | `false` | N/A | Decoupled | PPO | Computes `old_log_prob` separately via `actor.compute_log_prob()` | +| **Bypass + PPO-clip** | `true` | `"ppo_clip"` (default) | Bypass | PPO-clip | PPO clipped objective (IS handled by ratio) | +| **Bypass + REINFORCE** | `true` | `"reinforce"` | Bypass | REINFORCE | Policy gradient with explicit IS weights (no PPO clipping) | ### Operating Mode Details @@ -829,9 +878,9 @@ The aggregation level can be chosen **independently** of the operating mode. Any ### Example Workflow -**Recommended: Bypass + Policy Gradient Mode** +**Recommended: Bypass Mode** -This workflow uses bypass mode with pure policy gradient loss for efficiency. +This workflow uses bypass mode for efficiency. 1. **Start with metrics only** to understand the off-policy gap: ```yaml @@ -840,7 +889,7 @@ This workflow uses bypass mode with pure policy gradient loss for efficiency. rollout_is: null rollout_rs: null bypass_mode: true # Bypass mode (recommended) - use_policy_gradient: true # Pure policy gradient (recommended) + loss_type: ppo_clip # Default: PPO clipped objective ``` Monitor `rollout_corr/kl`, `rollout_corr/log_ppl_abs_diff`, `rollout_corr/chi2_token` to assess off-policy gap. @@ -852,11 +901,11 @@ This workflow uses bypass mode with pure policy gradient loss for efficiency. rollout_rs: sequence # or "geometric" for higher sensitivity rollout_rs_threshold: 2.0 bypass_mode: true # Bypass mode - use_policy_gradient: true # Pure policy gradient + loss_type: ppo_clip # or "reinforce" for explicit IS weights ``` This excludes outliers from training without modifying gradients. -3. **Enable full IS correction** once comfortable with metrics: +3. 
**Enable full IS correction** (with REINFORCE loss) once comfortable with metrics: ```yaml algorithm: rollout_correction: @@ -865,14 +914,15 @@ This workflow uses bypass mode with pure policy gradient loss for efficiency. rollout_rs: sequence # or "geometric" for more aggressive filtering rollout_rs_threshold: 2.0 bypass_mode: true # Bypass mode - use_policy_gradient: true # Pure policy gradient + loss_type: reinforce # REINFORCE with explicit IS weights ``` -**Benefits of bypass + policy gradient mode:** +**Benefits of bypass mode:** - ✅ Skips expensive `actor.compute_log_prob()` forward pass (faster) -- ✅ IS weights computed on-the-fly in loss function (π_θ / π_rollout) -- ✅ Simpler than PPO (no clipping, pure policy gradient with IS/RS) -- ✅ Works for all IS/RS combinations +- ✅ `loss_type` controls the loss function: "ppo_clip" (default) or "reinforce" +- ✅ PPO-clip: IS handled by ratio (no explicit weights), RS mask applied +- ✅ REINFORCE: Explicit IS weights computed on-the-fly (π_θ / π_rollout) +- ✅ Both loss types work with all IS/RS combinations ## Usage @@ -1249,7 +1299,7 @@ algorithm: rollout_token_veto_threshold: 1e-4 # Veto catastrophic tokens ``` -### Example 5: Bypass Mode +### Example 5: Bypass Mode with PPO-clip (Default) ```yaml algorithm: rollout_correction: @@ -1258,22 +1308,35 @@ algorithm: rollout_rs: token rollout_rs_threshold: 2.0 bypass_mode: true # Skip old_log_prob computation - use_policy_gradient: false # Use bypass mode: PPO with rollout_log_prob as old_log_prob + loss_type: ppo_clip # PPO clipped objective (default) ``` -**Skips expensive `actor.compute_log_prob()` forward pass** +**Skips expensive `actor.compute_log_prob()` forward pass. 
PPO ratio = π_θ/π_rollout handles IS.** -### Example 6: Pure Policy Gradient Mode +### Example 6: Bypass Mode with REINFORCE ```yaml algorithm: rollout_correction: - rollout_is: token # Explicit IS correction in loss + rollout_is: sequence # Explicit IS correction in loss rollout_is_threshold: 2.0 rollout_rs: null # Optional: can add rejection sampling - bypass_mode: true # Required for policy gradient mode - use_policy_gradient: true # Use policy gradient loss (no PPO clipping) + bypass_mode: true + loss_type: reinforce # REINFORCE with explicit IS weights ``` **No PPO clipping, pure policy gradient with IS correction** +### Example 7: Bypass Mode with PPO-clip + Rejection Sampling +```yaml +algorithm: + rollout_correction: + rollout_is: sequence # Computed for metrics + rollout_is_threshold: 2.0 + rollout_rs: geometric # Rejection sampling enabled + rollout_rs_threshold: 1.001 + bypass_mode: true + loss_type: ppo_clip # PPO clipped objective (IS handled by ratio) +``` +**PPO clipping with rejection sampling. IS handled by PPO ratio (no explicit IS weights).** + ## Troubleshooting ### Issue: High spread in IS weights diff --git a/docs/algo/rollout_corr_math.md b/docs/algo/rollout_corr_math.md index 91ca84ae711..5ac34336502 100644 --- a/docs/algo/rollout_corr_math.md +++ b/docs/algo/rollout_corr_math.md @@ -96,7 +96,7 @@ The transition dynamics $p(s_{t+1}|s_t, a_t)$ and initial state $p(s_0)$ cancel - **Off-policy capable**: Can learn from any behavior policy via importance sampling - **No trust region**: Policy updates not constrained -**Implementation in verl:** The `pg_is` method implements off-policy REINFORCE with truncated importance sampling. +**Implementation in verl:** The `bypass_pg_is` preset implements off-policy REINFORCE with truncated importance sampling. 
### 1.2 PPO: Adding Trust Region Control @@ -271,8 +271,8 @@ The operating mode determines how the proximal policy $\pi_{\text{old}}$ is comp - $\pi_{\theta}$: Current policy (being updated) **Ratios:** -- **With PPO loss** (`use_policy_gradient = false`): No separate IS computation; PPO ratio $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ clips against rollout policy -- **With policy gradient loss** (`use_policy_gradient = true`): IS ratio $\rho_t = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ computed on-the-fly in loss function +- **With PPO-clip loss** (`loss_type = "ppo_clip"`, default): PPO ratio $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ clips against rollout policy (IS handled by ratio) +- **With REINFORCE loss** (`loss_type = "reinforce"`): IS ratio $\rho_t = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ computed on-the-fly in loss function **Properties:** - ✅ Skips `actor.compute_log_prob()` call (faster) @@ -286,7 +286,7 @@ The operating mode determines how the proximal policy $\pi_{\text{old}}$ is comp #### 3.2.1 PPO Loss (with Clipping) -**Configuration:** `use_policy_gradient = false` +**Configuration:** `loss_type = "ppo_clip"` (default in bypass mode) **Loss function:** @@ -306,7 +306,7 @@ where: #### 3.2.2 Policy Gradient Loss (with IS/RS Correction) -**Configuration:** `use_policy_gradient = true` (requires `bypass_mode = true`) +**Configuration:** `loss_type = "reinforce"` (requires `bypass_mode = true`) **Loss function** (example with sequence-level IS): @@ -368,12 +368,17 @@ The stopgrad operator is **mathematically required** by importance sampling theo **Intuition**: The IS weight $w(\theta)$ tells us "how much to trust this sample" for estimating the gradient under $\pi_\theta$. 
We update $\theta$ to maximize the reweighted objective, but we don't update $\theta$ to maximize the weight itself—that would be circular reasoning (optimizing the correction factor instead of the actual objective). **Properties:** -- **Algorithm**: Off-policy REINFORCE + IS/RS correction -- **No PPO clipping**: Pure policy gradient +- **Algorithm**: Off-policy policy gradient with IS/RS correction +- **Loss types** (`loss_type` config option in bypass mode): + - `"ppo_clip"` (default): PPO clipped objective + - $L = -\mathbb{E}[\min(r \cdot A, \text{clip}(r) \cdot A)]$ where $r = \pi_\theta / \pi_{\text{rollout}}$ + - Note: IS weights NOT applied (PPO ratio already handles it; would be double-counting) + - `"reinforce"`: Pure policy gradient with explicit IS weights, no PPO clipping + - $L = -\mathbb{E}[w \cdot \log \pi_\theta(a|s) \cdot A]$ where $w = \pi_\theta / \pi_{\text{rollout}}$ - **Always uses bypass mode**: Direct $\pi_\theta$ to $\pi_{\text{rollout}}$ comparison - **Fast**: Single forward pass -**Implementation:** `compute_policy_loss_with_rollout_correction()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py#L1537-L1681) +**Implementation:** `compute_policy_loss_bypass_mode()` and `compute_policy_loss_reinforce()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py) --- @@ -613,7 +618,7 @@ where $\bar{w}_j = \frac{1}{T_j}\sum_{t=1}^{T_j} w_{j,t} \cdot m_{j,t}$ is the p | **Geo-RS** | `rollout_rs="geometric"` | Decoupled PPO, Bypass PG | | **Geo-RS-Seq-TIS** | `rollout_is="sequence"` + `rollout_rs="geometric"` | Decoupled PPO, Bypass PG | -**Note:** Bypass PPO mode (π_old = π_rollout) does not use IS correction since there's no gap to correct. Use Bypass PG mode for fast execution with IS/RS correction. +**Note:** In bypass mode, `loss_type` controls the loss function. Use "ppo_clip" (default) or "reinforce". 
#### Available Preset Methods @@ -625,16 +630,17 @@ where $\bar{w}_j = \frac{1}{T_j}\sum_{t=1}^{T_j} w_{j,t} \cdot m_{j,t}$ is the p | `decoupled_seq_is_rs()` | Seq-MIS | Decoupled PPO | Sequence IS + sequence RS | | `decoupled_geo_rs()` | Geo-RS | Decoupled PPO | Geometric RS + veto | | `geo_rs_seq_tis()` | Geo-RS-Seq-TIS | Decoupled PPO | Geometric filter + seq IS | -| **Bypass PPO Mode** (2 policies: π_rollout = π_old, π_θ) | -| `ppo_is_bypass()` | - | Bypass PPO | PPO with rollout as anchor (no IS correction needed) | -| **Bypass PG Mode** (2 policies: π_rollout, π_θ; IS = π_θ/π_rollout) | -| `pg_is()` | Seq-TIS | Bypass PG | Policy gradient + Seq IS | -| `pg_rs()` | Geo-RS | Bypass PG | Policy gradient + Geo-RS | -| `pg_geo_rs_seq_tis()` | Geo-RS-Seq-TIS | Bypass PG | PG + Geo filter + seq IS | +| **Bypass Mode (PPO-clip)** (ratio handles IS, RS masks outliers) | +| `bypass_ppo_clip()` | - | Bypass (PPO-clip) | PPO-clip only | +| `bypass_ppo_clip_geo_rs()` | Geo-RS | Bypass (PPO-clip) | PPO-clip + Geo-RS | +| **Bypass Mode (REINFORCE)** (explicit IS weights, no PPO clipping) | +| `bypass_pg_is()` | Seq-TIS | Bypass (REINFORCE) | REINFORCE + Seq IS | +| `bypass_pg_rs()` | Geo-RS | Bypass (REINFORCE) | REINFORCE + Geo-RS | +| `bypass_pg_geo_rs_seq_tis()` | Geo-RS-Seq-TIS | Bypass (REINFORCE) | REINFORCE + Geo filter + seq IS | | **Other** | | `disabled()` | - | - | Metrics only | -**Note:** Bypass PPO mode sets π_old = π_rollout, so IS correction is not applicable. Use Bypass PG mode for fast execution with IS/RS correction. +**Note:** Bypass mode sets π_old = π_rollout and uses `loss_type` to select the loss function. 
#### Additional Supported Combinations (Manual Configuration) @@ -676,7 +682,7 @@ config = RolloutCorrectionConfig( - Rejection sampling can be added to any combination - Veto is independent and can be added to any combination - Geometric aggregation is typically used for RS only (not IS weighting) -- Pure RS (`pg_rs`) uses bypass + geometric RS with `use_policy_gradient=True` for pure policy gradient (no IS weights) +- Pure RS (`bypass_pg_rs`) uses bypass + geometric RS with `loss_type="reinforce"` for REINFORCE (no IS weights) - All combinations in the table above are valid and supported by the implementation --- @@ -785,12 +791,16 @@ $$ | Method | Theory | Policies | PPO Clip | IS Correction | Correctness | Speed | |--------|--------|----------|----------|---------------|-------------|-------| -| **Bypass PG Mode** (IS weights = π_θ / π_rollout) | -| `pg_is` | Off-policy REINFORCE | 2 (rollout, θ) | ❌ | ✅ Seq-TIS | ✅ Correct | **Fast** | -| `pg_rs` | Pure PG + Geo RS | 2 (rollout, θ) | ❌ | Geo-RS only | ✅ Correct | **Fast** | -| `pg_geo_rs_seq_tis` | Pure PG + Geo RS + Seq IS | 2 (rollout, θ) | ❌ | ✅ Geo-RS-Seq-TIS | ✅ Correct | **Fast** | -| **Bypass PPO Mode** (π_old = π_rollout, no IS correction needed) | -| `ppo_is_bypass` | PPO (rollout as prox) | 2 (rollout, θ) | ✅ | ❌ (not needed) | ✅ Correct | **Fast** | +| **Bypass Mode** (π_old = π_rollout, `loss_type` selects algorithm) | +| `loss_type="ppo_clip"` (default) | PPO (ratio = π_θ/π_rollout) | 2 (rollout, θ) | ✅ | RS mask only (ratio handles IS) | ✅ Correct | **Fast** | +| `loss_type="reinforce"` | Off-policy REINFORCE | 2 (rollout, θ) | ❌ | ✅ (explicit IS weights) | ✅ Correct | **Fast** | +| **Bypass Mode Presets (PPO-clip)** | +| `bypass_ppo_clip` | PPO only | 2 (rollout, θ) | ✅ | - | ✅ Correct | **Fast** | +| `bypass_ppo_clip_geo_rs` | PPO + Geo-RS | 2 (rollout, θ) | ✅ | Geo-RS mask | ✅ Correct | **Fast** | +| **Bypass Mode Presets (REINFORCE)** | +| `bypass_pg_is` | REINFORCE + Seq-TIS | 2 (rollout, θ) 
| ❌ | ✅ Seq-TIS | ✅ Correct | **Fast** | +| `bypass_pg_rs` | REINFORCE + Geo RS | 2 (rollout, θ) | ❌ | Geo-RS only | ✅ Correct | **Fast** | +| `bypass_pg_geo_rs_seq_tis` | REINFORCE + Geo RS + Seq IS | 2 (rollout, θ) | ❌ | ✅ Geo-RS-Seq-TIS | ✅ Correct | **Fast** | | **Decoupled PPO Mode** (IS weights = π_old / π_rollout) | | `decoupled_token_is` | Decoupled PPO | 3 (rollout, old, θ) | ✅ | ✅ Token-TIS | ✅ Correct | Standard | | `decoupled_seq_is` | Decoupled PPO | 3 (rollout, old, θ) | ✅ | ✅ Seq-TIS | ✅ Correct | Standard | @@ -800,7 +810,11 @@ $$ | **Incorrect (for reference)** | | Naive LLM-RL | Incorrect PPO usage | 2 (old, θ) | ✅ | ❌ | ⚠️ Incorrect | Standard | -**Note:** Bypass PPO mode sets π_old = π_rollout, so IS correction is not applicable (the ratio would be 1.0). Use Bypass PG mode if you want IS/RS correction with fast execution. +**Notes:** +- **Bypass mode** sets π_old = π_rollout and uses `loss_type` to select the loss function: + - `"ppo_clip"` (default): PPO clipped ratio (IS handled by ratio = π_θ/π_rollout, no explicit IS weights to avoid double-counting) + - `"reinforce"`: Explicit IS weights applied as $w \cdot \log \pi \cdot A$ +- Both loss types benefit from rejection sampling (RS) which masks out-of-distribution samples ### 5.2 Estimator Hierarchy @@ -816,7 +830,9 @@ These estimators define **how IS weights and rejection masks are computed**. 
The **Note:** Each estimator can be used with either: - **Decoupled PPO** (`bypass_mode=false`): Three policies with PPO clipping -**Bypass Policy Gradient** (`bypass_mode=true`, `use_policy_gradient=true`): Two policies without PPO clipping +**Bypass Mode** (`bypass_mode=true`): Two policies with configurable loss type + - `loss_type="ppo_clip"` (default): PPO clipped objective (IS via ratio, RS mask applied) + - `loss_type="reinforce"`: REINFORCE with explicit IS weights ### 5.3 Method Characteristics by Scenario @@ -832,7 +848,7 @@ These estimators define **how IS weights and rejection masks are computed**. The **Choosing operating mode:** - **Batch size invariance needed**: Use decoupled mode (`bypass_mode=false`) - **Computational efficiency needed**: Use bypass mode (`bypass_mode=true`) to skip `old_log_prob` computation -- **No PPO clipping**: Use bypass + policy gradient (`bypass_mode=true`, `use_policy_gradient=true`) +- **No PPO clipping**: Use bypass mode with `loss_type="reinforce"` ### 5.4 Decoupled Mode vs Bypass Mode diff --git a/docs/ascend_tutorial/ascend_profiling_en.rst b/docs/ascend_tutorial/ascend_profiling_en.rst index 04a77e0cca4..bcd089e21dd 100644 --- a/docs/ascend_tutorial/ascend_profiling_en.rst +++ b/docs/ascend_tutorial/ascend_profiling_en.rst @@ -123,6 +123,13 @@ Visualization Collected data is stored in the user-defined save_path and can be visualized by using the `MindStudio Insight `_ tool. +Additionally, in a Linux environment, the MindStudio Insight tool is provided in the form of a `JupyterLab Plugin `_, offering a more intuitive and highly interactive user interface. The advantages of the JupyterLab plugin are as follows: + +- Seamless integration: Supports running the MindStudio Insight tool directly within the Jupyter environment, eliminating the need to switch platforms or copy data from the server, enabling data to be collected and used immediately.
+- Fast startup: Allows MindStudio Insight to be launched quickly via the JupyterLab command line or graphical interface. +- Smooth operation: In a Linux environment, launching MindStudio Insight through JupyterLab effectively alleviates performance lag compared to the full-package communication mode, significantly improving the user experience. +- Remote access: Supports remotely launching MindStudio Insight. Users can connect to the service via a local browser for direct visual analysis, reducing the difficulty of uploading and downloading data during large-model training or inference. + If the analysis parameter is set to False, offline parsing is required after data collection: .. code:: python diff --git a/docs/ascend_tutorial/ascend_profiling_zh.rst b/docs/ascend_tutorial/ascend_profiling_zh.rst index b4b4896f8b0..00e8565a7e1 100644 --- a/docs/ascend_tutorial/ascend_profiling_zh.rst +++ b/docs/ascend_tutorial/ascend_profiling_zh.rst @@ -110,6 +110,13 @@ Last updated: 08/14/2025. 采集后的数据存放在用户设置的save_path下,可通过 `MindStudio Insight `_ 工具进行可视化。 +另外在Linux环境下,MindStudio Insight工具提供了 `JupyterLab插件 `_ 形态,提供更直观和交互性强的操作界面。JupyterLab插件优势如下: + +- 无缝集成:支持在Jupyter环境中直接运行MindStudio Insight工具,无需切换平台,无需拷贝服务器上的数据,实现数据即采即用。 +- 快速启动:通过JupyterLab的命令行或图形界面,可快速启动MindStudio Insight工具。 +- 运行流畅:在Linux环境下,通过JupyterLab环境启动MindStudio Insight,相较于整包通信,有效解决了运行卡顿问题,操作体验显著提升。 +- 远程访问:支持远程启动MindStudio Insight,可通过本地浏览器远程连接服务直接进行可视化分析,缓解了大模型训练或推理数据上传和下载的困难。 + 如果analysis参数设置为False,采集之后需要进行离线解析: .. code:: python diff --git a/docs/ascend_tutorial/ascend_quick_start.rst b/docs/ascend_tutorial/ascend_quick_start.rst index 8e381e46cfa..bb335178214 100644 --- a/docs/ascend_tutorial/ascend_quick_start.rst +++ b/docs/ascend_tutorial/ascend_quick_start.rst @@ -1,10 +1,17 @@ Ascend Quickstart =================================== -Last updated: 12/4/2025. +Last updated: 12/11/2025.
我们在 verl 上增加对华为昇腾设备的支持。 + +关键更新 +---------------------------------- + +2025/12/11:verl 存量场景目前支持自动识别 NPU 设备类型, GPU 脚本在昇腾上运行,原则上不再需要显式设置 trainer.device=npu 参数,新增特性通过设置 trainer.device 仍可优先使用,逐步适配自动识别能力。 + + 硬件支持 ----------------------------------- @@ -122,6 +129,9 @@ MindSpeed 源码安装指令: # (可选)如希望 shell 关闭,或系统重启后,PYTHONPATH 环境变量仍然生效,建议将它添加到 .bashrc 配置文件中 echo "export PYTHONPATH=$PYTHONPATH:\"$(pwd)/Megatron-LM\"" >> ~/.bashrc + # 安装 mbridge + pip install mbridge + MindSpeed 对应 Megatron-LM 后端使用场景,使用方式如下: 1. 使能 verl worker 模型 ``strategy`` 配置为 ``megatron`` ,例如 ``actor_rollout_ref.actor.strategy=megatron``。 @@ -213,8 +223,7 @@ verl 中昇腾暂不支持生态库如下: trainer.nnodes=1 \ trainer.save_freq=-1 \ trainer.test_freq=5 \ - trainer.total_epochs=1 \ - trainer.device=npu $@ + trainer.total_epochs=1 $@ 算法支持现状 diff --git a/docs/ascend_tutorial/dockerfile_build_guidance.rst b/docs/ascend_tutorial/dockerfile_build_guidance.rst index ce4584aaed5..c27a3090b6b 100644 --- a/docs/ascend_tutorial/dockerfile_build_guidance.rst +++ b/docs/ascend_tutorial/dockerfile_build_guidance.rst @@ -33,6 +33,7 @@ vLLM-ascend 0.11.0rc1 Megatron-LM v0.12.1 MindSpeed (f2b0977e) triton-ascend 3.2.0rc4 +mbridge latest version ================= ============ @@ -57,7 +58,7 @@ A3 8.3.RC1 `Dockerfile.ascend_8.3.rc1_a3 &1 | tee ${LOG_PATH} \ No newline at end of file + trainer.val_before_train=False 2>&1 | tee ${LOG_PATH} \ No newline at end of file diff --git a/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh b/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh index 5d592410d5e..b2d259b4330 100644 --- a/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh +++ b/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh @@ -67,5 +67,4 @@ python3 -m verl.trainer.main_ppo \ trainer.total_epochs=5 \ trainer.default_local_dir="${CKPTS_DIR}" \ actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - 
trainer.device=npu $@ \ No newline at end of file + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} $@ \ No newline at end of file diff --git a/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh b/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh index 3684e8a2d48..9076360bb6d 100644 --- a/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh +++ b/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh @@ -67,5 +67,4 @@ python3 -m verl.trainer.main_ppo \ trainer.total_epochs=5 \ trainer.default_local_dir="${CKPTS_DIR}" \ actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ - actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ - trainer.device=npu $@ \ No newline at end of file + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} $@ \ No newline at end of file diff --git a/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh b/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh index 77805cdfb3b..3a92171b6e2 100644 --- a/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh +++ b/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh @@ -3,9 +3,11 @@ set -xeuo pipefail # Need to install Megatron-Bridge # NOTE: Make sure you use Megatron-Bridge later than 0.2.0 -# (after https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/36302b7ca1305f0690e17cf4e4019ac822746872) -# for MoE LoRA When you want to set ETP and ETP!=TP. -# https://github.com/NVIDIA-NeMo/Megatron-Bridge/issues/1363 +# (Recommend https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/a489bed3a2410ed9b000ec13a3c90176fec7d99c or later) +# for proper MoE LoRA support. 
+ +# For Megatron communication/computation overlapping +export CUDA_DEVICE_MAX_CONNECTIONS=1 ########################### Quick Config ########################### @@ -41,9 +43,17 @@ DATA=( MODEL=( actor_rollout_ref.model.path=Qwen/Qwen3-30B-A3B-Instruct-2507 - actor_rollout_ref.model.lora.rank=16 - actor_rollout_ref.model.lora.alpha=32 actor_rollout_ref.model.use_fused_kernels=True + actor_rollout_ref.model.lora.rank=32 + actor_rollout_ref.model.lora.alpha=64 + actor_rollout_ref.model.lora.lora_A_init_method=kaiming + # # Optional: Use canonical LoRA + # actor_rollout_ref.model.lora.type="canonical_lora" + # actor_rollout_ref.model.lora.target_modules='["linear_q","linear_k","linear_v","linear_proj","linear_fc1_up","linear_fc1_gate","linear_fc2"]' + + # # Optional: Add dropout to LoRA layers + # actor_rollout_ref.model.lora.dropout=0.05 + # actor_rollout_ref.model.lora.dropout_position=pre ) ACTOR=( diff --git a/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh b/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh index 2944de647c4..aa2b3e4a118 100644 --- a/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh +++ b/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh @@ -25,10 +25,14 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ critic.ppo_micro_batch_size_per_gpu=4 \ reward_model.enable=True \ - reward_model.megatron.tensor_model_parallel_size=4 \ reward_model.model.path=deepseek-ai/deepseek-llm-7b-chat \ - reward_model.micro_batch_size_per_gpu=4 \ - reward_model.param_offload=False \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=4 \ + reward_model.rollout.prompt_length=256 \ + reward_model.rollout.response_length=128 \ + reward_model.num_workers=8 \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff 
--git a/examples/ppo_trainer/run_qwen2-7b_rm.sh b/examples/ppo_trainer/run_qwen2-7b_rm.sh index 57b7bd7524b..33caabf40d8 100644 --- a/examples/ppo_trainer/run_qwen2-7b_rm.sh +++ b/examples/ppo_trainer/run_qwen2-7b_rm.sh @@ -55,9 +55,13 @@ python3 -m verl.trainer.main_ppo \ critic.model.fsdp_config.optimizer_offload=False \ reward_model.enable=True \ reward_model.model.path="$HOME/models/FsfairX-LLaMA3-RM-v0.1" \ - reward_model.model.use_remove_padding=True \ - reward_model.model.fsdp_config.param_offload=True \ - reward_model.micro_batch_size_per_gpu=32 \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=2048 \ + reward_model.rollout.response_length=1024 \ + reward_model.num_workers=8 \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_legacy.sh b/examples/ppo_trainer/run_qwen2-7b_rm_legacy.sh new file mode 100644 index 00000000000..99574a33c96 --- /dev/null +++ b/examples/ppo_trainer/run_qwen2-7b_rm_legacy.sh @@ -0,0 +1,63 @@ +# download datasets and models +# python3 examples/data_preprocess/gsm8k.py +# python3 examples/data_preprocess/math_dataset.py +# huggingface-cli download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B +# huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ 
+ data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.optim.lr_warmup_steps_ratio=0.05 \ + critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + reward_model.enable=True \ + reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \ + reward_model.use_reward_loop=False \ + reward_model.model.use_remove_padding=True \ + reward_model.model.fsdp_config.param_offload=True \ + reward_model.micro_batch_size_per_gpu=32 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_test_qwen25_rm' \ + trainer.val_before_train=True \ + trainer.experiment_name='legacy_fsdp_reward_model' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + 
trainer.total_epochs=15 $@ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh b/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh new file mode 100644 index 00000000000..9641fdcb907 --- /dev/null +++ b/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh @@ -0,0 +1,69 @@ +# download datasets and models +# python3 examples/data_preprocess/gsm8k.py +# python3 examples/data_preprocess/math_dataset.py +# huggingface-cli download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B +# huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + 
actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.optim.lr_warmup_steps_ratio=0.05 \ + critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + reward_model.enable=True \ + reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.prompt_length=4096 \ + reward_model.rollout.response_length=4096 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.num_workers=8 \ + reward_model.model.use_remove_padding=True \ + reward_model.model.fsdp_config.param_offload=True \ + reward_model.micro_batch_size_per_gpu=32 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_test_qwen25_rm' \ + trainer.val_before_train=False \ + trainer.experiment_name='reward_loop_colocate_reward_model' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 $@ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh index e0ddc01e75e..902bcb8ede2 100644 --- a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh +++ b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh @@ -42,11 +42,13 @@ python3 -m verl.trainer.main_ppo \ critic.model.fsdp_config.optimizer_offload=False \ reward_model.enable=True \ reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1\ - reward_model.model.use_remove_padding=True \ - reward_model.model.fsdp_config.param_offload=True \ - reward_model.micro_batch_size_per_gpu=32 \ - 
reward_model.use_dynamic_bsz=True \ - reward_model.forward_max_token_len_per_gpu=98304 \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=8192 \ + reward_model.rollout.response_length=4096 \ + reward_model.num_workers=8 \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh index 7e0a335efe2..fa2c154f3a1 100644 --- a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh +++ b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh @@ -45,12 +45,14 @@ python3 -m verl.trainer.main_ppo \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ reward_model.enable=True \ - reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1\ - reward_model.model.use_remove_padding=True \ - reward_model.model.fsdp_config.param_offload=True \ - reward_model.micro_batch_size_per_gpu=32 \ - reward_model.use_dynamic_bsz=True \ - reward_model.forward_max_token_len_per_gpu=98304 \ + reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=8192 \ + reward_model.rollout.response_length=4096 \ + reward_model.num_workers=8 \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh index 0acfe43e862..5ccfe1b3cd5 100644 --- a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh +++ 
b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh @@ -55,14 +55,13 @@ python3 -m verl.trainer.main_ppo \ critic.profiler.all_ranks=$PROFILE_RANKS_ALL \ reward_model.enable=True \ reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1\ - reward_model.model.use_remove_padding=True \ - reward_model.model.fsdp_config.param_offload=True \ - reward_model.micro_batch_size_per_gpu=32 \ - reward_model.use_dynamic_bsz=True \ - reward_model.forward_max_token_len_per_gpu=98304 \ - reward_model.profiler.enable=True \ - reward_model.profiler.ranks=$PROFILE_RANKS \ - reward_model.profiler.all_ranks=$PROFILE_RANKS_ALL \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=8192 \ + reward_model.rollout.response_length=4096 \ + reward_model.num_workers=8 \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_qwen2.5-3b_rm_legacy.sh b/examples/ppo_trainer/run_qwen2.5-3b_rm_legacy.sh new file mode 100644 index 00000000000..99574a33c96 --- /dev/null +++ b/examples/ppo_trainer/run_qwen2.5-3b_rm_legacy.sh @@ -0,0 +1,63 @@ +# download datasets and models +# python3 examples/data_preprocess/gsm8k.py +# python3 examples/data_preprocess/math_dataset.py +# huggingface-cli download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B +# huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + 
data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.optim.lr_warmup_steps_ratio=0.05 \ + critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + reward_model.enable=True \ + reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \ + reward_model.use_reward_loop=False \ + reward_model.model.use_remove_padding=True \ + reward_model.model.fsdp_config.param_offload=True \ + reward_model.micro_batch_size_per_gpu=32 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_test_qwen25_rm' \ + trainer.val_before_train=True \ + trainer.experiment_name='legacy_fsdp_reward_model' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + 
trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 $@ diff --git a/examples/ppo_trainer/run_qwen2.5-3b_rm_reward_loop_colocate.sh b/examples/ppo_trainer/run_qwen2.5-3b_rm_reward_loop_colocate.sh new file mode 100644 index 00000000000..d9d66f6f695 --- /dev/null +++ b/examples/ppo_trainer/run_qwen2.5-3b_rm_reward_loop_colocate.sh @@ -0,0 +1,66 @@ +# download datasets and models +# python3 examples/data_preprocess/gsm8k.py +# python3 examples/data_preprocess/math_dataset.py +# huggingface-cli download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B +# huggingface-cli download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct + +gsm8k_train_path=$HOME/data/gsm8k/train.parquet +gsm8k_test_path=$HOME/data/gsm8k/test.parquet +math_train_path=$HOME/data/math/train.parquet +math_test_path=$HOME/data/math/test.parquet + +train_files="['$gsm8k_train_path', '$math_train_path']" +test_files="['$gsm8k_test_path', '$math_test_path']" + +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=gae \ + data.train_files="$train_files" \ + data.val_files="$test_files" \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.return_raw_chat=True \ + actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.use_kl_loss=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + 
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.optim.lr_warmup_steps_ratio=0.05 \ + critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + reward_model.enable=True \ + reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=vllm \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=4096 \ + reward_model.rollout.response_length=4096 \ + reward_model.num_workers=8 \ + algorithm.use_kl_in_reward=False \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_test_qwen25_rm' \ + trainer.val_before_train=False \ + trainer.experiment_name='reward_loop_colocate_reward_model' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 $@ diff --git a/examples/ppo_trainer/run_qwen3-8b_npu.sh b/examples/ppo_trainer/run_qwen3-8b_npu.sh index 40fb751e62c..a0ada0eb388 100644 --- a/examples/ppo_trainer/run_qwen3-8b_npu.sh +++ b/examples/ppo_trainer/run_qwen3-8b_npu.sh @@ -49,7 +49,6 @@ python3 -m verl.trainer.main_ppo \ trainer.save_freq=20 \ trainer.test_freq=-1 \ trainer.val_before_train=False \ - trainer.device=npu \ trainer.max_actor_ckpt_to_keep=1 \ trainer.max_critic_ckpt_to_keep=1 \ trainer.total_training_steps=100 $@ \ No newline at end of file diff --git a/examples/rollout_correction/run_with_rollout_corr.sh b/examples/rollout_correction/run_with_rollout_corr.sh index 15a004eefa8..e6b1061a2b3 100755 --- 
a/examples/rollout_correction/run_with_rollout_corr.sh +++ b/examples/rollout_correction/run_with_rollout_corr.sh @@ -25,9 +25,9 @@ rollout_rs_threshold_lower="null" # RS lower threshold # Veto mechanism (optional, independent of IS/RS) rollout_token_veto_threshold="null" # Per-token veto threshold (null to disable) -# Policy Gradient loss mode (bypass mode with policy gradient loss, no PPO clipping) -bypass_mode="true" # Required for policy gradient mode -use_policy_gradient="true" # Use policy gradient loss (works with IS/RS/both) +# Bypass mode with REINFORCE loss (no PPO clipping) +bypass_mode="true" # Skip old_log_prob computation +loss_type="reinforce" # REINFORCE with explicit IS weights (alternative: "ppo_clip") # ============================================================================== # Model and Data Configuration @@ -76,7 +76,7 @@ python3 -m verl.trainer.main_ppo \ algorithm.rollout_correction.rollout_rs_threshold_lower=${rollout_rs_threshold_lower} \ algorithm.rollout_correction.rollout_token_veto_threshold=${rollout_token_veto_threshold} \ algorithm.rollout_correction.bypass_mode=${bypass_mode} \ - algorithm.rollout_correction.use_policy_gradient=${use_policy_gradient} \ + algorithm.rollout_correction.loss_type=${loss_type} \ actor_rollout_ref.model.path="${MODEL_PATH}" \ actor_rollout_ref.actor.optim.lr=${learning_rate} \ actor_rollout_ref.actor.ppo_mini_batch_size=${ppo_mini_batch_size} \ @@ -95,7 +95,7 @@ echo " - Algorithm: RLOO (REINFORCE Leave-One-Out)" echo " - Advantage estimator: ${adv_estimator}" echo " - IS mode: ${rollout_is} (self-normalized: ${rollout_is_batch_normalize})" echo " - IS threshold: ${rollout_is_threshold}" -echo " - Policy gradient mode: ${use_policy_gradient} (bypass: ${bypass_mode})" +echo " - Bypass mode: ${bypass_mode}, loss_type: ${loss_type}" echo "" echo "Monitor these key metrics in wandb:" echo " - rollout_corr/rollout_is_mean (should be ~1.0 before batch norm)" diff --git 
a/examples/sft/gsm8k/run_qwen3_8b_sft_peft_sp2_npu.sh b/examples/sft/gsm8k/run_qwen3_8b_sft_peft_sp2_npu.sh index 720e2340838..7de7ebd67e4 100644 --- a/examples/sft/gsm8k/run_qwen3_8b_sft_peft_sp2_npu.sh +++ b/examples/sft/gsm8k/run_qwen3_8b_sft_peft_sp2_npu.sh @@ -32,5 +32,4 @@ torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ model.target_modules=all-linear \ model.strategy=fsdp \ ulysses_sequence_parallel_size=2 \ - use_remove_padding=true \ - trainer.device=npu + use_remove_padding=true diff --git a/examples/sft/vlm/run_qwen3_vl_2b.sh b/examples/sft/vlm/run_qwen3_vl_2b.sh new file mode 100644 index 00000000000..28c21ffa049 --- /dev/null +++ b/examples/sft/vlm/run_qwen3_vl_2b.sh @@ -0,0 +1,100 @@ +#!/usr/bin/env bash +# python examples/data_preprocess/pokemon.py +set -xeuo pipefail + +HDFS_ROOT=${HDFS_ROOT:-$PWD} +DATA_ROOT=${DATA_ROOT:-$PWD} + +ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"} + +TRAIN_FILES=${HOME}/data/pokemon-gpt4o-captions/train.parquet + +backend=${BACKEND:-fsdp} + +project_name=verl_sft_test + +RESUME_MODE=auto +MODEL_ID=${HDFS_ROOT}/model/Qwen3-VL-2B-Instruct +# MODEL_ID=${HDFS_ROOT}/model/Qwen3-VL-30B-A3B-Instruct + +SP_SIZE=${SP_SIZE:-2} +FSDP_SIZE=${FSDP_SIZE:--1} +FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp2"} + +TP_SIZE=${TP_SIZE:-2} +PP_SIZE=${PP_SIZE:-2} +VPP_SIZE=${VPP_SIZE:-null} +CP_SIZE=${CP_SIZE:-1} + +PAD_MODE=${PAD_MODE:-no_padding} + +USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True} + +FSDP_ENGINE_CONFIG="\ + engine=${backend} \ + optim=${backend} \ + optim.lr=2e-5 \ + optim.lr_warmup_steps_ratio=0.01 \ + optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.min_lr_ratio=0.1 \ + optim.warmup_style=cosine \ + engine.ulysses_sequence_parallel_size=${SP_SIZE} \ + engine.strategy=${FSDP_STRATEGY} \ + engine.fsdp_size=${FSDP_SIZE}" + + +MEGATRON_ENGINE_CONFIG="\ + engine=${backend} \ + optim=${backend} \ + optim.lr=2e-5 \ + optim.lr_warmup_steps_ratio=0.01 \ + 
optim.weight_decay=0.1 \ + optim.betas="[0.9,0.95]" \ + optim.clip_grad=1.0 \ + optim.lr_warmup_init=0 \ + optim.lr_decay_style=cosine \ + optim.min_lr=2e-6 \ + engine.tensor_model_parallel_size=${TP_SIZE} \ + engine.pipeline_model_parallel_size=${PP_SIZE} \ + engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \ + engine.context_parallel_size=${CP_SIZE} \ + engine.use_mbridge=True \ + engine.vanilla_mbridge=True" + +if [ "$backend" = "fsdp" ]; then + ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" + echo "Using fsdp engine" + exp_name=pokemon-qwen3-2b-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE}-fsdp-1202a1 +else + ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG" + echo "Using megatron engine" + exp_name=pokemon-qwen3-2b-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE}-megatron-1202a1 +fi + +CKPT_HOME=${CKPT_HOME:-$HOME/open_verl/sft/${project_name}/${exp_name}} +mkdir -p "${CKPT_HOME}" + +torchrun --standalone --nnodes=1 --nproc-per-node=${NUM_TRAINERS:-8} \ + ${ENTRYPOINT} \ + data.train_files="${TRAIN_FILES}" \ + data.train_batch_size=96 \ + data.max_length=2048 \ + data.pad_mode=${PAD_MODE} \ + data.truncation=error \ + data.use_dynamic_bsz=True \ + data.max_token_len_per_gpu=65536 \ + model.path=$MODEL_ID \ + model.use_remove_padding=${USE_REMOVE_PADDING} \ + ${ENGINE_CONFIG} \ + trainer.test_freq=-1 \ + trainer.save_freq=4000 \ + trainer.logger=['console','wandb'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.total_epochs=10 \ + trainer.default_local_dir="${CKPT_HOME}" \ + trainer.resume_mode=${RESUME_MODE} \ + trainer.max_ckpt_to_keep=5 \ + checkpoint.save_contents=[model,optimizer,extra] \ No newline at end of file diff --git a/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_vllm_fsdp.sh b/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_vllm_fsdp.sh index b1be3bf56cb..cf5e065097f 100644 --- a/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_vllm_fsdp.sh +++ 
b/examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_multiturn_vllm_fsdp.sh @@ -42,7 +42,6 @@ python3 -m verl.trainer.main_ppo \ trainer.critic_warmup=0 \ trainer.project_name='gsm8k_async_rl' \ trainer.experiment_name='qwen2.5-3b_function_rm-gsm8k-sgl-multi-w-tool-verify-n16' \ - trainer.device=npu \ trainer.n_gpus_per_node=16 \ trainer.nnodes=1 \ trainer.save_freq=-1 \ diff --git a/recipe/dapo/main_dapo.py b/recipe/dapo/main_dapo.py index 303c8edbf4b..870ce30b54f 100644 --- a/recipe/dapo/main_dapo.py +++ b/recipe/dapo/main_dapo.py @@ -22,23 +22,25 @@ import ray from omegaconf import OmegaConf +from verl.trainer.constants_ppo import get_ppo_ray_runtime_env from verl.trainer.ppo.reward import load_reward_manager -from verl.utils.device import is_cuda_available +from verl.utils.device import auto_set_ascend_device_name, is_cuda_available from .dapo_ray_trainer import RayDAPOTrainer @hydra.main(config_path="config", config_name="dapo_trainer", version_base=None) def main(config): + # Automatically set `config.trainer.device = npu` when running on Ascend NPU. 
+ auto_set_ascend_device_name(config) + run_ppo(config) def run_ppo(config) -> None: if not ray.is_initialized(): # this is for local ray cluster - default_runtime_env = { - "env_vars": {"TOKENIZERS_PARALLELISM": "true", "NCCL_DEBUG": "WARN", "VLLM_LOGGING_LEVEL": "WARN"} - } + default_runtime_env = get_ppo_ray_runtime_env() ray_init_kwargs = config.ray_kwargs.get("ray_init", {}) runtime_env_kwargs = ray_init_kwargs.get("runtime_env", {}) runtime_env = OmegaConf.merge(default_runtime_env, runtime_env_kwargs) diff --git a/recipe/dapo/run_dapo_qwen2.5_32b_npu.sh b/recipe/dapo/run_dapo_qwen2.5_32b_npu.sh index 0e99b5fa6fd..bce3ab8eca6 100644 --- a/recipe/dapo/run_dapo_qwen2.5_32b_npu.sh +++ b/recipe/dapo/run_dapo_qwen2.5_32b_npu.sh @@ -135,7 +135,6 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ trainer.save_freq=20 \ trainer.total_epochs=1 \ trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.device=npu \ trainer.resume_mode=auto \ actor_rollout_ref.actor.fsdp_config.forward_prefetch=True \ actor_rollout_ref.ref.fsdp_config.forward_prefetch=True \ diff --git a/recipe/dapo/run_dapo_qwen2.5_7b_npu.sh b/recipe/dapo/run_dapo_qwen2.5_7b_npu.sh index bd6b3689b23..834ab21fa6d 100644 --- a/recipe/dapo/run_dapo_qwen2.5_7b_npu.sh +++ b/recipe/dapo/run_dapo_qwen2.5_7b_npu.sh @@ -133,7 +133,6 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ trainer.save_freq=20 \ trainer.total_epochs=1 \ trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.device=npu \ trainer.resume_mode=auto \ actor_rollout_ref.actor.entropy_checkpointing=True \ actor_rollout_ref.ref.entropy_checkpointing=True \ diff --git a/recipe/dapo/run_dapo_qwen3_14b_base_npu.sh b/recipe/dapo/run_dapo_qwen3_14b_base_npu.sh index 3c8a9e9d5a6..9e0fdae374c 100644 --- a/recipe/dapo/run_dapo_qwen3_14b_base_npu.sh +++ b/recipe/dapo/run_dapo_qwen3_14b_base_npu.sh @@ -136,5 +136,4 @@ ray job submit --runtime-env="${RUNTIME_ENV}" \ actor_rollout_ref.actor.entropy_checkpointing=True \ 
actor_rollout_ref.ref.entropy_checkpointing=True \ actor_rollout_ref.actor.fsdp_config.forward_prefetch=True \ - actor_rollout_ref.ref.fsdp_config.forward_prefetch=True \ - trainer.device=npu + actor_rollout_ref.ref.fsdp_config.forward_prefetch=True diff --git a/recipe/dapo/run_dapo_qwen3_8b_base_npu.sh b/recipe/dapo/run_dapo_qwen3_8b_base_npu.sh index 29c7fd66b6c..9c34fa7e06c 100644 --- a/recipe/dapo/run_dapo_qwen3_8b_base_npu.sh +++ b/recipe/dapo/run_dapo_qwen3_8b_base_npu.sh @@ -135,5 +135,4 @@ ray job submit --runtime-env="${RUNTIME_ENV}" \ actor_rollout_ref.actor.entropy_checkpointing=True \ actor_rollout_ref.ref.entropy_checkpointing=True \ actor_rollout_ref.actor.fsdp_config.forward_prefetch=True \ - actor_rollout_ref.ref.fsdp_config.forward_prefetch=True \ - trainer.device=npu + actor_rollout_ref.ref.fsdp_config.forward_prefetch=True diff --git a/recipe/dapo/run_dapo_qwen3_moe_30b_base_fsdp_npu.sh b/recipe/dapo/run_dapo_qwen3_moe_30b_base_fsdp_npu.sh index d399ddfe8b7..52fb0b4e6a5 100644 --- a/recipe/dapo/run_dapo_qwen3_moe_30b_base_fsdp_npu.sh +++ b/recipe/dapo/run_dapo_qwen3_moe_30b_base_fsdp_npu.sh @@ -138,7 +138,6 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ trainer.test_freq=5 \ trainer.save_freq=-1 \ trainer.total_epochs=1 \ - trainer.device="npu" \ actor_rollout_ref.actor.use_torch_compile=False \ actor_rollout_ref.ref.use_torch_compile=False diff --git a/recipe/dapo/run_dapo_qwen3_moe_30b_megatron_npu.sh b/recipe/dapo/run_dapo_qwen3_moe_30b_megatron_npu.sh index 9e8d21d8890..24624275929 100644 --- a/recipe/dapo/run_dapo_qwen3_moe_30b_megatron_npu.sh +++ b/recipe/dapo/run_dapo_qwen3_moe_30b_megatron_npu.sh @@ -160,7 +160,6 @@ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ trainer.save_freq=-1 \ trainer.total_epochs=1 \ trainer.default_local_dir="${CKPTS_DIR}" \ - trainer.device="npu" \ actor_rollout_ref.nccl_timeout=14400 \ actor_rollout_ref.actor.use_torch_compile=False \ actor_rollout_ref.ref.use_torch_compile=False \ 
diff --git a/recipe/fapo/README.md b/recipe/fapo/README.md index 485072c409f..4401bbc4f7a 100644 --- a/recipe/fapo/README.md +++ b/recipe/fapo/README.md @@ -78,3 +78,12 @@ bash recipe/fapo/run_fapo_32b.sh # 32b fapo model We implement RewardLoop to enable efficient and flexible reward computation. The core implementation can be found in `verl/experimental/reward/`. Refer to [this official document](https://verl.readthedocs.io/en/latest/advance/reward_loop.html) for more implementation details. + +```bibtex +@article{ding2025fapo, + title={FAPO: Flawed-Aware Policy Optimization for Efficient and Reliable Reasoning}, + author={Ding, Yuyang and Zhang, Chi and Li, Juntao and Lin, Haibin and Liu, Xin and Zhang, Min}, + journal={arXiv preprint arXiv:2510.22543}, + year={2025} +} +``` \ No newline at end of file diff --git a/recipe/fapo/run_baseline_32b.sh b/recipe/fapo/run_baseline_32b.sh index 3bb14bed7e3..f788066b5c5 100644 --- a/recipe/fapo/run_baseline_32b.sh +++ b/recipe/fapo/run_baseline_32b.sh @@ -53,15 +53,10 @@ offload=True gen_tp=4 fsdp_size=32 -PROJECT_DIR="$(pwd)" -CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config" - ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --address "${RAY_ADDRESS}" \ --working-dir "${WORKING_DIR}" \ -- python3 -m verl.trainer.main_ppo \ - --config-path $CONFIG_PATH \ - --config-name rm_config.yaml \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git a/recipe/fapo/run_baseline_7b.sh b/recipe/fapo/run_baseline_7b.sh index b39014f1e19..77605b1bbac 100644 --- a/recipe/fapo/run_baseline_7b.sh +++ b/recipe/fapo/run_baseline_7b.sh @@ -54,15 +54,10 @@ offload=True gen_tp=1 fsdp_size=8 -PROJECT_DIR="$(pwd)" -CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config" - ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --address "${RAY_ADDRESS}" \ --working-dir "${WORKING_DIR}" \ -- python3 -m verl.trainer.main_ppo \ - --config-path $CONFIG_PATH \ - --config-name rm_config.yaml \ 
data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git a/recipe/fapo/run_fapo_32b.sh b/recipe/fapo/run_fapo_32b.sh index bb442b76e4e..f458070c4a7 100644 --- a/recipe/fapo/run_fapo_32b.sh +++ b/recipe/fapo/run_fapo_32b.sh @@ -55,15 +55,10 @@ offload=True gen_tp=4 fsdp_size=32 -PROJECT_DIR="$(pwd)" -CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config" - ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --address "${RAY_ADDRESS}" \ --working-dir "${WORKING_DIR}" \ -- python3 -m verl.trainer.main_ppo \ - --config-path $CONFIG_PATH \ - --config-name rm_config.yaml \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git a/recipe/fapo/run_fapo_32b_remote.sh b/recipe/fapo/run_fapo_32b_remote.sh index 748d1bbf0ed..8833f109138 100644 --- a/recipe/fapo/run_fapo_32b_remote.sh +++ b/recipe/fapo/run_fapo_32b_remote.sh @@ -53,15 +53,10 @@ offload=True gen_tp=4 fsdp_size=32 -PROJECT_DIR="$(pwd)" -CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config" - ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --address "${RAY_ADDRESS}" \ --working-dir "${WORKING_DIR}" \ -- python3 -m verl.trainer.main_ppo \ - --config-path $CONFIG_PATH \ - --config-name rm_config.yaml \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git a/recipe/fapo/run_fapo_7b.sh b/recipe/fapo/run_fapo_7b.sh index 046b2b26873..96884d94e9b 100644 --- a/recipe/fapo/run_fapo_7b.sh +++ b/recipe/fapo/run_fapo_7b.sh @@ -56,15 +56,10 @@ offload=True gen_tp=1 fsdp_size=8 -PROJECT_DIR="$(pwd)" -CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config" - ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --address "${RAY_ADDRESS}" \ --working-dir "${WORKING_DIR}" \ -- python3 -m verl.trainer.main_ppo \ - --config-path $CONFIG_PATH \ - --config-name rm_config.yaml \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git 
a/recipe/fapo/run_fapo_7b_remote.sh b/recipe/fapo/run_fapo_7b_remote.sh index 1bd757bfc99..663e10c385b 100644 --- a/recipe/fapo/run_fapo_7b_remote.sh +++ b/recipe/fapo/run_fapo_7b_remote.sh @@ -54,15 +54,10 @@ offload=True gen_tp=1 fsdp_size=8 -PROJECT_DIR="$(pwd)" -CONFIG_PATH="$PROJECT_DIR/recipe/fapo/config" - ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ --address "${RAY_ADDRESS}" \ --working-dir "${WORKING_DIR}" \ -- python3 -m verl.trainer.main_ppo \ - --config-path $CONFIG_PATH \ - --config-name rm_config.yaml \ data.train_files="${TRAIN_FILE}" \ data.val_files="${TEST_FILE}" \ data.prompt_key=prompt \ diff --git a/recipe/fully_async_policy/agent_loop/agent_loop.py b/recipe/fully_async_policy/agent_loop/agent_loop.py index 676aa524c3d..d486579596f 100644 --- a/recipe/fully_async_policy/agent_loop/agent_loop.py +++ b/recipe/fully_async_policy/agent_loop/agent_loop.py @@ -27,8 +27,8 @@ AgentLoopOutput, AgentLoopWorkerBase, AsyncLLMServerManager, + DictConfigWrap, _agent_loop_registry, - _DummyConfig, get_trajectory_info, ) from verl.experimental.agent_loop.prometheus_utils import update_prometheus_config @@ -182,7 +182,7 @@ async def _partial_run_agent_loop( agent_loop_config = _agent_loop_registry[agent_name] agent_loop = hydra.utils.instantiate( config=agent_loop_config, - trainer_config=_DummyConfig(config=self.config), + trainer_config=DictConfigWrap(config=self.config), server_manager=self.server_manager, tokenizer=self.tokenizer, processor=self.processor, diff --git a/recipe/fully_async_policy/fully_async_rollouter.py b/recipe/fully_async_policy/fully_async_rollouter.py index c185e34c795..95fff9c478f 100644 --- a/recipe/fully_async_policy/fully_async_rollouter.py +++ b/recipe/fully_async_policy/fully_async_rollouter.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ import asyncio import os import time diff --git a/recipe/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh b/recipe/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh new file mode 100644 index 00000000000..c061ad2b1ee --- /dev/null +++ b/recipe/fully_async_policy/shell/dapo_30b_a3b_base_math_fsdp.sh @@ -0,0 +1,191 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +project_name='DAPO-Qwen3-30B-A3B-Base-Async' +exp_name='Fsdp2-tp4sp4' + +# Ray +RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +WORKING_DIR=${WORKING_DIR:-"${PWD}"} +RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} +# Paths +DATA_PATH=${RAY_DATA_HOME:-"${HOME}/verl"} +DATA_PATH=${DATA_PATH:-"/mnt/bn/${BYTENAS}"} +# very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface +MODEL_PATH=${MODEL_PATH:-"${DATA_PATH}/shared/models/Qwen3-30B-A3B-Base"} +CKPTS_DIR=${CKPTS_DIR:-"${DATA_PATH}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${DATA_PATH}/shared/data/dapo-math/dapo-math-17k.parquet"} +TEST_FILE=${TEST_FILE:-"${DATA_PATH}/shared/data/dapo-math/aime-2024.parquet"} + + +rollout_mode="async" +rollout_name="vllm" # sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 20)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 4)) +overlong_penalty_factor=1.0 + +# Training parameters +loss_agg_mode="token-mean" +enable_filter_groups=True +filter_groups_metric=acc +max_num_gen_batches=10 + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + + +NNODES=${NNODES:-4} +NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} + +# Fully async specific 
parameters +n_gpus_rollout=8 +n_gpus_training=8 +n_nodes_rollout=2 +n_nodes_train=2 # $((NNODES - n_nodes_rollout)) + +train_bsz=512 +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +total_rollout_steps=$(((train_bsz * 400))) +test_freq=25 +staleness_threshold=0.6 # 0 0.3 1 +require_batches=1 +total_train_gpus=$((n_gpus_training * n_nodes_train)) +total_rollout_gpus=$((n_gpus_rollout * n_nodes_rollout)) +trigger_parameter_sync_step=$((train_bsz / ( train_prompt_mini_bsz * require_batches))) # 8 16 32 +partial_rollout=True +enforce_eager=False +nccl_timeout=72000 +enable_sleep_mode=False + +# Performance Related Parameter +sp_size=4 +use_dynamic_bsz=True +actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) +infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) +ref_offload=True +actor_offload=False +gen_tp=4 +fsdp_size=-1 + + +ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ + --working-dir "${WORKING_DIR}" \ + --address "${RAY_ADDRESS}" \ + -- python3 -m recipe.fully_async_policy.fully_async_main \ + --config-path=config \ + --config-name='fully_async_dapo_trainer.yaml' \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + actor_rollout_ref.actor.strategy=fsdp \ + critic.strategy=fsdp \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + actor_rollout_ref.nccl_timeout=${nccl_timeout} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ 
+ actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.50 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + +actor_rollout_ref.rollout.enable_sleep_mode=${enable_sleep_mode} \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.enforce_eager=${enforce_eager} \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + 
actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + reward_model.reward_manager=dapo \ + reward_model.overlong_buffer.enable=${enable_overlong_buffer} \ + reward_model.overlong_buffer.len=${overlong_buffer_len} \ + reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console','wandb'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}-i${total_rollout_gpus}_t${total_train_gpus}_s${staleness_threshold}" \ + trainer.val_before_train=True \ + trainer.test_freq="${test_freq}" \ + trainer.save_freq=-1 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + trainer.resume_mode=auto \ + trainer.nnodes="${n_nodes_train}" \ + trainer.n_gpus_per_node="${n_gpus_training}" \ + rollout.nnodes="${n_nodes_rollout}" \ + rollout.n_gpus_per_node="${n_gpus_rollout}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + rollout.test_freq=${test_freq} \ + rollout.total_epochs=10 \ + async_training.require_batches=${require_batches} \ + async_training.staleness_threshold="${staleness_threshold}" \ + 
async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" \ + async_training.use_rollout_log_probs=True diff --git a/recipe/one_step_off_policy/README.md b/recipe/one_step_off_policy/README.md index c698e2cf178..2cb0b9b85e8 100644 --- a/recipe/one_step_off_policy/README.md +++ b/recipe/one_step_off_policy/README.md @@ -215,7 +215,7 @@ def sync_rollout_weights(self): ### PPO Correctness To ensure the correctness of the PPO algorithm, we use rollout log_probs for PPO importance sampling. For the related algorithm details, please refer to: https://verl.readthedocs.io/en/latest/algo/rollout_corr_math.html -The default mode is ppo_is_bypass, but other modification strategies can also be explored. +The default mode is `bypass_ppo_clip`, but other modification strategies can also be explored. ### AgentLoop In the current implementation, we no longer provide SPMD model rollout mode. @@ -297,9 +297,6 @@ python3 -m recipe.one_step_off_policy.async_main_ppo \ > - When `trainer.n_gpus_per_node + rollout.n_gpus_per_node > physical_gpus_per_node`, > the required node count is `trainer.nnodes + rollout.nnodes` -3. 
**Ascend NPU Configuration** - If you are using Ascend NPU devices, add the following parameter: - - `trainer.device=npu` ## Functional Support diff --git a/recipe/one_step_off_policy/main_ppo.py b/recipe/one_step_off_policy/main_ppo.py index fa62f7c7500..c24b4d01774 100644 --- a/recipe/one_step_off_policy/main_ppo.py +++ b/recipe/one_step_off_policy/main_ppo.py @@ -30,6 +30,7 @@ from verl.trainer.ppo.reward import load_reward_manager from verl.trainer.ppo.utils import Role, need_reference_policy from verl.utils.config import validate_config +from verl.utils.device import auto_set_ascend_device_name def create_resource_pool_manager(config, roles: list) -> ResourcePoolManager: @@ -222,6 +223,10 @@ def main(config): from verl.trainer.main_ppo import run_ppo start_time = time() + + # Automatically set `config.trainer.device = npu` when running on Ascend NPU. + auto_set_ascend_device_name(config) + run_ppo(config, task_runner_class=OneStepTaskRunner) print(f"total time: {time() - start_time:.2f} seconds") diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py index 76415016743..c3890f61bb9 100644 --- a/recipe/one_step_off_policy/ray_trainer.py +++ b/recipe/one_step_off_policy/ray_trainer.py @@ -531,9 +531,9 @@ async def fit(self): rollout_corr_config = self.config.algorithm.get("rollout_correction", None) bypass_recomputing_logprobs = rollout_corr_config and rollout_corr_config.get("bypass_mode", False) if bypass_recomputing_logprobs: # Use `rollout_log_probs` - from verl.trainer.ppo.rollout_corr_helper import apply_rollout_correction + from verl.trainer.ppo.rollout_corr_helper import apply_bypass_mode - apply_rollout_correction( + apply_bypass_mode( batch=batch, rollout_corr_config=rollout_corr_config, policy_loss_config=self.config.actor_rollout_ref.actor.policy_loss, diff --git a/recipe/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh b/recipe/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh 
index e06f65d82e4..9fcddfe246e 100644 --- a/recipe/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh +++ b/recipe/one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh @@ -87,7 +87,6 @@ python3 -m recipe.one_step_off_policy.main_ppo \ trainer.save_freq=10 \ trainer.test_freq=-1 \ trainer.total_epochs=15 \ - trainer.device=npu \ trainer.resume_mode=auto \ trainer.nnodes="${NNODES}" \ trainer.n_gpus_per_node="${n_gpus_training}" \ diff --git a/recipe/open_math_reasoning/run_sft_qwen3_8b.sh b/recipe/open_math_reasoning/run_sft_qwen3_8b.sh index 3b7e9bb5c6c..ec564a1d602 100644 --- a/recipe/open_math_reasoning/run_sft_qwen3_8b.sh +++ b/recipe/open_math_reasoning/run_sft_qwen3_8b.sh @@ -55,7 +55,7 @@ MEGATRON_ENGINE_CONFIG="\ engine.pipeline_model_parallel_size=${PP_SIZE} \ engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \ engine.context_parallel_size=${CP_SIZE} \ - engine.use_mbridge=False" + engine.use_mbridge=True" if [ "$backend" = "fsdp" ]; then ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" diff --git a/recipe/r1_ascend/main_ppo.py b/recipe/r1_ascend/main_ppo.py index 57d9b9796a8..2c3614de460 100644 --- a/recipe/r1_ascend/main_ppo.py +++ b/recipe/r1_ascend/main_ppo.py @@ -27,7 +27,7 @@ from verl.trainer.constants_ppo import get_ppo_ray_runtime_env from verl.trainer.main_ppo import TaskRunner as TaskRunnerBase -from verl.utils.device import is_cuda_available +from verl.utils.device import auto_set_ascend_device_name, is_cuda_available logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -40,6 +40,9 @@ def main(config): Args: config_dict: Hydra configuration dictionary containing training parameters. """ + # Automatically set `config.trainer.device = npu` when running on Ascend NPU. 
+ auto_set_ascend_device_name(config) + run_ppo(config) diff --git a/recipe/r1_ascend/run_deepseekv3_671b_grpo_megatron_npu.sh b/recipe/r1_ascend/run_deepseekv3_671b_grpo_megatron_npu.sh index 2bade83d4c1..44ac1e2e57a 100644 --- a/recipe/r1_ascend/run_deepseekv3_671b_grpo_megatron_npu.sh +++ b/recipe/r1_ascend/run_deepseekv3_671b_grpo_megatron_npu.sh @@ -105,7 +105,6 @@ python3 -m recipe.r1_ascend.main_ppo \ trainer.test_freq=5 \ trainer.save_freq=-1 \ trainer.total_epochs=1 \ - trainer.device="npu" \ +actor_rollout_ref.actor.megatron.override_transformer_config.multi_head_latent_attention=True \ +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True \ +actor_rollout_ref.actor.megatron.override_transformer_config.pipeline_num_transformer_layers=[[6],[8],[8],[8],[8],[8],[8],[7]] \ diff --git a/recipe/spin/utils.py b/recipe/spin/utils.py index 571ad1e9154..e3855f64541 100644 --- a/recipe/spin/utils.py +++ b/recipe/spin/utils.py @@ -92,7 +92,11 @@ def check_mutually_exclusive(mbs, mbs_per_gpu, name: str): ) # Check for reward model micro-batch size conflicts - if config.reward_model.enable and not config.reward_model.use_dynamic_bsz: + if ( + config.reward_model.enable + and not config.reward_model.use_dynamic_bsz + and not config.reward_model.use_reward_loop + ): check_mutually_exclusive( config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model" ) diff --git a/recipe/transfer_queue/main_ppo.py b/recipe/transfer_queue/main_ppo.py index 4f982795eeb..236d59d51fb 100644 --- a/recipe/transfer_queue/main_ppo.py +++ b/recipe/transfer_queue/main_ppo.py @@ -33,7 +33,7 @@ from verl.trainer.ppo.reward import load_reward_manager from verl.trainer.ppo.utils import need_critic, need_reference_policy from verl.utils.config import validate_config -from verl.utils.device import is_cuda_available +from verl.utils.device import auto_set_ascend_device_name, is_cuda_available from .ray_trainer import RayPPOTrainer @@ 
-45,6 +45,9 @@ def main(config): Args: config_dict: Hydra configuration dictionary containing training parameters. """ + # Automatically set `config.trainer.device = npu` when running on Ascend NPU. + auto_set_ascend_device_name(config) + run_ppo(config) diff --git a/recipe/transfer_queue/ray_trainer.py b/recipe/transfer_queue/ray_trainer.py index 2acef1f84af..b3e7597cf4b 100644 --- a/recipe/transfer_queue/ray_trainer.py +++ b/recipe/transfer_queue/ray_trainer.py @@ -1315,15 +1315,10 @@ def fit(self): batch_dict, repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True ) batch: TensorDict = self.dict_to_tensordict(repeated_batch_dict) - asyncio.run(self.tq_client.async_put(data=batch, partition_id=f"train_{self.global_steps - 1}")) - gen_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=list(batch.keys()), # TODO (TQ): Get metadata by specified fields - task_name="generate_sequences", - **base_get_meta_kwargs, - ) + self.tq_client.async_put(data=batch, partition_id=f"train_{self.global_steps - 1}") ) + # pass global_steps to trace gen_meta.set_extra_info("global_steps", self.global_steps) @@ -1411,14 +1406,9 @@ def fit(self): ] if "rm_scores" in batch_meta.field_names: compute_reward_fields.append("rm_scores") - compute_reward_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=compute_reward_fields, - task_name="compute_reward", - **base_get_meta_kwargs, - ) - ) - compute_reward_meta.reorder(balanced_idx) + + compute_reward_meta = batch_meta.select_fields(compute_reward_fields) + if self.config.reward_model.launch_reward_fn_async: future_reward = compute_reward_async_decorated( data=compute_reward_meta, @@ -1432,31 +1422,26 @@ def fit(self): # recompute old_log_probs with marked_timer("old_log_prob", timing_raw, color="blue"): - old_log_prob_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=[ - "input_ids", - "attention_mask", - "position_ids", - "prompts", - "responses", - "response_mask", - 
"data_source", - "reward_model", - "extra_info", - "uid", - "index", - "tools_kwargs", - "interaction_kwargs", - "ability", - ], - task_name="compute_log_prob", - **base_get_meta_kwargs, - ) - ) - old_log_prob_meta.reorder(balanced_idx) - + old_log_prob_meta_fields = [ + "input_ids", + "attention_mask", + "position_ids", + "prompts", + "responses", + "response_mask", + "data_source", + "reward_model", + "extra_info", + "uid", + "index", + "tools_kwargs", + "interaction_kwargs", + "ability", + ] + old_log_prob_meta = batch_meta.select_fields(old_log_prob_meta_fields) old_log_prob_output_meta = self.actor_rollout_wg.compute_log_prob(old_log_prob_meta) + batch_meta = batch_meta.union(old_log_prob_output_meta) + data = asyncio.run(self.tq_client.async_get_data(old_log_prob_output_meta)) entropys = data["entropys"] response_masks = data["response_mask"] @@ -1470,52 +1455,39 @@ def fit(self): old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()} metrics.update(old_log_prob_metrics) - batch_meta = batch_meta.union(old_log_prob_output_meta) - if "rollout_log_probs" in batch_meta.field_names: # TODO: we may want to add diff of probs too. 
- data_fields = ["rollout_log_probs", "old_log_probs", "responses"] + calculate_debug_metrics_fields = ["rollout_log_probs", "old_log_probs", "responses"] + if "response_mask" in batch_meta.field_names: - data_fields.append("response_mask") + calculate_debug_metrics_fields.append("response_mask") if "attention_mask" in batch_meta.field_names: - data_fields.append("attention_mask") - calculate_debug_metrics_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=data_fields, - task_name="calculate_debug_metrics", - **base_get_meta_kwargs, - ) - ) - calculate_debug_metrics_meta.reorder(balanced_idx) + calculate_debug_metrics_fields.append("attention_mask") + calculate_debug_metrics_meta = batch_meta.select_fields(calculate_debug_metrics_fields) metrics.update(calculate_debug_metrics_decorated(calculate_debug_metrics_meta)) if self.use_reference_policy: # compute reference log_prob - ref_log_prob_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=[ - "input_ids", - "attention_mask", - "position_ids", - "prompts", - "responses", - "response_mask", - "old_log_probs", - "data_source", - "reward_model", - "extra_info", - "uid", - "index", - "tools_kwargs", - "interaction_kwargs", - "ability", - ], - task_name="compute_ref_log_prob", - **base_get_meta_kwargs, - ) - ) - ref_log_prob_meta.reorder(balanced_idx) + ref_log_prob_fields = [ + "input_ids", + "attention_mask", + "position_ids", + "prompts", + "responses", + "response_mask", + "old_log_probs", + "data_source", + "reward_model", + "extra_info", + "uid", + "index", + "tools_kwargs", + "interaction_kwargs", + "ability", + ] + ref_log_prob_meta = batch_meta.select_fields(ref_log_prob_fields) + with marked_timer("ref", timing_raw, color="olive"): if not self.ref_in_actor: ref_log_prob_output_meta = self.ref_policy_wg.compute_ref_log_prob(ref_log_prob_meta) @@ -1535,14 +1507,14 @@ def fit(self): if self.config.reward_model.launch_reward_fn_async: reward_tensor, reward_extra_infos_dict = 
ray.get(future_reward) reward_td = TensorDict({"token_level_scores": reward_tensor}, batch_size=reward_tensor.size(0)) - asyncio.run(self.tq_client.async_put(data=reward_td, metadata=batch_meta)) - batch_meta.add_fields(reward_td) + batch_meta = asyncio.run(self.tq_client.async_put(data=reward_td, metadata=batch_meta)) if reward_extra_infos_dict: reward_extra_infos_dict_new = {k: np.array(v) for k, v in reward_extra_infos_dict.items()} reward_extra_infos_td = self.dict_to_tensordict(reward_extra_infos_dict_new) - asyncio.run(self.tq_client.async_put(data=reward_extra_infos_td, metadata=batch_meta)) - batch_meta.add_fields(reward_extra_infos_td) + batch_meta = asyncio.run( + self.tq_client.async_put(data=reward_extra_infos_td, metadata=batch_meta) + ) # compute rewards. apply_kl_penalty if available if self.config.algorithm.use_kl_in_reward: @@ -1552,14 +1524,9 @@ def fit(self): "old_log_probs", "ref_log_prob", ] - apply_kl_penalty_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=apply_kl_penalty_fields, - task_name="apply_kl_penalty", - **base_get_meta_kwargs, - ) - ) - apply_kl_penalty_meta.reorder(balanced_idx) + + apply_kl_penalty_meta = batch_meta.select_fields(apply_kl_penalty_fields) + token_level_rewards, kl_metrics = apply_kl_penalty( apply_kl_penalty_meta, kl_ctrl=self.kl_ctrl_in_reward, @@ -1568,31 +1535,24 @@ def fit(self): token_level_rewards_td = TensorDict( {"token_level_rewards": token_level_rewards}, batch_size=token_level_rewards.size(0) ) - asyncio.run( + apply_kl_penalty_meta = asyncio.run( self.tq_client.async_put(data=token_level_rewards_td, metadata=apply_kl_penalty_meta) ) - apply_kl_penalty_meta.add_fields(token_level_rewards_td) metrics.update(kl_metrics) batch_meta = batch_meta.union(apply_kl_penalty_meta) else: - token_level_scores_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=["token_level_scores"], - task_name="token_level_scores", - **base_get_meta_kwargs, - ) - ) - 
token_level_scores_meta.reorder(balanced_idx) + token_level_scores_meta = batch_meta.select_fields(["token_level_scores"]) + data = asyncio.run(self.tq_client.async_get_data(token_level_scores_meta)) token_level_rewards_td = TensorDict( {"token_level_rewards": data["token_level_scores"]}, batch_size=data["token_level_scores"].size(0), ) - asyncio.run( + token_level_scores_meta = asyncio.run( self.tq_client.async_put(data=token_level_rewards_td, metadata=token_level_scores_meta) ) - batch_meta.add_fields(token_level_rewards_td) + batch_meta = batch_meta.union(token_level_scores_meta) # compute advantages, executed on the driver process @@ -1617,14 +1577,7 @@ def fit(self): if "reward_baselines" in batch_meta.field_names: compute_advantage_fields.append("reward_baselines") - compute_advantage_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=compute_advantage_fields, - task_name="compute_advantage", - **base_get_meta_kwargs, - ) - ) - compute_advantage_meta.reorder(balanced_idx) + compute_advantage_meta = batch_meta.select_fields(compute_advantage_fields) advantages, returns = compute_advantage( compute_advantage_meta, @@ -1639,9 +1592,9 @@ def fit(self): advantages_td = TensorDict( {"advantages": advantages, "returns": returns}, batch_size=advantages.size(0) ) - asyncio.run(self.tq_client.async_put(data=advantages_td, metadata=compute_advantage_meta)) - compute_advantage_meta.add_fields(advantages_td) - + compute_advantage_meta = asyncio.run( + self.tq_client.async_put(data=advantages_td, metadata=compute_advantage_meta) + ) batch_meta = batch_meta.union(compute_advantage_meta) # update critic @@ -1660,37 +1613,30 @@ def fit(self): self.config.actor_rollout_ref.rollout.multi_turn.enable ) - update_actor_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=[ - "input_ids", - "attention_mask", - "position_ids", - "prompts", - "responses", - "response_mask", - "old_log_probs", - "ref_log_prob", - "advantages", - "returns", - 
"token_level_rewards", - "token_level_scores", - "data_source", - "reward_model", - "extra_info", - "uid", - "index", - "tools_kwargs", - "interaction_kwargs", - "ability", - ], - batch_size=self.config.data.train_batch_size - * self.config.actor_rollout_ref.rollout.n, - partition_id=f"train_{self.global_steps - 1}", - task_name="update_actor", - ) - ) - update_actor_meta.reorder(balanced_idx) + update_actor_fields = [ + "input_ids", + "attention_mask", + "position_ids", + "prompts", + "responses", + "response_mask", + "old_log_probs", + "ref_log_prob", + "advantages", + "returns", + "token_level_rewards", + "token_level_scores", + "data_source", + "reward_model", + "extra_info", + "uid", + "index", + "tools_kwargs", + "interaction_kwargs", + "ability", + ] + update_actor_meta = batch_meta.select_fields(update_actor_fields) + update_actor_meta.set_extra_info( "global_token_num", batch_meta.get_extra_info("global_token_num") ) @@ -1704,22 +1650,12 @@ def fit(self): # Log rollout generations if enabled rollout_data_dir = self.config.trainer.get("rollout_data_dir", None) if rollout_data_dir: - data_fields = ["prompts", "responses", "token_level_scores", "reward_model"] + log_rollout_fields = ["prompts", "responses", "token_level_scores", "reward_model"] if "request_id" in batch_meta.field_names: - data_fields.append("request_id") - log_rollout_meta = asyncio.run( - self.tq_client.async_get_meta( - data_fields=data_fields, - batch_size=self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n, - partition_id=f"train_{self.global_steps - 1}", - task_name="log_rollout", - ) - ) - log_rollout_meta.reorder(balanced_idx) + log_rollout_fields.append("request_id") + log_rollout_meta = batch_meta.select_fields(log_rollout_fields) self._log_rollout_data(log_rollout_meta, reward_extra_infos_dict, timing_raw, rollout_data_dir) - # TODO: clear meta after iteration - # TODO: validate if ( self.val_reward_fn is not None diff --git 
a/recipe/transfer_queue/run_qwen3-8b_transferqueue.sh b/recipe/transfer_queue/run_qwen3-8b_transferqueue.sh index a6a013903b8..573e71a1f1b 100644 --- a/recipe/transfer_queue/run_qwen3-8b_transferqueue.sh +++ b/recipe/transfer_queue/run_qwen3-8b_transferqueue.sh @@ -9,6 +9,9 @@ mkdir -p ${log_dir} timestamp=$(date +"%Y%m%d%H%M%S") log_file="${log_dir}/qwen3-8b_tq_${timestamp}.log" +# You may try to enable zero-copy serialization for TransferQueue when using SimpleStorageUnit backend. +export TQ_ZERO_COPY_SERIALIZATION=False + rollout_mode="async" rollout_name="vllm" # sglang or vllm if [ "$rollout_mode" = "async" ]; then diff --git a/requirements-npu.txt b/requirements-npu.txt index 90304a94561..ea197c98f31 100644 --- a/requirements-npu.txt +++ b/requirements-npu.txt @@ -11,7 +11,7 @@ pyarrow>=15.0.0 pybind11 pylatexenc tensordict>=0.8.0,<=0.10.0,!=0.9.0 -ray==2.46.0 +ray[default] wandb mathruler torchdata diff --git a/requirements_transferqueue.txt b/requirements_transferqueue.txt deleted file mode 100644 index b4a1034f42d..00000000000 --- a/requirements_transferqueue.txt +++ /dev/null @@ -1,2 +0,0 @@ -# requirements.txt records the full set of dependencies for development -TransferQueue==0.1.2.dev0 diff --git a/setup.py b/setup.py index 8c9f5e1026d..9f5fbb03b67 100644 --- a/setup.py +++ b/setup.py @@ -57,7 +57,7 @@ ] TRL_REQUIRES = ["trl<=0.9.6"] MCORE_REQUIRES = ["mbridge"] -TRANSFERQUEUE_REQUIRES = ["TransferQueue==0.1.2.dev0"] +TRANSFERQUEUE_REQUIRES = ["TransferQueue==0.1.4.dev1"] extras_require = { "test": TEST_REQUIRES, diff --git a/tests/experimental/reward/test_agent_loop_reward_manager.py b/tests/experimental/reward/test_agent_loop_reward_manager.py index a7e3043835a..05ccc71c3e8 100644 --- a/tests/experimental/reward/test_agent_loop_reward_manager.py +++ b/tests/experimental/reward/test_agent_loop_reward_manager.py @@ -35,8 +35,8 @@ def test_agent_loop_reward_manager(): } } ) - with initialize_config_dir(config_dir=os.path.abspath("recipe/fapo/config")): 
- config = compose("rm_config") + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): + config = compose(config_name="ppo_trainer") rollout_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct") reward_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct") diff --git a/tests/experimental/reward/test_agent_reward_loop_colocate.py b/tests/experimental/reward/test_agent_reward_loop_colocate.py index 9363944b510..5f76bab25dd 100644 --- a/tests/experimental/reward/test_agent_reward_loop_colocate.py +++ b/tests/experimental/reward/test_agent_reward_loop_colocate.py @@ -39,8 +39,8 @@ def test_agent_loop_reward_manager(): } } ) - with initialize_config_dir(config_dir=os.path.abspath("recipe/fapo/config")): - config = compose("rm_config") + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): + config = compose(config_name="ppo_trainer") rollout_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct") reward_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct") diff --git a/tests/experimental/reward/test_async_token_bucket_on_cpu.py b/tests/experimental/reward/test_async_token_bucket_on_cpu.py index ceef232c5b0..0a044190bd0 100644 --- a/tests/experimental/reward/test_async_token_bucket_on_cpu.py +++ b/tests/experimental/reward/test_async_token_bucket_on_cpu.py @@ -17,7 +17,7 @@ import pytest -from verl.experimental.reward.reward_loop.limited import AsyncTokenBucket +from verl.experimental.reward.reward_manager.limited import AsyncTokenBucket class TestAsyncTokenBucket: diff --git a/tests/experimental/reward/test_rate_limited_reward_manager_on_cpu.py b/tests/experimental/reward/test_rate_limited_reward_manager_on_cpu.py index f91b23aae23..446dee56438 100644 --- a/tests/experimental/reward/test_rate_limited_reward_manager_on_cpu.py +++ b/tests/experimental/reward/test_rate_limited_reward_manager_on_cpu.py @@ -21,7 +21,7 @@ from transformers import AutoTokenizer 
from verl import DataProto -from verl.experimental.reward.reward_loop.limited import RateLimitedRewardLoopManager +from verl.experimental.reward.reward_manager.limited import RateLimitedRewardLoopManager # Mock API reward functions for testing diff --git a/tests/experimental/reward/test_reward_model_disrm.py b/tests/experimental/reward/test_reward_model_disrm.py index 5e0cfa0e553..1e79f57e32a 100644 --- a/tests/experimental/reward/test_reward_model_disrm.py +++ b/tests/experimental/reward/test_reward_model_disrm.py @@ -114,8 +114,8 @@ def test_reward_model_manager(): } } ) - with initialize_config_dir(config_dir=os.path.abspath("recipe/fapo/config")): - config = compose("rm_config") + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): + config = compose(config_name="ppo_trainer") rollout_model_name = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct") reward_model_name = os.path.expanduser("~/models/Skywork/Skywork-Reward-V2-Llama-3.2-1B") diff --git a/tests/experimental/reward/test_reward_model_genrm.py b/tests/experimental/reward/test_reward_model_genrm.py index c505267ab9e..ed853fbd811 100644 --- a/tests/experimental/reward/test_reward_model_genrm.py +++ b/tests/experimental/reward/test_reward_model_genrm.py @@ -115,8 +115,8 @@ def test_reward_model_manager(): } } ) - with initialize_config_dir(config_dir=os.path.abspath("recipe/fapo/config")): - config = compose("rm_config") + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): + config = compose(config_name="ppo_trainer") rollout_model_name = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct") reward_model_name = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct") diff --git a/tests/models/test_engine.py b/tests/models/test_engine.py index ae413996b9f..9878ece4d06 100644 --- a/tests/models/test_engine.py +++ b/tests/models/test_engine.py @@ -24,11 +24,19 @@ import torch import torch.distributed as dist import torch.multiprocessing as mp 
-from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForTokenClassification, Qwen3Config, Qwen3MoeConfig +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoModelForTokenClassification, + AutoTokenizer, + Qwen3Config, + Qwen3MoeConfig, +) from verl import DataProto from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup from verl.trainer.config import CheckpointConfig +from verl.utils import tensordict_utils as tu from verl.utils.model import compute_position_id_with_mask, create_random_mask from verl.utils.torch_functional import logprobs_from_logits_naive from verl.workers.config import ( @@ -40,49 +48,87 @@ McoreEngineConfig, McoreOptimizerConfig, ) -from verl.workers.engine_workers import ActorWorker, CriticWorker -from verl.workers.utils.losses import ppo_loss +from verl.workers.engine_workers import TrainingWorker, TrainingWorkerConfig +from verl.workers.utils.losses import ppo_loss, sft_loss, value_loss +from verl.workers.utils.padding import left_right_2_no_padding, no_padding_2_padding -@pytest.mark.parametrize("strategy", ["megatron", "fsdp", "fsdp2"]) -def test_actor_engine(strategy): - ray.init() +def get_test_language_model(device_count): + if device_count == 1: + model = "~/models/HuggingFaceTB/SmolLM2-135M-Instruct" + else: + model = "~/models/Qwen/Qwen2.5-0.5B" + model = os.path.expanduser(model) + return model - path = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B") - model_config = HFModelConfig(path=path) + +def create_training_config(model_type, strategy, device_count, model): + if device_count == 1: + tp = pp = cp = fsdp_size = 1 + else: + tp = pp = cp = 2 + fsdp_size = 4 + + path = os.path.expanduser(model) + model_config = HFModelConfig(path=path, use_remove_padding=True) + + kwargs = dict( + param_offload=True, + optimizer_offload=True, + grad_offload=True, + use_dynamic_bsz=True, + use_remove_padding=True, + max_token_len_per_gpu=500, + 
infer_max_token_len_per_gpu=1000, + ) if strategy == "megatron": engine_config = McoreEngineConfig( forward_only=False, - use_mbridge=False, - tensor_model_parallel_size=2, - pipeline_model_parallel_size=2, - context_parallel_size=2, + use_mbridge=True, + tensor_model_parallel_size=tp, + pipeline_model_parallel_size=pp, + context_parallel_size=cp, + **kwargs, ) optimizer_config = McoreOptimizerConfig(lr_decay_steps=10) elif strategy in ["fsdp", "fsdp2"]: engine_config = FSDPEngineConfig( - forward_only=False, fsdp_size=4, strategy=strategy, ulysses_sequence_parallel_size=2 + forward_only=False, fsdp_size=fsdp_size, strategy=strategy, ulysses_sequence_parallel_size=cp, **kwargs ) optimizer_config = FSDPOptimizerConfig() else: raise NotImplementedError(f"strategy {strategy} is not supported") - config = ActorConfig( + config = TrainingWorkerConfig( + model_type=model_type, model_config=model_config, - engine=engine_config, + engine_config=engine_config, + optimizer_config=optimizer_config, + checkpoint_config=None, + ) + return config + + +@pytest.mark.parametrize("strategy", ["fsdp", "fsdp2", "megatron"]) +def test_actor_engine(strategy): + ray.init() + device_count = torch.cuda.device_count() + config = create_training_config( + model_type="language_model", strategy=strategy, - ppo_micro_batch_size_per_gpu=256, - ppo_mini_batch_size=4, - optim=optimizer_config, - use_dynamic_bsz=True, - rollout_n=1, + device_count=device_count, + model=get_test_language_model(device_count), ) - ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(ActorWorker), config=config) - resource_pool = RayResourcePool(process_on_nodes=[8]) + ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(TrainingWorker), config=config) + resource_pool = RayResourcePool(process_on_nodes=[device_count]) wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init) # init model - wg.init_model() + wg.reset() + + sft_loss_ = partial(sft_loss, config=config) + + 
wg.set_loss_fn(sft_loss_) batch_size = 8 seqlen = 32 @@ -92,7 +138,7 @@ def test_actor_engine(strategy): torch.manual_seed(1) np.random.seed(1) - input_ids = torch.randint(0, model_config.hf_config.vocab_size, (batch_size, seqlen)) + input_ids = torch.randint(0, config.model_config.hf_config.vocab_size, (batch_size, seqlen)) attention_mask = create_random_mask( input_ids=input_ids, max_ratio_of_valid_token=0.8, max_ratio_of_left_padding=0.2, min_ratio_of_valid_token=0.6 ) @@ -116,15 +162,22 @@ def test_actor_engine(strategy): "responses": responses, "response_mask": response_mask, }, - meta_info={"temperature": 1.0, "global_token_num": global_token_num}, + meta_info={"temperature": 1.0, "global_token_num": global_token_num, "compute_loss": False}, ) - # sft_loss_ = partial(sft_loss, config=config) + data_td = data.to_tensordict() + data_td = left_right_2_no_padding(data_td) # eval - output = wg.compute_log_prob(data) + output = wg.infer_batch(data_td) + output = output.get() + logprobs_unpad = tu.get(output, "log_probs").cpu() + logprobs = no_padding_2_padding(logprobs_unpad, data_td) + + output = DataProto.from_single_dict({"old_log_probs": logprobs}) # load hf model and compare results with hf model + path = config.model_config.path hf_model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16) hf_output = hf_model(input_ids, attention_mask=attention_mask) hf_logprobs = logprobs_from_logits_naive( @@ -148,78 +201,77 @@ def test_actor_engine(strategy): data.batch["advantages"] = torch.rand_like(responses, dtype=torch.float32) data.batch["ref_log_prob"] = torch.rand_like(responses, dtype=torch.float32) + # construct actor config + actor_config = ActorConfig(strategy=strategy, rollout_n=1, ppo_micro_batch_size_per_gpu=-1) + # set ppo loss - ppo_loss_ = partial(ppo_loss, config=config) + ppo_loss_ = partial(ppo_loss, config=actor_config) wg.set_loss_fn(ppo_loss_) # update again - ppo_metrics = wg.update_actor(data) + data_td = data.to_tensordict() 
+ data_td = left_right_2_no_padding(data_td) + + # auto load/offload + tu.assign_non_tensor(data_td, global_batch_size=data_td.shape[0]) + ppo_metrics = wg.train_batch(data_td) + ppo_metrics = ppo_metrics.get() + ppo_metrics = tu.get(ppo_metrics, "metrics") print(ppo_metrics) - ray.shutdown() + # test manual load/offload + tu.assign_non_tensor(data_td, disable_auto_offload=True) + wg.to("device") + ppo_metrics = wg.train_batch(data_td) + ppo_metrics = ppo_metrics.get() + ppo_metrics = tu.get(ppo_metrics, "metrics") + print(ppo_metrics) + wg.to("cpu") + ray.shutdown() -def create_model(): - from transformers import Qwen3Config - config = Qwen3Config(num_hidden_layers=2, num_labels=1) +def create_value_model(language_model_path, output_path): + config = AutoConfig.from_pretrained(language_model_path) + config.num_labels = 1 + config.classifier_dropout = 0 + config.tie_word_embeddings = False model = AutoModelForTokenClassification.from_config(config) + tokenizer = AutoTokenizer.from_pretrained(os.path.expanduser(language_model_path)) assert model.config.num_labels == 1 - path = os.path.expanduser("~/models/test_model") + path = os.path.expanduser(output_path) model.save_pretrained(path) + tokenizer.save_pretrained(path) config.save_pretrained(path) return path -@pytest.mark.parametrize("strategy", ["megatron", "fsdp", "fsdp2"]) +@pytest.mark.parametrize("strategy", ["fsdp", "fsdp2"]) def test_critic_engine(strategy): - ray.init() + device_count = torch.cuda.device_count() + value_model_path = os.path.expanduser("~/models/test_model") + language_model_path = get_test_language_model(device_count=device_count) + create_value_model(language_model_path, value_model_path) - path = create_model() - model_config = HFModelConfig(path=path, load_tokenizer=False) + torch.manual_seed(1) + np.random.seed(1) - if strategy == "megatron": - engine_config = McoreEngineConfig( - forward_only=False, - use_mbridge=False, - tensor_model_parallel_size=2, - pipeline_model_parallel_size=2, 
- context_parallel_size=2, - ) - optimizer_config = McoreOptimizerConfig(lr_decay_steps=10) - elif strategy in ["fsdp", "fsdp2"]: - engine_config = FSDPEngineConfig( - forward_only=False, fsdp_size=4, strategy=strategy, ulysses_sequence_parallel_size=2 - ) - optimizer_config = FSDPOptimizerConfig() - else: - raise NotImplementedError(f"strategy {strategy} is not supported") + ray.init() - config = CriticConfig( - model_config=model_config, - engine=engine_config, - strategy=strategy, - ppo_micro_batch_size_per_gpu=256, - ppo_mini_batch_size=4, - optim=optimizer_config, - use_dynamic_bsz=True, - rollout_n=1, + config = create_training_config( + model_type="value_model", strategy=strategy, device_count=device_count, model=value_model_path ) - ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(CriticWorker), config=config) - resource_pool = RayResourcePool(process_on_nodes=[8]) + ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(TrainingWorker), config=config) + resource_pool = RayResourcePool(process_on_nodes=[device_count]) wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init) # init model - wg.init_model() + wg.reset() batch_size = 8 seqlen = 32 response_length = seqlen // 2 - - torch.manual_seed(1) - np.random.seed(1) - - input_ids = torch.randint(0, model_config.hf_config.vocab_size, (batch_size, seqlen)) + input_ids = torch.randint(0, config.model_config.hf_config.vocab_size, (batch_size, seqlen)) attention_mask = create_random_mask( input_ids=input_ids, max_ratio_of_valid_token=0.8, max_ratio_of_left_padding=0.2, min_ratio_of_valid_token=0.6 ) @@ -243,21 +295,30 @@ def test_critic_engine(strategy): "responses": responses, "response_mask": response_mask, }, - meta_info={"temperature": 1.0, "global_token_num": global_token_num}, + meta_info={"temperature": 1.0, "global_token_num": global_token_num, "compute_loss": False}, ) + data_td = data.to_tensordict() + data_td = left_right_2_no_padding(data_td) + # eval - output = 
wg.compute_values(data) + output = wg.infer_batch(data_td) + output = output.get() + + values_unpad = tu.get(output, "values").float().cpu() + values = no_padding_2_padding(values_unpad, data_td) + + output = DataProto.from_single_dict({"values": values}) # load hf model and compare results with hf model - with torch.device("cuda"): + with torch.device("cuda"), torch.autocast(device_type="cuda", dtype=torch.bfloat16): hf_model = AutoModelForTokenClassification.from_pretrained( - path, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + value_model_path, torch_dtype=torch.float32, attn_implementation="flash_attention_2" ) hf_output = hf_model(input_ids.cuda(), attention_mask=attention_mask.cuda()) hf_values = hf_output.logits[:, -response_length - 1 : -1, :].float().squeeze(-1).cpu() - hf_values_mean = torch.mean(hf_values * response_mask) + hf_values_mean = torch.mean(hf_values * response_mask) engine_values = torch.mean(output.batch["values"] * response_mask) torch.testing.assert_close(hf_values_mean, engine_values, atol=1e-2, rtol=1e-2) @@ -265,11 +326,25 @@ def test_critic_engine(strategy): data = data.union(output) # add ppo data - data.batch["values"] = torch.rand_like(responses, dtype=torch.float32) data.batch["returns"] = torch.rand_like(responses, dtype=torch.float32) # update again - ppo_metrics = wg.update_critic(data) + # create critic config + critic_config = CriticConfig( + strategy=strategy, rollout_n=1, ppo_micro_batch_size_per_gpu=-1, model_config=config.model_config + ) + value_loss_ = partial(value_loss, config=critic_config) + wg.set_loss_fn(value_loss_) + + # update again + data_td = data.to_tensordict() + data_td = left_right_2_no_padding(data_td) + + # auto load/offload + tu.assign_non_tensor(data_td, global_batch_size=data_td.shape[0]) + ppo_metrics = wg.train_batch(data_td) + ppo_metrics = ppo_metrics.get() + ppo_metrics = tu.get(ppo_metrics, "metrics") print(ppo_metrics) ray.shutdown() diff --git 
a/tests/single_controller/test_decorator_on_cpu.py b/tests/single_controller/test_decorator_on_cpu.py index 1178d256cf5..8dc74670410 100644 --- a/tests/single_controller/test_decorator_on_cpu.py +++ b/tests/single_controller/test_decorator_on_cpu.py @@ -66,6 +66,9 @@ async def async_dp_compute(self, data: DataProto) -> DataProto: def dp_compute_td(self, data: TensorDict) -> TensorDict: rank_value = torch.tensor(self.rank, device=data["input"].device, dtype=data["input"].dtype) data["output"] = data["input"] + self.value + rank_value + position_ids = data.pop("position_ids") + for i, position_id in enumerate(position_ids.unbind(dim=0)): + assert (position_id == torch.arange(4 + rank_value * 2 + i).expand(position_id.shape)).all() return data @@ -159,7 +162,16 @@ def test_decorator_dp_compute_td(ray_init_shutdown): # Prepare input data (size 4, for 2 workers) input_tensor = torch.arange(4, dtype=torch.float32) - data = TensorDict({"input": input_tensor}, batch_size=[4]) + position_ids = torch.nested.as_nested_tensor( + [ + torch.arange(4).expand(4, 4), + torch.arange(5).expand(4, 5), + torch.arange(6).expand(4, 6), + torch.arange(7).expand(4, 7), + ], + layout=torch.jagged, + ) + data = TensorDict({"input": input_tensor, "position_ids": position_ids}, batch_size=[4]) # Call the decorated method output = worker_group.dp_compute_td(data) diff --git a/tests/special_e2e/ppo_trainer/run_model_reward.sh b/tests/special_e2e/ppo_trainer/run_model_reward.sh index 09d6757b511..46fb7c64e38 100644 --- a/tests/special_e2e/ppo_trainer/run_model_reward.sh +++ b/tests/special_e2e/ppo_trainer/run_model_reward.sh @@ -79,13 +79,13 @@ python3 -m verl.trainer.main_ppo \ critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.optimizer_offload=False \ reward_model.enable=True \ - reward_model.ulysses_sequence_parallel_size="${SP_SIZE}" \ reward_model.model.path="${MODEL_PATH}" \ - reward_model.model.use_remove_padding="${RM_PAD}" \ - 
reward_model.model.fsdp_config.param_offload=True \ - reward_model.use_dynamic_bsz="${SEQ_BALANCE}" \ - reward_model.forward_max_token_len_per_gpu=${infer_max_token_num_per_gpu} \ - reward_model.micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \ + reward_model.use_reward_loop=True \ + reward_model.rollout.gpu_memory_utilization=0.8 \ + reward_model.rollout.tensor_model_parallel_size=1 \ + reward_model.rollout.prompt_length=1024 \ + reward_model.rollout.response_length=512 \ + reward_model.num_workers=8 \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger=console \ diff --git a/tests/special_e2e/run_ppo_trainer_megatron.sh b/tests/special_e2e/run_ppo_trainer_megatron.sh index a88500aba40..cd8033f132e 100644 --- a/tests/special_e2e/run_ppo_trainer_megatron.sh +++ b/tests/special_e2e/run_ppo_trainer_megatron.sh @@ -9,6 +9,7 @@ NUM_GPUS=${NUM_GPUS:-8} MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B} MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} +RM_MODEL_PATH=${RM_MODEL_PATH:-${HOME}/models/Skywork/Skywork-Reward-V2-Llama-3.2-1B} #huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}" USE_DUMMY_MODEL=${USE_DUMMY_MODEL:-False} @@ -57,6 +58,7 @@ LORA_TARGET_MODULES=${LORA_TARGET_MODULES:-"['linear_qkv','linear_proj','linear_ MAX_PROMPT_LENGTH=${MAX_PROMPT_LENGTH:-512} MAX_RESPONSE_LENGTH=${MAX_RESPONSE_LENGTH:-512} +MAX_RM_LENGTH=$((MAX_PROMPT_LENGTH + MAX_RESPONSE_LENGTH)) COMMON_PP=${COMMON_PP:-2} COMMON_VPP=${COMMON_VPP:-2} @@ -87,12 +89,6 @@ CRITIC_CP=${CRITIC_CP:-$COMMON_CP} CRITIC_TP=${CRITIC_TP:-$TRAIN_TP} CRITIC_EP=${CRITIC_EP:-$COMMON_EP} CRITIC_ETP=${CRITIC_ETP:-$COMMON_ETP} -RM_PP=${RM_PP:-$COMMON_PP} -RM_VPP=${RM_VPP:-$COMMON_VPP} -RM_CP=${RM_CP:-$COMMON_CP} -RM_TP=${RM_TP:-$TRAIN_TP} -RM_EP=${RM_EP:-$COMMON_EP} -RM_ETP=${RM_ETP:-$COMMON_ETP} ALL_OFFLOAD=${ALL_OFFLOAD:-False} COMMON_PARAM_OFFLOAD=${COMMON_PARAM_OFFLOAD:-$ALL_OFFLOAD} @@ -244,22 +240,14 @@ python3 -m verl.trainer.main_ppo --config-path=config \ 
critic.profiler.ranks=$PROFILE_RANKS \ critic.profiler.all_ranks=$PROFILE_RANKS_ALL \ reward_model.enable=True \ - reward_model.model.path="${MODEL_PATH}" \ - reward_model.micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \ - reward_model.megatron.use_mbridge=${USE_MBRIDGE} \ - reward_model.megatron.vanilla_mbridge=${VALUE_VANILLA_MBRIDGE} \ - reward_model.megatron.pipeline_model_parallel_size=$RM_PP \ - reward_model.megatron.virtual_pipeline_model_parallel_size=$RM_VPP \ - reward_model.megatron.context_parallel_size=$RM_CP \ - reward_model.megatron.tensor_model_parallel_size=$RM_TP \ - reward_model.megatron.expert_model_parallel_size=$RM_EP \ - reward_model.megatron.expert_tensor_parallel_size=$RM_ETP \ - reward_model.megatron.param_offload=${RM_PARAM_OFFLOAD} \ - reward_model.megatron.use_dist_checkpointing=${USE_DIST_CKPT} \ - reward_model.megatron.dist_checkpointing_path=${DIST_CKPT_PATH} \ - reward_model.profiler.enable=$PROFILE_ENABLE \ - reward_model.profiler.ranks=$PROFILE_RANKS \ - reward_model.profiler.all_ranks=$PROFILE_RANKS_ALL \ + reward_model.model.path="${RM_MODEL_PATH}" \ + reward_model.use_reward_loop=True \ + reward_model.rollout.name=${ENGINE} \ + reward_model.rollout.gpu_memory_utilization=0.6 \ + reward_model.rollout.tensor_model_parallel_size=${INFER_TP} \ + reward_model.rollout.prompt_length=${MAX_RM_LENGTH} \ + reward_model.rollout.response_length=${MAX_RESPONSE_LENGTH} \ + reward_model.num_workers=8 \ algorithm.use_kl_in_reward=False \ algorithm.kl_penalty=kl \ algorithm.kl_ctrl.kl_coef=0.001 \ diff --git a/tests/special_e2e/run_transferqueue.sh b/tests/special_e2e/run_transferqueue.sh index 541742bf765..9e1c34acde1 100644 --- a/tests/special_e2e/run_transferqueue.sh +++ b/tests/special_e2e/run_transferqueue.sh @@ -63,8 +63,6 @@ echo "Running transferqueue with ${ACTOR_STRATEGY} strategy" echo "Total GPUs: ${NUM_GPUS}" # Common parameters for both FSDP and Megatron -# For Ascend NPU, please add -# trainer.device=npu common_params=( 
data.train_files="${HOME}/data/gsm8k/train.parquet" data.val_files="${HOME}/data/gsm8k/test.parquet" diff --git a/tests/special_e2e/sft/compare_sft_engine_results.py b/tests/special_e2e/sft/compare_sft_engine_results.py index b39e133ee5e..322f5353c06 100644 --- a/tests/special_e2e/sft/compare_sft_engine_results.py +++ b/tests/special_e2e/sft/compare_sft_engine_results.py @@ -36,7 +36,7 @@ def compare_results(golden_results, other_result): grad_norm = other_result[0]["data"]["train/grad_norm"] torch.testing.assert_close(golden_loss, loss, atol=1e-2, rtol=1e-2) - torch.testing.assert_close(golden_grad_norm, grad_norm, atol=1e-4, rtol=1e-2) + torch.testing.assert_close(golden_grad_norm, grad_norm, atol=1e-4, rtol=3e-2) if __name__ == "__main__": @@ -53,5 +53,6 @@ def compare_results(golden_results, other_result): for file, other_result in other_results.items(): print(f"compare results {file}") compare_results(golden_results, other_result) + print(f"compare results {file} done") print("All results are close to golden results") diff --git a/tests/special_e2e/sft/run_sft_engine_gsm8k.sh b/tests/special_e2e/sft/run_sft_engine.sh similarity index 91% rename from tests/special_e2e/sft/run_sft_engine_gsm8k.sh rename to tests/special_e2e/sft/run_sft_engine.sh index ead86f1747b..16b7631473d 100644 --- a/tests/special_e2e/sft/run_sft_engine_gsm8k.sh +++ b/tests/special_e2e/sft/run_sft_engine.sh @@ -13,9 +13,9 @@ else COMMAND="python ${ENTRYPOINT} trainer.nnodes=${NNODES:-1} trainer.n_gpus_per_node=${NUM_GPUS:-1}" fi - -TRAIN_FILES=~/data/gsm8k_sft/train.parquet -VAL_FILES=~/data/gsm8k_sft/test.parquet +DATASET_DIR=${DATASET_DIR:-~/data/gsm8k_sft} +TRAIN_FILES=${DATASET_DIR}/train.parquet +VAL_FILES=${DATASET_DIR}/test.parquet backend=${BACKEND:-fsdp} @@ -25,7 +25,7 @@ RESUME_MODE=disable ckpts_home=${ckpts_home:-~/verl/test/gsm8k-sft-${backend}} -MODEL_ID=${MODEL_ID:-Qwen/Qwen3-0.6B} +MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B} MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} 
#huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}" @@ -71,7 +71,8 @@ MEGATRON_ENGINE_CONFIG="\ engine.tensor_model_parallel_size=${TP_SIZE} \ engine.pipeline_model_parallel_size=${PP_SIZE} \ engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \ - engine.context_parallel_size=${CP_SIZE}" + engine.context_parallel_size=${CP_SIZE} + engine.use_mbridge=True" if [ "$backend" = "fsdp" ]; then ENGINE_CONFIG="$FSDP_ENGINE_CONFIG" @@ -88,11 +89,11 @@ mkdir -p "${ckpts_home}" $COMMAND \ data.train_files="${TRAIN_FILES}" \ data.val_files="${VAL_FILES}" \ - data.train_batch_size=256 \ + data.train_batch_size=128 \ data.pad_mode=${PAD_MODE} \ data.truncation=error \ data.use_dynamic_bsz=True \ - data.max_token_len_per_gpu=8192 \ + data.max_token_len_per_gpu=2048 \ data.messages_key=messages \ model.path=$MODEL_PATH \ model.use_remove_padding=${USE_REMOVE_PADDING} \ diff --git a/tests/special_e2e/sft/test_sft_engine_all.sh b/tests/special_e2e/sft/test_sft_engine_all.sh index 1548ea9d588..0fba9606587 100644 --- a/tests/special_e2e/sft/test_sft_engine_all.sh +++ b/tests/special_e2e/sft/test_sft_engine_all.sh @@ -5,51 +5,32 @@ rm -rf ~/verl/test/log mkdir -p ~/verl/test/log export VERL_FILE_LOGGER_ROOT=~/verl/test/log +VPP_SIZE=${VPP_SIZE:-2} # test with single gpu as golden echo "run with single gpu as golden" -BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=1 NUM_GPUS=1 FSDP_STRATEGY=fsdp VERL_FILE_LOGGER_PATH=~/verl/test/log/golden.jsonl bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh +BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=1 NUM_GPUS=1 FSDP_STRATEGY=fsdp VERL_FILE_LOGGER_PATH=~/verl/test/log/golden.jsonl bash tests/special_e2e/sft/run_sft_engine.sh # test with fsdp 1 -echo "run with sp1 fsdp_size2 num_gpus8 fsdp_strategy fsdp pad_mode no_padding" -BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=2 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh -echo "run with sp1 fsdp_size-1 num_gpus8 fsdp_strategy fsdp pad_mode no_padding" -BACKEND=fsdp 
SP_SIZE=1 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh -echo "run with sp2 fsdp_size-1 num_gpus8 fsdp_strategy fsdp pad_mode no_padding" -BACKEND=fsdp SP_SIZE=2 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh -echo "run with sp4 fsdp_size4 num_gpus8 fsdp_strategy fsdp pad_mode no_padding" -BACKEND=fsdp SP_SIZE=4 FSDP_SIZE=4 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh - -# test use_remove_padding and pad_mode no_padding +echo "run with sp2 fsdp_size2 num_gpus8 fsdp_strategy fsdp pad_mode no_padding" +BACKEND=fsdp SP_SIZE=2 FSDP_SIZE=2 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding bash tests/special_e2e/sft/run_sft_engine.sh + +# test with fsdp 1 use_remove_padding and pad_mode no_padding echo "run with sp4 fsdp_size4 num_gpus8 fsdp_strategy fsdp pad_mode no_padding use_remove_padding False" -BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding USE_REMOVE_PADDING=False bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh +BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding USE_REMOVE_PADDING=False bash tests/special_e2e/sft/run_sft_engine.sh # test with fsdp 2 -echo "run with sp1 fsdp_size1 num_gpus1 fsdp_strategy fsdp2" -BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=1 NUM_GPUS=1 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh - -echo "run with sp1 fsdp_size-1 num_gpus8 fsdp_strategy fsdp2" -BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh -echo "run with sp2 fsdp_size-1 num_gpus8 fsdp_strategy fsdp2" -BACKEND=fsdp SP_SIZE=2 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh -BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=2 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh 
-BACKEND=fsdp SP_SIZE=4 FSDP_SIZE=4 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh +echo "run with sp2 fsdp_size2 num_gpus8 fsdp_strategy fsdp2" +BACKEND=fsdp SP_SIZE=2 FSDP_SIZE=2 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine.sh # test with megatron -echo "run with tp1 pp1 cp1 num_gpus1" -BACKEND=megatron TP_SIZE=1 PP_SIZE=1 CP_SIZE=1 NUM_GPUS=1 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh - -echo "run with tp2 pp2 vpp2 cp1 num_gpus8" -BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=2 CP_SIZE=1 NUM_GPUS=8 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh - -# test with cp echo "run with tp2 pp2 vpp2 cp2 num_gpus8" -BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=2 CP_SIZE=2 NUM_GPUS=8 bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh +BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=${VPP_SIZE} CP_SIZE=2 NUM_GPUS=8 bash tests/special_e2e/sft/run_sft_engine.sh # test with cp in ray echo "run with tp2 pp2 vpp2 cp2 num_gpus8 mode=ray" -BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=2 CP_SIZE=2 NUM_GPUS=8 mode=ray bash tests/special_e2e/sft/run_sft_engine_gsm8k.sh +BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=${VPP_SIZE} CP_SIZE=2 NUM_GPUS=8 mode=ray bash tests/special_e2e/sft/run_sft_engine.sh python3 tests/special_e2e/sft/compare_sft_engine_results.py diff --git a/tests/special_npu/run_qwen2_5_05b_dapo.sh b/tests/special_npu/run_qwen2_5_05b_dapo.sh index d90b63cb277..b27c7876f80 100644 --- a/tests/special_npu/run_qwen2_5_05b_dapo.sh +++ b/tests/special_npu/run_qwen2_5_05b_dapo.sh @@ -91,5 +91,4 @@ python3 -m recipe.dapo.main_dapo \ trainer.total_epochs=1 \ trainer.resume_mode=disable \ trainer.val_before_train=False \ - trainer.total_training_steps=1 \ - trainer.device=npu $@ + trainer.total_training_steps=1 $@ diff --git a/tests/special_npu/run_qwen2_5_05b_grpo.sh b/tests/special_npu/run_qwen2_5_05b_grpo.sh index cd3edc1e30e..352b4738948 100644 --- a/tests/special_npu/run_qwen2_5_05b_grpo.sh +++ 
b/tests/special_npu/run_qwen2_5_05b_grpo.sh @@ -44,5 +44,4 @@ python3 -m verl.trainer.main_ppo \ trainer.save_freq=-1 \ trainer.test_freq=-1 \ trainer.total_epochs=1 \ - trainer.total_training_steps=1 \ - trainer.device=npu $@ + trainer.total_training_steps=1 $@ diff --git a/tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh b/tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh index bdf225dc3a1..a821433790a 100644 --- a/tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh +++ b/tests/special_npu/run_qwen2_5_05b_grpo_mindspeed.sh @@ -65,5 +65,4 @@ python3 -m verl.trainer.main_ppo --config-path=config \ trainer.test_freq=-1 \ trainer.total_epochs=1 \ trainer.total_training_steps=1 \ - trainer.device=npu \ +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True $@ diff --git a/tests/special_npu/run_qwen2_5_05b_sft_peft_sp2.sh b/tests/special_npu/run_qwen2_5_05b_sft_peft_sp2.sh index 5af44c9907a..cb3aacf7190 100644 --- a/tests/special_npu/run_qwen2_5_05b_sft_peft_sp2.sh +++ b/tests/special_npu/run_qwen2_5_05b_sft_peft_sp2.sh @@ -5,7 +5,7 @@ mkdir -p ./save_ckpts MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct} MODEL_PATH=${MODEL_PATH:-${HOME}/.cache/models/${MODEL_ID}} -torchrun --standalone --nnodes=1 --nproc_per_node=8 \ +torchrun --standalone --nnodes=1 --nproc_per_node=2 \ -m verl.trainer.fsdp_sft_trainer \ data.train_files=$HOME/data/gsm8k/train.parquet \ data.val_files=$HOME/data/gsm8k/test.parquet \ @@ -27,7 +27,6 @@ torchrun --standalone --nnodes=1 --nproc_per_node=8 \ model.target_modules=all-linear \ model.strategy=fsdp \ ulysses_sequence_parallel_size=2 \ - use_remove_padding=true \ - trainer.device=npu + use_remove_padding=true rm -rf ./outputs ./save_ckpts diff --git a/tests/special_npu/run_qwen2_5_vl_3b_npu.sh b/tests/special_npu/run_qwen2_5_vl_3b_npu.sh index 10ffdf3747f..aca2dd6e5a4 100644 --- a/tests/special_npu/run_qwen2_5_vl_3b_npu.sh +++ b/tests/special_npu/run_qwen2_5_vl_3b_npu.sh @@ -54,5 +54,4 @@ python3 -m 
verl.trainer.main_ppo \ trainer.save_freq=-1 \ trainer.test_freq=-1 \ trainer.total_epochs=1 \ - trainer.total_training_steps=1 \ - trainer.device=npu $@ \ No newline at end of file + trainer.total_training_steps=1 $@ \ No newline at end of file diff --git a/tests/special_npu/run_qwen3_06b_ppo.sh b/tests/special_npu/run_qwen3_06b_ppo.sh index 284ad091e84..2c446379b9a 100644 --- a/tests/special_npu/run_qwen3_06b_ppo.sh +++ b/tests/special_npu/run_qwen3_06b_ppo.sh @@ -49,5 +49,4 @@ python3 -m verl.trainer.main_ppo \ trainer.save_freq=-1 \ trainer.test_freq=-1 \ trainer.total_epochs=1 \ - trainer.total_training_steps=1 \ - trainer.device=npu $@ + trainer.total_training_steps=1 $@ diff --git a/tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh b/tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh index aece3d11471..cd06eba18ce 100644 --- a/tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh +++ b/tests/special_npu/run_qwen3_30b_dapo_mindspeed.sh @@ -125,7 +125,6 @@ python3 -m recipe.dapo.main_dapo \ trainer.test_freq=-1 \ trainer.total_epochs=1 \ trainer.total_training_steps=1 \ - trainer.device=npu \ actor_rollout_ref.actor.use_torch_compile=False \ actor_rollout_ref.ref.use_torch_compile=False \ +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True $@ diff --git a/tests/special_sanity/check_pr_title.py b/tests/special_sanity/check_pr_title.py index cabc2f50d85..601484dc4bd 100644 --- a/tests/special_sanity/check_pr_title.py +++ b/tests/special_sanity/check_pr_title.py @@ -22,7 +22,7 @@ allowed_modules = ["fsdp", "megatron", "sglang", "vllm", "rollout", "trainer"] allowed_modules += ["tests", "training_utils", "recipe", "hardware", "deployment"] allowed_modules += ["ray", "worker", "single_controller", "misc", "docker", "ci"] -allowed_modules += ["perf", "model", "algo", "env", "tool", "ckpt", "doc", "data", "cfg"] +allowed_modules += ["perf", "model", "algo", "env", "tool", "ckpt", "doc", "data", "cfg", "reward"] allowed_types = ["feat", "fix", 
"refactor", "chore", "test"] # Check for [1/N] prefix and extract the rest of the title diff --git a/tests/test_protocol_v2_on_cpu.py b/tests/test_protocol_v2_on_cpu.py index 831a89935c0..bc6788387ab 100644 --- a/tests/test_protocol_v2_on_cpu.py +++ b/tests/test_protocol_v2_on_cpu.py @@ -247,15 +247,35 @@ def test_tensordict_eq(): def test_tensor_dict_make_iterator(): obs = torch.tensor([1, 2, 3, 4, 5, 6]) + input_ids = torch.nested.as_nested_tensor( + [ + torch.tensor([0, 1]), + torch.tensor([2]), + torch.tensor([3, 4]), + torch.tensor([5]), + torch.tensor([6, 7, 8]), + torch.tensor([9]), + ], + layout=torch.jagged, + ) data_sources = ["abc", "def", "abc", "def", "pol", "klj"] non_tensor_dict = {"train_sample_kwargs": {"top_p": 1.0}, "val_sample_kwargs": {"top_p": 0.7}} - dataset = tu.get_tensordict({"obs": obs, "data_sources": data_sources}, non_tensor_dict=non_tensor_dict) + dataset = tu.get_tensordict( + {"obs": obs, "data_sources": data_sources, "input_ids": input_ids}, non_tensor_dict=non_tensor_dict + ) dataloader = tu.make_iterator( dataset, mini_batch_size=2, epochs=2, seed=0, dataloader_kwargs={"shuffle": False, "drop_last": False} ) - expected_tensor_dict = [dataset[0:2], dataset[2:4], dataset[4:6], dataset[0:2], dataset[2:4], dataset[4:6]] + expected_tensor_dict = [ + tu.index_select_tensor_dict(dataset, indices=list(range(0, 2))), + tu.index_select_tensor_dict(dataset, indices=list(range(2, 4))), + tu.index_select_tensor_dict(dataset, indices=list(range(4, 6))), + tu.index_select_tensor_dict(dataset, indices=list(range(0, 2))), + tu.index_select_tensor_dict(dataset, indices=list(range(2, 4))), + tu.index_select_tensor_dict(dataset, indices=list(range(4, 6))), + ] i = 0 @@ -721,6 +741,55 @@ def test_concat_tensordict(): assert output["temp"] == 1.0 +def test_chunk_tensordict(): + # Qwen-VL 3d position_ids + position_ids = torch.nested.as_nested_tensor( + [ + torch.arange(4).expand(4, 4), + torch.arange(5).expand(4, 5), + torch.arange(6).expand(4, 6), + 
torch.arange(7).expand(4, 7), + ], + layout=torch.jagged, + ) + input_ids = torch.nested.as_nested_tensor( + [torch.arange(4), torch.arange(5), torch.arange(6), torch.arange(7)], layout=torch.jagged + ) + + multi_modal_inputs = torch.stack( + [ + NonTensorData({"pixel_values": torch.randn(3, 224, 224)}), + NonTensorData(None), + NonTensorData({"pixel_values": torch.randn(3, 128, 128)}), + NonTensorData({"pixel_values": torch.randn(3, 128, 128)}), + ] + ) + td = tu.get_tensordict( + { + "input_ids": input_ids, + "position_ids": position_ids, + "multi_modal_inputs": multi_modal_inputs, + }, + ) + assert len(td) == 4 + chunks = tu.chunk_tensordict(td, chunks=2) + + for i, chunk in enumerate(chunks): + assert len(chunk) == 2 + for key, val in chunk.items(): + if isinstance(val, torch.Tensor) and val.is_nested: + tensors = td[key].unbind(dim=0) + expected = torch.nested.as_nested_tensor(tensors[i * 2 : (i + 1) * 2], layout=torch.jagged) + assert torch.all(torch.eq(val.values(), expected.values())).item() + else: + expected = td[key][i * 2 : (i + 1) * 2] + for tensor, expect in zip(val, expected, strict=False): + if tensor.data is None: + assert expect is None + else: + assert torch.all(torch.eq(tensor.data["pixel_values"], expect["pixel_values"])).item() + + def test_assign_non_tensor_stack_with_nested_lists(): """Test assign_non_tensor_stack with lists of lists.""" td = tu.get_tensordict({"obs": torch.randn(3, 4)}, non_tensor_dict={}) diff --git a/tests/trainer/config/legacy_ppo_megatron_trainer.yaml b/tests/trainer/config/legacy_ppo_megatron_trainer.yaml index ea2a15d685e..06e2e94a662 100644 --- a/tests/trainer/config/legacy_ppo_megatron_trainer.yaml +++ b/tests/trainer/config/legacy_ppo_megatron_trainer.yaml @@ -111,7 +111,7 @@ actor_rollout_ref: dist_checkpointing_path: null seed: 42 override_transformer_config: {} # additional transformer config like: num_layers_in_first(/last)_pipeline_stage - use_mbridge: False + use_mbridge: True vanilla_mbridge: True profile: # 
profile the actor model in `update_policy` use_profile: False # open it when you want to profile the actor model diff --git a/tests/trainer/config/legacy_ppo_trainer.yaml b/tests/trainer/config/legacy_ppo_trainer.yaml index 3139e8a39db..c09e06e978d 100644 --- a/tests/trainer/config/legacy_ppo_trainer.yaml +++ b/tests/trainer/config/legacy_ppo_trainer.yaml @@ -165,7 +165,7 @@ actor_rollout_ref: enable_activation_offload: false # Whether to remove padding tokens in inputs during training - use_remove_padding: false + use_remove_padding: true # Set to positive value to enable LoRA (e.g., 32) lora_rank: 0 diff --git a/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py b/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py index 0c5bbb65084..6962e124000 100644 --- a/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py +++ b/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py @@ -16,28 +16,46 @@ """ import os +from io import BytesIO import pandas as pd +import pytest import torch -from transformers import AutoTokenizer - +from PIL import Image +from tensordict import TensorDict +from torch.utils.data import DistributedSampler +from torchdata.stateful_dataloader import StatefulDataLoader +from transformers import AutoProcessor, AutoTokenizer +from transformers.utils import get_json_schema + +from verl.utils.dataset.dataset_utils import DatasetPadMode, SFTTensorCollator from verl.utils.dataset.multiturn_sft_dataset import MultiTurnSFTDataset - - -def test_multiturn_sft_dataset(): - print("Starting test...") +from verl.utils.model import extract_multi_modal_inputs + + +@pytest.mark.parametrize( + "model_path", + [ + "Qwen/Qwen2.5-0.5B", + "Qwen/Qwen2.5-Coder-7B-Instruct", + "Qwen/Qwen3-30B-A3B-Instruct-2507", + # "Qwen/Qwen3-30B-A3B-Thinking-2507" # Thinking series models add tags to last turn. 
+ ], +) +@pytest.mark.parametrize("enable_thinking", [False, True]) +def test_multiturn_sft_dataset(model_path: str, enable_thinking: bool): + print(f"Starting test... model_path={model_path}, enable_thinking={enable_thinking}") # Create a temporary parquet file with test data test_data = { "messages": [ [ - {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What is 2+2?"}, {"role": "assistant", "content": "2+2 equals 4."}, {"role": "user", "content": "And what is 4+4?"}, {"role": "assistant", "content": "4+4 equals 8."}, ], [ - {"role": "system", "content": "You are a helpful assistant."}, + {"role": "system", "content": "You are a powerful assistant."}, {"role": "user", "content": "Tell me a joke."}, {"role": "assistant", "content": "Why did the chicken cross the road?"}, {"role": "user", "content": "Why?"}, @@ -55,8 +73,13 @@ def test_multiturn_sft_dataset(): df.to_parquet(test_file) # Initialize tokenizer and dataset - tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-7B-Instruct") - config = {"max_length": 512, "truncation": "error", "multiturn": {"messages_key": "messages"}} + tokenizer = AutoTokenizer.from_pretrained(model_path) + config = { + "max_length": 512, + "truncation": "error", + "multiturn": {"messages_key": "messages"}, + "apply_chat_template_kwargs": {"enable_thinking": enable_thinking}, + } dataset = MultiTurnSFTDataset(parquet_files=test_file, tokenizer=tokenizer, config=config) # Test 1: Dataset Length @@ -200,3 +223,220 @@ def test_multiturn_sft_dataset(): print("All tests passed!") print("Starting test...") + + +def generate_image(description: str, size: str = "256x256"): + """Generate a simple image based on description. + + Args: + description: The description of the image to generate. + size: The size of the image. Defaults to "256x256". (choices: ["256x256", "512x512"]) + + Returns: + A generated image + """ + ... 
+ + +@pytest.fixture +def vlm_data_file(): + test_data = [ + # sample 0: single turn with image input + { + "messages": [ + { + "role": "user", + "content": "Describe this image.", + }, + { + "role": "assistant", + "content": "The image is a red square.", + }, + ], + "images": [Image.new("RGB", (300, 300), color="red")], + "tools": [], + }, + # sample 1: single turn with multiple images input + { + "messages": [ + { + "role": "user", + "content": "Compare these images.", + }, + { + "role": "assistant", + "content": "The first image is a red square and the second image is a green square.", + }, + ], + "images": [Image.new("RGB", (100, 100), color="red"), Image.new("RGB", (100, 300), color="green")], + "tools": [], + }, + # sample 2: multi turn with image input and tool generated image + { + "messages": [ + { + "role": "user", + "content": "Describe this image.", + }, + { + "role": "assistant", + "content": "Let's generate a zoom-in image.", + "tool_calls": [ + { + "function": {"arguments": '{"bbox_2d": "[0, 1, 2, 4]"}', "name": "image_zoom_in_tool"}, + "type": "function", + } + ], + }, + { + "role": "tool", + "content": "Generated image.", + }, + {"role": "assistant", "content": "The zoom-in image is a red square."}, + ], + "images": [Image.new("RGB", (300, 500), color="red"), Image.new("RGB", (100, 100), color="red")], + "tools": [get_json_schema(generate_image)], + }, + # sample 3: single turn without image input + { + "messages": [ + {"role": "user", "content": "How is the weather today?"}, + {"role": "assistant", "content": "The weather is sunny."}, + ], + "images": [], + "tools": [], + }, + ] + + # Create test directory if it doesn't exist + os.makedirs("test_data", exist_ok=True) + test_file = "test_data/test_vlm.parquet" + + # Save test data to parquet + df = pd.DataFrame(test_data) + + def serialize_image(img): + if isinstance(img, Image.Image): + img_byte_arr = BytesIO() + img.save(img_byte_arr, format="PNG") + return {"bytes": img_byte_arr.getvalue()} + 
return img + + df["images"] = df["images"].apply(lambda x: [serialize_image(img) for img in x]) + + df.to_parquet(test_file) + return test_file + + +def test_multiturn_sft_vlm_dataset_on_cpu(vlm_data_file): + df = pd.read_parquet(vlm_data_file) + model_path = "Qwen/Qwen3-VL-2B-Instruct" + tokenizer = AutoTokenizer.from_pretrained(model_path) + processor = AutoProcessor.from_pretrained(model_path) + config = {"max_length": 512, "pad_mode": "no_padding", "truncation": "error", "messages_key": "messages"} + dataset = MultiTurnSFTDataset(parquet_files=vlm_data_file, tokenizer=tokenizer, config=config, processor=processor) + assert dataset.pad_mode == DatasetPadMode.NO_PADDING + + for i in range(len(dataset)): + item = dataset[i] + input_ids = item["input_ids"] + loss_mask = item["loss_mask"] + position_ids = item["position_ids"] + pixel_values = item.get("multi_modal_inputs", {}).get("pixel_values") + image_grid_thw = item.get("multi_modal_inputs", {}).get("image_grid_thw") + + assert input_ids.shape == loss_mask.shape, "Shapes of input_ids and loss_mask must be equal" + assert position_ids.dim() == 2, "position_ids must be 2-dimensional" + assert position_ids.shape[0] == 4, f"position_ids[0] should be 4: {position_ids[0]}" + assert position_ids.shape[1] == input_ids.shape[0] + + # 1. verify input_ids without assistant text + text = tokenizer.decode(input_ids[loss_mask == 0], skip_special_tokens=True) + print(f"Text without assistant: {repr(text)}") + for message in df["messages"][i]: + if message["role"] != "assistant": + content = message["content"].replace("", "") + assert content in text, f"user/tool text should be in the input_ids: {text}" + + # 2. 
verify input_ids with assistant text + text = tokenizer.decode(input_ids[loss_mask == 1], skip_special_tokens=True) + print(f"Text with assistant: {repr(text)}") + for message in df["messages"][i]: + if message["role"] == "assistant": + assert message["content"] in text, f"Assistant text should be in the input_ids: {text}" + assert "assistant" not in text, f"Assistant token should not be in the input_ids: {text}" + + # 3. verify image token match with image_grid_thw + if len(df["images"][i]) > 0: + patch_size = processor.image_processor.patch_size + temporal_patch_size = processor.image_processor.temporal_patch_size + merge_size = processor.image_processor.merge_size + num_patches = image_grid_thw.prod(dim=1).sum() + assert image_grid_thw.shape == (len(df["images"][i]), 3), ( + f"image_grid_thw: {image_grid_thw.shape} should have shape ({len(df['images'][i])}, 3)" + ) + assert pixel_values.shape == (num_patches, 3 * temporal_patch_size * patch_size * patch_size), ( + f"pixel_values: {pixel_values.shape} should have shape ({num_patches}, {3 * patch_size * patch_size})" + ) + assert (input_ids == processor.image_token_id).sum() == num_patches // (merge_size**2) + else: + assert pixel_values is None, "pixel_values should be None when no image is provided" + assert image_grid_thw is None, "image_grid_thw should be None when no image is provided" + + +def test_multiturn_sft_vlm_dataloader_on_cpu(vlm_data_file): + df = pd.read_parquet(vlm_data_file) + model_path = "Qwen/Qwen3-VL-2B-Instruct" + tokenizer = AutoTokenizer.from_pretrained(model_path) + processor = AutoProcessor.from_pretrained(model_path) + config = {"max_length": 512, "pad_mode": "no_padding", "truncation": "error", "messages_key": "messages"} + dataset = MultiTurnSFTDataset(parquet_files=vlm_data_file, tokenizer=tokenizer, config=config, processor=processor) + assert dataset.pad_mode == DatasetPadMode.NO_PADDING + + collate_fn = SFTTensorCollator(DatasetPadMode.NO_PADDING) + sampler = 
DistributedSampler(dataset, shuffle=False, num_replicas=1, rank=0, drop_last=True) + batch_size = 2 + dataloader = StatefulDataLoader( + dataset=dataset, + batch_size=batch_size, + sampler=sampler, + collate_fn=collate_fn, + num_workers=0, + pin_memory=False, + drop_last=True, + ) + + for i, batch in enumerate(dataloader): + # 1. verify input_ids, loss_mask + input_ids = batch["input_ids"] + loss_mask = batch["loss_mask"] + assert input_ids.is_nested, "input_ids should be a nested tensor" + assert loss_mask.is_nested, "loss_mask should be a nested tensor" + assert input_ids.shape[0] == loss_mask.shape[0] == batch_size, "Shapes of input_ids, loss_mask must be equal" + + # 2. verify position_ids: (bs, 4, seq_len) + position_ids = batch["position_ids"] + assert position_ids.is_nested, "position_ids should be a nested tensor" + assert position_ids.dim() == 3, "position_ids must be 3-dimensional" + assert position_ids.shape[0] == batch_size + assert position_ids.shape[1] == 4 + values = position_ids.values() + assert values.shape == (4, len(input_ids.values())) + + # 3. 
verify multi-modal data + td = TensorDict(**batch, batch_size=batch_size) + multi_modal_inputs = extract_multi_modal_inputs(td["multi_modal_inputs"]) + pixel_values = multi_modal_inputs["pixel_values"] + image_grid_thw = multi_modal_inputs["image_grid_thw"] + + num_images = sum([len(images) for images in df["images"][i * batch_size : (i + 1) * batch_size]]) + assert image_grid_thw.shape == (num_images, 3), ( + f"image_grid_thw: {image_grid_thw.shape} should have shape ({num_images}, 3)" + ) + patch_size = processor.image_processor.patch_size + temporal_patch_size = processor.image_processor.temporal_patch_size + num_patches = image_grid_thw.prod(dim=1).sum() + assert pixel_values.shape[0] == num_patches, ( + f"pixel_values: {pixel_values.shape} should have shape " + f"({num_patches}, 3 * {temporal_patch_size} * {patch_size} * {patch_size})" + ) diff --git a/tests/utils/test_mlflow_key_sanitization.py b/tests/utils/test_mlflow_key_sanitization.py index 54605db241d..daf457869e3 100644 --- a/tests/utils/test_mlflow_key_sanitization.py +++ b/tests/utils/test_mlflow_key_sanitization.py @@ -20,24 +20,44 @@ class TestMlflowLoggingAdapter(unittest.TestCase): def test_sanitize_key_and_warning(self): + """Test key sanitization for invalid characters and consecutive slashes with warnings.""" adapter = _MlflowLoggingAdapter() - data = {"valid_key": 1.0, "invalid@key!": 2.0, "another/valid-key": 3.0, "bad key#": 4.0} + data = { + "valid_key": 1.0, + "invalid@key!": 2.0, + "another/valid-key": 3.0, + "bad key#": 4.0, + "val-aux//reward/mean_at_1": 5.0, + "val-core///acc/best_at_5": 6.0, + "metric////with/many////slashes": 7.0, + } # Patch mlflow.log_metrics to capture the metrics actually sent with ( patch("mlflow.log_metrics") as mock_log_metrics, patch.object(adapter, "logger") as mock_logger, ): adapter.log(data, step=5) - # Check that keys are sanitized + # Check that invalid characters are sanitized sent_metrics = mock_log_metrics.call_args[1]["metrics"] 
self.assertIn("invalid_at_key_", sent_metrics) # @ becomes _at_, ! becomes _ self.assertIn("bad key_", sent_metrics) # # becomes _, space remains self.assertNotIn("invalid@key!", sent_metrics) self.assertNotIn("bad key#", sent_metrics) - # Check that a warning was logged for each sanitized key + # Check that consecutive slashes are collapsed to single slashes + self.assertIn("val-aux/reward/mean_at_1", sent_metrics) + self.assertIn("val-core/acc/best_at_5", sent_metrics) + self.assertIn("metric/with/many/slashes", sent_metrics) + self.assertNotIn("val-aux//reward/mean_at_1", sent_metrics) + self.assertNotIn("val-core///acc/best_at_5", sent_metrics) + # Check that warnings were logged for all sanitized keys warning_msgs = [str(call) for call in mock_logger.warning.call_args_list] + # Warnings for invalid characters self.assertTrue(any("invalid@key!" in msg and "invalid_at_key_" in msg for msg in warning_msgs)) self.assertTrue(any("bad key#" in msg and "bad key_" in msg for msg in warning_msgs)) + # Warnings for consecutive slashes + self.assertTrue(any("val-aux//reward/mean_at_1" in msg for msg in warning_msgs)) + self.assertTrue(any("val-core///acc/best_at_5" in msg for msg in warning_msgs)) + self.assertTrue(any("metric////with/many////slashes" in msg for msg in warning_msgs)) if __name__ == "__main__": diff --git a/tests/workers/config/test_critic_config_on_cpu.py b/tests/workers/config/test_critic_config_on_cpu.py index d762763e0f1..fb03560e0f4 100644 --- a/tests/workers/config/test_critic_config_on_cpu.py +++ b/tests/workers/config/test_critic_config_on_cpu.py @@ -30,6 +30,7 @@ ) +@pytest.mark.skip(reason="This test is flaky when we actively load model config") class TestCriticConfig: """Test suite for critic configuration dataclasses.""" diff --git a/tests/workers/rollout/rollout_vllm/test_vllm_abort.py b/tests/workers/rollout/rollout_vllm/test_vllm_abort.py new file mode 100644 index 00000000000..82034f1e905 --- /dev/null +++ 
b/tests/workers/rollout/rollout_vllm/test_vllm_abort.py @@ -0,0 +1,217 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Test vLLM abort functionality. + +Usage: + pytest tests/workers/rollout/rollout_vllm/test_vllm_abort.py -v -s + or + python tests/workers/rollout/rollout_vllm/test_vllm_abort.py +""" + +import asyncio +import os +import time +from uuid import uuid4 + + +def test_vllm_abort(): + # ==================== Configuration ==================== + MODEL_PATH = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct") # /root/models/Qwen/Qwen2.5-1.5B-Instruct + GPUS_PER_NODE = 2 + TP_SIZE = 1 + ROLLOUT_NAME = "vllm" + ABORT_DELAY = 0.5 # seconds to wait before aborting + + print("=" * 60) + print("vLLM Abort Test") + print("=" * 60) + print(f"Model: {MODEL_PATH}") + print(f"GPUs: {GPUS_PER_NODE}, TP Size: {TP_SIZE}") + print(f"Abort Delay: {ABORT_DELAY}s") + print("=" * 60) + + # ==================== Initialize Ray ==================== + print("\n[1] Initializing Ray...") + import ray + + ray.init( + runtime_env={ + "env_vars": { + "TOKENIZERS_PARALLELISM": "true", + "NCCL_DEBUG": "WARN", + "VLLM_LOGGING_LEVEL": "INFO", + "VLLM_USE_V1": "1", + } + }, + ignore_reinit_error=True, + ) + + try: + # ==================== Create Config ==================== + print("\n[2] Creating config...") + from hydra import compose, initialize_config_dir + + config_dir = os.path.abspath("verl/verl/trainer/config") + if 
not os.path.exists(config_dir): + config_dir = os.path.abspath("verl/trainer/config") + + with initialize_config_dir(config_dir=config_dir, version_base=None): + config = compose(config_name="ppo_trainer") + + config.trainer.n_gpus_per_node = GPUS_PER_NODE + config.trainer.nnodes = 1 + config.actor_rollout_ref.model.path = MODEL_PATH + config.actor_rollout_ref.rollout.name = ROLLOUT_NAME + config.actor_rollout_ref.rollout.mode = "async" + config.actor_rollout_ref.rollout.tensor_model_parallel_size = TP_SIZE + config.actor_rollout_ref.rollout.prompt_length = 512 + config.actor_rollout_ref.rollout.response_length = 512 # Longer for abort test + + # ==================== Create Rollout Server ==================== + print("\n[3] Creating rollout server (this may take a while)...") + from verl.workers.rollout.replica import get_rollout_replica_class + + rollout_config = config.actor_rollout_ref.rollout + model_config = config.actor_rollout_ref.model + + rollout_server_class = get_rollout_replica_class(ROLLOUT_NAME) + server = rollout_server_class( + replica_rank=0, + config=rollout_config, + model_config=model_config, + gpus_per_node=GPUS_PER_NODE, + ) + + asyncio.run(server.init_standalone()) + server_handle = server._server_handle + print(f"Server address: {server._server_address}") + + # ==================== Load Tokenizer ==================== + print("\n[4] Loading tokenizer...") + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True) + + # ==================== Prepare Prompts ==================== + print("\n[5] Preparing prompts (to ensure generation takes time)...") + NUM_PROMPTS = 8 + prompts = [ + "Write a very long story about a brave knight and dragon.", + "Explain the history of the Roman Empire in great detail.", + "Describe quantum computing and its applications thoroughly.", + "Write an essay about climate change and its global effects.", + "Who won the Champions League in 2019?", + "Write a 
detailed analysis of Shakespeare's Hamlet.", + "Describe the process of photosynthesis in plants.", + "Write about the French Revolution and its consequences.", + ] + + all_prompt_ids = [] + for prompt in prompts[:NUM_PROMPTS]: + messages = [{"role": "user", "content": prompt}] + prompt_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True) + all_prompt_ids.append(prompt_ids) + print(f"Prepared {NUM_PROMPTS} prompts") + + # ==================== Start Generations and Abort ==================== + print("\n[6] Starting generations and then aborting...") + + sampling_params = { + "temperature": 1.0, + "top_p": 1.0, + "logprobs": False, + } + + # Start all generations concurrently + print(f"\n Starting {NUM_PROMPTS} generations...") + generate_refs = [] + for i, prompt_ids in enumerate(all_prompt_ids): + request_id = f"abort_test_{i}_{uuid4().hex[:8]}" + ref = server_handle.generate.remote( + request_id=request_id, + prompt_ids=prompt_ids, + sampling_params=sampling_params, + image_data=None, + ) + generate_refs.append((i, request_id, ref)) + print(f" Started request {i}: {request_id}") + + # Wait before aborting + print(f"\n Waiting {ABORT_DELAY}s before abort...") + time.sleep(ABORT_DELAY) + + # Call abort + print(" Calling abort_all_requests...") + abort_start = time.perf_counter() + abort_result = ray.get(server_handle.abort_all_requests.remote()) + abort_time = time.perf_counter() - abort_start + + print(f" Abort took: {abort_time * 1000:.2f}ms") + print(f" Abort result: {abort_result}") + + # Wait for all generations to finish + print("\n Waiting for all generations to complete...") + outputs = [] + for i, request_id, ref in generate_refs: + try: + output = ray.get(ref, timeout=10.0) + outputs.append((i, request_id, output)) + except ray.exceptions.GetTimeoutError: + print(f" Request {i} timed out!") + outputs.append((i, request_id, None)) + + # ==================== Print Results ==================== + print("\n" + "=" * 60) + 
print("RESULTS") + print("=" * 60) + + aborted_count = 0 + completed_count = 0 + timeout_count = 0 + + for i, request_id, output in outputs: + if output is None: + timeout_count += 1 + print(f"[{i}] {request_id}: TIMEOUT") + elif output.stop_reason == "aborted": + aborted_count += 1 + print(f"[{i}] {request_id}: ABORTED ({len(output.token_ids)} tokens)") + print(f"Partial Output: {tokenizer.decode(output.token_ids)}") + else: + completed_count += 1 + print(f"[{i}] {request_id}: COMPLETED ({output.stop_reason}, {len(output.token_ids)} tokens)") + print(f"Full Output: {tokenizer.decode(output.token_ids)}") + + print(f"\nSummary: {aborted_count} aborted, {completed_count} completed, {timeout_count} timeout") + + print("\n" + "=" * 60) + print(f"Abort result: {abort_result}") + print("=" * 60) + print("Abort test completed!") + + # Assertions for pytest + assert timeout_count == 0, "No requests should timeout" + assert aborted_count + completed_count == NUM_PROMPTS, "All requests should finish" + assert "aborted_count" in abort_result, "Abort result should contain aborted_count" + assert abort_time < 1.0, "Abort should be fast (< 1 second)" + + finally: + print("\nShutting down Ray...") + ray.shutdown() + + +if __name__ == "__main__": + # Can still run as standalone script + test_vllm_abort() diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 42e456acf6f..ec0ed25afb5 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -70,7 +70,7 @@ def __init__(self, config: DictConfig, server_handles: list[ray.actor.ActorHandl random.shuffle(self.server_handles) # Least requests load balancing - self.weighted_serveres = [[0, (hash(server), server)] for server in server_handles] + self.weighted_serveres = [[0, idx, server] for idx, server in enumerate(self.server_handles)] heapq.heapify(self.weighted_serveres) # LRU cache to map request_id to server @@ -81,7 +81,7 @@ def 
_choose_server(self, request_id: str) -> ray.actor.ActorHandle: if request_id in self.request_id_to_server: return self.request_id_to_server[request_id] - server = self.weighted_serveres[0][1][1] + _, _, server = self.weighted_serveres[0] self.weighted_serveres[0][0] += 1 heapq.heapreplace(self.weighted_serveres, self.weighted_serveres[0]) self.request_id_to_server[request_id] = server @@ -175,9 +175,10 @@ class _InternalAgentLoopOutput(AgentLoopOutput): """Extra fields for dynamic addition.""" -# make hydra.utils.instantiate happy -class _DummyConfig: - def __init__(self, config: DictConfig) -> None: +class DictConfigWrap: + """Wrapper for DictConfig to avoid hydra.utils.instantiate recursive resolve.""" + + def __init__(self, config: DictConfig): self.config = config @@ -185,11 +186,9 @@ class AgentLoopBase(ABC): """An agent loop takes an input message, chat with OpenAI compatible LLM server and interact with various environments.""" - _class_initialized = False - def __init__( self, - trainer_config: _DummyConfig, + trainer_config: DictConfigWrap, server_manager: AsyncLLMServerManager, tokenizer: AutoTokenizer, processor: AutoProcessor, @@ -198,32 +197,17 @@ def __init__( """Initialize agent loop, each sample will have its own loop instance. Args: - trainer_config (_DummyConfig): trainer config. + trainer_config (DictConfigWrap): trainer config. server_manager (AsyncLLMServerManager): OpenAI compatible LLM server manager. tokenizer (AutoTokenizer): Tokenizer for tokenize messages. processor (AutoProcessor): Processor for process messages. 
""" - self.init_class(config=trainer_config.config, tokenizer=tokenizer, processor=processor, **kwargs) self.config = trainer_config.config self.server_manager = server_manager self.tokenizer = tokenizer self.processor = processor self.loop = asyncio.get_running_loop() - @classmethod - def init_class(cls, config: DictConfig, tokenizer: AutoTokenizer, processor: AutoProcessor, **kwargs): - """This is used to do heavy initialization work that should shared across all instances. It's only called once. - - Args: - config (DictConfig): trainer config. - tokenizer (AutoTokenizer): Tokenizer for tokenize messages. - processor (AutoProcessor): Processor for process multi_modal data. - **kwargs: extra kwargs from config file passed in by `hydra.utils.instantiate`. - """ - if cls._class_initialized: - return - cls._class_initialized = True - @abstractmethod async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput: """Run agent loop to interact with LLM server and environment. 
@@ -297,12 +281,15 @@ def __init__( self.processor.chat_template = self.config.actor_rollout_ref.model.custom_chat_template self.tokenizer.chat_template = self.config.actor_rollout_ref.model.custom_chat_template - self.reward_manager_worker = RewardLoopWorker.options( - scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( - node_id=ray.get_runtime_context().get_node_id(), - soft=False, - ), - ).remote(self.config, self.reward_router_address) + use_reward_loop = True if self.config.reward_model.use_reward_loop else None + self.use_reward_loop = use_reward_loop + if use_reward_loop and not hasattr(self, "reward_loop_worker"): + self.reward_loop_worker = RewardLoopWorker.options( + scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( + node_id=ray.get_runtime_context().get_node_id(), + soft=False, + ), + ).remote(self.config, self.reward_router_address) trace_config = self.config.actor_rollout_ref.rollout.get("trace", {}) RolloutTraceConfig.init( @@ -417,7 +404,7 @@ async def _run_agent_loop( agent_loop_config = _agent_loop_registry[agent_name] agent_loop = hydra.utils.instantiate( config=agent_loop_config, - trainer_config=_DummyConfig(config=self.config), + trainer_config=DictConfigWrap(config=self.config), server_manager=self.server_manager, tokenizer=self.tokenizer, processor=self.processor, @@ -551,7 +538,7 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO enable_async_reward = ( self.reward_router_address is not None and self.config.reward_model.enable_resource_pool ) or not self.config.reward_model.enable - if output.reward_score is None and enable_async_reward: + if output.reward_score is None and enable_async_reward and self.use_reward_loop: batch = TensorDict( { "prompts": prompt_output["input_ids"], # [1, prompt_length] @@ -572,7 +559,7 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO batch=batch, 
non_tensor_batch=non_tensor_batch, ) - result = await self.reward_manager_worker.compute_score.remote(data) + result = await self.reward_loop_worker.compute_score.remote(data) output.reward_score = result["reward_score"] output.extra_fields["reward_extra_info"] = result["reward_extra_info"] diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py index e107bb37b51..ef08376df75 100644 --- a/verl/experimental/agent_loop/tool_agent_loop.py +++ b/verl/experimental/agent_loop/tool_agent_loop.py @@ -20,9 +20,17 @@ from typing import Any, Optional from uuid import uuid4 -from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register +from transformers import AutoProcessor, AutoTokenizer + +from verl.experimental.agent_loop.agent_loop import ( + AgentLoopBase, + AgentLoopOutput, + AsyncLLMServerManager, + DictConfigWrap, + register, +) from verl.experimental.agent_loop.tool_parser import FunctionCall, ToolParser -from verl.experimental.agent_loop.utils import add_generation_prompt_for_gpt_oss, format_gpt_oss_tool_response_manually +from verl.experimental.agent_loop.utils import build_gpt_oss_tool_response_text from verl.interactions.base import BaseInteraction from verl.interactions.utils.interaction_registry import initialize_interactions_from_config from verl.tools.schemas import ToolResponse @@ -44,7 +52,8 @@ class AgentState(Enum): class AgentData: - """Encapsulates all state variables for the agent loop.""" + """Encapsulates all state variables for the agent loop. AgentData is passed to tool calling in case that + tool may need to access full history state. 
User can store any tool session data in `extra_fields`.""" def __init__( self, @@ -77,44 +86,49 @@ def __init__( # Temporary state for tool calls self.tool_calls: list[FunctionCall] = [] - # Extra fields for dynamic addition + # Extra fields for dynamic addition, e.g., tool session data self.extra_fields: dict[str, Any] = {} @register("tool_agent") class ToolAgentLoop(AgentLoopBase): - @classmethod - def init_class(cls, config, tokenizer, processor, **kwargs): - if cls._class_initialized: - return - cls._class_initialized = True - print("Performing class-level ToolAgentLoop initialization") + def __init__( + self, + trainer_config: DictConfigWrap, + server_manager: AsyncLLMServerManager, + tokenizer: AutoTokenizer, + processor: AutoProcessor, + **kwargs, + ): + super().__init__(trainer_config, server_manager, tokenizer, processor, **kwargs) + config = trainer_config.config # Initialize tools from config file - cls.tokenizer = tokenizer - cls.processor = processor - cls.max_user_turns = config.actor_rollout_ref.rollout.multi_turn.max_user_turns - cls.max_assistant_turns = config.actor_rollout_ref.rollout.multi_turn.max_assistant_turns - cls.max_parallel_calls = config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls - cls.max_tool_response_length = config.actor_rollout_ref.rollout.multi_turn.max_tool_response_length - cls.tool_response_truncate_side = config.actor_rollout_ref.rollout.multi_turn.tool_response_truncate_side + self.max_user_turns = config.actor_rollout_ref.rollout.multi_turn.max_user_turns + self.max_assistant_turns = config.actor_rollout_ref.rollout.multi_turn.max_assistant_turns + self.max_parallel_calls = config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls + self.max_tool_response_length = config.actor_rollout_ref.rollout.multi_turn.max_tool_response_length + self.tool_response_truncate_side = config.actor_rollout_ref.rollout.multi_turn.tool_response_truncate_side tool_config_path = 
config.actor_rollout_ref.rollout.multi_turn.tool_config_path tool_list = initialize_tools_from_config(tool_config_path) if tool_config_path else [] - cls.tools = {tool.name: tool for tool in tool_list} - cls.tool_schemas = [tool.tool_schema.model_dump(exclude_unset=True, exclude_none=True) for tool in tool_list] - cls.tool_parser = ToolParser.get_tool_parser(config.actor_rollout_ref.rollout.multi_turn.format, cls.tokenizer) - cls.tool_parser_name = config.actor_rollout_ref.rollout.multi_turn.format - print(f"Initialized tools: {cls.tools}") + self.tools = {tool.name: tool for tool in tool_list} + self.tool_schemas = [tool.tool_schema.model_dump(exclude_unset=True, exclude_none=True) for tool in tool_list] + self.tool_parser = ToolParser.get_tool_parser( + config.actor_rollout_ref.rollout.multi_turn.format, self.tokenizer + ) + self.tool_parser_name = config.actor_rollout_ref.rollout.multi_turn.format - cls.apply_chat_template_kwargs = config.data.get("apply_chat_template_kwargs", {}) - cls.prompt_length = config.actor_rollout_ref.rollout.prompt_length - cls.response_length = config.actor_rollout_ref.rollout.response_length - cls.system_prompt = initialize_system_prompt(cls.tokenizer, **cls.apply_chat_template_kwargs) + self.apply_chat_template_kwargs = config.data.get("apply_chat_template_kwargs", {}) + self.prompt_length = config.actor_rollout_ref.rollout.prompt_length + self.response_length = config.actor_rollout_ref.rollout.response_length + self.system_prompt = initialize_system_prompt(self.tokenizer, **self.apply_chat_template_kwargs) # Initialize interactions from config file - cls.interaction_config_file = config.actor_rollout_ref.rollout.multi_turn.interaction_config_path - if cls.interaction_config_file: - cls.interaction_map: dict[str, BaseInteraction] = cls._initialize_interactions(cls.interaction_config_file) + self.interaction_config_file = config.actor_rollout_ref.rollout.multi_turn.interaction_config_path + if self.interaction_config_file: + 
self.interaction_map: dict[str, BaseInteraction] = self._initialize_interactions( + self.interaction_config_file + ) @rollout_trace_op async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput: @@ -271,7 +285,7 @@ async def _handle_processing_tools_state(self, agent_data: AgentData) -> AgentSt tasks = [] tool_call_names = [] for tool_call in agent_data.tool_calls[: self.max_parallel_calls]: - tasks.append(self._call_tool(tool_call, agent_data.tools_kwargs)) + tasks.append(self._call_tool(tool_call, agent_data.tools_kwargs, agent_data)) tool_call_names.append(tool_call.name) with simple_timer("tool_calls", agent_data.metrics): @@ -346,14 +360,7 @@ async def _handle_processing_tools_state(self, agent_data: AgentData) -> AgentSt else: if self.tool_parser_name == "gpt-oss": logger.info("manually format tool responses for gpt-oss") - # Format tool responses manually - tool_response_texts = [] - for i, tool_msg in enumerate(add_messages): - actual_tool_name = tool_call_names[i] - formatted = format_gpt_oss_tool_response_manually(tool_msg["content"], actual_tool_name) - tool_response_texts.append(formatted) - - tool_response_text = add_generation_prompt_for_gpt_oss("".join(tool_response_texts)) + tool_response_text = build_gpt_oss_tool_response_text(add_messages, tool_call_names) response_ids = await self.loop.run_in_executor( None, lambda: self.tokenizer.encode(tool_response_text, add_special_tokens=False) ) @@ -434,7 +441,7 @@ async def _handle_interacting_state(self, agent_data: AgentData) -> AgentState: return AgentState.GENERATING async def _call_tool( - self, tool_call: FunctionCall, tools_kwargs: dict[str, Any] + self, tool_call: FunctionCall, tools_kwargs: dict[str, Any], agent_data: AgentData ) -> tuple[ToolResponse, float, dict]: """Call tool and return tool response.""" tool, instance_id = None, None @@ -445,7 +452,9 @@ async def _call_tool( tool = self.tools[tool_name] kwargs = tools_kwargs.get(tool_name, {}) instance_id, _ = await 
tool.create(create_kwargs=kwargs.get("create_kwargs", {})) - tool_execution_response, tool_reward, res = await tool.execute(instance_id, tool_args) + tool_execution_response, tool_reward, res = await tool.execute( + instance_id, tool_args, agent_data=agent_data + ) except Exception as e: logger.warning(f"Error when executing tool: {e}") return ( @@ -481,8 +490,7 @@ async def _call_tool( return ToolResponse(**tool_response_kwargs), tool_reward, res - @classmethod - def _initialize_interactions(cls, interaction_config_file): + def _initialize_interactions(self, interaction_config_file): """Initialize interactions from configuration. Returns: dict[str, BaseInteraction]: A dictionary mapping interaction names to interaction instances. @@ -491,5 +499,4 @@ def _initialize_interactions(cls, interaction_config_file): return {} interaction_map = initialize_interactions_from_config(interaction_config_file) - logger.info(f"Initialize interactions from configuration: interaction_map: {list(interaction_map.keys())}") return interaction_map diff --git a/verl/experimental/agent_loop/utils.py b/verl/experimental/agent_loop/utils.py index 39ffbd0335a..68cb57d870f 100644 --- a/verl/experimental/agent_loop/utils.py +++ b/verl/experimental/agent_loop/utils.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os +from typing import Any def resolve_config_path(config_path: str) -> str: @@ -95,3 +96,13 @@ def add_generation_prompt_for_gpt_oss(message_content: str) -> str: Message content string with generation prompt """ return message_content + "<|start|>assistant" + + +def build_gpt_oss_tool_response_text(messages: list[dict[str, Any]], tool_call_names: list[str]) -> str: + """Build gpt-oss tool response text (manual formatting + generation prompt).""" + tool_response_texts: list[str] = [] + for i, tool_msg in enumerate(messages): + actual_tool_name = tool_call_names[i] + formatted = format_gpt_oss_tool_response_manually(tool_msg["content"], actual_tool_name) + tool_response_texts.append(formatted) + return add_generation_prompt_for_gpt_oss("".join(tool_response_texts)) diff --git a/verl/experimental/reward/__init__.py b/verl/experimental/reward/__init__.py index 1c8c72a423d..03807f0277b 100644 --- a/verl/experimental/reward/__init__.py +++ b/verl/experimental/reward/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .reward_manager import RewardLoopManager, RewardLoopWorker +from .reward_loop import RewardLoopManager, RewardLoopWorker from .reward_model import RewardModelManager __all__ = ["RewardModelManager", "RewardLoopWorker", "RewardLoopManager"] diff --git a/verl/experimental/reward/reward_manager.py b/verl/experimental/reward/reward_loop.py similarity index 94% rename from verl/experimental/reward/reward_manager.py rename to verl/experimental/reward/reward_loop.py index 52e7403ab6e..8e33ef86fec 100644 --- a/verl/experimental/reward/reward_manager.py +++ b/verl/experimental/reward/reward_loop.py @@ -30,7 +30,7 @@ from verl.utils import hf_tokenizer from verl.utils.fs import copy_to_local -from .reward_loop import get_reward_loop_manager_cls +from .reward_manager import get_reward_loop_manager_cls from .reward_model import RewardModelManager logger = logging.getLogger(__file__) @@ -136,6 +136,14 @@ async def _preprocess_reward_inputs(self, data: DataProto) -> str: add_generation_prompt=False, tokenize=False, ) + + # llama tokenizer will add bos token by default + # will be removed in vllm >= 0.11.2, where we can add "add_special_tokens" = False + if self.reward_model_tokenizer.bos_token is not None and rm_prompt.startswith( + self.reward_model_tokenizer.bos_token + ): + rm_prompt = rm_prompt[len(self.reward_model_tokenizer.bos_token) :] + return rm_prompt async def compute_score_disrm(self, data: DataProto) -> dict: @@ -148,7 +156,7 @@ async def compute_score_disrm(self, data: DataProto) -> dict: "model": model_name, "input": disrm_prompt, "activation": False, - "add_special_tokens": False, + # "add_special_tokens": False, # vllm >= 0.11.2 } output = await self._post_request(payloads, "classify") rm_score = output["data"][-1]["probs"][-1] @@ -187,7 +195,7 @@ def __init__(self, config: DictConfig, rm_resource_pool: RayResourcePool = None) def _init_reward_loop_workers(self): self.reward_loop_workers = [] - num_workers = self.config.reward_model.get("num_workers", 1) 
+ num_workers = self.config.reward_model.num_workers node_ids = [node["NodeID"] for node in ray.nodes() if node["Alive"] and node["Resources"].get("CPU", 0) > 0] for i in range(num_workers): diff --git a/verl/experimental/reward/reward_loop/__init__.py b/verl/experimental/reward/reward_manager/__init__.py similarity index 100% rename from verl/experimental/reward/reward_loop/__init__.py rename to verl/experimental/reward/reward_manager/__init__.py diff --git a/verl/experimental/reward/reward_loop/base.py b/verl/experimental/reward/reward_manager/base.py similarity index 100% rename from verl/experimental/reward/reward_loop/base.py rename to verl/experimental/reward/reward_manager/base.py diff --git a/verl/experimental/reward/reward_loop/dapo.py b/verl/experimental/reward/reward_manager/dapo.py similarity index 97% rename from verl/experimental/reward/reward_loop/dapo.py rename to verl/experimental/reward/reward_manager/dapo.py index 5bd032c0827..d9c9307c6d7 100644 --- a/verl/experimental/reward/reward_loop/dapo.py +++ b/verl/experimental/reward/reward_manager/dapo.py @@ -15,8 +15,8 @@ import inspect from verl import DataProto -from verl.experimental.reward.reward_loop import register -from verl.experimental.reward.reward_loop.base import RewardLoopManagerBase +from verl.experimental.reward.reward_manager import register +from verl.experimental.reward.reward_manager.base import RewardLoopManagerBase from verl.utils.reward_score import default_compute_score diff --git a/verl/experimental/reward/reward_loop/limited.py b/verl/experimental/reward/reward_manager/limited.py similarity index 99% rename from verl/experimental/reward/reward_loop/limited.py rename to verl/experimental/reward/reward_manager/limited.py index 71bdcf95bee..180896915ce 100644 --- a/verl/experimental/reward/reward_loop/limited.py +++ b/verl/experimental/reward/reward_manager/limited.py @@ -20,8 +20,8 @@ from transformers import AutoTokenizer from verl import DataProto -from 
verl.experimental.reward.reward_loop import register as register_loop -from verl.experimental.reward.reward_loop.base import RewardLoopManagerBase +from verl.experimental.reward.reward_manager import register as register_loop +from verl.experimental.reward.reward_manager.base import RewardLoopManagerBase from verl.utils.reward_score import default_compute_score from verl.workers.reward_manager import register as register_manager diff --git a/verl/experimental/reward/reward_loop/naive.py b/verl/experimental/reward/reward_manager/naive.py similarity index 96% rename from verl/experimental/reward/reward_loop/naive.py rename to verl/experimental/reward/reward_manager/naive.py index d607cef7ff3..cbe5c474bff 100644 --- a/verl/experimental/reward/reward_loop/naive.py +++ b/verl/experimental/reward/reward_manager/naive.py @@ -15,8 +15,8 @@ import inspect from verl import DataProto -from verl.experimental.reward.reward_loop import register -from verl.experimental.reward.reward_loop.base import RewardLoopManagerBase +from verl.experimental.reward.reward_manager import register +from verl.experimental.reward.reward_manager.base import RewardLoopManagerBase from verl.utils.reward_score import default_compute_score diff --git a/verl/experimental/reward/reward_loop/registry.py b/verl/experimental/reward/reward_manager/registry.py similarity index 96% rename from verl/experimental/reward/reward_loop/registry.py rename to verl/experimental/reward/reward_manager/registry.py index 099e5eb200d..f31a1762e05 100644 --- a/verl/experimental/reward/reward_loop/registry.py +++ b/verl/experimental/reward/reward_manager/registry.py @@ -14,7 +14,7 @@ from typing import Callable -from verl.experimental.reward.reward_loop.base import RewardLoopManagerBase +from verl.experimental.reward.reward_manager.base import RewardLoopManagerBase __all__ = ["register", "get_reward_loop_manager_cls"] diff --git a/verl/experimental/reward/reward_model.py b/verl/experimental/reward/reward_model.py index 
957ef05bf49..2bc05e1eea1 100644 --- a/verl/experimental/reward/reward_model.py +++ b/verl/experimental/reward/reward_model.py @@ -91,10 +91,13 @@ def _initialize_llm_servers(self): def _initialize_router(self): worker_urls = [f"http://{server_address}" for server_address in self.server_addresses] - if self.config.rollout.name == "sglang": - from .router.inner_sglang_router import launch_router_process - else: - from .router.naive_router import launch_router_process + # TODO (dyy): sglang router is not ready yet. + # if self.config.rollout.name == "sglang": + # from .router.inner_sglang_router import launch_router_process + # else: + # from .router.naive_router import launch_router_process + + from .router.naive_router import launch_router_process self.router_address, _ = launch_router_process(worker_urls=worker_urls) diff --git a/verl/models/llama/megatron/layers/parallel_rmsnorm.py b/verl/models/llama/megatron/layers/parallel_rmsnorm.py index bc2e9ae36f0..23a4a847ff8 100644 --- a/verl/models/llama/megatron/layers/parallel_rmsnorm.py +++ b/verl/models/llama/megatron/layers/parallel_rmsnorm.py @@ -15,7 +15,6 @@ import numbers import torch -from apex.normalization.fused_layer_norm import fused_rms_norm_affine from megatron.core import ModelParallelConfig from torch import nn from transformers import LlamaConfig @@ -39,6 +38,8 @@ def __init__(self, config: LlamaConfig, megatron_config: ModelParallelConfig): sp_utils.mark_parameter_as_sequence_parallel(self.weight) def forward(self, hidden_states): + from apex.normalization.fused_layer_norm import fused_rms_norm_affine + return fused_rms_norm_affine( input=hidden_states, weight=self.weight, diff --git a/verl/models/mcore/model_forward.py b/verl/models/mcore/model_forward.py index 3a9d6bb4aba..5b1d4dc4391 100644 --- a/verl/models/mcore/model_forward.py +++ b/verl/models/mcore/model_forward.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import torch from verl.utils.megatron_utils import unwrap_model @@ -152,6 +153,8 @@ def gptmodel_forward_no_padding( logits_processor=None, logits_processor_args: dict = None, value_model=False, + vision_model=False, + pad_token_id=None, data_format: str = "thd", ): """Default forward pass for GPT models with optional sequence packing.""" @@ -174,9 +177,19 @@ def gptmodel_forward_no_padding( if data_format == "thd": input_ids_rmpad, packed_seq_params = preprocess_thd_no_padding(input_ids, pre_process=pre_process) input_ids_rmpad = input_ids_rmpad.contiguous() + + # For VLM model, need to pass bshd format `input_ids` and `attention_mask`. + attention_mask = None + if vision_model: + input_ids_rmpad = input_ids.to_padded_tensor(pad_token_id) + seqlens_in_batch = input_ids.offsets().diff() + attention_mask = torch.zeros_like(input_ids_rmpad, dtype=torch.bool) + for i, seqlen in enumerate(seqlens_in_batch): + attention_mask[i, :seqlen] = True + output_orig = model( input_ids=input_ids_rmpad, - attention_mask=None, + attention_mask=attention_mask, position_ids=None, packed_seq_params=packed_seq_params, **model_kwargs, diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py index d8c7b2cfa86..fd0f0ce1a5f 100644 --- a/verl/models/mcore/registry.py +++ b/verl/models/mcore/registry.py @@ -22,6 +22,54 @@ import torch import torch.nn as nn +from .model_forward import gptmodel_forward_no_padding, model_forward_gen +from .model_forward_fused import fused_forward_model_gen + + +class SupportedVLM(Enum): + QWEN2_5_VL = "Qwen2_5_VLForConditionalGeneration" + QWEN3_MOE_VL = "Qwen3VLMoeForConditionalGeneration" + QWEN3_VL = "Qwen3VLForConditionalGeneration" + + +def get_mcore_forward_fn(hf_config) -> Callable: + """ + Get the forward function for given model architecture. 
+ """ + assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" + if hf_config.architectures[0] in SupportedVLM: + return model_forward_gen(True) + else: + # default to language model + return model_forward_gen(False) + + +def get_mcore_forward_no_padding_fn(hf_config) -> Callable: + """ + Get the forward function for given model architecture. + """ + assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" + return gptmodel_forward_no_padding + + +def get_mcore_forward_fused_fn(hf_config) -> Callable: + """ + Get the forward function for given model architecture. + """ + assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" + if hf_config.architectures[0] in SupportedVLM: + return fused_forward_model_gen(True) + else: + # default to language model + return fused_forward_model_gen(False) + + +# ruff: noqa + +######################################################## +# below is the deprecated code +######################################################## + from .config_converter import ( PretrainedConfig, TransformerConfig, @@ -33,8 +81,6 @@ hf_to_mcore_config_qwen2moe, hf_to_mcore_config_qwen3moe, ) -from .model_forward import gptmodel_forward_no_padding, model_forward_gen -from .model_forward_fused import fused_forward_model_gen from .model_initializer import ( BaseModelInitializer, DeepseekV3Model, @@ -67,6 +113,7 @@ class SupportedModel(Enum): GLM4_MOE = "Glm4MoeForCausalLM" QWEN3_TOKEN_CLASSIFICATION = "Qwen3ForTokenClassification" + LLAMA_TOKEN_CLASSIFICATION = "LlamaForTokenClassification" QWEN3_MOE_VL = "Qwen3VLMoeForConditionalGeneration" QWEN3_VL = "Qwen3VLForConditionalGeneration" GPT_OSS = "GptOssForCausalLM" @@ -84,6 +131,7 @@ class SupportedModel(Enum): SupportedModel.QWEN3: hf_to_mcore_config_dense, SupportedModel.QWEN3_MOE: hf_to_mcore_config_qwen3moe, SupportedModel.QWEN3_TOKEN_CLASSIFICATION: hf_to_mcore_config_dense, + 
SupportedModel.LLAMA_TOKEN_CLASSIFICATION: hf_to_mcore_config_dense, } # Registry for model initializers @@ -98,6 +146,7 @@ class SupportedModel(Enum): SupportedModel.QWEN3: DenseModel, SupportedModel.QWEN3_MOE: Qwen3MoEModel, SupportedModel.QWEN3_TOKEN_CLASSIFICATION: DenseModel, + SupportedModel.LLAMA_TOKEN_CLASSIFICATION: DenseModel, } # Registry for model forward functions @@ -113,9 +162,9 @@ class SupportedModel(Enum): SupportedModel.QWEN2_5_VL: model_forward_gen(True), SupportedModel.QWEN3_MOE_VL: model_forward_gen(True), SupportedModel.QWEN3_VL: model_forward_gen(True), - SupportedModel.DEEPSEEK_V3: model_forward_gen(), SupportedModel.GLM4_MOE: model_forward_gen(), SupportedModel.QWEN3_TOKEN_CLASSIFICATION: model_forward_gen(), + SupportedModel.LLAMA_TOKEN_CLASSIFICATION: model_forward_gen(), SupportedModel.GPT_OSS: model_forward_gen(), } @@ -132,9 +181,9 @@ class SupportedModel(Enum): SupportedModel.LLAMA4: gptmodel_forward_no_padding, SupportedModel.QWEN3: gptmodel_forward_no_padding, SupportedModel.QWEN3_MOE: gptmodel_forward_no_padding, - SupportedModel.DEEPSEEK_V3: gptmodel_forward_no_padding, SupportedModel.GLM4_MOE: gptmodel_forward_no_padding, SupportedModel.QWEN3_TOKEN_CLASSIFICATION: gptmodel_forward_no_padding, + SupportedModel.LLAMA_TOKEN_CLASSIFICATION: gptmodel_forward_no_padding, SupportedModel.GPT_OSS: gptmodel_forward_no_padding, } @@ -144,7 +193,6 @@ class SupportedModel(Enum): SupportedModel.QWEN2: fused_forward_model_gen(), SupportedModel.QWEN2_MOE: fused_forward_model_gen(), SupportedModel.MIXTRAL: fused_forward_model_gen(), - SupportedModel.DEEPSEEK_V3: fused_forward_model_gen(), SupportedModel.QWEN2_5_VL: fused_forward_model_gen(True), SupportedModel.QWEN3_MOE_VL: fused_forward_model_gen(True), SupportedModel.QWEN3_VL: fused_forward_model_gen(True), @@ -167,6 +215,7 @@ class SupportedModel(Enum): SupportedModel.QWEN3_MOE: McoreToHFWeightConverterQwen3Moe, SupportedModel.QWEN2_5_VL: McoreToHFWeightConverterQwen2_5_VL, 
SupportedModel.QWEN3_TOKEN_CLASSIFICATION: McoreToHFWeightConverterDense, + SupportedModel.LLAMA_TOKEN_CLASSIFICATION: McoreToHFWeightConverterDense, } @@ -236,33 +285,6 @@ def init_mcore_model( ) -def get_mcore_forward_fn(hf_config: PretrainedConfig) -> Callable: - """ - Get the forward function for given model architecture. - """ - assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" - model = get_supported_model(hf_config.architectures[0]) - return MODEL_FORWARD_REGISTRY[model] - - -def get_mcore_forward_no_padding_fn(hf_config: PretrainedConfig) -> Callable: - """ - Get the forward function for given model architecture. - """ - assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" - model = get_supported_model(hf_config.architectures[0]) - return MODEL_FORWARD_NOPAD_REGISTRY[model] - - -def get_mcore_forward_fused_fn(hf_config: PretrainedConfig) -> Callable: - """ - Get the forward function for given model architecture. - """ - assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" - model = get_supported_model(hf_config.architectures[0]) - return MODEL_FORWARD_FUSED_REGISTRY[model] - - def get_mcore_weight_converter(hf_config: PretrainedConfig, dtype: torch.dtype) -> Callable: """ Get the weight converter for given model architecture. diff --git a/verl/models/qwen2/megatron/modeling_qwen2_megatron.py b/verl/models/qwen2/megatron/modeling_qwen2_megatron.py index c536a0fba27..b3512f8afa5 100644 --- a/verl/models/qwen2/megatron/modeling_qwen2_megatron.py +++ b/verl/models/qwen2/megatron/modeling_qwen2_megatron.py @@ -583,7 +583,7 @@ def _init_head(self, config): def setup_embeddings_and_output_layer(self) -> None: """Sets up embedding layer in first stage and output layer in last stage. 
- This function initalizes word embeddings in the final stage when we are + This function initializes word embeddings in the final stage when we are using pipeline parallelism and sharing word embeddings, and sets up param attributes on the embedding and output layers. """ diff --git a/verl/models/transformers/monkey_patch.py b/verl/models/transformers/monkey_patch.py index 59b342f879f..c4ec20c1362 100644 --- a/verl/models/transformers/monkey_patch.py +++ b/verl/models/transformers/monkey_patch.py @@ -356,7 +356,11 @@ def state_dict(self, *args, **kwargs): Qwen3VLMoeTextModel, ) - from verl.models.transformers.qwen3_vl import forward_with_normal_backend, qwen3_vl_base_forward + from verl.models.transformers.qwen3_vl import ( + forward_with_normal_backend, + patch_qwen3_vl_moe_sparse_moe_block_forward, + qwen3_vl_base_forward, + ) Qwen3VLModel.forward = qwen3_vl_base_forward Qwen3VLMoeModel.forward = qwen3_vl_base_forward @@ -364,6 +368,10 @@ def state_dict(self, *args, **kwargs): Qwen3VLMoeForConditionalGeneration.forward = forward_with_normal_backend print(f"Monkey patch {model.__class__.__name__} model forward") + # Step 1.5: patch Qwen3VLMoeTextSparseMoeBlock to fix transformers 4.57.3 bug + if model.config.model_type == "qwen3_vl_moe" and is_transformers_version_in_range(max_version="4.57.3"): + patch_qwen3_vl_moe_sparse_moe_block_forward() + # Step 2: patch input for multimodal sequence parallelism if ulysses_sp_size > 1: patch_vlm_for_ulysses_input_slicing(Qwen3VLTextModel) diff --git a/verl/models/transformers/qwen3_vl.py b/verl/models/transformers/qwen3_vl.py index d0512172f46..38aa9cbfd4b 100644 --- a/verl/models/transformers/qwen3_vl.py +++ b/verl/models/transformers/qwen3_vl.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import functools import logging import os from dataclasses import dataclass @@ -334,3 +335,41 @@ def forward_with_triton_backend( entropy=entropy, hidden_states=outputs.hidden_states, ) + + +def patch_qwen3_vl_moe_sparse_moe_block_forward(): + """ + Monkey patch to fix a bug in transformers 4.57.3 where Qwen3VLMoeTextSparseMoeBlock.forward + incorrectly uses torch.zeros_like(hidden_states) instead of torch.zeros_like(router_logits) + when creating router_weights (line 148 in modeling_qwen3_vl_moe.py). + + This is a minimal fix that only changes the problematic line while keeping the rest of the + original implementation intact. + """ + try: + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextSparseMoeBlock + except ImportError: + # Model not available, skip patching + return + + # Store the original forward method for reference + original_forward = Qwen3VLMoeTextSparseMoeBlock.forward + + @functools.wraps(original_forward) + def patched_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size = hidden_states.shape[0] + hidden_states = hidden_states.reshape(-1, self.hidden_size) + router_logits = self.gate(hidden_states) + routing_weights = torch.nn.functional.softmax(router_logits, dim=-1, dtype=torch.float) + routing_weights, router_indices = torch.topk(routing_weights, self.top_k, dim=-1) + routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True) + # BUG FIX: Original code incorrectly uses hidden_states here, should use router_logits + routing_weights = routing_weights.to(router_logits.dtype) + router_weights = torch.zeros_like(router_logits).scatter_(1, router_indices, routing_weights) + hidden_states = hidden_states.reshape(batch_size, -1, self.hidden_size) + routed_out = self.experts(hidden_states, router_weights, router_indices) + return routed_out + + # Apply the patch + Qwen3VLMoeTextSparseMoeBlock.forward = patched_forward + logger.info("Monkey patched Qwen3VLMoeTextSparseMoeBlock.forward 
to fix router_weights bug") diff --git a/verl/models/weight_loader_registry.py b/verl/models/weight_loader_registry.py index 0904f14fad4..ee60ea71f0e 100644 --- a/verl/models/weight_loader_registry.py +++ b/verl/models/weight_loader_registry.py @@ -48,6 +48,7 @@ def get_weight_saver(arch: str): "Qwen3ForCausalLM": merge_megatron_ckpt_gptmodel, "Qwen3ForTokenClassification": merge_megatron_ckpt_gptmodel, "Qwen3MoeForCausalLM": merge_megatron_ckpt_gptmodel_qwen_moe, + "LlamaForTokenClassification": merge_megatron_ckpt_gptmodel, } if arch in _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY: return _MODEL_WEIGHT_MEGATRON_SAVER_REGISTRY[arch] diff --git a/verl/single_controller/base/decorator.py b/verl/single_controller/base/decorator.py index 1fa0496eaaa..cfcd793045d 100644 --- a/verl/single_controller/base/decorator.py +++ b/verl/single_controller/base/decorator.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import inspect from functools import partial, wraps from types import FunctionType @@ -20,7 +19,7 @@ from verl.protocol import DataProtoFuture, _padding_size_key from verl.utils.py_functional import DynamicEnum -from verl.utils.tensordict_utils import concat_tensordict +from verl.utils.tensordict_utils import chunk_tensordict, concat_tensordict from verl.utils.transferqueue_utils import BatchMeta # here we add a magic number of avoid user-defined function already have this attribute @@ -78,14 +77,20 @@ def _split_args_kwargs_data_proto(chunks, *args, **kwargs): splitted_args = [] for arg in args: assert isinstance(arg, DataProto | DataProtoFuture | BatchMeta | TensorDict) - chunked_arg = arg.chunk(chunks=chunks) + if isinstance(arg, TensorDict): + chunked_arg = chunk_tensordict(arg, chunks) + else: + chunked_arg = arg.chunk(chunks=chunks) assert len(chunked_arg) == chunks splitted_args.append(chunked_arg) splitted_kwargs = {} for key, val in kwargs.items(): assert isinstance(val, DataProto | DataProtoFuture | BatchMeta | TensorDict) - chunked_kwarg = val.chunk(chunks=chunks) + if isinstance(val, TensorDict): + chunked_kwarg = chunk_tensordict(val, chunks) + else: + chunked_kwarg = val.chunk(chunks=chunks) assert len(chunked_kwarg) == chunks splitted_kwargs[key] = chunked_kwarg diff --git a/verl/single_controller/ray/base.py b/verl/single_controller/ray/base.py index feb73a5430e..f1bdb553d5f 100644 --- a/verl/single_controller/ray/base.py +++ b/verl/single_controller/ray/base.py @@ -220,13 +220,15 @@ def split_resource_pool( else: start_bundle_idx_list = np.cumsum([0] + split_size_list[:-1]) + # ensure resource_pool.pgs has been initialized + placement_groups = resource_pool.get_placement_groups() split_resource_pools = [ SubRayResourcePool( process_on_nodes=resource_pool.store, use_gpu=resource_pool.use_gpu, name_prefix=f"{resource_pool.name_prefix}_split_{split_idx}", max_colocate_count=resource_pool.max_colocate_count, - placement_groups=resource_pool.pgs, + 
placement_groups=placement_groups, start_bundle_index=start_bundle_idx_list[split_idx], subgroup_world_size=split_size_list[split_idx], ) diff --git a/verl/third_party/sglang/__init__.py b/verl/third_party/sglang/__init__.py deleted file mode 100644 index 15593caaf36..00000000000 --- a/verl/third_party/sglang/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2023-2024 SGLang Team -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# Copyright 2024 Bytedance Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/verl/third_party/sglang/parallel_state.py b/verl/third_party/sglang/parallel_state.py deleted file mode 100644 index cdec743d13f..00000000000 --- a/verl/third_party/sglang/parallel_state.py +++ /dev/null @@ -1,328 +0,0 @@ -# Copyright 2024 Bytedance Ltd. and/or its affiliates -# Copyright 2023 The SGlang team. 
-# Adapted from -# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -"""Model and data parallel groups.""" - -import os -from typing import Optional - -import sglang.srt.distributed.parallel_state as ps -import torch -import torch.distributed -from sglang.srt.distributed.parallel_state import ( - get_pp_group, - get_world_group, - init_distributed_environment, - init_model_parallel_group, -) - -""" -This version is strongly tied with Megatron to implement HybridEngine and weight sharing between vllm and Megatron. -- We assume the Megatron tp+dp+pp world is already established before calling this function. - -""" - -# Device mesh for using DTensor -_DEVICE_MESH = None - -# Tensor model parallel group that the current rank belongs to. -_TP = None -# Pipeline model parallel group that the current rank belongs to. -_PP = None - - -# This method is for initializing the ParallelGroup when using HybridEngine -# NOTE(linjunrong): this function is for megatron -def initialize_parallel_state( - distributed_init_method: str = "env://", - backend: str = "nccl", - tensor_model_parallel_size: int = 1, - num_tp_per_train_tp: int = 1, - pipeline_model_parallel_size: int = 1, -): - # torch.distributed.all_reduce does not free the input tensor until - # the synchronization point. This causes the memory usage to grow - # as the number of all_reduce calls increases. This env var disables - # this behavior. - # Related issue: - # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573 - os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" - - # NOTE(sgm): Modify for verl, Env vars will be set by TORCHRUN. 
- rank = int(os.getenv("RANK", "-1")) - local_rank = int(os.getenv("LOCAL_RANK", "0")) - - # Use the world_size set by TORCHRUN - world_size = int(os.getenv("WORLD_SIZE", "-1")) - assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN" - init_distributed_environment(world_size, rank, distributed_init_method, local_rank, backend) - if torch.distributed.get_world_size() > 1: - # NOTE: build a separate inference group with infer tp & micro dp - initialize_model_parallel_for_sglang( - tensor_model_parallel_size=tensor_model_parallel_size, - num_tensor_model_parallel_groups_per_train_tp=num_tp_per_train_tp, - ) - else: - initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend) - - -# NOTE(linjunrong): After init SGLang rollout using class EngineFragment, user should always remember to call -# this function to sync the _TP, _PP define at the beginning of this file. Otherwise, only the conterparts -# inside sglang.srt.distributed are init as ProcessGroup, the symbols defined in this file remain as None. -# It could be weird to maintain two _TP and _PP, I follow the same way to maintain an extra ones for -# verl itself as how it was done in verl.third_party.vllm.parallel_state. Note that the process is a little -# bit different -def ensure_model_parallel_initialized( - tensor_model_parallel_size: int, - pipeline_model_parallel_size: int = 1, - backend: Optional[str] = None, -) -> None: - """Helper to initialize model parallel groups if they are not initialized, - or ensure tensor-parallel and pipeline-parallel sizes are equal to expected - values if the model parallel groups are initialized. 
- """ - # get the backend of _DEVICE_WORLD_GROUP - backend = backend or torch.distributed.get_backend(get_world_group().device_group) - if not model_parallel_is_initialized(): - initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend) - return - - assert get_tensor_model_parallel_world_size() == tensor_model_parallel_size, ( - f"tensor parallel group already initialized, but of unexpected size: " - f"{get_tensor_model_parallel_world_size()=} vs. {tensor_model_parallel_size=}" - ) - pp_world_size = get_pp_group().world_size - assert pp_world_size == pipeline_model_parallel_size, ( - f"pipeline parallel group already initialized, but of unexpected size: {pp_world_size=} vs. " - f"{pipeline_model_parallel_size=}" - ) - - -# TODO(sgm): deviate from the v0.5.4, not pp now -# NOTE(linjunrong): the SGLang version using _TP instead of ps._TP -def model_parallel_is_initialized(): - """Check if tensor and pipeline parallel groups are initialized.""" - return _TP is not None - # and _PIPELINE_MODEL_PARALLEL_GROUP is not None) - - -def initialize_model_parallel_for_sglang( - tensor_model_parallel_size: int, - num_tensor_model_parallel_groups_per_train_tp: int = 1, - pipeline_model_parallel_size: int = 1, -) -> None: - pass - - # Get world size and rank. Ensure some consistencies. - assert torch.distributed.is_initialized() - - assert isinstance(tensor_model_parallel_size, int) - - # assert num_tensor_model_parallel_groups_per_train_tp == 1 and not different_tp_group - # assert num_tensor_model_parallel_groups_per_train_tp > 1 and different_tp_group - - # Build the tensor model-parallel groups. 
- assert ps._TP is None, "tensor model parallel group is already initialized" - - global _TP - - world_size: int = torch.distributed.get_world_size() - - backend = torch.distributed.get_backend() - - num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size - - if num_tensor_model_parallel_groups_per_train_tp == 1: - # if tensor_model_parallel_size == train_tensor_parallel_size: - # using the same tp group as Megatron/vllm - assert _TP is None, "tensor model parallel group is already initialized" - group_ranks = [] - for i in range(num_tensor_model_parallel_groups): - ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size) - group_ranks.append(ranks) - _TP = init_model_parallel_group( - group_ranks=group_ranks, - local_rank=get_world_group().local_rank, - backend=backend, - use_custom_allreduce=False, # TODO: check why True is not work in Ray trainer - use_message_queue_broadcaster=True, - ) - ps._TP = _TP - # _MICRO_DATA_PARALLEL_GROUP is move to hybrid engine - else: - # initialize a micro_dp group and a tp group - # assume training tp=4, infer tp=2, then, weight is partitioned as - # [1], [2], [3], [4] for training and [1,2], [1,2], [3,4], [3,4] for inference - - # Build the inference tp groups - # train_tp = train_tensor_parallel_size - train_tp = num_tensor_model_parallel_groups_per_train_tp * tensor_model_parallel_size - # num_tensor_model_parallel_groups_per_train_tp = train_tp // tensor_model_parallel_size - assert _TP is None, "tensor model parallel group is already initialized" - group_ranks = [] - for i in range(num_tensor_model_parallel_groups // num_tensor_model_parallel_groups_per_train_tp): - start = train_tp * i - end = train_tp * (i + 1) - for j in range(num_tensor_model_parallel_groups_per_train_tp): - ranks = list(range(start, end, num_tensor_model_parallel_groups_per_train_tp)) - for i in range(len(ranks)): - ranks[i] += j - group_ranks.append(ranks) - _TP = init_model_parallel_group( - 
group_ranks=group_ranks, - local_rank=get_world_group().local_rank, - backend=backend, - use_custom_allreduce=False, # TODO: check why True is not work in Ray trainer - use_message_queue_broadcaster=True, - ) - ps._TP = _TP - - # Build the pipeline model-parallel groups. - # global _PIPELINE_MODEL_PARALLEL_GROUP - # global _PIPELINE_GLOBAL_RANKS - # assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, ("pipeline model parallel group is already initialized") - - # ps._PIPELINE_MODEL_PARALLEL_GROUP = mpu.get_pipeline_model_parallel_group() - # ps._PIPELINE_GLOBAL_RANKS = mpu.get_pipeline_model_parallel_ranks() - - # TODO: init using device mesh (not support hybrid engine now) - # Build the pipeline model-parallel groups. - num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size - global _PP - assert _PP is None, "pipeline model parallel group is already initialized" - group_ranks = [] - for i in range(num_pipeline_model_parallel_groups): - ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) - group_ranks.append(ranks) - # pipeline parallel does not need custom allreduce - _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, use_custom_allreduce=False) - ps._PP = _PP # for verl - - -def initialize_model_parallel( - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 1, - backend: Optional[str] = None, -) -> None: - """ - NOTE: This method is a hack from the open-sourced version without - asertion of world_size = tp * pp - - Initialize model parallel groups. - - Arguments: - tensor_model_parallel_size: number of GPUs used for tensor model - parallelism. - pipeline_model_parallel_size: number of GPUs used for pipeline model - parallelism. - - Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we - use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize - the model pipeline. 
The present function will - create 4 tensor model-parallel groups and 2 pipeline model-parallel groups: - 4 tensor model-parallel groups: - [g0, g1], [g2, g3], [g4, g5], [g6, g7] - 2 pipeline model-parallel groups: - [g0, g2, g4, g6], [g1, g3, g5, g7] - Note that for efficiency, the caller should make sure adjacent ranks - are on the same DGX box. For example if we are using 2 DGX-1 boxes - with a total of 16 GPUs, rank 0 to 7 belong to the first box and - ranks 8 to 15 belong to the second box. - """ - # Get world size and rank. Ensure some consistencies. - assert torch.distributed.is_initialized() - world_size: int = torch.distributed.get_world_size() - backend = backend or torch.distributed.get_backend(ps.get_world_group().device_group) - - # NOTE(sgm) we don't assert world_size == tp * pp - # DP is not managed by vllm but by the VeRL WorkerGroup - # if (world_size != - # tensor_model_parallel_size * pipeline_model_parallel_size): - # raise RuntimeError( - # f"world_size ({world_size}) is not equal to " - # f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " - # f"pipeline_model_parallel_size ({pipeline_model_parallel_size})") - - num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size - - global _TP - assert _TP is None, "tensor model parallel group is already initialized" - group_ranks = [] - for i in range(num_tensor_model_parallel_groups): - ranks = list(range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)) - group_ranks.append(ranks) - - # message queue broadcaster is only used in tensor model parallel group - if ps._TP is not None: - _TP = ps._TP - else: - _TP = init_model_parallel_group( - group_ranks, - get_world_group().local_rank, - backend, - use_custom_allreduce=False, # TODO: check why True is not work in Ray trainer - use_message_queue_broadcaster=True, - ) - ps._TP = _TP - - # TODO: init using device mesh (not support hybrid engine now) - # Build the pipeline model-parallel groups. 
- num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size - global _PP - assert _PP is None, "pipeline model parallel group is already initialized" - group_ranks = [] - for i in range(num_pipeline_model_parallel_groups): - ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) - group_ranks.append(ranks) - # pipeline parallel does not need custom allreduce - if ps._TP is not None: - _PP = ps._TP - else: - _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, use_custom_allreduce=False) - ps._PP = _PP - - -""" -Device mesh utilities -""" - - -def get_device_mesh(): - assert _DEVICE_MESH is not None, "device mesh is not initialized" - return _DEVICE_MESH - - -""" -Tensor model parallel utilities -""" - - -# NOTE(linjunrong): In the vllm version parallel_state.py. verl created its own _TP and _PP as verl want to use -# the process group for some extra purpose. Under the hood, there is no difference between them and the original -# one in vllm.distributed.parallel_state. However, the implementation need to hack the init process of inference -# engine, as we do not maintain another SGLang here, I just use the original _TP and _PP directly. 
-def get_tensor_model_parallel_group(): - """Get the tensor model parallel group the caller rank belongs to.""" - - assert _TP is not None, "tensor model parallel group is not initialized" - return _TP.device_group - - -def get_tensor_model_parallel_world_size(): - """Return world size for the tensor model parallel group.""" - return torch.distributed.get_world_size(group=get_tensor_model_parallel_group()) - - -def get_tensor_model_parallel_rank(): - """Return my rank for the tensor model parallel group.""" - return torch.distributed.get_rank(group=get_tensor_model_parallel_group()) - - -def get_tensor_model_parallel_src_rank(): - """Calculate the global rank corresponding to the first local rank - in the tensor model parallel group.""" - global_rank = torch.distributed.get_rank() - local_world_size = get_tensor_model_parallel_world_size() - return (global_rank // local_world_size) * local_world_size diff --git a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml index b40d462d4f6..a117c0f332f 100644 --- a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml @@ -52,7 +52,7 @@ actor_rollout_ref: recompute_num_layers: null attention_backend: flash override_mcore_model_config: {} - use_mbridge: false + use_mbridge: true vanilla_mbridge: true use_remove_padding: true forward_only: false @@ -88,6 +88,7 @@ actor_rollout_ref: kl_loss_type: low_var_kl ppo_epochs: 1 shuffle: false + data_loader_seed: 42 checkpoint: _target_: verl.trainer.config.CheckpointConfig save_contents: @@ -127,7 +128,6 @@ actor_rollout_ref: mode: disabled record_file: null replay_file: null - data_loader_seed: 42 load_weight: true ref: rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} @@ -433,7 +433,7 @@ critic: recompute_num_layers: null attention_backend: flash override_mcore_model_config: {} - use_mbridge: false + use_mbridge: true vanilla_mbridge: true 
use_remove_padding: true forward_only: false @@ -480,6 +480,7 @@ critic: forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: ${oc.select:actor_rollout_ref.actor.data_loader_seed,null} cliprange_value: 0.5 loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} checkpoint: @@ -517,11 +518,10 @@ critic: stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} nccl_timeout: 600 load_weight: true - data_loader_seed: ${oc.select:actor_rollout_ref.actor.data_loader_seed,null} reward_model: enable: false enable_resource_pool: false - n_gpus_per_node: 0 + n_gpus_per_node: 8 nnodes: 0 strategy: megatron model: @@ -571,6 +571,31 @@ reward_model: use_remove_padding: ${oc.select:actor_rollout_ref.actor.megatron.use_remove_padding,True} dtype: bfloat16 load_weight: true + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 algorithm: rollout_correction: rollout_is: null @@ -580,7 +605,7 @@ algorithm: rollout_rs_threshold_lower: null rollout_token_veto_threshold: null bypass_mode: false - use_policy_gradient: false + loss_type: ppo_clip rollout_is_batch_normalize: false _target_: verl.trainer.config.AlgoConfig gamma: 1.0 diff --git a/verl/trainer/config/_generated_ppo_trainer.yaml b/verl/trainer/config/_generated_ppo_trainer.yaml index d37965dbc50..833ebb70d5b 100644 --- a/verl/trainer/config/_generated_ppo_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_trainer.yaml @@ -75,6 +75,7 @@ actor_rollout_ref: kl_loss_type: low_var_kl ppo_epochs: 1 shuffle: false + data_loader_seed: 42 checkpoint: _target_: verl.trainer.config.CheckpointConfig save_contents: @@ -287,7 +288,7 @@ actor_rollout_ref: override_config: {} enable_gradient_checkpointing: true enable_activation_offload: false - use_remove_padding: false + use_remove_padding: true lora_rank: 0 lora_alpha: 16 target_modules: all-linear @@ -413,6 +414,7 @@ critic: forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 cliprange_value: 0.5 loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} checkpoint: @@ -455,7 +457,7 @@ critic: reward_model: enable: false enable_resource_pool: false - n_gpus_per_node: 0 + n_gpus_per_node: 8 nnodes: 0 strategy: fsdp model: @@ 
-495,6 +497,31 @@ reward_model: save_path: ${oc.select:global_profiler.save_path,null} tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 algorithm: rollout_correction: rollout_is: null @@ -504,7 +531,7 @@ algorithm: rollout_rs_threshold_lower: null rollout_token_veto_threshold: null bypass_mode: false - use_policy_gradient: false + loss_type: ppo_clip rollout_is_batch_normalize: false _target_: verl.trainer.config.AlgoConfig gamma: 1.0 diff --git a/verl/trainer/config/actor/actor.yaml b/verl/trainer/config/actor/actor.yaml index f5f1d15eee5..283095a1527 100644 --- a/verl/trainer/config/actor/actor.yaml +++ b/verl/trainer/config/actor/actor.yaml @@ -103,6 +103,9 @@ ppo_epochs: 1 # Shuffle training data across PPO epochs shuffle: false +# The seed used to construct mini-batch +data_loader_seed: 42 + # checkpoint configs checkpoint: diff --git a/verl/trainer/config/actor/megatron_actor.yaml b/verl/trainer/config/actor/megatron_actor.yaml index a632fe4380b..fde70c363c4 100644 --- a/verl/trainer/config/actor/megatron_actor.yaml +++ b/verl/trainer/config/actor/megatron_actor.yaml @@ -15,6 +15,4 @@ _target_: verl.workers.config.McoreActorConfig strategy: megatron -data_loader_seed: 42 - load_weight: True diff --git a/verl/trainer/config/algorithm.py b/verl/trainer/config/algorithm.py index 
a7c86da0297..a40973e669f 100644 --- a/verl/trainer/config/algorithm.py +++ b/verl/trainer/config/algorithm.py @@ -114,14 +114,17 @@ class RolloutCorrectionConfig(BaseConfig): bypass_mode (bool): Operating mode - bypass or decoupled. - True: Bypass mode - reuse rollout_log_prob as old_log_prob (2 policies) + Uses compute_policy_loss_bypass_mode() with loss_type selection - False: Decoupled mode - compute old_log_prob separately (3 policies) + Uses standard PPO loss with IS weight correction Default: False (decoupled mode) - use_policy_gradient (bool): Loss function type. - - Requires bypass_mode=True - - True: Policy gradient loss (no PPO clipping) - - False: PPO loss (with clipping) - Default: False (PPO loss) + loss_type (str): Loss function type in bypass mode (bypass_mode=True). + - "reinforce": REINFORCE-style policy gradient with explicit IS weights + L = -E[w * log π(a|s) * A] where w = π_current / π_rollout + - "ppo_clip": PPO clipped objective (IS handled by ratio, no explicit weights) + L = -E[min(r*A, clip(r)*A)] where r = π_current / π_rollout + Default: "ppo_clip" rollout_is_batch_normalize (bool): Apply batch normalization to IS weights. 
- True: Normalize IS weights to have mean=1.0 within each batch @@ -142,15 +145,15 @@ class RolloutCorrectionConfig(BaseConfig): config = RolloutCorrectionConfig.decoupled_geo_rs() # Geo-RS config = RolloutCorrectionConfig.geo_rs_seq_tis() # Geo-RS-Seq-TIS - # Bypass PPO mode (2 policies: π_rollout = π_old, π_θ) - # No IS correction needed since π_old = π_rollout - config = RolloutCorrectionConfig.ppo_is_bypass() # PPO with rollout as anchor - - # Bypass PG mode presets (2 policies, no PPO clipping) - # IS weights computed on-the-fly as π_θ / π_rollout - config = RolloutCorrectionConfig.pg_is() # Seq-TIS + PG - config = RolloutCorrectionConfig.pg_rs() # Geo-RS + PG - config = RolloutCorrectionConfig.pg_geo_rs_seq_tis() # Geo-RS-Seq-TIS + PG + # Bypass mode presets (2 policies: π_rollout = π_old, π_θ) + # loss_type controls the loss function + # PPO-clip presets (ratio handles IS, so no separate IS weights needed): + config = RolloutCorrectionConfig.bypass_ppo_clip() # PPO-clip only + config = RolloutCorrectionConfig.bypass_ppo_clip_geo_rs() # PPO-clip + Geo-RS + # REINFORCE presets (explicit IS weights): + config = RolloutCorrectionConfig.bypass_pg_is() # REINFORCE + Seq-TIS + config = RolloutCorrectionConfig.bypass_pg_rs() # REINFORCE + Geo-RS + config = RolloutCorrectionConfig.bypass_pg_geo_rs_seq_tis() # REINFORCE + Geo-RS + Seq-TIS Reference: Liu, Li, Fu, Wang, Liu, Shen (2025) @@ -165,7 +168,7 @@ class RolloutCorrectionConfig(BaseConfig): rollout_rs_threshold_lower: Optional[float] = None rollout_token_veto_threshold: Optional[float] = None bypass_mode: bool = False - use_policy_gradient: bool = False + loss_type: str = "ppo_clip" rollout_is_batch_normalize: bool = False @classmethod @@ -256,57 +259,89 @@ def decoupled_geo_rs( ) @classmethod - def ppo_is_bypass(cls, threshold: float = 2.0) -> "RolloutCorrectionConfig": - """PPO with IS Correction in Bypass Mode. + def bypass_ppo_clip(cls) -> "RolloutCorrectionConfig": + """Bypass mode with PPO-clip loss. 
- Skips old_log_prob computation by reusing rollout_log_prob. - PPO clips against rollout policy instead of true old policy. + PPO clipped objective in bypass mode. The PPO ratio = π_θ/π_rollout + already handles IS correction, so no explicit IS weights are applied. - Args: - threshold (float): Upper threshold for IS weights. Default: 2.0 + Skips old_log_prob computation for faster execution (2 policies instead of 3). Returns: - RolloutCorrectionConfig configured for PPO_IS bypass mode + RolloutCorrectionConfig configured for bypass mode with PPO-clip """ return cls( - rollout_is="token", - rollout_is_threshold=threshold, + rollout_is=None, rollout_rs=None, bypass_mode=True, - use_policy_gradient=False, + loss_type="ppo_clip", + ) + + @classmethod + def bypass_ppo_clip_geo_rs( + cls, + rs_threshold: float = 1.001, + rs_threshold_lower: Optional[float] = None, + veto_threshold: float = 1e-4, + ) -> "RolloutCorrectionConfig": + """Bypass mode with PPO-clip loss and Geometric Rejection Sampling. + + PPO clipped objective in bypass mode with geometric RS to mask outliers. + The PPO ratio = π_θ/π_rollout already handles IS correction. + + Skips old_log_prob computation for faster execution (2 policies instead of 3). + Solves the "Length Trap" problem for CoT/agent workloads. + + Args: + rs_threshold (float): Geometric RS threshold (upper). Default: 1.001 (±0.1%) + rs_threshold_lower (Optional[float]): Geometric RS threshold (lower). + If None, auto-computed as reciprocal of rs_threshold. Default: None + veto_threshold (float): Per-token veto threshold. 
Default: 1e-4 + + Returns: + RolloutCorrectionConfig configured for bypass mode with PPO-clip + Geo-RS + """ + return cls( + rollout_is=None, + rollout_rs="geometric", + rollout_rs_threshold=rs_threshold, + rollout_rs_threshold_lower=rs_threshold_lower, + rollout_token_veto_threshold=veto_threshold, + bypass_mode=True, + loss_type="ppo_clip", ) @classmethod - def pg_is(cls, threshold: float = 2.0) -> "RolloutCorrectionConfig": - """Policy Gradient with IS Correction. + def bypass_pg_is(cls, threshold: float = 2.0) -> "RolloutCorrectionConfig": + """Bypass mode with REINFORCE loss and IS Correction. - Uses policy gradient loss with explicit IS correction. + Uses REINFORCE loss with explicit IS correction in bypass mode. No PPO clipping. Args: threshold (float): Upper threshold for IS weights. Default: 2.0 Returns: - RolloutCorrectionConfig configured for PG with IS + RolloutCorrectionConfig configured for bypass mode with REINFORCE + IS """ return cls( rollout_is="sequence", rollout_is_threshold=threshold, rollout_rs=None, bypass_mode=True, - use_policy_gradient=True, + loss_type="reinforce", ) @classmethod - def pg_rs( + def bypass_pg_rs( cls, rs_threshold: float = 1.001, rs_threshold_lower: Optional[float] = None, veto_threshold: float = 1e-4, ) -> "RolloutCorrectionConfig": - """Policy Gradient with Rejection Sampling (Geo-RS). + """Bypass mode with REINFORCE loss and Geometric Rejection Sampling. - Policy gradient with geometric rejection sampling (no IS weights) in bypass mode. + REINFORCE with geometric rejection sampling (no IS weights) in bypass mode. Skips old_log_prob computation for faster execution. Solves the "Length Trap" problem where standard IS estimators penalize long sequences. @@ -319,7 +354,7 @@ def pg_rs( veto_threshold (float): Per-token veto threshold. 
Default: 1e-4 Returns: - RolloutCorrectionConfig configured for PG with Geo-RS + RolloutCorrectionConfig configured for bypass mode with REINFORCE + Geo-RS """ return cls( rollout_is=None, @@ -328,7 +363,7 @@ def pg_rs( rollout_rs_threshold_lower=rs_threshold_lower, rollout_token_veto_threshold=veto_threshold, bypass_mode=True, - use_policy_gradient=True, + loss_type="reinforce", ) @classmethod @@ -367,17 +402,17 @@ def geo_rs_seq_tis( ) @classmethod - def pg_geo_rs_seq_tis( + def bypass_pg_geo_rs_seq_tis( cls, is_threshold: float = 2.0, rs_threshold: float = 1.001, rs_threshold_lower: Optional[float] = None, veto_threshold: Optional[float] = 1e-4, ) -> "RolloutCorrectionConfig": - """Policy Gradient with Geo-RS-Seq-TIS (Bypass mode). + """Bypass mode with REINFORCE loss, Geo-RS, and Sequence-level IS. Combines geometric rejection with sequence-level IS - in bypass mode with policy gradient loss (no PPO clipping). + in bypass mode with REINFORCE loss (no PPO clipping). Suitable for reasoning models (CoT, o1-style) and agents when you want bypass mode efficiency. @@ -390,7 +425,7 @@ def pg_geo_rs_seq_tis( veto_threshold (Optional[float]): Per-token veto threshold. 
Default: 1e-4 Returns: - RolloutCorrectionConfig configured for PG with Geo-RS-Seq-TIS + RolloutCorrectionConfig configured for bypass mode with REINFORCE + Geo-RS + Seq-TIS """ return cls( rollout_is="sequence", @@ -400,7 +435,7 @@ def pg_geo_rs_seq_tis( rollout_rs_threshold_lower=rs_threshold_lower, rollout_token_veto_threshold=veto_threshold, bypass_mode=True, - use_policy_gradient=True, + loss_type="reinforce", ) @classmethod @@ -440,9 +475,9 @@ class AlgoConfig(BaseConfig): - RolloutCorrectionConfig.decoupled_seq_is() - Decoupled mode with sequence-level IS - RolloutCorrectionConfig.decoupled_seq_is_rs() - Decoupled mode with sequence IS + RS - RolloutCorrectionConfig.decoupled_geo_rs() - Decoupled mode with geometric RS + veto - - RolloutCorrectionConfig.ppo_is_bypass() - Bypass mode (skips old_log_prob) - - RolloutCorrectionConfig.pg_is() - Policy gradient with IS - - RolloutCorrectionConfig.pg_rs() - Policy gradient with RS + - RolloutCorrectionConfig.bypass_ppo_clip() - Bypass mode with PPO-clip + - RolloutCorrectionConfig.bypass_pg_is() - Bypass mode with REINFORCE + IS + - RolloutCorrectionConfig.bypass_pg_rs() - Bypass mode with REINFORCE + RS For backward compatibility, you can still pass a dict, which will be converted to RolloutCorrectionConfig automatically. diff --git a/verl/trainer/config/algorithm/rollout_correction.yaml b/verl/trainer/config/algorithm/rollout_correction.yaml index 7c958c5ee76..cfd74316f4f 100644 --- a/verl/trainer/config/algorithm/rollout_correction.yaml +++ b/verl/trainer/config/algorithm/rollout_correction.yaml @@ -1,6 +1,6 @@ # Rollout Correction: corrects off-policy distribution shifts # See documentation: docs/algo/rollout_corr.md -# Use presets: RolloutCorrectionConfig.decoupled_seq_is(), .pg_is(), etc. +# Use presets: RolloutCorrectionConfig.decoupled_seq_is(), .bypass_pg_is(), etc. 
# IS aggregation level: null (disabled), "token" (per-token), "sequence" (per-sequence) rollout_is: null @@ -23,8 +23,10 @@ rollout_token_veto_threshold: null # Operating mode: false = Decoupled (3 policies), true = Bypass (2 policies) bypass_mode: false -# Loss function: false = PPO with clipping, true = Policy gradient (no clipping) -use_policy_gradient: false +# Loss type in bypass mode (bypass_mode=true): +# - "ppo_clip": PPO clipped objective (IS handled by ratio, default) +# - "reinforce": REINFORCE with explicit IS weights (no PPO clipping) +loss_type: ppo_clip # Batch normalize IS weights: false = raw weights, true = normalize to mean=1.0 rollout_is_batch_normalize: false diff --git a/verl/trainer/config/critic/critic.yaml b/verl/trainer/config/critic/critic.yaml index f201a34b40c..95cbeaf92bc 100644 --- a/verl/trainer/config/critic/critic.yaml +++ b/verl/trainer/config/critic/critic.yaml @@ -73,6 +73,9 @@ ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} # Shuffle training data across PPO epochs shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} +# The seed used to construct mini-batch +data_loader_seed: 42 + # PPO value function clipping range cliprange_value: 0.5 diff --git a/verl/trainer/config/engine/megatron.yaml b/verl/trainer/config/engine/megatron.yaml index 84601f5a3f5..b588a96c1b3 100644 --- a/verl/trainer/config/engine/megatron.yaml +++ b/verl/trainer/config/engine/megatron.yaml @@ -75,7 +75,7 @@ override_transformer_config: override_mcore_model_config: {} # oc.select: default val for ref.megatron.use_mbridge -use_mbridge: False +use_mbridge: True # oc.select: default val for ref.megatron.vanilla_mbridge vanilla_mbridge: True diff --git a/verl/trainer/config/model/hf_model.yaml b/verl/trainer/config/model/hf_model.yaml index 6d02b8eac89..1aefa6984ea 100644 --- a/verl/trainer/config/model/hf_model.yaml +++ b/verl/trainer/config/model/hf_model.yaml @@ -37,7 +37,7 @@ enable_gradient_checkpointing: True 
enable_activation_offload: False # whether to use remove padding. Only valid when we use hf model definition -use_remove_padding: False +use_remove_padding: True # Set to positive value to enable LoRA (e.g., 32) lora_rank: 0 diff --git a/verl/trainer/config/ppo_megatron_trainer.yaml b/verl/trainer/config/ppo_megatron_trainer.yaml index 9d9959aeabd..5050d3d8890 100644 --- a/verl/trainer/config/ppo_megatron_trainer.yaml +++ b/verl/trainer/config/ppo_megatron_trainer.yaml @@ -18,7 +18,7 @@ defaults: # Critic model config. - critic@critic: megatron_critic # Reward model config. - - reward_model@reward_model: megatron_reward_model + - reward_model@reward_model: megatron_reward_loop # Rollout correction config. - algorithm@algorithm.rollout_correction: rollout_correction - _self_ diff --git a/verl/trainer/config/ppo_trainer.yaml b/verl/trainer/config/ppo_trainer.yaml index c226d2d06d9..7489b522fa2 100644 --- a/verl/trainer/config/ppo_trainer.yaml +++ b/verl/trainer/config/ppo_trainer.yaml @@ -31,7 +31,7 @@ defaults: - critic@critic: dp_critic # Reward model config. - - reward_model@reward_model: dp_reward_model + - reward_model@reward_model: dp_reward_loop # Rollout correction config. - algorithm@algorithm.rollout_correction: rollout_correction diff --git a/verl/trainer/config/reward_model/dp_reward_loop.yaml b/verl/trainer/config/reward_model/dp_reward_loop.yaml new file mode 100644 index 00000000000..04fb106df1c --- /dev/null +++ b/verl/trainer/config/reward_model/dp_reward_loop.yaml @@ -0,0 +1,43 @@ +defaults: + - dp_reward_model + - _self_ + +use_reward_loop: True +reward_manager: naive +enable: False + +# Whether to deploy the model to a separate resource pool. +enable_resource_pool: False +n_gpus_per_node: 8 +num_workers: 1 +nnodes: 0 + +model: + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: False + +rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + + prompt_length: 2048 + response_length: 2048 \ No newline at end of file diff --git a/verl/trainer/config/reward_model/megatron_reward_loop.yaml b/verl/trainer/config/reward_model/megatron_reward_loop.yaml new file mode 100644 index 00000000000..f99b94abcc4 --- /dev/null +++ b/verl/trainer/config/reward_model/megatron_reward_loop.yaml @@ -0,0 +1,43 @@ +defaults: + - megatron_reward_model + - _self_ + +use_reward_loop: True +reward_manager: naive +enable: False + +# Whether to deploy the model to a separate resource pool. +enable_resource_pool: False +n_gpus_per_node: 8 +num_workers: 1 +nnodes: 0 + +model: + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: False + +rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + + prompt_length: 2048 + response_length: 2048 \ No newline at end of file diff --git a/verl/trainer/config/sft_trainer_engine.yaml b/verl/trainer/config/sft_trainer_engine.yaml index dd70640353b..f11b3bf8f1f 100644 --- a/verl/trainer/config/sft_trainer_engine.yaml +++ b/verl/trainer/config/sft_trainer_engine.yaml @@ -36,6 +36,13 @@ data: use_shm: False apply_chat_template_kwargs: {} + # MultiTurnSFTDataset apply_chat_template to each turn separately and concat `input_ids` + # as a whole sequence, which may not equal to apply_chat_template to whole messages at once. + # For example, Qwen Thinking series models add tags to last turn, please check + # your tokenizer chat template settings. + # Set to True to ignore input_ids mismatch and use the concatenated input_ids as the final input_ids. 
+ ignore_input_ids_mismatch: False + # Checkpoint configuration checkpoint: _target_: verl.trainer.config.CheckpointConfig diff --git a/verl/trainer/fsdp_sft_trainer.py b/verl/trainer/fsdp_sft_trainer.py index 4088fafaf03..27ff801b362 100644 --- a/verl/trainer/fsdp_sft_trainer.py +++ b/verl/trainer/fsdp_sft_trainer.py @@ -49,7 +49,13 @@ from verl.utils.checkpoint.fsdp_checkpoint_manager import FSDPCheckpointManager from verl.utils.dataset import SFTDataset from verl.utils.dataset.multiturn_sft_dataset import MultiTurnSFTDataset -from verl.utils.device import get_device_id, get_device_name, is_cuda_available, is_npu_available +from verl.utils.device import ( + auto_set_ascend_device_name, + get_device_id, + get_device_name, + is_cuda_available, + is_npu_available, +) from verl.utils.distributed import destroy_global_process_group, initialize_global_process_group from verl.utils.fs import copy_to_local from verl.utils.fsdp_utils import ( @@ -132,6 +138,7 @@ def __init__( if self.device_mesh.get_rank() == 0: print(self.config) + self.device_name = self.config.trainer.device def _normalize_config_bsz(self): @@ -835,6 +842,9 @@ def run_sft(config): @hydra.main(config_path="config", config_name="sft_trainer", version_base=None) def main(config): + # Automatically set `config.trainer.device = npu` when running on Ascend NPU. 
+ auto_set_ascend_device_name(config) + run_sft(config) diff --git a/verl/trainer/main_generation.py b/verl/trainer/main_generation.py index 791c17af7ef..18aaa8cdbd0 100644 --- a/verl/trainer/main_generation.py +++ b/verl/trainer/main_generation.py @@ -84,6 +84,7 @@ def main_task(config): ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(ActorRolloutRefWorker), config=config, role="rollout") resource_pool = RayResourcePool(process_on_nodes=[config.trainer.n_gpus_per_node] * config.trainer.nnodes) + wg = RayWorkerGroup( resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init, diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py index f738c98150f..0f3935b5bfb 100644 --- a/verl/trainer/main_ppo.py +++ b/verl/trainer/main_ppo.py @@ -28,7 +28,7 @@ from verl.trainer.ppo.reward import load_reward_manager from verl.trainer.ppo.utils import need_critic, need_reference_policy from verl.utils.config import validate_config -from verl.utils.device import is_cuda_available +from verl.utils.device import auto_set_ascend_device_name, is_cuda_available from verl.utils.import_utils import load_extern_object @@ -39,6 +39,9 @@ def main(config): Args: config_dict: Hydra configuration dictionary containing training parameters. """ + # Automatically set `config.trainer.device = npu` when running on Ascend NPU. + auto_set_ascend_device_name(config) + run_ppo(config) @@ -175,18 +178,21 @@ def add_actor_rollout_worker(self, config): def add_critic_worker(self, config): """Add critic worker to role mapping.""" + use_legacy_worker_impl = config.trainer.get("use_legacy_worker_impl", "auto") if config.critic.strategy in {"fsdp", "fsdp2"}: - use_legacy_worker_impl = config.trainer.get("use_legacy_worker_impl", "auto") if use_legacy_worker_impl in ["auto", "enable"]: from verl.workers.fsdp_workers import CriticWorker elif use_legacy_worker_impl == "disable": - from verl.workers.engine_workers import CriticWorker + # we don't need to specialize critic worker. 
Just use TrainingWorker
+                from verl.workers.engine_workers import TrainingWorker
+
+                CriticWorker = TrainingWorker
                 print("Using new worker implementation")
             else:
                 raise ValueError(f"Invalid use_legacy_worker_impl: {use_legacy_worker_impl}")
         elif config.critic.strategy == "megatron":
+            # TODO: switch this to TrainingWorker as well
             from verl.workers.megatron_workers import CriticWorker
         else:
diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py
index dc30adfc343..7849bfbae9f 100644
--- a/verl/trainer/ppo/core_algos.py
+++ b/verl/trainer/ppo/core_algos.py
@@ -781,15 +781,21 @@ def agg_loss(
     """
     Aggregate the loss across global batch to ensure the loss is invariant to fsdp/megatron parallelism.
 
+    NOTE: ``dp_size``, ``batch_num_tokens``, and ``global_batch_size`` are only compatible with the new model engine
+    for now, while the legacy model engines conduct the aggregation outside ``agg_loss``.
+
     NOTE: The returned loss has different behaviors for different backend:
     - FSDP: the loss is directly used for backward.
     - Megatron: the loss should be scaled by `num_microbatches` and `cp_size` for pp schedule.
 
+    # TODO: Consider the numerical stability?
+
     Args:
         loss_mat: micro batch loss matrix, (bs, response_length)
         loss_mask: micro batch loss mask, (bs, response_length)
         loss_agg_mode: method to aggregate the loss matrix into a scalar
-        dp_size: data parallel size
+        dp_size: data parallel size. When applying manual aggregation,
+            scaling up the ``loss`` by ``dp_size`` can cancel out FSDP averaging.
         batch_num_tokens: number of valid tokens in global batch
         global_batch_size: global batch size
         loss_scale_factor: scale factor for "seq-mean-token-sum-norm" mode. If None, uses loss_mask.shape[-1].
@@ -799,30 +805,39 @@ def agg_loss(
         loss: `a scalar torch.Tensor` aggregated loss
     """
 
+    # NOTE: `masked_sum` is more robust than multiplying the `mask`.
if loss_agg_mode == "token-mean":
         if batch_num_tokens is None:
             batch_num_tokens = loss_mask.sum()
         loss = verl_F.masked_sum(loss_mat, loss_mask) / batch_num_tokens * dp_size
-    elif loss_agg_mode == "seq-mean-token-sum":
-        seq_losses = torch.sum(loss_mat * loss_mask, dim=-1)  # token-sum
-        seq_mask = (torch.sum(loss_mask, dim=-1) > 0).float()  # exclude fully masked sequences
-        if global_batch_size is None:
-            global_batch_size = seq_mask.sum()
-        loss = verl_F.masked_sum(seq_losses, seq_mask) / global_batch_size * dp_size  # seq-mean
-    elif loss_agg_mode == "seq-mean-token-mean":
-        seq_mask = torch.sum(loss_mask, dim=-1)  # per-sequence token count
-        seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) / (seq_mask + 1e-8)  # token-mean
-        seq_mask = (seq_mask > 0).float()  # exclude fully masked sequences
-        if global_batch_size is None:
-            global_batch_size = seq_mask.sum()
-        loss = verl_F.masked_sum(seq_losses, seq_mask) / global_batch_size * dp_size  # seq-mean
-    elif loss_agg_mode == "seq-mean-token-sum-norm":
-        seq_losses = torch.sum(loss_mat * loss_mask, dim=-1)
-        if loss_scale_factor is None:
-            loss_scale_factor = loss_mask.shape[-1]
-        loss = torch.sum(seq_losses) / loss_scale_factor
+    elif loss_agg_mode.startswith("seq-mean"):
+        # TODO: Correct and unify the denominator logic.
+        if global_batch_size is not None:
+            seq_denominator = global_batch_size * dp_size
+        else:  # The default logic which is only correct when the batch sizes are even.
+            local_bsz = loss_mat.shape[0]
+            seq_denominator = local_bsz
+
+        if loss_agg_mode.startswith("seq-mean-token-sum"):
+            seq_losses = verl_F.masked_sum(loss_mat, loss_mask, axis=-1)  # token-sum per sequence
+
+            if loss_agg_mode == "seq-mean-token-sum":
+                pass  # TODO: Add assertion.
+ elif loss_agg_mode == "seq-mean-token-sum-norm": + if loss_scale_factor is None: + loss_scale_factor = loss_mask.shape[-1] + seq_losses = seq_losses / loss_scale_factor + else: + raise ValueError(f"Invalid {loss_agg_mode=}") + elif loss_agg_mode == "seq-mean-token-mean": + token_counts = torch.sum(loss_mask, dim=-1) # per-sequence token count + # token-mean per sequence + seq_losses = verl_F.masked_sum(loss_mat, loss_mask, axis=-1) / (token_counts + 1e-8) + else: + raise ValueError(f"Invalid {loss_agg_mode=}") + loss = torch.sum(seq_losses) / seq_denominator # seq-mean else: - raise ValueError(f"Invalid loss_agg_mode: {loss_agg_mode}") + raise ValueError(f"Invalid {loss_agg_mode=}") return loss @@ -1582,115 +1597,60 @@ def compute_weights(scores: torch.Tensor, reweight_method: str, weight_pow: floa return resampled_data -def compute_policy_loss_with_rollout_correction( - rollout_log_prob, - log_prob, - advantages, - eos_mask, - loss_agg_mode="seq-mean-token-sum", +def compute_policy_loss_reinforce( + rollout_log_prob: torch.Tensor, + log_prob: torch.Tensor, + advantages: torch.Tensor, + response_mask: torch.Tensor, + loss_agg_mode: str = "seq-mean-token-sum", config: Optional[ActorConfig] = None, - loss_scale_factor=1.0, - rollout_is: Optional[str] = None, - rollout_is_threshold: float = 2.0, - rollout_rs: Optional[str] = None, - rollout_rs_threshold: Optional[float] = None, - rollout_rs_threshold_lower: Optional[float] = None, - rollout_token_veto_threshold: Optional[float] = None, - rollout_is_batch_normalize: bool = False, -): - """Compute policy loss with pure rollout correction (no PPO clipping). + rollout_is_weights: Optional[torch.Tensor] = None, +) -> tuple[torch.Tensor, dict[str, Any]]: + """Compute REINFORCE-style policy gradient loss with optional IS correction. - This function implements policy gradient with importance sampling correction - for rollout-training policy mismatch, without PPO's clipping mechanism. 
+ This function implements policy gradient (REINFORCE) with optional importance + sampling correction for rollout-training policy mismatch. Mathematical formulation: - Without IS (rollout_is=None): + Without IS (rollout_is_weights=None): L = -E[log π(a|s) * A(s,a)] Gradient: ∇_θ L = -E[∇log π(a|s) * A] (standard REINFORCE) - With IS (rollout_is enabled): + With IS (rollout_is_weights provided): L = -E_π_rollout[w * log π(a|s) * A(s,a)] where w = π_current / π_rollout (truncated IS weight) Gradient: ∇_θ L = -E[w * ∇log π(a|s) * A] (IS-corrected policy gradient) Args: rollout_log_prob: Log probabilities from rollout policy (e.g., vLLM BF16). - Shape: (batch_size, seq_length) + Shape: (batch_size, seq_length). Used for KL computation. log_prob: Log probabilities from current training policy. Shape: (batch_size, seq_length) advantages: Advantage estimates for each token. Shape: (batch_size, seq_length) - eos_mask: Mask indicating valid tokens (1 for valid, 0 for padding). - Shape: (batch_size, seq_length) + response_mask: Mask indicating valid tokens (1 for valid, 0 for padding). + Shape: (batch_size, seq_length). Should already include rejection sampling. loss_agg_mode: Loss aggregation strategy (see agg_loss for details). - loss_scale_factor: Multiplicative scaling factor applied to final loss. - rollout_is: IS aggregation level ("token", "sequence", or None). - rollout_is_threshold: Upper threshold for truncating IS weights. - rollout_rs: Rejection sampling aggregation level (or None to disable). - rollout_rs_threshold: Upper threshold for rejection sampling. - rollout_rs_threshold_lower: Lower threshold for rejection sampling. - rollout_token_veto_threshold: Per-token veto threshold for catastrophic outliers. - rollout_is_batch_normalize: Whether to normalize IS weights to have mean=1.0 per batch. + config: Actor config (required for global_batch_info). + rollout_is_weights: Pre-computed IS weights (π_current / π_rollout). + Shape: (batch_size, seq_length). 
None to disable IS correction. - Note: - Unlike compute_policy_loss (PPO), this function: - - Does NOT use PPO clipping (no old_log_prob needed) - - Directly applies IS correction computed from current vs rollout - - Computes IS/RS on-the-fly during training - - Usage: - This function is called by the actor when: - - bypass_mode=True (trainer uses rollout_log_prob as old_log_prob) - - use_policy_gradient=True (actor uses this function instead of compute_policy_loss) - - Example config: - algorithm: - rollout_correction: - bypass_mode: true - use_policy_gradient: true - rollout_is: "token" - rollout_is_threshold: 2.0 - rollout_rs: "token" - rollout_rs_threshold: 2.0 - rollout_rs_threshold_lower: 0.5 + Returns: + Tuple of (loss, metrics): + loss: Scalar policy gradient loss + metrics: Dictionary with "actor/ppo_kl" + Note: + Unlike PPO (compute_policy_loss_vanilla), this function: + - Does NOT use PPO clipping + - Uses log π(a|s) directly (not ratio) + - IS weights are applied as multiplicative factor """ - # Import rollout correction helper - from verl.trainer.ppo.rollout_corr_helper import compute_rollout_correction_and_rejection_mask - - assert config is not None, "ActorConfig must be provided for rollout correction" - - # Compute IS weights and rejection mask on-the-fly - # Use no_grad since weights are detached inside and metrics don't need gradients - with torch.no_grad(): - rollout_is_weights_proto, modified_response_mask, rollout_metrics = ( - compute_rollout_correction_and_rejection_mask( - old_log_prob=log_prob, # Current policy - rollout_log_prob=rollout_log_prob, # Rollout policy - response_mask=eos_mask, - rollout_is=rollout_is, - rollout_is_threshold=rollout_is_threshold, - rollout_rs=rollout_rs, - rollout_rs_threshold=rollout_rs_threshold, - rollout_rs_threshold_lower=rollout_rs_threshold_lower, - rollout_token_veto_threshold=rollout_token_veto_threshold, - rollout_is_batch_normalize=rollout_is_batch_normalize, - ) - ) - - # Extract weights tensor from 
DataProto (or None if disabled) - rollout_is_weights = rollout_is_weights_proto.batch["rollout_is_weights"] if rollout_is_weights_proto else None + assert config is not None, "ActorConfig must be provided for REINFORCE loss" - # Apply rejection mask (if RS is enabled) - effective_mask = modified_response_mask if rollout_rs is not None else eos_mask - - # Compute pure policy gradient loss with IS correction + # Compute pure policy gradient loss with optional IS correction # Standard REINFORCE: L = -E[log π(a|s) * A] # With IS: L = -E[w * log π(a|s) * A] where w = π_current / π_rollout - # - # Note: rollout_is_weights already contains w = π_current / π_rollout - # So we apply it to the standard log-prob trick formula - if rollout_is_weights is not None: # IS-corrected policy gradient: L = -E[stopgrad(w) · log π · A] pg_losses = -advantages * log_prob * rollout_is_weights @@ -1698,33 +1658,27 @@ def compute_policy_loss_with_rollout_correction( # Standard REINFORCE: L = -E[log π · A] pg_losses = -advantages * log_prob - # Aggregate loss (apply scale factor manually) - pg_loss = ( - agg_loss( - loss_mat=pg_losses, - loss_mask=effective_mask, - loss_agg_mode=loss_agg_mode, - **config.global_batch_info, - ) - * loss_scale_factor + # Aggregate loss + pg_loss = agg_loss( + loss_mat=pg_losses, + loss_mask=response_mask, + loss_agg_mode=loss_agg_mode, + **config.global_batch_info, ) # Compute KL divergence between current and rollout policy negative_approx_kl = log_prob - rollout_log_prob - kl_divergence = verl_F.masked_mean(-negative_approx_kl, effective_mask) + kl_divergence = verl_F.masked_mean(-negative_approx_kl, response_mask) - pg_metrics = rollout_metrics - pg_metrics.update( - { - "actor/ppo_kl": kl_divergence.detach().item(), - } - ) + pg_metrics = { + "actor/ppo_kl": kl_divergence.detach().item(), + } return pg_loss, pg_metrics -@register_policy_loss("rollout_correction") -def compute_policy_loss_rollout_correction_wrapper( +@register_policy_loss("bypass_mode") 
+def compute_policy_loss_bypass_mode( old_log_prob: torch.Tensor, log_prob: torch.Tensor, advantages: torch.Tensor, @@ -1733,34 +1687,70 @@ def compute_policy_loss_rollout_correction_wrapper( config: Optional[ActorConfig] = None, rollout_is_weights: torch.Tensor | None = None, ) -> tuple[torch.Tensor, dict[str, Any]]: - """Wrapper for compute_policy_loss_with_rollout_correction to match PolicyLossFn interface. - - This function is used when algorithm.rollout_correction.use_policy_gradient=True. - In this mode, the trainer has already set old_log_prob=rollout_log_prob (bypass mode). + """Bypass mode policy loss supporting both REINFORCE and PPO-clip. + + This function is the entry point for bypass mode, where old_log_prob = rollout_log_prob. + It computes IS weights and rejection masks, then dispatches to either REINFORCE or + PPO-clip loss based on the loss_type configuration. + + IMPORTANT - Bypass mode semantics: + In bypass mode, the trainer sets old_log_prob = rollout_log_prob. + This means: + - For REINFORCE: We use IS weights w = π_current / π_rollout explicitly + - For PPO-clip: The PPO ratio π_current / π_old = π_current / π_rollout + already incorporates the IS correction through clipping, so we do NOT + apply additional IS weights (would be double-counting) + + Loss types: + - "ppo_clip" (default): PPO clipped objective (compute_policy_loss_vanilla) + L = -E[min(r*A, clip(r)*A)] where r = π_current / π_rollout + Note: IS weights are NOT applied (clipping handles the ratio) + - "reinforce": REINFORCE-style policy gradient with IS correction + L = -E[w * log π(a|s) * A] where w = π_current / π_rollout Args: - old_log_prob: In bypass mode, this is actually rollout_log_prob - log_prob: Current policy log probabilities - advantages: Advantage estimates - response_mask: Valid token mask - loss_agg_mode: Loss aggregation mode - config: Actor config containing rollout_correction settings - rollout_is_weights: Pre-computed IS weights (ignored, computed internally) 
+ old_log_prob: In bypass mode, this is actually rollout_log_prob. + Shape: (batch_size, seq_length) + log_prob: Current policy log probabilities. + Shape: (batch_size, seq_length) + advantages: Advantage estimates. + Shape: (batch_size, seq_length) + response_mask: Valid token mask (1=valid, 0=padding). + Shape: (batch_size, seq_length) + loss_agg_mode: Loss aggregation mode (passed to underlying loss function). + config: Actor config containing rollout_correction settings in policy_loss. + rollout_is_weights: Pre-computed IS weights (ignored, computed internally). + + Config options (in config.policy_loss.rollout_correction): + loss_type: "ppo_clip" (default) or "reinforce" + rollout_is: IS aggregation level ("token", "sequence", or None) + rollout_is_threshold: Upper threshold for truncating IS weights (default: 2.0) + rollout_rs: Rejection sampling level ("token", "sequence", "geometric", or None) + rollout_rs_threshold: Upper threshold for rejection sampling + rollout_rs_threshold_lower: Lower threshold for rejection sampling + rollout_token_veto_threshold: Per-token veto threshold for catastrophic outliers + rollout_is_batch_normalize: Whether to normalize IS weights to mean=1.0 + + Returns: + Tuple of (loss, metrics): + loss: Scalar policy loss + metrics: Dictionary with rollout correction metrics and actor/ppo_kl """ - assert config is not None, "config is required for rollout_correction loss mode" + from verl.trainer.ppo.rollout_corr_helper import compute_rollout_correction_and_rejection_mask - # Extract rollout_correction config - # In ray_trainer, when use_policy_gradient=True, the rollout_correction config - # is embedded in actor config's policy_loss field + assert config is not None, "config is required for bypass_mode loss" + + # Extract rollout_correction config from policy_loss rollout_corr_config = config.policy_loss.get("rollout_correction", None) if hasattr(config, "policy_loss") else None if rollout_corr_config is None: raise ValueError( 
"rollout_correction config not found in policy_loss. " - "When using loss_mode='rollout_correction', ensure rollout_correction config is passed." + "When using loss_mode='bypass_mode', ensure rollout_correction config is passed." ) # Extract parameters + loss_type = rollout_corr_config.get("loss_type", "ppo_clip") rollout_is = rollout_corr_config.get("rollout_is", None) rollout_is_threshold = rollout_corr_config.get("rollout_is_threshold", 2.0) rollout_rs = rollout_corr_config.get("rollout_rs", None) @@ -1769,21 +1759,64 @@ def compute_policy_loss_rollout_correction_wrapper( rollout_token_veto_threshold = rollout_corr_config.get("rollout_token_veto_threshold", None) rollout_is_batch_normalize = rollout_corr_config.get("rollout_is_batch_normalize", False) - # Call the actual implementation - # In bypass mode, old_log_prob IS rollout_log_prob - return compute_policy_loss_with_rollout_correction( - rollout_log_prob=old_log_prob, # This is rollout_log_prob in bypass mode - log_prob=log_prob, - advantages=advantages, - eos_mask=response_mask, - loss_agg_mode=loss_agg_mode, - config=config, - loss_scale_factor=1.0, - rollout_is=rollout_is, - rollout_is_threshold=rollout_is_threshold, - rollout_rs=rollout_rs, - rollout_rs_threshold=rollout_rs_threshold, - rollout_rs_threshold_lower=rollout_rs_threshold_lower, - rollout_token_veto_threshold=rollout_token_veto_threshold, - rollout_is_batch_normalize=rollout_is_batch_normalize, - ) + # In bypass mode: old_log_prob IS rollout_log_prob + rollout_log_prob = old_log_prob + + # Compute IS weights and rejection mask + # Note: For PPO-clip, we still compute IS weights for metrics, but don't apply them + with torch.no_grad(): + rollout_is_weights_proto, modified_response_mask, rollout_metrics = ( + compute_rollout_correction_and_rejection_mask( + old_log_prob=log_prob, # Current policy (for IS ratio: π_current / π_rollout) + rollout_log_prob=rollout_log_prob, # Rollout policy + response_mask=response_mask, + rollout_is=rollout_is, + 
rollout_is_threshold=rollout_is_threshold, + rollout_rs=rollout_rs, + rollout_rs_threshold=rollout_rs_threshold, + rollout_rs_threshold_lower=rollout_rs_threshold_lower, + rollout_token_veto_threshold=rollout_token_veto_threshold, + rollout_is_batch_normalize=rollout_is_batch_normalize, + ) + ) + + # Extract IS weights tensor (or None if disabled) + computed_is_weights = rollout_is_weights_proto.batch["rollout_is_weights"] if rollout_is_weights_proto else None + + # Apply rejection mask (RS + veto) + effective_mask = modified_response_mask + + # Dispatch to appropriate loss function based on loss_type + if loss_type == "reinforce": + # REINFORCE: Apply IS weights explicitly + pg_loss, pg_metrics = compute_policy_loss_reinforce( + rollout_log_prob=rollout_log_prob, + log_prob=log_prob, + advantages=advantages, + response_mask=effective_mask, + loss_agg_mode=loss_agg_mode, + config=config, + rollout_is_weights=computed_is_weights, + ) + + elif loss_type == "ppo_clip": + # PPO-clip: The ratio π_current/π_old = π_current/π_rollout already handles IS + # DO NOT apply IS weights - would be double-counting! + # The clipping mechanism constrains the effective IS ratio + pg_loss, pg_metrics = compute_policy_loss_vanilla( # type: ignore[call-arg] + old_log_prob=rollout_log_prob, # = old_log_prob in bypass mode + log_prob=log_prob, + advantages=advantages, + response_mask=effective_mask, + loss_agg_mode=loss_agg_mode, + config=config, + rollout_is_weights=None, # Explicitly None - no IS weights for PPO-clip + ) + + else: + raise ValueError(f"Invalid loss_type: {loss_type}. 
Must be 'reinforce' or 'ppo_clip'.") + + # Merge rollout correction metrics + pg_metrics.update(rollout_metrics) + + return pg_loss, pg_metrics diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index e439a76d361..4558e750cc3 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -51,14 +51,19 @@ ) from verl.trainer.ppo.reward import compute_reward, compute_reward_async from verl.trainer.ppo.utils import Role, WorkerType, need_critic, need_reference_policy, need_reward_model +from verl.utils import tensordict_utils as tu from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path, should_save_ckpt_esi from verl.utils.config import omega_conf_to_dataclass from verl.utils.debug import marked_timer +from verl.utils.import_utils import load_class_from_fqn from verl.utils.metric import reduce_metrics +from verl.utils.py_functional import rename_dict from verl.utils.rollout_skip import RolloutSkip from verl.utils.seqlen_balancing import calculate_workload, get_seqlen_balanced_partitions, log_seqlen_unbalance from verl.utils.torch_functional import masked_mean from verl.utils.tracking import ValidationGenerationsLogger +from verl.workers.config import FSDPEngineConfig +from verl.workers.utils.padding import left_right_2_no_padding, no_padding_2_padding @dataclass @@ -323,7 +328,10 @@ def __init__( self.role_worker_mapping = role_worker_mapping self.resource_pool_manager = resource_pool_manager self.use_reference_policy = need_reference_policy(self.role_worker_mapping) + # legacy reward model implementation self.use_rm = need_reward_model(self.role_worker_mapping) + self.use_reward_loop = self.config.reward_model.use_reward_loop + self.use_critic = need_critic(self.config) self.ray_worker_group_cls = ray_worker_group_cls self.device_name = device_name if device_name else self.config.trainer.device @@ -343,6 +351,8 @@ def __init__( if self.config.algorithm.use_kl_in_reward: self.kl_ctrl_in_reward = 
core_algos.get_kl_controller(self.config.algorithm.kl_ctrl) + self.use_legacy_worker_impl = config.trainer.get("use_legacy_worker_impl", "auto") + self._create_dataloader(train_dataset, val_dataset, collate_fn, train_sampler) def _create_dataloader(self, train_dataset, val_dataset, collate_fn, train_sampler: Optional[Sampler]): @@ -696,7 +706,31 @@ def init_workers(self): # create critic if self.use_critic: resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic) - critic_cfg = omega_conf_to_dataclass(self.config.critic) + + from verl.workers.config import CriticConfig + + critic_cfg: CriticConfig = omega_conf_to_dataclass(self.config.critic) + + if self.use_legacy_worker_impl == "disable": + # convert critic_cfg into TrainingWorkerConfig + from verl.workers.engine_workers import TrainingWorkerConfig + + orig_critic_cfg = critic_cfg + if orig_critic_cfg.strategy == "fsdp": + engine_config: FSDPEngineConfig = orig_critic_cfg.model.fsdp_config + engine_config.infer_max_token_len_per_gpu = critic_cfg.ppo_infer_max_token_len_per_gpu + engine_config.max_token_len_per_gpu = critic_cfg.ppo_max_token_len_per_gpu + else: + raise NotImplementedError(f"Unknown strategy {orig_critic_cfg.strategy=}") + + critic_cfg = TrainingWorkerConfig( + model_type="value_model", + model_config=orig_critic_cfg.model_config, + engine_config=engine_config, + optimizer_config=orig_critic_cfg.optim, + checkpoint_config=orig_critic_cfg.checkpoint, + ) + critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=critic_cfg) self.resource_pool_to_cls[resource_pool][str(Role.Critic)] = critic_cls @@ -711,11 +745,37 @@ def init_workers(self): self.resource_pool_to_cls[resource_pool][str(Role.RefPolicy)] = ref_policy_cls # create a reward model if reward_fn is None - if self.use_rm: - # we create a RM here - resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel) - rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], 
config=self.config.reward_model) - self.resource_pool_to_cls[resource_pool][str(Role.RewardModel)] = rm_cls + # for legacy discriminative reward model, we create a reward model worker here + # for reward loop discriminative reward model, we create a reward loop manager here + if not self.use_reward_loop: + # legacy reward model only handle reward-model based scenario + if self.use_rm: + # we create a RM here + resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel) + rm_cls = RayClassWithInitArgs( + self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model + ) + self.resource_pool_to_cls[resource_pool][str(Role.RewardModel)] = rm_cls + else: + # reward loop handle hybrid reward scenario (rule, disrm, genrm, ...) + can_reward_loop_parallelize = self.config.actor_rollout_ref.rollout.mode == "async" and ( + not self.use_rm or self.config.reward_model.enable_resource_pool + ) + # judge if we can asynchronously parallelize reward model with actor rollout + # two condition that we can parallelize reward model with actor rollout: + # 1. reward model is not enabled (rule-based reward can parallelize) + # 2. 
reward model is enabled but extra resource pool is enabled + # If we cannot parallelize, we should enable synchronous mode here, and launch a reward loop manager here + # else for parallelize mode, we launch a reward worker for each rollout worker (in agent loop, not here) + if not can_reward_loop_parallelize: + from verl.experimental.reward import RewardLoopManager + + self.config.reward_model.n_gpus_per_node = self.config.trainer.n_gpus_per_node + resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel) + self.reward_loop_manager = RewardLoopManager( + config=self.config, + rm_resource_pool=resource_pool, + ) # initialize WorkerGroup # NOTE: if you want to use a different resource pool for each role, which can support different parallel size, @@ -751,7 +811,17 @@ def init_workers(self): if self.use_critic: self.critic_wg = all_wg[str(Role.Critic)] - self.critic_wg.init_model() + if self.use_legacy_worker_impl == "disable": + self.critic_wg.reset() + # assign critic loss + from functools import partial + + from verl.workers.utils.losses import value_loss + + value_loss_ = partial(value_loss, config=orig_critic_cfg) + self.critic_wg.set_loss_fn(value_loss_) + else: + self.critic_wg.init_model() if self.use_reference_policy and not self.ref_in_actor: if str(Role.RefPolicy) in all_wg: @@ -764,7 +834,7 @@ def init_workers(self): self.rm_wg = None # initalization of rm_wg will be deprecated in the future - if self.use_rm: + if self.use_rm and not self.use_reward_loop: self.rm_wg = all_wg[str(Role.RewardModel)] self.rm_wg.init_model() @@ -772,10 +842,18 @@ def init_workers(self): self.actor_rollout_wg = all_wg[str(actor_role)] self.actor_rollout_wg.init_model() + if self.ref_in_actor: + self.ref_policy_wg = self.actor_rollout_wg + # create async rollout manager and request scheduler self.async_rollout_mode = False if self.config.actor_rollout_ref.rollout.mode == "async": - from verl.experimental.agent_loop import AgentLoopManager + # Support custom 
AgentLoopManager via config + manager_class_fqn = self.config.actor_rollout_ref.rollout.get("agent", {}).get("agent_loop_manager_class") + if manager_class_fqn: + AgentLoopManager = load_class_from_fqn(manager_class_fqn, "AgentLoopManager") + else: + from verl.experimental.agent_loop import AgentLoopManager self.async_rollout_mode = True if self.config.reward_model.enable and self.config.reward_model.enable_resource_pool: @@ -923,7 +1001,7 @@ def _start_profiling(self, do_profile: bool) -> None: self.ref_policy_wg.start_profile(profile_step=self.global_steps) if self.use_critic: self.critic_wg.start_profile(profile_step=self.global_steps) - if self.use_rm: + if self.use_rm and not self.use_reward_loop: self.rm_wg.start_profile(profile_step=self.global_steps) def _stop_profiling(self, do_profile: bool) -> None: @@ -934,7 +1012,7 @@ def _stop_profiling(self, do_profile: bool) -> None: self.ref_policy_wg.stop_profile() if self.use_critic: self.critic_wg.stop_profile() - if self.use_rm: + if self.use_rm and not self.use_reward_loop: self.rm_wg.stop_profile() def _balance_batch(self, batch: DataProto, metrics, logging_prefix="global_seqlen", keep_minibatch=False): @@ -974,6 +1052,135 @@ def _balance_batch(self, batch: DataProto, metrics, logging_prefix="global_seqle ) metrics.update(global_balance_stats) + def _compute_values(self, batch: DataProto) -> DataProto: + if self.use_legacy_worker_impl == "disable": + batch_td = batch.to_tensordict() + # step 2: convert from padding to nopadding + batch_td = left_right_2_no_padding(batch_td) + # step 3: add meta info + tu.assign_non_tensor(batch_td, compute_loss=False) + output = self.critic_wg.infer_batch(batch_td) + output = output.get() + values = tu.get(output, "values") + values = no_padding_2_padding(values, batch_td) + values = tu.get_tensordict({"values": values.float()}) + values = DataProto.from_tensordict(values) + else: + values = self.critic_wg.compute_values(batch) + return values + + def 
_compute_ref_log_prob(self, batch: DataProto) -> DataProto: + if self.use_legacy_worker_impl == "disable": + # step 1: convert dataproto to tensordict. + batch_td = batch.to_tensordict() + # step 2: convert from padding to nopadding + batch_td = left_right_2_no_padding(batch_td) + # step 3: add meta info + tu.assign_non_tensor(batch_td, calculate_entropy=False, compute_loss=False) + output = self.ref_policy_wg.compute_ref_log_prob(batch_td) + # gather output + log_probs = tu.get(output, "log_probs") + # step 4. No padding to padding + log_probs = no_padding_2_padding(log_probs, batch_td) + # step 5: rebuild a tensordict and convert to dataproto + ref_log_prob = tu.get_tensordict({"ref_log_prob": log_probs.float()}) + ref_log_prob = DataProto.from_tensordict(ref_log_prob) + else: + ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) + + return ref_log_prob + + def _compute_old_log_prob(self, batch: DataProto): + if self.use_legacy_worker_impl == "disable": + # TODO: remove step 1, 2, 4 after we make the whole training tensordict and padding free + # step 1: convert dataproto to tensordict. + batch_td = batch.to_tensordict() + # step 2: convert from padding to nopadding + batch_td = left_right_2_no_padding(batch_td) + # step 3: add meta info + tu.assign_non_tensor(batch_td, calculate_entropy=True, compute_loss=False) + output = self.actor_rollout_wg.compute_log_prob(batch_td) + # gather output + entropy = tu.get(output, "entropy") + log_probs = tu.get(output, "log_probs") + old_log_prob_mfu = tu.get(output, "metrics")["mfu"] + # step 4. 
No padding to padding + entropy = no_padding_2_padding(entropy, batch_td) + log_probs = no_padding_2_padding(log_probs, batch_td) + # step 5: rebuild a tensordict and convert to dataproto + old_log_prob = tu.get_tensordict({"old_log_probs": log_probs.float(), "entropys": entropy.float()}) + old_log_prob = DataProto.from_tensordict(old_log_prob) + else: + old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) + old_log_prob_mfu = 0 + return old_log_prob, old_log_prob_mfu + + def _update_actor(self, batch: DataProto) -> DataProto: + rollout_config = self.config.actor_rollout_ref.rollout + batch.meta_info["multi_turn"] = rollout_config.multi_turn.enable + # TODO: Make "temperature" single source of truth from generation. + batch.meta_info["temperature"] = rollout_config.temperature + # update actor + if self.use_legacy_worker_impl == "disable": + batch_td = batch.to_tensordict() + # step 2: convert from padding to no-padding + batch_td = left_right_2_no_padding(batch_td) + calculate_entropy = self.config.actor_rollout_ref.actor.entropy_coeff != 0.0 + ppo_mini_batch_size = self.config.actor_rollout_ref.actor.ppo_mini_batch_size + ppo_mini_batch_size = ppo_mini_batch_size * self.config.actor_rollout_ref.rollout.n + ppo_epochs = self.config.actor_rollout_ref.actor.ppo_epochs + seed = self.config.actor_rollout_ref.actor.data_loader_seed + shuffle = self.config.actor_rollout_ref.actor.shuffle + tu.assign_non_tensor( + batch_td, + calculate_entropy=calculate_entropy, + global_batch_size=ppo_mini_batch_size, + mini_batch_size=ppo_mini_batch_size, + epochs=ppo_epochs, + seed=seed, + dataloader_kwargs={"shuffle": shuffle}, + ) + + actor_output = self.actor_rollout_wg.update_actor(batch_td) + actor_output = tu.get(actor_output, "metrics") + actor_output = rename_dict(actor_output, "actor/") + # modify key name + actor_output["perf/mfu/actor"] = actor_output.pop("actor/mfu") + actor_output = DataProto.from_single_dict(data={}, meta_info={"metrics": actor_output}) + else: + 
actor_output = self.actor_rollout_wg.update_actor(batch) + return actor_output + + def _update_critic(self, batch: DataProto) -> DataProto: + if self.use_legacy_worker_impl == "disable": + batch_td = batch.to_tensordict() + # step 2: convert from padding to no-padding + batch_td = left_right_2_no_padding(batch_td) + ppo_mini_batch_size = self.config.critic.ppo_mini_batch_size + ppo_mini_batch_size = ppo_mini_batch_size * self.config.actor_rollout_ref.rollout.n + ppo_epochs = self.config.critic.ppo_epochs + seed = self.config.critic.data_loader_seed + shuffle = self.config.critic.shuffle + tu.assign_non_tensor( + batch_td, + global_batch_size=ppo_mini_batch_size, + mini_batch_size=ppo_mini_batch_size, + epochs=ppo_epochs, + seed=seed, + dataloader_kwargs={"shuffle": shuffle}, + ) + + output = self.critic_wg.train_mini_batch(batch_td) + output = output.get() + output = tu.get(output, "metrics") + output = rename_dict(output, "critic/") + # modify key name + output["perf/mfu/critic"] = output.pop("critic/mfu") + critic_output = DataProto.from_single_dict(data={}, meta_info={"metrics": output}) + else: + critic_output = self.critic_wg.update_critic(batch) + return critic_output + def fit(self): """ The training loop of PPO. 
@@ -1085,7 +1292,11 @@ def fit(self): # compute reward model score on batch rm_scores = None if self.use_rm and "rm_scores" not in batch.batch.keys(): - rm_scores = self.rm_wg.compute_rm_score(batch) + if not self.use_reward_loop: + rm_scores = self.rm_wg.compute_rm_score(batch) + else: + assert self.reward_loop_manager is not None, "RewardLoopManager is None" + rm_scores = self.reward_loop_manager.compute_rm_score(batch) batch = batch.union(rm_scores) reward_baseline_tensor, _ = compute_reward(batch, self.reward_fn) reward_baseline_tensor = reward_baseline_tensor.sum(dim=-1) @@ -1117,7 +1328,11 @@ def fit(self): with marked_timer("reward", timing_raw, color="yellow"): # compute reward model score if self.use_rm and "rm_scores" not in batch.batch.keys(): - reward_tensor = self.rm_wg.compute_rm_score(batch) + if not self.use_reward_loop: + reward_tensor = self.rm_wg.compute_rm_score(batch) + else: + assert self.reward_loop_manager is not None, "RewardLoopManager is None" + reward_tensor = self.reward_loop_manager.compute_rm_score(batch) batch = batch.union(reward_tensor) if self.config.reward_model.launch_reward_fn_async: @@ -1134,16 +1349,16 @@ def fit(self): rollout_corr_config = self.config.algorithm.get("rollout_correction", None) bypass_recomputing_logprobs = rollout_corr_config and rollout_corr_config.get("bypass_mode", False) if bypass_recomputing_logprobs: # Use `rollout_log_probs` - from verl.trainer.ppo.rollout_corr_helper import apply_rollout_correction + from verl.trainer.ppo.rollout_corr_helper import apply_bypass_mode - apply_rollout_correction( + apply_bypass_mode( batch=batch, rollout_corr_config=rollout_corr_config, policy_loss_config=self.config.actor_rollout_ref.actor.policy_loss, ) else: # Recompute old_log_probs with marked_timer("old_log_prob", timing_raw, color="blue"): - old_log_prob = self.actor_rollout_wg.compute_log_prob(batch) + old_log_prob, old_log_prob_mfu = self._compute_old_log_prob(batch) entropys = old_log_prob.batch["entropys"] 
response_masks = batch.batch["response_mask"] actor_config = self.config.actor_rollout_ref.actor @@ -1153,7 +1368,10 @@ def fit(self): loss_agg_mode=actor_config.loss_agg_mode, loss_scale_factor=actor_config.loss_scale_factor, ) - old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()} + old_log_prob_metrics = { + "actor/entropy": entropy_agg.detach().item(), + "perf/mfu/actor_infer": old_log_prob_mfu, + } metrics.update(old_log_prob_metrics) old_log_prob.batch.pop("entropys") batch = batch.union(old_log_prob) @@ -1168,16 +1386,13 @@ def fit(self): if self.use_reference_policy: # compute reference log_prob with marked_timer(str(Role.RefPolicy), timing_raw, color="olive"): - if not self.ref_in_actor: - ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) - else: - ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch) + ref_log_prob = self._compute_ref_log_prob(batch) batch = batch.union(ref_log_prob) # compute values if self.use_critic: with marked_timer("values", timing_raw, color="cyan"): - values = self.critic_wg.compute_values(batch) + values = self._compute_values(batch) batch = batch.union(values) with marked_timer("adv", timing_raw, color="brown"): @@ -1232,7 +1447,7 @@ def fit(self): # update critic if self.use_critic: with marked_timer("update_critic", timing_raw, color="pink"): - critic_output = self.critic_wg.update_critic(batch) + critic_output = self._update_critic(batch) critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"]) metrics.update(critic_output_metrics) @@ -1240,11 +1455,7 @@ def fit(self): if self.config.trainer.critic_warmup <= self.global_steps: # update actor with marked_timer("update_actor", timing_raw, color="red"): - rollout_config = self.config.actor_rollout_ref.rollout - batch.meta_info["multi_turn"] = rollout_config.multi_turn.enable - # TODO: Make "temperature" single source of truth from generation. 
- batch.meta_info["temperature"] = rollout_config.temperature - actor_output = self.actor_rollout_wg.update_actor(batch) + actor_output = self._update_actor(batch) actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"]) metrics.update(actor_output_metrics) diff --git a/verl/trainer/ppo/reward.py b/verl/trainer/ppo/reward.py index d9e2872c405..892610243f4 100644 --- a/verl/trainer/ppo/reward.py +++ b/verl/trainer/ppo/reward.py @@ -32,12 +32,12 @@ from omegaconf import DictConfig from verl import DataProto - from verl.experimental.reward.reward_loop.base import RewardLoopManagerBase + from verl.experimental.reward.reward_manager.base import RewardLoopManagerBase from verl.trainer.config.config import ModuleConfig, RewardManagerConfig from verl.workers.reward_manager.abstract import AbstractRewardManager, RawRewardFn else: try: - from verl.experimental.reward.reward_loop.base import RewardLoopManagerBase + from verl.experimental.reward.reward_manager.base import RewardLoopManagerBase except ImportError: RewardLoopManagerBase = None # type: ignore[assignment,misc] diff --git a/verl/trainer/ppo/rollout_corr_helper.py b/verl/trainer/ppo/rollout_corr_helper.py index c09d08f4505..dfff5dff08b 100644 --- a/verl/trainer/ppo/rollout_corr_helper.py +++ b/verl/trainer/ppo/rollout_corr_helper.py @@ -913,24 +913,22 @@ def compute_rollout_corr_metrics_from_logprobs( return metrics_with_prefix -def apply_rollout_correction( +def apply_bypass_mode( batch: DataProto, rollout_corr_config: Optional[RolloutCorrectionConfig] = None, policy_loss_config: PolicyLossConfig = None, ) -> None: """ - BYPASS MODE: Use rollout_log_probs as old_log_probs - Skips expensive actor forward pass for old_log_prob computation + Setup bypass mode: Use rollout_log_probs as old_log_probs. - Two sub-modes (controlled by use_policy_gradient): - 1. 
Bypass + PPO loss (use_policy_gradient=False, default): - - Uses standard PPO loss function with old_log_prob=rollout_log_prob - - PPO clips ratio π_θ/π_rollout instead of π_θ/π_old + Bypass mode skips expensive actor forward pass for old_log_prob computation + by setting old_log_probs = rollout_log_probs (2 policies instead of 3). - 2. Bypass + Policy Gradient loss (use_policy_gradient=True): - - Uses compute_policy_loss_with_rollout_correction() - - Policy gradient (REINFORCE-style) with IS/RS correction applied - - No PPO clipping + Uses compute_policy_loss_bypass_mode() which supports: + - loss_type="ppo_clip" (default): PPO clipped objective (IS handled by ratio) + - loss_type="reinforce": REINFORCE with explicit IS weights + + Both loss types benefit from rejection sampling (RS) which masks out-of-distribution samples. Note: The implementation is copied from szrlee . @@ -947,13 +945,7 @@ def apply_rollout_correction( batch.batch["old_log_probs"] = batch.batch["rollout_log_probs"] with open_dict(policy_loss_config): - # Always pass rollout_correction config to actor for metrics computation + # Pass rollout_correction config to actor for loss computation and metrics policy_loss_config["rollout_correction"] = rollout_corr_config - - # Check if policy gradient loss mode is enabled - use_policy_gradient = rollout_corr_config.get("use_policy_gradient", False) - - if use_policy_gradient: - # Policy gradient mode: Configure actor to use rollout_correction loss function - # This will use compute_policy_loss_with_rollout_correction (no PPO clipping) - policy_loss_config["loss_mode"] = "rollout_correction" + # Always use bypass_mode loss function which handles both loss_types + policy_loss_config["loss_mode"] = "bypass_mode" diff --git a/verl/trainer/sft_trainer.py b/verl/trainer/sft_trainer.py index ef02077d7c9..d498e9153af 100644 --- a/verl/trainer/sft_trainer.py +++ b/verl/trainer/sft_trainer.py @@ -140,12 +140,21 @@ def _init_engine(self): def _build_dataset(self): 
config = self.config tokenizer = self.model_config.tokenizer + processor = self.model_config.processor train_dataset = create_sft_dataset( - config.data.train_files, config.data, tokenizer, max_samples=config.data.get("train_max_samples", -1) + config.data.train_files, + config.data, + tokenizer, + processor, + max_samples=config.data.get("train_max_samples", -1), ) if config.data.val_files: val_dataset = create_sft_dataset( - config.data.val_files, config.data, tokenizer, max_samples=config.data.get("val_max_samples", -1) + config.data.val_files, + config.data, + tokenizer, + processor, + max_samples=config.data.get("val_max_samples", -1), ) else: val_dataset = None @@ -178,7 +187,7 @@ def _build_dataloader(self): sampler=self.train_sampler, collate_fn=self.collate_fn, num_workers=8, - pin_memory=True, + pin_memory=False, drop_last=True, pin_memory_device=device_name, ) @@ -193,7 +202,7 @@ def _build_dataloader(self): sampler=self.val_sampler, collate_fn=self.collate_fn, num_workers=8, - pin_memory=True, + pin_memory=False, drop_last=True, pin_memory_device=device_name, ) @@ -367,7 +376,7 @@ def main(config): run_sft(config) -def create_sft_dataset(data_paths, data_config, tokenizer, max_samples=-1): +def create_sft_dataset(data_paths, data_config, tokenizer, processor, max_samples=-1): """Create a dataset.""" # build dataset # First check if a custom dataset class is specified @@ -380,7 +389,9 @@ def create_sft_dataset(data_paths, data_config, tokenizer, max_samples=-1): dataset_cls = MultiTurnSFTDataset # Create datasets based on the selected class - dataset = dataset_cls(parquet_files=data_paths, tokenizer=tokenizer, config=data_config, max_samples=max_samples) + dataset = dataset_cls( + parquet_files=data_paths, tokenizer=tokenizer, config=data_config, processor=processor, max_samples=max_samples + ) return dataset diff --git a/verl/trainer/sft_trainer_ray.py b/verl/trainer/sft_trainer_ray.py index 759514710b6..9178168c32b 100644 --- 
a/verl/trainer/sft_trainer_ray.py +++ b/verl/trainer/sft_trainer_ray.py @@ -119,12 +119,21 @@ def _build_engine(self): def _build_dataset(self): config = self.config tokenizer = self.model_config.tokenizer + processor = self.model_config.processor train_dataset = create_sft_dataset( - config.data.train_files, config.data, tokenizer, max_samples=config.data.get("train_max_samples", -1) + config.data.train_files, + config.data, + tokenizer, + processor=processor, + max_samples=config.data.get("train_max_samples", -1), ) if config.data.val_files: val_dataset = create_sft_dataset( - config.data.val_files, config.data, tokenizer, max_samples=config.data.get("val_max_samples", -1) + config.data.val_files, + config.data, + tokenizer, + processor=processor, + max_samples=config.data.get("val_max_samples", -1), ) else: val_dataset = None @@ -157,7 +166,7 @@ def _build_dataloader(self): sampler=self.train_sampler, collate_fn=self.collate_fn, num_workers=8, - pin_memory=True, + pin_memory=False, drop_last=True, pin_memory_device=device_name, ) @@ -172,7 +181,7 @@ def _build_dataloader(self): sampler=self.val_sampler, collate_fn=self.collate_fn, num_workers=8, - pin_memory=True, + pin_memory=False, drop_last=True, pin_memory_device=device_name, ) @@ -327,7 +336,7 @@ def main(config): run_sft(config) -def create_sft_dataset(data_paths, data_config, tokenizer, max_samples=-1): +def create_sft_dataset(data_paths, data_config, tokenizer, processor, max_samples=-1): """Create a dataset.""" # build dataset # First check if a custom dataset class is specified @@ -340,7 +349,9 @@ def create_sft_dataset(data_paths, data_config, tokenizer, max_samples=-1): dataset_cls = MultiTurnSFTDataset # Create datasets based on the selected class - dataset = dataset_cls(parquet_files=data_paths, tokenizer=tokenizer, config=data_config, max_samples=max_samples) + dataset = dataset_cls( + parquet_files=data_paths, tokenizer=tokenizer, config=data_config, processor=processor, max_samples=max_samples + 
) return dataset diff --git a/verl/utils/attention_utils.py b/verl/utils/attention_utils.py index 8340155e761..ea9884307fc 100644 --- a/verl/utils/attention_utils.py +++ b/verl/utils/attention_utils.py @@ -20,14 +20,14 @@ def _get_attention_functions() -> tuple[Callable, Callable, Callable, Callable]: """Dynamically import attention functions based on available hardware.""" - from verl.utils.device import is_cuda_available, is_npu_available + from verl.utils.device import is_npu_available global _index_first_axis, _pad_input, _rearrange, _unpad_input - if is_cuda_available: - from flash_attn.bert_padding import index_first_axis, pad_input, rearrange, unpad_input - elif is_npu_available: + if is_npu_available: from verl.utils.npu_flash_attn_utils import index_first_axis, pad_input, rearrange, unpad_input + else: + from flash_attn.bert_padding import index_first_axis, pad_input, rearrange, unpad_input _index_first_axis, _pad_input, _rearrange, _unpad_input = index_first_axis, pad_input, rearrange, unpad_input diff --git a/verl/utils/chat_template.py b/verl/utils/chat_template.py index 70b30452c01..6bda790641f 100644 --- a/verl/utils/chat_template.py +++ b/verl/utils/chat_template.py @@ -20,9 +20,23 @@ def initialize_system_prompt(tokenizer, **apply_chat_template_kwargs) -> list[in List of token IDs for the system prompt, or empty list if not supported """ try: - return tokenizer.apply_chat_template( - [{}], add_generation_prompt=False, tokenize=True, **apply_chat_template_kwargs - ) + return tokenizer.apply_chat_template([{}], tokenize=True, **apply_chat_template_kwargs) except TemplateError as e: logger.warning(f"Chat template does not support system prompt: {e}") return [] + + +def extract_system_prompt_and_generation(tokenizer): + token1 = tokenizer.apply_chat_template( + [{"role": "user", "content": ""}], add_generation_prompt=False, tokenize=True + ) + token2 = tokenizer.apply_chat_template( + [{"role": "user", "content": ""}] * 2, add_generation_prompt=False, 
tokenize=True + ) + # get system prompt tokens + system_prompt = token1[: -(len(token2) - len(token1))] + # get generate prompt tokens + token3 = tokenizer.apply_chat_template([{"role": "user", "content": ""}], add_generation_prompt=True, tokenize=True) + generate_prompt = token3[len(token1) :] + + return system_prompt, generate_prompt diff --git a/verl/utils/config.py b/verl/utils/config.py index 14b16538c25..094024a224e 100644 --- a/verl/utils/config.py +++ b/verl/utils/config.py @@ -168,7 +168,11 @@ def check_mutually_exclusive(mbs, mbs_per_gpu, name: str): ) # Check for reward model micro-batch size conflicts - if config.reward_model.enable and not config.reward_model.use_dynamic_bsz: + if ( + config.reward_model.enable + and not config.reward_model.use_dynamic_bsz + and not config.reward_model.use_reward_loop + ): check_mutually_exclusive( config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu, "reward_model" ) diff --git a/verl/utils/dataset/dataset_utils.py b/verl/utils/dataset/dataset_utils.py index 7354a0c896d..03bde7b01d2 100644 --- a/verl/utils/dataset/dataset_utils.py +++ b/verl/utils/dataset/dataset_utils.py @@ -16,6 +16,7 @@ from enum import Enum import torch +from tensordict.tensorclass import NonTensorData class DatasetPadMode(str, Enum): @@ -60,11 +61,15 @@ def collate_variable_batch(self, batch: list[dict[str, any]]) -> dict[str, any]: final_batch = {} - tensor_keys = [key for key in batch[0].keys() if isinstance(batch[0][key], torch.Tensor)] + tensor_keys = set().union(*(d.keys() for d in batch)) # Handle tensor values by creating a NestedTensor. 
for key in tensor_keys: - tensors = [item[key] for item in batch] - final_batch[key] = torch.nested.as_nested_tensor(tensors, layout=torch.jagged) + if isinstance(batch[0][key], torch.Tensor): + tensors = [item[key] for item in batch] + final_batch[key] = torch.nested.as_nested_tensor(tensors, layout=torch.jagged) + else: + tensors = [NonTensorData(item.get(key)) for item in batch] + final_batch[key] = torch.stack(tensors, dim=0) return final_batch diff --git a/verl/utils/dataset/multiturn_sft_dataset.py b/verl/utils/dataset/multiturn_sft_dataset.py index 73450530c7a..0eab4701458 100644 --- a/verl/utils/dataset/multiturn_sft_dataset.py +++ b/verl/utils/dataset/multiturn_sft_dataset.py @@ -17,19 +17,28 @@ """ import logging +import os +import re from typing import Any, Optional import numpy as np import pandas as pd import torch -from omegaconf import ListConfig +import torch.nn.functional as F +from omegaconf import DictConfig, ListConfig from torch.utils.data import Dataset -from transformers import PreTrainedTokenizer +from transformers import PreTrainedTokenizer, ProcessorMixin +from verl.models.transformers.qwen2_vl import get_rope_index from verl.utils import hf_tokenizer +from verl.utils.chat_template import extract_system_prompt_and_generation from verl.utils.dataset.dataset_utils import DatasetPadMode +from verl.utils.dataset.vision_utils import process_image, process_video from verl.utils.fs import copy_local_path_from_hdfs +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + def convert_nested_value_to_list_recursive(data_item): if isinstance(data_item, dict): @@ -47,9 +56,23 @@ def convert_nested_value_to_list_recursive(data_item): class MultiTurnSFTDataset(Dataset): """ Dataset for multi-turn conversations where each assistant response should be trained + + Args: + data_files (str or list): Path(s) to Parquet file(s). + tokenizer (PreTrainedTokenizer): For the tokenization of text to token IDs. 
+ config (DictConfig): Options like cache_dir, prompt_key, max_prompt_length, truncation, etc. + processor (ProcessorMixin, optional): Multimodal preprocessor for images/videos. + max_samples (int, optional): Limit the number of samples. Defaults to -1 (use all). """ - def __init__(self, parquet_files: str | list[str], tokenizer, config=None, max_samples: int = -1): + def __init__( + self, + parquet_files: str | list[str], + tokenizer: PreTrainedTokenizer, + config: DictConfig, + processor: Optional[ProcessorMixin] = None, + max_samples: int = -1, + ): # Set defaults and extract parameters from config if provided config = config or {} self.pad_mode = config.get("pad_mode", "right") @@ -60,14 +83,19 @@ def __init__(self, parquet_files: str | list[str], tokenizer, config=None, max_s # for right padding self.max_length = config.get("max_length", 1024) # Get messages_key from the new multiturn config structure - multiturn_config = config.get("multiturn", {}) - self.messages_key = multiturn_config.get("messages_key", "messages") - self.tools_key = multiturn_config.get("tools_key", "tools") - self.enable_thinking_key = multiturn_config.get("enable_thinking_key", "enable_thinking") + self.messages_key = config.get("messages_key", "messages") + self.image_key = config.get("image_key", "images") + self.video_key = config.get("video_key", "videos") + self.image_patch_size = config.get( + "image_patch_size", processor.image_processor.patch_size if processor else None + ) + self.tools_key = config.get("tools_key", "tools") + self.enable_thinking_key = config.get("enable_thinking_key", "enable_thinking") self.apply_chat_template_kwargs = config.get("apply_chat_template_kwargs", {}) self.shuffle = config.get("shuffle", False) self.seed = config.get("seed") self.max_samples = max_samples + self.ignore_input_ids_mismatch = config.get("ignore_input_ids_mismatch", False) assert self.truncation in ["error", "left", "right"] if not isinstance(parquet_files, list | ListConfig): @@ 
-77,6 +105,7 @@ def __init__(self, parquet_files: str | list[str], tokenizer, config=None, max_s if isinstance(tokenizer, str): tokenizer = hf_tokenizer(tokenizer) self.tokenizer: PreTrainedTokenizer = tokenizer + self.processor = processor self._download() self._read_files_and_process() @@ -127,215 +156,165 @@ def series_to_item(ls): else: self.enable_thinking = None + # system prompt: <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n + # generation prompt: <|im_start|>assistant\n + self.system_prompt, self.generation_prompt = extract_system_prompt_and_generation(self.tokenizer) + def __len__(self): return len(self.messages) - def _process_message_tokens( + def _process_single_message( self, - messages: list[dict[str, Any]], - start_idx: int, - end_idx: int, - is_assistant: bool = False, - enable_thinking: Optional[bool] = None, + index: int, + message: dict[str, Any], tools: Optional[list[dict[str, Any]]] = None, + enable_thinking: Optional[bool] = None, ) -> tuple[list[int], list[int], list[int]]: """ - Process tokens for a single message or a group of messages. + Process a single message and return its tokenized representation. 
Args: - messages: List of message dictionaries - start_idx: Start index in messages list - end_idx: End index in messages list - is_assistant: Whether this is an assistant message + index: turn index in the conversation + message: A single message dictionary + images: List of images to be used + videos: List of videos to be used + tools: List of tools to be used enable_thinking: Whether to enable thinking mode Returns: - Tuple of (tokens, loss_mask, attention_mask) + Tuple of (input_ids, loss_mask, attention_mask, dict[str, torch.Tensor]) """ - if start_idx > 0: - prev_applied_text = self.tokenizer.apply_chat_template( - messages[:start_idx], - tokenize=False, - add_generation_prompt=False, - enable_thinking=enable_thinking, - tools=tools, - **self.apply_chat_template_kwargs, - ) - if is_assistant: - prev_applied_text_w_generation_prompt = self.tokenizer.apply_chat_template( - messages[:start_idx], - tokenize=False, - add_generation_prompt=True, - enable_thinking=enable_thinking, - tools=tools, - **self.apply_chat_template_kwargs, - ) - - else: - prev_applied_text = "" + processor = self.processor if self.processor is not None else self.tokenizer + apply_chat_template_kwargs = {**self.apply_chat_template_kwargs} + if enable_thinking is not None: + apply_chat_template_kwargs["enable_thinking"] = enable_thinking - cur_applied_text = self.tokenizer.apply_chat_template( - messages[:end_idx], - tokenize=False, - add_generation_prompt=False, - enable_thinking=enable_thinking, + inputs = processor.apply_chat_template( + [message], tools=tools, - **self.apply_chat_template_kwargs, + add_generation_prompt=False, + tokenize=True, + return_dict=True, + return_tensors="pt", + **apply_chat_template_kwargs, ) - # Get tokens for the current message only - if is_assistant: - generation_prompt_text = prev_applied_text_w_generation_prompt[len(prev_applied_text) :] - generation_prompt_tokens = self.tokenizer.encode( - generation_prompt_text, - add_special_tokens=False, - ) - 
_message_tokens = self.tokenizer.encode( - cur_applied_text[len(prev_applied_text_w_generation_prompt) :], - add_special_tokens=False, - ) - message_tokens = generation_prompt_tokens + _message_tokens - loss_mask = [0] * (len(generation_prompt_tokens)) + [1] * ( - len(message_tokens) - len(generation_prompt_tokens) - ) - else: - message_tokens = self.tokenizer.encode( - cur_applied_text[len(prev_applied_text) :], - add_special_tokens=False, - ) - loss_mask = [0] * len(message_tokens) - attention_mask = [1] * len(message_tokens) + inputs = dict(inputs) + input_ids = inputs.pop("input_ids")[0] + attention_mask = inputs.pop("attention_mask")[0] - return message_tokens, loss_mask, attention_mask + # remove system prompt if exists + if index != 0 and message["role"] != "system": + input_ids = input_ids[len(self.system_prompt) :] + attention_mask = attention_mask[len(self.system_prompt) :] - def _validate_and_convert_tokens( - self, - full_tokens: torch.Tensor, - concat_tokens: list[int], - concat_loss_mask: list[int], - concat_attention_mask: list[int], - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """ - Validate tokenization and convert to tensors. + if message["role"] == "assistant": + loss_mask = torch.ones_like(attention_mask) + # mask out generation prompt if assistant message + loss_mask[: len(self.generation_prompt)] = 0 + else: + loss_mask = torch.zeros_like(attention_mask) + + return input_ids, loss_mask, attention_mask, inputs + + def _build_messages(self, example: dict): + """Replace and