diff --git a/.github/workflows/.deprecate/e2e_ppo_trainer.yml b/.github/workflows/.deprecate/e2e_ppo_trainer.yml index 00ecd79152b..de845d9f2e1 100644 --- a/.github/workflows/.deprecate/e2e_ppo_trainer.yml +++ b/.github/workflows/.deprecate/e2e_ppo_trainer.yml @@ -77,7 +77,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -110,7 +110,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml b/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml index 177af9ec8b9..0e5f7a487a1 100644 --- a/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml +++ b/.github/workflows/.deprecate/e2e_ppo_trainer_megatron_sglang.yml @@ -75,7 +75,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: diff --git a/.github/workflows/checkpoint_converter.yml b/.github/workflows/checkpoint_converter.yml index 90ad640dff4..577cc88e82f 100644 --- a/.github/workflows/checkpoint_converter.yml +++ b/.github/workflows/checkpoint_converter.yml @@ -81,7 +81,7 @@ jobs: NO_PROXY: "localhost,127.0.0.1" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -116,7 +116,7 @@ jobs: HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable HF_ENDPOINT: "https://hf-mirror.com" container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/cpu_unit_tests.yml b/.github/workflows/cpu_unit_tests.yml index 698816ce11e..95cbc8be370 100644 --- a/.github/workflows/cpu_unit_tests.yml +++ b/.github/workflows/cpu_unit_tests.yml @@ -68,7 +68,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: diff --git a/.github/workflows/e2e_dapo.yml b/.github/workflows/e2e_dapo.yml index 60119e8436f..b4e28286069 100644 --- a/.github/workflows/e2e_dapo.yml +++ 
b/.github/workflows/e2e_dapo.yml @@ -94,7 +94,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/e2e_eval_aime24.yml b/.github/workflows/e2e_eval_aime24.yml index f5718603b9e..b0da8f2acc2 100644 --- a/.github/workflows/e2e_eval_aime24.yml +++ b/.github/workflows/e2e_eval_aime24.yml @@ -88,7 +88,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: diff --git a/.github/workflows/e2e_genrm_remote.yml b/.github/workflows/e2e_genrm_remote.yml index 8c7bc690718..6574ef3b61e 100644 --- a/.github/workflows/e2e_genrm_remote.yml +++ b/.github/workflows/e2e_genrm_remote.yml @@ -87,7 +87,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/e2e_ppo_trainer.yml b/.github/workflows/e2e_ppo_trainer.yml index f27da026aaf..27fa3ba5448 100644 --- a/.github/workflows/e2e_ppo_trainer.yml +++ b/.github/workflows/e2e_ppo_trainer.yml @@ -87,7 +87,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -229,7 +229,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 options: --gpus all --shm-size=50g # Visual dataloader requires large memory steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -238,11 +238,10 @@ jobs: - name: Install the current repository run: | pip3 install --no-deps -e .[test,gpu,vllm,geo,trl] - pip install "transformers[hf_xet]<4.53.0" # Fix for transformers 4.53.0 + pip install "transformers[hf_xet]==4.54.0" # Geo3k - name: Prepare GEO3K dataset run: | - ray stop --force python3 examples/data_preprocess/geo3k.py - name: Running GEO3K VLM GRPO E2E training tests on 8 L20 GPUs with rmpad using function rm run: | @@ -285,7 +284,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -318,7 +317,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" 
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=50g # Visual dataloader requires large memory steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -326,7 +325,8 @@ jobs: fetch-depth: 0 - name: Install the current repository run: | - pip3 install -e .[test,geo,gpu,sglang] --no-deps && pip install transformers==4.52.3 + pip3 install -e .[test,geo,gpu,sglang] --no-deps + pip install "transformers[hf_xet]==4.54.0" # Geo3k - name: Prepare GEO3K dataset run: | diff --git a/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml b/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml index 4e635b3351b..f37866274a4 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml @@ -86,7 +86,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: diff --git a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml index ae12c110693..3fa0e51a2d9 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml @@ -85,7 +85,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: @@ -348,7 +348,6 @@ jobs: - name: Install the current repository run: | pip3 install --no-deps -e .[test] - pip3 install "transformers[hf_xet]<4.52.0" - name: Prepare Geo3k dataset run: | python3 examples/data_preprocess/geo3k.py diff --git a/.github/workflows/e2e_spin.yml b/.github/workflows/e2e_spin.yml index cb56fbeac7a..ad6a2bfd748 100644 --- a/.github/workflows/e2e_spin.yml +++ b/.github/workflows/e2e_spin.yml @@ -68,7 +68,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/e2e_sppo.yml b/.github/workflows/e2e_sppo.yml index cf85c296012..15edc4f9d00 100644 --- a/.github/workflows/e2e_sppo.yml +++ b/.github/workflows/e2e_sppo.yml @@ -66,7 +66,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/gpu_unit_tests.yml b/.github/workflows/gpu_unit_tests.yml index d86e7e64d86..25018594d48 100644 --- 
a/.github/workflows/gpu_unit_tests.yml +++ b/.github/workflows/gpu_unit_tests.yml @@ -80,7 +80,7 @@ jobs: NO_PROXY: "localhost,127.0.0.1" HF_HUB_ENABLE_HF_TRANSFER: 1 container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml index d484c2b9d51..280781f8c7c 100644 --- a/.github/workflows/model.yml +++ b/.github/workflows/model.yml @@ -73,7 +73,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.4-sglang0.4.6.post5-vllm0.8.5-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -82,7 +82,7 @@ jobs: - name: Install the current repository and upgrade to latest transformers(4.54.0)/flash_attn, transformers 4.55.0 has strange behavior with model backward run: | pip3 install --no-deps -e .[test] - pip3 install --upgrade transformers==4.54.0 + pip3 install --upgrade transformers - name: Running rmpad model tests on 8 L20 GPUs + flash_attn 2.5.8 run: | pytest -s tests/models/test_transformer.py @@ -95,6 +95,10 @@ jobs: - name: Running transformers ulysses tests on 8 L20 GPUs + latest transformers run: | torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py + - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.54.1 + run: | + pip3 install transformers==4.54.1 + torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.53.2 run: | pip3 install transformers==4.53.2 @@ -119,7 +123,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4 + image: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/sgl.yml b/.github/workflows/sgl.yml index 5795f9c3e7c..ea99102fbac 100644 --- a/.github/workflows/sgl.yml +++ b/.github/workflows/sgl.yml @@ -90,7 +90,7 @@ jobs: NCCL_SHM_DISABLE: "1" NCCL_P2P_DISABLE: "1" container: - image: verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index 2998c08f09f..181eb9be74e 100644 --- a/.github/workflows/vllm.yml +++ b/.github/workflows/vllm.yml @@ -84,7 +84,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2 + image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/docker/README.md b/docker/README.md index 787843ec375..d988b0a2b2d 100644 --- a/docker/README.md +++ 
b/docker/README.md @@ -14,9 +14,7 @@ The first two types of images are hosted on dockerhub [verlai/verl](https://hub. ## Base Image -The stable base image is ``verlai/verl:base-verl0.4-cu124-cudnn9.8-torch2.6-fa2.7.4``. The installed package versions can be found from tags, and the Dockerfile can be found in ``verl[version]-[packages]/Dockerfile.base``. - -The base images for preview are ``verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0`` and ``verlai/verl:base-verl0.5-preview-cu128-cudnn9.8-torch2.7.1-fa2.8.0`` with different CUDA versions. +The stable base image is ``verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4`` with different CUDA versions. The update of base image is not frequent, and the app image can be built on top of it without reinstalling base packages. @@ -25,8 +23,8 @@ The update of base image is not frequent, and the app image can be built on top From this version, we divide images built for vLLM and SGLang as the divergence of dependent packages like FlashInfer. There are 2 types of application images available: -- **vLLM with FSDP and Megatron**: ``verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2`` -- **SGLang with FSDP and Megatron**: ``verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2`` +- **vLLM with FSDP and Megatron**: ``verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2`` +- **SGLang with FSDP and Megatron**: `verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2` Docker images with Megatron backends are runnable with large language model like ``Qwen/Qwen3-235B-A22B``, ``deepseek-ai/DeepSeek-V3-0324`` post-training. Refer to the :doc:`Large Language Model Post-Training documentation<../perf/dpsk>` for more details. @@ -54,7 +52,7 @@ docker start verl docker exec -it verl bash ``` -2. If you use the images provided, you only need to install verl itself without dependencies: +2. 
If you use the images provided, you only need to install verl itself without dependencies: ```sh # install the nightly version (recommended) diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.8.mcore0.12 b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.8.mcore0.12 deleted file mode 100644 index 0ac7904b7c6..00000000000 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.8.mcore0.12 +++ /dev/null @@ -1,39 +0,0 @@ -# Start from the verl base image -# Dockerfile.base -FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4 - -# Define environments -ENV MAX_JOBS=8 -ENV VLLM_WORKER_MULTIPROC_METHOD=spawn -ENV DEBIAN_FRONTEND=noninteractive -ENV NODE_OPTIONS="" -ENV PIP_ROOT_USER_ACTION=ignore -ENV HF_HUB_ENABLE_HF_TRANSFER="1" - -# Install sglang-0.4.8 and torch-memory-saver -# Install FlashInfer Python package -RUN pip install --upgrade pip setuptools packaging -RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.6.post1 -RUN pip install --resume-retries 999 --no-cache-dir "sglang[all]==0.4.8" && pip install torch-memory-saver --no-cache-dir - -# Fix packages -RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.52.3" accelerate datasets peft hf-transfer \ - "numpy<2.0.0" "pyarrow>=19.0.1" pandas \ - ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ - pytest py-spy pyext pre-commit ruff - -RUN pip uninstall -y pynvml nvidia-ml-py && \ - pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1" - -RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87 - -# Install TransformerEngine -RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1 - -# Install Megatron-LM -RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2 - -# Install mbridge -RUN pip3 install --no-cache-dir mbridge - -RUN pip3 install --no-deps --no-cache-dir --no-build-isolation --resume-retries 999 vllm==0.9.2 \ No newline at end of file diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.mcore0.12 b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.mcore0.12 deleted file mode 100644 index 3f7bff3de1f..00000000000 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.mcore0.12 +++ /dev/null @@ -1,37 +0,0 @@ -# Start from the verl base image -# Dockerfile.base -FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4 - -# Define environments -ENV MAX_JOBS=8 -ENV VLLM_WORKER_MULTIPROC_METHOD=spawn -ENV DEBIAN_FRONTEND=noninteractive -ENV NODE_OPTIONS="" -ENV PIP_ROOT_USER_ACTION=ignore -ENV HF_HUB_ENABLE_HF_TRANSFER="1" - -# Install sglang-0.4.8 and torch-memory-saver -# Install FlashInfer Python package -RUN pip install --upgrade pip setuptools packaging -RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashinfer-python==0.2.9rc1 -RUN pip install --resume-retries 999 --no-cache-dir "sglang[all]==0.4.9.post4" && pip install torch-memory-saver --no-cache-dir - -# Fix packages -RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]==4.53.2" accelerate datasets peft hf-transfer \ - "numpy<2.0.0" "pyarrow>=19.0.1" 
pandas \ - ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ - pytest py-spy pyext pre-commit ruff - -RUN pip uninstall -y pynvml nvidia-ml-py && \ - pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1" - -RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87 - -# Install TransformerEngine -RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1 - -# Install Megatron-LM -RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2 - -# Install mbridge -RUN pip3 install --no-cache-dir mbridge \ No newline at end of file diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.12 b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.13 similarity index 93% rename from docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.12 rename to docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.13 index 292363f9056..d79201a92ee 100644 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.12 +++ b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.sglang0.4.9.post6.mcore0.13 @@ -17,7 +17,7 @@ RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation flashin RUN pip install --resume-retries 999 --no-cache-dir --no-build-isolation "sglang[all]==0.4.9.post6" # Fix packages -RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]==4.54.0" accelerate datasets peft hf-transfer \ +RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]==4.55.4" accelerate datasets peft hf-transfer \ "numpy<2.0.0" "pyarrow>=19.0.1" pandas \ ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ pytest py-spy pyext pre-commit ruff @@ -31,7 +31,7 @@ RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87 RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1 # Install Megatron-LM -RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2 +RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.13.0 # Install mbridge RUN pip3 install --no-cache-dir mbridge \ No newline at end of file diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.12 b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.13 similarity index 79% rename from docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.12 rename to docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.13 index 9746301b471..9d73e0ffeeb 100644 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.12 +++ b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.app.vllm.mcore0.13 @@ -1,6 +1,6 @@ # Start from the verl base image # Dockerfile.base -FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4 +FROM verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4 # Define environments ENV MAX_JOBS=32 @@ 
-10,11 +10,12 @@ ENV NODE_OPTIONS="" ENV PIP_ROOT_USER_ACTION=ignore ENV HF_HUB_ENABLE_HF_TRANSFER="1" -# Install torch-2.7.0+cu126 + vllm-0.9.1 -RUN pip install --resume-retries 999 --no-cache-dir vllm==0.9.1 +# Install torch-2.7.1+cu126 + vllm-0.10.0 +RUN pip install --resume-retries 999 --no-cache-dir vllm==0.10.0 # Fix packages -RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \ +# transformers 4.54.0 still not support +RUN pip install --no-cache-dir "tensordict==0.6.2" "transformers[hf_xet]>=4.55.4" accelerate datasets peft hf-transfer \ "numpy<2.0.0" "pyarrow>=19.0.1" pandas \ ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ pytest py-spy pyext pre-commit ruff @@ -28,7 +29,7 @@ RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87 RUN export NVTE_FRAMEWORK=pytorch && pip3 install --resume-retries 999 --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@v2.2.1 # Install Megatron-LM -RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.2 +RUN pip3 install --no-deps --no-cache-dir --no-build-isolation git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.13.0 # Install mbridge RUN pip3 install --no-cache-dir mbridge diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.base.torch2.7.0 b/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.base.torch2.7.0 deleted file mode 100644 index 30251f578e9..00000000000 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/Dockerfile.base.torch2.7.0 +++ /dev/null @@ -1,133 +0,0 @@ -# Base Docker Image of verl, with CUDA/Torch/FlashAttn/Apex/TransformerEngine, without other frameworks -# Target: verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.8.0-fi0.2.6 -# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10) -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html -FROM nvcr.io/nvidia/pytorch:24.08-py3 - -# Define environments -ENV MAX_JOBS=16 -ENV VLLM_WORKER_MULTIPROC_METHOD=spawn -ENV DEBIAN_FRONTEND=noninteractive -ENV NODE_OPTIONS="" -ENV PIP_ROOT_USER_ACTION=ignore -ENV HF_HUB_ENABLE_HF_TRANSFER="1" - -# Define installation arguments -ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ -ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - -# Set apt source -RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \ - { \ - echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \ - echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \ - echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \ - echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \ - } > /etc/apt/sources.list - -# Install systemctl -RUN apt-get update && \ - apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \ - apt-get clean - -# Install tini -RUN apt-get update && \ - apt-get install -y tini aria2 libfreeimage3 libfreeimage-dev zlib1g htop && \ - apt-get clean - -# Change pip source -RUN pip config set global.index-url "${PIP_INDEX}" && \ - pip config set global.extra-index-url "${PIP_INDEX}" && \ - python -m pip install --upgrade pip - -# Uninstall nv-pytorch fork -RUN pip uninstall -y torch torchvision torchaudio \ - pytorch-quantization pytorch-triton torch-tensorrt \ - xgboost transformer_engine flash_attn apex 
megatron-core grpcio - -RUN pip install --resume-retries 999 --no-cache-dir torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 - -# Install flash-attn-2.7.4.post1, although built with torch2.6, it is compatible with torch2.7 -# https://github.com/Dao-AILab/flash-attention/issues/1644#issuecomment-2899396361 -RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \ - URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \ - FILE="flash_attn-2.7.4.post1+cu12torch2.6cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \ - wget -nv "${URL}" && \ - pip install --no-cache-dir "${FILE}" - -# Fix packages -RUN pip uninstall -y pynvml nvidia-ml-py && \ - pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1" - -# Install cudnn -RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \ - dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \ - cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \ - apt-get update && \ - apt-get -y install cudnn-cuda-12 && \ - rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb - -# Install Apex -RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" --resume-retries 999 git+https://github.com/NVIDIA/apex.git - -# Profiling tools -RUN aria2c --always-resume=true --max-tries=99999 https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \ - apt-get update && apt-get install -y libxcb-cursor0 - -RUN apt-get install -y ./nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb && \ - rm -rf /usr/local/cuda/bin/nsys && \ - ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys /usr/local/cuda/bin/nsys && \ - rm -rf /usr/local/cuda/bin/nsys-ui && \ - ln -s /opt/nvidia/nsight-systems/2025.3.1/target-linux-x64/nsys-ui /usr/local/cuda/bin/nsys-ui && \ - rm nsight-systems-2025.3.1_2025.3.1.90-1_amd64.deb - -RUN pip install --resume-retries 999 --no-cache-dir "tensordict==0.6.2" torchdata "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \ - "numpy<2.0.0" "pyarrow>=19.0.1" pandas cuda-bindings \ - ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile xgrammar \ - pytest py-spy pyext pre-commit ruff - -# Install DeepEP -## the dependency of IBGDA -RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so - -## Clone and build deepep and deepep-nvshmem -RUN git clone -b v2.3.1 https://github.com/NVIDIA/gdrcopy.git && \ - git clone https://github.com/deepseek-ai/DeepEP.git && \ - cd DeepEP && git checkout a84a248 - -# Prepare nvshmem -RUN wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && \ - tar -xvf nvshmem_src_3.2.5-1.txz && mv nvshmem_src deepep-nvshmem && \ - cd deepep-nvshmem && git apply ../DeepEP/third-party/nvshmem.patch - -ENV CUDA_HOME=/usr/local/cuda -### Set MPI environment variables. Having errors when not set. 
-ENV CPATH=/usr/local/mpi/include:$CPATH -ENV LD_LIBRARY_PATH=/usr/local/mpi/lib:$LD_LIBRARY_PATH -ENV LD_LIBRARY_PATH=/usr/local/x86_64-linux-gnu:$LD_LIBRARY_PATH -ENV GDRCOPY_HOME=/workspace/gdrcopy - -## Build deepep-nvshmem -RUN cd deepep-nvshmem && \ - NVSHMEM_SHMEM_SUPPORT=0 \ - NVSHMEM_UCX_SUPPORT=0 \ - NVSHMEM_USE_NCCL=0 \ - NVSHMEM_MPI_SUPPORT=0 \ - NVSHMEM_IBGDA_SUPPORT=1 \ - NVSHMEM_PMIX_SUPPORT=0 \ - NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ - NVSHMEM_USE_GDRCOPY=1 \ - cmake -G Ninja -S . -B build/ -DCMAKE_INSTALL_PREFIX=/workspace/deepep-nvshmem/install && cmake --build build/ --target install - -ENV NVSHMEM_DIR=/workspace/deepep-nvshmem/install -ENV LD_LIBRARY_PATH=$NVSHMEM_DIR/lib:$LD_LIBRARY_PATH -ENV PATH=$NVSHMEM_DIR/bin:$PATH - -## Build deepep -RUN cd DeepEP && \ - python setup.py install - -# Reset pip config -RUN pip config unset global.index-url && \ - pip config unset global.extra-index-url - diff --git a/docker/verl0.5-cu126-torch2.7-fa2.7.4/README.md b/docker/verl0.5-cu126-torch2.7-fa2.7.4/README.md index 2db8c58d51d..023e0eec0fa 100644 --- a/docker/verl0.5-cu126-torch2.7-fa2.7.4/README.md +++ b/docker/verl0.5-cu126-torch2.7-fa2.7.4/README.md @@ -20,9 +20,7 @@ megatron.core==core_r0.13.0 ## Target - Base image: - - `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4`: We offer a base image with deep ep built in, for vllm - - `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4`: We offer a base image with deep ep built in, for sglang + - `verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.1-fa2.7.4`: We offer a base image with deep ep built in, for vllm/sglang - App image: - - `verlai/verl:app-verl0.5-vllm0.9.1-mcore0.12.2-te2.2` - - `verlai/verl:app-verl0.5-sglang0.4.8-mcore0.12.2-te2.2` - - `verlai/verl:app-verl0.5-sglang0.4.9.post6-mcore0.12.2-te2.2` \ No newline at end of file + - `verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2` + - `verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.9.post6-mcore0.13.0-te2.2` diff --git a/docs/start/install.rst b/docs/start/install.rst index a384a4dc3cf..8f015b204e1 100644 --- a/docs/start/install.rst +++ b/docs/start/install.rst @@ -52,7 +52,7 @@ The first two types of images are hosted on dockerhub `verlai/verl ` for more details. 
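The workflow, Dockerfile, and documentation hunks above all retag CI and docs to the new app images, which pin transformers 4.55.4, vLLM 0.10.0, and Megatron core 0.13.0. A quick way to confirm that a pulled image actually matches its tag is a version probe like the hedged sketch below (not part of this change; the distribution names and the exact Megatron version string are assumptions):

```python
# Sanity-check the pinned stack inside an app image such as
# verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2.
# Assumption: megatron-core installed from core_v0.13.0 reports a 0.13.* version.
from importlib.metadata import version

expected = {"transformers": "4.55.4", "vllm": "0.10.0", "megatron-core": "0.13"}
for pkg, want in expected.items():
    got = version(pkg)
    print(f"{pkg}: {got} (tag expects {want})")
    assert got.startswith(want), f"{pkg} {got} does not match the image tag"
```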
diff --git a/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh b/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh index b876b19ba57..2f5a93e4466 100644 --- a/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh +++ b/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh @@ -6,7 +6,7 @@ set -x # huggingface-cli download deepseek-ai/DeepSeek-V3-0324 # no offline dist checkpoint needed, now with mbridge>=0.13.0, we can directly init model from huggingface downloaded fp8 weights -# tested on docker://verlai/verl:app-verl0.5-vllm0.10.0-mcore0.13.0-te2.2 +# tested on docker://verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2 LLM="" diff --git a/tests/utils/dataset/test_rl_dataset_on_cpu.py b/tests/utils/dataset/test_rl_dataset_on_cpu.py index 2afc3ef49f6..391e89a94d5 100644 --- a/tests/utils/dataset/test_rl_dataset_on_cpu.py +++ b/tests/utils/dataset/test_rl_dataset_on_cpu.py @@ -77,7 +77,7 @@ def test_image_rl_data(): "prompt_key": "prompt", "max_prompt_length": 1024, "filter_overlong_prompts": True, - "filter_overlong_prompts_workers": 2, + "filter_overlong_prompts_workers": 1, } ) dataset = RLHFDataset( diff --git a/tests/utils/test_activation_offload.py b/tests/utils/test_activation_offload.py index 2393d7962ae..25bc23c40ac 100644 --- a/tests/utils/test_activation_offload.py +++ b/tests/utils/test_activation_offload.py @@ -29,6 +29,23 @@ from verl.utils.fsdp_utils import MixedPrecisionPolicy, apply_fsdp2, get_fsdp_wrap_policy +def create_random_input_ids(batch_size, seq_len, vocab_size): + from flash_attn.bert_padding import unpad_input + + from verl.utils.model import compute_position_id_with_mask, create_random_mask + + input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda") + + attention_mask = create_random_mask( + input_ids, max_ratio_of_left_padding=0.1, min_ratio_of_valid_token=0.5, max_ratio_of_valid_token=0.7 + ) + position_ids = compute_position_id_with_mask(attention_mask) + + input_ids = unpad_input(input_ids.unsqueeze(-1), attention_mask)[0].transpose(0, 1) + position_ids = unpad_input(position_ids.unsqueeze(-1), attention_mask)[0].transpose(0, 1) + return input_ids, position_ids + + def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy="fsdp"): torch.cuda.set_device(rank) torch.distributed.init_process_group( @@ -85,15 +102,13 @@ def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy seq_len = 32 vocab_size = 32000 # First input for initial update - input_ids1 = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda") - attention_mask1 = torch.ones_like(input_ids1) + input_ids1, position_ids1 = create_random_input_ids(batch_size, seq_len, vocab_size) # Second input for verification - input_ids2 = torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda") - attention_mask2 = torch.ones_like(input_ids2) + input_ids2, position_ids2 = create_random_input_ids(batch_size, seq_len, vocab_size) # Step 1: Initial update and save checkpoint - outputs1 = model(input_ids=input_ids1, attention_mask=attention_mask1) + outputs1 = model(input_ids=input_ids1, position_ids=position_ids1) loss1 = outputs1.logits.mean() loss1.backward() optimizer.step() @@ -106,7 +121,7 @@ def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy checkpoint_manager.save_checkpoint(local_path=checkpoint_path, hdfs_path=None, global_step=0) # Step 2: Second update and forward pass - outputs2 = model(input_ids=input_ids2, 
attention_mask=attention_mask2) + outputs2 = model(input_ids=input_ids2, position_ids=position_ids2) loss2 = outputs2.logits.mean() loss2.backward() optimizer.step() @@ -115,14 +130,14 @@ def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy # Record logits after second update with torch.no_grad(): - logits_without_offloading = model(input_ids=input_ids2, attention_mask=attention_mask2).logits + logits_without_offloading = model(input_ids=input_ids2, position_ids=position_ids2).logits # Step 3: wrap module with activation offloading and load checkpoint - enable_activation_offloading(model, "fsdp") + enable_activation_offloading(model, strategy=strategy) checkpoint_manager.load_checkpoint(checkpoint_path) # Step 4: Repeat the second update with same input - outputs3 = model(input_ids=input_ids2, attention_mask=attention_mask2) + outputs3 = model(input_ids=input_ids2, position_ids=position_ids2) loss3 = outputs3.logits.mean() loss3.backward() optimizer.step() @@ -131,7 +146,7 @@ def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy # Record logits after loaded checkpoint and update with torch.no_grad(): - logits_with_offloading = model(input_ids=input_ids2, attention_mask=attention_mask2).logits + logits_with_offloading = model(input_ids=input_ids2, position_ids=position_ids2).logits # Step 4: Verify outputs match torch.testing.assert_close(logits_without_offloading, logits_with_offloading, atol=0.0, rtol=0.0) diff --git a/verl/models/mcore/qwen2_5_vl/attention.py b/verl/models/mcore/qwen2_5_vl/attention.py index 91a27cc3edf..84e7ba8eda2 100644 --- a/verl/models/mcore/qwen2_5_vl/attention.py +++ b/verl/models/mcore/qwen2_5_vl/attention.py @@ -118,7 +118,8 @@ def forward( output, bias = self.linear_proj(context_layer) return output, bias - query, key, value, rotary_pos_emb, attn_mask_type = self._adjust_key_value_for_inference( + # Use latest mcore 0.13 API and forward-compatible with previous versions. + outputs = self._adjust_key_value_for_inference( inference_context, query, key, @@ -129,6 +130,8 @@ def forward( sequence_len_offset, ) + query, key, value, rotary_pos_emb, attn_mask_type = outputs[:5] + if packed_seq_params is not None: query = query.squeeze(1) key = key.squeeze(1) diff --git a/verl/models/transformers/monkey_patch.py b/verl/models/transformers/monkey_patch.py index 7f0e10ab65e..d8d67d5ebaa 100644 --- a/verl/models/transformers/monkey_patch.py +++ b/verl/models/transformers/monkey_patch.py @@ -110,6 +110,65 @@ def _ulysses_flash_attention_forward( return attn_output +def _ulysses_flash_attention_forward_transformers_4_55( + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: Optional[torch.Tensor], + query_length: int, + *args, + position_ids: Optional[torch.Tensor] = None, + **kwargs, +): + """For transformers>=4.55, the flash attention api has changed, + we need to pass the query_length after doing ulysses alltoall. + + See https://github.com/huggingface/transformers/issues/40399 + """ + ulysses_sp_size = get_ulysses_sequence_parallel_world_size() + + ########## AlltoAll for Ulysses ########## + if ulysses_sp_size > 1: + assert position_ids is not None, "position_ids is required for Ulysses sequence parallelism" + + # NOTE: repeat kv heads to be divided by sequence parallel. Instead of repeating nheads_q//nheads_k, + # we choose to repeat sp_size//nheads_k, since flash_attention supports MQA/GQA. 
+ # For example: + # - nheads_k=4, sp=8, repeats=2 + # - nheads_k=8, sp=8, repeats=1 + # - nheads_k=16, sp=8, repeats=1 + repeats = max(ulysses_sp_size // key_states.size(2), 1) + key_states = repeat_kv(key_states, repeats) + value_states = repeat_kv(value_states, repeats) + + # (bsz, seq_len/n, n_head, head_dim) -> (bsz, seq_len, n_head/n, head_dim) + query_states = gather_seq_scatter_heads(query_states, seq_dim=1, head_dim=2) + key_states = gather_seq_scatter_heads(key_states, seq_dim=1, head_dim=2) + value_states = gather_seq_scatter_heads(value_states, seq_dim=1, head_dim=2) + + # TODO: all_gather position_ids because `prepare_fa2_from_position_ids` needs it, we can eliminate + # this all_gather by passing cu_seq_lens_q, cu_seq_lens_k, max_length_k, max_length_q explicitly. + # https://github.com/huggingface/transformers/pull/33932 + + # (bsz, seq_len/n) -> (bsz, seq_len) + position_ids_list = [torch.empty_like(position_ids) for _ in range(ulysses_sp_size)] + torch.distributed.all_gather(position_ids_list, position_ids, group=get_ulysses_sequence_parallel_group()) + position_ids = torch.concat(position_ids_list, dim=-1) + + # (bsz, seq_len, n_head/n, head_dim) + query_length = query_states.size(1) + attn_output = _flash_attention_forward( + query_states, key_states, value_states, attention_mask, query_length, *args, position_ids=position_ids, **kwargs + ) + + ########## AlltoAll for Ulysses ########## + if ulysses_sp_size > 1: + # (bsz, seq_len, n_head/n, head_dim) -> (bsz, seq_len/n, n_head, head_dim) + attn_output = gather_heads_scatter_seq(attn_output, seq_dim=1, head_dim=2) + + return attn_output + + def patch_vlm_for_ulysses_input_slicing(model_class: type): """ Applies a monkey patch to the forward method of a given model class @@ -304,11 +363,17 @@ def state_dict(self, *args, **kwargs): module._flash_attention_forward = _ulysses_flash_attention_forward print(f"Monkey patch _flash_attention_forward in {model.__module__}") else: - # transformers>=4.48.0 - from transformers.integrations import flash_attention + if is_transformers_version_in_range(min_version="4.55.0"): + from transformers.integrations import flash_attention + + flash_attention._flash_attention_forward = _ulysses_flash_attention_forward_transformers_4_55 + print(f"Monkey patch _flash_attention_forward in {model.__module__} for new api") + else: + # 4.48.0 <= transformers <= 4.54.1, Vision attention + from transformers.integrations import flash_attention - flash_attention._flash_attention_forward = _ulysses_flash_attention_forward - print(f"Monkey patch _flash_attention_forward in {flash_attention.__name__}") + flash_attention._flash_attention_forward = _ulysses_flash_attention_forward + print(f"Monkey patch _flash_attention_forward in {flash_attention.__name__}") patch_forward_with_backends(model, use_fused_kernels=use_fused_kernels, fused_kernels_backend=fused_kernels_backend) diff --git a/verl/models/transformers/qwen2_vl.py b/verl/models/transformers/qwen2_vl.py index 7c8214c0a8e..0cc7820d114 100644 --- a/verl/models/transformers/qwen2_vl.py +++ b/verl/models/transformers/qwen2_vl.py @@ -217,18 +217,36 @@ def flash_attention_forward( ) # remove channel dimension cu_seqlens_q, cu_seqlens_k = cu_seq_lens max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - attn_output = flash_attn_varlen_func( + + flash_attn_func = flash_attn_varlen_func + common_attn_kwargs = { + "cu_seqlens_q": cu_seqlens_q, + "cu_seqlens_k": cu_seqlens_k, + "max_seqlen_q": max_seqlen_in_batch_q, + "max_seqlen_k": max_seqlen_in_batch_k, 
+ "dropout_p": kwargs.pop("dropout", 0.0), + "softmax_scale": kwargs.pop("softmax_scale", None), + **flash_kwargs, + } + + if flash_attn_func is None: + # Use transformers >= 4.54 + flash_attn_func = _flash_attention_forward + specific_attn_kwargs = { + "attention_mask": attention_mask, + "position_ids": position_ids, + "query_length": query_length, + "is_causal": causal, + } + else: + specific_attn_kwargs = {"causal": causal} + + attn_output = flash_attn_func( query_states, key_states, value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=kwargs.pop("dropout", 0.0), - softmax_scale=kwargs.pop("softmax_scale", None), - causal=causal, - **flash_kwargs, + **common_attn_kwargs, + **specific_attn_kwargs, ) attn_output = attn_output.view(batch_size, -1, attn_output.size(-2), attn_output.size(-1)) else: diff --git a/verl/workers/fsdp_workers.py b/verl/workers/fsdp_workers.py index df14d910db6..712568f2e8c 100644 --- a/verl/workers/fsdp_workers.py +++ b/verl/workers/fsdp_workers.py @@ -288,6 +288,11 @@ def _build_model_optimizer( actor_model_config = AutoConfig.from_pretrained( local_path, trust_remote_code=trust_remote_code, attn_implementation="flash_attention_2" ) + # TODO: VL models use VisionAttention, which directly uses flash_attention in transformers>=4.53 + # which will be patched by _ulysses_flash_attention_forward, but errorly misses position_ids + # Maybe support Ulysses in VisionAttention in the future and remove this patch + if self.ulysses_sequence_parallel_size > 1 and hasattr(actor_model_config, "vision_config"): + actor_model_config.vision_config._attn_implementation = "eager" # patch for kimi-vl if getattr(actor_model_config, "model_type", None) == "kimi_vl": @@ -1072,6 +1077,12 @@ def _build_critic_model_optimizer(self, config): attn_implementation="flash_attention_2", trust_remote_code=config.model.get("trust_remote_code", False), ) + # TODO: VL models use VisionAttention, which directly uses flash_attention in transformers>=4.53 + # which will be patched by _ulysses_flash_attention_forward, but errorly misses position_ids + # Maybe support Ulysses in VisionAttention in the future and remove this patch + if self.ulysses_sequence_parallel_size > 1 and hasattr(critic_model_config, "vision_config"): + critic_model_config.vision_config._attn_implementation = "eager" + critic_model_config.num_labels = 1 # patch for kimi-vl if getattr(critic_model_config, "model_type", None) == "kimi_vl": diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 2a50a11a7b7..20bf0bfad41 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -276,7 +276,7 @@ async def init_engine(self): skip_tokenizer_init=False, max_model_len=self.max_model_len, max_num_seqs=config.max_num_seqs, - load_format="auto", + load_format="dummy" if config.load_format.startswith("dummy") else config.load_format, disable_log_stats=config.disable_log_stats, max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=config.enable_chunked_prefill,