diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml index 9b1044443780..77abb5532dbe 100644 --- a/.buildkite/hardware_tests/cpu.yaml +++ b/.buildkite/hardware_tests/cpu.yaml @@ -14,13 +14,15 @@ steps: - tests/kernels/moe/test_cpu_fused_moe.py - tests/kernels/test_onednn.py - tests/kernels/test_awq_int4_to_int8.py + - tests/kernels/quantization/test_cpu_fp8_scaled_mm.py commands: - | - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m " + bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m " pytest -x -v -s tests/kernels/attention/test_cpu_attn.py pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py pytest -x -v -s tests/kernels/test_onednn.py - pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py" + pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py + pytest -x -v -s tests/kernels/quantization/test_cpu_fp8_scaled_mm.py" - label: CPU-Compatibility Tests depends_on: [] @@ -69,11 +71,11 @@ steps: pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs pytest -x -v -s tests/quantization/test_cpu_wna16.py" -- label: CPU-Distributed Tests +- label: CPU-Distributed Tests (PP+TP) depends_on: [] device: intel_cpu no_plugin: true - source_file_dependencies: + source_file_dependencies: &cpu_distributed_deps - csrc/cpu/shm.cpp - vllm/v1/worker/cpu_worker.py - vllm/v1/worker/gpu_worker.py @@ -82,10 +84,21 @@ steps: - vllm/platforms/cpu.py - vllm/distributed/parallel_state.py - vllm/distributed/device_communicators/cpu_communicator.py + - .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh + commands: + - | + bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m " + bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh tp_pp" + +- label: CPU-Distributed Tests (DP+TP) + depends_on: [] + device: intel_cpu + no_plugin: true + source_file_dependencies: *cpu_distributed_deps commands: - | bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m " - bash 
.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh" + bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh dp_tp" - label: CPU-Multi-Modal Model Tests %N depends_on: [] diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh index 00ae34bba6d7..10c03c3e1773 100755 --- a/.buildkite/image_build/image_build.sh +++ b/.buildkite/image_build/image_build.sh @@ -192,6 +192,7 @@ export BUILDKITE_COMMIT export PARENT_COMMIT export IMAGE_TAG export IMAGE_TAG_LATEST +export COMMIT="${COMMIT:-${BUILDKITE_COMMIT}}" export CACHE_FROM export CACHE_FROM_BASE_BRANCH export CACHE_FROM_MAIN diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 8fce15680173..8a900c0bf862 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -27,7 +27,7 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64_CU129}\" --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" env: DOCKER_BUILDKIT: "1" @@ -37,10 +37,10 @@ steps: agents: queue: arm64_cpu_queue_release commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinuxaarch64-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" env: DOCKER_BUILDKIT: "1" @@ -53,7 +53,7 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" env: DOCKER_BUILDKIT: "1" @@ -66,7 +66,7 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86_CU129}\" --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_31" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" env: DOCKER_BUILDKIT: "1" @@ -76,10 +76,10 @@ steps: agents: queue: cpu_queue_release commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinux2_28-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" env: DOCKER_BUILDKIT: "1" @@ -92,7 +92,7 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." 
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" env: DOCKER_BUILDKIT: "1" @@ -121,7 +121,19 @@ steps: queue: cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=13.0.2 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . 
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" # re-tag to default image tag and push, just in case arm64 build fails - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" @@ -134,7 +146,19 @@ steps: queue: arm64_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=13.0.2 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . 
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" - label: "Build release image - x86_64 - CUDA 12.9" @@ -144,7 +168,18 @@ steps: queue: cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86_CU129}\" --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129 --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=12.9.1 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86_CU129}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . 
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129" # re-tag to default image tag and push, just in case arm64 build fails - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129" @@ -157,7 +192,18 @@ steps: queue: arm64_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64_CU129}\" --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129 --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=12.9.1 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64_CU129}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . 
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129" - label: "Build release image - x86_64 - CUDA 13.0 - Ubuntu 24.04" @@ -167,7 +213,21 @@ steps: queue: cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh ubuntu2404) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=13.0.2 \ + --build-arg UBUNTU_VERSION=24.04 \ + --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . 
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404" - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404" @@ -179,7 +239,21 @@ steps: queue: arm64_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh ubuntu2404) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=13.0.2 \ + --build-arg UBUNTU_VERSION=24.04 \ + --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04 \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . 
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404" - label: "Build release image - x86_64 - CUDA 12.9 - Ubuntu 24.04" @@ -189,7 +263,20 @@ steps: queue: cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86_CU129}\" --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129-ubuntu2404) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=12.9.1 \ + --build-arg UBUNTU_VERSION=24.04 \ + --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_X86_CU129}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . 
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404" - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu129-ubuntu2404" @@ -201,7 +288,20 @@ steps: queue: arm64_cpu_queue_release commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64_CU129}\" --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ." + - | + DOCKER_BUILDKIT=1 docker build \ + $(bash .buildkite/scripts/docker-build-metadata-args.sh cu129-ubuntu2404) \ + --build-arg max_jobs=16 \ + --build-arg USE_SCCACHE=1 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg CUDA_VERSION=12.9.1 \ + --build-arg UBUNTU_VERSION=24.04 \ + --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 \ + --build-arg torch_cuda_arch_list="${CUDA_ARCH_AARCH64_CU129}" \ + --build-arg INSTALL_KV_CONNECTORS=true \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile . 
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu129-ubuntu2404" - block: "Build release image for x86_64 CPU" @@ -209,6 +309,7 @@ steps: depends_on: ~ - label: "Build release image - x86_64 - CPU" + key: build-cpu-release-image-x86 depends_on: - block-cpu-release-image-build - input-release-version @@ -227,7 +328,8 @@ steps: depends_on: ~ - label: "Build release image - arm64 - CPU" - depends_on: + key: build-cpu-release-image-arm64 + depends_on: - block-arm64-cpu-release-image-build - input-release-version agents: @@ -336,6 +438,41 @@ steps: DOCKER_BUILDKIT: "1" DOCKERHUB_USERNAME: "vllmbot" + - block: "Publish release images to DockerHub" + key: block-publish-release-images + depends_on: + - create-multi-arch-manifest + - create-multi-arch-manifest-cuda-12-9 + - create-multi-arch-manifest-ubuntu2404 + - create-multi-arch-manifest-cuda-12-9-ubuntu2404 + - build-rocm-release-image + - input-release-version + # Wait for CPU builds if their block steps were unblocked, so publish + # doesn't race the in-progress CPU build. allow_failure lets publish + # proceed when the operator legitimately leaves the CPU block steps + # unblocked or the CPU build fails. 
+ - step: build-cpu-release-image-x86 + allow_failure: true + - step: build-cpu-release-image-arm64 + allow_failure: true + if: build.env("NIGHTLY") != "1" + + - label: "Publish release images to DockerHub" + depends_on: + - block-publish-release-images + key: publish-release-images-dockerhub + agents: + queue: small_cpu_queue_release + commands: + - "bash .buildkite/scripts/publish-release-images.sh" + plugins: + - docker-login#v3.0.0: + username: vllmbot + password-env: DOCKERHUB_TOKEN + env: + DOCKER_BUILDKIT: "1" + DOCKERHUB_USERNAME: "vllmbot" + - group: "Publish wheels" key: "publish-wheels" steps: @@ -623,7 +760,7 @@ steps: - "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh" env: S3_BUCKET: "vllm-wheels" - VARIANT: "rocm721" + VARIANT: "rocm722" # ROCm Job 6: Build ROCm Release Docker Image - label: ":docker: Build release image - x86_64 - ROCm" diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh index 6f41d1cdda47..afa884fba46b 100755 --- a/.buildkite/scripts/annotate-release.sh +++ b/.buildkite/scripts/annotate-release.sh @@ -8,8 +8,6 @@ if [ -z "${RELEASE_VERSION}" ]; then RELEASE_VERSION="1.0.0.dev" fi -ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key) - buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF To download the wheel (by commit): \`\`\` @@ -25,95 +23,5 @@ aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38- aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl . 
\`\`\` - -To download and upload the image: - -\`\`\` -# Download images: - -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu129 -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu129 -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm -docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} -docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} - -# Tag and push images: - -## CUDA - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64 -docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64 -docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 -docker push vllm/vllm-openai:latest-x86_64 -docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu129 vllm/vllm-openai:x86_64-cu129 -docker tag vllm/vllm-openai:x86_64-cu129 vllm/vllm-openai:latest-x86_64-cu129 -docker tag vllm/vllm-openai:x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 -docker push vllm/vllm-openai:latest-x86_64-cu129 -docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64 -docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64 -docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 -docker push vllm/vllm-openai:latest-aarch64 -docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu129 
vllm/vllm-openai:aarch64-cu129 -docker tag vllm/vllm-openai:aarch64-cu129 vllm/vllm-openai:latest-aarch64-cu129 -docker tag vllm/vllm-openai:aarch64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 -docker push vllm/vllm-openai:latest-aarch64-cu129 -docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 - -## ROCm - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} -docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest -docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION} -docker push vllm/vllm-openai-rocm:latest -docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION} - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base -docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base -docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base -docker push vllm/vllm-openai-rocm:latest-base -docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base - -## CPU - -docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64 -docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64 -docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 -docker push vllm/vllm-openai-cpu:latest-x86_64 -docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 - -docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64 -docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64 -docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64 -docker push vllm/vllm-openai-cpu:latest-arm64 -docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64 - -# Create multi-arch manifest: - -docker manifest rm 
vllm/vllm-openai:latest -docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 -docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 -docker manifest push vllm/vllm-openai:latest -docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} - -docker manifest rm vllm/vllm-openai:latest-cu129 -docker manifest create vllm/vllm-openai:latest-cu129 vllm/vllm-openai:latest-x86_64-cu129 vllm/vllm-openai:latest-aarch64-cu129 -docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 -docker manifest push vllm/vllm-openai:latest-cu129 -docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu129 - -docker manifest rm vllm/vllm-openai-cpu:latest || true -docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64 -docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64 -docker manifest push vllm/vllm-openai-cpu:latest -docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION} -\`\`\` +Docker images are published automatically by the "Publish release images to DockerHub" pipeline step. EOF diff --git a/.buildkite/scripts/ci-fetch-log.sh b/.buildkite/scripts/ci-fetch-log.sh new file mode 100755 index 000000000000..02798b56f4a9 --- /dev/null +++ b/.buildkite/scripts/ci-fetch-log.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Usage: ./ci-fetch-log.sh <build_number> <job_uuid> [output_file] +# ./ci-fetch-log.sh <buildkite_job_url> [output_file] +# +# Downloads the raw log for a Buildkite job from the public, unauthenticated +# /organizations/<org>/pipelines/<pipeline>/builds/<build_number>/jobs/<job_uuid>/download +# endpoint, then strips ANSI/timestamps via ci-clean-log.sh. 
+# +# Find and via: +# gh pr checks --repo vllm-project/vllm +# Each failing row's URL is .../builds/#. + +set -euo pipefail + +ORG="vllm" +PIPELINE="ci" + +usage() { + echo "Usage: $0 [output_file]" + echo " $0 [output_file]" + exit 1 +} + +if [ $# -lt 1 ]; then usage; fi + +if [[ "$1" == https://* ]]; then + BUILD=$(echo "$1" | sed -nE 's#.*/builds/([0-9]+).*#\1#p') + JOB=$(echo "$1" | grep -oE '[0-9a-f]{8}-[0-9a-f-]+' | head -n 1) + OUT="${2:-ci-${BUILD}-${JOB:0:8}.log}" +else + if [ $# -lt 2 ]; then usage; fi + BUILD="$1" + JOB="$2" + OUT="${3:-ci-${BUILD}-${JOB:0:8}.log}" +fi + +if [ -z "$BUILD" ] || [ -z "$JOB" ]; then + echo "Could not parse build number or job UUID from: $1" >&2 + usage +fi + +COOKIES=$(mktemp) +trap 'rm -f "$COOKIES"' EXIT + +# Buildkite issues a session cookie on first hit; subsequent /download needs it. +curl -fsSL -c "$COOKIES" -A "vllm-ci-fetch-log" \ + "https://buildkite.com/${ORG}/${PIPELINE}/builds/${BUILD}" -o /dev/null + +curl -fsSL -b "$COOKIES" -A "vllm-ci-fetch-log" \ + "https://buildkite.com/organizations/${ORG}/pipelines/${PIPELINE}/builds/${BUILD}/jobs/${JOB}/download" \ + -o "$OUT" + +bash "$(dirname "$0")/ci-clean-log.sh" "$OUT" + +echo "$OUT" diff --git a/.buildkite/scripts/detect-manylinux-tag.py b/.buildkite/scripts/detect-manylinux-tag.py new file mode 100644 index 000000000000..40fa6c6ffbb7 --- /dev/null +++ b/.buildkite/scripts/detect-manylinux-tag.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Detect the manylinux platform tag for a wheel and rename it in place. + +vLLM's build images produce wheels with the generic ``linux_`` platform +tag, which installers like ``pip`` won't accept off PyPI/our index. We need to +rewrite the platform tag to the appropriate ``manylinux___`` +before uploading. 
+ +Historically the tag was hard-coded per build (``manylinux_2_31`` for the +Ubuntu 20.04-based image, ``manylinux_2_35`` for the Ubuntu 22.04-based +images). That is brittle: bumping the base image silently produces wheels +labelled with the wrong glibc requirement. This script asks ``auditwheel`` +to derive the tag from the symbol versions actually referenced by the +binaries inside the wheel, so the label tracks reality. + +We can't simply call ``auditwheel repair`` -- it tries to graft external +shared libraries into the wheel and fails on vLLM's CUDA/cuBLAS dependencies. +Instead we use ``auditwheel.wheel_abi.analyze_wheel_abi`` directly, which is +the same call that powers ``auditwheel show``, and read off +``winfo.sym_policy.name``. + +Usage: + detect-manylinux-tag.py <wheel> + +The wheel is renamed in place; the new path is printed on stdout. All +diagnostics go to stderr so callers can capture stdout safely. +""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +from auditwheel.error import ( + AuditwheelError, + NonPlatformWheelError, + WheelToolsError, +) +from auditwheel.wheel_abi import analyze_wheel_abi +from auditwheel.wheeltools import get_wheel_architecture, get_wheel_libc + + +def detect_platform_tag(wheel_path: Path) -> str: + """Return the most precise platform tag the wheel is consistent with. + + Mirrors ``auditwheel show`` but returns ``sym_policy`` rather than + ``overall_policy``: we only care about the glibc symbol versions used, + not about other policy axes (ISA extensions, blacklist, etc.) that + ``overall_policy`` folds in. + """ + fn = wheel_path.name + + try: + arch = get_wheel_architecture(fn) + except (WheelToolsError, NonPlatformWheelError): + # Architecture isn't deducible from the filename; let auditwheel + # infer it from the ELF binaries inside the wheel. 
+ arch = None + + try: + libc = get_wheel_libc(fn) + except WheelToolsError: + # An unrepaired wheel uses ``linux_<arch>``, which doesn't encode + # libc. Let auditwheel infer it from the ELF binaries. + libc = None + + winfo = analyze_wheel_abi( + libc, + arch, + wheel_path, + frozenset(), + disable_isa_ext_check=False, + allow_graft=False, + ) + return winfo.sym_policy.name + + +def rename_wheel(wheel_path: Path, new_platform_tag: str) -> Path: + """Rename the wheel in place, replacing only its platform tag.""" + # Wheel filename per PEP 427: + # {distribution}-{version}(-{build})?-{python}-{abi}-{platform}.whl + # The platform tag is always the last ``-``-separated token before + # ``.whl``. Compound tags like ``manylinux_2_31_x86_64`` use ``_`` as the + # internal separator, so ``-``-splitting is unambiguous. + parts = wheel_path.stem.split("-") + if len(parts) < 5: + raise ValueError(f"Unrecognised wheel filename: {wheel_path.name}") + parts[-1] = new_platform_tag + new_path = wheel_path.with_name("-".join(parts) + ".whl") + if new_path != wheel_path: + wheel_path.rename(new_path) + return new_path + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Detect a wheel's manylinux platform tag with " + "auditwheel and rename the wheel in place." + ) + parser.add_argument( + "wheel", + type=Path, + help="Path to the wheel to inspect and rename.", + ) + args = parser.parse_args() + + wheel_path: Path = args.wheel + if not wheel_path.is_file(): + print(f"error: {wheel_path} is not a file", file=sys.stderr) + return 1 + + # Catch the things that ``analyze_wheel_abi`` and ``rename_wheel`` can + # raise: any subclass of ``AuditwheelError`` (pure-Python wheels, + # invalid libc, malformed wheels), filesystem errors, or our own + # ``ValueError`` for an unrecognised wheel filename. Print a single + # ``ERROR_TYPE: message`` line to stderr instead of a Python + # traceback, which is much friendlier in CI logs. 
+ try: + new_tag = detect_platform_tag(wheel_path) + print(f"detected platform tag: {new_tag}", file=sys.stderr) + new_path = rename_wheel(wheel_path, new_tag) + except (AuditwheelError, ValueError, OSError) as e: + print( + f"error: failed to retag {wheel_path.name}: {type(e).__name__}: {e}", + file=sys.stderr, + ) + return 2 + + if new_path != wheel_path: + print(f"renamed {wheel_path.name} -> {new_path.name}", file=sys.stderr) + else: + print(f"wheel already tagged {new_tag}", file=sys.stderr) + + print(new_path) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.buildkite/scripts/docker-build-metadata-args.sh b/.buildkite/scripts/docker-build-metadata-args.sh new file mode 100644 index 000000000000..9aa6fa9314f7 --- /dev/null +++ b/.buildkite/scripts/docker-build-metadata-args.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# Emit docker build flags for release image provenance metadata. +# Keep this helper best-effort: missing Buildkite metadata should fall back to +# local/default values instead of blocking the Docker build. + +# Variant examples: "", "cu129", "ubuntu2404", "cu129-ubuntu2404". 
+variant="${1:-}" +variant_suffix="${variant:+-${variant}}" + +image_name="${VLLM_DOCKER_IMAGE_NAME:-vllm/vllm-openai}" +staging_repo="${VLLM_STAGING_IMAGE_REPO:-public.ecr.aws/q9t5s3a7/vllm-release-repo}" +build_commit="${VLLM_BUILD_COMMIT:-${BUILDKITE_COMMIT:-unknown}}" +build_pipeline="${VLLM_BUILD_PIPELINE:-${BUILDKITE_PIPELINE_ID:-${BUILDKITE_PIPELINE_SLUG:-local}}}" +build_url="${VLLM_BUILD_URL:-${BUILDKITE_BUILD_URL:-}}" +tag_commit="${BUILDKITE_COMMIT:-${build_commit}}" + +if [[ -n "${BUILDKITE:-}" || -n "${BUILDKITE_COMMIT:-}" ]]; then + release_version="${RELEASE_VERSION:-}" + if command -v buildkite-agent >/dev/null 2>&1; then + release_version="${release_version:-$(buildkite-agent meta-data get release-version 2>/dev/null)}" + fi + release_version="${release_version#v}" + release_version="${release_version:-${tag_commit}}" + + staging_image_ref="${staging_repo}:${tag_commit}-$(uname -m)${variant_suffix}" + + if [[ "${NIGHTLY:-}" == "1" ]]; then + if [[ -z "${variant}" ]]; then + image_tag="${image_name}:nightly-${tag_commit}" + elif [[ "${variant}" == cu* ]]; then + cuda_variant="${variant%%-*}" + remaining_variant="${variant#${cuda_variant}}" + image_tag="${image_name}:${cuda_variant}-nightly-${tag_commit}${remaining_variant}" + else + image_tag="${image_name}:nightly-${tag_commit}${variant_suffix}" + fi + else + image_tag="${image_name}:v${release_version}${variant_suffix}" + fi +else + image_tag="${VLLM_IMAGE_TAG:-local/vllm-openai:dev}" + staging_image_ref="${image_tag}" +fi + +emit_arg() { + printf -- "--build-arg %s=%s " "$1" "$2" +} + +emit_arg VLLM_BUILD_COMMIT "${build_commit}" +emit_arg VLLM_BUILD_PIPELINE "${build_pipeline}" +emit_arg VLLM_BUILD_URL "${build_url}" +# This is the intended public tag. The final digest is only known after push. 
+emit_arg VLLM_IMAGE_TAG "${image_tag}" +printf -- "--tag %s " "${staging_image_ref}" diff --git a/.buildkite/scripts/generate-and-upload-nightly-index.sh b/.buildkite/scripts/generate-and-upload-nightly-index.sh index 88c4f5173139..502ed0609310 100755 --- a/.buildkite/scripts/generate-and-upload-nightly-index.sh +++ b/.buildkite/scripts/generate-and-upload-nightly-index.sh @@ -10,20 +10,13 @@ set -ex BUCKET="vllm-wheels" INDICES_OUTPUT_DIR="indices" DEFAULT_VARIANT_ALIAS="cu130" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py -PYTHON="${PYTHON_PROG:-python3}" # try to read from env var, otherwise use python3 SUBPATH=$BUILDKITE_COMMIT S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/" -# detect if python3.12+ is available -has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)") -if [[ "$has_new_python" -eq 0 ]]; then - # use new python from docker - docker pull python:3-slim - PYTHON="docker run --rm -u $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3" -fi - -echo "Using python interpreter: $PYTHON" -echo "Python version: $($PYTHON --version)" +# Select python3 (>= 3.12) -- local if available, else a docker fallback. +# shellcheck source=lib/select-python.sh +source .buildkite/scripts/lib/select-python.sh +select_python # ======== generate and upload indices ======== diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 703a7d753220..7e8ddb12ec98 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -378,9 +378,11 @@ HF_MOUNT="/root/.cache/huggingface" # double-quotes will have been stripped by the calling shell. if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then commands="${VLLM_TEST_COMMANDS}" + commands_source="env" echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)" else commands="$*" + commands_source="argv" if [[ -z "$commands" ]]; then echo "Error: No test commands provided." 
>&2 echo "Usage:" >&2 @@ -397,9 +399,15 @@ fi echo "Raw commands: $commands" -# Fix quoting before ROCm overrides (so overrides see correct structure) -commands=$(re_quote_pytest_markers "$commands") -echo "After re-quoting: $commands" +# Only try to repair stripped pytest -m/-k quoting in legacy argv mode. +# VLLM_TEST_COMMANDS preserves inner quoting already, and re-quoting that path +# can corrupt embedded echo strings or otherwise well-formed shell fragments. +if [[ "$commands_source" == "argv" ]]; then + commands=$(re_quote_pytest_markers "$commands") + echo "After re-quoting: $commands" +else + echo "Skipping re-quoting for VLLM_TEST_COMMANDS input" +fi commands=$(apply_rocm_test_overrides "$commands") echo "Final commands: $commands" diff --git a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh index f12bb524d4cb..8ac27ed6583a 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh @@ -3,42 +3,37 @@ set -euox pipefail export VLLM_CPU_CI_ENV=0 export VLLM_CPU_KVCACHE_SPACE=1 # avoid OOM -echo "--- PP+TP" -vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 --max-model-len=4096 & -server_pid=$! -timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1 -vllm bench serve \ - --backend vllm \ - --dataset-name random \ - --model meta-llama/Llama-3.2-3B-Instruct \ - --num-prompts 20 \ - --result-dir ./test_results \ - --result-filename tp_pp.json \ - --save-result \ - --endpoint /v1/completions -kill -s SIGTERM $server_pid; wait $server_pid || true -failed_req=$(jq '.failed' ./test_results/tp_pp.json) -if [ "$failed_req" -ne 0 ]; then - echo "Some requests were failed!" - exit 1 -fi +MODE=${1:-all} -echo "--- DP+TP" -vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 --max-model-len=4096 & -server_pid=$! 
-timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1 -vllm bench serve \ - --backend vllm \ - --dataset-name random \ - --model meta-llama/Llama-3.2-3B-Instruct \ - --num-prompts 20 \ - --result-dir ./test_results \ - --result-filename dp_pp.json \ - --save-result \ - --endpoint /v1/completions -kill -s SIGTERM $server_pid; wait $server_pid || true -failed_req=$(jq '.failed' ./test_results/dp_pp.json) -if [ "$failed_req" -ne 0 ]; then - echo "Some requests were failed!" - exit 1 -fi +run_scenario() { + local label="$1" result_file="$2" + shift 2 + echo "--- $label" + vllm serve meta-llama/Llama-3.2-3B-Instruct "$@" --max-model-len=4096 & + local server_pid=$! + timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1 + vllm bench serve \ + --backend vllm \ + --dataset-name random \ + --model meta-llama/Llama-3.2-3B-Instruct \ + --num-prompts 20 \ + --result-dir ./test_results \ + --result-filename "$result_file" \ + --save-result \ + --endpoint /v1/completions + kill -s SIGTERM "$server_pid"; wait "$server_pid" || true + if [ "$(jq '.failed' "./test_results/$result_file")" -ne 0 ]; then + echo "Some requests were failed in $label!" 
+ exit 1 + fi +} + +case "$MODE" in + tp_pp) run_scenario "PP+TP" tp_pp.json -tp=2 -pp=2 ;; + dp_tp) run_scenario "DP+TP" dp_tp.json -tp=2 -dp=2 ;; + all) + run_scenario "PP+TP" tp_pp.json -tp=2 -pp=2 + run_scenario "DP+TP" dp_tp.json -tp=2 -dp=2 + ;; + *) echo "ERROR: unknown mode '$MODE' (expected: tp_pp | dp_tp | all)" >&2; exit 1 ;; +esac diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh index 7166435ac1e9..0322397394d1 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh @@ -67,6 +67,21 @@ function cpu_tests() { --num-prompts 20 \ --endpoint /v1/completions kill -s SIGTERM $server_pid &' + + # smoke test for Gated DeltaNet + docker exec cpu-test bash -c ' + set -e + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3.5-0.8B --max-model-len 2048 & + server_pid=$! + timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + vllm bench serve \ + --backend vllm \ + --dataset-name random \ + --model Qwen/Qwen3.5-0.8B \ + --num-prompts 20 \ + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' + } # All of CPU tests are expected to be finished less than 40 mins. diff --git a/.buildkite/scripts/lib/manylinux.sh b/.buildkite/scripts/lib/manylinux.sh new file mode 100644 index 000000000000..bde2dfe0a3dc --- /dev/null +++ b/.buildkite/scripts/lib/manylinux.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Shared helper for rewriting a wheel's platform tag from the generic +# ``linux_<arch>`` to the correct ``manylinux_<major>_<minor>_<arch>``. +# After sourcing, call ``apply_manylinux_tag <wheel>`` on each wheel +# that still carries the generic tag; the renamed path is printed on +# stdout (logs go to stderr).
+# +# Why a pinned Docker container instead of using whatever Python +# happens to be on the agent: +# - vLLM's release agents are heterogeneous -- they don't agree on +# a Python minor version, and we can't rely on a particular +# ``auditwheel`` being installed. +# - ``detect-manylinux-tag.py`` reads ``auditwheel.wheel_abi`` and +# ``Policy.sym_policy``, which are *internal* APIs without a +# stability promise. Pinning both Python and auditwheel makes the +# detected tag a function of the inputs alone, and shifts version +# bumps from "implicit drift" to "deliberate, retested change". +# - Other release scripts (``generate-and-upload-nightly-index.sh``, +# ``upload-rocm-wheels.sh``) already use the python:3-slim image +# when the agent's interpreter is too old; this is the same idea +# made stricter. +# +# To keep the per-wheel cost down (the ROCm upload retags ~10 wheels +# each run), we install auditwheel into a long-lived helper container +# once on source, then ``docker exec`` into it for each call. +# +# Trap behaviour: +# - Sourcing installs an EXIT trap that calls ``manylinux_cleanup`` to +# tear down the helper container. Any EXIT trap that was already in +# place when this file was sourced is captured and run AFTER our +# cleanup, so we don't silently clobber it. +# - If a caller sets a new EXIT trap *after* sourcing, that trap will +# replace ours; in that case the caller should call +# ``manylinux_cleanup`` from their own handler. + +if [[ -n "${_MANYLINUX_LIB_SOURCED:-}" ]]; then + return 0 +fi +_MANYLINUX_LIB_SOURCED=1 + +# Pin both sides. Bump these deliberately and re-run a representative +# wheel from each build target through the detection. +_MANYLINUX_PYTHON_IMAGE="python:3.12-slim" +_MANYLINUX_AUDITWHEEL_VERSION="6.6.0" + +# Resolve our own directory (and the sibling detect script) using the +# canonical, symlink-resolved path. 
The container mounts cwd at the +# same absolute path on both sides, so all paths we hand to it -- the +# script, the wheel -- must canonicalise to a location under cwd. +_MANYLINUX_LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)" +_MANYLINUX_DETECT_SCRIPT="$(cd "${_MANYLINUX_LIB_DIR}/.." && pwd -P)/detect-manylinux-tag.py" +_MANYLINUX_CWD="$(pwd -P)" + +docker pull --quiet "$_MANYLINUX_PYTHON_IMAGE" >/dev/null + +# Spin up a long-lived helper container so we install auditwheel once +# and then ``docker exec`` into it for each wheel. +# +# The container runs as root so ``pip install`` can write into the +# system site-packages; individual ``docker exec`` calls below pin +# themselves to the host UID so any file rename happens with host +# ownership, not root. +_MANYLINUX_CONTAINER="$(docker run -d --rm \ + -v "$_MANYLINUX_CWD:$_MANYLINUX_CWD" \ + -w "$_MANYLINUX_CWD" \ + "$_MANYLINUX_PYTHON_IMAGE" \ + sleep infinity)" +docker exec "$_MANYLINUX_CONTAINER" \ + pip install --quiet --disable-pip-version-check \ + --root-user-action=ignore \ + "auditwheel==${_MANYLINUX_AUDITWHEEL_VERSION}" + +# Public cleanup -- safe to call multiple times. +manylinux_cleanup() { + if [[ -n "${_MANYLINUX_CONTAINER:-}" ]]; then + docker rm -f "$_MANYLINUX_CONTAINER" >/dev/null 2>&1 || true + _MANYLINUX_CONTAINER="" + fi +} + +# Capture any EXIT trap that was already in place so we can chain to +# it rather than overwrite it. ``trap -p EXIT`` prints the handler in +# eval-able form (``trap -- 'CMD' EXIT``) or nothing if unset; we +# strip the wrapper to recover ``CMD``. Handles the common case -- +# CMDs without embedded single quotes -- and degrades gracefully (we +# still run our own cleanup) for the pathological case. 
+_manylinux_prev_exit_trap_cmd="" +_manylinux_existing_exit_trap="$(trap -p EXIT)" +if [[ -n "$_manylinux_existing_exit_trap" ]]; then + _tmp="${_manylinux_existing_exit_trap#trap -- \'}" + _manylinux_prev_exit_trap_cmd="${_tmp%\' EXIT}" + unset _tmp +fi +unset _manylinux_existing_exit_trap + +_manylinux_run_exit_chain() { + manylinux_cleanup + if [[ -n "$_manylinux_prev_exit_trap_cmd" ]]; then + eval "$_manylinux_prev_exit_trap_cmd" + fi +} +trap _manylinux_run_exit_chain EXIT + +# Detect the manylinux platform tag for a single wheel and rename it +# in place, printing the renamed wheel path on stdout. Returns +# non-zero on failure (which under ``set -e`` propagates to caller). +# +# The wheel must be reachable via a path under the host cwd so it's +# visible inside the helper container; in CI the wheels always live +# under ``artifacts/`` so this is fine. +apply_manylinux_tag() { + local wheel="$1" + local abs_wheel + abs_wheel="$(realpath "$wheel")" + local new_wheel + new_wheel="$(docker exec -u "$(id -u):$(id -g)" \ + "$_MANYLINUX_CONTAINER" \ + python "$_MANYLINUX_DETECT_SCRIPT" "$abs_wheel")" + if [[ -z "$new_wheel" || ! -f "$new_wheel" ]]; then + echo "apply_manylinux_tag: detect-manylinux-tag.py did not produce a valid wheel path for $wheel" >&2 + return 1 + fi + printf '%s\n' "$new_wheel" +} diff --git a/.buildkite/scripts/lib/select-python.sh b/.buildkite/scripts/lib/select-python.sh new file mode 100644 index 000000000000..bc53030a2b50 --- /dev/null +++ b/.buildkite/scripts/lib/select-python.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Pick a Python interpreter for buildkite scripts: prefer a local +# ``python3`` if it is recent enough (>= 3.12), otherwise fall back to +# a one-shot Docker container running ``python:3-slim``. After +# ``select_python`` returns, ``$PYTHON`` is set in the caller's shell +# and is safe to use as a command (e.g. 
``$PYTHON some_script.py``). +# +# The 3.12 threshold matches what the existing nightly-index work +# expects -- typing features used by ``generate-nightly-index.py``. +# This helper does not pin the *minor* version; if you need stricter +# reproducibility (e.g. relying on auditwheel internals), invoke +# Docker yourself with a pinned tag rather than calling this. + +if [[ -n "${_SELECT_PYTHON_LIB_SOURCED:-}" ]]; then + return 0 +fi +_SELECT_PYTHON_LIB_SOURCED=1 + +# Sets ``PYTHON`` in the caller's shell and exports it. Idempotent -- +# calling twice is safe and the second call simply re-runs the probe. +select_python() { + local py="${PYTHON_PROG:-python3}" + local has_new_python + has_new_python=$("$py" -c \ + "print(1 if __import__('sys').version_info >= (3,12) else 0)" \ + 2>/dev/null || echo 0) + if [[ "$has_new_python" -eq 0 ]]; then + # ``-u $(id -u):$(id -g)`` so files created via the container + # end up owned by the host user, not root. + docker pull python:3-slim + PYTHON="docker run --rm -u $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3" + else + PYTHON="$py" + fi + export PYTHON + echo "Using python interpreter: $PYTHON" + echo "Python version: $($PYTHON --version)" +} diff --git a/.buildkite/scripts/publish-release-images.sh b/.buildkite/scripts/publish-release-images.sh new file mode 100755 index 000000000000..ec319aa76006 --- /dev/null +++ b/.buildkite/scripts/publish-release-images.sh @@ -0,0 +1,180 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Publish release Docker images from ECR to DockerHub. +# Pulls per-arch images, tags with latest and versioned tags, pushes them, +# then creates and pushes multi-arch manifests. 
+ +set -euo pipefail + +RELEASE_VERSION=$(buildkite-agent meta-data get release-version --default "" | sed 's/^v//') +if [ -z "${RELEASE_VERSION}" ]; then + echo "ERROR: release-version metadata not set" + exit 1 +fi + +COMMIT="$BUILDKITE_COMMIT" +ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key) + +echo "========================================" +echo "Publishing release images v${RELEASE_VERSION}" +echo " Commit: ${COMMIT}" +echo " ROCm base cache key: ${ROCM_BASE_CACHE_KEY}" +echo "========================================" + +# Login to ECR to pull staging images +aws ecr-public get-login-password --region us-east-1 | \ + docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 + +# ---- CUDA (default: 13.0) ---- + +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64 +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64 vllm/vllm-openai:latest-x86_64 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 +docker push vllm/vllm-openai:latest-x86_64 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64 vllm/vllm-openai:latest-aarch64 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 +docker push vllm/vllm-openai:latest-aarch64 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 + +docker manifest rm vllm/vllm-openai:latest || true +docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION} || true +docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 +docker manifest push vllm/vllm-openai:latest +docker manifest 
push vllm/vllm-openai:v${RELEASE_VERSION} + +# ---- CUDA 12.9 ---- + +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129 +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129 vllm/vllm-openai:latest-x86_64-cu129 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 +docker push vllm/vllm-openai:latest-x86_64-cu129 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129 vllm/vllm-openai:latest-aarch64-cu129 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 +docker push vllm/vllm-openai:latest-aarch64-cu129 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 + +docker manifest rm vllm/vllm-openai:latest-cu129 || true +docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION}-cu129 || true +docker manifest create vllm/vllm-openai:latest-cu129 vllm/vllm-openai:latest-x86_64-cu129 vllm/vllm-openai:latest-aarch64-cu129 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 +docker manifest push vllm/vllm-openai:latest-cu129 +docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu129 + +# ---- Ubuntu 24.04 (CUDA 13.0) ---- + +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-ubuntu2404 +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-ubuntu2404 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-ubuntu2404 vllm/vllm-openai:latest-x86_64-ubuntu2404 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-ubuntu2404 +docker push 
vllm/vllm-openai:latest-x86_64-ubuntu2404 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-ubuntu2404 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-ubuntu2404 vllm/vllm-openai:latest-aarch64-ubuntu2404 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-ubuntu2404 +docker push vllm/vllm-openai:latest-aarch64-ubuntu2404 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-ubuntu2404 + +docker manifest rm vllm/vllm-openai:latest-ubuntu2404 || true +docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION}-ubuntu2404 || true +docker manifest create vllm/vllm-openai:latest-ubuntu2404 vllm/vllm-openai:latest-x86_64-ubuntu2404 vllm/vllm-openai:latest-aarch64-ubuntu2404 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-ubuntu2404 +docker manifest push vllm/vllm-openai:latest-ubuntu2404 +docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-ubuntu2404 + +# ---- Ubuntu 24.04 (CUDA 12.9) ---- + +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129-ubuntu2404 +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129-ubuntu2404 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129-ubuntu2404 vllm/vllm-openai:latest-x86_64-cu129-ubuntu2404 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129-ubuntu2404 +docker push vllm/vllm-openai:latest-x86_64-cu129-ubuntu2404 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129-ubuntu2404 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129-ubuntu2404 vllm/vllm-openai:latest-aarch64-cu129-ubuntu2404 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129-ubuntu2404 
vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129-ubuntu2404 +docker push vllm/vllm-openai:latest-aarch64-cu129-ubuntu2404 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129-ubuntu2404 + +docker manifest rm vllm/vllm-openai:latest-cu129-ubuntu2404 || true +docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION}-cu129-ubuntu2404 || true +docker manifest create vllm/vllm-openai:latest-cu129-ubuntu2404 vllm/vllm-openai:latest-x86_64-cu129-ubuntu2404 vllm/vllm-openai:latest-aarch64-cu129-ubuntu2404 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu129-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129-ubuntu2404 +docker manifest push vllm/vllm-openai:latest-cu129-ubuntu2404 +docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu129-ubuntu2404 + +# ---- ROCm ---- + +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-rocm +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-rocm vllm/vllm-openai-rocm:latest +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION} +docker push vllm/vllm-openai-rocm:latest +docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION} + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:latest-base +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base +docker push vllm/vllm-openai-rocm:latest-base +docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base + +# ---- CPU ---- +# CPU images are behind separate block steps and may not have been built. +# All-or-nothing: inspect both arches first, then either publish everything +# (per-arch + multi-arch manifest) or skip everything. 
Publishing only one +# arch would leave `:latest-x86_64` pointing at the new release while the +# `:latest` multi-arch manifest still resolves to the previous release. + +CPU_X86_TAG=public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} +CPU_ARM_TAG=public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} + +CPU_X86_AVAILABLE=false +CPU_ARM_AVAILABLE=false +docker manifest inspect "${CPU_X86_TAG}" >/dev/null 2>&1 && CPU_X86_AVAILABLE=true +docker manifest inspect "${CPU_ARM_TAG}" >/dev/null 2>&1 && CPU_ARM_AVAILABLE=true + +if [ "$CPU_X86_AVAILABLE" = "true" ] && [ "$CPU_ARM_AVAILABLE" = "true" ]; then + docker pull "${CPU_X86_TAG}" + docker tag "${CPU_X86_TAG}" vllm/vllm-openai-cpu:latest-x86_64 + docker tag "${CPU_X86_TAG}" vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 + docker push vllm/vllm-openai-cpu:latest-x86_64 + docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 + + docker pull "${CPU_ARM_TAG}" + docker tag "${CPU_ARM_TAG}" vllm/vllm-openai-cpu:latest-arm64 + docker tag "${CPU_ARM_TAG}" vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64 + docker push vllm/vllm-openai-cpu:latest-arm64 + docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64 + + docker manifest rm vllm/vllm-openai-cpu:latest || true + docker manifest rm vllm/vllm-openai-cpu:v${RELEASE_VERSION} || true + docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64 + docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64 + docker manifest push vllm/vllm-openai-cpu:latest + docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION} +elif [ "$CPU_X86_AVAILABLE" = "false" ] && [ "$CPU_ARM_AVAILABLE" = "false" ]; then + echo "WARNING: Neither CPU image found in ECR, skipping CPU publish (ensure block-cpu-release-image-build and block-arm64-cpu-release-image-build were unblocked and the builds 
finished pushing)" +else + # Partial state: one arch built, the other did not. Fail loudly rather than + # ship a Docker Hub state where `:latest-${arch}` and `:latest` (multi-arch) + # disagree on which release they point at. + echo "ERROR: Partial CPU build detected (x86_64=${CPU_X86_AVAILABLE}, arm64=${CPU_ARM_AVAILABLE})." + echo " Refusing to publish to avoid split-tag drift between per-arch and multi-arch tags." + echo " Re-run the missing CPU build and retry, or manually publish if a single-arch release is intended." + exit 1 +fi + +echo "" +echo "Successfully published release images for v${RELEASE_VERSION}" diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh index de48eb282a65..0eadfa1f80b4 100755 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh @@ -51,6 +51,7 @@ vllm serve "$MODEL" \ --offload-num-in-group 2 \ --offload-prefetch-step 1 \ --offload-params w13_weight w2_weight \ + --generation-config vllm \ --port "$PORT" \ ${EXTRA_ARGS+"${EXTRA_ARGS[@]}"} & SERVER_PID=$! 
diff --git a/.buildkite/scripts/tool_call/run-bfcl-eval.sh b/.buildkite/scripts/tool_call/run-bfcl-eval.sh index f3e5009e6fe3..3748cab62c7c 100755 --- a/.buildkite/scripts/tool_call/run-bfcl-eval.sh +++ b/.buildkite/scripts/tool_call/run-bfcl-eval.sh @@ -28,6 +28,7 @@ # BFCL_MAX_MODEL_LEN - Max model length (default: 4096) # BFCL_PORT - Server port (default: 8000) # BFCL_REASONING_PARSER - Reasoning parser name (default: disabled) +# BFCL_TEMPERATURE - Temperature (default: 0.0) # BFCL_EXTRA_ARGS - Additional vLLM server args set -euo pipefail @@ -43,6 +44,7 @@ TP_SIZE="${BFCL_TP_SIZE:-1}" MAX_MODEL_LEN="${BFCL_MAX_MODEL_LEN:-4096}" PORT="${BFCL_PORT:-8000}" REASONING_PARSER="${BFCL_REASONING_PARSER:-}" +TEMPERATURE="${BFCL_TEMPERATURE:-0.0}" EXTRA_ARGS="${BFCL_EXTRA_ARGS:-}" # Set up output directory @@ -139,7 +141,7 @@ echo "vLLM server is ready. (started in ${SECONDS_WAITED}s)" # be patched in-process so BFCL knows to use the OpenAI-compatible handler # against our local vLLM server. bfcl_exit_code=0 -python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$? +python3 - "$MODEL" "$TEST_CATEGORY" "$NUM_THREADS" "$PORT" "$API_TYPE" "$TEMPERATURE" "$OUTPUT_DIR" << 'PYEOF' || bfcl_exit_code=$? 
import os import sys @@ -148,7 +150,8 @@ test_category = sys.argv[2] num_threads = int(sys.argv[3]) port = sys.argv[4] api_type = sys.argv[5] -output_dir = sys.argv[6] if len(sys.argv) > 6 and sys.argv[6] else os.getcwd() +temperature = float(sys.argv[6]) +output_dir = sys.argv[7] if len(sys.argv) > 7 and sys.argv[7] else os.getcwd() os.environ["OPENAI_BASE_URL"] = f"http://localhost:{port}/v1" os.environ["OPENAI_API_KEY"] = "dummy" @@ -204,6 +207,7 @@ gen_kwargs["model"] = [model] gen_kwargs["test_category"] = [c.strip() for c in test_category.split(",")] gen_kwargs["skip_server_setup"] = True gen_kwargs["num_threads"] = num_threads +gen_kwargs["temperature"] = temperature generate(**gen_kwargs) # ---- evaluate ---- diff --git a/.buildkite/scripts/upload-nightly-wheels.sh b/.buildkite/scripts/upload-nightly-wheels.sh index cc72cda7d505..8cef31908809 100644 --- a/.buildkite/scripts/upload-nightly-wheels.sh +++ b/.buildkite/scripts/upload-nightly-wheels.sh @@ -2,14 +2,18 @@ set -ex -# Upload a single wheel to S3 (rename linux -> manylinux). +# Upload a single wheel to S3, after detecting and applying the appropriate +# manylinux platform tag with auditwheel. # Index generation is handled separately by generate-and-upload-nightly-index.sh. 
+# shellcheck source=lib/manylinux.sh +source .buildkite/scripts/lib/manylinux.sh + BUCKET="vllm-wheels" SUBPATH=$BUILDKITE_COMMIT S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/" -# ========= collect, rename & upload the wheel ========== +# ========= locate the wheel ========== # Assume wheels are in artifacts/dist/*.whl wheel_files=(artifacts/dist/*.whl) @@ -21,19 +25,9 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then fi wheel="${wheel_files[0]}" -# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31 -# we also accept params as manylinux tag -# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels -manylinux_version="${1:-manylinux_2_31}" +# ========= detect manylinux tag and rename ========== -# Rename 'linux' to the appropriate manylinux version in the wheel filename -if [[ "$wheel" != *"linux"* ]]; then - echo "Error: Wheel filename does not contain 'linux': $wheel" - exit 1 -fi -new_wheel="${wheel/linux/$manylinux_version}" -mv -- "$wheel" "$new_wheel" -wheel="$new_wheel" +wheel="$(apply_manylinux_tag "$wheel")" echo "Renamed wheel to: $wheel" # Extract the version from the wheel diff --git a/.buildkite/scripts/upload-release-wheels-pypi.sh b/.buildkite/scripts/upload-release-wheels-pypi.sh index 058e5bbe4f4c..7e2077a2692c 100644 --- a/.buildkite/scripts/upload-release-wheels-pypi.sh +++ b/.buildkite/scripts/upload-release-wheels-pypi.sh @@ -39,10 +39,11 @@ fi set -x # avoid printing secrets above -# install twine from pypi +# install twine and sdist build prerequisites from pypi python3 -m venv /tmp/vllm-release-env source /tmp/vllm-release-env/bin/activate pip install twine +pip install -r requirements/build/cuda.txt python3 -m twine --version # copy release wheels to local directory diff --git a/.buildkite/scripts/upload-rocm-wheels.sh b/.buildkite/scripts/upload-rocm-wheels.sh index a42848a16ffe..1f3655631204 100755 --- a/.buildkite/scripts/upload-rocm-wheels.sh +++ 
b/.buildkite/scripts/upload-rocm-wheels.sh @@ -20,10 +20,6 @@ BUCKET="${S3_BUCKET:-vllm-wheels}" ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}" S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/" INDICES_OUTPUT_DIR="rocm-indices" -PYTHON="${PYTHON_PROG:-python3}" - -# ROCm uses manylinux_2_35 (Ubuntu 22.04 based) -MANYLINUX_VERSION="manylinux_2_35" echo "========================================" echo "ROCm Wheel Upload Configuration" @@ -34,19 +30,21 @@ echo "Commit: $BUILDKITE_COMMIT" echo "Branch: $BUILDKITE_BRANCH" echo "========================================" -# ======== Part 0: Setup Python ======== +# ======== Part 0: Setup Python and helpers ======== -# Detect if python3.12+ is available -has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0) -if [[ "$has_new_python" -eq 0 ]]; then - # Use new python from docker - # Use --user to ensure files are created with correct ownership (not root) - docker pull python:3-slim - PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3" -fi +# Pick a Python interpreter for index generation -- local if recent +# enough, else a one-shot docker fallback. +# shellcheck source=lib/select-python.sh +source .buildkite/scripts/lib/select-python.sh +select_python -echo "Using python interpreter: $PYTHON" -echo "Python version: $($PYTHON --version)" +# Set up auditwheel-in-a-container for the manylinux retagging step. +# Distinct from select_python: ``manylinux.sh`` deliberately pins both +# the Python and auditwheel versions (the script reads auditwheel +# internals) and so always runs in a known-good container regardless +# of what's on the agent. 
+# shellcheck source=lib/manylinux.sh +source .buildkite/scripts/lib/manylinux.sh # ======== Part 1: Collect and prepare wheels ======== @@ -63,11 +61,18 @@ if [ "$WHEEL_COUNT" -eq 0 ]; then exit 1 fi -# Rename linux to manylinux in wheel filenames +# Detect the appropriate manylinux platform tag for any wheel that still +# carries the generic ``linux_`` tag, and rename it in place. We use +# auditwheel via ``apply_manylinux_tag`` (see lib/manylinux.sh) rather than +# a hard-coded ``manylinux_2_35`` string so that the label tracks the actual +# glibc symbol versions used by the binaries (and stays correct if the +# rocm_base image is rebased). +# +# The ``linux``/``manylinux`` filter below skips both pre-tagged wheels +# (e.g. upstream torch) and pure-Python ``-any.whl`` wheels. for wheel in all-rocm-wheels/*.whl; do if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then - new_wheel="${wheel/linux/$MANYLINUX_VERSION}" - mv -- "$wheel" "$new_wheel" + new_wheel="$(apply_manylinux_tag "$wheel")" echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")" fi done diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 68179dcb68cd..b3c77dcac7c8 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -230,7 +230,6 @@ steps: - tests/entrypoints/llm/test_collective_rpc.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py @@ -272,7 +271,6 @@ steps: - tests/v1/worker/test_worker_memory_snapshot.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - 
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown @@ -395,11 +393,11 @@ steps: # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - - python3 offline_inference/prefix_caching.py + - python3 features/automatic_prefix_caching/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 #---------------------------------------------------------- mi250 · kernels ----------------------------------------------------------# @@ -590,7 +588,6 @@ steps: - vllm/platforms/rocm.py commands: - pip freeze | grep -E 'torch' - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT 
--shard-id=$$BUILDKITE_PARALLEL_JOB - label: Multi-Modal Models (Extended Generation 2) # TBD @@ -621,6 +618,7 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 torch_nightly: true + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -864,7 +862,6 @@ steps: - tests/entrypoints/openai/test_multi_api_servers.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -930,6 +927,7 @@ steps: - tests/renderers - tests/standalone_tests/lazy_imports.py - tests/tokenizers_ + - tests/reasoning - tests/tool_parsers - tests/transformers_utils - tests/config @@ -942,7 +940,7 @@ steps: - pytest -v -s -m 'cpu_test' multimodal - pytest -v -s renderers - pytest -v -s tokenizers_ - - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py --ignore=reasoning/test_gemma4_reasoning_parser.py + - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py - pytest -v -s tool_parsers - pytest -v -s transformers_utils - pytest -v -s config @@ -1108,6 +1106,7 @@ steps: - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/passes/distributed/test_tp2_ar_rms.py::test_tp2_ar_rms_fusions #----------------------------------------------------------- mi300 · cuda ------------------------------------------------------------# @@ -1168,13 +1167,12 @@ steps: - vllm/v1/attention/backends/ - vllm/v1/attention/selector.py - 
tests/distributed/test_context_parallel.py - - examples/offline_inference/data_parallel.py + - examples/features/data_parallel/data_parallel_offline.py - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s tests/distributed/test_context_parallel.py - - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization + - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization - label: Distributed Tests (4xA100-4xMI300) # TBD timeout_in_minutes: 180 @@ -1186,7 +1184,6 @@ steps: source_file_dependencies: - vllm/ commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s distributed/test_custom_all_reduce.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' @@ -1203,17 +1200,16 @@ steps: - tests/distributed/test_torchrun_example.py - tests/distributed/test_torchrun_example_moe.py - examples/rl/ - - tests/examples/offline_inference/data_parallel.py + - tests/examples/features/data_parallel/data_parallel_offline.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 
distributed/test_torchrun_example_moe.py - - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - python3 ../examples/features/data_parallel/data_parallel_offline.py --enforce-eager # rlhf examples - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_nccl.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_ipc.py @@ -1252,7 +1248,6 @@ steps: - vllm/platforms/rocm.py commands: - export VLLM_USE_RAY_V2_EXECUTOR_BACKEND=1 - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s distributed/test_ray_v2_executor.py - pytest -v -s distributed/test_ray_v2_executor_e2e.py - pytest -v -s distributed/test_pipeline_parallel.py -k "ray" @@ -1266,7 +1261,7 @@ steps: optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - - examples/offline_inference/torchrun_dp_example.py + - examples/features/torchrun/torchrun_dp_example_offline.py - vllm/config/parallel.py - vllm/distributed/ - vllm/v1/engine/llm_engine.py @@ -1274,8 +1269,7 @@ steps: - vllm/v1/worker/gpu_worker.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + - torchrun --nproc-per-node=8 ../examples/features/torchrun/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep #-------------------------------------------------------- mi300 · entrypoints --------------------------------------------------------# @@ -1654,11 +1648,11 @@ steps: # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - - python3 offline_inference/prefix_caching.py + - python3 features/automatic_prefix_caching/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model 
facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 #---------------------------------------------------------- mi300 · kernels ----------------------------------------------------------# @@ -1802,6 +1796,7 @@ steps: timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] agent_pool: mi300_1 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -1843,6 +1838,7 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] agent_pool: mi300_1 torch_nightly: true + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2203,7 +2199,6 @@ steps: timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] agent_pool: mi300_1 - optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2280,7 +2275,6 @@ steps: - 
tests/entrypoints/openai/test_multi_api_servers.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -2300,9 +2294,8 @@ steps: - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py - - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 pytest -v -s tests/distributed/test_weight_transfer.py - pytest -v -s tests/distributed/test_packed_tensor.py @@ -2363,7 +2356,6 @@ steps: - tests/distributed/test_utils - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -2493,7 +2485,6 @@ steps: - tests/entrypoints/llm/test_collective_rpc.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py @@ -2518,7 +2509,6 @@ steps: - tests/v1/worker/test_worker_memory_snapshot.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 
distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown @@ -2539,7 +2529,6 @@ steps: - tests/distributed/test_multiproc_executor.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py @@ -2627,6 +2616,7 @@ steps: agent_pool: mi325_1 torch_nightly: true parallelism: 2 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2652,6 +2642,7 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 torch_nightly: true + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2713,11 +2704,10 @@ steps: - vllm/v1/attention/selector.py - tests/distributed/test_context_parallel.py - tests/v1/distributed/test_dbo.py - - examples/offline_inference/data_parallel.py + - examples/features/data_parallel/data_parallel_offline.py - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/v1/distributed/test_dbo.py @@ -2748,6 +2738,7 @@ steps: agent_pool: mi355_1 fast_check: true torch_nightly: true + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2763,6 +2754,7 @@ steps: agent_pool: mi355_1 fast_check: true torch_nightly: true + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2937,11 +2929,11 @@ steps: # Pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - - python3 offline_inference/prefix_caching.py + - python3 
features/automatic_prefix_caching/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 #---------------------------------------------------------- mi355 · kernels ----------------------------------------------------------# @@ -3059,6 +3051,7 @@ steps: timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -3258,6 +3251,7 @@ steps: timeout_in_minutes: 60 mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ 
-3283,6 +3277,7 @@ steps: timeout_in_minutes: 60 mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml index 4bcf116f2756..d3947a03162b 100644 --- a/.buildkite/test_areas/attention.yaml +++ b/.buildkite/test_areas/attention.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: V1 attention (H100) + key: v1-attention-h100 timeout_in_minutes: 30 device: h100 source_file_dependencies: @@ -14,8 +15,9 @@ steps: - pytest -v -s v1/attention - label: V1 attention (B200) + key: v1-attention-b200 timeout_in_minutes: 30 - device: b200 + device: b200-k8s source_file_dependencies: - vllm/config/attention.py - vllm/model_executor/layers/attention diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml index 042734e8433b..5d547cd48637 100644 --- a/.buildkite/test_areas/basic_correctness.yaml +++ b/.buildkite/test_areas/basic_correctness.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Basic Correctness + key: basic-correctness timeout_in_minutes: 30 device: h200_18gb source_file_dependencies: diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml index 4cda6fff1443..85f804780179 100644 --- a/.buildkite/test_areas/benchmarks.yaml +++ b/.buildkite/test_areas/benchmarks.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Benchmarks CLI Test + key: benchmarks-cli-test timeout_in_minutes: 20 device: h200_18gb source_file_dependencies: @@ -12,7 +13,8 @@ steps: - pytest -v -s benchmarks/ - label: Attention Benchmarks Smoke Test (B200) - device: b200 + key: attention-benchmarks-smoke-test-b200 + device: b200-k8s num_gpus: 2 optional: true working_dir: "/vllm-workspace/" diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml 
index aa46447c24af..01248738d519 100644 --- a/.buildkite/test_areas/compile.yaml +++ b/.buildkite/test_areas/compile.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Sequence Parallel Correctness Tests (2 GPUs) + key: sequence-parallel-correctness-tests-2-gpus timeout_in_minutes: 50 working_dir: "/vllm-workspace/" num_devices: 2 @@ -17,6 +18,7 @@ steps: - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py - label: Sequence Parallel Correctness Tests (2xH100) + key: sequence-parallel-correctness-tests-2xh100 timeout_in_minutes: 50 working_dir: "/vllm-workspace/" device: h100 @@ -27,6 +29,7 @@ steps: - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py - label: AsyncTP Correctness Tests (2xH100) + key: asynctp-correctness-tests-2xh100 timeout_in_minutes: 50 working_dir: "/vllm-workspace/" device: h100 @@ -37,9 +40,10 @@ steps: - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py - label: AsyncTP Correctness Tests (B200) + key: asynctp-correctness-tests-b200 timeout_in_minutes: 50 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s optional: true num_devices: 2 commands: @@ -47,6 +51,7 @@ steps: - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py - label: Distributed Compile Unit Tests (2xH100) + key: distributed-compile-unit-tests-2xh100 timeout_in_minutes: 20 working_dir: "/vllm-workspace/" device: h100 @@ -60,9 +65,10 @@ steps: - pytest -s -v tests/compile/passes/distributed - label: Fusion and Compile Unit Tests (2xB200) + key: fusion-and-compile-unit-tests-2xb200 timeout_in_minutes: 20 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s source_file_dependencies: - csrc/quantization/fp4/ - vllm/model_executor/layers/quantization/ @@ -89,6 +95,7 @@ steps: - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - label: Fusion E2E Quick (H100) + key: fusion-e2e-quick-h100 timeout_in_minutes: 15 working_dir: "/vllm-workspace/" device: h100 @@ 
-107,6 +114,7 @@ steps: - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)" - label: Fusion E2E Config Sweep (H100) + key: fusion-e2e-config-sweep-h100 timeout_in_minutes: 30 working_dir: "/vllm-workspace/" device: h100 @@ -126,9 +134,10 @@ steps: - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" - label: Fusion E2E Config Sweep (B200) + key: fusion-e2e-config-sweep-b200 timeout_in_minutes: 30 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s num_devices: 1 optional: true commands: @@ -139,6 +148,7 @@ steps: - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)" - label: Fusion E2E TP2 Quick (H100) + key: fusion-e2e-tp2-quick-h100 timeout_in_minutes: 20 working_dir: "/vllm-workspace/" device: h100 @@ -156,6 +166,7 @@ steps: - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))" - label: Fusion E2E TP2 AR-RMS Config Sweep (H100) + key: fusion-e2e-tp2-ar-rms-config-sweep-h100 timeout_in_minutes: 40 working_dir: "/vllm-workspace/" device: h100 @@ -175,6 +186,7 @@ steps: - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3" - label: Fusion E2E TP2 AsyncTP Config Sweep (H100) + key: fusion-e2e-tp2-asynctp-config-sweep-h100 timeout_in_minutes: 40 working_dir: "/vllm-workspace/" device: h100 @@ -194,9 +206,10 @@ steps: - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3" - label: Fusion E2E TP2 (B200) + key: fusion-e2e-tp2-b200 timeout_in_minutes: 20 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s num_devices: 2 source_file_dependencies: - csrc/quantization/ diff --git a/.buildkite/test_areas/cuda.yaml b/.buildkite/test_areas/cuda.yaml index 
4d1efdb13c88..9cb19ac40801 100644 --- a/.buildkite/test_areas/cuda.yaml +++ b/.buildkite/test_areas/cuda.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Platform Tests (CUDA) + key: platform-tests-cuda timeout_in_minutes: 15 device: h200_18gb source_file_dependencies: @@ -13,6 +14,7 @@ steps: - pytest -v -s cuda/test_platform_no_cuda_init.py - label: Cudagraph + key: cudagraph timeout_in_minutes: 20 source_file_dependencies: - tests/v1/cudagraph diff --git a/.buildkite/test_areas/disaggregated.yaml b/.buildkite/test_areas/disaggregated.yaml index a10fda41ef0d..e68b9e1add8b 100644 --- a/.buildkite/test_areas/disaggregated.yaml +++ b/.buildkite/test_areas/disaggregated.yaml @@ -3,65 +3,71 @@ depends_on: - image-build steps: - label: Distributed NixlConnector PD accuracy (4 GPUs) + key: distributed-nixlconnector-pd-accuracy-4-gpus timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - label: Distributed FlashInfer NixlConnector PD accuracy (4 GPUs) + key: distributed-flashinfer-nixlconnector-pd-accuracy-4-gpus timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - FLASHINFER=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs) + key: dp-ep-distributed-nixlconnector-pd-accuracy-tests-4-gpus timeout_in_minutes: 30 
working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) + key: crosslayer-kv-layout-distributed-nixlconnector-pd-accuracy-tests-4-gpus timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - label: Hybrid SSM NixlConnector PD accuracy tests (4 GPUs) + key: hybrid-ssm-nixlconnector-pd-accuracy-tests-4-gpus timeout_in_minutes: 20 working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - HYBRID_SSM=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - label: MultiConnector (Nixl+Offloading) PD accuracy (2 GPUs) + key: multiconnector-nixl-offloading-pd-accuracy-2-gpus timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" num_devices: 2 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py - 
vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py - vllm/distributed/kv_transfer/kv_connector/v1/offloading/ @@ -71,12 +77,13 @@ steps: - bash v1/kv_connector/nixl_integration/run_multi_connector_accuracy_test.sh - label: NixlConnector PD + Spec Decode acceptance (2 GPUs) + key: nixlconnector-pd-spec-decode-acceptance-2-gpus timeout_in_minutes: 30 device: a100 working_dir: "/vllm-workspace/tests" num_devices: 2 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - vllm/v1/worker/kv_connector_model_runner_mixin.py - tests/v1/kv_connector/nixl_integration/ commands: @@ -84,11 +91,12 @@ steps: - bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh - label: MultiConnector (Nixl+Offloading) PD edge cases (2 GPUs) + key: multiconnector-nixl-offloading-pd-edge-cases-2-gpus timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" num_devices: 2 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py - vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py - vllm/distributed/kv_transfer/kv_connector/v1/offloading/ diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 093f3ab4fe1f..8aa41a9a26ab 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Distributed Comm Ops + key: distributed-comm-ops timeout_in_minutes: 20 working_dir: "/vllm-workspace/tests" num_devices: 2 @@ -16,6 +17,7 @@ steps: - pytest -v -s distributed/test_shm_storage.py - label: Distributed DP Tests (2 GPUs) + key: distributed-dp-tests-2-gpus timeout_in_minutes: 20 working_dir: "/vllm-workspace/tests" num_devices: 2 @@ -37,6 +39,7 @@ steps: - DP_SIZE=2 pytest -v 
-s entrypoints/openai/test_multi_api_servers.py - label: Distributed Compile + RPC Tests (2 GPUs) + key: distributed-compile-rpc-tests-2-gpus timeout_in_minutes: 20 working_dir: "/vllm-workspace/tests" num_devices: 2 @@ -59,6 +62,7 @@ steps: - pytest -v -s ./compile/test_wrapper.py - label: Distributed Torchrun + Shutdown Tests (2 GPUs) + key: distributed-torchrun-shutdown-tests-2-gpus timeout_in_minutes: 20 working_dir: "/vllm-workspace/tests" num_devices: 2 @@ -81,6 +85,7 @@ steps: - pytest -v -s v1/worker/test_worker_memory_snapshot.py - label: Distributed Torchrun + Examples (4 GPUs) + key: distributed-torchrun-examples-4-gpus timeout_in_minutes: 30 working_dir: "/vllm-workspace" num_devices: 4 @@ -88,9 +93,8 @@ steps: - vllm/distributed/ - tests/distributed/test_torchrun_example.py - tests/distributed/test_torchrun_example_moe.py - - examples/offline_inference/rlhf_colocate.py - examples/rl/ - - tests/examples/offline_inference/data_parallel.py + - tests/examples/features/data_parallel/data_parallel_offline.py commands: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 @@ -107,12 +111,13 @@ steps: # test with torchrun tp=2 and dp=2 with ep - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 tests/distributed/test_torchrun_example_moe.py # test with internal dp - - python3 examples/offline_inference/data_parallel.py --enforce-eager + - python3 examples/features/data_parallel/data_parallel_offline.py --enforce-eager # rlhf examples - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_nccl.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_ipc.py - label: Distributed DP Tests (4 GPUs) + key: distributed-dp-tests-4-gpus timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" num_devices: 4 @@ -133,6 +138,7 @@ steps: - pytest -v -s distributed/test_utils.py - label: Distributed Compile + Comm (4 GPUs) + key: distributed-compile-comm-4-gpus timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" 
num_devices: 4 @@ -154,24 +160,28 @@ steps: - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node - label: Distributed Tests (8 GPUs)(H100) + key: distributed-tests-8-gpus-h100 timeout_in_minutes: 10 device: h100 num_devices: 8 working_dir: "/vllm-workspace/tests" source_file_dependencies: - - examples/offline_inference/torchrun_dp_example.py + - examples/features/torchrun/torchrun_dp_example_offline.py - vllm/config/parallel.py - vllm/distributed/ - vllm/v1/engine/llm_engine.py - vllm/v1/executor/uniproc_executor.py - vllm/v1/worker/gpu_worker.py + - tests/distributed/test_mnnvl_alltoall.py + commands: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 # test with torchrun tp=2 and dp=4 with ep - - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + - torchrun --nproc-per-node=8 ../examples/features/torchrun/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - label: Distributed Tests (4 GPUs)(A100) + key: distributed-tests-4-gpus-a100 device: a100 optional: true num_devices: 4 @@ -186,6 +196,7 @@ steps: - pytest -v -s -x lora/test_mixtral.py - label: Distributed Tests (2 GPUs)(H100) + key: distributed-tests-2-gpus-h100 timeout_in_minutes: 15 device: h100 optional: true @@ -194,13 +205,14 @@ steps: commands: - pytest -v -s tests/distributed/test_context_parallel.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py - - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s 
tests/v1/distributed/test_dbo.py - VLLM_ALLOW_INSECURE_SERIALIZATION=1 pytest -v -s tests/distributed/test_weight_transfer.py - pytest -v -s tests/distributed/test_packed_tensor.py - label: Distributed Tests (2 GPUs)(B200) - device: b200 + key: distributed-tests-2-gpus-b200 + device: b200-k8s optional: true working_dir: "/vllm-workspace/" num_devices: 2 @@ -208,8 +220,12 @@ steps: - pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - pytest -v -s tests/v1/distributed/test_dbo.py + - pytest -v -s tests/distributed/test_mnnvl_alltoall.py + + - label: 2 Node Test (4 GPUs) + key: 2-node-test-4-gpus timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" num_devices: 2 @@ -222,11 +238,12 @@ steps: - vllm/executor/ - vllm/model_executor/models/ - tests/distributed/ - - tests/examples/offline_inference/data_parallel.py + - tests/examples/features/data_parallel/data_parallel_offline.py commands: - - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d 
--rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/features/data_parallel/data_parallel_offline.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/features/data_parallel/data_parallel_offline.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" - label: Pipeline + Context Parallelism (4 GPUs) + key: pipeline-context-parallelism-4-gpus timeout_in_minutes: 60 working_dir: "/vllm-workspace/tests" num_devices: 4 @@ -241,6 +258,7 @@ steps: - pytest -v -s distributed/test_pipeline_parallel.py - label: RayExecutorV2 (4 GPUs) + key: rayexecutorv2-4-gpus timeout_in_minutes: 60 working_dir: "/vllm-workspace/tests" num_devices: 4 
diff --git a/.buildkite/test_areas/docker.yaml b/.buildkite/test_areas/docker.yaml new file mode 100644 index 000000000000..9bf96221abe0 --- /dev/null +++ b/.buildkite/test_areas/docker.yaml @@ -0,0 +1,16 @@ +group: Docker +depends_on: + - image-build-cpu +steps: +- label: Docker Build Metadata + timeout_in_minutes: 10 + device: cpu-small + source_file_dependencies: + - .buildkite/release-pipeline.yaml + - .buildkite/scripts/docker-build-metadata-args.sh + - docker/Dockerfile + - docker/Dockerfile.cpu + - docker/docker-bake.hcl + - tests/tools/test_docker_build_metadata_args.py + commands: + - pytest -v -s tools/test_docker_build_metadata_args.py diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml index 857fefd268a4..bb8aa14eac18 100644 --- a/.buildkite/test_areas/e2e_integration.yaml +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: DeepSeek V2-Lite Accuracy + key: deepseek-v2-lite-accuracy timeout_in_minutes: 60 device: h100 optional: true @@ -12,6 +13,7 @@ steps: - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 - label: Qwen3-30B-A3B-FP8-block Accuracy + key: qwen3-30b-a3b-fp8-block-accuracy timeout_in_minutes: 60 device: h100 optional: true @@ -21,8 +23,9 @@ steps: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 - label: Qwen3-30B-A3B-FP8-block Accuracy (B200) + key: qwen3-30b-a3b-fp8-block-accuracy-b200 timeout_in_minutes: 60 - device: b200 + device: b200-k8s optional: true num_devices: 2 working_dir: "/vllm-workspace" @@ -30,6 +33,7 @@ steps: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - label: Qwen3-30B-A3B-FP8 DP4 Async EPLB Accuracy + key: qwen3-30b-a3b-fp8-dp4-async-eplb-accuracy timeout_in_minutes: 60 device: h100 optional: true @@ -39,6 +43,7 @@ steps: - bash 
.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_dp4_async_eplb.sh 0.8 200 8050 - label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100) + key: deepseek-v2-lite-prefetch-offload-accuracy-h100 timeout_in_minutes: 60 device: h100 optional: true diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml index 5e4361ec9ad6..cf0f028255d2 100644 --- a/.buildkite/test_areas/engine.yaml +++ b/.buildkite/test_areas/engine.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Engine + key: engine timeout_in_minutes: 15 device: h200_18gb source_file_dependencies: @@ -12,10 +13,12 @@ steps: - tests/test_config - tests/test_logger - tests/test_vllm_port + - tests/test_jit_monitor.py commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py test_jit_monitor.py - label: Engine (1 GPU) + key: engine-1-gpu timeout_in_minutes: 30 source_file_dependencies: - vllm/v1/engine/ @@ -25,6 +28,7 @@ steps: - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py - label: e2e Scheduling (1 GPU) + key: e2e-scheduling-1-gpu timeout_in_minutes: 30 device: h200_18gb source_file_dependencies: @@ -34,6 +38,7 @@ steps: - pytest -v -s v1/e2e/general/test_async_scheduling.py - label: e2e Core (1 GPU) + key: e2e-core-1-gpu timeout_in_minutes: 30 source_file_dependencies: - vllm/v1/ @@ -42,6 +47,7 @@ steps: - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py - label: V1 e2e (2 GPUs) + key: v1-e2e-2-gpus timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability optional: true num_devices: 2 @@ -51,13 +57,9 @@ steps: commands: # Only run tests that need exactly 2 GPUs - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" - mirror: - amd: - device: mi325_2 - depends_on: - - image-build-amd - label: V1 e2e (4 GPUs) + key: 
v1-e2e-4-gpus timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability optional: true num_devices: 4 @@ -67,13 +69,9 @@ steps: commands: # Only run tests that need 4 GPUs - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" - mirror: - amd: - device: mi325_4 - depends_on: - - image-build-amd - label: V1 e2e (4xH100) + key: v1-e2e-4xh100 timeout_in_minutes: 60 device: h100 num_devices: 4 diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index 8c2b529a8068..ba92d3a3aec0 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -2,7 +2,8 @@ group: Entrypoints depends_on: - image-build steps: -- label: Entrypoints Unit Tests +- label: Entrypoints Unit Tests + key: entrypoints-unit-tests timeout_in_minutes: 10 working_dir: "/vllm-workspace/tests" source_file_dependencies: @@ -13,6 +14,7 @@ steps: - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration (LLM) + key: entrypoints-integration-llm timeout_in_minutes: 40 working_dir: "/vllm-workspace/tests" source_file_dependencies: @@ -24,8 +26,14 @@ steps: - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + mirror: + amd: + device: mi300_1 + depends_on: + - image-build-amd - label: Entrypoints Integration (API Server openai - Part 1) + key: entrypoints-integration-api-server-openai-part-1 timeout_in_minutes: 50 working_dir: "/vllm-workspace/tests" source_file_dependencies: @@ -35,14 +43,10 @@ 
steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: Entrypoints Integration (API Server openai - Part 2) + key: entrypoints-integration-api-server-openai-part-2 timeout_in_minutes: 50 working_dir: "/vllm-workspace/tests" source_file_dependencies: @@ -53,13 +57,9 @@ steps: - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/openai/speech_to_text/ - pytest -v -s entrypoints/test_chat_utils.py - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: Entrypoints Integration (API Server openai - Part 3) + key: entrypoints-integration-api-server-openai-part-3 timeout_in_minutes: 50 device: h200_18gb working_dir: "/vllm-workspace/tests" @@ -72,6 +72,7 @@ steps: - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py - label: Entrypoints Integration (API Server 2) + key: entrypoints-integration-api-server-2 timeout_in_minutes: 130 working_dir: "/vllm-workspace/tests" source_file_dependencies: @@ -86,6 +87,7 @@ steps: - pytest -v -s tool_use - label: Entrypoints Integration (Pooling) + key: entrypoints-integration-pooling timeout_in_minutes: 50 working_dir: "/vllm-workspace/tests" source_file_dependencies: @@ -96,6 +98,7 @@ steps: - pytest -v -s entrypoints/pooling - label: Entrypoints Integration (Responses API) + key: entrypoints-integration-responses-api timeout_in_minutes: 50 working_dir: 
"/vllm-workspace/tests" source_file_dependencies: @@ -105,6 +108,7 @@ steps: - pytest -v -s entrypoints/openai/responses - label: OpenAI API Correctness + key: openai-api-correctness timeout_in_minutes: 30 device: h200_18gb source_file_dependencies: diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml index c2adf52a2d57..0f7ab0d7157c 100644 --- a/.buildkite/test_areas/expert_parallelism.yaml +++ b/.buildkite/test_areas/expert_parallelism.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: EPLB Algorithm + key: eplb-algorithm timeout_in_minutes: 15 device: h200_18gb working_dir: "/vllm-workspace/tests" @@ -15,6 +16,7 @@ steps: - pytest -v -s distributed/test_eplb_utils.py - label: EPLB Execution # 17min + key: eplb-execution timeout_in_minutes: 27 working_dir: "/vllm-workspace/tests" num_devices: 4 @@ -26,6 +28,7 @@ steps: - pytest -v -s distributed/test_eplb_spec_decode.py - label: Elastic EP Scaling Test + key: elastic-ep-scaling-test timeout_in_minutes: 20 device: h100 working_dir: "/vllm-workspace/tests" diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml index 86e09f3de4b5..34e1e4832d9d 100644 --- a/.buildkite/test_areas/kernels.yaml +++ b/.buildkite/test_areas/kernels.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: vLLM IR Tests + key: vllm-ir-tests timeout_in_minutes: 10 device: h200_18gb working_dir: "/vllm-workspace/" @@ -14,6 +15,7 @@ steps: - pytest -v -s tests/kernels/ir - label: Kernels Core Operation Test + key: kernels-core-operation-test timeout_in_minutes: 75 source_file_dependencies: - csrc/ @@ -23,6 +25,7 @@ steps: - pytest -v -s kernels/core --ignore=kernels/core/test_minimax_reduce_rms.py kernels/test_concat_mla_q.py - label: Kernels MiniMax Reduce RMS Test (2 GPUs) + key: kernels-minimax-reduce-rms-test-2-gpus timeout_in_minutes: 15 num_devices: 2 device: h100 @@ -36,6 +39,7 @@ steps: - pytest -v -s 
kernels/core/test_minimax_reduce_rms.py - label: Kernels Attention Test %N + key: kernels-attention-test timeout_in_minutes: 35 source_file_dependencies: - csrc/attention/ @@ -49,6 +53,7 @@ steps: parallelism: 2 - label: Kernels Quantization Test %N + key: kernels-quantization-test timeout_in_minutes: 90 source_file_dependencies: - csrc/quantization/ @@ -59,6 +64,7 @@ steps: parallelism: 2 - label: Kernels MoE Test %N + key: kernels-moe-test timeout_in_minutes: 25 source_file_dependencies: - csrc/quantization/cutlass_w8a8/moe/ @@ -74,6 +80,7 @@ steps: parallelism: 5 - label: Kernels Mamba Test + key: kernels-mamba-test timeout_in_minutes: 45 source_file_dependencies: - csrc/mamba/ @@ -82,7 +89,18 @@ steps: commands: - pytest -v -s kernels/mamba +- label: Kernels KDA Test + timeout_in_minutes: 20 + source_file_dependencies: + - vllm/model_executor/layers/fla/ops/kda.py + - vllm/model_executor/layers/fla/ops/chunk_delta_h.py + - vllm/model_executor/layers/fla/ops/l2norm.py + - tests/kernels/test_kda.py + commands: + - pytest -v -s kernels/test_kda.py + - label: Kernels DeepGEMM Test (H100) + key: kernels-deepgemm-test-h100 timeout_in_minutes: 45 device: h100 num_devices: 1 @@ -104,9 +122,10 @@ steps: - pytest -v -s quantization/test_cutlass_w4a16.py - label: Kernels (B200) + key: kernels-b200 timeout_in_minutes: 30 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s # optional: true source_file_dependencies: - csrc/quantization/fp4/ @@ -152,6 +171,7 @@ steps: - pytest -v -s tests/models/quantization/test_nvfp4.py - label: Kernels Helion Test + key: kernels-helion-test timeout_in_minutes: 30 device: h100 source_file_dependencies: @@ -163,6 +183,7 @@ steps: - label: Kernels FP8 MoE Test (1 H100) + key: kernels-fp8-moe-test-1-h100 timeout_in_minutes: 90 device: h100 num_devices: 1 @@ -179,6 +200,7 @@ steps: - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py - label: Kernels FP8 MoE Test (2 H100s) + key: kernels-fp8-moe-test-2-h100s timeout_in_minutes: 
90 device: h100 num_devices: 2 @@ -188,8 +210,9 @@ steps: - pytest -v -s kernels/moe/test_deepep_moe.py - label: Kernels Fp4 MoE Test (B200) + key: kernels-fp4-moe-test-b200 timeout_in_minutes: 60 - device: b200 + device: b200-k8s num_devices: 1 optional: true commands: @@ -200,6 +223,7 @@ steps: - label: Kernels FusedMoE Layer Test (2 H100s) + key: kernels-fusedmoe-layer-test-2-h100s timeout_in_minutes: 90 device: h100 num_devices: 2 @@ -216,6 +240,7 @@ steps: - label: Kernels FusedMoE Layer Test (2 B200s) + key: kernels-fusedmoe-layer-test-2-b200s timeout_in_minutes: 90 device: b200 num_devices: 2 diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml index a07d702cf3ce..e5a163d17c7e 100644 --- a/.buildkite/test_areas/lm_eval.yaml +++ b/.buildkite/test_areas/lm_eval.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: LM Eval Small Models + key: lm-eval-small-models timeout_in_minutes: 75 source_file_dependencies: - csrc/ @@ -24,6 +25,7 @@ steps: # - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 - label: LM Eval Large Models (4 GPUs)(H100) + key: lm-eval-large-models-4-gpus-h100 device: h100 optional: true num_devices: 4 @@ -36,6 +38,7 @@ steps: - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - label: LM Eval Small Models (B200) + key: lm-eval-small-models-b200 timeout_in_minutes: 120 device: b200 optional: true @@ -46,8 +49,9 @@ steps: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt - label: LM Eval Qwen3.5 Models (B200) + key: lm-eval-qwen3-5-models-b200 timeout_in_minutes: 120 - device: b200 + device: b200-k8s optional: true num_devices: 2 source_file_dependencies: @@ -62,6 +66,7 @@ steps: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt - label: LM Eval Large Models (H200) + key: 
lm-eval-large-models-h200 timeout_in_minutes: 60 device: h200 optional: true @@ -70,6 +75,7 @@ steps: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt - label: MoE Refactor Integration Test (H100 - TEMPORARY) + key: moe-refactor-integration-test-h100-temporary device: h100 optional: true num_devices: 2 @@ -77,13 +83,15 @@ steps: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt - label: MoE Refactor Integration Test (B200 - TEMPORARY) - device: b200 + key: moe-refactor-integration-test-b200-temporary + device: b200-k8s optional: true num_devices: 2 commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt - label: MoE Refactor Integration Test (B200 DP - TEMPORARY) + key: moe-refactor-integration-test-b200-dp-temporary device: b200 optional: true num_devices: 2 @@ -92,6 +100,7 @@ steps: - label: LM Eval TurboQuant KV Cache + key: lm-eval-turboquant-kv-cache timeout_in_minutes: 75 source_file_dependencies: - vllm/model_executor/layers/quantization/turboquant/ @@ -102,6 +111,7 @@ steps: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/models-turboquant.txt - label: GPQA Eval (GPT-OSS) (H100) + key: gpqa-eval-gpt-oss-h100 timeout_in_minutes: 120 device: h100 optional: true @@ -115,6 +125,7 @@ steps: - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-h100.txt - label: GPQA Eval (GPT-OSS) (B200) + key: gpqa-eval-gpt-oss-b200 timeout_in_minutes: 120 device: b200 optional: true @@ -126,3 +137,10 @@ steps: commands: - uv pip install --system 'gpt-oss[eval]==0.0.5' - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-b200.txt + +- label: MRCR Eval Small Models + timeout_in_minutes: 30 + source_file_dependencies: + - tests/evals/mrcr/ + commands: + - pytest -s -v 
evals/mrcr/test_mrcr_correctness.py --config-list-file=evals/mrcr/configs/models-small.txt diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml index 21f392ff737b..f540eb2fcc2a 100644 --- a/.buildkite/test_areas/lora.yaml +++ b/.buildkite/test_areas/lora.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: LoRA %N + key: lora timeout_in_minutes: 30 source_file_dependencies: - vllm/lora @@ -13,6 +14,7 @@ steps: - label: LoRA TP (Distributed) + key: lora-tp-distributed timeout_in_minutes: 30 num_devices: 4 source_file_dependencies: diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index d0930be156d2..c34d4c10b49a 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: V1 Spec Decode + key: v1-spec-decode timeout_in_minutes: 30 source_file_dependencies: - vllm/ @@ -11,13 +12,9 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn # TODO: create another `optional` test group for slow tests - pytest -v -s -m 'not slow_test' v1/spec_decode - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: V1 Sample + Logits + key: v1-sample-logits timeout_in_minutes: 30 device: h200_18gb source_file_dependencies: @@ -36,11 +33,12 @@ steps: - pytest -v -s v1/test_outputs.py mirror: amd: - device: mi325_1 + device: mi300_1 depends_on: - image-build-amd - label: V1 Core + KV + Metrics + key: v1-core-kv-metrics timeout_in_minutes: 30 source_file_dependencies: - vllm/ @@ -64,13 +62,9 @@ steps: # Integration test for streaming correctness (requires special branch). 
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: V1 Others (CPU) + key: v1-others-cpu depends_on: - image-build-cpu source_file_dependencies: @@ -86,6 +80,7 @@ steps: - pytest -v -s -m 'cpu_test' v1/metrics - label: Regression + key: regression timeout_in_minutes: 20 device: h200_18gb source_file_dependencies: @@ -97,6 +92,7 @@ steps: working_dir: "/vllm-workspace/tests" # optional - label: Examples + key: examples timeout_in_minutes: 45 working_dir: "/vllm-workspace/examples" source_file_dependencies: @@ -120,14 +116,15 @@ steps: # for pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - - python3 offline_inference/prefix_caching.py + - python3 features/automatic_prefix_caching/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - - python3 offline_inference/spec_decode.py --test --method eagle3 
--num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - label: Metrics, Tracing (2 GPUs) + key: metrics-tracing-2-gpus timeout_in_minutes: 20 num_devices: 2 source_file_dependencies: @@ -142,6 +139,7 @@ steps: - pytest -v -s v1/tracing - label: Python-only Installation + key: python-only-installation depends_on: ~ timeout_in_minutes: 20 source_file_dependencies: @@ -151,6 +149,7 @@ steps: - bash standalone_tests/python_only_compile.sh - label: Async Engine, Inputs, Utils, Worker + key: async-engine-inputs-utils-worker timeout_in_minutes: 50 source_file_dependencies: - vllm/ @@ -163,7 +162,8 @@ steps: - pytest -v -s utils_ - label: Async Engine, Inputs, Utils, Worker, Config (CPU) - depends_on: + key: async-engine-inputs-utils-worker-config-cpu + depends_on: - image-build-cpu timeout_in_minutes: 30 source_file_dependencies: @@ -190,12 +190,13 @@ steps: - pytest -v -s -m 'cpu_test' multimodal - pytest -v -s renderers - pytest -v -s tokenizers_ - - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py --ignore=reasoning/test_gemma4_reasoning_parser.py + - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py - pytest -v -s tool_parsers - pytest -v -s transformers_utils - pytest -v -s config - label: Batch Invariance (H100) + key: batch-invariance-h100 timeout_in_minutes: 30 device: h100 source_file_dependencies: @@ -211,8 +212,9 @@ steps: - VLLM_TEST_MODEL=Qwen/Qwen3-30B-A3B-Thinking-2507-FP8 pytest -v -s 
v1/determinism/test_batch_invariance.py::test_v1_generation_is_deterministic_across_batch_sizes_with_needle[FLASH_ATTN] - label: Batch Invariance (B200) + key: batch-invariance-b200 timeout_in_minutes: 30 - device: b200 + device: b200-k8s source_file_dependencies: - vllm/v1/attention - vllm/model_executor/layers @@ -227,6 +229,7 @@ steps: - pytest -v -s v1/determinism/test_nvfp4_batch_invariant.py - label: Acceptance Length Test (Large Models) # optional + key: acceptance-length-test-large-models timeout_in_minutes: 25 gpu: h100 optional: true diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml index 212abfdbb906..c41ef8a7110d 100644 --- a/.buildkite/test_areas/model_executor.yaml +++ b/.buildkite/test_areas/model_executor.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Model Executor + key: model-executor timeout_in_minutes: 35 source_file_dependencies: - vllm/engine/arg_utils.py diff --git a/.buildkite/test_areas/model_runner_v2.yaml b/.buildkite/test_areas/model_runner_v2.yaml index 2b88c00d6b77..6a4338a5e40a 100644 --- a/.buildkite/test_areas/model_runner_v2.yaml +++ b/.buildkite/test_areas/model_runner_v2.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Model Runner V2 Core Tests + key: model-runner-v2-core-tests timeout_in_minutes: 45 source_file_dependencies: - vllm/v1/worker/gpu/ @@ -25,14 +26,16 @@ steps: - pytest -v -s entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0" - label: Model Runner V2 Examples + key: model-runner-v2-examples timeout_in_minutes: 45 working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/v1/worker/gpu/ - vllm/v1/core/sched/ - vllm/v1/worker/gpu_worker.py - - examples/offline_inference/ - examples/basic/offline_inference/ + - examples/generate/multimodal/ + - examples/features/ - 
examples/pooling/embed/vision_embedding_offline.py - examples/others/tensorize_vllm_model.py commands: @@ -51,14 +54,15 @@ steps: # for pooling models - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - - python3 offline_inference/prefix_caching.py + - python3 features/automatic_prefix_caching/prefix_caching_offline.py - python3 offline_inference/llm_engine_example.py - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 - label: Model Runner V2 Distributed (2 GPUs) + key: model-runner-v2-distributed-2-gpus timeout_in_minutes: 45 working_dir: "/vllm-workspace/tests" num_devices: 2 @@ -79,6 +83,7 @@ steps: - 
TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - label: Model Runner V2 Pipeline Parallelism (4 GPUs) + key: model-runner-v2-pipeline-parallelism-4-gpus timeout_in_minutes: 60 working_dir: "/vllm-workspace/tests" num_devices: 4 @@ -94,6 +99,7 @@ steps: - pytest -v -s distributed/test_pp_cudagraph.py -k "not ray" - label: Model Runner V2 Spec Decode + key: model-runner-v2-spec-decode timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" source_file_dependencies: diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index 73cf8c53bc92..8fca203de44f 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Basic Models Tests (Initialization) + key: basic-models-tests-initialization timeout_in_minutes: 45 torch_nightly: true source_file_dependencies: @@ -16,6 +17,7 @@ steps: torch_nightly: {} - label: Basic Models Tests (Extra Initialization) %N + key: basic-models-tests-extra-initialization timeout_in_minutes: 45 source_file_dependencies: - vllm/model_executor/models/ @@ -31,6 +33,7 @@ steps: torch_nightly: {} - label: Basic Models Tests (Other) + key: basic-models-tests-other timeout_in_minutes: 45 source_file_dependencies: - vllm/ @@ -39,14 +42,9 @@ steps: - tests/models/test_registry.py commands: - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - - label: Basic Models Test (Other CPU) # 5min + key: basic-models-test-other-cpu depends_on: - image-build-cpu timeout_in_minutes: 10 @@ -59,6 +57,7 @@ steps: - pytest -v -s models/test_utils.py models/test_vision.py - label: Transformers Nightly Models + key: transformers-nightly-models working_dir: "/vllm-workspace/" optional: true soft_fail: true @@ -74,6 +73,7 @@ steps: - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 
examples/generate/multimodal/audio_language_offline.py --model-type whisper - label: Transformers Backward Compatibility Models Test + key: transformers-backward-compatibility-models-test working_dir: "/vllm-workspace/" optional: true soft_fail: true diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml index 55e7410b8af4..b5758c55affa 100644 --- a/.buildkite/test_areas/models_distributed.yaml +++ b/.buildkite/test_areas/models_distributed.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Distributed Model Tests (2 GPUs) + key: distributed-model-tests-2-gpus timeout_in_minutes: 50 working_dir: "/vllm-workspace/tests" num_devices: 2 diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml index c13371e25f1d..b560c5a4769a 100644 --- a/.buildkite/test_areas/models_language.yaml +++ b/.buildkite/test_areas/models_language.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Language Models Tests (Standard) + key: language-models-tests-standard timeout_in_minutes: 25 source_file_dependencies: - vllm/ @@ -15,6 +16,7 @@ steps: torch_nightly: {} - label: Language Models Tests (Extra Standard) %N + key: language-models-tests-extra-standard timeout_in_minutes: 45 source_file_dependencies: - vllm/model_executor/models/ @@ -31,6 +33,7 @@ steps: torch_nightly: {} - label: Language Models Tests (Hybrid) %N + key: language-models-tests-hybrid timeout_in_minutes: 75 source_file_dependencies: - vllm/ @@ -45,8 +48,17 @@ steps: parallelism: 2 mirror: torch_nightly: {} + amd: + device: mi300_1 + depends_on: + - image-build-amd + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' + - pytest -v -s models/language/generation -m hybrid_model 
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB - label: Language Models Test (Extended Generation) # 80min + key: language-models-test-extended-generation timeout_in_minutes: 110 optional: true source_file_dependencies: @@ -58,17 +70,9 @@ steps: - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - label: Language Models Test (PPL) + key: language-models-test-ppl timeout_in_minutes: 110 device: h200_18gb optional: true @@ -79,6 +83,7 @@ steps: - pytest -v -s models/language/generation_ppl_test - label: Language Models Test (Extended Pooling) # 36min + key: language-models-test-extended-pooling timeout_in_minutes: 50 optional: true source_file_dependencies: @@ -86,13 +91,9 @@ steps: - tests/models/language/pooling commands: - pytest -v -s models/language/pooling -m 'not core_model' - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: Language Models Test (MTEB) + key: language-models-test-mteb timeout_in_minutes: 110 device: h200_18gb optional: true diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml index 245ef24026d2..1f66393df818 100644 --- a/.buildkite/test_areas/models_multimodal.yaml +++ b/.buildkite/test_areas/models_multimodal.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: "Multi-Modal Models (Standard) 1: 
qwen2" + key: multi-modal-models-standard-1-qwen2 timeout_in_minutes: 45 device: h200_18gb source_file_dependencies: @@ -14,11 +15,12 @@ steps: - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model mirror: amd: - device: mi325_1 + device: mi300_1 depends_on: - image-build-amd - label: "Multi-Modal Models (Standard) 2: qwen3 + gemma" + key: multi-modal-models-standard-2-qwen3-gemma timeout_in_minutes: 45 device: h200_18gb source_file_dependencies: @@ -31,11 +33,12 @@ steps: - pytest -v -s models/multimodal/generation/test_vit_cudagraph.py -m core_model mirror: amd: - device: mi325_1 + device: mi300_1 depends_on: - image-build-amd - label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl" + key: multi-modal-models-standard-3-llava-qwen2-vl timeout_in_minutes: 45 source_file_dependencies: - vllm/ @@ -46,11 +49,12 @@ steps: - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model mirror: amd: - device: mi325_1 + device: mi300_1 depends_on: - image-build-amd - label: "Multi-Modal Models (Standard) 4: other + whisper" + key: multi-modal-models-standard-4-other-whisper timeout_in_minutes: 45 source_file_dependencies: - vllm/ @@ -60,14 +64,10 @@ steps: - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/generation/test_memory_leak.py --ignore models/multimodal/processing - pytest models/multimodal/generation/test_memory_leak.py -m core_model - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: Multi-Modal Processor (CPU) - depends_on: + key: multi-modal-processor-cpu + depends_on: - image-build-cpu timeout_in_minutes: 60 source_file_dependencies: @@ -80,6 +80,7 @@ steps: - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - label: Multi-Modal Processor # 44min + key: multi-modal-processor timeout_in_minutes: 60 device: h200_18gb source_file_dependencies: @@ -91,6 +92,7 @@ steps: - pytest -v -s models/multimodal/processing/test_tensor_schema.py - label: Multi-Modal Accuracy Eval (Small Models) # 50min + key: multi-modal-accuracy-eval-small-models timeout_in_minutes: 70 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: @@ -101,6 +103,7 @@ steps: - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - label: Multi-Modal Models (Extended Generation 1) + key: multi-modal-models-extended-generation-1 optional: true source_file_dependencies: - vllm/ @@ -112,11 +115,12 @@ steps: - pytest -v -s models/multimodal/test_mapping.py mirror: amd: - device: mi325_1 + device: mi300_1 depends_on: - image-build-amd - label: Multi-Modal Models (Extended Generation 2) + key: multi-modal-models-extended-generation-2 optional: true source_file_dependencies: - vllm/ @@ -126,6 +130,7 @@ steps: - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - label: Multi-Modal Models (Extended Generation 3) + key: multi-modal-models-extended-generation-3 optional: true source_file_dependencies: - vllm/ @@ -135,6 +140,7 @@ steps: - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' - label: Multi-Modal Models (Extended Pooling) + key: 
multi-modal-models-extended-pooling optional: true device: h200_18gb source_file_dependencies: diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml index 8e0eb0284019..0d23180f3ef7 100644 --- a/.buildkite/test_areas/plugins.yaml +++ b/.buildkite/test_areas/plugins.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Plugin Tests (2 GPUs) + key: plugin-tests-2-gpus timeout_in_minutes: 60 working_dir: "/vllm-workspace/tests" num_devices: 2 diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml index a3648219d89d..a470cc60a3e5 100644 --- a/.buildkite/test_areas/pytorch.yaml +++ b/.buildkite/test_areas/pytorch.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: PyTorch Compilation Unit Tests + key: pytorch-compilation-unit-tests timeout_in_minutes: 10 source_file_dependencies: - vllm/ @@ -18,6 +19,7 @@ steps: - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Compilation Unit Tests (H100) + key: pytorch-compilation-unit-tests-h100 timeout_in_minutes: 30 device: h100 num_devices: 1 @@ -28,6 +30,7 @@ steps: - "find compile/h100/ -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Compilation Passes Unit Tests + key: pytorch-compilation-passes-unit-tests timeout_in_minutes: 20 source_file_dependencies: - vllm/ @@ -36,6 +39,7 @@ steps: - pytest -s -v compile/passes --ignore compile/passes/distributed - label: PyTorch Fullgraph Smoke Test + key: pytorch-fullgraph-smoke-test timeout_in_minutes: 35 source_file_dependencies: - vllm/ @@ -48,6 +52,7 @@ steps: - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Fullgraph + key: pytorch-fullgraph timeout_in_minutes: 30 device: h200_18gb source_file_dependencies: @@ -58,6 +63,7 @@ steps: - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - 
label: Pytorch Nightly Dependency Override Check # 2min + key: pytorch-nightly-dependency-override-check # if this test fails, it means the nightly torch version is not compatible with some # of the dependencies. Please check the error message and add the package to whitelist # in /vllm/tools/pre_commit/generate_nightly_torch_test.py diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml index a42d59b021c6..8a9a36da4481 100644 --- a/.buildkite/test_areas/quantization.yaml +++ b/.buildkite/test_areas/quantization.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Quantization + key: quantization timeout_in_minutes: 90 source_file_dependencies: - csrc/ @@ -21,9 +22,10 @@ steps: - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - label: Quantized MoE Test (B200) + key: quantized-moe-test-b200 timeout_in_minutes: 60 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s source_file_dependencies: - tests/quantization/test_blackwell_moe.py - vllm/model_executor/models/deepseek_v2.py @@ -38,6 +40,7 @@ steps: - pytest -s -v tests/quantization/test_blackwell_moe.py - label: Quantized Models Test + key: quantized-models-test timeout_in_minutes: 60 source_file_dependencies: - vllm/model_executor/layers/quantization diff --git a/.buildkite/test_areas/ray_compat.yaml b/.buildkite/test_areas/ray_compat.yaml index 3485e346532c..9207621a5830 100644 --- a/.buildkite/test_areas/ray_compat.yaml +++ b/.buildkite/test_areas/ray_compat.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Ray Dependency Compatibility Check + key: ray-dependency-compatibility-check # Informational only — does not block the pipeline. # If this fails, it means the PR introduces a dependency that # conflicts with Ray's dependency constraints. 
diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml index 2052a379827a..48e9f55571e4 100644 --- a/.buildkite/test_areas/samplers.yaml +++ b/.buildkite/test_areas/samplers.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Samplers Test + key: samplers-test timeout_in_minutes: 75 source_file_dependencies: - vllm/model_executor/layers @@ -10,11 +11,13 @@ steps: - tests/samplers - tests/conftest.py commands: - - pytest -v -s samplers + # VLLM_USE_FLASHINFER_SAMPLER defaults to 1 now, so we need to pin both + # values explicitly to still cover the PyTorch-native (Triton) path. + - VLLM_USE_FLASHINFER_SAMPLER=0 pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers mirror: amd: - device: mi325_1 + device: mi250_1 depends_on: - image-build-amd commands: diff --git a/.buildkite/test_areas/spec_decode.yaml b/.buildkite/test_areas/spec_decode.yaml index 05925da0da01..5253f54735aa 100644 --- a/.buildkite/test_areas/spec_decode.yaml +++ b/.buildkite/test_areas/spec_decode.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Spec Decode Eagle + key: spec-decode-eagle timeout_in_minutes: 30 device: h200_18gb source_file_dependencies: @@ -13,6 +14,7 @@ steps: - pytest -v -s v1/e2e/spec_decode -k "eagle_correctness" - label: Spec Decode Eagle Nightly B200 + key: spec-decode-eagle-nightly-b200 timeout_in_minutes: 30 device: b200 optional: true @@ -24,6 +26,7 @@ steps: - pytest -v -s v1/e2e/spec_decode -k "eagle_correctness" - label: Spec Decode Speculators + MTP + key: spec-decode-speculators-mtp timeout_in_minutes: 30 device: h200_18gb source_file_dependencies: @@ -35,6 +38,7 @@ steps: - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness" - label: Spec Decode Speculators + MTP Nightly B200 + key: spec-decode-speculators-mtp-nightly-b200 timeout_in_minutes: 30 device: b200 optional: true @@ -47,6 +51,7 @@ steps: - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness" - 
label: Spec Decode Ngram + Suffix + key: spec-decode-ngram-suffix timeout_in_minutes: 30 device: h200_18gb source_file_dependencies: @@ -57,6 +62,7 @@ steps: - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix" - label: Spec Decode Draft Model + key: spec-decode-draft-model timeout_in_minutes: 30 device: h200_18gb source_file_dependencies: @@ -67,8 +73,9 @@ steps: - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference" - label: Spec Decode Draft Model Nightly B200 + key: spec-decode-draft-model-nightly-b200 timeout_in_minutes: 30 - device: b200 + device: b200-k8s optional: true source_file_dependencies: - vllm/v1/spec_decode/ @@ -78,6 +85,7 @@ steps: - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference" - label: DFlash Speculators Correctness + key: dflash-speculators-correctness timeout_in_minutes: 30 device: h100 optional: true @@ -89,3 +97,16 @@ steps: commands: - export VLLM_ALLOW_INSECURE_SERIALIZATION=1 - pytest -v -s v1/spec_decode/test_speculators_dflash.py -m slow_test + +- label: Spec Decode MTP hybrid (B200) + timeout_in_minutes: 30 + device: b200 + optional: true + source_file_dependencies: + - vllm/v1/spec_decode/ + - vllm/v1/worker/gpu/spec_decode/ + - vllm/model_executor/models/qwen3_5.py + - vllm/model_executor/models/qwen3_5_mtp.py + - tests/v1/e2e/spec_decode/ + commands: + - pytest -v -s v1/e2e/spec_decode -k "qwen3_5-hybrid" diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml index 8e86374a8ad0..01c6bb7809bc 100644 --- a/.buildkite/test_areas/weight_loading.yaml +++ b/.buildkite/test_areas/weight_loading.yaml @@ -3,6 +3,7 @@ depends_on: - image-build steps: - label: Weight Loading Multiple GPU # 33min + key: weight-loading-multiple-gpu timeout_in_minutes: 45 working_dir: "/vllm-workspace/tests" num_devices: 2 diff --git a/.github/mergify.yml b/.github/mergify.yml index 8ca00d6e7d2d..2d36e3507028 100644 --- a/.github/mergify.yml +++ 
b/.github/mergify.yml @@ -308,8 +308,7 @@ pull_request_rules: - files=benchmarks/benchmark_serving_structured_output.py - files=benchmarks/run_structured_output_benchmark.sh - files=docs/features/structured_outputs.md - - files=examples/offline_inference/structured_outputs.py - - files=examples/online_serving/structured_outputs/structured_outputs.py + - files~=^examples/features/structured_outputs/ - files~=^tests/v1/structured_output/ - files=tests/entrypoints/llm/test_struct_output_generate.py - files~=^vllm/v1/structured_output/ @@ -325,7 +324,7 @@ pull_request_rules: - or: - files~=^vllm/v1/spec_decode/ - files~=^tests/v1/spec_decode/ - - files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py + - files~=^examples/features/speculative_decoding/ - files~=^vllm/model_executor/models/.*eagle.*\.py - files=vllm/model_executor/models/mlp_speculator.py - files~=^vllm/transformers_utils/configs/(eagle|medusa|mlp_speculator)\.py @@ -478,9 +477,7 @@ pull_request_rules: conditions: - label != stale - or: - - files~=^examples/online_serving/disaggregated[^/]*/.* - - files~=^examples/offline_inference/disaggregated[^/]*/.* - - files~=^examples/others/lmcache/ + - files~=^examples/disaggregated/ - files~=^tests/v1/kv_connector/ - files~=^vllm/distributed/kv_transfer/ - title~=(?i)\bP/?D\b diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 8ab8d3e7035f..1dd31b0e50f6 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -16,11 +16,7 @@ permissions: jobs: pre-run-check: - if: >- - github.event_name == 'pull_request' && - (github.event.action != 'labeled' || - github.event.label.name == 'ready' || - github.event.label.name == 'verified') + if: github.event_name == 'pull_request' runs-on: ubuntu-latest steps: - name: Check PR label and author merge count @@ -49,12 +45,7 @@ jobs: pre-commit: needs: pre-run-check - if: >- - always() && - (github.event.action != 'labeled' || - github.event.label.name
== 'ready' || - github.event.label.name == 'verified') && - (needs.pre-run-check.result == 'success' || needs.pre-run-check.result == 'skipped') + if: always() && (needs.pre-run-check.result == 'success' || needs.pre-run-check.result == 'skipped') runs-on: ubuntu-latest steps: - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 diff --git a/.gitignore b/.gitignore index 134bbc5cc893..e53d19b35340 100644 --- a/.gitignore +++ b/.gitignore @@ -237,6 +237,7 @@ ep_kernels_workspace/ # Allow tracked library source folders under submodules (e.g., benchmarks/lib) !vllm/benchmarks/lib/ +!.buildkite/scripts/lib/ # Generated gRPC protobuf files (compiled at build time from vllm_engine.proto) vllm/grpc/vllm_engine_pb2.py diff --git a/CMakeLists.txt b/CMakeLists.txt index fb8a1d7e1e14..13788fa87437 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -307,12 +307,12 @@ set(VLLM_EXT_SRC "csrc/quantization/activation_kernels.cu" "csrc/cuda_utils_kernels.cu" "csrc/custom_all_reduce.cu" - "csrc/torch_bindings.cpp") + "csrc/torch_bindings.cpp" + "csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu") if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC - "csrc/minimax_reduce_rms_kernel.cu" - "csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu") + "csrc/minimax_reduce_rms_kernel.cu") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") @@ -1047,14 +1047,13 @@ endif() set(VLLM_MOE_EXT_SRC "csrc/moe/torch_bindings.cpp" "csrc/moe/moe_align_sum_kernels.cu" - "csrc/moe/topk_softmax_kernels.cu") + "csrc/moe/topk_softmax_kernels.cu" + "csrc/moe/topk_softplus_sqrt_kernels.cu") if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu" - "csrc/moe/grouped_topk_kernels.cu" - "csrc/moe/router_gemm.cu" - "csrc/moe/topk_softplus_sqrt_kernels.cu") + "csrc/moe/grouped_topk_kernels.cu") endif() if(VLLM_GPU_LANG STREQUAL "CUDA") diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py 
b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index 881039f43f07..2f56099c66fd 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -217,6 +217,7 @@ async def send_request( min_tokens: int | None = None, max_tokens: int | None = None, timeout_sec: int = 120, + conversation_id: str | None = None, ) -> ServerResponse: payload = { "model": model, @@ -225,6 +226,9 @@ async def send_request( "temperature": 0.0, } + if conversation_id is not None: + payload["conversation_id"] = conversation_id + if stream: payload["stream"] = True payload["stream_options"] = {"include_usage": False} @@ -419,6 +423,7 @@ async def send_turn( min_tokens, max_tokens, req_args.timeout_sec, + conversation_id=conv_id, ) if response.valid is False: @@ -1468,6 +1473,12 @@ async def main() -> None: "(for example: --warmup-percentages=0%%,50%%)", ) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code when loading the tokenizer.", + ) + args = parser.parse_args() logger.info(args) @@ -1510,7 +1521,9 @@ async def main() -> None: np.random.seed(args.seed) logger.info("Loading tokenizer") - tokenizer = AutoTokenizer.from_pretrained(args.model) + tokenizer = AutoTokenizer.from_pretrained( + args.model, trust_remote_code=args.trust_remote_code + ) await get_server_info(args.url) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 8535186cc1ec..650dbf365169 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -32,18 +32,23 @@ else() "-DVLLM_CPU_EXTENSION") # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0) - # and create a local shim dir with it + # and create a local shim dir with it. When PyTorch is built from source or packaged + # by a distro (common on RISC-V, s390x, Fedora/RHEL aarch64), no vendored libgomp + # exists and the shim dir is empty; fall back to the system libgomp in that case. 
vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR) - find_library(OPEN_MP - NAMES gomp - PATHS ${VLLM_TORCH_GOMP_SHIM_DIR} - NO_DEFAULT_PATH - REQUIRED - ) - # Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch - if (OPEN_MP) + if(VLLM_TORCH_GOMP_SHIM_DIR) + find_library(OPEN_MP + NAMES gomp + PATHS "${VLLM_TORCH_GOMP_SHIM_DIR}" + NO_DEFAULT_PATH + REQUIRED + ) + # Use the same libgomp as PyTorch at runtime set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}") + else() + # Fall back to system / toolchain libgomp + find_library(OPEN_MP NAMES gomp REQUIRED) endif() endif() @@ -321,14 +326,6 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND set(ONEDNN_VERBOSE "ON") set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) - # TODO: Refactor this - if (ENABLE_X86_ISA) - # Note: only enable oneDNN for AVX512 - list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}) - else() - list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS}) - endif() - set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE}) set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size FetchContent_MakeAvailable(oneDNN) @@ -341,8 +338,14 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND PRIVATE ${oneDNN_SOURCE_DIR}/src ) target_link_libraries(dnnl_ext dnnl torch) - target_compile_options(dnnl_ext PRIVATE ${DNNL_COMPILE_FLAGS} -fPIC) + if (ENABLE_X86_ISA) + target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS_AVX2} -fPIC) + else() + target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC) + endif() list(APPEND LIBS dnnl_ext) + + set(USE_ONEDNN ON) else() set(USE_ONEDNN OFF) @@ -430,10 +433,11 @@ if (ENABLE_X86_ISA) "csrc/cpu/pos_encoding.cpp" "csrc/moe/dynamic_4bit_int_moe_cpu.cpp") - set(VLLM_EXT_SRC_AVX2 + set(VLLM_EXT_SRC_AVX2 "csrc/cpu/utils.cpp" "csrc/cpu/spec_decode_utils.cpp" "csrc/cpu/cpu_attn.cpp" + "csrc/cpu/dnnl_kernels.cpp" "csrc/cpu/torch_bindings.cpp" 
# TODO: Remove these files "csrc/cpu/activation.cpp" @@ -448,7 +452,7 @@ if (ENABLE_X86_ISA) set(_C_LIBS numa dnnl_ext) set(_C_AVX512_LIBS numa dnnl_ext) - set(_C_AVX2_LIBS numa) + set(_C_AVX2_LIBS numa dnnl_ext) # AMX + AVX512F + AVX512BF16 + AVX512VNNI define_extension_target( diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 7e456d32598b..895490f45a79 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -97,13 +97,13 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs, const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - // Use cuMemcpyBatchAsync (CUDA 12.8+) to submit all copies in a single - // driver call, amortizing per-copy submission overhead. - // int64_t and CUdeviceptr/size_t are both 8 bytes on 64-bit platforms, - // so we reinterpret_cast the tensor data directly to avoid copies. - static_assert(sizeof(CUdeviceptr) == sizeof(int64_t)); + // Use cuMemcpyBatchAsync / hipMemcpyBatchAsync to submit all copies in a + // single driver call, amortizing per-copy submission overhead. int64_t + // and CUdeviceptr/void*/size_t are all 8 bytes on 64-bit platforms, so we + // reinterpret_cast the tensor data directly to avoid copies. static_assert(sizeof(size_t) == sizeof(int64_t)); #if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION >= 12080 + static_assert(sizeof(CUdeviceptr) == sizeof(int64_t)); // Resolve cuMemcpyBatchAsync at runtime via cuGetProcAddress so that // binaries compiled with CUDA 12.8+ still work on older drivers, and // we avoid the CUDA 13.0 header remapping (#define to _v2 signature). @@ -134,12 +134,30 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs, &fail_idx, static_cast(stream)); TORCH_CHECK(result == CUDA_SUCCESS, "cuMemcpyBatchAsync failed at index ", fail_idx, " with error ", result); - } else + return; + } +#elif defined(USE_ROCM) && defined(HIP_VERSION) && HIP_VERSION >= 70100000 + // ROCm 7.1+ exposes hipMemcpyBatchAsync. 
The 7.2.1 implementation early- + // returns hipErrorNotSupported whenever numAttrs > 0 (see ROCm/clr @ + // rocm-7.2.1 hipamd/src/hip_memory.cpp:2819-2822), so call with + // numAttrs=0. + { + hipMemcpyAttributes attr = {}; + size_t attrs_idx = 0; + size_t fail_idx = 0; + hipError_t result = hipMemcpyBatchAsync( + reinterpret_cast(dst_data), reinterpret_cast(src_data), + reinterpret_cast(size_data), static_cast(n), &attr, + &attrs_idx, 0, &fail_idx, static_cast(stream)); + TORCH_CHECK(result == hipSuccess, "hipMemcpyBatchAsync failed at index ", + fail_idx, " with error ", result); + return; + } #endif { - // Fallback for CUDA < 12.8, older drivers, and ROCm: - // individual async copies. - // cudaMemcpyDefault lets the driver infer direction from pointer types. + // Fallback for CUDA < 12.8, older CUDA drivers, and ROCm < 7.1: + // individual async copies. cudaMemcpyDefault lets the driver infer + // direction from pointer types. for (int64_t i = 0; i < n; i++) { cudaMemcpyAsync(reinterpret_cast(dst_data[i]), reinterpret_cast(src_data[i]), diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp index a582b4b4d7cc..4750dd78838d 100644 --- a/csrc/cpu/cpu_attn.cpp +++ b/csrc/cpu/cpu_attn.cpp @@ -1,5 +1,16 @@ #include "cpu_attn_dispatch_generated.h" +// Maps kv_cache_dtype string to Fp8KVCacheDataType enum. +// "auto" -> kAuto(0); "fp8"/"fp8_e4m3" -> kFp8E4M3; "fp8_e5m2" -> kFp8E5M2. 
+static inline cpu_attention::Fp8KVCacheDataType parse_fp8_kv_dtype( + const std::string& kv_cache_dtype) { + if (kv_cache_dtype == "fp8_e5m2") + return cpu_attention::Fp8KVCacheDataType::kFp8E5M2; + if (kv_cache_dtype == "fp8_e4m3" || kv_cache_dtype == "fp8") + return cpu_attention::Fp8KVCacheDataType::kFp8E4M3; + return cpu_attention::Fp8KVCacheDataType::kAuto; +} + torch::Tensor get_scheduler_metadata( const int64_t num_req, const int64_t num_heads_q, const int64_t num_heads_kv, const int64_t head_dim, @@ -18,6 +29,8 @@ torch::Tensor get_scheduler_metadata( isa = cpu_attention::ISA::NEON; } else if (isa_hint == "vxe") { isa = cpu_attention::ISA::VXE; + } else if (isa_hint == "vsx") { + isa = cpu_attention::ISA::VSX; } else { TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint); } @@ -49,7 +62,7 @@ torch::Tensor get_scheduler_metadata( input.enable_kv_split = enable_kv_split; VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() { - CPU_ATTN_DISPATCH(head_dim, isa, [&]() { + CPU_ATTN_DISPATCH(head_dim, isa, 0, [&]() { input.elem_size = sizeof(scalar_t); input.q_buffer_elem_size = sizeof(attn_impl::q_buffer_t); input.logits_buffer_elem_size = sizeof(attn_impl::logits_buffer_t); @@ -72,7 +85,9 @@ void cpu_attn_reshape_and_cache( key_cache, // [num_blocks, num_kv_heads, block_size, head_size] torch::Tensor& value_cache, // [num_blocks, num_kv_heads, block_size, head_size] - const torch::Tensor& slot_mapping, const std::string& isa) { + const torch::Tensor& slot_mapping, const std::string& isa, + const double k_scale = 1.0, const double v_scale = 1.0, + const std::string& kv_cache_dtype = "auto") { TORCH_CHECK_EQ(key.dim(), 3); TORCH_CHECK_EQ(value.dim(), 3); TORCH_CHECK_EQ(key_cache.dim(), 4); @@ -80,18 +95,30 @@ void cpu_attn_reshape_and_cache( TORCH_CHECK_EQ(key.stride(2), 1); TORCH_CHECK_EQ(value.stride(2), 1); + const int64_t kv_cache_idx = + static_cast(parse_fp8_kv_dtype(kv_cache_dtype)); + const bool is_fp8 = (kv_cache_idx != 
0); + + if (is_fp8) { + TORCH_CHECK(key_cache.scalar_type() == at::ScalarType::Byte, + "key_cache must be uint8 for FP8 path"); + TORCH_CHECK(value_cache.scalar_type() == at::ScalarType::Byte, + "value_cache must be uint8 for FP8 path"); + TORCH_CHECK(k_scale > 0, "k_scale must be positive for FP8 path"); + TORCH_CHECK(v_scale > 0, "v_scale must be positive for FP8 path"); + } + + const float k_inv = is_fp8 ? 1.0f / static_cast(k_scale) : 0.0f; + const float v_inv = is_fp8 ? 1.0f / static_cast(v_scale) : 0.0f; + const int64_t token_num = key.size(0); - const int64_t key_token_num_stride = key.stride(0); - const int64_t value_token_num_stride = value.stride(0); - const int64_t head_num = value.size(1); - const int64_t key_head_num_stride = key.stride(1); - const int64_t value_head_num_stride = value.stride(1); + const int64_t head_num = key.size(1); + const int64_t head_dim = key.size(2); const int64_t num_blocks = key_cache.size(0); const int64_t num_blocks_stride = key_cache.stride(0); const int64_t cache_head_num_stride = key_cache.stride(1); const int64_t block_size = key_cache.size(2); const int64_t block_size_stride = key_cache.stride(2); - const int64_t head_dim = key.size(-1); cpu_attention::ISA isa_tag = [&]() { if (isa == "amx") { @@ -104,21 +131,31 @@ void cpu_attn_reshape_and_cache( return cpu_attention::ISA::NEON; } else if (isa == "vxe") { return cpu_attention::ISA::VXE; + } else if (isa == "vsx") { + return cpu_attention::ISA::VSX; } else { TORCH_CHECK(false, "Invalid ISA type: " + isa); } }(); + if (is_fp8) { + TORCH_CHECK(isa_tag == cpu_attention::ISA::AMX || + isa_tag == cpu_attention::ISA::VEC, + "FP8 KV cache is only supported on x86 (AMX/VEC) ISA"); + } + VLLM_DISPATCH_FLOATING_TYPES( key.scalar_type(), "cpu_attn_reshape_and_cache", [&]() { - CPU_ATTN_DISPATCH(head_dim, isa_tag, [&]() { + CPU_ATTN_DISPATCH(head_dim, isa_tag, kv_cache_idx, [&]() { + using kv_t = typename attn_impl::kv_cache_t; attn_impl::reshape_and_cache( key.data_ptr(), 
value.data_ptr(), - key_cache.data_ptr(), value_cache.data_ptr(), - slot_mapping.data_ptr(), token_num, key_token_num_stride, - value_token_num_stride, head_num, key_head_num_stride, - value_head_num_stride, num_blocks, num_blocks_stride, - cache_head_num_stride, block_size, block_size_stride); + reinterpret_cast(key_cache.data_ptr()), + reinterpret_cast(value_cache.data_ptr()), + slot_mapping.data_ptr(), token_num, key.stride(0), + value.stride(0), head_num, key.stride(1), value.stride(1), + num_blocks, num_blocks_stride, cache_head_num_stride, block_size, + block_size_stride, k_inv, v_inv); }); }); } @@ -137,13 +174,26 @@ void cpu_attention_with_kv_cache( const int64_t sliding_window_left, const int64_t sliding_window_right, const torch::Tensor& block_table, // [num_tokens, max_block_num] const double softcap, const torch::Tensor& scheduler_metadata, - const std::optional& s_aux // [num_heads] -) { + const std::optional& s_aux, // [num_heads] + const double k_scale = 1.0, const double v_scale = 1.0, + const std::string& kv_cache_dtype = "auto") { TORCH_CHECK_EQ(query.dim(), 3); TORCH_CHECK_EQ(query.stride(2), 1); TORCH_CHECK_EQ(key_cache.dim(), 4); TORCH_CHECK_EQ(value_cache.dim(), 4); + const int64_t kv_cache_idx = + static_cast(parse_fp8_kv_dtype(kv_cache_dtype)); + const bool is_fp8 = (kv_cache_idx != 0); + if (is_fp8) { + TORCH_CHECK(key_cache.scalar_type() == at::ScalarType::Byte, + "key_cache must be uint8 for FP8 path"); + TORCH_CHECK(value_cache.scalar_type() == at::ScalarType::Byte, + "value_cache must be uint8 for FP8 path"); + TORCH_CHECK(k_scale > 0, "k_scale must be positive for FP8 path"); + TORCH_CHECK(v_scale > 0, "v_scale must be positive for FP8 path"); + } + cpu_attention::AttentionInput input; input.metadata = reinterpret_cast( scheduler_metadata.data_ptr()); @@ -165,25 +215,32 @@ void cpu_attention_with_kv_cache( input.block_table = block_table.data_ptr(); input.alibi_slopes = alibi_slopes.has_value() ? 
alibi_slopes->data_ptr() : nullptr; - // For now sink must be bf16 input.s_aux = s_aux.has_value() ? s_aux->data_ptr() : nullptr; input.scale = scale; input.causal = causal; input.sliding_window_left = sliding_window_left; input.sliding_window_right = sliding_window_right; if (input.causal) { - // to make boundary calculation easier input.sliding_window_right = 0; } - float softcap_fp32 = softcap; - input.softcap = softcap_fp32; + input.softcap = static_cast(softcap); + + if (is_fp8) { + input.k_scale_fp8 = static_cast(k_scale); + input.v_scale_fp8 = static_cast(v_scale); + TORCH_CHECK(input.metadata->isa == cpu_attention::ISA::AMX || + input.metadata->isa == cpu_attention::ISA::VEC, + "FP8 KV cache is only supported on x86 (AMX/VEC) ISA"); + } VLLM_DISPATCH_FLOATING_TYPES( query.scalar_type(), "cpu_attention_with_kv_cache", [&]() { - CPU_ATTN_DISPATCH(query.size(2), input.metadata->isa, [&]() { - TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, 0); - cpu_attention::AttentionMainLoop mainloop; - mainloop(&input); - }); + CPU_ATTN_DISPATCH( + query.size(2), input.metadata->isa, kv_cache_idx, [&]() { + TORCH_CHECK_EQ(input.block_size % attn_impl::BlockSizeAlignment, + 0); + cpu_attention::AttentionMainLoop mainloop; + mainloop(&input); + }); }); } diff --git a/csrc/cpu/cpu_attn_amx.hpp b/csrc/cpu/cpu_attn_amx.hpp index 1c8644d52329..6a0341085dce 100644 --- a/csrc/cpu/cpu_attn_amx.hpp +++ b/csrc/cpu/cpu_attn_amx.hpp @@ -1,6 +1,7 @@ #ifndef CPU_ATTN_AMX_HPP #define CPU_ATTN_AMX_HPP +#include "cpu_attn_fp8.hpp" #include "cpu_attn_impl.hpp" namespace cpu_attention { @@ -21,9 +22,10 @@ typedef struct __tile_config { // 2-2-4 pattern, for 16 < m <= 32 // TILE 0, 1: load A matrix, row num should be 16, m - 16 // TILE 2, 3: load B matrix, row num should be 16 -// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, m - 16, m -// - 16 -template +// TILE 4, 5, 6, 7: store results C matrix, row num should be 16, 16, +// m - 16, m - 16 +// 
q_buffer_t: A (Q/P) tile type; kv_cache_t: B (K/V cache) tile type. +template class TileGemm224 { public: template @@ -42,13 +44,56 @@ class TileGemm224 { } }; -template <> -class TileGemm224 { +// Dequantize one FP8 tile (AMX_TILE_ROW_NUM rows x 32 cols) to BF16. +template +FORCE_INLINE void deq_tile_amx(const uint8_t* src, c10::BFloat16* dst) { + for (int r = 0; r < AMX_TILE_ROW_NUM; ++r) { + if constexpr (std::is_same_v) { + vec_op::BF16Vec32(src + r * 32, vec_op::fp8_bf16_e4m3_tag{}) + .save(dst + r * 32); + } else { + vec_op::BF16Vec32(src + r * 32, vec_op::fp8_bf16_e5m2_tag{}) + .save(dst + r * 32); + } + } +} + +// For FP8: dequant src into scratch and return scratch. +// For BF16: return src directly (scratch is unused; the compiler elides it). +template +FORCE_INLINE const c10::BFloat16* prepare_b_tile(const kv_cache_t* src, + c10::BFloat16* scratch) { + if constexpr (std::is_same_v || + std::is_same_v) { + deq_tile_amx(reinterpret_cast(src), scratch); + return scratch; + } else { + return reinterpret_cast(src); + } +} + +// Handles both BF16 and FP8 KV cache (2-2-4 pattern). +template +class TileGemm224 { + static_assert(std::is_same_v || + std::is_same_v || + std::is_same_v, + "kv_cache_t must be BFloat16, Float8_e4m3fn, or Float8_e5m2"); + + static constexpr bool fp8_kv = + std::is_same_v || + std::is_same_v; + + static constexpr int64_t tile_elems = AMX_TILE_BYTES / sizeof(c10::BFloat16); + // BF16 path: scratch_elems=1 so the scratch array is eliminated by the + // compiler. + static constexpr int64_t scratch_elems = fp8_kv ? 
tile_elems : 1; + public: template FORCE_INLINE static void gemm(const int32_t m_size, c10::BFloat16* __restrict__ a_tile, - c10::BFloat16* __restrict__ b_tile, + kv_cache_t* __restrict__ b_tile, float* __restrict__ c_tile, const int64_t lda, const int64_t ldb, const int64_t ldc, const int32_t block_size, @@ -56,6 +101,7 @@ class TileGemm224 { const bool accum_c) { const int32_t k_times = dynamic_k_size / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16)); + c10::BFloat16* __restrict__ a_tile_0 = a_tile; c10::BFloat16* __restrict__ a_tile_1 = a_tile + lda * AMX_TILE_ROW_NUM; const int64_t a_tile_stride = [&]() { @@ -70,8 +116,8 @@ class TileGemm224 { } }(); - c10::BFloat16* __restrict__ b_tile_2 = b_tile; - c10::BFloat16* __restrict__ b_tile_3 = [&]() { + kv_cache_t* __restrict__ b_tile_2 = b_tile; + kv_cache_t* __restrict__ b_tile_3 = [&]() { if constexpr (phase == AttentionGemmPhase::QK) { // k_cache is prepacked return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4); @@ -106,11 +152,16 @@ class TileGemm224 { _tile_zero(7); } + alignas(64) c10::BFloat16 scratch_2[scratch_elems]; + alignas(64) c10::BFloat16 scratch_3[scratch_elems]; for (int32_t k = 0; k < k_times; ++k) { + const c10::BFloat16* load_2 = prepare_b_tile(b_tile_2, scratch_2); + const c10::BFloat16* load_3 = prepare_b_tile(b_tile_3, scratch_3); + _tile_loadd(0, a_tile_0, a_tile_stride); - _tile_stream_loadd(2, b_tile_2, b_tile_stride); + _tile_stream_loadd(2, const_cast(load_2), b_tile_stride); _tile_dpbf16ps(4, 0, 2); - _tile_stream_loadd(3, b_tile_3, b_tile_stride); + _tile_stream_loadd(3, const_cast(load_3), b_tile_stride); _tile_dpbf16ps(5, 0, 3); _tile_loadd(1, a_tile_1, a_tile_stride); _tile_dpbf16ps(6, 1, 2); @@ -154,13 +205,13 @@ class TileGemm224 { }; // 1-2-2 pattern, for 0 < m <= 16 -// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should be -// m, m -// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row -// num should be 16 -// TILE 6, 7, (6, 7): store 
results C matrix, row num should be -// m -template +// TILE 0, (1): load A matrix, use extra 1 tile for prefetch, row num should +// be m, m +// TILE 2, 3, (4, 5): load B matrix, use extra 2 tiles for prefetch, row num +// should be 16 +// TILE 6, 7: store results C matrix, row num should be m +// q_buffer_t: A (Q/P) tile type; kv_cache_t: B (K/V cache) tile type. +template class TileGemm122 { public: template @@ -179,13 +230,26 @@ class TileGemm122 { } }; -template <> -class TileGemm122 { +// Handles both BF16 and FP8 KV cache (1-2-2 pattern). +template +class TileGemm122 { + static_assert(std::is_same_v || + std::is_same_v || + std::is_same_v, + "kv_cache_t must be BFloat16, Float8_e4m3fn, or Float8_e5m2"); + + static constexpr bool fp8_kv = + std::is_same_v || + std::is_same_v; + + static constexpr int64_t tile_elems = AMX_TILE_BYTES / sizeof(c10::BFloat16); + static constexpr int64_t scratch_elems = fp8_kv ? tile_elems : 1; + public: template FORCE_INLINE static void gemm(const int32_t m_size, c10::BFloat16* __restrict__ a_tile, - c10::BFloat16* __restrict__ b_tile, + kv_cache_t* __restrict__ b_tile, float* __restrict__ c_tile, const int64_t lda, const int64_t ldb, const int64_t ldc, const int32_t block_size, @@ -215,21 +279,19 @@ class TileGemm122 { } }(); - c10::BFloat16* __restrict__ b_tile_2 = b_tile; - c10::BFloat16* __restrict__ b_tile_3 = [&]() { + kv_cache_t* __restrict__ b_tile_2 = b_tile; + kv_cache_t* __restrict__ b_tile_3 = [&]() { if constexpr (phase == AttentionGemmPhase::QK) { - // k_cache is prepacked return b_tile + (k_size * AMX_TILE_ROW_BYTES / 4); } else if constexpr (phase == AttentionGemmPhase::PV) { - // v_cache is prepacked return b_tile + (block_size * AMX_TILE_ROW_BYTES / 4); } else { TORCH_CHECK(false, "Unreachable"); } }(); - c10::BFloat16* __restrict__ b_tile_4 = + kv_cache_t* __restrict__ b_tile_4 = b_tile_2 + AMX_TILE_BYTES / sizeof(c10::BFloat16); - c10::BFloat16* __restrict__ b_tile_5 = + kv_cache_t* __restrict__ b_tile_5 = 
b_tile_3 + AMX_TILE_BYTES / sizeof(c10::BFloat16); int64_t b_stride = AMX_TILE_ROW_BYTES; @@ -250,16 +312,25 @@ class TileGemm122 { _tile_zero(7); } + alignas(64) c10::BFloat16 scratch_2[scratch_elems]; + alignas(64) c10::BFloat16 scratch_3[scratch_elems]; + alignas(64) c10::BFloat16 scratch_4[scratch_elems]; + alignas(64) c10::BFloat16 scratch_5[scratch_elems]; for (int32_t k = 0; k < k_group_times; ++k) { + const c10::BFloat16* load_2 = prepare_b_tile(b_tile_2, scratch_2); + const c10::BFloat16* load_3 = prepare_b_tile(b_tile_3, scratch_3); + const c10::BFloat16* load_4 = prepare_b_tile(b_tile_4, scratch_4); + const c10::BFloat16* load_5 = prepare_b_tile(b_tile_5, scratch_5); + _tile_loadd(0, a_tile_0, a_tile_stride); - _tile_stream_loadd(2, b_tile_2, b_stride); + _tile_stream_loadd(2, const_cast(load_2), b_stride); _tile_dpbf16ps(6, 0, 2); - _tile_stream_loadd(3, b_tile_3, b_stride); + _tile_stream_loadd(3, const_cast(load_3), b_stride); _tile_dpbf16ps(7, 0, 3); _tile_loadd(1, a_tile_1, a_tile_stride); - _tile_stream_loadd(4, b_tile_4, b_stride); + _tile_stream_loadd(4, const_cast(load_4), b_stride); _tile_dpbf16ps(6, 1, 4); - _tile_stream_loadd(5, b_tile_5, b_stride); + _tile_stream_loadd(5, const_cast(load_5), b_stride); _tile_dpbf16ps(7, 1, 5); // update ptrs @@ -279,10 +350,13 @@ class TileGemm122 { } if (has_tail) { + const c10::BFloat16* load_2 = prepare_b_tile(b_tile_2, scratch_2); + const c10::BFloat16* load_3 = prepare_b_tile(b_tile_3, scratch_3); + _tile_loadd(0, a_tile_0, a_tile_stride); - _tile_stream_loadd(2, b_tile_2, b_stride); + _tile_stream_loadd(2, const_cast(load_2), b_stride); _tile_dpbf16ps(6, 0, 2); - _tile_stream_loadd(3, b_tile_3, b_stride); + _tile_stream_loadd(3, const_cast(load_3), b_stride); _tile_dpbf16ps(7, 0, 3); } @@ -302,21 +376,25 @@ class TileGemm122 { _tile_loadconfig(&config); } }; + } // namespace -template -class AttentionImpl { +template +class AttentionImpl { + static constexpr bool fp8_kv = + std::is_same_v || + 
std::is_same_v; + public: using query_t = scalar_t; using q_buffer_t = scalar_t; - using kv_cache_t = scalar_t; + using kv_cache_t = kv_cache_scalar_t; using logits_buffer_t = float; using partial_output_buffer_t = float; using prob_buffer_t = scalar_t; constexpr static int64_t BlockSizeAlignment = - AMX_TILE_ROW_BYTES / - sizeof(kv_cache_t); // KV token num unit of QK and PV phases + 32; // AMX_TILE_ROW_NUM = 16 tokens/tile; 32 = 2 tiles constexpr static int64_t HeadDimAlignment = 2 * (AMX_TILE_ROW_BYTES / 4); // headdim num unit of PV phase constexpr static int64_t MaxQHeadNumPerIteration = 32; @@ -324,6 +402,9 @@ class AttentionImpl { constexpr static ISA ISAType = ISA::AMX; constexpr static bool scale_on_logits = true; + float k_scale = 1.0f; + float v_scale = 1.0f; + public: AttentionImpl() : current_q_head_num_(0) { // Use all columns in AMX tiles @@ -332,21 +413,50 @@ class AttentionImpl { ~AttentionImpl() { _tile_release(); } + void init_from_input(const AttentionInput* input) { + if constexpr (fp8_kv) { + k_scale = input->k_scale_fp8; + v_scale = input->v_scale_fp8; + } + } + + float get_output_v_scale() const noexcept { + if constexpr (fp8_kv) { + // AMX dequant places FP8 payload into a BF16 field (exponent bias 127). + // Correction = 2^(127 - FP8_bias): E4M3 bias=7 → 2^120, E5M2 bias=15 → + // 2^112. + constexpr float bias = + std::is_same_v ? 0x1p112f : 0x1p120f; + return v_scale * bias; + } + return 1.0f; + } + template