diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml index ba8fd497c39e..92b0196ec2a2 100644 --- a/.buildkite/hardware_tests/cpu.yaml +++ b/.buildkite/hardware_tests/cpu.yaml @@ -12,15 +12,19 @@ steps: - vllm/_custom_ops.py - tests/kernels/attention/test_cpu_attn.py - tests/kernels/moe/test_cpu_fused_moe.py + - tests/kernels/moe/test_cpu_fp8_fused_moe.py - tests/kernels/test_onednn.py - tests/kernels/test_awq_int4_to_int8.py + - tests/kernels/quantization/test_cpu_fp8_scaled_mm.py commands: - | - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m " + bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m " pytest -x -v -s tests/kernels/attention/test_cpu_attn.py pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py + pytest -x -v -s tests/kernels/moe/test_cpu_fp8_fused_moe.py pytest -x -v -s tests/kernels/test_onednn.py - pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py" + pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py + pytest -x -v -s tests/kernels/quantization/test_cpu_fp8_scaled_mm.py" - label: CPU-Compatibility Tests depends_on: [] @@ -61,6 +65,7 @@ steps: - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py + - vllm/model_executor/layers/fused_moe/experts/cpu_moe.py - tests/quantization/test_compressed_tensors.py - tests/quantization/test_cpu_wna16.py commands: diff --git a/.buildkite/intel_jobs/lora_intel.yaml b/.buildkite/intel_jobs/lora_intel.yaml index 366d9daa24dd..6d5bddacf1bc 100644 --- a/.buildkite/intel_jobs/lora_intel.yaml +++ b/.buildkite/intel_jobs/lora_intel.yaml @@ -18,17 +18,18 @@ steps: - >- bash .buildkite/scripts/hardware_ci/run-intel-test.sh 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -v -s lora/test_layers.py && pytest -v -s lora/test_lora_checkpoints.py && - (pytest -v -s lora/test_lora_functions.py --deselect="tests/lora/test_lora_functions.py::test_lora_functions_sync" --deselect="tests/lora/test_lora_functions.py::test_lora_functions_async" || true) && + pytest -v -s lora/test_lora_functions.py && pytest -v -s lora/test_lora_huggingface.py && pytest -v -s lora/test_lora_manager.py && pytest -v -s lora/test_lora_utils.py && pytest -v -s lora/test_peft_helper.py && pytest -v -s lora/test_resolver.py && pytest -v -s lora/test_utils.py && - (pytest -v -s lora/test_add_lora.py --deselect="tests/lora/test_add_lora.py::test_add_lora" || true) && - (pytest -v -s lora/test_worker.py --deselect="tests/lora/test_worker.py::test_worker_apply_lora" || true)' + pytest -v -s lora/test_add_lora.py && + pytest -v -s lora/test_worker.py' - label: LoRA Fused/MoE Kernels timeout_in_minutes: 45 @@ -46,6 +47,7 @@ steps: - >- bash .buildkite/scripts/hardware_ci/run-intel-test.sh 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -v -s lora/test_fused_moe_lora_kernel.py && pytest -v -s lora/test_moe_lora_align_sum.py' @@ -65,8 +67,9 @@ steps: - >- bash .buildkite/scripts/hardware_ci/run-intel-test.sh 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && set -o pipefail && - pytest -v -s lora/test_punica_ops.py --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-2-2049-64-32-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype1-2-64000-32-4-4]" 
--deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-1-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-1-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-8-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype0-3-2049-128-8-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-8-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype1-1-2049-256-128-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-3-64256-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-2-29696-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-3-49408-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-2-16384-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-2-51328-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype1-1-102656-32-4-4]"' + pytest -v -s lora/test_punica_ops.py --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-3-43264-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype1-1-2049-64-128-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-1-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-1-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-8-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype0-3-2049-128-8-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-8-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype1-1-2049-256-128-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-3-64256-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-2-29696-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-3-49408-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-2-16384-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-2-51328-32-4-4]"' - label: LoRA Punica FP8/XPU Ops timeout_in_minutes: 45 @@ -84,6 +87,7 @@ steps: - >- bash .buildkite/scripts/hardware_ci/run-intel-test.sh 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -v -s lora/test_punica_ops_fp8.py && pytest -v -s lora/test_punica_xpu_ops.py' @@ -103,10 +107,12 @@ steps: - >- bash .buildkite/scripts/hardware_ci/run-intel-test.sh 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && (pytest -v -s lora/test_mixtral.py --deselect="tests/lora/test_mixtral.py::test_mixtral_lora[4]" || true) && pytest -v -s lora/test_quant_model.py --deselect="tests/lora/test_quant_model.py::test_quant_model_lora[model0]" --deselect="tests/lora/test_quant_model.py::test_quant_model_lora[model1]" --deselect="tests/lora/test_quant_model.py::test_quant_model_tp_equality[model0]" && - pytest -v -s lora/test_qwen35_densemodel_lora.py && - pytest -v -s lora/test_transformers_model.py' + pytest -v -s lora/test_transformers_model.py && + pytest -v 
-s lora/test_chatglm3_tp.py && + pytest -s -v lora/test_minicpmv_tp.py' - label: LoRA Multimodal timeout_in_minutes: 45 @@ -124,6 +130,6 @@ steps: - >- bash .buildkite/scripts/hardware_ci/run-intel-test.sh 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -v -s lora/test_default_mm_loras.py && - (pytest -v -s lora/test_qwen3_unembed.py || true) && pytest -v -s lora/test_whisper.py' diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 74227da45c71..8a900c0bf862 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -37,7 +37,7 @@ steps: agents: queue: arm64_cpu_queue_release commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinuxaarch64-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-nightly-wheels.sh" @@ -76,7 +76,7 @@ steps: agents: queue: cpu_queue_release commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinux2_28-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
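+      # Assumption: BUILD_OS=manylinux is a --build-arg consumed by docker/Dockerfile that
+      # switches the wheel build onto the pytorch/manylinux builder base named above,
+      # keeping the nightly wheels manylinux-compatible.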
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-nightly-wheels.sh" @@ -309,6 +309,7 @@ steps: depends_on: ~ - label: "Build release image - x86_64 - CPU" + key: build-cpu-release-image-x86 depends_on: - block-cpu-release-image-build - input-release-version @@ -327,7 +328,8 @@ steps: depends_on: ~ - label: "Build release image - arm64 - CPU" - depends_on: + key: build-cpu-release-image-arm64 + depends_on: - block-arm64-cpu-release-image-build - input-release-version agents: @@ -436,6 +438,41 @@ steps: DOCKER_BUILDKIT: "1" DOCKERHUB_USERNAME: "vllmbot" + - block: "Publish release images to DockerHub" + key: block-publish-release-images + depends_on: + - create-multi-arch-manifest + - create-multi-arch-manifest-cuda-12-9 + - create-multi-arch-manifest-ubuntu2404 + - create-multi-arch-manifest-cuda-12-9-ubuntu2404 + - build-rocm-release-image + - input-release-version + # Wait for CPU builds if their block steps were unblocked, so publish + # doesn't race the in-progress CPU build. allow_failure lets publish + # proceed when the operator legitimately leaves the CPU block steps + # unblocked or the CPU build fails. + - step: build-cpu-release-image-x86 + allow_failure: true + - step: build-cpu-release-image-arm64 + allow_failure: true + if: build.env("NIGHTLY") != "1" + + - label: "Publish release images to DockerHub" + depends_on: + - block-publish-release-images + key: publish-release-images-dockerhub + agents: + queue: small_cpu_queue_release + commands: + - "bash .buildkite/scripts/publish-release-images.sh" + plugins: + - docker-login#v3.0.0: + username: vllmbot + password-env: DOCKERHUB_TOKEN + env: + DOCKER_BUILDKIT: "1" + DOCKERHUB_USERNAME: "vllmbot" + - group: "Publish wheels" key: "publish-wheels" steps: @@ -723,7 +760,7 @@ steps: - "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh" env: S3_BUCKET: "vllm-wheels" - VARIANT: "rocm721" + VARIANT: "rocm722" # ROCm Job 6: Build ROCm Release Docker Image - label: ":docker: Build release image - x86_64 - ROCm" diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh index 6f41d1cdda47..afa884fba46b 100755 --- a/.buildkite/scripts/annotate-release.sh +++ b/.buildkite/scripts/annotate-release.sh @@ -8,8 +8,6 @@ if [ -z "${RELEASE_VERSION}" ]; then RELEASE_VERSION="1.0.0.dev" fi -ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key) - buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF To download the wheel (by commit): \`\`\` @@ -25,95 +23,5 @@ aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38- aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl . 
\`\`\` - -To download and upload the image: - -\`\`\` -# Download images: - -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu129 -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu129 -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm -docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} -docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} - -# Tag and push images: - -## CUDA - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64 -docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64 -docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 -docker push vllm/vllm-openai:latest-x86_64 -docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu129 vllm/vllm-openai:x86_64-cu129 -docker tag vllm/vllm-openai:x86_64-cu129 vllm/vllm-openai:latest-x86_64-cu129 -docker tag vllm/vllm-openai:x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 -docker push vllm/vllm-openai:latest-x86_64-cu129 -docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64 -docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64 -docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 -docker push vllm/vllm-openai:latest-aarch64 -docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu129 vllm/vllm-openai:aarch64-cu129 -docker tag vllm/vllm-openai:aarch64-cu129 vllm/vllm-openai:latest-aarch64-cu129 -docker tag vllm/vllm-openai:aarch64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 -docker push vllm/vllm-openai:latest-aarch64-cu129 -docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 - -## ROCm - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} -docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest -docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION} -docker push vllm/vllm-openai-rocm:latest -docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION} - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base -docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base -docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base -docker push vllm/vllm-openai-rocm:latest-base -docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base - -## CPU - -docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64 -docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64 -docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 -docker push vllm/vllm-openai-cpu:latest-x86_64 -docker push 
vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
-docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
-docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-docker push vllm/vllm-openai-cpu:latest-arm64
-docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-
-# Create multi-arch manifest:
-
-docker manifest rm vllm/vllm-openai:latest
-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
-docker manifest push vllm/vllm-openai:latest
-docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
-
-docker manifest rm vllm/vllm-openai:latest-cu129
-docker manifest create vllm/vllm-openai:latest-cu129 vllm/vllm-openai:latest-x86_64-cu129 vllm/vllm-openai:latest-aarch64-cu129
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129
-docker manifest push vllm/vllm-openai:latest-cu129
-docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu129
-
-docker manifest rm vllm/vllm-openai-cpu:latest || true
-docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
-docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-docker manifest push vllm/vllm-openai-cpu:latest
-docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
-\`\`\`
+Docker images are published automatically by the "Publish release images to DockerHub" pipeline step.
 EOF
diff --git a/.buildkite/scripts/ci-fetch-log.sh b/.buildkite/scripts/ci-fetch-log.sh
new file mode 100755
index 000000000000..02798b56f4a9
--- /dev/null
+++ b/.buildkite/scripts/ci-fetch-log.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Usage: ./ci-fetch-log.sh <build-url> [output_file]
+#        ./ci-fetch-log.sh <build-number> <job-uuid> [output_file]
+#
+# Downloads the raw log for a Buildkite job from the public, unauthenticated
+# /organizations/<org>/pipelines/<pipeline>/builds/<build-number>/jobs/<job-uuid>/download
+# endpoint, then strips ANSI/timestamps via ci-clean-log.sh.
+#
+# Find <build-number> and <job-uuid> via:
+#   gh pr checks <pr-number> --repo vllm-project/vllm
+# Each failing row's URL is .../builds/<build-number>#<job-uuid>.
+
+set -euo pipefail
+
+ORG="vllm"
+PIPELINE="ci"
+
+usage() {
+  echo "Usage: $0 <build-url> [output_file]"
+  echo "       $0 <build-number> <job-uuid> [output_file]"
+  exit 1
+}
+
+if [ $# -lt 1 ]; then usage; fi
+
+if [[ "$1" == https://* ]]; then
+  BUILD=$(echo "$1" | sed -nE 's#.*/builds/([0-9]+).*#\1#p')
+  JOB=$(echo "$1" | grep -oE '[0-9a-f]{8}-[0-9a-f-]+' | head -n 1)
+  OUT="${2:-ci-${BUILD}-${JOB:0:8}.log}"
+else
+  if [ $# -lt 2 ]; then usage; fi
+  BUILD="$1"
+  JOB="$2"
+  OUT="${3:-ci-${BUILD}-${JOB:0:8}.log}"
+fi
+
+if [ -z "$BUILD" ] || [ -z "$JOB" ]; then
+  echo "Could not parse build number or job UUID from: $1" >&2
+  usage
+fi
+
+COOKIES=$(mktemp)
+trap 'rm -f "$COOKIES"' EXIT
+
+# Buildkite issues a session cookie on first hit; subsequent /download needs it.
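+# (The first curl below primes the cookie jar; the second reuses it to fetch the raw log.)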
+curl -fsSL -c "$COOKIES" -A "vllm-ci-fetch-log" \ + "https://buildkite.com/${ORG}/${PIPELINE}/builds/${BUILD}" -o /dev/null + +curl -fsSL -b "$COOKIES" -A "vllm-ci-fetch-log" \ + "https://buildkite.com/organizations/${ORG}/pipelines/${PIPELINE}/builds/${BUILD}/jobs/${JOB}/download" \ + -o "$OUT" + +bash "$(dirname "$0")/ci-clean-log.sh" "$OUT" + +echo "$OUT" diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 703a7d753220..7e8ddb12ec98 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -378,9 +378,11 @@ HF_MOUNT="/root/.cache/huggingface" # double-quotes will have been stripped by the calling shell. if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then commands="${VLLM_TEST_COMMANDS}" + commands_source="env" echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)" else commands="$*" + commands_source="argv" if [[ -z "$commands" ]]; then echo "Error: No test commands provided." >&2 echo "Usage:" >&2 @@ -397,9 +399,15 @@ fi echo "Raw commands: $commands" -# Fix quoting before ROCm overrides (so overrides see correct structure) -commands=$(re_quote_pytest_markers "$commands") -echo "After re-quoting: $commands" +# Only try to repair stripped pytest -m/-k quoting in legacy argv mode. +# VLLM_TEST_COMMANDS preserves inner quoting already, and re-quoting that path +# can corrupt embedded echo strings or otherwise well-formed shell fragments. +if [[ "$commands_source" == "argv" ]]; then + commands=$(re_quote_pytest_markers "$commands") + echo "After re-quoting: $commands" +else + echo "Skipping re-quoting for VLLM_TEST_COMMANDS input" +fi commands=$(apply_rocm_test_overrides "$commands") echo "Final commands: $commands" diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh index 7166435ac1e9..9c13fa79fcb2 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh @@ -31,6 +31,21 @@ function cpu_tests() { set -e pip list" + # Run kernel tests + docker exec cpu-test bash -c " + set -e + pytest -x -v -s tests/kernels/test_onednn.py + pytest -x -v -s tests/kernels/attention/test_cpu_attn.py + pytest -x -v -s tests/kernels/core/test_cpu_activation.py + pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic" + + # skip tests requiring model downloads if HF_TOKEN is not set + # due to rate-limits + if [ -z "$HF_TOKEN" ]; then + echo "Warning: HF_TOKEN is not set. Skipping tests that require model downloads." + return + fi + # offline inference docker exec cpu-test bash -c " set -e @@ -46,13 +61,6 @@ function cpu_tests() { set -e pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs" - # Run kernel tests - docker exec cpu-test bash -c " - set -e - pytest -x -v -s tests/kernels/test_onednn.py - pytest -x -v -s tests/kernels/attention/test_cpu_attn.py - pytest -x -v -s tests/kernels/core/test_cpu_activation.py - pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic" # basic online serving docker exec cpu-test bash -c ' @@ -67,6 +75,21 @@ function cpu_tests() { --num-prompts 20 \ --endpoint /v1/completions kill -s SIGTERM $server_pid &' + + # smoke test for Gated DeltaNet + docker exec cpu-test bash -c ' + set -e + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3.5-0.8B --max-model-len 2048 & + server_pid=$! 
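+    # Wait up to 600s for the API server to come up before running the benchmark.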
+ timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + vllm bench serve \ + --backend vllm \ + --dataset-name random \ + --model Qwen/Qwen3.5-0.8B \ + --num-prompts 20 \ + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' + } # All of CPU tests are expected to be finished less than 40 mins. diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index feaf2b356267..61ebddf82e40 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -136,8 +136,6 @@ run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \ "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" run_and_track_test 4 "test_quantization_accuracy.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py" -run_and_track_test 5 "examples/offline_inference/tpu.py" \ - "python3 /workspace/vllm/examples/offline_inference/tpu.py" run_and_track_test 6 "test_tpu_model_runner.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py" run_and_track_test 7 "test_sampler.py" \ diff --git a/.buildkite/scripts/publish-release-images.sh b/.buildkite/scripts/publish-release-images.sh new file mode 100755 index 000000000000..ec319aa76006 --- /dev/null +++ b/.buildkite/scripts/publish-release-images.sh @@ -0,0 +1,180 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Publish release Docker images from ECR to DockerHub. +# Pulls per-arch images, tags with latest and versioned tags, pushes them, +# then creates and pushes multi-arch manifests. 
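+#
+# Tag scheme (mirroring the commands below): per-arch tags such as
+# vllm/vllm-openai:latest-x86_64 and vllm/vllm-openai:v<version>-aarch64 are
+# pushed first, then combined into multi-arch manifests (:latest, :v<version>);
+# the -cu129 and -ubuntu2404 variants and the -rocm/-cpu flavors follow the
+# same pattern.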
+ +set -euo pipefail + +RELEASE_VERSION=$(buildkite-agent meta-data get release-version --default "" | sed 's/^v//') +if [ -z "${RELEASE_VERSION}" ]; then + echo "ERROR: release-version metadata not set" + exit 1 +fi + +COMMIT="$BUILDKITE_COMMIT" +ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key) + +echo "========================================" +echo "Publishing release images v${RELEASE_VERSION}" +echo " Commit: ${COMMIT}" +echo " ROCm base cache key: ${ROCM_BASE_CACHE_KEY}" +echo "========================================" + +# Login to ECR to pull staging images +aws ecr-public get-login-password --region us-east-1 | \ + docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 + +# ---- CUDA (default: 13.0) ---- + +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64 +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64 vllm/vllm-openai:latest-x86_64 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 +docker push vllm/vllm-openai:latest-x86_64 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64 vllm/vllm-openai:latest-aarch64 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 +docker push vllm/vllm-openai:latest-aarch64 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 + +docker manifest rm vllm/vllm-openai:latest || true +docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION} || true +docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 +docker manifest push vllm/vllm-openai:latest +docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} + +# ---- CUDA 12.9 ---- + +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129 +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129 vllm/vllm-openai:latest-x86_64-cu129 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 +docker push vllm/vllm-openai:latest-x86_64-cu129 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129 vllm/vllm-openai:latest-aarch64-cu129 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 +docker push vllm/vllm-openai:latest-aarch64-cu129 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 + +docker manifest rm vllm/vllm-openai:latest-cu129 || true +docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION}-cu129 || true +docker manifest create vllm/vllm-openai:latest-cu129 vllm/vllm-openai:latest-x86_64-cu129 vllm/vllm-openai:latest-aarch64-cu129 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 +docker manifest push vllm/vllm-openai:latest-cu129 +docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu129 + +# ---- Ubuntu 24.04 (CUDA 13.0) ---- + +docker pull 
public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-ubuntu2404 +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-ubuntu2404 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-ubuntu2404 vllm/vllm-openai:latest-x86_64-ubuntu2404 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-ubuntu2404 +docker push vllm/vllm-openai:latest-x86_64-ubuntu2404 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-ubuntu2404 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-ubuntu2404 vllm/vllm-openai:latest-aarch64-ubuntu2404 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-ubuntu2404 +docker push vllm/vllm-openai:latest-aarch64-ubuntu2404 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-ubuntu2404 + +docker manifest rm vllm/vllm-openai:latest-ubuntu2404 || true +docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION}-ubuntu2404 || true +docker manifest create vllm/vllm-openai:latest-ubuntu2404 vllm/vllm-openai:latest-x86_64-ubuntu2404 vllm/vllm-openai:latest-aarch64-ubuntu2404 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-ubuntu2404 +docker manifest push vllm/vllm-openai:latest-ubuntu2404 +docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-ubuntu2404 + +# ---- Ubuntu 24.04 (CUDA 12.9) ---- + +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129-ubuntu2404 +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129-ubuntu2404 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129-ubuntu2404 vllm/vllm-openai:latest-x86_64-cu129-ubuntu2404 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129-ubuntu2404 +docker push vllm/vllm-openai:latest-x86_64-cu129-ubuntu2404 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129-ubuntu2404 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129-ubuntu2404 vllm/vllm-openai:latest-aarch64-cu129-ubuntu2404 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129-ubuntu2404 +docker push vllm/vllm-openai:latest-aarch64-cu129-ubuntu2404 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129-ubuntu2404 + +docker manifest rm vllm/vllm-openai:latest-cu129-ubuntu2404 || true +docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION}-cu129-ubuntu2404 || true +docker manifest create vllm/vllm-openai:latest-cu129-ubuntu2404 vllm/vllm-openai:latest-x86_64-cu129-ubuntu2404 vllm/vllm-openai:latest-aarch64-cu129-ubuntu2404 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu129-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129-ubuntu2404 +docker manifest push vllm/vllm-openai:latest-cu129-ubuntu2404 +docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu129-ubuntu2404 + +# ---- ROCm ---- + +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-rocm +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-rocm vllm/vllm-openai-rocm:latest +docker tag 
public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION} +docker push vllm/vllm-openai-rocm:latest +docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION} + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:latest-base +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base +docker push vllm/vllm-openai-rocm:latest-base +docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base + +# ---- CPU ---- +# CPU images are behind separate block steps and may not have been built. +# All-or-nothing: inspect both arches first, then either publish everything +# (per-arch + multi-arch manifest) or skip everything. Publishing only one +# arch would leave `:latest-x86_64` pointing at the new release while the +# `:latest` multi-arch manifest still resolves to the previous release. + +CPU_X86_TAG=public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} +CPU_ARM_TAG=public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} + +CPU_X86_AVAILABLE=false +CPU_ARM_AVAILABLE=false +docker manifest inspect "${CPU_X86_TAG}" >/dev/null 2>&1 && CPU_X86_AVAILABLE=true +docker manifest inspect "${CPU_ARM_TAG}" >/dev/null 2>&1 && CPU_ARM_AVAILABLE=true + +if [ "$CPU_X86_AVAILABLE" = "true" ] && [ "$CPU_ARM_AVAILABLE" = "true" ]; then + docker pull "${CPU_X86_TAG}" + docker tag "${CPU_X86_TAG}" vllm/vllm-openai-cpu:latest-x86_64 + docker tag "${CPU_X86_TAG}" vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 + docker push vllm/vllm-openai-cpu:latest-x86_64 + docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 + + docker pull "${CPU_ARM_TAG}" + docker tag "${CPU_ARM_TAG}" vllm/vllm-openai-cpu:latest-arm64 + docker tag "${CPU_ARM_TAG}" vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64 + docker push vllm/vllm-openai-cpu:latest-arm64 + docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64 + + docker manifest rm vllm/vllm-openai-cpu:latest || true + docker manifest rm vllm/vllm-openai-cpu:v${RELEASE_VERSION} || true + docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64 + docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64 + docker manifest push vllm/vllm-openai-cpu:latest + docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION} +elif [ "$CPU_X86_AVAILABLE" = "false" ] && [ "$CPU_ARM_AVAILABLE" = "false" ]; then + echo "WARNING: Neither CPU image found in ECR, skipping CPU publish (ensure block-cpu-release-image-build and block-arm64-cpu-release-image-build were unblocked and the builds finished pushing)" +else + # Partial state: one arch built, the other did not. Fail loudly rather than + # ship a Docker Hub state where `:latest-${arch}` and `:latest` (multi-arch) + # disagree on which release they point at. + echo "ERROR: Partial CPU build detected (x86_64=${CPU_X86_AVAILABLE}, arm64=${CPU_ARM_AVAILABLE})." + echo " Refusing to publish to avoid split-tag drift between per-arch and multi-arch tags." + echo " Re-run the missing CPU build and retry, or manually publish if a single-arch release is intended." 
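+  # Exit non-zero so this Buildkite step fails visibly; re-running after the
+  # missing CPU build completes will publish both arches together.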
+ exit 1 +fi + +echo "" +echo "Successfully published release images for v${RELEASE_VERSION}" diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh index de48eb282a65..0eadfa1f80b4 100755 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh @@ -51,6 +51,7 @@ vllm serve "$MODEL" \ --offload-num-in-group 2 \ --offload-prefetch-step 1 \ --offload-params w13_weight w2_weight \ + --generation-config vllm \ --port "$PORT" \ ${EXTRA_ARGS+"${EXTRA_ARGS[@]}"} & SERVER_PID=$! diff --git a/.buildkite/scripts/upload-release-wheels-pypi.sh b/.buildkite/scripts/upload-release-wheels-pypi.sh index 058e5bbe4f4c..7e2077a2692c 100644 --- a/.buildkite/scripts/upload-release-wheels-pypi.sh +++ b/.buildkite/scripts/upload-release-wheels-pypi.sh @@ -39,10 +39,11 @@ fi set -x # avoid printing secrets above -# install twine from pypi +# install twine and sdist build prerequisites from pypi python3 -m venv /tmp/vllm-release-env source /tmp/vllm-release-env/bin/activate pip install twine +pip install -r requirements/build/cuda.txt python3 -m twine --version # copy release wheels to local directory diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 844dbe639b3c..bad2796266a7 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -230,7 +230,6 @@ steps: - tests/entrypoints/llm/test_collective_rpc.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py @@ -272,7 +271,6 @@ steps: - tests/v1/worker/test_worker_memory_snapshot.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown @@ -396,8 +394,8 @@ steps: - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 deployment/llm_engine_example.py + - python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 
--enable-chunked-prefill --max-model-len 1536 @@ -590,7 +588,6 @@ steps: - vllm/platforms/rocm.py commands: - pip freeze | grep -E 'torch' - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB - label: Multi-Modal Models (Extended Generation 2) # TBD @@ -621,6 +618,7 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 torch_nightly: true + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -864,7 +862,6 @@ steps: - tests/entrypoints/openai/test_multi_api_servers.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -930,6 +927,7 @@ steps: - tests/renderers - tests/standalone_tests/lazy_imports.py - tests/tokenizers_ + - tests/reasoning - tests/tool_parsers - tests/transformers_utils - tests/config @@ -942,7 +940,7 @@ steps: - pytest -v -s -m 'cpu_test' multimodal - pytest -v -s renderers - pytest -v -s tokenizers_ - - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py --ignore=reasoning/test_gemma4_reasoning_parser.py + - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py - pytest -v -s tool_parsers - pytest -v -s transformers_utils - pytest -v -s config @@ -1173,7 +1171,6 @@ steps: - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s tests/distributed/test_context_parallel.py - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization @@ -1187,7 +1184,6 @@ steps: source_file_dependencies: - vllm/ commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s distributed/test_custom_all_reduce.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' @@ -1207,7 +1203,6 @@ steps: - tests/examples/features/data_parallel/data_parallel_offline.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py @@ -1253,7 +1248,6 @@ steps: - vllm/platforms/rocm.py commands: - export VLLM_USE_RAY_V2_EXECUTOR_BACKEND=1 - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s distributed/test_ray_v2_executor.py - pytest -v -s distributed/test_ray_v2_executor_e2e.py - pytest -v -s distributed/test_pipeline_parallel.py -k "ray" @@ -1275,7 +1269,6 @@ steps: - vllm/v1/worker/gpu_worker.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - torchrun --nproc-per-node=8 ../examples/features/torchrun/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep #-------------------------------------------------------- mi300 · entrypoints --------------------------------------------------------# @@ -1656,8 +1649,8 @@ 
steps: - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 deployment/llm_engine_example.py + - python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 @@ -1803,6 +1796,7 @@ steps: timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] agent_pool: mi300_1 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -1844,6 +1838,7 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] agent_pool: mi300_1 torch_nightly: true + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2204,7 +2199,6 @@ steps: timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] agent_pool: mi300_1 - optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2281,7 +2275,6 @@ steps: - tests/entrypoints/openai/test_multi_api_servers.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -2301,7 +2294,6 @@ steps: - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py @@ -2364,7 +2356,6 @@ steps: - tests/distributed/test_utils - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -2494,7 +2485,6 @@ steps: - tests/entrypoints/llm/test_collective_rpc.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py @@ -2519,7 +2509,6 @@ steps: - 
tests/v1/worker/test_worker_memory_snapshot.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown @@ -2540,7 +2529,6 @@ steps: - tests/distributed/test_multiproc_executor.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py @@ -2628,6 +2616,7 @@ steps: agent_pool: mi325_1 torch_nightly: true parallelism: 2 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2653,6 +2642,7 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 torch_nightly: true + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2718,7 +2708,6 @@ steps: - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/v1/distributed/test_dbo.py @@ -2749,6 +2738,7 @@ steps: agent_pool: mi355_1 fast_check: true torch_nightly: true + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2764,6 +2754,7 @@ steps: agent_pool: mi355_1 fast_check: true torch_nightly: true + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2939,8 +2930,8 @@ steps: - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 deployment/llm_engine_example.py + - python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 @@ -3060,6 +3051,7 @@ steps: timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -3259,6 +3251,7 @@ steps: timeout_in_minutes: 60 mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -3284,6 +3277,7 @@ steps: 
timeout_in_minutes: 60 mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml index 9b1f67705d6e..d3947a03162b 100644 --- a/.buildkite/test_areas/attention.yaml +++ b/.buildkite/test_areas/attention.yaml @@ -17,7 +17,7 @@ steps: - label: V1 attention (B200) key: v1-attention-b200 timeout_in_minutes: 30 - device: b200 + device: b200-k8s source_file_dependencies: - vllm/config/attention.py - vllm/model_executor/layers/attention diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml index 3385f9472ae6..85f804780179 100644 --- a/.buildkite/test_areas/benchmarks.yaml +++ b/.buildkite/test_areas/benchmarks.yaml @@ -14,7 +14,7 @@ steps: - label: Attention Benchmarks Smoke Test (B200) key: attention-benchmarks-smoke-test-b200 - device: b200 + device: b200-k8s num_gpus: 2 optional: true working_dir: "/vllm-workspace/" diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml index 638e5b0eec9b..01248738d519 100644 --- a/.buildkite/test_areas/compile.yaml +++ b/.buildkite/test_areas/compile.yaml @@ -43,7 +43,7 @@ steps: key: asynctp-correctness-tests-b200 timeout_in_minutes: 50 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s optional: true num_devices: 2 commands: @@ -68,7 +68,7 @@ steps: key: fusion-and-compile-unit-tests-2xb200 timeout_in_minutes: 20 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s source_file_dependencies: - csrc/quantization/fp4/ - vllm/model_executor/layers/quantization/ @@ -137,7 +137,7 @@ steps: key: fusion-e2e-config-sweep-b200 timeout_in_minutes: 30 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s num_devices: 1 optional: true commands: @@ -209,7 +209,7 @@ steps: key: fusion-e2e-tp2-b200 timeout_in_minutes: 20 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s num_devices: 2 source_file_dependencies: - csrc/quantization/ diff --git a/.buildkite/test_areas/disaggregated.yaml b/.buildkite/test_areas/disaggregated.yaml index cef2c69d775a..e68b9e1add8b 100644 --- a/.buildkite/test_areas/disaggregated.yaml +++ b/.buildkite/test_areas/disaggregated.yaml @@ -8,7 +8,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt @@ -19,7 +19,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt @@ -31,7 +31,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt @@ -43,7 +43,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - - 
vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt @@ -55,7 +55,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt @@ -67,7 +67,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 2 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py - vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py - vllm/distributed/kv_transfer/kv_connector/v1/offloading/ @@ -83,7 +83,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 2 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - vllm/v1/worker/kv_connector_model_runner_mixin.py - tests/v1/kv_connector/nixl_integration/ commands: @@ -96,7 +96,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 2 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py - vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py - vllm/distributed/kv_transfer/kv_connector/v1/offloading/ diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 6f45a38eeb39..8aa41a9a26ab 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -212,7 +212,7 @@ steps: - label: Distributed Tests (2 GPUs)(B200) key: distributed-tests-2-gpus-b200 - device: b200 + device: b200-k8s optional: true working_dir: "/vllm-workspace/" num_devices: 2 diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml index 8df3b2fd0d9e..bb8aa14eac18 100644 --- a/.buildkite/test_areas/e2e_integration.yaml +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -25,7 +25,7 @@ steps: - label: Qwen3-30B-A3B-FP8-block Accuracy (B200) key: qwen3-30b-a3b-fp8-block-accuracy-b200 timeout_in_minutes: 60 - device: b200 + device: b200-k8s optional: true num_devices: 2 working_dir: "/vllm-workspace" diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml index 17ac44803aaf..cf0f028255d2 100644 --- a/.buildkite/test_areas/engine.yaml +++ b/.buildkite/test_areas/engine.yaml @@ -13,8 +13,9 @@ steps: - tests/test_config - tests/test_logger - tests/test_vllm_port + - tests/test_jit_monitor.py commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py test_jit_monitor.py - label: Engine (1 GPU) key: engine-1-gpu @@ -56,11 +57,6 @@ steps: commands: # Only run tests that need exactly 2 GPUs - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" - mirror: - amd: - device: mi325_2 - depends_on: - - image-build-amd - label: V1 e2e (4 GPUs) key: v1-e2e-4-gpus @@ -73,11 +69,6 @@ steps: commands: # Only run 
tests that need 4 GPUs - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" - mirror: - amd: - device: mi325_4 - depends_on: - - image-build-amd - label: V1 e2e (4xH100) key: v1-e2e-4xh100 diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index 4908df1e4eca..ba92d3a3aec0 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -26,6 +26,11 @@ steps: - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + mirror: + amd: + device: mi300_1 + depends_on: + - image-build-amd - label: Entrypoints Integration (API Server openai - Part 1) key: entrypoints-integration-api-server-openai-part-1 @@ -38,11 +43,6 @@ steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: Entrypoints Integration (API Server openai - Part 2) @@ -57,11 +57,6 @@ steps: - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/openai/speech_to_text/ - pytest -v -s entrypoints/test_chat_utils.py - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: Entrypoints Integration (API Server openai - Part 3) key: entrypoints-integration-api-server-openai-part-3 diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml index b7cd1eae65a4..34e1e4832d9d 100644 --- a/.buildkite/test_areas/kernels.yaml +++ b/.buildkite/test_areas/kernels.yaml @@ -125,7 +125,7 @@ steps: key: kernels-b200 timeout_in_minutes: 30 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s # optional: true source_file_dependencies: - csrc/quantization/fp4/ @@ -212,7 +212,7 @@ steps: - label: Kernels Fp4 MoE Test (B200) key: kernels-fp4-moe-test-b200 timeout_in_minutes: 60 - device: b200 + device: b200-k8s num_devices: 1 optional: true commands: diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml index 67b87ab6921f..e5a163d17c7e 100644 --- a/.buildkite/test_areas/lm_eval.yaml +++ b/.buildkite/test_areas/lm_eval.yaml @@ -51,7 +51,7 @@ steps: - label: LM Eval Qwen3.5 Models (B200) key: lm-eval-qwen3-5-models-b200 timeout_in_minutes: 120 - device: b200 + device: b200-k8s optional: true num_devices: 2 source_file_dependencies: @@ -84,7 +84,7 @@ steps: - label: MoE Refactor Integration Test (B200 - TEMPORARY) key: moe-refactor-integration-test-b200-temporary - device: b200 + device: b200-k8s optional: true num_devices: 2 commands: diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index 3ff96dcbdfb7..877d931a12ef 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -12,11 +12,6 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn # TODO: create another `optional` test group for slow tests - pytest -v -s -m 'not slow_test' v1/spec_decode - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: V1 Sample + Logits key: v1-sample-logits @@ -38,7 +33,7 @@ steps: - pytest -v -s v1/test_outputs.py mirror: amd: 
- device: mi325_1 + device: mi300_1 depends_on: - image-build-amd @@ -67,11 +62,6 @@ steps: # Integration test for streaming correctness (requires special branch). - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: V1 Others (CPU) key: v1-others-cpu @@ -127,8 +117,8 @@ steps: - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 deployment/llm_engine_example.py + - python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 @@ -200,7 +190,7 @@ steps: - pytest -v -s -m 'cpu_test' multimodal - pytest -v -s renderers - pytest -v -s tokenizers_ - - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py --ignore=reasoning/test_gemma4_reasoning_parser.py + - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py - pytest -v -s tool_parsers - pytest -v -s transformers_utils - pytest -v -s config @@ -224,7 +214,7 @@ steps: - label: Batch Invariance (B200) key: batch-invariance-b200 timeout_in_minutes: 30 - device: b200 + device: b200-k8s source_file_dependencies: - vllm/v1/attention - vllm/model_executor/layers diff --git a/.buildkite/test_areas/model_runner_v2.yaml b/.buildkite/test_areas/model_runner_v2.yaml index 6a4338a5e40a..9dfd046289e8 100644 --- a/.buildkite/test_areas/model_runner_v2.yaml +++ b/.buildkite/test_areas/model_runner_v2.yaml @@ -37,7 +37,7 @@ steps: - examples/generate/multimodal/ - examples/features/ - examples/pooling/embed/vision_embedding_offline.py - - examples/others/tensorize_vllm_model.py + - examples/features/tensorize_vllm_model.py commands: - set -x - export VLLM_USE_V2_MODEL_RUNNER=1 @@ -55,8 +55,8 @@ steps: - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 
others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 deployment/llm_engine_example.py + - python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index ab597f960e74..8fca203de44f 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -42,12 +42,6 @@ steps: - tests/models/test_registry.py commands: - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - - label: Basic Models Test (Other CPU) # 5min key: basic-models-test-other-cpu diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml index 64f8fc809d46..b560c5a4769a 100644 --- a/.buildkite/test_areas/models_language.yaml +++ b/.buildkite/test_areas/models_language.yaml @@ -48,6 +48,14 @@ steps: parallelism: 2 mirror: torch_nightly: {} + amd: + device: mi300_1 + depends_on: + - image-build-amd + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' + - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB - label: Language Models Test (Extended Generation) # 80min key: language-models-test-extended-generation @@ -62,15 +70,6 @@ steps: - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - label: Language Models Test (PPL) key: language-models-test-ppl @@ -92,11 +91,6 @@ steps: - tests/models/language/pooling commands: - pytest -v -s models/language/pooling -m 'not core_model' - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: Language Models Test (MTEB) key: language-models-test-mteb diff --git 
a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml index 857cc68f2a92..1f66393df818 100644 --- a/.buildkite/test_areas/models_multimodal.yaml +++ b/.buildkite/test_areas/models_multimodal.yaml @@ -15,7 +15,7 @@ steps: - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model mirror: amd: - device: mi325_1 + device: mi300_1 depends_on: - image-build-amd @@ -33,7 +33,7 @@ steps: - pytest -v -s models/multimodal/generation/test_vit_cudagraph.py -m core_model mirror: amd: - device: mi325_1 + device: mi300_1 depends_on: - image-build-amd @@ -49,7 +49,7 @@ steps: - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model mirror: amd: - device: mi325_1 + device: mi300_1 depends_on: - image-build-amd @@ -64,11 +64,6 @@ steps: - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/generation/test_memory_leak.py --ignore models/multimodal/processing - pytest models/multimodal/generation/test_memory_leak.py -m core_model - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: Multi-Modal Processor (CPU) key: multi-modal-processor-cpu @@ -120,7 +115,7 @@ steps: - pytest -v -s models/multimodal/test_mapping.py mirror: amd: - device: mi325_1 + device: mi300_1 depends_on: - image-build-amd diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml index 0ccca528401b..8a9a36da4481 100644 --- a/.buildkite/test_areas/quantization.yaml +++ b/.buildkite/test_areas/quantization.yaml @@ -25,7 +25,7 @@ steps: key: quantized-moe-test-b200 timeout_in_minutes: 60 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s source_file_dependencies: - tests/quantization/test_blackwell_moe.py - vllm/model_executor/models/deepseek_v2.py diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml index 37f8eaa6883c..48e9f55571e4 100644 --- a/.buildkite/test_areas/samplers.yaml +++ b/.buildkite/test_areas/samplers.yaml @@ -17,7 +17,7 @@ steps: - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers mirror: amd: - device: mi325_1 + device: mi250_1 depends_on: - image-build-amd commands: diff --git a/.buildkite/test_areas/spec_decode.yaml b/.buildkite/test_areas/spec_decode.yaml index 726043769c91..5253f54735aa 100644 --- a/.buildkite/test_areas/spec_decode.yaml +++ b/.buildkite/test_areas/spec_decode.yaml @@ -75,7 +75,7 @@ steps: - label: Spec Decode Draft Model Nightly B200 key: spec-decode-draft-model-nightly-b200 timeout_in_minutes: 30 - device: b200 + device: b200-k8s optional: true source_file_dependencies: - vllm/v1/spec_decode/ diff --git a/.github/mergify.yml b/.github/mergify.yml index de3c76fd458b..2d36e3507028 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -477,9 +477,7 @@ pull_request_rules: conditions: - label != stale - or: - - files~=^examples/online_serving/disaggregated[^/]*/.* - - files~=^examples/offline_inference/disaggregated[^/]*/.* - - files~=^examples/others/lmcache/ + - files~=^examples/disaggregated/ - files~=^tests/v1/kv_connector/ - 
files~=^vllm/distributed/kv_transfer/ - title~=(?i)\bP/?D\b diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 33b1db69dec4..f1e0afebf213 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -131,6 +131,16 @@ repos: --python-version, "3.12", ] files: ^requirements/(common|xpu|test/xpu)\.(in|txt)$ + - id: pip-compile + alias: pip-compile-docs + name: pip-compile-docs + args: [ + requirements/docs.in, + -o, requirements/docs.txt, + --python-platform, x86_64-manylinux_2_28, + --python-version, "3.12", + ] + files: ^requirements/docs\.(in|txt)$ - repo: local hooks: - id: format-torch-nightly-test diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 1e479fd03d91..f372a3fb8cc9 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -9,7 +9,6 @@ build: python: "3.12" jobs: post_checkout: - # - bash docs/maybe_skip_pr_build.sh - git fetch origin main --unshallow --no-tags --filter=blob:none || true pre_create_environment: - pip install uv diff --git a/CMakeLists.txt b/CMakeLists.txt index bf4ac05e4f29..13788fa87437 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -307,12 +307,12 @@ set(VLLM_EXT_SRC "csrc/quantization/activation_kernels.cu" "csrc/cuda_utils_kernels.cu" "csrc/custom_all_reduce.cu" - "csrc/torch_bindings.cpp") + "csrc/torch_bindings.cpp" + "csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu") if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC - "csrc/minimax_reduce_rms_kernel.cu" - "csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu") + "csrc/minimax_reduce_rms_kernel.cu") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") @@ -1047,13 +1047,13 @@ endif() set(VLLM_MOE_EXT_SRC "csrc/moe/torch_bindings.cpp" "csrc/moe/moe_align_sum_kernels.cu" - "csrc/moe/topk_softmax_kernels.cu") + "csrc/moe/topk_softmax_kernels.cu" + "csrc/moe/topk_softplus_sqrt_kernels.cu") if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu" - "csrc/moe/grouped_topk_kernels.cu" - "csrc/moe/topk_softplus_sqrt_kernels.cu") + "csrc/moe/grouped_topk_kernels.cu") endif() if(VLLM_GPU_LANG STREQUAL "CUDA") diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index 3e3c79217982..2f56099c66fd 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -1473,6 +1473,12 @@ async def main() -> None: "(for example: --warmup-percentages=0%%,50%%)", ) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code when loading the tokenizer.", + ) + args = parser.parse_args() logger.info(args) @@ -1515,7 +1521,9 @@ async def main() -> None: np.random.seed(args.seed) logger.info("Loading tokenizer") - tokenizer = AutoTokenizer.from_pretrained(args.model) + tokenizer = AutoTokenizer.from_pretrained( + args.model, trust_remote_code=args.trust_remote_code + ) await get_server_info(args.url) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 8535186cc1ec..d27a5ea93dea 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -32,18 +32,23 @@ else() "-DVLLM_CPU_EXTENSION") # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0) - # and create a local shim dir with it + # and create a local shim dir with it. 
When PyTorch is built from source or packaged + # by a distro (common on RISC-V, s390x, Fedora/RHEL aarch64), no vendored libgomp + # exists and the shim dir is empty; fall back to the system libgomp in that case. vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR) - find_library(OPEN_MP - NAMES gomp - PATHS ${VLLM_TORCH_GOMP_SHIM_DIR} - NO_DEFAULT_PATH - REQUIRED - ) - # Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch - if (OPEN_MP) + if(VLLM_TORCH_GOMP_SHIM_DIR) + find_library(OPEN_MP + NAMES gomp + PATHS "${VLLM_TORCH_GOMP_SHIM_DIR}" + NO_DEFAULT_PATH + REQUIRED + ) + # Use the same libgomp as PyTorch at runtime set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}") + else() + # Fall back to system / toolchain libgomp + find_library(OPEN_MP NAMES gomp REQUIRED) endif() endif() @@ -321,14 +326,6 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND set(ONEDNN_VERBOSE "ON") set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) - # TODO: Refactor this - if (ENABLE_X86_ISA) - # Note: only enable oneDNN for AVX512 - list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}) - else() - list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS}) - endif() - set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE}) set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size FetchContent_MakeAvailable(oneDNN) @@ -341,8 +338,14 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND PRIVATE ${oneDNN_SOURCE_DIR}/src ) target_link_libraries(dnnl_ext dnnl torch) - target_compile_options(dnnl_ext PRIVATE ${DNNL_COMPILE_FLAGS} -fPIC) + if (ENABLE_X86_ISA) + target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS_AVX2} -fPIC) + else() + target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC) + endif() list(APPEND LIBS dnnl_ext) + + set(USE_ONEDNN ON) else() set(USE_ONEDNN OFF) @@ -406,12 +409,15 @@ endif() if (ENABLE_X86_ISA) set(VLLM_EXT_SRC_SGL + "csrc/cpu/sgl-kernels/fla.cpp" + "csrc/cpu/sgl-kernels/conv.cpp" "csrc/cpu/sgl-kernels/gemm.cpp" "csrc/cpu/sgl-kernels/gemm_int8.cpp" "csrc/cpu/sgl-kernels/gemm_fp8.cpp" "csrc/cpu/sgl-kernels/gemm_int4.cpp" "csrc/cpu/sgl-kernels/moe.cpp" "csrc/cpu/sgl-kernels/moe_int8.cpp" + "csrc/cpu/sgl-kernels/moe_int4.cpp" "csrc/cpu/sgl-kernels/moe_fp8.cpp") set(VLLM_EXT_SRC_AVX512 @@ -430,10 +436,11 @@ if (ENABLE_X86_ISA) "csrc/cpu/pos_encoding.cpp" "csrc/moe/dynamic_4bit_int_moe_cpu.cpp") - set(VLLM_EXT_SRC_AVX2 + set(VLLM_EXT_SRC_AVX2 "csrc/cpu/utils.cpp" "csrc/cpu/spec_decode_utils.cpp" "csrc/cpu/cpu_attn.cpp" + "csrc/cpu/dnnl_kernels.cpp" "csrc/cpu/torch_bindings.cpp" # TODO: Remove these files "csrc/cpu/activation.cpp" @@ -448,7 +455,7 @@ if (ENABLE_X86_ISA) set(_C_LIBS numa dnnl_ext) set(_C_AVX512_LIBS numa dnnl_ext) - set(_C_AVX2_LIBS numa) + set(_C_AVX2_LIBS numa dnnl_ext) # AMX + AVX512F + AVX512BF16 + AVX512VNNI define_extension_target( diff --git a/cmake/external_projects/deepgemm.cmake b/cmake/external_projects/deepgemm.cmake index b821b90ec8e9..0d7ea43fb7d0 100644 --- a/cmake/external_projects/deepgemm.cmake +++ b/cmake/external_projects/deepgemm.cmake @@ -59,26 +59,11 @@ if(DEEPGEMM_ARCHS) # Build the _C pybind11 extension from DeepGEMM's C++ source. # This is a CXX-only module — CUDA kernels are JIT-compiled at runtime. # - # Free-threaded Python doesn't yet support the stable ABI, so skip USE_SABI - # there. (The other vLLM extensions get this guard for free via - # define_extension_target; this target uses raw Python_add_library.) 
-  run_python(IS_FREETHREADED_PYTHON
-    "import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
-    "Failed to determine whether interpreter is free-threaded")
-  if (NOT IS_FREETHREADED_PYTHON)
-    Python_add_library(_deep_gemm_C MODULE WITH_SOABI USE_SABI 3
-      "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp")
-  else()
-    Python_add_library(_deep_gemm_C MODULE WITH_SOABI
-      "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp")
-  endif()
+  Python_add_library(_deep_gemm_C MODULE WITH_SOABI
+    "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp")
 
   # The pybind11 module name must be _C to match DeepGEMM's Python imports.
-  # Place the build artifact in a subdir so it doesn't collide with vLLM's own
-  # `_C.abi3.so` in the build tree (the install destination still differs).
-  set_target_properties(_deep_gemm_C PROPERTIES
-    OUTPUT_NAME "_C"
-    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/deep_gemm")
+  set_target_properties(_deep_gemm_C PROPERTIES OUTPUT_NAME "_C")
 
   target_compile_definitions(_deep_gemm_C PRIVATE
     "-DTORCH_EXTENSION_NAME=_C")
@@ -90,15 +75,11 @@
     "${deepgemm_SOURCE_DIR}/third-party/cutlass/tools/util/include"
     "${deepgemm_SOURCE_DIR}/third-party/fmt/include")
 
-  # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
-  # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
   target_compile_options(_deep_gemm_C PRIVATE
     $<$<COMPILE_LANGUAGE:CXX>:-std=c++17>
     $<$<COMPILE_LANGUAGE:CXX>:-O3>
     $<$<COMPILE_LANGUAGE:CXX>:-Wno-psabi>
-    $<$<COMPILE_LANGUAGE:CXX>:-Wno-deprecated-declarations>
-    $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>
-    $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>)
+    $<$<COMPILE_LANGUAGE:CXX>:-Wno-deprecated-declarations>)
 
   # torch_python is required because DeepGEMM uses pybind11 type casters
   # for at::Tensor (via PYBIND11_MODULE), unlike vLLM's own extensions which
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 7e456d32598b..895490f45a79 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -97,13 +97,13 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  // Use cuMemcpyBatchAsync (CUDA 12.8+) to submit all copies in a single
-  // driver call, amortizing per-copy submission overhead.
-  // int64_t and CUdeviceptr/size_t are both 8 bytes on 64-bit platforms,
-  // so we reinterpret_cast the tensor data directly to avoid copies.
-  static_assert(sizeof(CUdeviceptr) == sizeof(int64_t));
+  // Use cuMemcpyBatchAsync / hipMemcpyBatchAsync to submit all copies in a
+  // single driver call, amortizing per-copy submission overhead. int64_t
+  // and CUdeviceptr/void*/size_t are all 8 bytes on 64-bit platforms, so we
+  // reinterpret_cast the tensor data directly to avoid copies.
   static_assert(sizeof(size_t) == sizeof(int64_t));
 #if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION >= 12080
+  static_assert(sizeof(CUdeviceptr) == sizeof(int64_t));
   // Resolve cuMemcpyBatchAsync at runtime via cuGetProcAddress so that
   // binaries compiled with CUDA 12.8+ still work on older drivers, and
   // we avoid the CUDA 13.0 header remapping (#define to _v2 signature).
@@ -134,12 +134,30 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
                            &fail_idx, static_cast<CUstream>(stream));
     TORCH_CHECK(result == CUDA_SUCCESS, "cuMemcpyBatchAsync failed at index ",
                 fail_idx, " with error ", result);
-  } else
+    return;
+  }
+#elif defined(USE_ROCM) && defined(HIP_VERSION) && HIP_VERSION >= 70100000
+  // ROCm 7.1+ exposes hipMemcpyBatchAsync. The 7.2.1 implementation early-
+  // returns hipErrorNotSupported whenever numAttrs > 0 (see ROCm/clr @
+  // rocm-7.2.1 hipamd/src/hip_memory.cpp:2819-2822), so call with
+  // numAttrs=0.
+  {
+    hipMemcpyAttributes attr = {};
+    size_t attrs_idx = 0;
+    size_t fail_idx = 0;
+    hipError_t result = hipMemcpyBatchAsync(
+        reinterpret_cast<void**>(dst_data), reinterpret_cast<void**>(src_data),
+        reinterpret_cast<size_t*>(size_data), static_cast<size_t>(n), &attr,
+        &attrs_idx, 0, &fail_idx, static_cast<hipStream_t>(stream));
+    TORCH_CHECK(result == hipSuccess, "hipMemcpyBatchAsync failed at index ",
+                fail_idx, " with error ", result);
+    return;
+  }
 #endif
   {
-    // Fallback for CUDA < 12.8, older drivers, and ROCm:
-    // individual async copies.
-    // cudaMemcpyDefault lets the driver infer direction from pointer types.
+    // Fallback for CUDA < 12.8, older CUDA drivers, and ROCm < 7.1:
+    // individual async copies. cudaMemcpyDefault lets the driver infer
+    // direction from pointer types.
     for (int64_t i = 0; i < n; i++) {
       cudaMemcpyAsync(reinterpret_cast<void*>(dst_data[i]),
                       reinterpret_cast<const void*>(src_data[i]),
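The batched path amortizes per-copy submission overhead, and the `cuGetProcAddress` indirection keeps the binary loadable on drivers that predate the symbol. Below is a minimal host-side sketch of that resolve-or-fall-back pattern, illustrative only: `copy_blocks_sketch` and `resolve_batch_memcpy` are hypothetical names, the function-pointer signature and the `CUmemcpyAttributes` field are assumptions based on the CUDA 12.8 driver API, and `cuGetProcAddress` is shown in its five-argument CUDA 12 `_v2` form.

```cpp
// Sketch of "resolve the batch-copy symbol at runtime, else loop" (hypothetical).
#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdint>

using PFN_MemcpyBatch = CUresult (*)(CUdeviceptr*, CUdeviceptr*, size_t*,
                                     size_t, CUmemcpyAttributes*, size_t*,
                                     size_t, size_t*, CUstream);

static PFN_MemcpyBatch resolve_batch_memcpy() {
  // Succeeds only if the *loaded driver* exports the symbol, so a binary
  // built against CUDA 12.8 headers still runs on older drivers.
  void* fn = nullptr;
  CUdriverProcAddressQueryResult status;
  if (cuGetProcAddress("cuMemcpyBatchAsync", &fn, 12080,
                       CU_GET_PROC_ADDRESS_DEFAULT, &status) != CUDA_SUCCESS) {
    return nullptr;
  }
  return reinterpret_cast<PFN_MemcpyBatch>(fn);
}

void copy_blocks_sketch(int64_t* dst, int64_t* src, int64_t* sizes, size_t n,
                        cudaStream_t stream) {
  static PFN_MemcpyBatch batch = resolve_batch_memcpy();  // resolved once
  if (batch != nullptr) {
    CUmemcpyAttributes attrs = {};
    attrs.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;
    size_t attrs_idx = 0, fail_idx = 0;
    // One driver call submits all n copies.
    batch(reinterpret_cast<CUdeviceptr*>(dst),
          reinterpret_cast<CUdeviceptr*>(src),
          reinterpret_cast<size_t*>(sizes), n, &attrs, &attrs_idx, 1,
          &fail_idx, static_cast<CUstream>(stream));
    return;
  }
  // Fallback: one submission per copy; cudaMemcpyDefault infers direction.
  for (size_t i = 0; i < n; i++) {
    cudaMemcpyAsync(reinterpret_cast<void*>(dst[i]),
                    reinterpret_cast<const void*>(src[i]),
                    static_cast<size_t>(sizes[i]), cudaMemcpyDefault, stream);
  }
}
```

Caching the resolved pointer in a function-local `static` keeps the lookup off the per-call path.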
diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp
index 18afe4b7925c..4750dd78838d 100644
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@@ -29,6 +29,8 @@ torch::Tensor get_scheduler_metadata(
     isa = cpu_attention::ISA::NEON;
   } else if (isa_hint == "vxe") {
     isa = cpu_attention::ISA::VXE;
+  } else if (isa_hint == "vsx") {
+    isa = cpu_attention::ISA::VSX;
   } else {
     TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint);
   }
@@ -129,6 +131,8 @@ void cpu_attn_reshape_and_cache(
     return cpu_attention::ISA::NEON;
   } else if (isa == "vxe") {
     return cpu_attention::ISA::VXE;
+  } else if (isa == "vsx") {
+    return cpu_attention::ISA::VSX;
   } else {
     TORCH_CHECK(false, "Invalid ISA type: " + isa);
   }
diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp
index f5b473bd262a..b9987fb26c19 100644
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -12,7 +12,7 @@
 #include "cpu/utils.hpp"
 
 namespace cpu_attention {
-enum class ISA { AMX, VEC, VEC16, NEON, VXE };
+enum class ISA { AMX, VEC, VEC16, NEON, VXE, VSX };
 
 // Mirrors csrc/attention/dtype_fp8.cuh Fp8KVCacheDataType exactly.
 enum class Fp8KVCacheDataType {
@@ -164,6 +164,9 @@ struct AttentionMetadata {
       case ISA::VXE:
         ss << "VXE, ";
         break;
+      case ISA::VSX:
+        ss << "VSX, ";
+        break;
     }
     ss << "workitem_group_num: " << workitem_group_num
        << ", reduction_item_num: " << reduction_item_num
diff --git a/csrc/cpu/cpu_attn_vec.hpp b/csrc/cpu/cpu_attn_vec.hpp
index 61cae12d67da..c3983e0578a5 100644
--- a/csrc/cpu/cpu_attn_vec.hpp
+++ b/csrc/cpu/cpu_attn_vec.hpp
@@ -27,8 +27,8 @@ FORCE_INLINE std::pair<vec_op::FP32Vec16, vec_op::FP32Vec16> load_b_pair_vec(
     return {vec_op::FP32Vec16(bf16_b_reg, 0), vec_op::FP32Vec16(bf16_b_reg, 1)};
   } else {
     using load_vec_t = typename VecTypeTrait<scalar_t>::vec_t;
-    return {vec_op::FP32Vec16(load_vec_t(ptr)),
-            vec_op::FP32Vec16(load_vec_t(ptr + 16))};
+    return std::make_pair(vec_op::FP32Vec16(load_vec_t(ptr)),
+                          vec_op::FP32Vec16(load_vec_t(ptr + 16)));
   }
 }
diff --git a/csrc/cpu/cpu_attn_vsx.hpp b/csrc/cpu/cpu_attn_vsx.hpp
new file mode 100644
index 000000000000..c7e1502bcb05
--- /dev/null
+++ b/csrc/cpu/cpu_attn_vsx.hpp
@@ -0,0 +1,359 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#ifndef CPU_ATTN_VSX_HPP
+#define CPU_ATTN_VSX_HPP
+
+#include "cpu_attn_impl.hpp"
+#include <altivec.h>
+#include <cstdint>
+
+namespace cpu_attention {
+
+namespace {
+
+// ppc64le Vector = 16 bytes (128 bits)
+#define BLOCK_SIZE_ALIGNMENT 32
+#define HEAD_SIZE_ALIGNMENT 32
+#define MAX_Q_HEAD_NUM_PER_ITER 16
+
+template <typename kv_cache_t>
+FORCE_INLINE void load_row8_B_as_f32(const kv_cache_t* p, __vector float& b0,
+                                     __vector float& b1);
+
+// [1] Float Specialization
+template <>
+FORCE_INLINE void load_row8_B_as_f32(const float* p, __vector float& b0,
+                                     __vector float& b1) {
+  b0 = vec_xl(0, const_cast<float*>(p));
+  b1 = vec_xl(0, const_cast<float*>(p + 4));
+}
+
+// [2] BFloat16 Specialization (Little Endian ppc64le)
+// On ppc64le (LE): BF16 bits should land in the HIGH 16 bits of each float32.
+// Byte layout of float32 on LE: [byte0(LSB), byte1, byte2, byte3(MSB)]
+// We need BF16 in bytes2-3 (high half) with bytes0-1 zeroed.
+// vec_mergeh on LE interleaves elements 0..3: result_i = {a[i], b[i]}
+// So vec_mergeh(zeros_u16, raw_u16) gives for each uint16 pair:
+//   uint16[2i]   = zeros[i] -> low 16 bits of uint32  -> zeroed mantissa LSBs
+//   uint16[2i+1] = raw[i]   -> high 16 bits of uint32 -> BF16 bits
+// Cast to float32 gives exactly (bf16_bits << 16) per element.
+template <>
+FORCE_INLINE void load_row8_B_as_f32(const c10::BFloat16* p,
+                                     __vector float& b0,
+                                     __vector float& b1) {
+  __vector unsigned short raw = vec_xl(
+      0, reinterpret_cast<unsigned short*>(const_cast<c10::BFloat16*>(p)));
+  __vector unsigned short zeros = vec_splat_u16(0);
+
+  // LE: zeros in low 16 bits, raw in high 16 bits → bf16 << 16 == float32
+  b0 = (__vector float)vec_mergeh(zeros, raw);
+  b1 = (__vector float)vec_mergel(zeros, raw);
+}
+
+// Note: c10::Half (FP16) is not supported on PowerPC architecture
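To see why the zero-interleave in the BF16 specialization is exact rather than approximate, here is a scalar model of the same widening (illustrative, not part of the patch; `bf16_bits_to_float` is a hypothetical name):

```cpp
// A bfloat16 is the top 16 bits of an IEEE-754 float32, so placing its bit
// pattern in the high half of a uint32 and zeroing the low half reconstructs
// the (rounded) float exactly.
#include <cstdint>
#include <cstring>
#include <cassert>

float bf16_bits_to_float(uint16_t bf16_bits) {
  uint32_t f32_bits = static_cast<uint32_t>(bf16_bits) << 16;  // high half
  float out;
  std::memcpy(&out, &f32_bits, sizeof(out));  // bit-cast, no conversion
  return out;
}

int main() {
  // 0x3F80 is bfloat16 1.0; shifting left by 16 gives 0x3F800000 == 1.0f.
  assert(bf16_bits_to_float(0x3F80) == 1.0f);
  // On little-endian ppc64le the SIMD version reaches the same byte layout
  // by interleaving a zero uint16 below each BF16 lane with vec_mergeh/l.
  return 0;
}
```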
+
+template <int32_t M, typename kv_cache_t>
+FORCE_INLINE void gemm_micro_ppc64le_Mx8_Ku4(
+    const float* __restrict A,       // [M x K]
+    const kv_cache_t* __restrict B,  // [K x 8]
+    float* __restrict C,             // [M x 8]
+    int64_t lda, int64_t ldb, int64_t ldc, int32_t K, bool accumulate) {
+  static_assert(1 <= M && M <= 8, "M must be in [1,8]");
+
+#define ROWS_APPLY(OP) OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7)
+#define IF_M(i) if constexpr (M > (i))
+
+  // 1. Define A pointers
+#define DECL_A(i) const float* a##i = A + (i) * lda;
+  ROWS_APPLY(DECL_A)
+#undef DECL_A
+
+  // 2. Define Accumulators (2 vectors cover 8 columns)
+#define DECL_ACC(i) __vector float acc##i##_0, acc##i##_1;
+  ROWS_APPLY(DECL_ACC)
+#undef DECL_ACC
+
+  // 3. Initialize Accumulators (Load C or Zero)
+#define INIT_ACC(i)                                                   \
+  IF_M(i) {                                                           \
+    if (accumulate) {                                                 \
+      acc##i##_0 = vec_xl(0, const_cast<float*>(C + (i) * ldc + 0));  \
+      acc##i##_1 = vec_xl(0, const_cast<float*>(C + (i) * ldc + 4));  \
+    } else {                                                          \
+      acc##i##_0 = vec_splats(0.0f);                                  \
+      acc##i##_1 = vec_splats(0.0f);                                  \
+    }                                                                 \
+  }
+  ROWS_APPLY(INIT_ACC)
+#undef INIT_ACC
+
+  int32_t k = 0;
+
+  for (; k + 3 < K; k += 4) {
+    // Load 4 values of A for each Row M: A[k...k+3]
+#define LOAD_A4(i)        \
+  __vector float a##i##v; \
+  IF_M(i) a##i##v = vec_xl(0, const_cast<float*>(a##i + k));
+    ROWS_APPLY(LOAD_A4)
+#undef LOAD_A4
+
+    // FMA for specific lane L of A
+    // ppc64le: vec_madd(b, vec_splat(a, lane), acc)
+#define FMAS_LANE(i, aiv, L)                        \
+  IF_M(i) {                                         \
+    __vector float a_broad = vec_splat(aiv, L);     \
+    acc##i##_0 = vec_madd(b0, a_broad, acc##i##_0); \
+    acc##i##_1 = vec_madd(b1, a_broad, acc##i##_1); \
+  }
+
+    // Unroll K=0..3
+    {
+      __vector float b0, b1;
+      load_row8_B_as_f32(B + (int64_t)(k + 0) * ldb, b0, b1);
+#define STEP_K0(i) FMAS_LANE(i, a##i##v, 0)
+      ROWS_APPLY(STEP_K0)
+#undef STEP_K0
+    }
+    {
+      __vector float b0, b1;
+      load_row8_B_as_f32(B + (int64_t)(k + 1) * ldb, b0, b1);
+#define STEP_K1(i) FMAS_LANE(i, a##i##v, 1)
+      ROWS_APPLY(STEP_K1)
+#undef STEP_K1
+    }
+    {
+      __vector float b0, b1;
+      load_row8_B_as_f32(B + (int64_t)(k + 2) * ldb, b0, b1);
+#define STEP_K2(i) FMAS_LANE(i, a##i##v, 2)
+      ROWS_APPLY(STEP_K2)
+#undef STEP_K2
+    }
+    {
+      __vector float b0, b1;
+      load_row8_B_as_f32(B + (int64_t)(k + 3) * ldb, b0, b1);
+#define STEP_K3(i) FMAS_LANE(i, a##i##v, 3)
+      ROWS_APPLY(STEP_K3)
+#undef STEP_K3
+    }
+#undef FMAS_LANE
+  }
+
+  for (; k < K; ++k) {
+    __vector float b0, b1;
+    load_row8_B_as_f32(B + (int64_t)k * ldb, b0, b1);
+#define TAIL_ROW(i)                              \
+  IF_M(i) {                                      \
+    __vector float ai = vec_splats(*(a##i + k)); \
+    acc##i##_0 = vec_madd(b0, ai, acc##i##_0);   \
+    acc##i##_1 = vec_madd(b1, ai, acc##i##_1);   \
+  }
+    ROWS_APPLY(TAIL_ROW)
+#undef TAIL_ROW
+  }
+
+#define STORE_ROW(i)                           \
+  IF_M(i) {                                    \
+    vec_xst(acc##i##_0, 0, C + (i) * ldc + 0); \
+    vec_xst(acc##i##_1, 0, C + (i) * ldc + 4); \
+  }
+  ROWS_APPLY(STORE_ROW)
+#undef STORE_ROW
+
+#undef ROWS_APPLY
+#undef IF_M
+}
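The lane-splat micro-kernel is easier to audit against a plain reference: each `vec_splat(a, lane)` broadcast followed by two `vec_madd`s corresponds to one iteration of the inner `n` loop below, and the K-by-4 unrolling only reorders those accumulations. A hypothetical scalar equivalent (`gemm_micro_ref`, shown only to make the data flow explicit, with B pre-widened to float):

```cpp
// Reference model of the Mx8 micro-kernel: C[M x 8] (+)= A[M x K] * B[K x 8].
#include <cstdint>

template <int M>
void gemm_micro_ref(const float* A, const float* B, float* C, int64_t lda,
                    int64_t ldb, int64_t ldc, int32_t K, bool accumulate) {
  static_assert(1 <= M && M <= 8, "M must be in [1,8]");
  for (int m = 0; m < M; ++m) {
    float acc[8] = {};  // mirrors acc##m##_0 / acc##m##_1
    if (accumulate)
      for (int n = 0; n < 8; ++n) acc[n] = C[m * ldc + n];
    for (int32_t k = 0; k < K; ++k) {
      const float a = A[m * lda + k];  // the broadcast lane
      for (int n = 0; n < 8; ++n)      // two 4-wide VSX FMAs in the kernel
        acc[n] += a * B[k * ldb + n];
    }
    for (int n = 0; n < 8; ++n) C[m * ldc + n] = acc[n];
  }
}
```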
+
+template <int32_t N, typename kv_cache_t>
+FORCE_INLINE void gemm_macro_ppc64le_Mx8_Ku4(const float* __restrict A,
+                                             const kv_cache_t* __restrict B,
+                                             float* __restrict C, int32_t M,
+                                             int32_t K, int64_t lda,
+                                             int64_t ldb, int64_t ldc,
+                                             bool accumulate) {
+  static_assert(N % 8 == 0, "N must be a multiple of 8");
+  for (int32_t m = 0; m < M;) {
+    int32_t mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1;
+    const float* Ab = A + m * lda;
+    float* Cb = C + m * ldc;
+
+    for (int32_t n = 0; n < N; n += 8) {
+      const kv_cache_t* Bn = B + n;
+      float* Cn = Cb + n;
+      switch (mb) {
+        case 8:
+          gemm_micro_ppc64le_Mx8_Ku4<8, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                    K, accumulate);
+          break;
+        case 4:
+          gemm_micro_ppc64le_Mx8_Ku4<4, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                    K, accumulate);
+          break;
+        case 2:
+          gemm_micro_ppc64le_Mx8_Ku4<2, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                    K, accumulate);
+          break;
+        default:
+          gemm_micro_ppc64le_Mx8_Ku4<1, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                    K, accumulate);
+          break;
+      }
+    }
+    m += mb;
+  }
+}
+
+template <typename kv_cache_t>
+class TileGemmPPC64 {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size,
+                                float* __restrict__ a_tile,
+                                kv_cache_t* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    if constexpr (phase == AttentionGemmPhase::QK) {
+      gemm_macro_ppc64le_Mx8_Ku4(
+          a_tile, b_tile, c_tile, m_size, k_size, lda, ldb, ldc, accum_c);
+    } else {
+      gemm_macro_ppc64le_Mx8_Ku4(
+          a_tile, b_tile, c_tile, m_size, dynamic_k_size, lda, ldb, ldc,
+          accum_c);
+    }
+  }
+};
+
+}  // namespace
+
+template <typename scalar_t, int64_t head_dim>
+class AttentionImpl {
+ public:
+  using query_t = scalar_t;
+  using q_buffer_t = float;
+  using kv_cache_t = scalar_t;
+  using logits_buffer_t = float;
+  using partial_output_buffer_t = float;
+  using prob_buffer_t = float;
+
+  constexpr static int64_t BlockSizeAlignment = BLOCK_SIZE_ALIGNMENT;
+  constexpr static int64_t HeadDimAlignment = HEAD_SIZE_ALIGNMENT;
+  constexpr static int64_t MaxQHeadNumPerIteration = MAX_Q_HEAD_NUM_PER_ITER;
+  constexpr static int64_t HeadDim = head_dim;
+  constexpr static ISA ISAType = ISA::VSX;
+  constexpr static bool scale_on_logits =
+      false;  // Scale is applied to Q during copy
+
+ public:
+  AttentionImpl() {}
+
+  template
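To make the row blocking in `gemm_macro_ppc64le_Mx8_Ku4` concrete: the loop greedily takes 8 rows while it can, then 4, 2, and finally 1, so every value of M is covered by the four instantiated micro-kernels. A small standalone demonstration (hypothetical `main`, not part of the patch):

```cpp
// Greedy 8/4/2/1 row-block selection, as used by the macro kernel above.
#include <cstdio>

int main() {
  int M = 13;
  for (int m = 0; m < M;) {
    int mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1;
    std::printf("rows [%d, %d)\n", m, m + mb);  // blocks of 8, 4, 1 for M = 13
    m += mb;
  }
  return 0;
}
```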