diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index a1de41652c9a..8c6ef7817aaf 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,24 +1,22 @@
 steps:
   # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
   - label: "Build arm64 wheel - CUDA 12.9"
+    depends_on: ~
     id: build-wheel-arm64-cuda-12-9
     agents:
       queue: arm64_cpu_queue_postmerge
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build CUDA 12.8 wheel"
-    key: block-build-cu128-wheel
-
   - label: "Build wheel - CUDA 12.8"
-    depends_on: block-build-cu128-wheel
+    depends_on: ~
     id: build-wheel-cuda-12-8
     agents:
       queue: cpu_queue_postmerge
@@ -30,12 +28,8 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build CUDA 12.6 wheel"
-    key: block-build-cu126-wheel
-    depends_on: ~
-
   - label: "Build wheel - CUDA 12.6"
-    depends_on: block-build-cu126-wheel
+    depends_on: ~
     id: build-wheel-cuda-12-6
     agents:
       queue: cpu_queue_postmerge
@@ -102,8 +96,6 @@ steps:
     depends_on:
       - create-multi-arch-manifest
       - build-wheel-cuda-12-8
-      - build-wheel-cuda-12-6
-      - build-wheel-cuda-12-9
     id: annotate-release-workflow
     agents:
       queue: cpu_queue_postmerge
diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
index 94e0ac2398f3..fde48603ad3c 100755
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -14,18 +14,33 @@
 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel:
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`
 
 To download and upload the image:
 \`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker push vllm/vllm-openai:latest
-docker push vllm/vllm-openai:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai:latest-x86_64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker push vllm/vllm-openai:latest-aarch64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest push vllm/vllm-openai:latest
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
 EOF
\ No newline at end of file
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 307e9658f717..d4761e84f28e 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -196,6 +196,7 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
 
 # Flag to control whether to use pre-built vLLM wheels
 ARG VLLM_USE_PRECOMPILED=""
+ARG VLLM_MAIN_CUDA_VERSION=""
 
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -213,6 +214,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     && export SCCACHE_IDLE_TIMEOUT=0 \
     && export CMAKE_BUILD_TYPE=Release \
     && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
+    && export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
     && export VLLM_DOCKER_BUILD_CONTEXT=1 \
     && sccache --show-stats \
     && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
diff --git a/setup.py b/setup.py
index eb313b7d219c..67f65d9b9f56 100644
--- a/setup.py
+++ b/setup.py
@@ -56,8 +56,6 @@ def load_module_from_path(module_name, path):
     # fallback to cpu
     VLLM_TARGET_DEVICE = "cpu"
 
-MAIN_CUDA_VERSION = "12.8"
-
 
 def is_sccache_available() -> bool:
     return which("sccache") is not None and \
@@ -507,7 +505,7 @@ def get_vllm_version() -> str:
             version += f"{sep}precompiled"
         else:
             cuda_version = str(get_nvcc_cuda_version())
-            if cuda_version != MAIN_CUDA_VERSION:
+            if cuda_version != envs.VLLM_MAIN_CUDA_VERSION:
                 cuda_version_str = cuda_version.replace(".", "")[:3]
                 # skip this for source tarball, required for pypi
                 if "sdist" not in sys.argv:
@@ -515,7 +513,7 @@
     elif _is_hip():
         # Get the Rocm Version
         rocm_version = get_rocm_version() or torch.version.hip
-        if rocm_version and rocm_version != MAIN_CUDA_VERSION:
+        if rocm_version and rocm_version != envs.VLLM_MAIN_CUDA_VERSION:
             version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
     elif _is_tpu():
         version += f"{sep}tpu"
diff --git a/vllm/envs.py b/vllm/envs.py
index bb10c7cc2ac2..10ac28413bbc 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -70,6 +70,7 @@
     VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
     VLLM_MM_INPUT_CACHE_GIB: int = 4
     VLLM_TARGET_DEVICE: str = "cuda"
+    VLLM_MAIN_CUDA_VERSION: str = "12.8"
     MAX_JOBS: Optional[str] = None
     NVCC_THREADS: Optional[str] = None
     VLLM_USE_PRECOMPILED: bool = False
@@ -247,6 +248,11 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_TARGET_DEVICE":
     lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(),
 
+    # Main CUDA version of vLLM, supporting [12.6, 12.8, 12.9],
+    # 12.8 is the default. This follows PyTorch but can be overridden.
+    "VLLM_MAIN_CUDA_VERSION":
+    lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.8",
+
     # Maximum number of compilation jobs to run in parallel.
     # By default this is the number of CPUs
     "MAX_JOBS":
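
A note for reviewers on the `setup.py`/`envs.py` change above: the baseline CUDA version used for the wheel's local version suffix is now read from the `VLLM_MAIN_CUDA_VERSION` environment variable (default `12.8`) instead of the removed hard-coded `MAIN_CUDA_VERSION` constant, so release builds can shift the baseline per architecture. The sketch below is illustrative only and not part of the patch: the helper `cuda_version_suffix` is a hypothetical stand-in for the suffix branch of `get_vllm_version()`, with a plain string replacing `str(get_nvcc_cuda_version())` and the common `+` local-version separator assumed.

```python
import os

def cuda_version_suffix(nvcc_version: str) -> str:
    """Hypothetical stand-in for the CUDA suffix branch of get_vllm_version().

    `nvcc_version` replaces str(get_nvcc_cuda_version()); the "+" local
    version separator is assumed.
    """
    # After this patch, the baseline comes from the environment
    # (default "12.8") rather than a hard-coded constant.
    main_cuda = os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.8"
    if nvcc_version != main_cuda:
        # e.g. "12.6" -> "cu126": drop the dots, keep three characters.
        return "+cu" + nvcc_version.replace(".", "")[:3]
    return ""  # a wheel built at the baseline CUDA version gets no suffix

# Default baseline (12.8): the x86_64 CUDA 12.9 wheel is tagged "+cu129",
# matching the download paths in annotate-release.sh above.
assert cuda_version_suffix("12.8") == ""
assert cuda_version_suffix("12.6") == "+cu126"
assert cuda_version_suffix("12.9") == "+cu129"

# The arm64 step passes --build-arg VLLM_MAIN_CUDA_VERSION=12.9, which is
# why the aarch64 wheel in the annotation carries no "+cu129" suffix.
os.environ["VLLM_MAIN_CUDA_VERSION"] = "12.9"
assert cuda_version_suffix("12.9") == ""
```

Read this way, the pipeline change is self-consistent: every wheel build now runs unblocked (`depends_on: ~`), and only the `--build-arg VLLM_MAIN_CUDA_VERSION` value decides which wheel is published without a local version suffix.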