diff --git a/.buildkite/test_areas/disaggregated.yaml b/.buildkite/test_areas/disaggregated.yaml
index d3e02be23981..67ea2f826583 100644
--- a/.buildkite/test_areas/disaggregated.yaml
+++ b/.buildkite/test_areas/disaggregated.yaml
@@ -1,3 +1,4 @@
+# We assume uv pip install -r requirements/kv_connectors.txt is run in the image-build step.
 group: Disaggregated
 depends_on:
   - image-build
@@ -11,7 +12,6 @@ steps:
   - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
   - tests/v1/kv_connector/nixl_integration/
   commands:
-  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
   - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 - label: Distributed FlashInfer NixlConnector PD accuracy (4 GPUs)
   key: distributed-flashinfer-nixlconnector-pd-accuracy-4-gpus
@@ -22,7 +22,6 @@ steps:
   - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
   - tests/v1/kv_connector/nixl_integration/
   commands:
-  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
   - FLASHINFER=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 
 - label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
@@ -34,7 +33,6 @@ steps:
   - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
   - tests/v1/kv_connector/nixl_integration/
   commands:
-  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
   - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 
 - label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
@@ -46,7 +44,6 @@ steps:
   - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
   - tests/v1/kv_connector/nixl_integration/
   commands:
-  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
   - CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 
 - label: Hybrid SSM NixlConnector PD accuracy tests (4 GPUs)
@@ -58,7 +55,6 @@ steps:
   - vllm/distributed/kv_transfer/kv_connector/v1/nixl/
   - tests/v1/kv_connector/nixl_integration/
   commands:
-  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
   - HYBRID_SSM=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
 
 - label: MultiConnector (Nixl+Offloading) PD accuracy (2 GPUs)
@@ -73,7 +69,6 @@ steps:
   - vllm/distributed/kv_transfer/kv_connector/v1/offloading/
   - tests/v1/kv_connector/nixl_integration/
   commands:
-  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
   - bash v1/kv_connector/nixl_integration/run_multi_connector_accuracy_test.sh
 
 - label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
@@ -87,7 +82,6 @@ steps:
   - vllm/v1/worker/kv_connector_model_runner_mixin.py
   - tests/v1/kv_connector/nixl_integration/
   commands:
-  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
   - bash v1/kv_connector/nixl_integration/config_sweep_spec_decode_test.sh
 
 - label: MultiConnector (Nixl+Offloading) PD edge cases (2 GPUs)
@@ -102,5 +96,4 @@ steps:
   - vllm/distributed/kv_transfer/kv_connector/v1/offloading/
   - tests/v1/kv_connector/nixl_integration/
   commands:
-  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
   - bash v1/kv_connector/nixl_integration/run_multi_connector_edge_case_test.sh
\ No newline at end of file
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 06cdc0b667f9..782edde3d53f 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -887,6 +887,36 @@ RUN apt-get update -y \
 # We can specify the standard or nightly build of PyTorch
 ARG PYTORCH_NIGHTLY
 
+# install kv_connectors if requested (same steps as vllm-openai-base).
+# `set -e` aborts the build when the kv_connectors install (including the
+# source-build fallback) fails; without it the `;`-chained script would
+# report only the status of the final nixl install. CUDA_HOME is exported
+# so the fallback source build can actually see it.
+ARG INSTALL_KV_CONNECTORS=false
+ARG CUDA_VERSION
+ARG torch_cuda_arch_list='7.5 8.0 8.6 8.9 9.0 10.0 11.0 12.0+PTX'
+ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
+RUN --mount=type=cache,target=/opt/uv/cache \
+    --mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \
+    set -e; \
+    CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
+    CUDA_VERSION_DASH=$(echo "$CUDA_VERSION" | cut -d. -f1,2 | tr '.' '-'); \
+    export CUDA_HOME=/usr/local/cuda; \
+    BUILD_PKGS="libcusparse-dev-${CUDA_VERSION_DASH} \
+        libcublas-dev-${CUDA_VERSION_DASH} \
+        libcusolver-dev-${CUDA_VERSION_DASH}"; \
+    if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
+        uv pip install --system -r /tmp/kv_connectors.txt --no-build || ( \
+            apt-get update -y && \
+            apt-get install -y --no-install-recommends --allow-change-held-packages ${BUILD_PKGS} && \
+            uv pip install --system -r /tmp/kv_connectors.txt --no-build-isolation && \
+            apt-get purge -y ${BUILD_PKGS} && \
+            rm -rf /var/lib/apt/lists/* \
+        ); \
+        uv pip uninstall --system nixl-cu12 nixl-cu13 2>/dev/null || true; \
+        uv pip install --system --no-deps nixl-cu${CUDA_MAJOR}; \
+    fi
+
 # Install development dependencies (for testing)
 COPY requirements/lint.txt requirements/lint.txt
 COPY requirements/test/cuda.in requirements/test/cuda.in
@@ -922,6 +952,10 @@ RUN --mount=type=cache,target=/opt/uv/cache \
 RUN --mount=type=cache,target=/opt/uv/cache \
     uv pip install --system -e tests/vllm_test_utils
 
+# Prevent early CUDA initialization when CUDA-dependent packages (cupy, nixl)
+# are installed — tests that fork subprocesses would otherwise fail.
+ENV CUDA_MODULE_LOADING=LAZY
+
 # enable fast downloads from hf (for testing)
 ENV HF_XET_HIGH_PERFORMANCE 1
 
@@ -936,6 +970,7 @@ COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
 # will not be imported by other tests
 RUN mkdir src
 RUN mv vllm src/vllm
+
 #################### TEST IMAGE ####################
 
 #################### OPENAI API SERVER ####################
@@ -978,8 +1013,10 @@ RUN --mount=type=cache,target=/opt/uv/cache \
             # clean up -dev packages, keep runtime libraries
             rm -rf /var/lib/apt/lists/* \
         ); \
-        # Force-reinstall the matching CUDA wheel so the correct nixl_ep_cpp.so is installed.
-        uv pip install --system --force-reinstall --no-deps nixl-cu${CUDA_MAJOR}; \
+        # Remove all nixl-cu* variants then install only the one matching this
+        # image's CUDA (nixl>=1.1.0 installs both)
+        uv pip uninstall --system nixl-cu12 nixl-cu13 2>/dev/null || true; \
+        uv pip install --system --no-deps nixl-cu${CUDA_MAJOR}; \
     fi
 
 # Optional override: install mooncake-transfer-engine from a URL instead of the
diff --git a/docker/docker-bake.hcl b/docker/docker-bake.hcl
index 94ca8397561a..1b58373afd0d 100644
--- a/docker/docker-bake.hcl
+++ b/docker/docker-bake.hcl
@@ -59,13 +59,14 @@ target "_common" {
   dockerfile = "docker/Dockerfile"
   context    = "."
   args = {
-    max_jobs             = MAX_JOBS
-    nvcc_threads         = NVCC_THREADS
-    torch_cuda_arch_list = TORCH_CUDA_ARCH_LIST
-    VLLM_BUILD_COMMIT    = VLLM_BUILD_COMMIT != "unknown" ? VLLM_BUILD_COMMIT : (COMMIT != "" ? COMMIT : "unknown")
-    VLLM_BUILD_PIPELINE  = VLLM_BUILD_PIPELINE
-    VLLM_BUILD_URL       = VLLM_BUILD_URL
-    VLLM_IMAGE_TAG       = VLLM_IMAGE_TAG
+    max_jobs              = MAX_JOBS
+    nvcc_threads          = NVCC_THREADS
+    torch_cuda_arch_list  = TORCH_CUDA_ARCH_LIST
+    INSTALL_KV_CONNECTORS = "true"
+    VLLM_BUILD_COMMIT     = VLLM_BUILD_COMMIT != "unknown" ? VLLM_BUILD_COMMIT : (COMMIT != "" ? COMMIT : "unknown")
+    VLLM_BUILD_PIPELINE   = VLLM_BUILD_PIPELINE
+    VLLM_BUILD_URL        = VLLM_BUILD_URL
+    VLLM_IMAGE_TAG        = VLLM_IMAGE_TAG
   }
 }
 