diff --git a/.buildkite/test_areas/disaggregated.yaml b/.buildkite/test_areas/disaggregated.yaml index d3e02be23981..67ea2f826583 100644 --- a/.buildkite/test_areas/disaggregated.yaml +++ b/.buildkite/test_areas/disaggregated.yaml @@ -1,3 +1,4 @@ +# Assumes the image-build step already installed requirements/kv_connectors.txt (Docker build arg INSTALL_KV_CONNECTORS=true), so the steps below do not install it again. group: Disaggregated depends_on: - image-build @@ -11,7 +12,6 @@ steps: - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - label: Distributed FlashInfer NixlConnector PD accuracy (4 GPUs) key: distributed-flashinfer-nixlconnector-pd-accuracy-4-gpus @@ -22,7 +22,6 @@ steps: - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - FLASHINFER=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs) @@ -34,7 +33,6 @@ steps: - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) @@ -46,7 +44,6 @@ steps: - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - label: Hybrid SSM NixlConnector PD accuracy tests (4 GPUs) @@ -58,7 +55,6 @@ steps: - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - 
tests/v1/kv_connector/nixl_integration/ commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - HYBRID_SSM=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh - label: MultiConnector (Nixl+Offloading) PD accuracy (2 GPUs) @@ -73,7 +69,6 @@ steps: - vllm/distributed/kv_transfer/kv_connector/v1/offloading/ - tests/v1/kv_connector/nixl_integration/ commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - bash v1/kv_connector/nixl_integration/run_multi_connector_accuracy_test.sh - label: NixlConnector PD + Spec Decode acceptance (2 GPUs) @@ -87,7 +82,6 @@ steps: - vllm/v1/worker/kv_connector_model_runner_mixin.py - tests/v1/kv_connector/nixl_integration/ commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - bash v1/kv_connector/nixl_integration/config_sweep_spec_decode_test.sh - label: MultiConnector (Nixl+Offloading) PD edge cases (2 GPUs) @@ -102,5 +96,4 @@ steps: - vllm/distributed/kv_transfer/kv_connector/v1/offloading/ - tests/v1/kv_connector/nixl_integration/ commands: - - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt - bash v1/kv_connector/nixl_integration/run_multi_connector_edge_case_test.sh \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index 06cdc0b667f9..db758b76422b 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -887,6 +887,31 @@ RUN apt-get update -y \ # We can specify the standard or nightly build of PyTorch ARG PYTORCH_NIGHTLY +# install kv_connectors if requested (same logic as vllm-openai-base) +ARG INSTALL_KV_CONNECTORS=false +ARG CUDA_VERSION +ARG torch_cuda_arch_list='7.5 8.0 8.6 8.9 9.0 10.0 11.0 12.0+PTX' +ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} +RUN --mount=type=cache,target=/opt/uv/cache \ + --mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \ + CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ + CUDA_VERSION_DASH=$(echo 
$CUDA_VERSION | cut -d. -f1,2 | tr '.' '-'); \ + CUDA_HOME=/usr/local/cuda; \ + BUILD_PKGS="libcusparse-dev-${CUDA_VERSION_DASH} \ + libcublas-dev-${CUDA_VERSION_DASH} \ + libcusolver-dev-${CUDA_VERSION_DASH}"; \ + if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \ + uv pip install --system -r /tmp/kv_connectors.txt --no-build || ( \ + apt-get update -y && \ + apt-get install -y --no-install-recommends --allow-change-held-packages ${BUILD_PKGS} && \ + uv pip install --system -r /tmp/kv_connectors.txt --no-build-isolation && \ + apt-get purge -y ${BUILD_PKGS} && \ + rm -rf /var/lib/apt/lists/* \ + ); \ + uv pip uninstall --system nixl-cu12 nixl-cu13 2>/dev/null || true; \ + uv pip install --system --no-deps nixl-cu${CUDA_MAJOR}; \ + fi + # Install development dependencies (for testing) COPY requirements/lint.txt requirements/lint.txt COPY requirements/test/cuda.in requirements/test/cuda.in @@ -936,6 +961,7 @@ COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1 # will not be imported by other tests RUN mkdir src RUN mv vllm src/vllm + #################### TEST IMAGE #################### #################### OPENAI API SERVER #################### @@ -978,8 +1004,10 @@ RUN --mount=type=cache,target=/opt/uv/cache \ # clean up -dev packages, keep runtime libraries rm -rf /var/lib/apt/lists/* \ ); \ - # Force-reinstall the matching CUDA wheel so the correct nixl_ep_cpp.so is installed. 
- uv pip install --system --force-reinstall --no-deps nixl-cu${CUDA_MAJOR}; \ + # Remove every nixl-cu* variant, then install only the one matching this + # image's CUDA major version (nixl>=1.1.0 installs both nixl-cu12 and nixl-cu13) + uv pip uninstall --system nixl-cu12 nixl-cu13 2>/dev/null || true; \ + uv pip install --system --no-deps nixl-cu${CUDA_MAJOR}; \ fi # Optional override: install mooncake-transfer-engine from a URL instead of the diff --git a/docker/docker-bake.hcl b/docker/docker-bake.hcl index 94ca8397561a..1b58373afd0d 100644 --- a/docker/docker-bake.hcl +++ b/docker/docker-bake.hcl @@ -59,13 +59,14 @@ target "_common" { dockerfile = "docker/Dockerfile" context = "." args = { - max_jobs = MAX_JOBS - nvcc_threads = NVCC_THREADS - torch_cuda_arch_list = TORCH_CUDA_ARCH_LIST - VLLM_BUILD_COMMIT = VLLM_BUILD_COMMIT != "unknown" ? VLLM_BUILD_COMMIT : (COMMIT != "" ? COMMIT : "unknown") - VLLM_BUILD_PIPELINE = VLLM_BUILD_PIPELINE - VLLM_BUILD_URL = VLLM_BUILD_URL - VLLM_IMAGE_TAG = VLLM_IMAGE_TAG + max_jobs = MAX_JOBS + nvcc_threads = NVCC_THREADS + torch_cuda_arch_list = TORCH_CUDA_ARCH_LIST + INSTALL_KV_CONNECTORS = "true" + VLLM_BUILD_COMMIT = VLLM_BUILD_COMMIT != "unknown" ? VLLM_BUILD_COMMIT : (COMMIT != "" ? 
COMMIT : "unknown") + VLLM_BUILD_PIPELINE = VLLM_BUILD_PIPELINE + VLLM_BUILD_URL = VLLM_BUILD_URL + VLLM_IMAGE_TAG = VLLM_IMAGE_TAG } } diff --git a/tests/cuda/scripts/check_test_utils_no_cuda_init.py b/tests/cuda/scripts/check_test_utils_no_cuda_init.py new file mode 100644 index 000000000000..59151e02c4b9 --- /dev/null +++ b/tests/cuda/scripts/check_test_utils_no_cuda_init.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Check that importing shared test utils does not initialize CUDA.""" + +import sys +from pathlib import Path + +import torch # noqa: E402 + +assert not torch.cuda.is_initialized(), "CUDA initialized before import" + + +def find_repo_root() -> Path: + for path in Path(__file__).resolve().parents: + if (path / "pyproject.toml").is_file() and ( + path / "tests" / "utils.py" + ).is_file(): + return path + raise RuntimeError("Could not locate vLLM repository root") + + +# Buildkite runs CUDA tests from /vllm-workspace/tests, so the repository root +# is not guaranteed to be importable when this script runs as a subprocess. 
+sys.path.insert(0, str(find_repo_root())) + +from tests.utils import create_new_process_for_each_test # noqa: E402, F401 + +assert not torch.cuda.is_initialized(), "CUDA was initialized during tests.utils import" +print("OK") diff --git a/tests/cuda/test_platform_no_cuda_init.py b/tests/cuda/test_platform_no_cuda_init.py index 697d2cfec8e1..943d19c343bf 100644 --- a/tests/cuda/test_platform_no_cuda_init.py +++ b/tests/cuda/test_platform_no_cuda_init.py @@ -35,6 +35,13 @@ def test_platform_import_does_not_init_cuda(): pytest.fail(f"Platform import initialized CUDA:\n{result.stderr}") +def test_tests_utils_import_does_not_init_cuda(): + """Test that importing tests.utils does not initialize CUDA before fork.""" + result = run_script("check_test_utils_no_cuda_init.py") + if result.returncode != 0: + pytest.fail(f"tests.utils import initialized CUDA:\n{result.stderr}") + + def test_device_count_respects_env_after_platform_import(): """Test that device_count respects CUDA_VISIBLE_DEVICES after import.""" result = run_script("check_device_count_respects_env.py") diff --git a/tests/utils.py b/tests/utils.py index 6a32f3e2e2d4..6ed312ac5e6b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -22,7 +22,7 @@ from contextlib import ExitStack, contextmanager from multiprocessing import Process, get_context from pathlib import Path -from typing import Any, Literal, cast +from typing import TYPE_CHECKING, Any, Literal, cast from unittest.mock import patch import anthropic @@ -47,14 +47,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.cli.serve import ServeSubcommand from vllm.logger import init_logger -from vllm.model_executor.kernels.linear import ( - _KernelT, - init_fp8_linear_kernel, -) from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, ) -from vllm.model_executor.model_loader import get_model_loader from vllm.platforms import current_platform from vllm.tokenizers import get_tokenizer from vllm.utils.argparse_utils 
import FlexibleArgumentParser @@ -66,6 +61,9 @@ logger = init_logger(__name__) +if TYPE_CHECKING: + from vllm.model_executor.kernels.linear import _KernelT + FP8_DTYPE = current_platform.fp8_dtype() @@ -199,6 +197,10 @@ def _pre_download_model(self, model: str, args) -> None: model_config = engine_args.create_model_config() load_config = engine_args.create_load_config() + # Keep model-loader imports lazy: tests.utils is imported during + # pytest collection before some CUDA tests fork child processes. + from vllm.model_executor.model_loader import get_model_loader + model_loader = get_model_loader(load_config) model_loader.download_model(model_config) @@ -2094,7 +2096,7 @@ def __init__( out_dtype: torch.dtype | None = None, transpose_weights: bool = False, device: torch.device | None = None, - force_kernel: type[_KernelT] | None = None, + force_kernel: type["_KernelT"] | None = None, ): super().__init__() act_scale_desc = activation_quant_key.scale @@ -2132,6 +2134,10 @@ def __init__( out_dtype = torch.get_default_dtype() if out_dtype is None else out_dtype + # Keep FP8 kernel imports lazy so importing tests.utils does not + # initialize CUDA before helpers fork child processes. + from vllm.model_executor.kernels.linear import init_fp8_linear_kernel + self.kernel = init_fp8_linear_kernel( activation_quant_key=activation_quant_key, weight_quant_key=weight_quant_key,