Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 1 addition & 8 deletions .buildkite/test_areas/disaggregated.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Assumes the image-build step has already run `uv pip install -r requirements/kv_connectors.txt`.
group: Disaggregated
depends_on:
- image-build
Expand All @@ -11,7 +12,6 @@ steps:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl/
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
- label: Distributed FlashInfer NixlConnector PD accuracy (4 GPUs)
key: distributed-flashinfer-nixlconnector-pd-accuracy-4-gpus
Expand All @@ -22,7 +22,6 @@ steps:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl/
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- FLASHINFER=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs)
Expand All @@ -34,7 +33,6 @@ steps:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl/
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- DP_EP=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs)
Expand All @@ -46,7 +44,6 @@ steps:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl/
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- CROSS_LAYERS_BLOCKS=True bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

- label: Hybrid SSM NixlConnector PD accuracy tests (4 GPUs)
Expand All @@ -58,7 +55,6 @@ steps:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl/
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- HYBRID_SSM=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh

- label: MultiConnector (Nixl+Offloading) PD accuracy (2 GPUs)
Expand All @@ -73,7 +69,6 @@ steps:
- vllm/distributed/kv_transfer/kv_connector/v1/offloading/
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/run_multi_connector_accuracy_test.sh

- label: NixlConnector PD + Spec Decode acceptance (2 GPUs)
Expand All @@ -87,7 +82,6 @@ steps:
- vllm/v1/worker/kv_connector_model_runner_mixin.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/config_sweep_spec_decode_test.sh

- label: MultiConnector (Nixl+Offloading) PD edge cases (2 GPUs)
Expand All @@ -102,5 +96,4 @@ steps:
- vllm/distributed/kv_transfer/kv_connector/v1/offloading/
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/run_multi_connector_edge_case_test.sh
32 changes: 30 additions & 2 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -887,6 +887,31 @@ RUN apt-get update -y \
# We can specify the standard or nightly build of PyTorch
ARG PYTORCH_NIGHTLY

# install kv_connectors if requested (same logic as vllm-openai-base)
ARG INSTALL_KV_CONNECTORS=false
ARG CUDA_VERSION
ARG torch_cuda_arch_list='7.5 8.0 8.6 8.9 9.0 10.0 11.0 12.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
RUN --mount=type=cache,target=/opt/uv/cache \
--mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-'); \
CUDA_HOME=/usr/local/cuda; \
BUILD_PKGS="libcusparse-dev-${CUDA_VERSION_DASH} \
libcublas-dev-${CUDA_VERSION_DASH} \
libcusolver-dev-${CUDA_VERSION_DASH}"; \
if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
uv pip install --system -r /tmp/kv_connectors.txt --no-build || ( \
apt-get update -y && \
apt-get install -y --no-install-recommends --allow-change-held-packages ${BUILD_PKGS} && \
uv pip install --system -r /tmp/kv_connectors.txt --no-build-isolation && \
apt-get purge -y ${BUILD_PKGS} && \
rm -rf /var/lib/apt/lists/* \
); \
uv pip uninstall --system nixl-cu12 nixl-cu13 2>/dev/null || true; \
uv pip install --system --no-deps nixl-cu${CUDA_MAJOR}; \
fi

# Install development dependencies (for testing)
COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test/cuda.in requirements/test/cuda.in
Expand Down Expand Up @@ -936,6 +961,7 @@ COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
# will not be imported by other tests
RUN mkdir src
RUN mv vllm src/vllm

#################### TEST IMAGE ####################

#################### OPENAI API SERVER ####################
Expand Down Expand Up @@ -978,8 +1004,10 @@ RUN --mount=type=cache,target=/opt/uv/cache \
# clean up -dev packages, keep runtime libraries
rm -rf /var/lib/apt/lists/* \
); \
# Force-reinstall the matching CUDA wheel so the correct nixl_ep_cpp.so is installed.
uv pip install --system --force-reinstall --no-deps nixl-cu${CUDA_MAJOR}; \
# Remove all nixl-cu* variants then install only the one matching this
# image's CUDA (nixl>=1.1.0 installs both)
uv pip uninstall --system nixl-cu12 nixl-cu13 2>/dev/null || true; \
uv pip install --system --no-deps nixl-cu${CUDA_MAJOR}; \
fi

# Optional override: install mooncake-transfer-engine from a URL instead of the
Expand Down
15 changes: 8 additions & 7 deletions docker/docker-bake.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,14 @@ target "_common" {
dockerfile = "docker/Dockerfile"
context = "."
args = {
max_jobs = MAX_JOBS
nvcc_threads = NVCC_THREADS
torch_cuda_arch_list = TORCH_CUDA_ARCH_LIST
VLLM_BUILD_COMMIT = VLLM_BUILD_COMMIT != "unknown" ? VLLM_BUILD_COMMIT : (COMMIT != "" ? COMMIT : "unknown")
VLLM_BUILD_PIPELINE = VLLM_BUILD_PIPELINE
VLLM_BUILD_URL = VLLM_BUILD_URL
VLLM_IMAGE_TAG = VLLM_IMAGE_TAG
max_jobs = MAX_JOBS
nvcc_threads = NVCC_THREADS
torch_cuda_arch_list = TORCH_CUDA_ARCH_LIST
INSTALL_KV_CONNECTORS = "true"
VLLM_BUILD_COMMIT = VLLM_BUILD_COMMIT != "unknown" ? VLLM_BUILD_COMMIT : (COMMIT != "" ? COMMIT : "unknown")
VLLM_BUILD_PIPELINE = VLLM_BUILD_PIPELINE
VLLM_BUILD_URL = VLLM_BUILD_URL
VLLM_IMAGE_TAG = VLLM_IMAGE_TAG
}
}

Expand Down
30 changes: 30 additions & 0 deletions tests/cuda/scripts/check_test_utils_no_cuda_init.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Check that importing shared test utils does not initialize CUDA."""

import sys
from pathlib import Path

import torch # noqa: E402

# Baseline check: CUDA must still be uninitialized after importing torch alone,
# so that a failure of the later assertion can be attributed to the
# tests.utils import rather than to anything that ran before it.
assert not torch.cuda.is_initialized(), "CUDA initialized before import"


def find_repo_root() -> Path:
    """Return the closest ancestor directory that looks like the repo root.

    The ancestors of this file's resolved path are examined from nearest to
    farthest. A directory qualifies when it contains both ``pyproject.toml``
    and ``tests/utils.py`` as regular files.

    Raises:
        RuntimeError: If no ancestor directory contains both marker files.
    """
    # Checked in this order, stopping at the first missing marker.
    markers = (Path("pyproject.toml"), Path("tests") / "utils.py")
    ancestors = Path(__file__).resolve().parents
    root = next(
        (
            candidate
            for candidate in ancestors
            if all((candidate / marker).is_file() for marker in markers)
        ),
        None,
    )
    if root is None:
        raise RuntimeError("Could not locate vLLM repository root")
    return root


# Buildkite runs CUDA tests from /vllm-workspace/tests, so the repository root
# is not guaranteed to be importable when this script runs as a subprocess.
# Prepending the discovered root to sys.path makes the `tests` package
# resolvable for the import below.
sys.path.insert(0, str(find_repo_root()))

# The imported name is unused (hence F401); the import is performed only to
# execute the module-level code of tests.utils, which is what is being checked.
from tests.utils import create_new_process_for_each_test # noqa: E402, F401

# If importing tests.utils initialized CUDA, fail here; the AssertionError
# terminates the script before the success marker below is printed.
assert not torch.cuda.is_initialized(), "CUDA was initialized during tests.utils import"
print("OK")
7 changes: 7 additions & 0 deletions tests/cuda/test_platform_no_cuda_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ def test_platform_import_does_not_init_cuda():
pytest.fail(f"Platform import initialized CUDA:\n{result.stderr}")


def test_tests_utils_import_does_not_init_cuda():
    """Importing tests.utils must leave CUDA uninitialized (checked via helper script).

    Runs ``check_test_utils_no_cuda_init.py`` through ``run_script`` and fails
    with the script's stderr when it exits with a non-zero return code.
    """
    outcome = run_script("check_test_utils_no_cuda_init.py")
    if outcome.returncode == 0:
        return
    pytest.fail(f"tests.utils import initialized CUDA:\n{outcome.stderr}")


def test_device_count_respects_env_after_platform_import():
"""Test that device_count respects CUDA_VISIBLE_DEVICES after import."""
result = run_script("check_device_count_respects_env.py")
Expand Down
20 changes: 13 additions & 7 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from contextlib import ExitStack, contextmanager
from multiprocessing import Process, get_context
from pathlib import Path
from typing import Any, Literal, cast
from typing import TYPE_CHECKING, Any, Literal, cast
from unittest.mock import patch

import anthropic
Expand All @@ -47,14 +47,9 @@
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.cli.serve import ServeSubcommand
from vllm.logger import init_logger
from vllm.model_executor.kernels.linear import (
_KernelT,
init_fp8_linear_kernel,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
QuantKey,
)
from vllm.model_executor.model_loader import get_model_loader
from vllm.platforms import current_platform
from vllm.tokenizers import get_tokenizer
from vllm.utils.argparse_utils import FlexibleArgumentParser
Expand All @@ -66,6 +61,9 @@

logger = init_logger(__name__)

if TYPE_CHECKING:
from vllm.model_executor.kernels.linear import _KernelT

FP8_DTYPE = current_platform.fp8_dtype()


Expand Down Expand Up @@ -199,6 +197,10 @@ def _pre_download_model(self, model: str, args) -> None:
model_config = engine_args.create_model_config()
load_config = engine_args.create_load_config()

# Keep model-loader imports lazy: tests.utils is imported during
# pytest collection before some CUDA tests fork child processes.
from vllm.model_executor.model_loader import get_model_loader

model_loader = get_model_loader(load_config)
model_loader.download_model(model_config)

Expand Down Expand Up @@ -2094,7 +2096,7 @@ def __init__(
out_dtype: torch.dtype | None = None,
transpose_weights: bool = False,
device: torch.device | None = None,
force_kernel: type[_KernelT] | None = None,
force_kernel: type["_KernelT"] | None = None,
):
super().__init__()
act_scale_desc = activation_quant_key.scale
Expand Down Expand Up @@ -2132,6 +2134,10 @@ def __init__(

out_dtype = torch.get_default_dtype() if out_dtype is None else out_dtype

# Keep FP8 kernel imports lazy so importing tests.utils does not
# initialize CUDA before helpers fork child processes.
from vllm.model_executor.kernels.linear import init_fp8_linear_kernel

self.kernel = init_fp8_linear_kernel(
activation_quant_key=activation_quant_key,
weight_quant_key=weight_quant_key,
Expand Down
Loading