diff --git a/.buildkite/pipeline-intel.yaml b/.buildkite/pipeline-intel.yaml index 2dc53ad9636..3f14fd263bc 100644 --- a/.buildkite/pipeline-intel.yaml +++ b/.buildkite/pipeline-intel.yaml @@ -10,7 +10,7 @@ steps: DOCKER_BUILDKIT: "1" # Buildkite will automatically replace this with the actual commit hash VLLM_IMAGE_TAG: "${BUILDKITE_COMMIT}" - VLLM_VERSION: "v0.19.0" + VLLM_VERSION: "v0.20.0" priority: 100 timeout_in_minutes: 60 soft_fail: true diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index 80b263085fc..a263c12e2d2 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -1,5 +1,5 @@ ARG VLLM_BASE_IMAGE=vllm/vllm-openai -ARG VLLM_BASE_TAG=v0.20.0-cu130 +ARG VLLM_BASE_TAG=v0.20.0 FROM ${VLLM_BASE_IMAGE}:${VLLM_BASE_TAG} ARG APP_DIR=/workspace/vllm-omni WORKDIR ${APP_DIR} diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda index 28e10f4fb85..78f64f6a5e0 100644 --- a/docker/Dockerfile.cuda +++ b/docker/Dockerfile.cuda @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=vllm/vllm-openai:v0.19.0 +ARG BASE_IMAGE=vllm/vllm-openai:v0.20.0 FROM ${BASE_IMAGE} ARG COMMON_WORKDIR=/app diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index a54aa3b7933..ab95077fc7b 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.19.0 +ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.20.0 FROM ${BASE_IMAGE} AS base # Declare a variable to know if we want to use the nightly build or the stable build. 
diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 25d5d0c800e..f015059ed88 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -74,7 +74,7 @@ ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} ENV UV_INDEX_STRATEGY="unsafe-best-match" ENV UV_LINK_MODE="copy" -ARG VLLM_VERSION=v0.19.0 +ARG VLLM_VERSION=v0.20.0 RUN git clone -b ${VLLM_VERSION} https://github.com/vllm-project/vllm WORKDIR /workspace/vllm diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index 84965129c78..608c259af1c 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -13,7 +13,7 @@ vLLM-Omni current recommends the steps in under setup through Docker Images. vLLM-Omni is built based on vLLM. Please install it with command below. ```bash -uv pip install vllm==0.20.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm700 +uv pip install vllm==0.20.0+rocm721 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm721 ``` #### Installation of vLLM-Omni @@ -37,10 +37,10 @@ uv pip install onnxruntime-rocm If you do not need to modify source code of vLLM, you can directly install the stable 0.20.0 release version of the library ```bash -uv pip install vllm==0.20.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm700 +uv pip install vllm==0.20.0+rocm721 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm721 ``` -The pre-built 0.20.0 vLLM wheel targets ROCm 7.0. If you need a different ROCm stack or want to reuse an existing PyTorch installation, build vLLM from source instead. +The pre-built 0.20.0 vLLM wheel targets ROCm 7.2.1. If you need a different ROCm stack or want to reuse an existing PyTorch installation, build vLLM from source instead. 
#### Installation of vLLM-Omni Since vllm-omni is rapidly evolving, it's recommended to install it from source diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 7ebb16bc484..dfebe8b6154 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -22,7 +22,7 @@ source .venv/bin/activate uv pip install vllm==0.20.0 --torch-backend=auto # On ROCm -uv pip install vllm==0.20.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm700 +uv pip install vllm==0.20.0+rocm721 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm721 git clone https://github.com/vllm-project/vllm-omni.git cd vllm-omni @@ -31,6 +31,12 @@ uv pip install -e . For additional installation methods — please see the [installation guide](installation/README.md). + +!!! note + It is important to install the same major & minor version of vLLM and vLLM Omni, otherwise things may not work as expected. If the versions are misaligned, you will see a warning when you import vLLM Omni. + + If you are seeing strange behavior with the `vllm` command not handling the `--omni` flag correctly, you most likely have a version mismatch with vLLM < `0.20.0` and vLLM Omni `0.20.0`, as vLLM Omni no longer hijacks the vLLM entrypoint. Updating vLLM should resolve this issue. + ## Offline Inference Text-to-image generation quickstart with vLLM-Omni: diff --git a/pyproject.toml b/pyproject.toml index 76954221c08..d21f3c4758a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,6 @@ classifiers = [ # Dependencies are now managed dynamically via setup.py based on detected hardware platform. # This allows automatic installation of the correct platform-specific dependencies (CUDA/ROCm/CPU/XPU/NPU) # without requiring extras like [cuda]. See requirements/ directory for platform-specific dependencies. -# Note: vllm is intentionally excluded due to entrypoints overwrite issue. 
[project.optional-dependencies] @@ -90,7 +89,6 @@ Documentation = "https://vllm-omni.readthedocs.io" "Bug Tracker" = "https://github.com/vllm-project/vllm-omni/issues" [project.scripts] -vllm = "vllm_omni.entrypoints.cli.main:main" vllm-omni = "vllm_omni.entrypoints.cli.main:main" diff --git a/tests/helpers/mark.py b/tests/helpers/mark.py index 077e3f2d9b0..89025ac05b5 100644 --- a/tests/helpers/mark.py +++ b/tests/helpers/mark.py @@ -3,14 +3,7 @@ """Pytest marks and decorators for hardware / resource selection (CUDA, ROCm, …).""" import pytest - -try: - from vllm.utils.torch_utils import cuda_device_count_stateless -except ImportError: - import torch - - def cuda_device_count_stateless() -> int: - return torch.cuda.device_count() +from vllm.platforms import current_platform # Re-exported from tests.helpers.env (GPU wait + DeviceMemoryMonitor). @@ -27,8 +20,9 @@ def cuda_marks(*, res: str, num_cards: int): if num_cards == 1: return marks test_distributed = pytest.mark.distributed_cuda(num_cards=num_cards) + test_skipif = pytest.mark.skipif_cuda( - cuda_device_count_stateless() < num_cards, + not current_platform.is_cuda() or (current_platform.device_count() < num_cards), reason=f"Need at least {num_cards} CUDA GPUs to run the test.", ) return marks + [test_distributed, test_skipif] diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 304aa558f5f..09e4b6b6688 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -65,7 +65,7 @@ from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.pooling.classify.serving import ServingClassification from vllm.entrypoints.pooling.embed.serving import ServingEmbedding as OpenAIServingEmbedding -from vllm.entrypoints.pooling.pooling.serving import ServingPooling as OpenAIServingPooling +from vllm.entrypoints.pooling.pooling.serving import ServingPooling from vllm.entrypoints.pooling.scoring.serving 
import ServingScores from vllm.entrypoints.serve.disagg.serving import ServingTokens @@ -800,10 +800,9 @@ async def omni_init_app_state( else None ) state.openai_serving_pooling = ( - OpenAIServingPooling( + ServingPooling( engine_client, state.openai_serving_models, - state.openai_serving_render, supported_tasks=tuple(supported_tasks), request_logger=request_logger, chat_template=resolved_chat_template, diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py index 28f49918f2c..08b20ec53d3 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py @@ -489,7 +489,7 @@ class Qwen3OmniMoeModel(Qwen3MoeLLMForCausalLM): """ Qwen3 Omni MoE Talker language model. - Extends Qwen3MoeLLMForCausalLM (which already uses SharedFusedMoE with + Extends Qwen3MoeLLMForCausalLM (which already uses FusedMoE with shared-expert support) and replaces the text embedding / LM head with a codec embedding so the talker operates over audio-codec tokens instead of text tokens. diff --git a/vllm_omni/platforms/npu/models/hunyuan_fused_moe.py b/vllm_omni/platforms/npu/models/hunyuan_fused_moe.py index 05079a7e4ae..cb3c68383e1 100644 --- a/vllm_omni/platforms/npu/models/hunyuan_fused_moe.py +++ b/vllm_omni/platforms/npu/models/hunyuan_fused_moe.py @@ -103,6 +103,9 @@ def prepare_hunyuan_fused_moe_runtime() -> None: _ensure_forward_context_attr("flash_comm_v1_enabled", bool, False) +# NOTE: As of v0.20.0, vLLM has folded SharedFusedMoE -> FusedMoE and removed the class. +# Since AscendSharedFusedMoE is a subclass of SharedFusedMoE, we should be careful +# to ensure that this is updated correctly. class AscendHunyuanFusedMoE(AscendSharedFusedMoE): def __init__(self, *, prefix: str = "", **kwargs: Any) -> None: super().__init__(prefix=prefix, **kwargs)