vllm-project · Gaohan123 · Apr 30, 2026 · Apr 23, 2026 · Apr 28, 2026 · Apr 28, 2026
@@ -10,7 +10,7 @@ steps:
           DOCKER_BUILDKIT: "1"
           # Buildkite will automatically replace this with the actual commit hash
           VLLM_IMAGE_TAG: "${BUILDKITE_COMMIT}"
-          VLLM_VERSION: "v0.19.0"
+          VLLM_VERSION: "v0.20.0"
         priority: 100
         timeout_in_minutes: 60
         soft_fail: true
@@ -1,5 +1,5 @@
 ARG VLLM_BASE_IMAGE=vllm/vllm-openai
-ARG VLLM_BASE_TAG=v0.20.0-cu130
+ARG VLLM_BASE_TAG=v0.20.0
 FROM ${VLLM_BASE_IMAGE}:${VLLM_BASE_TAG}
 ARG APP_DIR=/workspace/vllm-omni
 WORKDIR ${APP_DIR}

@@ -1,4 +1,4 @@
-ARG BASE_IMAGE=vllm/vllm-openai:v0.19.0
+ARG BASE_IMAGE=vllm/vllm-openai:v0.20.0
 FROM ${BASE_IMAGE}
 
 ARG COMMON_WORKDIR=/app

@@ -1,4 +1,4 @@
-ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.19.0
+ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.20.0
 FROM ${BASE_IMAGE} AS base
 
 # Declare a variable to know if we want to use the nightly build or the stable build.

@@ -74,7 +74,7 @@ ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE="copy"
 
-ARG VLLM_VERSION=v0.19.0
+ARG VLLM_VERSION=v0.20.0
 RUN git clone -b ${VLLM_VERSION} https://github.com/vllm-project/vllm
 WORKDIR /workspace/vllm
 

@@ -13,7 +13,7 @@ vLLM-Omni current recommends the steps in under setup through Docker Images.
 
 vLLM-Omni is built based on vLLM. Please install it with command below.
 ```bash
-uv pip install vllm==0.20.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm700
+uv pip install vllm==0.20.0+rocm721 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm721
 ```
 
 #### Installation of vLLM-Omni
@@ -37,10 +37,10 @@ uv pip install onnxruntime-rocm
 If you do not need to modify source code of vLLM, you can directly install the stable 0.20.0 release version of the library
 
 ```bash
-uv pip install vllm==0.20.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm700
+uv pip install vllm==0.20.0+rocm721 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm721
 ```
 
-The pre-built 0.20.0 vLLM wheel targets ROCm 7.0. If you need a different ROCm stack or want to reuse an existing PyTorch installation, build vLLM from source instead.
+The pre-built 0.20.0 vLLM wheel targets ROCm 7.2.1. If you need a different ROCm stack or want to reuse an existing PyTorch installation, build vLLM from source instead.
 
 #### Installation of vLLM-Omni
 Since vllm-omni is rapidly evolving, it's recommended to install it from source

@@ -22,7 +22,7 @@ source .venv/bin/activate
 uv pip install vllm==0.20.0 --torch-backend=auto
 
 # On ROCm
-uv pip install vllm==0.20.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm700
+uv pip install vllm==0.20.0+rocm721 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm721
 
 git clone https://github.com/vllm-project/vllm-omni.git
 cd vllm-omni
@@ -31,6 +31,12 @@ uv pip install -e .
 
 For additional installation methods — please see the [installation guide](installation/README.md).
 
+
+!!! note
+    It is important to install the same major and minor version of vLLM and vLLM-Omni; otherwise, things may not work as expected. If the versions are misaligned, you will see a warning when you import vLLM-Omni.
+
+    If you are seeing strange behavior with the `vllm` command not handling the `--omni` flag correctly, you most likely have a version mismatch between vLLM < `0.20.0` and vLLM-Omni `0.20.0`, as vLLM-Omni no longer hijacks the vLLM entrypoint. Updating vLLM should resolve this issue.
+
 ## Offline Inference
 
 Text-to-image generation quickstart with vLLM-Omni:

@@ -32,7 +32,6 @@ classifiers = [
 # Dependencies are now managed dynamically via setup.py based on detected hardware platform.
 # This allows automatic installation of the correct platform-specific dependencies (CUDA/ROCm/CPU/XPU/NPU)
 # without requiring extras like [cuda]. See requirements/ directory for platform-specific dependencies.
-# Note: vllm is intentionally excluded due to entrypoints overwrite issue.
 
 [project.optional-dependencies]
 
@@ -90,7 +89,6 @@ Documentation = "https://vllm-omni.readthedocs.io"
 "Bug Tracker" = "https://github.com/vllm-project/vllm-omni/issues"
 
 [project.scripts]
-vllm = "vllm_omni.entrypoints.cli.main:main"
 vllm-omni = "vllm_omni.entrypoints.cli.main:main"
 
 

@@ -3,14 +3,7 @@
 """Pytest marks and decorators for hardware / resource selection (CUDA, ROCm, …)."""
 
 import pytest
-
-try:
-    from vllm.utils.torch_utils import cuda_device_count_stateless
-except ImportError:
-    import torch
-
-    def cuda_device_count_stateless() -> int:
-        return torch.cuda.device_count()
+from vllm.platforms import current_platform
 
 # Re-exported from tests.helpers.env (GPU wait + DeviceMemoryMonitor).
 
@@ -27,8 +20,9 @@ def cuda_marks(*, res: str, num_cards: int):
     if num_cards == 1:
         return marks
     test_distributed = pytest.mark.distributed_cuda(num_cards=num_cards)
+
     test_skipif = pytest.mark.skipif_cuda(
-        cuda_device_count_stateless() < num_cards,
+        not current_platform.is_cuda() or (current_platform.device_count() < num_cards),
         reason=f"Need at least {num_cards} CUDA GPUs to run the test.",
     )
     return marks + [test_distributed, test_skipif]

@@ -65,7 +65,7 @@
 from vllm.entrypoints.openai.utils import validate_json_request
 from vllm.entrypoints.pooling.classify.serving import ServingClassification
 from vllm.entrypoints.pooling.embed.serving import ServingEmbedding as OpenAIServingEmbedding
-from vllm.entrypoints.pooling.pooling.serving import ServingPooling as OpenAIServingPooling
+from vllm.entrypoints.pooling.pooling.serving import ServingPooling
 from vllm.entrypoints.pooling.scoring.serving import ServingScores
 from vllm.entrypoints.serve.disagg.serving import ServingTokens
 
@@ -800,10 +800,9 @@ async def omni_init_app_state(
         else None
     )
     state.openai_serving_pooling = (
-        OpenAIServingPooling(
+        ServingPooling(
             engine_client,
             state.openai_serving_models,
-            state.openai_serving_render,
             supported_tasks=tuple(supported_tasks),
             request_logger=request_logger,
             chat_template=resolved_chat_template,

@@ -489,7 +489,7 @@ class Qwen3OmniMoeModel(Qwen3MoeLLMForCausalLM):
     """
     Qwen3 Omni MoE Talker language model.
 
-    Extends Qwen3MoeLLMForCausalLM (which already uses SharedFusedMoE with
+    Extends Qwen3MoeLLMForCausalLM (which already uses FusedMoE with
     shared-expert support) and replaces the text embedding / LM head with a
     codec embedding so the talker operates over audio-codec tokens instead
     of text tokens.

@@ -103,6 +103,9 @@ def prepare_hunyuan_fused_moe_runtime() -> None:
     _ensure_forward_context_attr("flash_comm_v1_enabled", bool, False)
 
 
+# NOTE: As of v0.20.0, vLLM has folded SharedFusedMoE -> FusedMoE and removed the class.
+# Since AscendSharedFusedMoE is a subclass of SharedFusedMoE, we should be careful
+# to ensure that this is updated correctly.
 class AscendHunyuanFusedMoE(AscendSharedFusedMoE):
     def __init__(self, *, prefix: str = "", **kwargs: Any) -> None:
         super().__init__(prefix=prefix, **kwargs)