diff --git a/.buildkite/pipeline-intel.yaml b/.buildkite/pipeline-intel.yaml
index 2dc53ad9636..3f14fd263bc 100644
--- a/.buildkite/pipeline-intel.yaml
+++ b/.buildkite/pipeline-intel.yaml
@@ -10,7 +10,7 @@ steps:
           DOCKER_BUILDKIT: "1"
           # Buildkite will automatically replace this with the actual commit hash
           VLLM_IMAGE_TAG: "${BUILDKITE_COMMIT}"
-          VLLM_VERSION: "v0.19.0"
+          VLLM_VERSION: "v0.20.0"
         priority: 100
         timeout_in_minutes: 60
         soft_fail: true
diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci
index 80b263085fc..a263c12e2d2 100644
--- a/docker/Dockerfile.ci
+++ b/docker/Dockerfile.ci
@@ -1,5 +1,5 @@
 ARG VLLM_BASE_IMAGE=vllm/vllm-openai
-ARG VLLM_BASE_TAG=v0.20.0-cu130
+ARG VLLM_BASE_TAG=v0.20.0
 FROM ${VLLM_BASE_IMAGE}:${VLLM_BASE_TAG}
 ARG APP_DIR=/workspace/vllm-omni
 WORKDIR ${APP_DIR}
diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda
index 28e10f4fb85..78f64f6a5e0 100644
--- a/docker/Dockerfile.cuda
+++ b/docker/Dockerfile.cuda
@@ -1,4 +1,4 @@
-ARG BASE_IMAGE=vllm/vllm-openai:v0.19.0
+ARG BASE_IMAGE=vllm/vllm-openai:v0.20.0
 FROM ${BASE_IMAGE}
 
 ARG COMMON_WORKDIR=/app
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index a54aa3b7933..ab95077fc7b 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -1,4 +1,4 @@
-ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.19.0
+ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.20.0
 FROM ${BASE_IMAGE} AS base
 
 # Declare a variable to know if we want to use the nightly build or the stable build.
diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index 25d5d0c800e..f015059ed88 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -74,7 +74,7 @@ ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE="copy"
 
-ARG VLLM_VERSION=v0.19.0
+ARG VLLM_VERSION=v0.20.0
 RUN git clone -b ${VLLM_VERSION} https://github.com/vllm-project/vllm
 WORKDIR /workspace/vllm
 
diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md
index 84965129c78..608c259af1c 100644
--- a/docs/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/getting_started/installation/gpu/rocm.inc.md
@@ -13,7 +13,7 @@ vLLM-Omni current recommends the steps in under setup through Docker Images.
 
 vLLM-Omni is built based on vLLM. Please install it with command below.
 ```bash
-uv pip install vllm==0.20.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm700
+uv pip install vllm==0.20.0+rocm721 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm721
 ```
 
 #### Installation of vLLM-Omni
@@ -37,10 +37,10 @@ uv pip install onnxruntime-rocm
 If you do not need to modify source code of vLLM, you can directly install the stable 0.20.0 release version of the library
 
 ```bash
-uv pip install vllm==0.20.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm700
+uv pip install vllm==0.20.0+rocm721 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm721
 ```
 
-The pre-built 0.20.0 vLLM wheel targets ROCm 7.0. If you need a different ROCm stack or want to reuse an existing PyTorch installation, build vLLM from source instead.
+The pre-built 0.20.0 vLLM wheel targets ROCm 7.2.1. If you need a different ROCm stack or want to reuse an existing PyTorch installation, build vLLM from source instead.
 
 #### Installation of vLLM-Omni
 Since vllm-omni is rapidly evolving, it's recommended to install it from source
diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md
index 7ebb16bc484..dfebe8b6154 100644
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -22,7 +22,7 @@ source .venv/bin/activate
 uv pip install vllm==0.20.0 --torch-backend=auto
 
 # On ROCm
-uv pip install vllm==0.20.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm700
+uv pip install vllm==0.20.0+rocm721 --extra-index-url https://wheels.vllm.ai/rocm/0.20.0/rocm721
 
 git clone https://github.com/vllm-project/vllm-omni.git
 cd vllm-omni
@@ -31,6 +31,12 @@ uv pip install -e .
 
 For additional installation methods — please see the [installation guide](installation/README.md).
 
+
+!!! note
+    It is important to install matching major and minor versions of vLLM and vLLM-Omni; otherwise, things may not work as expected. If the versions are misaligned, you will see a warning when you import vLLM-Omni.
+
+    If the `vllm` command does not handle the `--omni` flag correctly, you most likely have a version mismatch between vLLM < `0.20.0` and vLLM-Omni `0.20.0`, because vLLM-Omni no longer hijacks the vLLM entrypoint. Updating vLLM should resolve this issue.
+
 ## Offline Inference
 
 Text-to-image generation quickstart with vLLM-Omni:
diff --git a/pyproject.toml b/pyproject.toml
index 76954221c08..d21f3c4758a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,6 @@ classifiers = [
 # Dependencies are now managed dynamically via setup.py based on detected hardware platform.
 # This allows automatic installation of the correct platform-specific dependencies (CUDA/ROCm/CPU/XPU/NPU)
 # without requiring extras like [cuda]. See requirements/ directory for platform-specific dependencies.
-# Note: vllm is intentionally excluded due to entrypoints overwrite issue.
 
 [project.optional-dependencies]
 
@@ -90,7 +89,6 @@ Documentation = "https://vllm-omni.readthedocs.io"
 "Bug Tracker" = "https://github.com/vllm-project/vllm-omni/issues"
 
 [project.scripts]
-vllm = "vllm_omni.entrypoints.cli.main:main"
 vllm-omni = "vllm_omni.entrypoints.cli.main:main"
 
 
diff --git a/tests/helpers/mark.py b/tests/helpers/mark.py
index 077e3f2d9b0..89025ac05b5 100644
--- a/tests/helpers/mark.py
+++ b/tests/helpers/mark.py
@@ -3,14 +3,7 @@
 """Pytest marks and decorators for hardware / resource selection (CUDA, ROCm, …)."""
 
 import pytest
-
-try:
-    from vllm.utils.torch_utils import cuda_device_count_stateless
-except ImportError:
-    import torch
-
-    def cuda_device_count_stateless() -> int:
-        return torch.cuda.device_count()
+from vllm.platforms import current_platform
 
 # Re-exported from tests.helpers.env (GPU wait + DeviceMemoryMonitor).
 
@@ -27,8 +20,9 @@ def cuda_marks(*, res: str, num_cards: int):
     if num_cards == 1:
         return marks
     test_distributed = pytest.mark.distributed_cuda(num_cards=num_cards)
+
     test_skipif = pytest.mark.skipif_cuda(
-        cuda_device_count_stateless() < num_cards,
+        not current_platform.is_cuda() or (current_platform.device_count() < num_cards),
         reason=f"Need at least {num_cards} CUDA GPUs to run the test.",
     )
     return marks + [test_distributed, test_skipif]
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 304aa558f5f..09e4b6b6688 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -65,7 +65,7 @@
 from vllm.entrypoints.openai.utils import validate_json_request
 from vllm.entrypoints.pooling.classify.serving import ServingClassification
 from vllm.entrypoints.pooling.embed.serving import ServingEmbedding as OpenAIServingEmbedding
-from vllm.entrypoints.pooling.pooling.serving import ServingPooling as OpenAIServingPooling
+from vllm.entrypoints.pooling.pooling.serving import ServingPooling
 from vllm.entrypoints.pooling.scoring.serving import ServingScores
 from vllm.entrypoints.serve.disagg.serving import ServingTokens
 
@@ -800,10 +800,9 @@ async def omni_init_app_state(
         else None
     )
     state.openai_serving_pooling = (
-        OpenAIServingPooling(
+        ServingPooling(
             engine_client,
             state.openai_serving_models,
-            state.openai_serving_render,
             supported_tasks=tuple(supported_tasks),
             request_logger=request_logger,
             chat_template=resolved_chat_template,
diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py
index 28f49918f2c..08b20ec53d3 100644
--- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py
+++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py
@@ -489,7 +489,7 @@ class Qwen3OmniMoeModel(Qwen3MoeLLMForCausalLM):
     """
     Qwen3 Omni MoE Talker language model.
 
-    Extends Qwen3MoeLLMForCausalLM (which already uses SharedFusedMoE with
+    Extends Qwen3MoeLLMForCausalLM (which already uses FusedMoE with
     shared-expert support) and replaces the text embedding / LM head with a
     codec embedding so the talker operates over audio-codec tokens instead
     of text tokens.
diff --git a/vllm_omni/platforms/npu/models/hunyuan_fused_moe.py b/vllm_omni/platforms/npu/models/hunyuan_fused_moe.py
index 05079a7e4ae..cb3c68383e1 100644
--- a/vllm_omni/platforms/npu/models/hunyuan_fused_moe.py
+++ b/vllm_omni/platforms/npu/models/hunyuan_fused_moe.py
@@ -103,6 +103,9 @@ def prepare_hunyuan_fused_moe_runtime() -> None:
     _ensure_forward_context_attr("flash_comm_v1_enabled", bool, False)
 
 
+# NOTE: As of v0.20.0, vLLM has folded SharedFusedMoE -> FusedMoE and removed the class.
+# Since AscendSharedFusedMoE is a subclass of SharedFusedMoE, we should be careful
+# to ensure that this is updated correctly.
 class AscendHunyuanFusedMoE(AscendSharedFusedMoE):
     def __init__(self, *, prefix: str = "", **kwargs: Any) -> None:
         super().__init__(prefix=prefix, **kwargs)