From c1f80016438d36602da3631a26a900232fb0f1f8 Mon Sep 17 00:00:00 2001
From: tzhouam <tzhouam@connect.ust.hk>
Date: Fri, 20 Mar 2026 06:27:11 +0000
Subject: [PATCH 1/2] [Update] Bump vLLM version to 0.18.0 across Dockerfiles
 and documentation

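- Updated VLLM_VERSION in pipeline-intel.yaml and the vLLM base image tags / version args in the CI, ROCm, and XPU Dockerfiles to v0.18.0; also bumped the pinned VLLM_VERSION_OR_COMMIT_HASH in Dockerfile.rocm.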
- Modified installation instructions in quickstart.md, gpu.md, cuda.inc.md, and rocm.inc.md to reflect the new version.
- Adjusted pre-built wheel availability note in gpu.md to include version 0.18.0.
- Updated Docker image tags in documentation to use v0.18.0.

Signed-off-by: tzhouam <tzhouam@connect.ust.hk>
---
 .buildkite/pipeline-intel.yaml                    |  2 +-
 docker/Dockerfile.ci                              |  2 +-
 docker/Dockerfile.rocm                            |  4 ++--
 docker/Dockerfile.xpu                             |  2 +-
 docs/getting_started/installation/gpu.md          |  2 +-
 docs/getting_started/installation/gpu/cuda.inc.md | 12 ++++++------
 docs/getting_started/installation/gpu/rocm.inc.md | 14 +++++++-------
 docs/getting_started/quickstart.md                |  4 ++--
 8 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/.buildkite/pipeline-intel.yaml b/.buildkite/pipeline-intel.yaml
index bd9c9daabec..4334dd516b4 100644
--- a/.buildkite/pipeline-intel.yaml
+++ b/.buildkite/pipeline-intel.yaml
@@ -10,7 +10,7 @@ steps:
           DOCKER_BUILDKIT: "1"
           # Buildkite will automatically replace this with the actual commit hash
           VLLM_IMAGE_TAG: "${BUILDKITE_COMMIT}"
-          VLLM_VERSION: "v0.17.0"
+          VLLM_VERSION: "v0.18.0"
         priority: 100
         timeout_in_minutes: 60
         soft_fail: true
diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci
index cb80828eb95..f4253fe7255 100644
--- a/docker/Dockerfile.ci
+++ b/docker/Dockerfile.ci
@@ -1,5 +1,5 @@
 ARG VLLM_BASE_IMAGE=vllm/vllm-openai
-ARG VLLM_BASE_TAG=v0.17.0
+ARG VLLM_BASE_TAG=v0.18.0
 FROM ${VLLM_BASE_IMAGE}:${VLLM_BASE_TAG}
 ARG APP_DIR=/workspace/vllm-omni
 WORKDIR ${APP_DIR}
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 3e0b6065059..ce541497a34 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -1,4 +1,4 @@
-ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.17.0
+ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.18.0
 FROM ${BASE_IMAGE} AS base
 
 # Declare a variable to know if we want to use the nightly build or the stable build.
@@ -10,7 +10,7 @@ FROM ${BASE_IMAGE} AS base
 #    we should swap over to use stable release ASAP.
 #    We should avoid relying on custom commits.
 ARG USE_NIGHTLY_BUILD=0
-ARG VLLM_VERSION_OR_COMMIT_HASH=2d5be1dd5ce2e44dfea53ea03ff61143da5137eb
+ARG VLLM_VERSION_OR_COMMIT_HASH=89138b21cc246ae944c741d5c399c148e2b770ab
 ARG ARG_PYTORCH_ROCM_ARCH
 ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
 
diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index 02f57265fe0..8901725f06c 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -76,7 +76,7 @@ ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE="copy"
 
-ARG VLLM_VERSION=v0.17.0
+ARG VLLM_VERSION=v0.18.0
 RUN git clone -b ${VLLM_VERSION} https://github.com/vllm-project/vllm
 WORKDIR /workspace/vllm
 
diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md
index 508ea307da8..297c3666169 100644
--- a/docs/getting_started/installation/gpu.md
+++ b/docs/getting_started/installation/gpu.md
@@ -30,7 +30,7 @@ vLLM-Omni is a Python library that supports the following GPU variants. The libr
 
 ### Pre-built wheels
 
-Note: Pre-built wheels are currently only available for vLLM-Omni 0.11.0rc1, 0.12.0rc1, 0.14.0rc1, 0.14.0, 0.16.0. For the latest version, please [build from source](https://docs.vllm.ai/projects/vllm-omni/en/latest/getting_started/installation/gpu/#build-wheel-from-source).
+Note: Pre-built wheels are currently available for vLLM-Omni 0.11.0rc1, 0.12.0rc1, 0.14.0rc1, 0.14.0, 0.16.0, and 0.18.0. If you need a newer unreleased revision, please [build from source](https://docs.vllm.ai/projects/vllm-omni/en/latest/getting_started/installation/gpu/#build-wheel-from-source).
 
 === "NVIDIA CUDA"
 
diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md
index d8585ad685e..45fa69ddbb5 100644
--- a/docs/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/getting_started/installation/gpu/cuda.inc.md
@@ -34,13 +34,13 @@ uv pip install vllm-omni
 # --8<-- [start:build-wheel-from-source]
 
 #### Installation of vLLM
-If you do not need to modify source code of vLLM, you can directly install the stable 0.17.0 release version of the library
+If you do not need to modify the source code of vLLM, you can directly install the stable 0.18.0 release of the library:
 
 ```bash
-uv pip install vllm==0.17.0 --torch-backend=auto
+uv pip install vllm==0.18.0 --torch-backend=auto
 ```
 
-The release 0.17.0 of vLLM is based on PyTorch 2.10.0 which requires CUDA 12.9 environment.
+The 0.18.0 release of vLLM ships CUDA 12.9-compatible binaries by default. If you need a different CUDA variant or want to reuse an existing PyTorch installation, build vLLM from source instead.
 
 #### Installation of vLLM-Omni
 Since vllm-omni is rapidly evolving, it's recommended to install it from source
@@ -56,11 +56,11 @@ If you want to check, modify or debug with source code of vLLM, install the libr
 ```bash
 git clone https://github.com/vllm-project/vllm.git
 cd vllm
-git checkout v0.17.0
+git checkout v0.18.0
 ```
 Set up environment variables to get pre-built wheels. If there are internet problems, just download the whl file manually. And set `VLLM_PRECOMPILED_WHEEL_LOCATION` as your local absolute path of whl file.
 ```bash
-export VLLM_PRECOMPILED_WHEEL_LOCATION=https://github.com/vllm-project/vllm/releases/download/v0.17.0/vllm-0.17.0-cp38-abi3-manylinux_2_31_x86_64.whl
+export VLLM_PRECOMPILED_WHEEL_LOCATION=https://github.com/vllm-project/vllm/releases/download/v0.18.0/vllm-0.18.0+cu129-cp38-abi3-manylinux_2_35_x86_64.whl
 ```
 Install vllm with command below (If you have no existing PyTorch).
 ```bash
@@ -91,7 +91,7 @@ docker run --runtime nvidia --gpus 2 \
     --env "HF_TOKEN=$HF_TOKEN" \
     -p 8091:8091 \
     --ipc=host \
-    vllm/vllm-omni:v0.16.0 \
+    vllm/vllm-omni:v0.18.0 \
     --model Qwen/Qwen3-Omni-30B-A3B-Instruct --port 8091
 ```
 
diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md
index 701741cd091..da84561c966 100644
--- a/docs/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/getting_started/installation/gpu/rocm.inc.md
@@ -13,7 +13,7 @@ vLLM-Omni current recommends the steps in under setup through Docker Images.
 
 vLLM-Omni is built based on vLLM. Please install it with command below.
 ```bash
-uv pip install vllm==0.17.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.17.0/rocm700
+uv pip install vllm==0.18.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.18.0/rocm700
 ```
 
 #### Installation of vLLM-Omni
@@ -34,13 +34,13 @@ uv pip install onnxruntime-rocm sox
 # --8<-- [start:build-wheel-from-source]
 
 #### Installation of vLLM
-If you do not need to modify source code of vLLM, you can directly install the stable 0.17.0 release version of the library
+If you do not need to modify the source code of vLLM, you can directly install the stable 0.18.0 release of the library:
 
 ```bash
-uv pip install vllm==0.17.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.17.0/rocm700
+uv pip install vllm==0.18.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.18.0/rocm700
 ```
 
-The release 0.17.0 of vLLM requires ROCm 7.0 environment.
+The pre-built 0.18.0 vLLM wheel targets ROCm 7.0. If you need a different ROCm stack or want to reuse an existing PyTorch installation, build vLLM from source instead.
 
 #### Installation of vLLM-Omni
 Since vllm-omni is rapidly evolving, it's recommended to install it from source
@@ -58,7 +58,7 @@ If you want to check, modify or debug with source code of vLLM, install the libr
 ```bash
 git clone https://github.com/vllm-project/vllm.git
 cd vllm
-git checkout v0.17.0
+git checkout v0.18.0
 python3 -m pip install -r requirements/rocm.txt
 python3 setup.py develop
 ```
@@ -130,7 +130,7 @@ docker run --rm \
   -v ~/.cache/huggingface:/root/.cache/huggingface \
   --env "HF_TOKEN=$HF_TOKEN" \
   -p 8091:8091 \
-  vllm/vllm-omni-rocm:v0.16.0 \
+  vllm/vllm-omni-rocm:v0.18.0 \
   --model Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091
 ```
 
@@ -149,7 +149,7 @@ docker run --rm -it \
   -v ~/.cache/huggingface:/root/.cache/huggingface \
   --env "HF_TOKEN=$HF_TOKEN" \
   --entrypoint bash \
-  vllm/vllm-omni-rocm:v0.16.0
+  vllm/vllm-omni-rocm:v0.18.0
 ```
 
 # --8<-- [end:pre-built-images]
diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md
index 9b1182c6b3c..bf0978216bf 100644
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -19,10 +19,10 @@ uv venv --python 3.12 --seed
 source .venv/bin/activate
 
 # On CUDA
-uv pip install vllm==0.17.0 --torch-backend=auto
+uv pip install vllm==0.18.0 --torch-backend=auto
 
 # On ROCm
-uv pip install vllm==0.17.0 --extra-index-url https://wheels.vllm.ai/rocm/0.17.0/rocm700
+uv pip install vllm==0.18.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.18.0/rocm700
 
 git clone https://github.com/vllm-project/vllm-omni.git
 cd vllm-omni

From ca162b50d04ddbc5caae66b64e380148c7612bb4 Mon Sep 17 00:00:00 2001
From: Gao Han <hgaoaf@connect.ust.hk>
Date: Sat, 21 Mar 2026 20:52:03 +0800
Subject: [PATCH 2/2] Refactor Dockerfile by removing vLLM installation

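Removed the precompiled-wheel reinstallation of vLLM and the pinned flashinfer, cuBLAS, and NumPy upgrades from docker/Dockerfile.ci; only vllm-omni (with its dev extras) is installed now.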

Signed-off-by: Gao Han <hgaoaf@connect.ust.hk>
---
 docker/Dockerfile.ci | 25 +++----------------------
 1 file changed, 3 insertions(+), 22 deletions(-)

diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci
index 2289d3fbd43..f4253fe7255 100644
--- a/docker/Dockerfile.ci
+++ b/docker/Dockerfile.ci
@@ -11,29 +11,10 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN uv pip uninstall --system -y vllm || true
+# Install vllm-omni into the same uv-managed Python environment used by the base image.
+# vLLM itself is expected to come from the base image, so no separate vLLM wheel is installed here.
+RUN uv pip install --system --no-cache-dir ".[dev]"
 
-# Install vLLM from precompiled wheel at the selected commit.
-# Must use direct URL because the wheel has a PEP 440 local version identifier
-# (e.g. +g0a0a1a198) which pip/uv refuse to install from a PEP 503 package index.
-ENV VLLM_PRECOMPILED_WHEEL_COMMIT=89138b21cc246ae944c741d5c399c148e2b770ab
-RUN VLLM_WHEEL_URL=$(python3 -c "import urllib.request,re; \
-    html=urllib.request.urlopen('https://wheels.vllm.ai/${VLLM_PRECOMPILED_WHEEL_COMMIT}/vllm/').read().decode(); \
-    m=re.search(r'>(\S+x86_64\.whl)<',html); \
-    print('https://wheels.vllm.ai/${VLLM_PRECOMPILED_WHEEL_COMMIT}/'+m.group(1).replace('+','%2B'))") && \
-    echo "Installing vLLM from: ${VLLM_WHEEL_URL}" && \
-    uv pip install --system --force-reinstall "${VLLM_WHEEL_URL}"
-
-RUN uv pip install --system ".[dev]"
-
-RUN uv pip install --system --upgrade \
-        "flashinfer-cubin==0.6.6" \
-        "nvidia-cublas-cu12==12.9.1.4" \
-        "numpy==2.2.6"
-
-RUN uv pip install --system --upgrade \
-    "flashinfer-jit-cache==0.6.6" \
-    --index-url https://flashinfer.ai/whl/cu129
 RUN ln -sf /usr/bin/python3 /usr/bin/python
 
 ENTRYPOINT []