From cab10974402206909a5297ec2eeed80a8af2b6bd Mon Sep 17 00:00:00 2001
From: Hyoseop Song <crad_on25@naver.com>
Date: Tue, 10 Feb 2026 16:10:23 +0900
Subject: [PATCH 1/6] [Doc] add custom cuda build guides

Signed-off-by: Hyoseop Song <crad_on25@naver.com>
---
 docs/getting_started/installation/gpu.md      |  4 ++
 .../installation/gpu/cuda.inc.md              | 45 +++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md
index 141256dbc7f..9afd97a62f4 100644
--- a/docs/getting_started/installation/gpu.md
+++ b/docs/getting_started/installation/gpu.md
@@ -63,6 +63,10 @@ Note: Pre-built wheels are currently only available for vLLM-Omni 0.11.0rc1, 0.1
 
 ### Build your own docker image
 
+=== "NVIDIA CUDA"
+
+    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-docker"
+
 === "AMD ROCm"
 
     --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-docker"
diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md
index 09323cd2588..0f93e2b3bbb 100644
--- a/docs/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/getting_started/installation/gpu/cuda.inc.md
@@ -99,3 +99,48 @@ docker run --runtime nvidia --gpus 2 \
     You can use this docker image to serve models the same way you would with in vLLM! To do so, make sure you overwrite the default entrypoint (`vllm serve --omni`) which works only for models supported in the vLLM-Omni project.
 
 # --8<-- [end:pre-built-images]
+
+# --8<-- [start:build-docker]
+
+#### Build docker image
+
+```bash
+DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.ci -t vllm-omni-cuda .
+```
+
+If you want to specify the base vLLM version:
+
+```bash
+DOCKER_BUILDKIT=1 docker build \
+  -f docker/Dockerfile.ci \
+  --build-arg VLLM_BASE_TAG=v0.15.0 \
+  -t vllm-omni-cuda .
+```
+
+#### Launch the docker image
+
+##### Launch with OpenAI API Server
+
+```bash
+docker run --runtime nvidia --gpus all \
+  -v ~/.cache/huggingface:/root/.cache/huggingface \
+  --env "HF_TOKEN=$HF_TOKEN" \
+  -p 8091:8091 \
+  --ipc=host \
+  vllm-omni-cuda \
+  vllm serve --omni --model Qwen/Qwen3-Omni-30B-A3B-Instruct --port 8091
+```
+
+##### Launch with interactive session for development
+
+```bash
+docker run --runtime nvidia --gpus all -it \
+  -v ~/.cache/huggingface:/root/.cache/huggingface \
+  --env "HF_TOKEN=$HF_TOKEN" \
+  -p 8091:8091 \
+  --ipc=host \
+  --entrypoint bash \
+  vllm-omni-cuda
+```
+
+# --8<-- [end:build-docker]

From 2ef58468fbb50ebf1ef52777b1cbcc7a131b3e4f Mon Sep 17 00:00:00 2001
From: Hyoseop Song <crad_on25@naver.com>
Date: Mon, 6 Apr 2026 17:55:43 +0900
Subject: [PATCH 2/6] docs: revise custom image build guide based on
 Dockerfile.cuda (#1439)

Signed-off-by: Hyoseop Song <crad_on25@naver.com>

Signed-off-by: Hyoseop Song  <crad_on25@naver.com>
---
 docs/getting_started/installation/gpu/cuda.inc.md | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md
index 06c9363df86..55b14fba0f8 100644
--- a/docs/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/getting_started/installation/gpu/cuda.inc.md
@@ -116,15 +116,15 @@ docker run --runtime nvidia --gpus 2 \
 #### Build docker image
 
 ```bash
-DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.ci -t vllm-omni-cuda .
+DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.cuda -t vllm-omni-cuda .
 ```
 
 If you want to specify the base vLLM version:
 
 ```bash
 DOCKER_BUILDKIT=1 docker build \
-  -f docker/Dockerfile.ci \
-  --build-arg VLLM_BASE_TAG=v0.15.0 \
+  -f docker/Dockerfile.cuda \
+  --build-arg BASE_IMAGE=vllm/vllm-openai:v0.18.0 \
   -t vllm-omni-cuda .
 ```
 
@@ -132,8 +132,11 @@ DOCKER_BUILDKIT=1 docker build \
 
 ##### Launch with OpenAI API Server
 
+!!! note
+    The model `Qwen/Qwen3-Omni-30B-A3B-Instruct` requires significant GPU memory. The example below has been verified on 2 x H100 GPUs.
+
 ```bash
-docker run --runtime nvidia --gpus all \
+docker run --runtime nvidia --gpus 2 \
   -v ~/.cache/huggingface:/root/.cache/huggingface \
   --env "HF_TOKEN=$HF_TOKEN" \
   -p 8091:8091 \
@@ -145,7 +148,7 @@ docker run --runtime nvidia --gpus all \
 ##### Launch with interactive session for development
 
 ```bash
-docker run --runtime nvidia --gpus all -it \
+docker run --runtime nvidia --gpus all -it --rm \
   -v ~/.cache/huggingface:/root/.cache/huggingface \
   --env "HF_TOKEN=$HF_TOKEN" \
   -p 8091:8091 \

From f74ba814ba00321cd55bf56578daaaf57a25681b Mon Sep 17 00:00:00 2001
From: Hyoseop Song <crad_on25@naver.com>
Date: Fri, 10 Apr 2026 12:41:31 +0900
Subject: [PATCH 3/6] Update base image version

Signed-off-by: Hyoseop Song <crad_on25@naver.com>

Signed-off-by: Hyoseop Song  <crad_on25@naver.com>
---
 docs/getting_started/installation/gpu/cuda.inc.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md
index 55b14fba0f8..102105746c9 100644
--- a/docs/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/getting_started/installation/gpu/cuda.inc.md
@@ -124,7 +124,7 @@ If you want to specify the base vLLM version:
 ```bash
 DOCKER_BUILDKIT=1 docker build \
   -f docker/Dockerfile.cuda \
-  --build-arg BASE_IMAGE=vllm/vllm-openai:v0.18.0 \
+  --build-arg BASE_IMAGE=vllm/vllm-openai:v0.19.0 \
   -t vllm-omni-cuda .
 ```
 

From 0dff3b8c39ee30706cae0d3c14e1e7023deef57b Mon Sep 17 00:00:00 2001
From: Hyoseop Song <crad_on25@naver.com>
Date: Mon, 27 Apr 2026 11:33:45 +0900
Subject: [PATCH 4/6] docs: Use HF_HOME env for model cache path

Signed-off-by: Hyoseop Song <crad_on25@naver.com>

Signed-off-by: Hyoseop Song  <crad_on25@naver.com>
---
 docs/getting_started/installation/gpu/cuda.inc.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md
index 102105746c9..666dc63a61a 100644
--- a/docs/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/getting_started/installation/gpu/cuda.inc.md
@@ -137,7 +137,7 @@ DOCKER_BUILDKIT=1 docker build \
 
 ```bash
 docker run --runtime nvidia --gpus 2 \
-  -v ~/.cache/huggingface:/root/.cache/huggingface \
+  -v ${HF_HOME:-$HOME/.cache/huggingface}:/root/.cache/huggingface \
   --env "HF_TOKEN=$HF_TOKEN" \
   -p 8091:8091 \
   --ipc=host \
@@ -145,11 +145,13 @@ docker run --runtime nvidia --gpus 2 \
   vllm serve --omni --model Qwen/Qwen3-Omni-30B-A3B-Instruct --port 8091
 ```
 
+By default, this mounts the host directory `$HOME/.cache/huggingface` into the container at `/root/.cache/huggingface` as the model cache. To use a different host location, set the `HF_HOME` environment variable before running the command (e.g., `export HF_HOME=/data/models`).
+
 ##### Launch with interactive session for development
 
 ```bash
 docker run --runtime nvidia --gpus all -it --rm \
-  -v ~/.cache/huggingface:/root/.cache/huggingface \
+  -v ${HF_HOME:-$HOME/.cache/huggingface}:/root/.cache/huggingface \
   --env "HF_TOKEN=$HF_TOKEN" \
   -p 8091:8091 \
   --ipc=host \

From 835b259cc5077c154454ba06dd159b536df3c4b8 Mon Sep 17 00:00:00 2001
From: Hyoseop Song <crad_on25@naver.com>
Date: Wed, 13 May 2026 16:44:15 +0900
Subject: [PATCH 5/6] Update CUDA base image version to v0.20.0

Signed-off-by: Hyoseop Song <crad_on25@naver.com>
---
 docs/getting_started/installation/gpu/cuda.inc.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md
index 9df2e4ea19d..6c04eea4d0a 100644
--- a/docs/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/getting_started/installation/gpu/cuda.inc.md
@@ -124,7 +124,7 @@ If you want to specify the base vLLM version:
 ```bash
 DOCKER_BUILDKIT=1 docker build \
   -f docker/Dockerfile.cuda \
-  --build-arg BASE_IMAGE=vllm/vllm-openai:v0.19.0 \
+  --build-arg BASE_IMAGE=vllm/vllm-openai:v0.20.0 \
   -t vllm-omni-cuda .
 ```
 

From 97f04c041a119780b182daa31ed32b8e5101e66e Mon Sep 17 00:00:00 2001
From: Hyoseop Song <crad_on25@naver.com>
Date: Tue, 26 May 2026 10:37:05 +0900
Subject: [PATCH 6/6] Update CUDA base image version to v0.21.0

Signed-off-by: Hyoseop Song <crad_on25@naver.com>
---
 docs/getting_started/installation/gpu/cuda.inc.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md
index 12631dbc4bb..4e097d8a314 100644
--- a/docs/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/getting_started/installation/gpu/cuda.inc.md
@@ -124,7 +124,7 @@ If you want to specify the base vLLM version:
 ```bash
 DOCKER_BUILDKIT=1 docker build \
   -f docker/Dockerfile.cuda \
-  --build-arg BASE_IMAGE=vllm/vllm-openai:v0.20.0 \
+  --build-arg BASE_IMAGE=vllm/vllm-openai:v0.21.0 \
   -t vllm-omni-cuda .
 ```