15 changes: 0 additions & 15 deletions docker/Dockerfile.ci.npu

This file was deleted.

19 changes: 19 additions & 0 deletions docker/Dockerfile.npu
@@ -0,0 +1,19 @@
ARG VLLM_ASCEND_IMAGE=quay.io/ascend/vllm-ascend
ARG VLLM_ASCEND_TAG=v0.14.0rc1
FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}

ARG APP_DIR=/vllm-workspace/vllm-omni
WORKDIR ${APP_DIR}

COPY . .

# Remove this replacement once the per-platform dispatch of requirements is ready
RUN sed -i -E 's/^([[:space:]]*)"fa3-fwd==0\.0\.1",/\1# "fa3-fwd==0.0.1",/' pyproject.toml \
&& sed -i -E 's/\bonnxruntime\b/onnxruntime-cann/g' pyproject.toml

# Install vLLM-Omni in editable (development) mode
RUN pip install --no-cache-dir -e .

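# Use the spawn start method for worker multiprocessing (fork can conflict with device runtimes)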
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn

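# Clear any entrypoint inherited from the base image so commands passed to docker run execute directly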
ENTRYPOINT []
19 changes: 19 additions & 0 deletions docker/Dockerfile.npu.a3
@@ -0,0 +1,19 @@
ARG VLLM_ASCEND_IMAGE=quay.io/ascend/vllm-ascend
ARG VLLM_ASCEND_TAG=v0.14.0rc1-a3
FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}

ARG APP_DIR=/vllm-workspace/vllm-omni
WORKDIR ${APP_DIR}

COPY . .

# Remove this replacement once the per-platform dispatch of requirements is ready
RUN sed -i -E 's/^([[:space:]]*)"fa3-fwd==0\.0\.1",/\1# "fa3-fwd==0.0.1",/' pyproject.toml \
&& sed -i -E 's/\bonnxruntime\b/onnxruntime-cann/g' pyproject.toml

# Install vLLM-Omni in editable (development) mode
RUN pip install --no-cache-dir -e .

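# Use the spawn start method for worker multiprocessing (fork can conflict with device runtimes)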
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn

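# Clear any entrypoint inherited from the base image so commands passed to docker run execute directly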
ENTRYPOINT []
8 changes: 7 additions & 1 deletion docs/getting_started/installation/npu.md
@@ -16,7 +16,13 @@ vLLM-Omni supports NPU through the vLLM Ascend Plugin (vllm-ascend). This is a c

## Installation

-### Recommended
+### Set up using Docker

=== "NPU"

--8<-- "docs/getting_started/installation/npu/npu.inc.md:pre-built-images"

### Build wheel from source

=== "NPU"

82 changes: 69 additions & 13 deletions docs/getting_started/installation/npu/npu.inc.md
@@ -13,10 +13,10 @@ export DEVICE0=/dev/davinci0
export DEVICE1=/dev/davinci1
# Update the vllm-ascend image
# Atlas A2:
-# export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0
+# export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0rc1
# Atlas A3:
-# export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0-a3
-export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0
+# export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0rc1-a3
+export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0rc1
docker run --rm \
--name vllm-omni-npu \
--shm-size=1g \
@@ -34,26 +34,82 @@ docker run --rm \
-p 8000:8000 \
-it $IMAGE bash

# Install the dependency that mooncake needs but is missing from the original image.
apt update
apt install -y libjemalloc2
echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc
source ~/.bashrc

# Inside the container, install vLLM-Omni from source
cd /vllm-workspace
git clone -b v0.14.0 https://github.com/vllm-project/vllm-omni.git
cd vllm-omni

# Remove this replacement once the per-platform dispatch of requirements is ready
sed -i -E 's/^([[:space:]]*)"fa3-fwd==0\.0\.1",/\1# "fa3-fwd==0.0.1",/' pyproject.toml \
    && sed -i -E 's/\bonnxruntime\b/onnxruntime-cann/g' pyproject.toml

pip install -v -e .
export VLLM_WORKER_MULTIPROC_METHOD=spawn

# (Optional) Disable mooncake for stability
mv /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake \
/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake.disabled
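# (To re-enable mooncake later, move the directory back to its original name.)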
```

The default working directory is `/workspace`; the vLLM, vLLM-Ascend, and vLLM-Omni source trees live under `/vllm-workspace` and are installed in development (editable) mode.
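
A quick sanity check inside the container, assuming the package imports as `vllm_omni` and using a model from the supported list:

```bash
# Confirm the editable install is importable
python -c "import vllm_omni" && echo "vLLM-Omni import OK"
# Launch the OpenAI-compatible server (model choice is an example)
vllm serve Qwen/Qwen2.5-Omni-7B --omni --port 8000
```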

For other installation methods (pip installation, building from source, custom Docker builds), please refer to the [vllm-ascend installation guide](https://docs.vllm.ai/projects/ascend/en/latest/installation.html).

We are keeping [issue #997](https://github.com/vllm-project/vllm-omni/issues/997) up to date with the aligned versions of vLLM, vLLM-Ascend, and vLLM-Omni, and also outlining the Q1 roadmap there.
> **Review comment (Collaborator, Author):** Here I paste the Q1 roadmap link, so that NPU users can get the latest information here.

# --8<-- [end:installation]

# --8<-- [start:pre-built-images]

`vllm-ascend` offers Docker images for deployment. You can pull a **prebuilt image** from the image repository [ascend/vllm-ascend](https://quay.io/repository/ascend/vllm-ascend?tab=tags) and run it with bash.

The supported images are as follows:

| Image tag | Hardware | OS |
|-|-|-|
| `image-tag` | Atlas A2 | Ubuntu |
| `image-tag-a3` | Atlas A3 | Ubuntu |

vLLM-Omni also offers Docker images for Ascend NPU deployment. You can pull the **prebuilt image** from the image repository [ascend/vllm-ascend](https://quay.io/repository/ascend/vllm-ascend?tab=tags) and run it with bash.

Here's an example deployment command that has been verified on 6 NPUs:

```bash
# Atlas A2:
# export IMAGE=quay.io/ascend/vllm-omni:v0.14.0
# Atlas A3:
# export IMAGE=quay.io/ascend/vllm-omni:v0.14.0-a3
export IMAGE=quay.io/ascend/vllm-omni:v0.14.0
docker run --rm \
--name vllm-omni-npu \
--shm-size=1g \
--device /dev/davinci0 \
--device /dev/davinci1 \
--device /dev/davinci2 \
--device /dev/davinci3 \
--device /dev/davinci4 \
--device /dev/davinci5 \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /root/.cache:/root/.cache \
-p 8000:8000 \
-it $IMAGE bash
```

!!! tip
    You can use this Docker image to serve models the same way you would in vLLM! To do so, make sure you override the default entrypoint (`vllm serve --omni`), which works only for models supported by the vLLM-Omni project.
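
    For instance, a sketch of overriding the entrypoint to run a plain vLLM server (the model name is illustrative; add the NPU device and volume flags from the run command above):

    ```bash
    docker run --rm -it --entrypoint "" -p 8000:8000 $IMAGE \
      vllm serve Qwen/Qwen2.5-7B-Instruct
    ```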

Alternatively, build the image from **source code**:

```bash
git clone https://github.com/vllm-project/vllm-omni.git
cd vllm-omni
# A2
docker build -t vllm-omni-dev-image:latest -f ./docker/Dockerfile.npu .
# A3
# docker build -t vllm-omni-dev-image:latest -f ./docker/Dockerfile.npu.a3 .
```
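
Both Dockerfiles expose the base image and tag as build args, so a different `vllm-ascend` base can be pinned at build time; for example (values shown are the A3 defaults from `docker/Dockerfile.npu.a3`):

```bash
docker build \
  --build-arg VLLM_ASCEND_IMAGE=quay.io/ascend/vllm-ascend \
  --build-arg VLLM_ASCEND_TAG=v0.14.0rc1-a3 \
  -t vllm-omni-dev-image:latest -f ./docker/Dockerfile.npu.a3 .
```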

# --8<-- [end:pre-built-images]
6 changes: 6 additions & 0 deletions docs/models/supported_models.md
@@ -51,6 +51,7 @@ th {

| Architecture | Models | Example HF Models |
|--------------|--------|-------------------|
| `Qwen3OmniMoeForConditionalGeneration` | Qwen3-Omni | `Qwen/Qwen3-Omni-30B-A3B-Instruct` |
| `Qwen2_5OmniForConditionalGeneration` | Qwen2.5-Omni | `Qwen/Qwen2.5-Omni-7B`, `Qwen/Qwen2.5-Omni-3B`|
| `QwenImagePipeline` | Qwen-Image | `Qwen/Qwen-Image` |
| `QwenImagePipeline` | Qwen-Image-2512 | `Qwen/Qwen-Image-2512` |
@@ -59,3 +60,8 @@ th {
| `QwenImageLayeredPipeline` | Qwen-Image-Layered | `Qwen/Qwen-Image-Layered` |
| `QwenImageEditPlusPipeline` | Qwen-Image-Edit-2511 | `Qwen/Qwen-Image-Edit-2511` |
| `ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image-Turbo` |
| `LongcatImagePipeline` | LongCat-Image | `meituan-longcat/LongCat-Image` |
| `Flux2KleinPipeline` | FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B`, `black-forest-labs/FLUX.2-klein-9B` |
| `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-CustomVoice | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` |
| `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-VoiceDesign | `Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign` |
| `Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-0.6B-Base | `Qwen/Qwen3-TTS-12Hz-0.6B-Base` |
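
Models in this table are served with the `--omni` flag; for example (model name taken from the table above, port is an arbitrary choice):

```bash
vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8000
```
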
3 changes: 3 additions & 0 deletions examples/online_serving/image_to_image/README.md
@@ -12,6 +12,9 @@ For **multi-image** input editing, use **Qwen-Image-Edit-2509** (QwenImageEditPl
vllm serve Qwen/Qwen-Image-Edit --omni --port 8092
```

!!! note
    If you encounter out-of-memory (OOM) issues or have limited GPU memory, you can enable VAE slicing and tiling to reduce memory usage: `--vae-use-slicing --vae-use-tiling`.
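
    For example, extending the serve command above with these flags:

    ```bash
    vllm serve Qwen/Qwen-Image-Edit --omni --port 8092 --vae-use-slicing --vae-use-tiling
    ```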

### Multi-Image Edit (Qwen-Image-Edit-2509)

```bash