diff --git a/docker/Dockerfile.ci.npu b/docker/Dockerfile.ci.npu
deleted file mode 100644
index cdf7a70f3a..0000000000
--- a/docker/Dockerfile.ci.npu
+++ /dev/null
@@ -1,15 +0,0 @@
-ARG VLLM_ASCEND_IMAGE=quay.nju.edu.cn/ascend/vllm-ascend
-ARG VLLM_ASCEND_TAG=v0.11.0rc2
-FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}
-
-ARG APP_DIR=/vllm-workspace/vllm-omni
-WORKDIR ${APP_DIR}
-
-COPY . .
-
-# Install vllm-omni with dev dependencies
-RUN pip install --no-cache-dir -e ".[dev]"
-
-ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
-
-ENTRYPOINT []
diff --git a/docker/Dockerfile.npu b/docker/Dockerfile.npu
new file mode 100644
index 0000000000..ab58ebec0d
--- /dev/null
+++ b/docker/Dockerfile.npu
@@ -0,0 +1,19 @@
+ARG VLLM_ASCEND_IMAGE=quay.io/ascend/vllm-ascend
+ARG VLLM_ASCEND_TAG=v0.14.0rc1
+FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}
+
+ARG APP_DIR=/vllm-workspace/vllm-omni
+WORKDIR ${APP_DIR}
+
+COPY . .
+
+# Remove this workaround once per-platform requirements dispatch is ready
+RUN sed -i -E 's/^([[:space:]]*)"fa3-fwd==0\.0\.1",/\1# "fa3-fwd==0.0.1",/' pyproject.toml \
+    && sed -i -E 's/\bonnxruntime\b/onnxruntime-cann/g' pyproject.toml
+
+# Install vllm-omni in editable mode
+RUN pip install --no-cache-dir -e .
+
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+
+ENTRYPOINT []
diff --git a/docker/Dockerfile.npu.a3 b/docker/Dockerfile.npu.a3
new file mode 100644
index 0000000000..17515fdb98
--- /dev/null
+++ b/docker/Dockerfile.npu.a3
@@ -0,0 +1,19 @@
+ARG VLLM_ASCEND_IMAGE=quay.io/ascend/vllm-ascend
+ARG VLLM_ASCEND_TAG=v0.14.0rc1-a3
+FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}
+
+ARG APP_DIR=/vllm-workspace/vllm-omni
+WORKDIR ${APP_DIR}
+
+COPY . .
+
+# Remove this workaround once per-platform requirements dispatch is ready
+RUN sed -i -E 's/^([[:space:]]*)"fa3-fwd==0\.0\.1",/\1# "fa3-fwd==0.0.1",/' pyproject.toml \
+    && sed -i -E 's/\bonnxruntime\b/onnxruntime-cann/g' pyproject.toml
+
+# Install vllm-omni in editable mode
+RUN pip install --no-cache-dir -e .
+
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+
+ENTRYPOINT []
diff --git a/docs/getting_started/installation/npu.md b/docs/getting_started/installation/npu.md
index 197bcec305..5ee2c0ee4a 100644
--- a/docs/getting_started/installation/npu.md
+++ b/docs/getting_started/installation/npu.md
@@ -16,7 +16,13 @@ vLLM-Omni supports NPU through the vLLM Ascend Plugin (vllm-ascend). This is a c
 
 ## Installation
 
-### Recommended
+### Set up using Docker
+
+=== "NPU"
+
+    --8<-- "docs/getting_started/installation/npu/npu.inc.md:pre-built-images"
+
+### Build wheel from source
 
 === "NPU"
 
diff --git a/docs/getting_started/installation/npu/npu.inc.md b/docs/getting_started/installation/npu/npu.inc.md
index 9044bb0898..ff71f40091 100644
--- a/docs/getting_started/installation/npu/npu.inc.md
+++ b/docs/getting_started/installation/npu/npu.inc.md
@@ -13,10 +13,10 @@
 export DEVICE0=/dev/davinci0
 export DEVICE1=/dev/davinci1
 # Update the vllm-ascend image
 # Atlas A2:
-# export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0
+# export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0rc1
 # Atlas A3:
-# export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0-a3
+# export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0rc1-a3
-export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0
+export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0rc1
@@ -34,26 +34,91 @@
 docker run --rm \
     --name vllm-omni-npu \
     --shm-size=1g \
     --device $DEVICE0 \
     --device $DEVICE1 \
     --device /dev/davinci_manager \
     --device /dev/devmm_svm \
     --device /dev/hisi_hdc \
     -v /usr/local/dcmi:/usr/local/dcmi \
     -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
     -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
     -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
     -v /etc/ascend_install.info:/etc/ascend_install.info \
     -v /root/.cache:/root/.cache \
     -p 8000:8000 \
     -it $IMAGE bash
-
-# Install the missing dependency of mooncake in the origin image.
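+
+# (Optional) Sanity check, added here for illustration: npu-smi is mounted
+# by the docker run flags above, so you can confirm the NPUs are visible
+# inside the container before installing vLLM-Omni.
+npu-smi info
+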
-apt update
-apt install libjemalloc2
-echo "export LD_PRELOAD=/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so.2:$LD_PRELOAD" >> ~/.bashrc
-source ~/.bashrc
-
 # Inside the container, install vLLM-Omni from source
 cd /vllm-workspace
 git clone -b v0.14.0 https://github.com/vllm-project/vllm-omni.git
+
+# Remove this workaround once per-platform requirements dispatch is ready
+sed -i -E 's/^([[:space:]]*)"fa3-fwd==0\.0\.1",/\1# "fa3-fwd==0.0.1",/' vllm-omni/pyproject.toml \
+    && sed -i -E 's/\bonnxruntime\b/onnxruntime-cann/g' vllm-omni/pyproject.toml
+
 cd vllm-omni
 pip install -v -e .
 export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
-# (Optional) Disable mooncake for stable capability
-mv /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake \
-    /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake.disabled
 ```
 
 The default workdir is `/workspace`, with vLLM, vLLM-Ascend and vLLM-Omni code placed in `/vllm-workspace` installed in development mode.
 
 For other installation methods (pip installation, building from source, custom Docker builds), please refer to the [vllm-ascend installation guide](https://docs.vllm.ai/projects/ascend/en/latest/installation.html).
 
+We are keeping [issue #997](https://github.com/vllm-project/vllm-omni/issues/997) up to date with the aligned versions of vLLM, vLLM-Ascend, and vLLM-Omni, and also outlining the Q1 roadmap there.
+
 # --8<-- [end:installation]
+
+# --8<-- [start:pre-built-images]
+
+`vllm-ascend` offers Docker images for deployment. You can pull the **prebuilt image** from the image repository [ascend/vllm-ascend](https://quay.io/repository/ascend/vllm-ascend?tab=tags) and run it with bash.
+
+The supported images are as follows:
+
+| Image name | Hardware | OS |
+|-|-|-|
+| image-tag | Atlas A2 | Ubuntu |
+| image-tag-a3 | Atlas A3 | Ubuntu |
+
+vLLM-Omni likewise offers prebuilt Docker images for Ascend NPU deployment, published in the image repository [ascend/vllm-omni](https://quay.io/repository/ascend/vllm-omni?tab=tags).
+
+Here's an example deployment command that has been verified on 6 NPUs:
+
+```bash
+# Atlas A2:
+# export IMAGE=quay.io/ascend/vllm-omni:v0.14.0
+# Atlas A3:
+# export IMAGE=quay.io/ascend/vllm-omni:v0.14.0-a3
+export IMAGE=quay.io/ascend/vllm-omni:v0.14.0
+docker run --rm \
+    --name vllm-omni-npu \
+    --shm-size=1g \
+    --device /dev/davinci0 \
+    --device /dev/davinci1 \
+    --device /dev/davinci2 \
+    --device /dev/davinci3 \
+    --device /dev/davinci4 \
+    --device /dev/davinci5 \
+    --device /dev/davinci_manager \
+    --device /dev/devmm_svm \
+    --device /dev/hisi_hdc \
+    -v /usr/local/dcmi:/usr/local/dcmi \
+    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+    -v /etc/ascend_install.info:/etc/ascend_install.info \
+    -v /root/.cache:/root/.cache \
+    -p 8000:8000 \
+    -it $IMAGE bash
+```
+
+!!! tip
+    You can use this Docker image to serve models the same way you would in vLLM. To do so, make sure you overwrite the default entrypoint (`vllm serve --omni`), which only works for models supported by the vLLM-Omni project.
+
+Or build the image from **source code**:
+
+```bash
+git clone https://github.com/vllm-project/vllm-omni.git
+cd vllm-omni
+# A2
+docker build -t vllm-omni-dev-image:latest -f ./docker/Dockerfile.npu .
+# A3
+# docker build -t vllm-omni-dev-image:latest -f ./docker/Dockerfile.npu.a3 .
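+
+# The locally built image can then be used in place of the prebuilt one, e.g.:
+# export IMAGE=vllm-omni-dev-image:latest
+# and rerun the docker run command from the prebuilt-image example above.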
+``` + +# --8<-- [end:pre-built-images] diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 269c9261ea..ca581d38f4 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -51,6 +51,7 @@ th { | Architecture | Models | Example HF Models | |--------------|--------|-------------------| +| `Qwen3OmniMoeForConditionalGeneration` | Qwen3-Omni | `Qwen/Qwen3-Omni-30B-A3B-Instruct` | | `Qwen2_5OmniForConditionalGeneration` | Qwen2.5-Omni | `Qwen/Qwen2.5-Omni-7B`, `Qwen/Qwen2.5-Omni-3B`| | `QwenImagePipeline` | Qwen-Image | `Qwen/Qwen-Image` | | `QwenImagePipeline` | Qwen-Image-2512 | `Qwen/Qwen-Image-2512` | @@ -59,3 +60,8 @@ th { | `QwenImageLayeredPipeline` | Qwen-Image-Layered | `Qwen/Qwen-Image-Layered` | | `QwenImageEditPlusPipeline` | Qwen-Image-Edit-2511 | `Qwen/Qwen-Image-Edit-2511` | |`ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image-Turbo` | +|`LongcatImagePipeline` | LongCat-Image | `meituan-longcat/LongCat-Image` | +|`Flux2KleinPipeline` | FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B`, `black-forest-labs/FLUX.2-klein-9B` | +|`Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-CustomVoice | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | +|`Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-VoiceDesign | `Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign` | +|`Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-Base | `Qwen/Qwen3-TTS-12Hz-0.6B-Base` | diff --git a/examples/online_serving/image_to_image/README.md b/examples/online_serving/image_to_image/README.md index 171a336804..f69fa8b428 100644 --- a/examples/online_serving/image_to_image/README.md +++ b/examples/online_serving/image_to_image/README.md @@ -12,6 +12,9 @@ For **multi-image** input editing, use **Qwen-Image-Edit-2509** (QwenImageEditPl vllm serve Qwen/Qwen-Image-Edit --omni --port 8092 ``` +!!! note + If you encounter Out-of-Memory (OOM) issues or have limited GPU memory, you can enable VAE slicing and tiling to reduce memory usage, --vae-use-slicing --vae-use-tiling + ### Multi-Image Edit (Qwen-Image-Edit-2509) ```bash