6 changes: 4 additions & 2 deletions docker/Dockerfile.npu
@@ -2,13 +2,15 @@ ARG VLLM_ASCEND_IMAGE=quay.io/ascend/vllm-ascend
ARG VLLM_ASCEND_TAG=v0.14.0rc1
FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}

WORKDIR /vllm-workspace/vllm-ascend
RUN git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
RUN pip install -v -e .

ARG APP_DIR=/vllm-workspace/vllm-omni
WORKDIR ${APP_DIR}

COPY . .

RUN sed -i -E 's/^([[:space:]]*)"fa3-fwd==0\.0\.1",/\1# "fa3-fwd==0.0.1",/' pyproject.toml \
&& sed -i -E 's/\bonnxruntime\b/onnxruntime-cann/g' pyproject.toml
RUN pip install -v -e .

ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
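
For reference, a rough Python equivalent of the two `sed` edits above (illustration only, not part of the Dockerfile; it assumes `pyproject.toml` lists the dependencies exactly as the patterns expect):

```python
# Sketch: what the sed patch above does, expressed in Python (not part of the PR).
import re
from pathlib import Path

pyproject = Path("pyproject.toml")
text = pyproject.read_text()

# Comment out the fa3-fwd pin, preserving any leading indentation.
text = re.sub(r'^(\s*)"fa3-fwd==0\.0\.1",', r'\1# "fa3-fwd==0.0.1",', text, flags=re.M)
# Switch onnxruntime to the CANN (Ascend) build, mirroring the second sed command.
text = re.sub(r"\bonnxruntime\b", "onnxruntime-cann", text)

pyproject.write_text(text)
```
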
6 changes: 4 additions & 2 deletions docker/Dockerfile.npu.a3
@@ -2,13 +2,15 @@ ARG VLLM_ASCEND_IMAGE=quay.io/ascend/vllm-ascend
ARG VLLM_ASCEND_TAG=v0.14.0rc1-a3
FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}

WORKDIR /vllm-workspace/vllm-ascend
RUN git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
RUN pip install -v -e .

ARG APP_DIR=/vllm-workspace/vllm-omni
WORKDIR ${APP_DIR}

COPY . .

RUN sed -i -E 's/^([[:space:]]*)"fa3-fwd==0\.0\.1",/\1# "fa3-fwd==0.0.1",/' pyproject.toml \
&& sed -i -E 's/\bonnxruntime\b/onnxruntime-cann/g' pyproject.toml
RUN pip install -v -e .

ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
22 changes: 14 additions & 8 deletions docs/getting_started/installation/npu/npu.inc.md
@@ -33,13 +33,17 @@ docker run --rm \
-p 8000:8000 \
-it $IMAGE bash

# Because vllm-ascend will release v0.16.0rc1 after vllm-omni 0.16.0,
# we have to pin vllm-ascend at the current commit.
cd /vllm-workspace/vllm-ascend
git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
pip install -v -e .

# Inside the container, install vLLM-Omni from source
cd /vllm-workspace
git clone -b v0.14.0 https://github.com/vllm-project/vllm-omni.git
git clone -b v0.16.0 https://github.com/vllm-project/vllm-omni.git

cd vllm-omni
sed -i -E 's/^([[:space:]]*)"fa3-fwd==0\.0\.1",/\1# "fa3-fwd==0.0.1",/' pyproject.toml \
&& sed -i -E 's/\bonnxruntime\b/onnxruntime-cann/g' pyproject.toml
pip install -v -e .
export VLLM_WORKER_MULTIPROC_METHOD=spawn
```
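
As a quick post-install sanity check (an illustrative sketch only: the `__version__` attributes are assumed, and the import names `vllm_ascend` / `vllm_omni` follow the package paths used elsewhere in this PR):

```python
# Sketch: confirm the pinned stack is importable inside the container.
import vllm
import vllm_ascend  # import name assumed from "from vllm_ascend.worker.v2 import ..." below
import vllm_omni

# __version__ is assumed to exist; fall back gracefully if a package does not define it.
for name, mod in [("vllm", vllm), ("vllm-ascend", vllm_ascend), ("vllm-omni", vllm_omni)]:
    print(name, getattr(mod, "__version__", "unknown"))
```
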
@@ -48,26 +52,28 @@ The default workdir is `/workspace`, with vLLM, vLLM-Ascend and vLLM-Omni code p

For other installation methods (pip installation, building from source, custom Docker builds), please refer to the [vllm-ascend installation guide](https://docs.vllm.ai/projects/ascend/en/latest/installation.html).

We are keeping [issue #997](https://github.com/vllm-project/vllm-omni/issues/997) up to date with the aligned versions of vLLM, vLLM-Ascend, and vLLM-Omni, and also outlining the Q1 roadmap there.
We are keeping [issue #886](https://github.com/vllm-project/vllm-omni/issues/886) up to date with the aligned versions of vLLM, vLLM-Ascend, and vLLM-Omni, and also outlining the Q1 roadmap there.

# --8<-- [end:installation-release]

# --8<-- [start:installation-main]

You can also build vLLM-Omni from the latest main branch if you want to use the latest features or bug fixes. (Note that the main branch may occasionally be broken for a while; you can check [issue #997](https://github.com/vllm-project/vllm-omni/issues/997) for the status of the latest commit of the vLLM-Omni main branch on NPU.)
You can also build vLLM-Omni from the latest main branch if you want to use the latest features or bug fixes. (Note that the main branch may occasionally be broken for a while; you can check [issue #886](https://github.com/vllm-project/vllm-omni/issues/886) for the status of the latest commit of the vLLM-Omni main branch on NPU.)

```bash
# Pin vLLM version to 0.16.0
cd /vllm-workspace/vllm
git pull origin main
git fetch origin --tags
git checkout v0.16.0
VLLM_TARGET_DEVICE=empty pip install -v -e .

# Because vllm-ascend has not yet officially released a compatible version, we need to pin it to a specific commit. Note that this commit may change over time.
cd ../vllm-ascend
git pull origin main
git fetch origin --tags
git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
pip install -v -e .

# Install vLLM-Omni from the latest main branch
cd ../vllm-omni
Expand All @@ -94,10 +100,10 @@ Here's an example deployment command that has been verified on 4 x NPUs:

```bash
# Atlas A2:
# export IMAGE=quay.io/ascend/vllm-omni:v0.14.0
# export IMAGE=quay.io/ascend/vllm-omni:v0.16.0
# Atlas A3:
# export IMAGE=quay.io/ascend/vllm-omni:v0.14.0-a3
export IMAGE=quay.io/ascend/vllm-omni:v0.14.0
# export IMAGE=quay.io/ascend/vllm-omni:v0.16.0-a3
export IMAGE=quay.io/ascend/vllm-omni:v0.16.0
docker run --rm \
--name vllm-omni-npu \
--shm-size=1g \
3 changes: 3 additions & 0 deletions docs/models/supported_models.md
@@ -66,7 +66,10 @@ th {
| `QwenImageLayeredPipeline` | Qwen-Image-Layered | `Qwen/Qwen-Image-Layered` |
| `QwenImageEditPlusPipeline` | Qwen-Image-Edit-2511 | `Qwen/Qwen-Image-Edit-2511` |
|`ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image-Turbo` |
| `WanPipeline` | Wan2.2-T2V, Wan2.2-TI2V | `Wan-AI/Wan2.2-T2V-A14B-Diffusers`, `Wan-AI/Wan2.2-TI2V-5B-Diffusers` |
| `WanImageToVideoPipeline` | Wan2.2-I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` |
|`LongcatImagePipeline` | LongCat-Image | `meituan-longcat/LongCat-Image` |
|`LongCatImageEditPipeline` | LongCat-Image-Edit | `meituan-longcat/LongCat-Image-Edit` |
|`Flux2KleinPipeline` | FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B`, `black-forest-labs/FLUX.2-klein-9B` |
|`Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-CustomVoice | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` |
|`Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-VoiceDesign | `Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign` |
@@ -18,6 +18,8 @@
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, KVCacheTensor
from vllm.v1.worker.gpu import attn_utils

from vllm_omni.platforms import current_omni_platform

from .configuration_qwen3_tts import Qwen3TTSTalkerCodePredictorConfig, Qwen3TTSTalkerConfig


@@ -148,18 +150,40 @@ def build_attn_metadata(
block_table = self._block_table[:num_reqs].contiguous()
slot_mapping_gpu = slot_mapping.to(device=self.device)

attn_metadata = attn_utils.build_attn_metadata(
self.attn_metadata_builders,
num_reqs=num_reqs,
num_tokens=num_tokens,
query_start_loc_gpu=query_start_loc_gpu,
query_start_loc_cpu=qsl,
seq_lens=seq_lens_gpu,
max_seq_len=max_seq_len,
block_tables=[block_table],
slot_mappings=[slot_mapping_gpu],
kv_cache_config=self.kv_cache_config,
)
# FIXME(gcanlin): Refactor build_attn_metadata to avoid special-casing NPU backends here.
if current_omni_platform.is_npu():
# NPU requires AscendCommonAttentionMetadata with extra attributes
from vllm_ascend.worker.v2 import attn_utils as attn_utils_npu

max_query_len = int(query_lens_i32[:num_reqs].max().item())
# NPU version expects slot_mappings as a stacked tensor, not a list
slot_mappings_tensor = slot_mapping_gpu.unsqueeze(0)
attn_metadata = attn_utils_npu.build_attn_metadata(
attn_metadata_builders=self.attn_metadata_builders,
num_reqs=num_reqs,
num_tokens=num_tokens,
query_start_loc_gpu=query_start_loc_gpu,
query_start_loc_cpu=qsl,
max_query_len=max_query_len,
seq_lens=seq_lens_gpu,
max_seq_len=max_seq_len,
block_tables=[block_table],
slot_mappings=slot_mappings_tensor,
kv_cache_config=self.kv_cache_config,
)
else:
attn_metadata = attn_utils.build_attn_metadata(
self.attn_metadata_builders,
num_reqs=num_reqs,
num_tokens=num_tokens,
query_start_loc_gpu=query_start_loc_gpu,
query_start_loc_cpu=qsl,
seq_lens=seq_lens_gpu,
max_seq_len=max_seq_len,
block_tables=[block_table],
slot_mappings=[slot_mapping_gpu],
kv_cache_config=self.kv_cache_config,
)

# Build slot_mappings_by_layer for set_forward_context.
# Fix for vllm 0.15.0
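
A minimal sketch of the slot-mapping layout difference noted in the comments above (hypothetical sizes; the GPU helper receives a list of per-group tensors, while the NPU helper expects a single stacked tensor):

```python
# Sketch: slot_mappings layout expected by the GPU vs. NPU build_attn_metadata helpers.
import torch

num_tokens = 8  # hypothetical
slot_mapping_gpu = torch.arange(num_tokens, dtype=torch.int64)

# GPU path: a Python list with one slot-mapping tensor per KV-cache group.
slot_mappings_list = [slot_mapping_gpu]               # list of [num_tokens] tensors

# NPU path: the same data stacked into a single [num_groups, num_tokens] tensor.
slot_mappings_tensor = slot_mapping_gpu.unsqueeze(0)
assert slot_mappings_tensor.shape == (1, num_tokens)
```
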
9 changes: 9 additions & 0 deletions vllm_omni/platforms/npu/stage_configs/qwen3_tts.yaml
@@ -8,6 +8,7 @@ stage_args:
engine_args:
model_stage: qwen3_tts
model_arch: Qwen3TTSTalkerForConditionalGeneration
# Force stage-specific registered architecture.
hf_overrides:
architectures: [Qwen3TTSTalkerForConditionalGeneration]
worker_type: ar
@@ -22,6 +23,7 @@ stage_args:
max_num_batched_tokens: 512
max_model_len: 4096
custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk
# Use named connector to apply runtime.connectors.extra.
output_connectors:
to_stage_1: connector_of_shared_memory
default_sampling_params:
@@ -41,6 +43,7 @@ stage_args:
engine_args:
model_stage: code2wav
model_arch: Qwen3TTSCode2Wav
# Force stage-specific registered architecture.
hf_overrides:
architectures: [Qwen3TTSCode2Wav]
worker_type: generation
@@ -52,11 +55,14 @@ stage_args:
engine_output_type: audio
gpu_memory_utilization: 0.2
distributed_executor_backend: "mp"
# Must be divisible by num_code_groups and cover (left_context + chunk).
max_num_batched_tokens: 8192
# async_chunk appends windows per step; max_model_len must cover the accumulated stream.
max_model_len: 32768
engine_input_source: [0]
final_output: true
final_output_type: audio
# Distributed connector configuration
input_connectors:
from_stage_0: connector_of_shared_memory
tts_args:
@@ -81,10 +87,13 @@ runtime:
name: SharedMemoryConnector
extra:
shm_threshold_bytes: 65536
# Frame-aligned codec streaming transport.
codec_streaming: true
# Connector polling / timeout (sleep interval in seconds; the max-wait values are loop counts).
connector_get_sleep_s: 0.01
connector_get_max_wait_first_chunk: 3000
connector_get_max_wait: 300
# Align with Omni: small chunks with sufficient context overlap.
codec_chunk_frames: 25
codec_left_context_frames: 25

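As a rough illustration of the sizing comments above (the `num_code_groups` value is a placeholder assumption, and the per-window token accounting is one plausible reading of the comment, not a statement of the model's exact arithmetic):

```python
# Sketch: code2wav sizing constraints from the YAML above (num_code_groups is a placeholder).
num_code_groups = 32                  # assumption for illustration; model-dependent
codec_chunk_frames = 25
codec_left_context_frames = 25
max_num_batched_tokens = 8192

# One decode window spans (left context + chunk) frames; assume one code token
# per code group per frame.
tokens_per_window = (codec_left_context_frames + codec_chunk_frames) * num_code_groups

assert max_num_batched_tokens % num_code_groups == 0
assert tokens_per_window <= max_num_batched_tokens
print(f"tokens per window: {tokens_per_window}, budget: {max_num_batched_tokens}")
```
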
@@ -86,7 +86,7 @@ def execute_model(
num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
with record_function_or_nullcontext("prepare input"):
# -------------------------------------- Omni-new -------------------------------------------------
if self.model_config.async_chunk:
if self.model_config.async_chunk and num_scheduled_tokens:
self._update_request_states(scheduler_output)
# -------------------------------------- Omni-new -------------------------------------------------
with self.synchronize_input_prep():
33 changes: 19 additions & 14 deletions vllm_omni/platforms/npu/worker/npu_model_runner.py
@@ -34,15 +34,22 @@ def load_model(self, *args, **kwargs) -> None:
# This is a workaround for vllm-ascend not passing vllm_config to enable_sp().
enable_sp(self.vllm_config)
# TODO move this model specific logic to a separate class
if hasattr(self.model, "talker_mtp") and self.model.talker is not None:
self.talker_mtp = self.model.talker_mtp
# TTS model IS the talker (no .talker sub-attr); use getattr to support both Omni and TTS.
talker_mtp = getattr(self.model, "talker_mtp", None)
if talker_mtp is not None:
self.talker_mtp = talker_mtp # type: ignore[assignment]
cudagraph_mode = self.compilation_config.cudagraph_mode
assert cudagraph_mode is not None
if cudagraph_mode.has_full_cudagraphs():
self.talker_mtp = ACLGraphWrapper(
self.model.talker_mtp, self.vllm_config, runtime_mode=CUDAGraphMode.FULL
)
hidden_size = self.model_config.hf_config.talker_config.text_config.hidden_size
# Only wrap talker_mtp in ACLGraphWrapper for Omni models that
# have a separate .talker sub-module. TTS models' code predictor
# has internal AR loops / torch.multinomial, which are not graph-safe.
has_separate_talker = getattr(self.model, "talker", None) is not None
if cudagraph_mode.has_full_cudagraphs() and has_separate_talker:
self.talker_mtp = ACLGraphWrapper(talker_mtp, self.vllm_config, runtime_mode=CUDAGraphMode.FULL)
# TTS exposes mtp_hidden_size; Omni uses hf_text_config.hidden_size.
hidden_size = int(
getattr(self.model, "mtp_hidden_size", 0) or getattr(self.model_config.hf_text_config, "hidden_size")
)
max_batch_size = max(self.max_num_reqs, self.compilation_config.max_cudagraph_capture_size)
self.talker_mtp_input_ids = self._make_buffer(max_batch_size, dtype=torch.int32)
self.talker_mtp_inputs_embeds = self._make_buffer(
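
A small sketch of the `hidden_size` fallback introduced above (the concrete sizes are placeholder assumptions): a non-zero `mtp_hidden_size` on the model wins, otherwise the text-config value is used.

```python
# Sketch: hidden_size resolution for TTS-style vs. Omni-style models (placeholder sizes).
class _TTSModel:
    mtp_hidden_size = 1024            # placeholder; TTS models expose this directly

class _TextConfig:
    hidden_size = 3584                # placeholder; Omni models fall back to hf_text_config

def resolve_hidden_size(model, hf_text_config) -> int:
    # A missing or zero mtp_hidden_size falls through to the text-config value.
    return int(getattr(model, "mtp_hidden_size", 0) or getattr(hf_text_config, "hidden_size"))

assert resolve_hidden_size(_TTSModel(), _TextConfig()) == 1024
assert resolve_hidden_size(object(), _TextConfig()) == 3584
```
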
@@ -337,12 +344,6 @@ def _model_forward(
# Omni-specific: build and inject extra model kwargs
model_kwargs_extra = self._build_model_kwargs_extra()

runtime_info = model_kwargs_extra.get("runtime_additional_information", [])
if runtime_info:
for i, info in enumerate(runtime_info):
if info:
logger.debug(f"[OMNI] req[{i}] runtime_additional_information keys: {list(info.keys())}")

# Call the model forward (same as NPUModelRunner)
assert self.model is not None
model_output = self.model(
@@ -397,6 +398,9 @@ def _talker_mtp_forward(self, decode_req_ids: list[str], inputs_embeds: torch.Te
max_num_scheduled_tokens=1,
use_cascade_attn=False,
)
# Force eager for unwrapped code predictors (AR loops / multinomial).
if not isinstance(self.talker_mtp, ACLGraphWrapper):
_cudagraph_mode = CUDAGraphMode.NONE
num_tokens_padded = batch_desc.num_tokens
req_input_ids = self.talker_mtp_input_ids.gpu[:num_tokens_padded]
req_embeds = self.talker_mtp_inputs_embeds.gpu[:num_tokens_padded]
@@ -408,9 +412,10 @@ req_embeds, code_predictor_codes = self.talker_mtp(req_input_ids, req_embeds, last_talker_hidden, text_step)
req_embeds, code_predictor_codes = self.talker_mtp(req_input_ids, req_embeds, last_talker_hidden, text_step)
# update the inputs_embeds and code_predictor_codes
code_predictor_codes_cpu = code_predictor_codes.detach().to("cpu").contiguous()
out_key = getattr(self.model, "talker_mtp_output_key", "code_predictor_codes")
for idx, req_id in enumerate(decode_req_ids):
req_index = self.input_batch.req_ids.index(req_id)
start_offset = int(self.query_start_loc.cpu[req_index])
inputs_embeds[start_offset : start_offset + 1] = req_embeds[idx : idx + 1]
update_dict = {"code_predictor_codes": code_predictor_codes_cpu[idx : idx + 1]}
update_dict = {out_key: code_predictor_codes_cpu[idx : idx + 1]}
self._merge_additional_information_update(req_id, update_dict)
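
The `talker_mtp_output_key` lookup above lets a model rename the key under which its per-request codes are stored; a minimal sketch of that dispatch, with a hypothetical model class and key name:

```python
# Sketch: how a model could rename the MTP output key read by _talker_mtp_forward.
class MyTTSModel:                                 # hypothetical model class
    talker_mtp_output_key = "tts_codec_codes"     # hypothetical key name

model = MyTTSModel()
out_key = getattr(model, "talker_mtp_output_key", "code_predictor_codes")
update_dict = {out_key: None}                     # the per-request codes tensor goes here
assert "tts_codec_codes" in update_dict
```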