diff --git a/docker/Dockerfile.npu b/docker/Dockerfile.npu
index 1c65d43b07..1bd8601ef0 100644
--- a/docker/Dockerfile.npu
+++ b/docker/Dockerfile.npu
@@ -2,13 +2,15 @@ ARG VLLM_ASCEND_IMAGE=quay.io/ascend/vllm-ascend
 ARG VLLM_ASCEND_TAG=v0.14.0rc1
 FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}

+WORKDIR /vllm-workspace/vllm-ascend
+RUN git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
+RUN pip install -v -e .
+
 ARG APP_DIR=/vllm-workspace/vllm-omni
 WORKDIR ${APP_DIR}
 COPY . .

-RUN sed -i -E 's/^([[:space:]]*)"fa3-fwd==0\.0\.1",/\1# "fa3-fwd==0.0.1",/' pyproject.toml \
-    && sed -i -E 's/\bonnxruntime\b/onnxruntime-cann/g' pyproject.toml
 RUN pip install -v -e .

 ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
diff --git a/docker/Dockerfile.npu.a3 b/docker/Dockerfile.npu.a3
index 33b5d3c4e0..d521288f66 100644
--- a/docker/Dockerfile.npu.a3
+++ b/docker/Dockerfile.npu.a3
@@ -2,13 +2,15 @@ ARG VLLM_ASCEND_IMAGE=quay.io/ascend/vllm-ascend
 ARG VLLM_ASCEND_TAG=v0.14.0rc1-a3
 FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG}

+WORKDIR /vllm-workspace/vllm-ascend
+RUN git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
+RUN pip install -v -e .
+
 ARG APP_DIR=/vllm-workspace/vllm-omni
 WORKDIR ${APP_DIR}
 COPY . .

-RUN sed -i -E 's/^([[:space:]]*)"fa3-fwd==0\.0\.1",/\1# "fa3-fwd==0.0.1",/' pyproject.toml \
-    && sed -i -E 's/\bonnxruntime\b/onnxruntime-cann/g' pyproject.toml
 RUN pip install -v -e .

 ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
diff --git a/docs/getting_started/installation/npu/npu.inc.md b/docs/getting_started/installation/npu/npu.inc.md
index 50e18697b7..27a3518a8b 100644
--- a/docs/getting_started/installation/npu/npu.inc.md
+++ b/docs/getting_started/installation/npu/npu.inc.md
@@ -33,13 +33,17 @@ docker run --rm \
   -p 8000:8000 \
   -it $IMAGE bash

+# Because vllm-ascend will release v0.16.0rc1 after vllm-omni 0.16.0,
+# we have to pin vllm-ascend to a specific commit for now.
+cd /vllm-workspace/vllm-ascend
+git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
+pip install -v -e .
+
 # Inside the container, install vLLM-Omni from source
 cd /vllm-workspace
-git clone -b v0.14.0 https://github.com/vllm-project/vllm-omni.git
+git clone -b v0.16.0 https://github.com/vllm-project/vllm-omni.git
 cd vllm-omni
-sed -i -E 's/^([[:space:]]*)"fa3-fwd==0\.0\.1",/\1# "fa3-fwd==0.0.1",/' pyproject.toml \
-    && sed -i -E 's/\bonnxruntime\b/onnxruntime-cann/g' pyproject.toml
 pip install -v -e .

 export VLLM_WORKER_MULTIPROC_METHOD=spawn
 ```
@@ -48,13 +52,13 @@ The default workdir is `/workspace`, with vLLM, vLLM-Ascend and vLLM-Omni code p

 For other installation methods (pip installation, building from source, custom Docker builds), please refer to the [vllm-ascend installation guide](https://docs.vllm.ai/projects/ascend/en/latest/installation.html).

-We are keeping [issue #997](https://github.com/vllm-project/vllm-omni/issues/997) up to date with the aligned versions of vLLM, vLLM-Ascend, and vLLM-Omni, and also outlining the Q1 roadmap there.
+We are keeping [issue #886](https://github.com/vllm-project/vllm-omni/issues/886) up to date with the aligned versions of vLLM, vLLM-Ascend, and vLLM-Omni, and also outlining the Q1 roadmap there.

 # --8<-- [end:installation-release]

 # --8<-- [start:installation-main]

-You can also build vLLM-Omni from the latest main branch if you want to use the latest features or bug fixes. (But sometimes it will break for a while. You can check [issue #997](https://github.com/vllm-project/vllm-omni/issues/997) for the status of the latest commit of vLLM-Omni main branch on NPU.)
+You can also build vLLM-Omni from the latest main branch if you want the latest features or bug fixes. (Note that the main branch may occasionally be broken for a while; check [issue #886](https://github.com/vllm-project/vllm-omni/issues/886) for the status of the latest vLLM-Omni main-branch commit on NPU.)

 ```bash
 # Pin vLLM version to 0.16.0
@@ -62,12 +66,14 @@ cd /vllm-workspace/vllm
 git pull origin main
 git fetch origin --tags
 git checkout v0.16.0
+VLLM_TARGET_DEVICE=empty pip install -v -e .

 # Because vllm-ascend has not yet entered continuous development and has not been officially released, we need to pin it to a specific commit. Please note that this commit may change over time.
 cd ../vllm-ascend
 git pull origin main
 git fetch origin --tags
 git checkout e2175d9c7e62b437391dfee996b1375674ba7c18
+pip install -v -e .

 # Install vLLM-Omni from the latest main branch
 cd ../vllm-omni
@@ -94,10 +100,10 @@ Here's an example deployment command that has been verified on 4 x NPUs:

 ```bash
 # Atlas A2:
-# export IMAGE=quay.io/ascend/vllm-omni:v0.14.0
+# export IMAGE=quay.io/ascend/vllm-omni:v0.16.0
 # Atlas A3:
-# export IMAGE=quay.io/ascend/vllm-omni:v0.14.0-a3
+# export IMAGE=quay.io/ascend/vllm-omni:v0.16.0-a3
 export IMAGE=quay.io/ascend/vllm-omni:v0.16.0
 docker run --rm \
   --name vllm-omni-npu \
   --shm-size=1g \
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index c8ced1b747..980488852f 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -66,7 +66,10 @@ th {
 | `QwenImageLayeredPipeline` | Qwen-Image-Layered | `Qwen/Qwen-Image-Layered` |
 | `QwenImageEditPlusPipeline` | Qwen-Image-Edit-2511 | `Qwen/Qwen-Image-Edit-2511` |
 |`ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image-Turbo` |
+| `WanPipeline` | Wan2.2-T2V, Wan2.2-TI2V | `Wan-AI/Wan2.2-T2V-A14B-Diffusers`, `Wan-AI/Wan2.2-TI2V-5B-Diffusers` |
+| `WanImageToVideoPipeline` | Wan2.2-I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` |
 |`LongcatImagePipeline` | LongCat-Image | `meituan-longcat/LongCat-Image` |
+|`LongCatImageEditPipeline` | LongCat-Image-Edit | `meituan-longcat/LongCat-Image-Edit` |
 |`Flux2KleinPipeline` | FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B`, `black-forest-labs/FLUX.2-klein-9B` |
 |`Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-CustomVoice | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` |
 |`Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-VoiceDesign | `Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign` |
diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py
index b8b8f6bed4..a3133c1796 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py
@@ -18,6 +18,8 @@
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, KVCacheTensor
 from vllm.v1.worker.gpu import attn_utils

+from vllm_omni.platforms import current_omni_platform
+
 from .configuration_qwen3_tts import Qwen3TTSTalkerCodePredictorConfig, Qwen3TTSTalkerConfig

@@ -148,18 +150,40 @@ def build_attn_metadata(
         block_table = self._block_table[:num_reqs].contiguous()
         slot_mapping_gpu = slot_mapping.to(device=self.device)

-        attn_metadata = attn_utils.build_attn_metadata(
-            self.attn_metadata_builders,
-            num_reqs=num_reqs,
-            num_tokens=num_tokens,
-            query_start_loc_gpu=query_start_loc_gpu,
-            query_start_loc_cpu=qsl,
-            seq_lens=seq_lens_gpu,
-            max_seq_len=max_seq_len,
-            block_tables=[block_table],
-            slot_mappings=[slot_mapping_gpu],
-            kv_cache_config=self.kv_cache_config,
-        )
+        # FIXME(gcanlin): Refactor build_attn_metadata to avoid special-casing NPU backends here.
+        if current_omni_platform.is_npu():
+            # NPU requires AscendCommonAttentionMetadata with extra attributes.
+            from vllm_ascend.worker.v2 import attn_utils as attn_utils_npu
+
+            max_query_len = int(query_lens_i32[:num_reqs].max().item())
+            # The NPU builder expects slot_mappings as a stacked tensor, not a list.
+            slot_mappings_tensor = slot_mapping_gpu.unsqueeze(0)
+            attn_metadata = attn_utils_npu.build_attn_metadata(
+                attn_metadata_builders=self.attn_metadata_builders,
+                num_reqs=num_reqs,
+                num_tokens=num_tokens,
+                query_start_loc_gpu=query_start_loc_gpu,
+                query_start_loc_cpu=qsl,
+                max_query_len=max_query_len,
+                seq_lens=seq_lens_gpu,
+                max_seq_len=max_seq_len,
+                block_tables=[block_table],
+                slot_mappings=slot_mappings_tensor,
+                kv_cache_config=self.kv_cache_config,
+            )
+        else:
+            attn_metadata = attn_utils.build_attn_metadata(
+                self.attn_metadata_builders,
+                num_reqs=num_reqs,
+                num_tokens=num_tokens,
+                query_start_loc_gpu=query_start_loc_gpu,
+                query_start_loc_cpu=qsl,
+                seq_lens=seq_lens_gpu,
+                max_seq_len=max_seq_len,
+                block_tables=[block_table],
+                slot_mappings=[slot_mapping_gpu],
+                kv_cache_config=self.kv_cache_config,
+            )

         # Build slot_mappings_by_layer for set_forward_context.
         # Fix for vllm 0.15.0
diff --git a/vllm_omni/platforms/npu/stage_configs/qwen3_tts.yaml b/vllm_omni/platforms/npu/stage_configs/qwen3_tts.yaml
index 6e37ec0185..60659a9768 100644
--- a/vllm_omni/platforms/npu/stage_configs/qwen3_tts.yaml
+++ b/vllm_omni/platforms/npu/stage_configs/qwen3_tts.yaml
@@ -8,6 +8,7 @@ stage_args:
   engine_args:
     model_stage: qwen3_tts
     model_arch: Qwen3TTSTalkerForConditionalGeneration
+    # Force the stage-specific registered architecture.
     hf_overrides:
       architectures: [Qwen3TTSTalkerForConditionalGeneration]
     worker_type: ar
@@ -22,6 +23,7 @@ stage_args:
     max_num_batched_tokens: 512
     max_model_len: 4096
     custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk
+    # Use a named connector so runtime.connectors.extra applies.
     output_connectors:
       to_stage_1: connector_of_shared_memory
     default_sampling_params:
@@ -41,6 +43,7 @@ stage_args:
   engine_args:
     model_stage: code2wav
     model_arch: Qwen3TTSCode2Wav
+    # Force the stage-specific registered architecture.
     hf_overrides:
       architectures: [Qwen3TTSCode2Wav]
     worker_type: generation
@@ -52,11 +55,14 @@ stage_args:
     engine_output_type: audio
     gpu_memory_utilization: 0.2
    distributed_executor_backend: "mp"
+    # Must be divisible by num_code_groups and cover (left_context + chunk).
     max_num_batched_tokens: 8192
+    # async_chunk appends windows each step; max_model_len must cover the accumulated stream.
     max_model_len: 32768
     engine_input_source: [0]
     final_output: true
     final_output_type: audio
+    # Distributed connector configuration.
     input_connectors:
       from_stage_0: connector_of_shared_memory
   tts_args:
@@ -81,10 +87,13 @@ runtime:
       name: SharedMemoryConnector
       extra:
         shm_threshold_bytes: 65536
+        # Frame-aligned codec streaming transport.
         codec_streaming: true
+        # Connector polling: the sleep interval is in seconds; the max-wait values are poll-loop counts.
         connector_get_sleep_s: 0.01
         connector_get_max_wait_first_chunk: 3000
         connector_get_max_wait: 300
+        # Align with Omni: small chunks with sufficient context overlap.
         codec_chunk_frames: 25
         codec_left_context_frames: 25
diff --git a/vllm_omni/platforms/npu/worker/npu_generation_model_runner.py b/vllm_omni/platforms/npu/worker/npu_generation_model_runner.py
index 1fb1ac07a5..7651e365a3 100644
--- a/vllm_omni/platforms/npu/worker/npu_generation_model_runner.py
+++ b/vllm_omni/platforms/npu/worker/npu_generation_model_runner.py
@@ -86,7 +86,7 @@ def execute_model(
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         with record_function_or_nullcontext("prepare input"):
             # -------------------------------------- Omni-new -------------------------------------------------
-            if self.model_config.async_chunk:
+            if self.model_config.async_chunk and num_scheduled_tokens:
                 self._update_request_states(scheduler_output)
             # -------------------------------------- Omni-new -------------------------------------------------
             with self.synchronize_input_prep():
diff --git a/vllm_omni/platforms/npu/worker/npu_model_runner.py b/vllm_omni/platforms/npu/worker/npu_model_runner.py
index 14ec559dd9..254d27a4f4 100644
--- a/vllm_omni/platforms/npu/worker/npu_model_runner.py
+++ b/vllm_omni/platforms/npu/worker/npu_model_runner.py
@@ -34,15 +34,22 @@ def load_model(self, *args, **kwargs) -> None:
         # This is a workaround for vllm-ascend not passing vllm_config to enable_sp().
         enable_sp(self.vllm_config)
         # TODO move this model specific logic to a separate class
-        if hasattr(self.model, "talker_mtp") and self.model.talker is not None:
-            self.talker_mtp = self.model.talker_mtp
+        # The TTS model IS the talker (it has no .talker sub-attribute); use getattr to support both Omni and TTS.
+        talker_mtp = getattr(self.model, "talker_mtp", None)
+        if talker_mtp is not None:
+            self.talker_mtp = talker_mtp  # type: ignore[assignment]
             cudagraph_mode = self.compilation_config.cudagraph_mode
             assert cudagraph_mode is not None
-            if cudagraph_mode.has_full_cudagraphs():
-                self.talker_mtp = ACLGraphWrapper(
-                    self.model.talker_mtp, self.vllm_config, runtime_mode=CUDAGraphMode.FULL
-                )
-            hidden_size = self.model_config.hf_config.talker_config.text_config.hidden_size
+            # Only wrap talker_mtp in an ACLGraphWrapper for Omni models that
+            # have a separate .talker sub-module. The TTS code predictor has
+            # internal AR loops / torch.multinomial, which are not graph-safe.
+            has_separate_talker = getattr(self.model, "talker", None) is not None
+            if cudagraph_mode.has_full_cudagraphs() and has_separate_talker:
+                self.talker_mtp = ACLGraphWrapper(talker_mtp, self.vllm_config, runtime_mode=CUDAGraphMode.FULL)
+            # TTS exposes mtp_hidden_size; Omni uses hf_text_config.hidden_size.
+            hidden_size = int(
+                getattr(self.model, "mtp_hidden_size", 0) or getattr(self.model_config.hf_text_config, "hidden_size")
+            )
             max_batch_size = max(self.max_num_reqs, self.compilation_config.max_cudagraph_capture_size)
             self.talker_mtp_input_ids = self._make_buffer(max_batch_size, dtype=torch.int32)
             self.talker_mtp_inputs_embeds = self._make_buffer(
@@ -337,12 +344,6 @@ def _model_forward(
         # Omni-specific: build and inject extra model kwargs
         model_kwargs_extra = self._build_model_kwargs_extra()

-        runtime_info = model_kwargs_extra.get("runtime_additional_information", [])
-        if runtime_info:
-            for i, info in enumerate(runtime_info):
-                if info:
-                    logger.debug(f"[OMNI] req[{i}] runtime_additional_information keys: {list(info.keys())}")
-
         # Call the model forward (same as NPUModelRunner)
         assert self.model is not None
         model_output = self.model(
@@ -397,6 +398,9 @@ def _talker_mtp_forward(self, decode_req_ids: list[str], inputs_embeds: torch.Te
             max_num_scheduled_tokens=1,
             use_cascade_attn=False,
         )
+        # Force eager mode for unwrapped code predictors (AR loops / multinomial).
+        if not isinstance(self.talker_mtp, ACLGraphWrapper):
+            _cudagraph_mode = CUDAGraphMode.NONE
         num_tokens_padded = batch_desc.num_tokens
         req_input_ids = self.talker_mtp_input_ids.gpu[:num_tokens_padded]
         req_embeds = self.talker_mtp_inputs_embeds.gpu[:num_tokens_padded]
@@ -408,9 +412,10 @@ def _talker_mtp_forward(self, decode_req_ids: list[str], inputs_embeds: torch.Te
         req_embeds, code_predictor_codes = self.talker_mtp(req_input_ids, req_embeds, last_talker_hidden, text_step)
         # update the inputs_embeds and code_predictor_codes
         code_predictor_codes_cpu = code_predictor_codes.detach().to("cpu").contiguous()
+        out_key = getattr(self.model, "talker_mtp_output_key", "code_predictor_codes")
         for idx, req_id in enumerate(decode_req_ids):
             req_index = self.input_batch.req_ids.index(req_id)
             start_offset = int(self.query_start_loc.cpu[req_index])
             inputs_embeds[start_offset : start_offset + 1] = req_embeds[idx : idx + 1]
-            update_dict = {"code_predictor_codes": code_predictor_codes_cpu[idx : idx + 1]}
+            update_dict = {out_key: code_predictor_codes_cpu[idx : idx + 1]}
             self._merge_additional_information_update(req_id, update_dict)
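
The `FIXME(gcanlin)` in `qwen3_tts_code_predictor_vllm.py` flags the duplicated `build_attn_metadata` call. One possible shape for that follow-up is a single dispatch helper that hides the backend differences (the extra `max_query_len` and the stacked `slot_mappings` tensor on NPU). This is a minimal sketch based only on the two branches shown in the hunk above; `dispatch_build_attn_metadata` is a hypothetical name, not an existing vllm-omni API.

```python
# Hypothetical refactor sketch: one call site, platform-specific keyword
# shaping kept in a single helper. Mirrors the two branches in the hunk above.
import torch

from vllm_omni.platforms import current_omni_platform


def dispatch_build_attn_metadata(
    builders,
    *,
    num_reqs: int,
    num_tokens: int,
    query_start_loc_gpu: torch.Tensor,
    query_start_loc_cpu: torch.Tensor,
    query_lens_i32: torch.Tensor,
    seq_lens: torch.Tensor,
    max_seq_len: int,
    block_table: torch.Tensor,
    slot_mapping_gpu: torch.Tensor,
    kv_cache_config,
):
    if current_omni_platform.is_npu():
        # The NPU builder needs max_query_len and a stacked slot-mapping tensor.
        from vllm_ascend.worker.v2 import attn_utils as attn_utils_npu

        return attn_utils_npu.build_attn_metadata(
            attn_metadata_builders=builders,
            num_reqs=num_reqs,
            num_tokens=num_tokens,
            query_start_loc_gpu=query_start_loc_gpu,
            query_start_loc_cpu=query_start_loc_cpu,
            max_query_len=int(query_lens_i32[:num_reqs].max().item()),
            seq_lens=seq_lens,
            max_seq_len=max_seq_len,
            block_tables=[block_table],
            slot_mappings=slot_mapping_gpu.unsqueeze(0),
            kv_cache_config=kv_cache_config,
        )

    # The default (GPU) builder takes a list of per-group slot mappings.
    from vllm.v1.worker.gpu import attn_utils

    return attn_utils.build_attn_metadata(
        builders,
        num_reqs=num_reqs,
        num_tokens=num_tokens,
        query_start_loc_gpu=query_start_loc_gpu,
        query_start_loc_cpu=query_start_loc_cpu,
        seq_lens=seq_lens,
        max_seq_len=max_seq_len,
        block_tables=[block_table],
        slot_mappings=[slot_mapping_gpu],
        kv_cache_config=kv_cache_config,
    )
```

With a helper of this shape, the caller in `build_attn_metadata` would collapse to a single `attn_metadata = dispatch_build_attn_metadata(self.attn_metadata_builders, ...)` on both platforms.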