diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 02d8cced403..c33d7b4d10d 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -552,7 +552,7 @@ steps: - label: ":full_moon: Diffusion X2V · Accuracy Test" timeout_in_minutes: 180 commands: - - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py --run-level advanced_model + - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py -m advanced_model --run-level advanced_model agents: queue: "mithril-h100-pool" plugins: diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 68f8e615286..3ca1747fe64 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -367,6 +367,33 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" + - label: "Qwen3-TTS Base E2E Test (ModelRunner V2)" + depends_on: upload-ready-pipeline + soft_fail: + - exit_status: 1 + commands: + - | + timeout 20m bash -c ' + export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_WORKER_MULTIPROC_METHOD=spawn + export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" + export VLLM_OMNI_USE_V2_RUNNER="1" + pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py -m "core_model" --run-level "core_model" + ' + agents: + queue: "gpu_1_queue" + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" + - label: "Voxtral-TTS E2E Test" timeout_in_minutes: 20 depends_on: upload-ready-pipeline diff --git a/.claude/skills/vllm-omni-npu-upgrade/SKILL.md b/.claude/skills/vllm-omni-npu-upgrade/SKILL.md new file mode 100644 index 00000000000..1ef7ab39301 --- /dev/null +++ b/.claude/skills/vllm-omni-npu-upgrade/SKILL.md @@ -0,0 +1,300 @@ +--- +name: vllm-omni-npu-model-runner-upgrade +description: "Upgrade vllm-omni NPU model runners (OmniNPUModelRunner, 
NPUARModelRunner, NPUGenerationModelRunner) to align with the latest vllm-ascend NPUModelRunner while preserving omni-specific logic." +--- + +# vLLM-Omni NPU Model Runner Upgrade Skill + +## Overview + +This skill guides the process of upgrading vllm-omni's NPU model runners to align with the latest vllm-ascend codebase while preserving omni-specific enhancements. The NPU runners are designed to run omni multimodal models (like Qwen3-Omni, Bagel, MiMoAudio) on Ascend NPUs. + +## File Structure + +### NPU Model Runner Files +``` +vllm-omni/vllm_omni/platforms/npu/worker/ +├── __init__.py +├── npu_model_runner.py # OmniNPUModelRunner (base class) +├── npu_ar_model_runner.py # NPUARModelRunner (autoregressive) +├── npu_ar_worker.py # AR worker +├── npu_generation_model_runner.py # NPUGenerationModelRunner (diffusion/non-AR) +└── npu_generation_worker.py # Generation worker +``` + +### GPU Reference Files (for omni-specific logic sync) +``` +vllm-omni/vllm_omni/worker/ +├── __init__.py +├── gpu_model_runner.py # OmniGPUModelRunner +├── gpu_ar_model_runner.py # GPUARModelRunner +├── gpu_ar_worker.py +├── gpu_generation_model_runner.py +├── gpu_generation_worker.py +├── mixins.py +├── base.py +└── gpu_memory_utils.py +``` + +### vllm-ascend Reference Files +``` +vllm-ascend/vllm_ascend/worker/ +├── model_runner_v1.py # NPUModelRunner (base class to copy from) +├── npu_input_batch.py +├── block_table.py +├── pcp_utils.py +└── worker.py +``` + +## Inheritance Hierarchy + +``` + GPUModelRunner (vllm) + | + +----------------+----------------+ + | | + OmniGPUModelRunner NPUModelRunner (vllm-ascend) + (vllm_omni/worker) (vllm_ascend/worker) + | | + +----------- OmniNPUModelRunner --+ + (multiple inheritance) + | + +---------------+---------------+ + | | + NPUARModelRunner NPUGenerationModelRunner + (autoregressive) (non-autoregressive/diffusion) +``` + +## Omni-Specific Comment Markers + +Omni-specific logic is marked with comment blocks: +```python +# 
-------------------------------------- Omni-new ------------------------------------------------- +# ... omni-specific code ... +# -------------------------------------- Omni-new ------------------------------------------------- +``` + +Or simpler variations: +```python +# -------------------------------------- Omni-new ------------------------------------------------- +# ------------------------------------------------------------------------------------------------ +``` + +**Important**: +- Always preserve and add these markers when modifying code. +- **The reference documents (`references/omni-specific-blocks.md`) may not be up-to-date.** Always grep for `Omni-new` in the GPU implementations to find the authoritative list of omni-specific blocks. +- When you discover new omni-specific code that is not documented in the references, please update the reference files. + +## Key Methods Requiring Attention + +### OmniNPUModelRunner (npu_model_runner.py) + +| Method | Description | Omni-Specific Logic | +|--------|-------------|---------------------| +| `load_model` | Load model and initialize talker_mtp | Uses `ACLGraphWrapper` instead of `CUDAGraphWrapper`, initializes talker buffers | +| `_dummy_run` | Warmup/profiling run | talker_mtp dummy forward, `extract_multimodal_outputs` | +| `_model_forward` | Forward pass wrapper | Injects `model_kwargs_extra`, wraps with `OmniOutput`, NPU-specific graph updates | +| `_talker_mtp_forward` | Talker MTP forward for Qwen3-Omni | Uses `set_ascend_forward_context` | + +### NPUARModelRunner (npu_ar_model_runner.py) + +| Method | Description | Omni-Specific Logic | +|--------|-------------|---------------------| +| `__init__` | Initialize with KV transfer manager | `OmniKVTransferManager` setup | +| `execute_model` | Main inference entry | KV transfer handling, `_update_states` override, `extract_multimodal_outputs` | +| `sample_tokens` | Token sampling | Hidden states extraction, multimodal outputs processing, 
`OmniModelRunnerOutput` | +| `_resolve_global_request_id` | Request ID resolution | For disaggregated inference | + +### NPUGenerationModelRunner (npu_generation_model_runner.py) + +| Method | Description | Omni-Specific Logic | +|--------|-------------|---------------------| +| `_update_request_states` | Update request states for async chunk | async_chunk handling | +| `execute_model` | Generation forward | async_chunk, `seq_token_counts`, `_run_generation_model` | +| `sample_tokens` | Output processing | multimodal output packaging to `OmniModelRunnerOutput` | +| `_dummy_run` | Dummy run override | model_kwargs initialization, multimodal extraction | +| `_run_generation_model` | Run generation model | Calls `_model_forward` with sampler | + +## Upgrade Workflow + +### Step 1: Preparation + +1. **Identify target versions** (use the `gh` CLI to check): + - We're using vllm-omni main branch + - Check the last release of vllm-omni + - Target vllm-ascend version (use the latest local vllm-ascend code directly) + +2. **Check GPU-side changes** (since last release): + ```bash + cd /root/vllm-workspace/vllm-omni + git log --oneline --since="$LAST_RELEASE_DATE" -- vllm_omni/worker/ + ``` + +3. **Read latest vllm-ascend code**: + - We don't track vllm-ascend changes - just directly use the latest code from `/root/vllm-workspace/vllm-ascend/vllm_ascend/worker/model_runner_v1.py` + - Copy the relevant methods and re-insert omni-specific blocks + +### Step 2: Analyze Omni-Specific Logic + +For each NPU model runner file: + +1. **Extract existing omni-specific blocks**: + ```bash + grep -n "Omni-new" vllm_omni/platforms/npu/worker/npu_model_runner.py + ``` + +2. **Document each omni block**: + - Which method it belongs to + - What functionality it provides + - Dependencies on other omni code + +### Step 3: Update Base Class (OmniNPUModelRunner) + +**Note**: Always check the GPU implementation `gpu_model_runner.py` for any new omni logic not yet documented in references. + +1. 
**Read the latest vllm-ascend `NPUModelRunner.load_model`** +2. **Copy the method, keeping the structure** +3. **Re-insert omni-specific logic** (check GPU `gpu_model_runner.py` for authoritative list): + - Replace `CUDAGraphWrapper` with `ACLGraphWrapper` + - Keep talker_mtp initialization + - Preserve buffer allocations for talker + - Check for any new omni blocks added since last sync + +4. **Update `_dummy_run`**: + - Copy from vllm-ascend + - Compare with GPU `_dummy_run` for omni-specific blocks + - Re-insert all `Omni-new` marked code from GPU version + +5. **Update `_model_forward`**: + - Keep the omni wrapper logic + - Update NPU-specific parts (graph params, SP all-gather) + - Check GPU version for any new omni logic + +### Step 4: Update AR Model Runner + +1. **Compare with GPU `gpu_ar_model_runner.py`** for any new omni features +2. **Copy `execute_model` from vllm-ascend** +3. **Re-insert omni blocks** (reference `references/omni-specific-blocks.md`, but note it may be incomplete): + - **IMPORTANT**: Always check the GPU implementation `gpu_ar_model_runner.py` for all `Omni-new` marked code blocks + - The reference doc may not include newly added omni logic - treat it as a starting point, not exhaustive + - When discovering new omni code blocks, please update `references/omni-specific-blocks.md` + - Common omni blocks include but are not limited to: KV transfer, multimodal outputs, sampling_metadata handling, etc. + +4. **Update `sample_tokens`** (also compare with GPU implementation): + - Compare with `gpu_ar_model_runner.py`'s `sample_tokens` method + - Identify all `Omni-new` marked code blocks + - Ensure NPU version includes all omni-specific logic + +### Step 5: Update Generation Model Runner + +**Note**: Generation model runner may have unique omni logic for diffusion/non-AR models. + +1. **Compare with GPU `gpu_generation_model_runner.py`** - grep for all `Omni-new` blocks +2. 
**Update `execute_model`**: + - Check GPU version for all omni-specific blocks + - Keep async_chunk handling + - Keep `seq_token_counts` injection + - Update forward/context setup from vllm-ascend + - Look for any new omni logic not documented in references + +3. **Update `_dummy_run`**: + - Copy from vllm-ascend base + - Compare with GPU `_dummy_run` if it exists + - Re-insert all omni-specific logic + +### Step 6: Update Imports + +Check and update imports at the top of each file: + +```python +# Common vllm-ascend imports +from vllm_ascend.ascend_forward_context import get_forward_context, set_ascend_forward_context +from vllm_ascend.attention.attention_v1 import AscendAttentionState +from vllm_ascend.attention.utils import using_paged_attention +from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params +from vllm_ascend.ops.rotary_embedding import update_cos_sin +from vllm_ascend.utils import enable_sp, lmhead_tp_enable +from vllm_ascend.worker.model_runner_v1 import SEQ_LEN_WITH_MAX_PA_WORKSPACE, NPUModelRunner + +# Omni-specific imports +from vllm_omni.model_executor.models.output_templates import OmniOutput +from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner +from vllm_omni.outputs import OmniModelRunnerOutput +from vllm_omni.distributed.omni_connectors.kv_transfer_manager import OmniKVTransferManager +``` + +### Step 7: Sync GPU-Side Omni Changes + +1. **Check recent GPU worker changes**: + ```bash + git diff "$LAST_RELEASE_TAG"..HEAD -- vllm_omni/worker/gpu_model_runner.py + git diff "$LAST_RELEASE_TAG"..HEAD -- vllm_omni/worker/gpu_ar_model_runner.py + ``` + +2. **Identify new omni features** that need to be ported to NPU + +3. **Apply corresponding changes** to NPU runners + +### Step 8: Validation + +1. 
**Run syntax checking** (`py_compile` validates syntax only, not types): + ```bash + cd /root/vllm-workspace/vllm-omni + python -m py_compile vllm_omni/platforms/npu/worker/npu_model_runner.py + python -m py_compile vllm_omni/platforms/npu/worker/npu_ar_model_runner.py + python -m py_compile vllm_omni/platforms/npu/worker/npu_generation_model_runner.py + ``` + +2. **Run import test**: + ```bash + python -c "from vllm_omni.platforms.npu.worker import *" + ``` + +3. **Run model serving test** (if hardware available): + ```bash + vllm serve "$MODEL" --trust-remote-code + ``` + +## Common Pitfalls + +### 1. Forward Context Differences +- GPU uses `set_forward_context` +- NPU uses `set_ascend_forward_context` +- Parameters may differ slightly + +### 2. Graph Wrapper Differences +- GPU: `CUDAGraphWrapper` +- NPU: `ACLGraphWrapper` +- Constructor parameters may differ + +### 3. Buffer Creation +- GPU: `_make_buffer` returns different structure +- NPU: May need numpy=True/False parameter + +### 4. Attention Metadata +- GPU: Uses vllm attention metadata builders +- NPU: Uses `AscendCommonAttentionMetadata` + +### 5. Sampling +- GPU: Uses vllm sampler +- NPU: Uses `AscendSampler` + +## Checklist Before Commit + +- [ ] All omni-specific comment markers preserved +- [ ] New omni logic from GPU side synced +- [ ] Imports updated to latest vllm-ascend +- [ ] No `CUDAGraphWrapper` references in NPU code +- [ ] `set_ascend_forward_context` used instead of `set_forward_context` +- [ ] `ACLGraphWrapper` used for talker_mtp wrapping +- [ ] Type hints match vllm-ascend signatures +- [ ] No duplicate code blocks +- [ ] Python syntax valid (py_compile passes) + +## Reference Files for Comparison + +When upgrading, keep these files open for reference: + +1. **vllm-ascend NPUModelRunner**: `/root/vllm-workspace/vllm-ascend/vllm_ascend/worker/model_runner_v1.py` +2. **vllm GPUModelRunner**: `/root/vllm-workspace/vllm/vllm/v1/worker/gpu_model_runner.py` +3. 
**vllm-omni OmniGPUModelRunner**: `/root/vllm-workspace/vllm-omni/vllm_omni/worker/gpu_model_runner.py` diff --git a/.claude/skills/vllm-omni-npu-upgrade/references/gpu-to-npu-translation.md b/.claude/skills/vllm-omni-npu-upgrade/references/gpu-to-npu-translation.md new file mode 100644 index 00000000000..89067d37b2d --- /dev/null +++ b/.claude/skills/vllm-omni-npu-upgrade/references/gpu-to-npu-translation.md @@ -0,0 +1,335 @@ +# GPU to NPU Translation Patterns + +This document provides a quick reference for translating GPU code patterns to NPU equivalents when porting omni-specific logic. + +## Import Translations + +### Forward Context +```python +# GPU +from vllm.forward_context import set_forward_context + +# NPU +from vllm_ascend.ascend_forward_context import set_ascend_forward_context +``` + +### Graph Wrapper +```python +# GPU +from vllm.compilation.cuda_graph import CUDAGraphWrapper + +# NPU +from vllm_ascend.compilation.acl_graph import ACLGraphWrapper +``` + +### Attention State +```python +# GPU (no equivalent - uses FlashAttention states directly) + +# NPU +from vllm_ascend.attention.attention_v1 import AscendAttentionState +``` + +### Utilities +```python +# GPU +# (directly use torch.cuda functions) + +# NPU +from vllm_ascend.utils import enable_sp, lmhead_tp_enable +from vllm_ascend.ops.rotary_embedding import update_cos_sin +``` + +## Context Manager Translations + +### Forward Context Setup +```python +# GPU +with set_forward_context( + attn_metadata, + self.vllm_config, + num_tokens=num_tokens_padded, + num_tokens_across_dp=num_tokens_across_dp, + cudagraph_runtime_mode=cudagraph_mode, + batch_descriptor=batch_desc, +): + # forward pass + +# NPU +with set_ascend_forward_context( + attn_metadata, + self.vllm_config, + num_tokens=num_tokens_padded, + num_tokens_across_dp=num_tokens_across_dp, + aclgraph_runtime_mode=cudagraph_mode, # Note: 'aclgraph' not 'cudagraph' + batch_descriptor=batch_desc, + 
num_actual_tokens=scheduler_output.total_num_scheduled_tokens, + model_instance=self.model, +): + # forward pass +``` + +### Graph Capture Context +```python +# GPU +from vllm.compilation.cuda_graph import graph_capture as cuda_graph_capture +with cuda_graph_capture(self.device): + # capture + +# NPU +from vllm_ascend.worker.model_runner_v1 import graph_capture +with graph_capture(self.device): + # capture +``` + +## Graph Wrapper Usage + +### Creating Graph Wrapper +```python +# GPU +if cudagraph_mode.has_full_cudagraphs() and has_separate_talker: + self.talker_mtp = CUDAGraphWrapper( + talker_mtp, + self.vllm_config, + runtime_mode=CUDAGraphMode.FULL + ) + +# NPU +if cudagraph_mode.has_full_cudagraphs() and has_separate_talker: + self.talker_mtp = ACLGraphWrapper( + talker_mtp, + self.vllm_config, + runtime_mode=CUDAGraphMode.FULL + ) +``` + +### Checking Graph Wrapper Type +```python +# GPU +if not isinstance(self.talker_mtp, CUDAGraphWrapper): + _cudagraph_mode = CUDAGraphMode.NONE + +# NPU +if not isinstance(self.talker_mtp, ACLGraphWrapper): + _cudagraph_mode = CUDAGraphMode.NONE +``` + +## Device Operations + +### Synchronization +```python +# GPU +torch.cuda.synchronize() + +# NPU +torch.npu.synchronize() +``` + +### Stream Operations +```python +# GPU +stream = torch.cuda.Stream(device=device) +torch.cuda.current_stream() + +# NPU +stream = torch.npu.Stream(device=device) +torch.npu.current_stream() +``` + +## Attention Metadata + +### State Setting (NPU-specific) +```python +# GPU - handled internally by attention backends + +# NPU - explicit state setting required +self.attn_state = AscendAttentionState.DecodeOnly +if self.speculative_config and self.speculative_config.method == "mtp": + if self.vllm_config.model_config.use_mla: + self.attn_state = AscendAttentionState.SpecDecoding + else: + self.attn_state = AscendAttentionState.ChunkedPrefill +``` + +### Building Attention Metadata +```python +# GPU - uses vllm attention builders + +# NPU - may need 
additional parameters +(attn_metadata, spec_decode_common_attn_metadata) = self._build_attention_metadata( + num_tokens=num_tokens_unpadded, + num_tokens_padded=num_tokens_padded, + num_reqs=num_reqs, + num_reqs_padded=num_reqs_padded, + max_query_len=max_num_scheduled_tokens, + ubatch_slices=ubatch_slices_attn, + logits_indices=logits_indices, + use_spec_decode=use_spec_decode, + num_scheduled_tokens=scheduler_output.num_scheduled_tokens, + num_scheduled_tokens_np=num_scheduled_tokens_np, + cascade_attn_prefix_lens=cascade_attn_prefix_lens, +) +``` + +## Rotary Embedding + +### Update Cos/Sin Cache +```python +# GPU - typically handled inside attention + +# NPU - explicit update required before forward +from vllm_ascend.ops.rotary_embedding import update_cos_sin +update_cos_sin(positions) +``` + +## Sequence Parallelism + +### Enable SP Check +```python +# GPU - use vllm distributed utilities + +# NPU - use vllm-ascend wrapper +from vllm_ascend.utils import enable_sp + +if enable_sp(): + # sequence parallelism enabled +``` + +## Sampler + +### Sampler Type +```python +# GPU - uses vllm sampler +self.sampler = Sampler() + +# NPU - uses AscendSampler +from vllm_ascend.sample.sampler import AscendSampler +self.sampler = AscendSampler() +``` + +## Input Batch + +### Batch Class +```python +# GPU +from vllm.v1.worker.gpu_input_batch import InputBatch + +# NPU +from vllm_ascend.worker.npu_input_batch import NPUInputBatch +``` + +## Graph Parameter Updates + +### Full Graph Params Update (NPU-specific) +```python +# GPU - not needed + +# NPU - required for FULL graph mode +from vllm_ascend.compilation.acl_graph import update_full_graph_params + +forward_context = get_forward_context() +if ( + forward_context.cudagraph_runtime_mode == CUDAGraphMode.FULL + and not forward_context.capturing + and not self.use_sparse +): + update_full_graph_params( + self.attn_backend, + self.update_stream, + forward_context, + num_tokens_padded, + self.vllm_config, + 
self.speculative_config, + positions.shape[0], + ) +``` + +## Paged Attention Check + +```python +# GPU - not typically needed + +# NPU +from vllm_ascend.attention.utils import using_paged_attention + +if is_graph_capturing and using_paged_attention(num_tokens, self.vllm_config): + seq_lens = SEQ_LEN_WITH_MAX_PA_WORKSPACE +``` + +## Common Method Signature Differences + +### _dummy_run Parameters +```python +# GPU (v0.17.0) +def _dummy_run( + self, + num_tokens: int, + cudagraph_runtime_mode: CUDAGraphMode | None = None, + force_attention: bool = False, + uniform_decode: bool = False, + allow_microbatching: bool = True, + skip_eplb: bool = False, + is_profile: bool = False, + create_mixed_batch: bool = False, + remove_lora: bool = True, + is_graph_capturing: bool = False, + num_active_loras: int = 0, +) -> tuple[torch.Tensor, torch.Tensor]: + +# NPU (v0.17.0) - adds with_prefill, activate_lora +def _dummy_run( + self, + num_tokens: int, + with_prefill: bool = False, + cudagraph_runtime_mode: CUDAGraphMode | None = None, + force_attention: bool = False, + uniform_decode: bool = False, + is_profile: bool = False, + create_mixed_batch: bool = False, + allow_microbatching: bool = True, + skip_eplb: bool = False, + remove_lora: bool = True, + activate_lora: bool = False, + is_graph_capturing: bool = False, + num_active_loras: int = 0, +) -> tuple[torch.Tensor, torch.Tensor]: +``` + +### _model_forward Parameters +```python +# GPU - no num_tokens_padded +def _model_forward( + self, + input_ids: torch.Tensor | None = None, + positions: torch.Tensor | None = None, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **model_kwargs: dict[str, Any], +): + +# NPU - has num_tokens_padded as first parameter +def _model_forward( + self, + num_tokens_padded: int, + input_ids: torch.Tensor | None = None, + positions: torch.Tensor | None = None, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: 
torch.Tensor | None = None, + **model_kwargs: dict[str, Any], +): +``` + +## Quick Reference Table + +| Feature | GPU | NPU | +|---------|-----|-----| +| Graph wrapper | `CUDAGraphWrapper` | `ACLGraphWrapper` | +| Forward context | `set_forward_context` | `set_ascend_forward_context` | +| Runtime mode param | `cudagraph_runtime_mode` | `aclgraph_runtime_mode` | +| Device sync | `torch.cuda.synchronize()` | `torch.npu.synchronize()` | +| Stream | `torch.cuda.Stream` | `torch.npu.Stream` | +| Current stream | `torch.cuda.current_stream()` | `torch.npu.current_stream()` | +| Input batch | `InputBatch` | `NPUInputBatch` | +| Sampler | `Sampler` | `AscendSampler` | +| Attention state | N/A | `AscendAttentionState` | +| RoPE update | N/A | `update_cos_sin()` | diff --git a/.claude/skills/vllm-omni-npu-upgrade/references/omni-specific-blocks.md b/.claude/skills/vllm-omni-npu-upgrade/references/omni-specific-blocks.md new file mode 100644 index 00000000000..8c5d32ab4c1 --- /dev/null +++ b/.claude/skills/vllm-omni-npu-upgrade/references/omni-specific-blocks.md @@ -0,0 +1,374 @@ +# Omni-Specific Code Blocks Reference + +This document catalogs omni-specific code blocks in the NPU model runners, making it easier to identify what needs to be preserved during upgrades. + +> **IMPORTANT**: This document may not be complete or up-to-date! 
+> +> - Always grep for `Omni-new` in the GPU implementations (`vllm_omni/worker/`) to find the authoritative list +> - New omni features may be added that are not yet documented here +> - When you discover new omni-specific blocks during an upgrade, please update this document +> - Last verified: Check git history for this file + +## OmniNPUModelRunner (npu_model_runner.py) + +### load_model - Talker MTP Initialization + +```python +def load_model(self, *args, **kwargs) -> None: + NPUModelRunner.load_model(self, *args, **kwargs) + # Initialize enable_sp cache to avoid get_current_vllm_config() error + # in _pad_for_sequence_parallelism during execute_model. + # This is a workaround for vllm-ascend not passing vllm_config to enable_sp(). + enable_sp(self.vllm_config) + # TODO move this model specific logic to a separate class + # TTS model IS the talker (no .talker sub-attr); use getattr to support both Omni and TTS. + talker_mtp = getattr(self.model, "talker_mtp", None) + if talker_mtp is not None: + self.talker_mtp = talker_mtp # type: ignore[assignment] + cudagraph_mode = self.compilation_config.cudagraph_mode + assert cudagraph_mode is not None + # Only wrap talker_mtp in CUDAGraphWrapper for Omni models that + # have a separate .talker sub-module. TTS models' code predictor + # has internal AR loops / torch.multinomial — not graph-safe. + has_separate_talker = getattr(self.model, "talker", None) is not None + if cudagraph_mode.has_full_cudagraphs() and has_separate_talker: + # NOTE: Use ACLGraphWrapper on NPU, not CUDAGraphWrapper + self.talker_mtp = ACLGraphWrapper(talker_mtp, self.vllm_config, runtime_mode=CUDAGraphMode.FULL) + # TTS exposes mtp_hidden_size; Omni uses hf_text_config.hidden_size. 
+ hidden_size = int( + getattr(self.model, "mtp_hidden_size", 0) or getattr(self.model_config.hf_text_config, "hidden_size") + ) + max_batch_size = max(self.max_num_reqs, self.compilation_config.max_cudagraph_capture_size) + self.talker_mtp_input_ids = self._make_buffer(max_batch_size, dtype=torch.int32) + self.talker_mtp_inputs_embeds = self._make_buffer( + max_batch_size, hidden_size, dtype=self.dtype, numpy=False + ) + self.last_talker_hidden = self._make_buffer(max_batch_size, hidden_size, dtype=self.dtype, numpy=False) + self.text_step = self._make_buffer(max_batch_size, hidden_size, dtype=self.dtype, numpy=False) +``` + +### _dummy_run - Talker MTP Dummy Forward + +Location: Inside `set_ascend_forward_context` block, before main model forward + +```python +# ---------------------------------------Omni-new---------------------------------------------- +if getattr(self.model, "talker", None) is not None and hasattr(self.model, "talker_mtp"): + num_tokens_padded_talker_mtp = num_tokens_padded + if num_tokens_padded_talker_mtp == self.max_num_tokens: + num_tokens_padded_talker_mtp = self.talker_mtp_input_ids.gpu.shape[0] + outputs = self.talker_mtp( + self.talker_mtp_input_ids.gpu[:num_tokens_padded_talker_mtp], + self.talker_mtp_inputs_embeds.gpu[:num_tokens_padded_talker_mtp], + self.last_talker_hidden.gpu[:num_tokens_padded_talker_mtp], + self.text_step.gpu[:num_tokens_padded_talker_mtp], + ) + self.compilation_config.cache_dir = None +# ---------------------------------------Omni-new---------------------------------------------- +``` + +### _dummy_run - Extract Multimodal Outputs + +Location: After model forward, before dummy_compute_logits + +```python +# ---------------------------------------Omni-new---------------------------------------------- +hidden_states, multimodal_outputs = self.extract_multimodal_outputs(hidden_states) +# ---------------------------------------Omni-new---------------------------------------------- +``` + +### _model_forward - Omni 
Output Wrapping + +```python +def _model_forward( + self, + num_tokens_padded: int, + input_ids: torch.Tensor | None = None, + positions: torch.Tensor | None = None, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **model_kwargs: dict[str, Any], +): + """Override to combine NPUModelRunner's signature with OmniGPUModelRunner's logic.""" + # Omni-specific: build and inject extra model kwargs + model_kwargs_extra = self._build_model_kwargs_extra() + + # Call the model forward (same as NPUModelRunner) + assert self.model is not None + model_output = self.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **model_kwargs, + **model_kwargs_extra, + ) + + # Omni-specific: wrap output if needed + if not isinstance(model_output, OmniOutput) and hasattr(self.model, "make_omni_output"): + model_output = self.model.make_omni_output(model_output, **model_kwargs_extra) + + # Omni-specific: cache model output for later sample_tokens + self._omni_last_model_output = model_output + + # NPU-specific: update full graph params (keep from vllm-ascend) + forward_context = get_forward_context() + # ... NPU graph update logic ... 
+ + # NPU-specific: all-gather for sequence parallelism (keep from vllm-ascend) + if get_forward_context().sp_enabled and not isinstance(model_output, IntermediateTensors): + model_output = self._all_gather_hidden_states_and_aux(model_output) + + return model_output +``` + +--- + +## NPUARModelRunner (npu_ar_model_runner.py) + +### __init__ - KV Transfer Manager + +```python +def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.input_ids = self._make_buffer(self.max_num_tokens, dtype=torch.int32) + # each model stage has their own hidden size + self.hidden_size = self.model_config.hf_text_config.hidden_size + self.inputs_embeds = self._make_buffer(self.max_num_tokens, self.hidden_size, dtype=self.dtype, numpy=False) + # Initialize KV cache manager (preserve vllm_config fallback behavior) + self.kv_transfer_manager = OmniKVTransferManager.from_vllm_config(self.vllm_config, self.model_config) +``` + +### execute_model - KV Transfer Before Update States + +Location: At the very beginning of execute_model + +```python +# -------------------------------------- Omni-new ------------------------------------------------- +# [Omni] Handle KV transfer BEFORE updating states (which removes finished requests) +self.kv_extracted_req_ids = self.kv_transfer_manager.handle_finished_requests_kv_transfer( + finished_reqs=getattr(scheduler_output, "finished_requests_needing_kv_transfer", {}), + kv_caches=self.kv_caches, + block_size=self.cache_config.block_size, + cache_dtype=str(self.cache_config.cache_dtype), + request_id_resolver=self._resolve_global_request_id, +) +# -------------------------------------- Omni-new ------------------------------------------------- +``` + +### execute_model - Custom _update_states Call + +Location: Inside synchronize_input_prep context + +```python +# -------------------------------------- Omni-new ------------------------------------------------- +self._update_states(scheduler_output) +# 
------------------------------------------------------------------------------------------------ +``` + +### execute_model - Extract Multimodal Outputs + +Location: In post process section, after hidden_states assignment + +```python +# -------------------------------------- Omni-new ------------------------------------------------- +hidden_states, multimodal_outputs = self.extract_multimodal_outputs(hidden_states) + +if multimodal_outputs is not None: + keys_or_type = ( + list(multimodal_outputs.keys()) + if isinstance(multimodal_outputs, dict) + else type(multimodal_outputs) + ) + logger.debug(f"[AR] execute_model: multimodal_outputs keys = {keys_or_type}") +else: + logger.debug("[AR] execute_model: multimodal_outputs is None") +# -------------------------------------- Omni-new ------------------------------------------------- +``` + +### execute_model - Compute Logits with sampling_metadata + +Location: In both broadcast_pp_output True and False branches + +```python +# -------------------------------------- Omni-new ------------------------------------------------- +# Try with sampling_metadata first; fall back to without for models that don't support it +try: + logits = self.model.compute_logits( + sample_hidden_states, sampling_metadata=self.input_batch.sampling_metadata + ) +except TypeError: + logits = self.model.compute_logits(sample_hidden_states) +# -------------------------------------- Omni-new ------------------------------------------------- +``` + +### sample_tokens - KV Extracted Req IDs + +Location: At the beginning of sample_tokens + +```python +# -------------------------------------- Omni-new ------------------------------------------------- +kv_extracted_req_ids = getattr(self, "kv_extracted_req_ids", None) +self.kv_extracted_req_ids = None +# -------------------------------------- Omni-new ------------------------------------------------- +``` + +### sample_tokens - Process Additional Information and Build Output + +Location: After 
bookkeeping sync, replacing the original output construction + +```python +# -------------------------------------- Omni-new ------------------------------------------------- +hidden_states_cpu = hidden_states.detach().to("cpu").contiguous() +num_scheduled_tokens_np = getattr(self, "_omni_num_scheduled_tokens_np", None) +if num_scheduled_tokens_np is None: + req_ids = self.input_batch.req_ids + num_scheduled_tokens_np = np.array( + [scheduler_output.num_scheduled_tokens[rid] for rid in req_ids], + dtype=np.int32, + ) + +self._process_additional_information_updates( + hidden_states, multimodal_outputs, num_scheduled_tokens_np, scheduler_output +) + +pooler_output: list[dict[str, object]] = [] +for rid in req_ids_output_copy: + idx = req_id_to_index_output_copy[rid] + start = int(self.query_start_loc.cpu[idx]) + sched = int(num_scheduled_tokens_np[idx]) + end = start + sched + hidden_slice = hidden_states_cpu[start:end] + payload: dict[str, object] = {"hidden": hidden_slice} + if isinstance(multimodal_outputs, dict) and multimodal_outputs: + # ... multimodal output slicing logic ... 
+ pooler_output.append(payload) + +model_runner_output = OmniModelRunnerOutput( + req_ids=req_ids_output_copy, + req_id_to_index=req_id_to_index_output_copy, + sampled_token_ids=valid_sampled_token_ids, + logprobs=logprobs_lists, + prompt_logprobs_dict=prompt_logprobs_dict, + pooler_output=(pooler_output if self.vllm_config.model_config.engine_output_type != "text" else None), + kv_connector_output=kv_connector_output, +) +model_runner_output.kv_extracted_req_ids = kv_extracted_req_ids +# -------------------------------------- Omni-new ------------------------------------------------- +``` + +--- + +## NPUGenerationModelRunner (npu_generation_model_runner.py) + +### execute_model - Async Chunk Update + +Location: Inside prepare input section, before synchronize_input_prep + +```python +# -------------------------------------- Omni-new ------------------------------------------------- +if self.model_config.async_chunk and num_scheduled_tokens: + self._update_request_states(scheduler_output) +# -------------------------------------- Omni-new ------------------------------------------------- +``` + +### execute_model - Seq Token Counts + +Location: After _preprocess call + +```python +# [Omni] Pass token counts per request for code2wav output slicing +model_kwargs["seq_token_counts"] = tokens +``` + +### execute_model - Run Generation Model + +Location: Inside forward context + +```python +# -------------------------------------- Omni-new ------------------------------------------------- +outputs = self._run_generation_model( + num_tokens_padded=num_tokens_padded, + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + model_kwargs=model_kwargs, + logits_indices=logits_indices, +) +_, multimodal_outputs = self.extract_multimodal_outputs(outputs) +# -------------------------------------- Omni-new ------------------------------------------------- +``` + +### sample_tokens - Multimodal Output Processing 
+ +The entire sample_tokens method body is omni-specific for generation models: + +```python +# -------------------------------------- Omni-new ------------------------------------------------- +pooler_output: list[object] = [] +if isinstance(multimodal_outputs, torch.Tensor): + # ... tensor handling ... +elif isinstance(multimodal_outputs, list): + # ... list handling ... +elif isinstance(multimodal_outputs, dict): + # ... dict handling per request ... +else: + raise RuntimeError("Unsupported diffusion output type") +# [Omni] Copy req_id mappings to avoid async scheduling mutation. +req_ids_output_copy = self.input_batch.req_ids.copy() +req_id_to_index_output_copy = self.input_batch.req_id_to_index.copy() +output = OmniModelRunnerOutput( + req_ids=req_ids_output_copy, + req_id_to_index=req_id_to_index_output_copy, + sampled_token_ids=[], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=pooler_output, + kv_connector_output=kv_connector_output, + num_nans_in_logits={}, + ec_connector_output=ec_connector_output if self.supports_mm_inputs else None, +) +# -------------------------------------- Omni-new ------------------------------------------------- +``` + +### _dummy_run - Model Kwargs Init and Multimodal Extract + +Location: Before model forward and after + +```python +model_kwargs = self._init_model_kwargs() # Before forward + +# ... forward ... 
+ +# -------------------------------------- Omni-new ------------------------------------------------- +hidden_states, _ = self.extract_multimodal_outputs(hidden_states) +# ------------------------------------------------------------------------------------------------- +``` + +--- + +## ExecuteModelState Extension + +The `ExecuteModelState` NamedTuple is extended for omni: + +```python +class ExecuteModelState(NamedTuple): + """Ephemeral cached state transferred between execute_model() and + sample_tokens(), after execute_model() returns None.""" + + scheduler_output: SchedulerOutput + logits: torch.Tensor + spec_decode_metadata: SpecDecodeMetadata | None + spec_decode_common_attn_metadata: AscendCommonAttentionMetadata | None + hidden_states: torch.Tensor + sample_hidden_states: torch.Tensor + aux_hidden_states: list[torch.Tensor] | None + attn_metadata: PerLayerAttnMetadata + positions: torch.Tensor + ec_connector_output: ECConnectorOutput | None + cudagraph_stats: CUDAGraphStat | None + multimodal_outputs: Any # <-- Omni extension +``` + +This extended state must be imported from `npu_ar_model_runner` in `npu_generation_model_runner`. diff --git a/.claude/skills/vllm-omni-npu-upgrade/references/workflow-checklist.md b/.claude/skills/vllm-omni-npu-upgrade/references/workflow-checklist.md new file mode 100644 index 00000000000..4f184df0ecb --- /dev/null +++ b/.claude/skills/vllm-omni-npu-upgrade/references/workflow-checklist.md @@ -0,0 +1,222 @@ +# NPU Model Runner Upgrade Workflow Checklist + +> **Note**: Reference documents (`omni-specific-blocks.md`) may not be complete. Always grep for `Omni-new` in GPU implementations to find all omni-specific code blocks. Update the reference docs when discovering new blocks. + +## Pre-Upgrade Preparation + +### 1. 
Version Information +- [ ] Identify current vllm-omni version: `_________` +- [ ] Identify target vllm-ascend version: `_________` +- [ ] Identify target vllm version: `_________` +- [ ] Last release date for GPU worker changes: `_________` + +### 2. Gather Git History +```bash +# GPU-side omni changes since last release +cd /root/vllm-workspace/vllm-omni +git log --oneline --since="YYYY-MM-DD" -- vllm_omni/worker/ + +# vllm-ascend NPUModelRunner changes between the previously synced and the target version +cd /root/vllm-workspace/vllm-ascend +git log --oneline <old_tag>..<new_tag> -- vllm_ascend/worker/model_runner_v1.py +``` + +### 3. Backup Current Files +- [ ] Create backup of current NPU runners: + ```bash + cp -r vllm_omni/platforms/npu/worker vllm_omni/platforms/npu/worker.backup + ``` + +--- + +## OmniNPUModelRunner (npu_model_runner.py) + +### Read and Understand +- [ ] Read current `npu_model_runner.py` +- [ ] Read latest `vllm_ascend/worker/model_runner_v1.py` +- [ ] Read latest `vllm_omni/worker/gpu_model_runner.py` + +### Method: load_model +- [ ] Document existing omni-specific logic +- [ ] Copy latest NPUModelRunner.load_model structure +- [ ] Re-insert: `enable_sp(self.vllm_config)` call +- [ ] Re-insert: talker_mtp detection and setup +- [ ] Replace: `CUDAGraphWrapper` → `ACLGraphWrapper` +- [ ] Re-insert: Buffer allocations (talker_mtp_input_ids, etc.)
+ +### Method: _dummy_run +- [ ] Document existing omni-specific logic locations +- [ ] Copy latest NPUModelRunner._dummy_run +- [ ] Re-insert: talker_mtp dummy forward block (inside context) +- [ ] Re-insert: `extract_multimodal_outputs` call +- [ ] Verify: Comment markers are present + +### Method: _model_forward +- [ ] Copy latest NPUModelRunner._model_forward structure +- [ ] Re-insert: `_build_model_kwargs_extra()` call +- [ ] Re-insert: OmniOutput wrapping logic +- [ ] Re-insert: `_omni_last_model_output` caching +- [ ] Keep: NPU graph params update +- [ ] Keep: SP all-gather logic + +### Method: _talker_mtp_forward +- [ ] Verify: Uses `set_ascend_forward_context` +- [ ] Verify: Uses `ACLGraphWrapper` check +- [ ] Sync any changes from GPU `_talker_mtp_forward` + +### Imports +- [ ] Update vllm-ascend imports to latest paths +- [ ] Verify all omni imports are present +- [ ] Remove any deprecated imports + +--- + +## NPUARModelRunner (npu_ar_model_runner.py) + +### Read and Understand +- [ ] Read current `npu_ar_model_runner.py` +- [ ] Read latest `vllm_ascend/worker/model_runner_v1.py` execute_model +- [ ] Read latest `vllm_omni/worker/gpu_ar_model_runner.py` + +### Method: __init__ +- [ ] Sync any new initialization from GPU side +- [ ] Keep: `OmniKVTransferManager` setup +- [ ] Keep: Custom buffer allocations + +### Method: execute_model +- [ ] Document all omni blocks with line numbers +- [ ] Copy latest NPUModelRunner.execute_model structure +- [ ] Re-insert: KV transfer handling (beginning) +- [ ] Re-insert: Custom `_update_states` call +- [ ] Re-insert: `extract_multimodal_outputs` +- [ ] Re-insert: `compute_logits` with sampling_metadata try/except +- [ ] Update: ExecuteModelState to include multimodal_outputs + +### Method: sample_tokens +- [ ] Document all omni blocks +- [ ] Copy latest NPUModelRunner.sample_tokens structure +- [ ] Re-insert: `kv_extracted_req_ids` handling +- [ ] Re-insert: Hidden states CPU copy +- [ ] Re-insert: 
`_process_additional_information_updates` +- [ ] Re-insert: `OmniModelRunnerOutput` construction + +### ExecuteModelState +- [ ] Verify: `multimodal_outputs` field is present +- [ ] Verify: Imported/used correctly in execute_model + +### Imports +- [ ] Update all vllm-ascend imports +- [ ] Keep omni-specific imports + +--- + +## NPUGenerationModelRunner (npu_generation_model_runner.py) + +### Read and Understand +- [ ] Read current `npu_generation_model_runner.py` +- [ ] Read latest GPU `gpu_generation_model_runner.py` + +### Method: _update_request_states +- [ ] Verify: async_chunk handling is correct +- [ ] Sync any changes from GPU side + +### Method: execute_model +- [ ] Document all omni blocks +- [ ] Copy latest NPUModelRunner.execute_model base structure +- [ ] Re-insert: async_chunk update logic +- [ ] Re-insert: `seq_token_counts` injection +- [ ] Re-insert: `_run_generation_model` call +- [ ] Re-insert: `extract_multimodal_outputs` +- [ ] Use: ExecuteModelState from npu_ar_model_runner + +### Method: sample_tokens +- [ ] Keep: Entire omni multimodal output processing +- [ ] Update: Any new output fields needed +- [ ] Keep: `OmniModelRunnerOutput` construction + +### Method: _run_generation_model +- [ ] Sync any changes from GPU side +- [ ] Keep: `_model_forward` call with sampler + +### Method: _dummy_run +- [ ] Copy latest NPUModelRunner._dummy_run +- [ ] Re-insert: `model_kwargs = self._init_model_kwargs()` +- [ ] Re-insert: `extract_multimodal_outputs` at end + +### Imports +- [ ] Import ExecuteModelState from npu_ar_model_runner +- [ ] Update vllm-ascend imports + +--- + +## Post-Upgrade Validation + +### Syntax Validation +- [ ] `python -m py_compile vllm_omni/platforms/npu/worker/npu_model_runner.py` +- [ ] `python -m py_compile vllm_omni/platforms/npu/worker/npu_ar_model_runner.py` +- [ ] `python -m py_compile vllm_omni/platforms/npu/worker/npu_generation_model_runner.py` + +### Import Validation +- [ ] `python -c "from 
vllm_omni.platforms.npu.worker.npu_model_runner import OmniNPUModelRunner"` +- [ ] `python -c "from vllm_omni.platforms.npu.worker.npu_ar_model_runner import NPUARModelRunner"` +- [ ] `python -c "from vllm_omni.platforms.npu.worker.npu_generation_model_runner import NPUGenerationModelRunner"` + +### Comment Markers +- [ ] Grep for "Omni-new" in all three files +- [ ] Verify all omni blocks have closing markers + +### Code Review +- [ ] No `CUDAGraphWrapper` references +- [ ] All `set_forward_context` replaced with `set_ascend_forward_context` +- [ ] Parameter names correct (`aclgraph_runtime_mode` not `cudagraph_runtime_mode`) +- [ ] No duplicate code blocks +- [ ] No missing imports + +--- + +## Git Commit + +### Commit Message Template +``` +[NPU] Upgrade model runners to align with vllm-ascend vX.Y.Z + +- Update OmniNPUModelRunner with latest NPUModelRunner base +- Update NPUARModelRunner execute_model and sample_tokens +- Update NPUGenerationModelRunner for async_chunk changes +- Sync GPU-side omni changes from vX.Y.Z release +- Preserve all omni-specific logic (marked with Omni-new comments) + +Changes from vllm-ascend: +- + +Changes synced from GPU: +- +``` + +### Files to Stage +- [ ] `vllm_omni/platforms/npu/worker/npu_model_runner.py` +- [ ] `vllm_omni/platforms/npu/worker/npu_ar_model_runner.py` +- [ ] `vllm_omni/platforms/npu/worker/npu_generation_model_runner.py` +- [ ] Any other modified files + +--- + +## Troubleshooting + +### Import Errors +- Check if vllm-ascend module paths have changed +- Verify PYTHONPATH includes both vllm-ascend and vllm-omni + +### Type Errors +- Check method signatures match between GPU and NPU +- Verify NamedTuple fields match expected structure + +### Runtime Errors +- Enable debug logging: `export VLLM_LOGGING_LEVEL=DEBUG` +- Check graph capture issues: try `--enforce-eager` +- Check attention issues: verify AscendAttentionState usage + +### Performance Regression +- Compare with previous version on same model +- Check if 
graph capture is working: look for ACLGraph logs +- Verify SP/EP configurations are correct diff --git a/.gitignore b/.gitignore index c0ee968064c..35dc7571ee2 100644 --- a/.gitignore +++ b/.gitignore @@ -203,6 +203,7 @@ checkpoints/ # Cache directories cache/ !vllm_omni/diffusion/cache/ +!tests/diffusion/cache/ .cache/ diffusion_cache/ kv_cache/ @@ -262,3 +263,5 @@ tmp_test vllm_omni/_version.py # output files *.wav +# CI overlay yamls materialized from tests/utils.py:_CI_OVERLAYS at test time +tests/.ci_generated/ diff --git a/benchmarks/qwen3-tts/README.md b/benchmarks/qwen3-tts/README.md index 9c01f29aa9f..a1c2ebe12ff 100644 --- a/benchmarks/qwen3-tts/README.md +++ b/benchmarks/qwen3-tts/README.md @@ -35,8 +35,8 @@ MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice bash run_benchmark.sh --async-only # Use a Voice Clone model MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-Base TASK_TYPE=Base bash run_benchmark.sh --async-only -# Use bs16 config for higher throughput -STAGE_CONFIG=vllm_omni/configs/qwen3_tts_bs16.yaml bash run_benchmark.sh --async-only +# Use batch size 16 for higher throughput +BATCH_SIZE=16 bash run_benchmark.sh --async-only # Custom GPU, prompt count, concurrency levels GPU_DEVICE=1 NUM_PROMPTS=20 CONCURRENCY="1 4" bash run_benchmark.sh @@ -50,7 +50,8 @@ GPU_DEVICE=1 NUM_PROMPTS=20 CONCURRENCY="1 4" bash run_benchmark.sh CUDA_VISIBLE_DEVICES=0 python -m vllm_omni.entrypoints.cli.main serve \ "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice" \ --omni --host 127.0.0.1 --port 8000 \ - --stage-configs-path benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ + --stage-overrides '{"0":{"max_num_seqs":1,"gpu_memory_utilization":0.3,"max_num_batched_tokens":512},"1":{"max_num_seqs":1,"gpu_memory_utilization":0.3,"max_num_batched_tokens":8192}}' \ --trust-remote-code ``` @@ -84,16 +85,19 @@ python benchmarks/qwen3-tts/plot_results.py \ --output results/comparison.png ``` -## Stage Configs +## Batch-size presets -| Config | 
max_num_seqs | Description | -|--------|:------------:|-------------| -| `vllm_omni/configs/qwen3_tts_bs1.yaml` | 1 | Single-request processing (lowest latency) | -| `vllm_omni/configs/qwen3_tts_bs16.yaml` | 16 | High-throughput concurrent processing | +The bench script loads the bundled production deploy (`vllm_omni/deploy/qwen3_tts.yaml`) and layers per-stage budgets on top via `--stage-overrides`, driven by the `BATCH_SIZE` env var. Each batch size picks compatible per-stage `max_num_seqs`, `max_num_batched_tokens`, and `gpu_memory_utilization` defaults: -All configs use a 2-stage pipeline (Talker -> Code2Wav) with `async_chunk` streaming enabled. The `SharedMemoryConnector` streams codec frames (25-frame chunks with 25-frame context overlap) between stages. +| `BATCH_SIZE` | Description | +|:--:|-------------| +| `1` (default) | Single-request processing (lowest latency) | +| `4` | Moderate-throughput concurrent processing | +| `16` | High-throughput concurrent processing | -The model is specified via the CLI `--model` flag (or `MODEL` env var), so the same configs work for both the 0.6B and 1.7B model variants. +The 2-stage pipeline (Talker -> Code2Wav) runs with `async_chunk` streaming enabled via the prod deploy; the `SharedMemoryConnector` streams codec frames (25-frame chunks with 25-frame context overlap) between stages. + +The model is specified via the CLI `--model` flag (or `MODEL` env var), so the same bench script works for both the 0.6B and 1.7B model variants. 
## Metrics diff --git a/benchmarks/qwen3-tts/run_benchmark.sh b/benchmarks/qwen3-tts/run_benchmark.sh index 283b6b844c1..8c3e46903ca 100755 --- a/benchmarks/qwen3-tts/run_benchmark.sh +++ b/benchmarks/qwen3-tts/run_benchmark.sh @@ -26,8 +26,8 @@ # # Use Voice Clone model # MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-Base TASK_TYPE=Base bash run_benchmark.sh --async-only # -# # Use batch_size=4 config: -# STAGE_CONFIG=vllm_omni/configs/qwen3_tts_bs4.yaml bash run_benchmark.sh --async-only +# # Use batch_size=4: +# BATCH_SIZE=4 bash run_benchmark.sh --async-only # # Environment variables: # GPU_DEVICE - GPU index to use (default: 0) @@ -35,9 +35,9 @@ # CONCURRENCY - Space-separated concurrency levels (default: "1 4 10") # MODEL - Model name (default: Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice) # PORT - Server port (default: 8000) -# GPU_MEM_TALKER - gpu_memory_utilization for talker stage (default: 0.3) -# GPU_MEM_CODE2WAV - gpu_memory_utilization for code2wav stage (default: 0.2) -# STAGE_CONFIG - Path to stage config YAML (default: configs/qwen3_tts_bs1.yaml) +# BATCH_SIZE - Per-stage ``max_num_seqs`` for both talker and code2wav (default: 1) +# GPU_MEM_TALKER - gpu_memory_utilization for talker stage (default: 0.3 at bs=1, else 0.2) +# GPU_MEM_CODE2WAV - gpu_memory_utilization for code2wav stage (default: 0.3 at bs=1, else 0.2) # TASK_TYPE - Task type: CustomVoice, VoiceDesign, Base (default: CustomVoice) set -euo pipefail @@ -51,14 +51,36 @@ NUM_PROMPTS="${NUM_PROMPTS:-50}" CONCURRENCY="${CONCURRENCY:-1 4 10}" MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice}" PORT="${PORT:-8000}" -GPU_MEM_TALKER="${GPU_MEM_TALKER:-0.3}" -GPU_MEM_CODE2WAV="${GPU_MEM_CODE2WAV:-0.2}" +BATCH_SIZE="${BATCH_SIZE:-1}" +DEFAULT_MEM=$([ "${BATCH_SIZE}" = "1" ] && echo "0.3" || echo "0.2") +GPU_MEM_TALKER="${GPU_MEM_TALKER:-${DEFAULT_MEM}}" +GPU_MEM_CODE2WAV="${GPU_MEM_CODE2WAV:-${DEFAULT_MEM}}" NUM_WARMUPS="${NUM_WARMUPS:-3}" -STAGE_CONFIG="${STAGE_CONFIG:-vllm_omni/configs/qwen3_tts_bs1.yaml}" 
+DEPLOY_CONFIG="vllm_omni/deploy/qwen3_tts.yaml" RESULT_DIR="${SCRIPT_DIR}/results" TIMESTAMP="$(date +%Y%m%d_%H%M%S)" TASK_TYPE="${TASK_TYPE:-CustomVoice}" +# Build --stage-overrides JSON from BATCH_SIZE + GPU_MEM_*. +STAGE_OVERRIDES=$( + BATCH_SIZE="${BATCH_SIZE}" \ + GPU_MEM_TALKER="${GPU_MEM_TALKER}" \ + GPU_MEM_CODE2WAV="${GPU_MEM_CODE2WAV}" \ + python - <<'PYEOF' +import json, os +bs = int(os.environ["BATCH_SIZE"]) +mem_t = float(os.environ["GPU_MEM_TALKER"]) +mem_c = float(os.environ["GPU_MEM_CODE2WAV"]) +# Prefill budget grows with batch size on both stages. +talker_batched = 512 if bs <= 4 else 4096 +code2wav_batched = 8192 if bs <= 4 else 32768 +print(json.dumps({ + "0": {"max_num_seqs": bs, "gpu_memory_utilization": mem_t, "max_num_batched_tokens": talker_batched}, + "1": {"max_num_seqs": bs, "gpu_memory_utilization": mem_c, "max_num_batched_tokens": code2wav_batched}, +})) +PYEOF +) + # Parse args RUN_ASYNC=true RUN_HF=true @@ -75,41 +97,27 @@ mkdir -p "${RESULT_DIR}" echo "============================================================" echo " Qwen3-TTS Benchmark" echo "============================================================" -echo " GPU: ${GPU_DEVICE}" -echo " Model: ${MODEL}" -echo " Prompts: ${NUM_PROMPTS}" -echo " Concurrency: ${CONCURRENCY}" -echo " Port: ${PORT}" -echo " Stage config: ${STAGE_CONFIG}" -echo " Results: ${RESULT_DIR}" -echo " Task type: ${TASK_TYPE}" +echo " GPU: ${GPU_DEVICE}" +echo " Model: ${MODEL}" +echo " Prompts: ${NUM_PROMPTS}" +echo " Concurrency: ${CONCURRENCY}" +echo " Port: ${PORT}" +echo " Deploy config: ${DEPLOY_CONFIG}" +echo " Batch size: ${BATCH_SIZE}" +echo " GPU mem T/C: ${GPU_MEM_TALKER} / ${GPU_MEM_CODE2WAV}" +echo " Results: ${RESULT_DIR}" +echo " Task type: ${TASK_TYPE}" echo "============================================================" -# Prepare stage config with correct GPU device and memory settings -prepare_config() { - local config_template="$1" - local config_name="$2" - local 
output_path="${RESULT_DIR}/${config_name}_stage_config.yaml" - - # Use sed to patch GPU device and memory utilization - sed \ - -e "s/devices: \"0\"/devices: \"${GPU_DEVICE}\"/g" \ - -e "s/gpu_memory_utilization: 0.3/gpu_memory_utilization: ${GPU_MEM_TALKER}/g" \ - -e "s/gpu_memory_utilization: 0.2/gpu_memory_utilization: ${GPU_MEM_CODE2WAV}/g" \ - "${config_template}" > "${output_path}" - - echo "${output_path}" -} - # Start server and wait for it to be ready start_server() { - local stage_config="$1" - local config_name="$2" + local config_name="$1" local log_file="${RESULT_DIR}/server_${config_name}_${TIMESTAMP}.log" echo "" echo "Starting server with config: ${config_name}" - echo " Stage config: ${stage_config}" + echo " Deploy config: ${DEPLOY_CONFIG}" + echo " Stage overrides: ${STAGE_OVERRIDES}" echo " Log file: ${log_file}" VLLM_WORKER_MULTIPROC_METHOD=spawn \ @@ -118,7 +126,8 @@ start_server() { --omni \ --host 127.0.0.1 \ --port "${PORT}" \ - --stage-configs-path "${stage_config}" \ + --deploy-config "${DEPLOY_CONFIG}" \ + --stage-overrides "${STAGE_OVERRIDES}" \ --stage-init-timeout 120 \ --trust-remote-code \ --disable-log-stats \ @@ -175,17 +184,13 @@ trap 'stop_server' EXIT # Run benchmark for a given config run_bench() { local config_name="$1" - local config_template="$2" echo "" echo "============================================================" echo " Benchmarking: ${config_name}" echo "============================================================" - local stage_config - stage_config=$(prepare_config "${config_template}" "${config_name}") - - start_server "${stage_config}" "${config_name}" + start_server "${config_name}" # Convert concurrency string to args local conc_args="" @@ -212,7 +217,7 @@ run_bench() { # Run vllm-omni benchmark if [ "${RUN_ASYNC}" = true ]; then - run_bench "async_chunk" "${SCRIPT_DIR}/${STAGE_CONFIG}" + run_bench "async_chunk" fi # Run HuggingFace baseline benchmark diff --git 
a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml deleted file mode 100644 index ca441d286dd..00000000000 --- a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs1.yaml +++ /dev/null @@ -1,93 +0,0 @@ -# Qwen3-TTS batch_size=1 config (streaming with async_chunk) -# 2-stage pipeline: Talker -> Code2Wav -async_chunk: true -stage_args: - - stage_id: 0 - stage_type: llm - is_comprehension: true - runtime: - devices: "0" - engine_args: - max_num_seqs: 1 - model_stage: qwen3_tts - model_arch: Qwen3TTSTalkerForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - enforce_eager: false - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: latent - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 512 - max_model_len: 4096 - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk - output_connectors: - to_stage_1: connector_of_shared_memory - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: false - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 1 - stage_type: llm - runtime: - devices: "0" - engine_args: - max_num_seqs: 1 - model_stage: code2wav - model_arch: Qwen3TTSCode2Wav - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: audio - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 8192 - max_model_len: 32768 - engine_input_source: [0] - final_output: true - final_output_type: audio - input_connectors: - from_stage_0: connector_of_shared_memory - tts_args: - max_instructions_length: 500 - 
default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: true - repetition_penalty: 1.0 - -runtime: - enabled: true - defaults: - window_size: -1 - max_inflight: 1 - - connectors: - connector_of_shared_memory: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 - codec_streaming: true - connector_get_sleep_s: 0.01 - connector_get_max_wait_first_chunk: 3000 - connector_get_max_wait: 300 - codec_chunk_frames: 25 - codec_left_context_frames: 25 - - edges: - - from: 0 - to: 1 - window_size: -1 diff --git a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs16.yaml b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs16.yaml deleted file mode 100644 index 2cc5cf53532..00000000000 --- a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs16.yaml +++ /dev/null @@ -1,94 +0,0 @@ -# Qwen3-TTS max_num_seqs=16 config (streaming with async_chunk) -# High-throughput concurrent request processing -# 2-stage pipeline: Talker -> Code2Wav -async_chunk: true -stage_args: - - stage_id: 0 - stage_type: llm - is_comprehension: true - runtime: - devices: "0" - engine_args: - max_num_seqs: 16 - model_stage: qwen3_tts - model_arch: Qwen3TTSTalkerForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - enforce_eager: false - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: latent - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 4096 - max_model_len: 4096 - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk - output_connectors: - to_stage_1: connector_of_shared_memory - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: false - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 1 - stage_type: llm - runtime: - devices: "0" - 
engine_args: - max_num_seqs: 16 - model_stage: code2wav - model_arch: Qwen3TTSCode2Wav - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: audio - gpu_memory_utilization: 0.2 - distributed_executor_backend: "mp" - max_num_batched_tokens: 16384 - max_model_len: 32768 - engine_input_source: [0] - final_output: true - final_output_type: audio - input_connectors: - from_stage_0: connector_of_shared_memory - tts_args: - max_instructions_length: 500 - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: true - repetition_penalty: 1.0 - -runtime: - enabled: true - defaults: - window_size: -1 - max_inflight: 16 - - connectors: - connector_of_shared_memory: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 - codec_streaming: true - connector_get_sleep_s: 0.01 - connector_get_max_wait_first_chunk: 3000 - connector_get_max_wait: 300 - codec_chunk_frames: 25 - codec_left_context_frames: 25 - - edges: - - from: 0 - to: 1 - window_size: -1 diff --git a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs4.yaml b/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs4.yaml deleted file mode 100644 index 5de107d4976..00000000000 --- a/benchmarks/qwen3-tts/vllm_omni/configs/qwen3_tts_bs4.yaml +++ /dev/null @@ -1,94 +0,0 @@ -# Qwen3-TTS batch_size=4 config (streaming with async_chunk) -# Enables concurrent request processing -# 2-stage pipeline: Talker -> Code2Wav -async_chunk: true -stage_args: - - stage_id: 0 - stage_type: llm - is_comprehension: true - runtime: - devices: "0" - engine_args: - max_num_seqs: 4 - model_stage: qwen3_tts - model_arch: Qwen3TTSTalkerForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - enforce_eager: false - trust_remote_code: true 
- async_scheduling: true - enable_prefix_caching: false - engine_output_type: latent - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 512 - max_model_len: 4096 - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk - output_connectors: - to_stage_1: connector_of_shared_memory - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: false - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 1 - stage_type: llm - runtime: - devices: "0" - engine_args: - max_num_seqs: 4 - model_stage: code2wav - model_arch: Qwen3TTSCode2Wav - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: true - enable_prefix_caching: false - engine_output_type: audio - gpu_memory_utilization: 0.2 - distributed_executor_backend: "mp" - max_num_batched_tokens: 8192 - max_model_len: 32768 - engine_input_source: [0] - final_output: true - final_output_type: audio - input_connectors: - from_stage_0: connector_of_shared_memory - tts_args: - max_instructions_length: 500 - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: true - repetition_penalty: 1.0 - -runtime: - enabled: true - defaults: - window_size: -1 - max_inflight: 4 - - connectors: - connector_of_shared_memory: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 - codec_streaming: true - connector_get_sleep_s: 0.01 - connector_get_max_wait_first_chunk: 3000 - connector_get_max_wait: 300 - codec_chunk_frames: 25 - codec_left_context_frames: 25 - - edges: - - from: 0 - to: 1 - window_size: -1 diff --git a/benchmarks/qwen3-tts/vllm_omni/run_async_chunk_benchmark.sh b/benchmarks/qwen3-tts/vllm_omni/run_async_chunk_benchmark.sh index 61cf7757a9b..0ede359ea37 100755 --- 
a/benchmarks/qwen3-tts/vllm_omni/run_async_chunk_benchmark.sh +++ b/benchmarks/qwen3-tts/vllm_omni/run_async_chunk_benchmark.sh @@ -31,8 +31,11 @@ PORT_OFF="${PORT_OFF:-8001}" RESULT_DIR="${SCRIPT_DIR}/results" TIMESTAMP="$(date +%Y%m%d_%H%M%S)" -STAGE_CONFIG_ON="vllm_omni/model_executor/stage_configs/qwen3_tts.yaml" -STAGE_CONFIG_OFF="vllm_omni/model_executor/stage_configs/qwen3_tts_no_async_chunk.yaml" +# The bundled ``vllm_omni/deploy/qwen3_tts.yaml`` is auto-loaded by the model +# registry; no ``--deploy-config`` flag needed on the default (ON) path. +# async_chunk OFF is selected by the ``--no-async-chunk`` CLI flag — +# the single ``qwen3_tts`` pipeline dispatches to the end-to-end codec +# processor when ``deploy.async_chunk`` is false. mkdir -p "${RESULT_DIR}" @@ -77,7 +80,6 @@ wait_for_server() { echo "" echo "[Phase 1] Starting async_chunk ON server on port ${PORT_ON}..." CUDA_VISIBLE_DEVICES=${GPU_DEVICE} vllm-omni serve "${MODEL}" \ - --stage-configs-path "${STAGE_CONFIG_ON}" \ --host 0.0.0.0 --port "${PORT_ON}" \ --trust-remote-code --enforce-eager --omni \ > "${RESULT_DIR}/server_on_${TIMESTAMP}.log" 2>&1 & @@ -104,7 +106,7 @@ sleep 5 echo "" echo "[Phase 2] Starting async_chunk OFF server on port ${PORT_OFF}..." 
CUDA_VISIBLE_DEVICES=${GPU_DEVICE} vllm-omni serve "${MODEL}" \ - --stage-configs-path "${STAGE_CONFIG_OFF}" \ + --no-async-chunk \ --host 0.0.0.0 --port "${PORT_OFF}" \ --trust-remote-code --enforce-eager --omni \ > "${RESULT_DIR}/server_off_${TIMESTAMP}.log" 2>&1 & diff --git a/docs/.nav.yml b/docs/.nav.yml index 79d7c38e274..455a0525056 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -107,7 +107,7 @@ nav: - design/feature/hsdp.md - design/feature/cache_dit.md - design/feature/teacache.md - - design/feature/async_chunk_design.md + - design/feature/async_chunk.md - design/feature/vae_parallel.md - design/feature/diffusion_step_execution.md - Module Design: diff --git a/docs/assets/WeChat.jpg b/docs/assets/WeChat.jpg index 416439f7eb0..83252b7569d 100644 Binary files a/docs/assets/WeChat.jpg and b/docs/assets/WeChat.jpg differ diff --git a/docs/configuration/README.md b/docs/configuration/README.md index b5761a7f1bc..390176e9cea 100644 --- a/docs/configuration/README.md +++ b/docs/configuration/README.md @@ -6,7 +6,7 @@ For options within a vLLM Engine. Please refer to [vLLM Configuration](https://d Currently, the main options are maintained by stage configs for each model. -For specific example, please refer to [Qwen2.5-omni stage config](stage_configs/qwen2_5_omni.yaml) +For a specific example, see the [Qwen2.5-Omni deploy config](gh-file:vllm_omni/deploy/qwen2_5_omni.yaml). The matching frozen pipeline topology lives at [vllm_omni/model_executor/models/qwen2_5_omni/pipeline.py](gh-file:vllm_omni/model_executor/models/qwen2_5_omni/pipeline.py). 
For introduction, please check [Introduction for stage config](./stage_configs.md) diff --git a/docs/configuration/pd_disaggregation.md b/docs/configuration/pd_disaggregation.md index 1cf6189e603..9196bdb0240 100644 --- a/docs/configuration/pd_disaggregation.md +++ b/docs/configuration/pd_disaggregation.md @@ -11,7 +11,7 @@ deployment-specific values usually change per environment: - connector backend and connector ports - connector IPs or bootstrap addresses -Start from the [default Qwen3-Omni stage config](gh-file:vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml) +Start from the [default Qwen3-Omni deploy config](gh-file:vllm_omni/deploy/qwen3_omni_moe.yaml) and copy it to your own file, for example `qwen3_omni_pd.yaml`. Then apply the changes below. @@ -145,19 +145,13 @@ Compared with the default Qwen3-Omni config: ```yaml runtime: enabled: true - defaults: - window_size: -1 - max_inflight: 1 edges: - from: 0 to: 1 - window_size: -1 - from: 1 to: 2 - window_size: -1 - from: 2 to: 3 - window_size: -1 ``` ## 4. Launch with your custom config diff --git a/docs/configuration/stage_configs.md b/docs/configuration/stage_configs.md index 95c42afcc70..55b4053cc71 100644 --- a/docs/configuration/stage_configs.md +++ b/docs/configuration/stage_configs.md @@ -3,7 +3,147 @@ In vLLM-Omni, the target model is separated into multiple stages, which are processed by different LLMEngines, DiffusionEngines or other types of engines. Depending on different types of stages, such as Autoregressive (AR) stage or Diffusion transformer (DiT) stage, each can choose corresponding schedulers, model workers to load with the Engines in a plug-in fashion. !!! note - Default stage config YAMLs (for example, `vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml` and `vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml`) are bundled and loaded automatically when `stage_configs_path` is not provided.
They have been verified to work on 1xH100 for Qwen2.5-Omni and 2xH100 for Qwen3-Omni. + Default deploy config YAMLs (for example, `vllm_omni/deploy/qwen2_5_omni.yaml`, `vllm_omni/deploy/qwen3_omni_moe.yaml`, and `vllm_omni/deploy/qwen3_tts.yaml`) are bundled and loaded automatically when neither `--stage-configs-path` nor `--deploy-config` is provided — the model registry resolves the right pipeline + deploy YAML by `model_type`. The bundled defaults have been verified on 1xH100 for Qwen2.5-Omni and 2xH100 for Qwen3-Omni. Models that have not yet migrated to the new schema continue to use the legacy `vllm_omni/model_executor/stage_configs/.yaml` files via `--stage-configs-path`. + +## New deploy schema reference + +The new deploy schema lives under `vllm_omni/deploy/` and is paired with a frozen `PipelineConfig` registered by the model's `pipeline.py`. Each deploy YAML has these top-level fields: + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `base_config` | str (path) | optional | — | Overlay parent (relative or absolute). `stages:` / `platforms:` deep-merged by stage_id; other scalars overlay-wins. Intended for user-authored overlays; prod yamls stay flat. | +| `async_chunk` | bool | optional | `true` | Enable chunked streaming between stages. Pin to `false` if the pipeline runs end-to-end. | +| `connectors` | dict | optional | `null` | Named connector specs (`{name, extra}`). Referenced by each stage's `input_connectors` / `output_connectors`. See [Connector schema](#connector-schema). | +| `edges` | list | optional | `null` | Explicit edge list for the KV transfer graph. Auto-derived from stage inputs if omitted. | +| `stages` | list | required | — | Per-stage engine args + wiring (see [Stage fields](#stage-fields)). | +| `platforms` | dict | optional | `null` | Keyed by `npu` / `rocm` / `xpu`, each contains a `stages:` list with per-platform overrides applied on top of the CUDA defaults. 
| +| `pipeline` | str | optional | `null` | Override the auto-detected pipeline registry key (used for structural variants like `qwen2_5_omni_thinker_only`). | +| `trust_remote_code` | bool | optional | `true` | **Pipeline-wide.** Trust HF remote code on model load; applies to every stage. | +| `distributed_executor_backend` | str | optional | `"mp"` | **Pipeline-wide.** Executor backend (`"mp"` or `"ray"`). | +| `dtype` | str \| null | optional | `null` | **Pipeline-wide.** Model dtype for every stage. | +| `quantization` | str \| null | optional | `null` | **Pipeline-wide.** Quantization method for every stage. | +| `enable_prefix_caching` | bool | optional | `false` | **Pipeline-wide.** Prefix cache toggle applied to every stage. | +| `enable_chunked_prefill` | bool \| null | optional | `null` | **Pipeline-wide.** Chunked prefill toggle applied to every stage. | +| `data_parallel_size` | int | optional | `1` | **Pipeline-wide.** DP degree for every stage. | +| `pipeline_parallel_size` | int | optional | `1` | **Pipeline-wide.** PP degree for every stage. | + +### Stage fields + +Each entry under `stages:` accepts any `StageDeployConfig` field directly (no nested `engine_args:`). Only fields whose value legitimately varies across stages live here; pipeline-wide settings (trust_remote_code, distributed_executor_backend, dtype, quantization, prefix/chunked prefill, DP/PP sizes) are declared at the top level and applied to every stage. Unknown keys fall through to `engine_extras:` and are forwarded to the engine. + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `stage_id` | int | required | — | Stage identity; matched against `PipelineConfig.stages[*].stage_id`. | +| `max_num_seqs` | int | optional | `64` | Max concurrent sequences per stage. | +| `gpu_memory_utilization` | float | optional | `0.9` | Per-stage memory budget. | +| `tensor_parallel_size` | int | optional | `1` | TP degree for this stage. 
| +| `enforce_eager` | bool | optional | `false` | Disable CUDA graphs. | +| `max_num_batched_tokens` | int | optional | `32768` | Prefill budget. | +| `max_model_len` | int \| null | optional | `null` | Per-stage context length (auto-sets `VLLM_ALLOW_LONG_MAX_MODEL_LEN=1` when larger than HF default). | +| `async_scheduling` | bool \| null | optional | `null` | Per-stage async scheduling toggle. | +| `devices` | str | optional | `"0"` | `CUDA_VISIBLE_DEVICES`-style device list. | +| `output_connectors` | dict \| null | optional | `null` | Keyed by `to_stage_<id>`; values are names registered under top-level `connectors:`. | +| `input_connectors` | dict \| null | optional | `null` | Keyed by `from_stage_<id>`; values are names registered under top-level `connectors:`. | +| `default_sampling_params` | dict \| null | optional | `null` | Baseline sampling params. Deep-merged with pipeline `sampling_constraints` (pipeline wins). | +| `engine_extras` | dict | optional | `{}` | Catch-all for keys not listed above; deep-merged across overlays. Also carries per-stage overrides of pipeline-wide settings (e.g. stage-specific `dtype`). | + +### Connector schema + +Each entry under top-level `connectors:` follows this shape: + +```yaml +connectors: + <connector_name>: + name: <ConnectorClass> # required — class registered in vllm_omni.distributed + extra: # optional — forwarded to the connector's __init__ + <key>: <value> + ... +``` + +| Connector class | Use case | `extra` keys | +|-----------------|----------|--------------| +| `SharedMemoryConnector` | Same-host KV transfer between stages (default for bundled YAMLs). | `shm_threshold_bytes` (int, default `65536`). | +| `MooncakeStoreConnector` | Cross-host KV transfer over TCP. Required for multi-node deployments. | `host`, `metadata_server`, `master`, `segment` (int bytes), `localbuf` (int bytes), `proto` (`"tcp"` / `"rdma"`).
| + +A stage references a connector by name in its `input_connectors` / `output_connectors`: + +```yaml +connectors: + shm: + name: SharedMemoryConnector + +stages: + - stage_id: 0 + output_connectors: {to_stage_1: shm} + - stage_id: 1 + input_connectors: {from_stage_0: shm} +``` + +### CLI flags introduced in this refactor + +| Flag | Description | +|------|-------------| +| `--deploy-config PATH` | Load a new-schema deploy YAML. Takes precedence over `--stage-configs-path`. **Optional** — when omitted, the bundled `vllm_omni/deploy/.yaml` is auto-loaded by the model registry. | +| `--stage-overrides JSON` | Per-stage JSON overrides, e.g. `'{"0":{"gpu_memory_utilization":0.5}}'`. Per-stage values always win over global flags. | +| `--async-chunk` / `--no-async-chunk` | Flip the deploy YAML's `async_chunk:` bool. Unset (default) leaves the YAML value in force. | +| `--stage-configs-path` | **Deprecated.** Accepts legacy `stage_args` yamls and (auto-detected) new deploy yamls; emits a deprecation warning. Migrate to `--deploy-config`. To be removed in a follow-up PR. | + +### Precedence + +From highest to lowest: + +1. Per-stage flags (`--stage-overrides` JSON, `--stage--` if registered) +2. Explicit global CLI flags (`--gpu-memory-utilization 0.85`, etc.) +3. Platform section (`platforms.npu.stages`, etc.) on top of the base `stages:` +4. Overlay YAML (via `base_config:`) on top of the base YAML +5. 
Parser defaults + +### Worked override example + +Starting from the bundled `vllm_omni/deploy/qwen3_omni_moe.yaml`: + +```yaml +# vllm_omni/deploy/qwen3_omni_moe.yaml (excerpt) +async_chunk: true +stages: + - stage_id: 0 + gpu_memory_utilization: 0.9 + max_num_seqs: 32 + - stage_id: 1 + gpu_memory_utilization: 0.7 + max_num_seqs: 16 +``` + +A user-authored overlay that inherits the base and overrides only stage 1: + +```yaml +# my_overrides.yaml +base_config: /path/to/vllm_omni/deploy/qwen3_omni_moe.yaml +stages: + - stage_id: 1 + gpu_memory_utilization: 0.5 # smaller GPU +``` + +Launched with both an explicit global flag and a per-stage override: + +```bash +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ + --deploy-config my_overrides.yaml \ + --max-model-len 16384 \ + --stage-overrides '{"0": {"max_num_seqs": 8}}' +``` + +Effective config per stage after the merge: + +| Stage | Field | Final value | Source | +|-------|-------|-------------|--------| +| 0 | `gpu_memory_utilization` | `0.9` | base YAML (overlay didn't touch stage 0) | +| 0 | `max_num_seqs` | `8` | per-stage CLI (`--stage-overrides`) — wins over base `32` | +| 0 | `max_model_len` | `16384` | global CLI | +| 1 | `gpu_memory_utilization` | `0.5` | overlay YAML — wins over base `0.7` | +| 1 | `max_num_seqs` | `16` | base YAML (overlay didn't touch this field) | +| 1 | `max_model_len` | `16384` | global CLI | +| 2 | (all defaults) | — | base YAML (no overrides apply) | Therefore, as a core part of vLLM-Omni, the stage configs for a model have several main functions: @@ -35,7 +175,7 @@ stage_args: - stage_id: 0 # mark the unique id for each stage runtime: # The disaggregated configuration process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) + devices: "0" # Logical device index for this stage (mapped through CUDA_VISIBLE_DEVICES / ASCEND_RT_VISIBLE_DEVICES if set) engine_args: # Engine arguments 
for a certain engine model_stage: thinker max_num_seqs: 1 @@ -114,16 +254,12 @@ stage_args: # Top-level runtime config (concise): default windows and stage edges runtime: enabled: true - defaults: - window_size: -1 # Simplified: trigger downstream only after full upstream completion - max_inflight: 1 # Simplified: process serially within each stage + edges: - from: 0 # thinker → talker: trigger only after receiving full input (-1) to: 1 - window_size: -1 - from: 1 # talker → code2wav: trigger only after receiving full input (-1) to: 2 - window_size: -1 ``` @@ -155,7 +291,9 @@ Default: `true` #### `runtime.devices` -Visible devices for this stage, specified as a string. This controls which GPU devices are available to the stage process, similar to setting `CUDA_VISIBLE_DEVICES` or using `torch.cuda.set_device()`. For example, `"0"` uses GPU 0, `"1"` uses GPU 1, and `"0,1"` makes both GPUs 0 and 1 visible. +Logical device indices for this stage, specified as a string. Values are **logical indices** (`0`, `1`, `2`, ...) — not physical GPU IDs — and are mapped through the platform's visibility env var (`CUDA_VISIBLE_DEVICES` on CUDA, `ASCEND_RT_VISIBLE_DEVICES` on NPU) before being applied via `torch.cuda.set_device()` (or the equivalent). + +Example: if `CUDA_VISIBLE_DEVICES=0,2,4` is set in the environment, then `devices: "0"` selects physical GPU 0 (the first visible), `devices: "1"` selects physical GPU 2, and `devices: "0,1"` makes physical GPUs 0 and 2 available to the stage. If no visibility env var is set, logical and physical IDs coincide. Default: `"0"` diff --git a/docs/configuration/stage_configs/qwen2_5_omni.yaml b/docs/configuration/stage_configs/qwen2_5_omni.yaml deleted file mode 100644 index 690577b84a8..00000000000 --- a/docs/configuration/stage_configs/qwen2_5_omni.yaml +++ /dev/null @@ -1,94 +0,0 @@ -# stage config for running qwen2.5-omni with AsyncOmniEngine + Orchestrator runtime. 
-stage_args: - - stage_id: 0 - runtime: - process: true # Run this stage in a separate process - devices: "0" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) - engine_args: - model_stage: thinker - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.8 - enforce_eager: true # Now we only support eager mode - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - - stage_id: 1 - runtime: - process: true - devices: "1" - engine_args: - model_stage: talker - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.8 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: latent - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker - default_sampling_params: - temperature: 0.9 - top_p: 0.8 - top_k: 40 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - stop_token_ids: [8294] - - stage_id: 2 - runtime: - process: true - devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - gpu_memory_utilization: 0.15 - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: audio - engine_input_source: [1] - 
final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - -# Top-level runtime config (concise): default windows and stage edges -runtime: - enabled: true - defaults: - window_size: -1 # Simplified: trigger downstream only after full upstream completion - max_inflight: 1 # Simplified: process serially within each stage - edges: - - from: 0 # thinker → talker: trigger only after receiving full input (-1) - to: 1 - window_size: -1 - - from: 1 # talker → code2wav: trigger only after receiving full input (-1) - to: 2 - window_size: -1 diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md index b0428ddd7de..2452ef5d4a3 100644 --- a/docs/contributing/ci/CI_5levels.md +++ b/docs/contributing/ci/CI_5levels.md @@ -231,8 +231,7 @@ vllm_omni/ tests/ │ ├── test_qwen3_omni_expansion.py │ ├── test_mimo_audio.py │ ├── test_image_gen_edit.py - │ ├── test_images_generations_lora.py - │ └── stage_configs/ + │ └── test_images_generations_lora.py └── offline_inference/ ✅ ├── test_qwen2_5_omni.py ├── test_qwen3_omni.py @@ -248,11 +247,12 @@ vllm_omni/ tests/ ├── test_diffusion_layerwise_offload.py ├── test_diffusion_lora.py ├── test_sequence_parallel.py - └── stage_configs/ - ├── qwen2_5_omni_ci.yaml - ├── qwen3_omni_ci.yaml - ├── bagel_*.yaml - └── npu/, rocm/, etc. + └── stage_configs/ (legacy schema, still + ├── bagel_*.yaml present for unmigrated + └── npu/, rocm/, etc. models) + +# Migrated models (qwen3_omni_moe, qwen2_5_omni, qwen3_tts) live under +# vllm_omni/deploy/ instead — see docs/configuration/stage_configs.md. 
``` diff --git a/docs/contributing/ci/test_examples/l4_functionality_tests.inc.md b/docs/contributing/ci/test_examples/l4_functionality_tests.inc.md index 69d6ad82871..ab4deecd60d 100644 --- a/docs/contributing/ci/test_examples/l4_functionality_tests.inc.md +++ b/docs/contributing/ci/test_examples/l4_functionality_tests.inc.md @@ -40,7 +40,7 @@ Currently all the features are available in online serving mode. Hence, only nee - Test marks: always add `advanced_model` and `diffusion`. Add GPU-related marks if needed. Ref: [Markers for Tests](https://docs.vllm.ai/projects/vllm-omni/en/latest/contributing/ci/tests_markers/). - To maximize code reuse, you may refer to - `tests/conftest.py` for `omni_server` (running server in subprocess) and `openai_client` fixtures (sending requests and validating output), `generate_synthetic_image` and `assert_XXX_valid` helper. - - `tests/utils.py` for `@hardware_test(...)` and `hardware_marks`. + - `tests/helpers/mark.py` for `@hardware_test(...)` and `hardware_marks`. - [Parametrizing tests (pytest doc)](https://docs.pytest.org/en/stable/example/parametrize.html) to reuse test function implementation for different cases. - Doc: add a concise docstring for each test function. - Reference L4 test implementation: [tests/e2e/online_serving/test_qwen_image_edit_expansion.py](https://github.com/vllm-project/vllm-omni/blob/main/tests/e2e/online_serving/test_qwen_image_edit_expansion.py). 
diff --git a/docs/contributing/ci/tests_markers.md b/docs/contributing/ci/tests_markers.md index 7c1ba1c73bd..7628db284a7 100644 --- a/docs/contributing/ci/tests_markers.md +++ b/docs/contributing/ci/tests_markers.md @@ -38,7 +38,7 @@ Defined in `pyproject.toml`: ### Example usage for markers ```python -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test @pytest.mark.core_model @pytest.mark.omni @@ -53,7 +53,7 @@ def test_video_to_audio() ### Decorator: `@hardware_test` -This decorator is intended to make hardware-aware, cross-platform test authoring easier and more robust for CI/CD environments. The `hardware_test` decorator in `vllm-omni/tests/utils.py` performs the following actions: +This decorator is intended to make hardware-aware, cross-platform test authoring easier and more robust for CI/CD environments. The `hardware_test` decorator in `vllm-omni/tests/helpers/mark.py` performs the following actions: 1. **Applies platform and resource markers** Adds the appropriate pytest markers for each specified hardware platform (e.g., `cuda`, `rocm`, `xpu`, `npu`) and resource type (e.g., `L4`, `H100`, `MI325`, `B60`, `A2`, `A3`). @@ -105,7 +105,7 @@ This decorator is intended to make hardware-aware, cross-platform test authoring `hardware_marks` returns a list of pytest mark objects with the same signature as `@hardware_test`. Use it when you need more flexibility, such as attaching hardware marks to individual `pytest.param` entries rather than an entire test function. ```python -from tests.utils import hardware_marks +from tests.helpers.mark import hardware_marks MULTI_CARD_MARKS = hardware_marks( res={"cuda": "H100", "rocm": "MI325", "npu": "A2"}, num_cards=2 @@ -133,9 +133,9 @@ If you want to add support for a new platform (e.g., "tpu" for a new accelerator "distributed_tpu: Tests that require multiple TPU devices", ] ``` -2. **Implement a marker construction function for your platform** in `vllm-omni/tests/utils.py`: +2. 
**Implement a marker construction function for your platform** in `vllm-omni/tests/helpers/mark.py`: ```python - # In vllm-omni/tests/utils.py + # In vllm-omni/tests/helpers/mark.py def tpu_marks(*, res: str, num_cards: int): test_platform = pytest.mark.tpu @@ -175,4 +175,4 @@ If you want to add support for a new platform (e.g., "tpu" for a new accelerator - Plug into `hardware_marks` - You're done: tests using `@hardware_test` or `hardware_marks` with your platform now automatically get the correct markers, distribution, and isolation! -See code in `vllm-omni/tests/utils.py` for existing examples (`cuda_marks`, `rocm_marks`, `npu_marks`). +See code in `vllm-omni/tests/helpers/mark.py` for existing examples (`cuda_marks`, `rocm_marks`, `npu_marks`). diff --git a/docs/contributing/ci/tests_style.md b/docs/contributing/ci/tests_style.md index 69d5b16d7a5..3a8cb0f127c 100644 --- a/docs/contributing/ci/tests_style.md +++ b/docs/contributing/ci/tests_style.md @@ -135,8 +135,7 @@ vllm_omni/ tests/ │ ├── test_qwen3_omni_expansion.py │ ├── test_mimo_audio.py │ ├── test_image_gen_edit.py - │ ├── test_images_generations_lora.py - │ └── stage_configs/ + │ └── test_images_generations_lora.py └── offline_inference/ ✅ ├── test_qwen2_5_omni.py ├── test_qwen3_omni.py @@ -153,11 +152,12 @@ vllm_omni/ tests/ ├── test_diffusion_lora.py ├── test_sequence_parallel.py ├── test_qwen_image_edit_expansion.py - └── stage_configs/ - ├── qwen2_5_omni_ci.yaml - ├── qwen3_omni_ci.yaml - ├── bagel_*.yaml + └── stage_configs/ (legacy schema, still present + ├── bagel_*.yaml for unmigrated models) └── npu/, rocm/, etc. + +# Migrated models (qwen3_omni_moe, qwen2_5_omni, qwen3_tts) live under +# vllm_omni/deploy/ instead — see docs/configuration/stage_configs.md. 
examples/ tests │ └── examples ├── online_serving/ → ├── online_serving/ @@ -221,14 +221,13 @@ from pathlib import Path import openai import pytest -from tests.conftest import ( - OmniServer, - convert_audio_to_text, +from tests.helpers.media import ( + convert_audio_bytes_to_text, cosine_similarity_text, - dummy_messages_from_mix_data, generate_synthetic_video, - merge_base64_and_convert_to_text, ) +from tests.helpers.runtime import OmniServer, dummy_messages_from_mix_data +from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config from vllm_omni.platforms import current_omni_platform # Edit: model name and stage config path @@ -236,7 +235,7 @@ models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] #If you use the default configuration file, you can directly use the following address. def get_default_config(): - return str(Path(__file__).parent.parent / "stage_configs" / "qwen3_omni_ci.yaml") + return get_deploy_config_path("ci/qwen3_omni_moe.yaml") #If you need to modify the configuration file, you can use modify_stage_config. 
def get_chunk_config(): @@ -405,7 +404,7 @@ def test_mix_to_text_audio_001(client: openai.OpenAI, omni_server, request) -> N # PURPOSE: Verify text and audio outputs convey the same information # CUSTOMIZATION: Adjust similarity threshold (0.9) based on accuracy requirements assert audio_data is not None, "No audio output is generated" - audio_content = merge_base64_and_convert_to_text(audio_data) + audio_content = convert_audio_bytes_to_text(audio_data) print(f"text content is: {text_content}") print(f"audio content is: {audio_content}") similarity = cosine_similarity_text(audio_content.lower(), text_content.lower()) @@ -428,7 +427,7 @@ from pathlib import Path import pytest from vllm.assets.video import VideoAsset -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test from ..multi_stages.conftest import OmniRunner # Optional: set process start method for workers diff --git a/docs/contributing/model/adding_omni_model.md b/docs/contributing/model/adding_omni_model.md index a0619e33811..1eaff10596c 100644 --- a/docs/contributing/model/adding_omni_model.md +++ b/docs/contributing/model/adding_omni_model.md @@ -313,7 +313,7 @@ The registry uses lazy loading, so the model class is imported only when needed. ## Stage Configuration -Create a YAML configuration file in `vllm_omni/model_executor/stage_configs/`. For a complete example, see the [Qwen3-Omni configuration file](gh-file:vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml). +Create a YAML configuration file in `vllm_omni/deploy/`. For a complete example, see the [Qwen3-Omni configuration file](gh-file:vllm_omni/deploy/qwen3_omni_moe.yaml). 
### Key Configuration Fields @@ -408,18 +408,17 @@ Understanding the data structures is crucial for implementing stage transitions: **Input to your function:** - `stage_list[source_stage_id].engine_outputs`: List of `EngineCoreOutput` objects - - Each contains `outputs`: List of `RequestOutput` objects - - Each `RequestOutput` has: - - `token_ids`: Generated token IDs - - `multimodal_output`: Dict with keys like `"code_predictor_codes"`, etc. - - These are the hidden states or intermediate outputs from the model's forward pass - - `prompt_token_ids`: Original prompt token IDs +- - Each contains `outputs`: List of `RequestOutput` objects + - Each `RequestOutput` has: +- - - `token_ids`: Generated token IDs + - `multimodal_output`: Dict with keys like `"code_predictor_codes"`, etc. These are the hidden states or intermediate outputs from the model's forward pass + - `prompt_token_ids`: Original prompt token IDs **Output from your function:** - Must return `list[OmniTokensPrompt]` where each `OmniTokensPrompt` contains: - - `prompt_token_ids`: List[int] - Token IDs for the next stage - - `additional_information`: Dict[str, Any] - Optional metadata (e.g., embeddings, hidden states) - - `multi_modal_data`: Optional multimodal data if needed +- - `prompt_token_ids`: List[int] - Token IDs for the next stage + - `additional_information`: Dict[str, Any] - Optional metadata (e.g., embeddings, hidden states) + - `multi_modal_data`: Optional multimodal data if needed ### How Model Outputs Are Stored @@ -614,7 +613,7 @@ For a complete reference implementation, see: - **Thinker**: `vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py` - **Talker**: `vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py` - **Code2Wav**: `vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_code2wav.py` -- **Stage config**: `vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml` +- **Stage config**: `vllm_omni/deploy/qwen3_omni_moe.yaml` - **Input processors**:
`vllm_omni/model_executor/stage_input_processors/qwen3_omni.py` - **Registry**: `vllm_omni/model_executor/models/registry.py` - **Testing**: `vllm_omni/tests/e2e/offline_inference/test_qwen3_omni.py` diff --git a/docs/contributing/model/adding_tts_model.md b/docs/contributing/model/adding_tts_model.md index e48ae5049ff..622064173cd 100644 --- a/docs/contributing/model/adding_tts_model.md +++ b/docs/contributing/model/adding_tts_model.md @@ -28,7 +28,7 @@ and can be placed on different devices. Qwen3-TTS has two stages: Each stage is a separate model class configured independently via YAML. The two stages are connected by the `async_chunk` framework, which enables inter-stage streaming for -low first-packet latency (see [Async Chunk Design](../../design/feature/async_chunk_design.md)). +low first-packet latency (see [Async Chunk Design](../../design/feature/async_chunk.md)). ### Without async_chunk (batch mode) @@ -120,8 +120,18 @@ vllm_omni/model_executor/stage_configs/ | `models/qwen3_tts/qwen3_tts.py` | Unified model class | | `models/qwen3_tts/qwen3_tts_code_predictor_vllm.py` | Stage 0 - optimized AR | | `models/qwen3_tts/qwen3_tts_code2wav.py` | Stage 1 - decoder | -| `stage_configs/qwen3_tts.yaml` | Stage config (async_chunk enabled) | -| `stage_configs/qwen3_tts_batch.yaml` | Batch mode config | +| `deploy/qwen3_tts.yaml` (new schema) | Deploy config (async_chunk enabled) — paired with `models/qwen3_tts/pipeline.py` for the frozen topology | + +> **Chunked vs end-to-end modes**: `qwen3_tts` registers a single +> pipeline whose stage 1 declares alternate processor functions — an +> `async_chunk_process_next_stage_input_func` (per-chunk streaming, used +> when `deploy.async_chunk=True`) and a `sync_process_input_func` +> (batch-end, used when `deploy.async_chunk=False`). The loader selects +> one at merge time based on the bool, so `--no-async-chunk` alone +> switches modes — no variant yaml or variant pipeline registration is +> needed. 
Pipelines that only make sense in one mode (e.g. +> `qwen3_omni_moe` is always chunked) can keep using the unconditional +> `custom_process_*` fields. | `stage_input_processors/qwen3_tts.py` | Stage transition processors | ## Step-by-Step Implementation @@ -574,11 +584,12 @@ Adding a TTS model to vLLM-Omni involves: | `models/qwen3_tts/qwen3_tts.py` | Unified model class | | `models/qwen3_tts/qwen3_tts_code_predictor_vllm.py` | AR stage with vLLM fused ops | | `models/qwen3_tts/qwen3_tts_code2wav.py` | Decoder stage with `chunked_decode_streaming()` | -| `stage_configs/qwen3_tts.yaml` | Stage configuration | +| `models/qwen3_tts/pipeline.py` | Frozen pipeline topology (registered at import time) | +| `deploy/qwen3_tts.yaml` | Deploy config (user-editable, async_chunk + SharedMemoryConnector) | | `stage_input_processors/qwen3_tts.py` | Stage transition processors | For more information, see: - [Architecture Overview](../../design/architecture_overview.md) -- [Async Chunk Design](../../design/feature/async_chunk_design.md) +- [Async Chunk Design](../../design/feature/async_chunk.md) - [Stage Configuration Guide](../../configuration/stage_configs.md) diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 418fb707ae9..6c209e5659a 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -127,10 +127,11 @@ Multi-stage omni serving: ```bash vllm serve Qwen/Qwen2.5-Omni-7B \ --omni \ - --stage-configs-path qwen2_5_omni.yaml \ --port 8091 ``` +(The default deploy config at `vllm_omni/deploy/qwen2_5_omni.yaml` is loaded automatically. Pass `--deploy-config /path/to/custom.yaml` to override.) 
+ Single-stage diffusion serving with torch profiler: ```bash diff --git a/docs/design/feature/async_chunk_design.md b/docs/design/feature/async_chunk.md similarity index 99% rename from docs/design/feature/async_chunk_design.md rename to docs/design/feature/async_chunk.md index 45314a0aec6..57b4209b8df 100644 --- a/docs/design/feature/async_chunk_design.md +++ b/docs/design/feature/async_chunk.md @@ -1,4 +1,4 @@ -# Async Chunk Design +# Async Chunk ## Table of Contents @@ -88,8 +88,9 @@ The following diagram illustrates the **Async Chunk Architecture** for multi-sta

**Diagram Legend:** + | Step | Stage Type | Description | -|:------:|:-----------:|:------------| +|------|-----------|------------| | `prefill` | Initialization | Context processing, KV cache initialization | | `decode` | Autoregressive | Token-by-token generation in AR stages | | `codes` | Audio Encoding | RVQ codec codes from Talker stage | diff --git a/docs/design/feature/teacache.md b/docs/design/feature/teacache.md index 9fa315cee77..8577cff1f05 100644 --- a/docs/design/feature/teacache.md +++ b/docs/design/feature/teacache.md @@ -326,9 +326,41 @@ for prompt in tqdm(prompts, desc="Collecting data"): # Estimate coefficients coeffs = estimator.estimate(poly_order=4) -print(f"Estimated coefficients: {coeffs.tolist()}") +print(f"Estimated coefficients: {coeffs}") ``` +Note: some models require the vLLM context and config to be set up before their vLLM modules can be initialized. In that case, you may need a workaround like the following to run coefficient estimation. +```python +from vllm_omni.diffusion.forward_context import set_forward_context +from vllm_omni.diffusion.distributed.parallel_state import ( + init_distributed_environment, + initialize_model_parallel, +) +from vllm.config import VllmConfig +... + +if __name__ == "__main__": + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "8192" + os.environ["LOCAL_RANK"] = "0" + os.environ["RANK"] = "0" + os.environ["WORLD_SIZE"] = "1" + + vllm_config = VllmConfig() + init_distributed_environment() + initialize_model_parallel() + + # NOTE: you may have to pass an initialized OmniDiffusionConfig as a kwarg + # here to make current sp checks happy; if this is the case, just create one + # .from_kwargs() with the model name to get around this check for now, + # since your estimator subclass should handle the actual model configuration.
+ # + # This will be cleaned up in the future + with set_forward_context(vllm_config): + +``` + + **Data Statistics Guide:** | Metric | Good Range | Warning Signs | diff --git a/docs/design/figures/omni/E2EL_s_vllm_omni_vs_transformers.png b/docs/design/figures/omni/E2EL_s_vllm_omni_vs_transformers.png new file mode 100644 index 00000000000..15112d5862a Binary files /dev/null and b/docs/design/figures/omni/E2EL_s_vllm_omni_vs_transformers.png differ diff --git a/docs/design/figures/omni/Mean_AUDIO_RTF_Baseline_vs_Batch.png b/docs/design/figures/omni/Mean_AUDIO_RTF_Baseline_vs_Batch.png new file mode 100644 index 00000000000..2f0615f77bb Binary files /dev/null and b/docs/design/figures/omni/Mean_AUDIO_RTF_Baseline_vs_Batch.png differ diff --git a/docs/design/figures/omni/Mean_AUDIO_RTF_Batch_CUDA_Graph_vs_Async_Chunk.png b/docs/design/figures/omni/Mean_AUDIO_RTF_Batch_CUDA_Graph_vs_Async_Chunk.png new file mode 100644 index 00000000000..62d8bc79b6b Binary files /dev/null and b/docs/design/figures/omni/Mean_AUDIO_RTF_Batch_CUDA_Graph_vs_Async_Chunk.png differ diff --git a/docs/design/figures/omni/Mean_AUDIO_RTF_Batch_vs_Batch_CUDA_Graph.png b/docs/design/figures/omni/Mean_AUDIO_RTF_Batch_vs_Batch_CUDA_Graph.png new file mode 100644 index 00000000000..5838b45319e Binary files /dev/null and b/docs/design/figures/omni/Mean_AUDIO_RTF_Batch_vs_Batch_CUDA_Graph.png differ diff --git a/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Baseline_vs_Batch.png b/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Baseline_vs_Batch.png new file mode 100644 index 00000000000..24be814b7e9 Binary files /dev/null and b/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Baseline_vs_Batch.png differ diff --git a/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Batch_CUDA_Graph_vs_Async_Chunk.png b/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Batch_CUDA_Graph_vs_Async_Chunk.png new file mode 100644 index 00000000000..c8df58ebcdf Binary files /dev/null and 
b/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Batch_CUDA_Graph_vs_Async_Chunk.png differ diff --git a/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Batch_vs_Batch_CUDA_Graph.png b/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Batch_vs_Batch_CUDA_Graph.png new file mode 100644 index 00000000000..2d1a04e9c2c Binary files /dev/null and b/docs/design/figures/omni/Mean_AUDIO_TTFP_ms_Batch_vs_Batch_CUDA_Graph.png differ diff --git a/docs/design/figures/omni/Mean_E2EL_ms_Baseline_vs_Batch.png b/docs/design/figures/omni/Mean_E2EL_ms_Baseline_vs_Batch.png new file mode 100644 index 00000000000..e598b543431 Binary files /dev/null and b/docs/design/figures/omni/Mean_E2EL_ms_Baseline_vs_Batch.png differ diff --git a/docs/design/figures/omni/Mean_E2EL_ms_Batch_CUDA_Graph_vs_Async_Chunk.png b/docs/design/figures/omni/Mean_E2EL_ms_Batch_CUDA_Graph_vs_Async_Chunk.png new file mode 100644 index 00000000000..54452013eb4 Binary files /dev/null and b/docs/design/figures/omni/Mean_E2EL_ms_Batch_CUDA_Graph_vs_Async_Chunk.png differ diff --git a/docs/design/figures/omni/Mean_E2EL_ms_Batch_vs_Batch_CUDA_Graph.png b/docs/design/figures/omni/Mean_E2EL_ms_Batch_vs_Batch_CUDA_Graph.png new file mode 100644 index 00000000000..04c5ad7396a Binary files /dev/null and b/docs/design/figures/omni/Mean_E2EL_ms_Batch_vs_Batch_CUDA_Graph.png differ diff --git a/docs/design/figures/omni/RTF_vllm_omni_vs_transformers.png b/docs/design/figures/omni/RTF_vllm_omni_vs_transformers.png new file mode 100644 index 00000000000..d93ba0b2af5 Binary files /dev/null and b/docs/design/figures/omni/RTF_vllm_omni_vs_transformers.png differ diff --git a/docs/design/figures/omni/Summary_E2EL_ms_vs_features.png b/docs/design/figures/omni/Summary_E2EL_ms_vs_features.png new file mode 100644 index 00000000000..04087b5910f Binary files /dev/null and b/docs/design/figures/omni/Summary_E2EL_ms_vs_features.png differ diff --git a/docs/design/figures/omni/Summary_RTF_vs_features.png 
b/docs/design/figures/omni/Summary_RTF_vs_features.png new file mode 100644 index 00000000000..c2c8ad40834 Binary files /dev/null and b/docs/design/figures/omni/Summary_RTF_vs_features.png differ diff --git a/docs/design/figures/omni/Summary_TTFP_ms_vs_features.png b/docs/design/figures/omni/Summary_TTFP_ms_vs_features.png new file mode 100644 index 00000000000..3dcc1c55379 Binary files /dev/null and b/docs/design/figures/omni/Summary_TTFP_ms_vs_features.png differ diff --git a/docs/design/figures/omni/TTFP_s_vllm_omni_vs_transformers.png b/docs/design/figures/omni/TTFP_s_vllm_omni_vs_transformers.png new file mode 100644 index 00000000000..9a5b6c9bdaf Binary files /dev/null and b/docs/design/figures/omni/TTFP_s_vllm_omni_vs_transformers.png differ diff --git a/docs/design/figures/tts/Mean_AUDIO_RTF_vllm_omni_vs_transformers.png b/docs/design/figures/tts/Mean_AUDIO_RTF_vllm_omni_vs_transformers.png new file mode 100644 index 00000000000..68f0ef17e88 Binary files /dev/null and b/docs/design/figures/tts/Mean_AUDIO_RTF_vllm_omni_vs_transformers.png differ diff --git a/docs/design/figures/tts/Mean_AUDIO_TTFP_(ms)_vllm_omni_vs_transformers.png b/docs/design/figures/tts/Mean_AUDIO_TTFP_(ms)_vllm_omni_vs_transformers.png new file mode 100644 index 00000000000..44be96e96da Binary files /dev/null and b/docs/design/figures/tts/Mean_AUDIO_TTFP_(ms)_vllm_omni_vs_transformers.png differ diff --git a/docs/design/figures/tts/Mean_E2EL_(ms)_vllm_omni_vs_transformers.png b/docs/design/figures/tts/Mean_E2EL_(ms)_vllm_omni_vs_transformers.png new file mode 100644 index 00000000000..2e5d1482bd7 Binary files /dev/null and b/docs/design/figures/tts/Mean_E2EL_(ms)_vllm_omni_vs_transformers.png differ diff --git a/docs/design/figures/tts/Mean_mean_e2e_ms_baseline_vs_batch.png b/docs/design/figures/tts/Mean_mean_e2e_ms_baseline_vs_batch.png new file mode 100644 index 00000000000..04d8f0bac53 Binary files /dev/null and b/docs/design/figures/tts/Mean_mean_e2e_ms_baseline_vs_batch.png differ 
diff --git a/docs/design/figures/tts/Mean_mean_e2e_ms_batch_vs_cuda_graph.png b/docs/design/figures/tts/Mean_mean_e2e_ms_batch_vs_cuda_graph.png new file mode 100644 index 00000000000..eb85ec0dd4f Binary files /dev/null and b/docs/design/figures/tts/Mean_mean_e2e_ms_batch_vs_cuda_graph.png differ diff --git a/docs/design/figures/tts/Mean_mean_e2e_ms_cuda_graph_vs_async_chunk.png b/docs/design/figures/tts/Mean_mean_e2e_ms_cuda_graph_vs_async_chunk.png new file mode 100644 index 00000000000..6f0e0e2529d Binary files /dev/null and b/docs/design/figures/tts/Mean_mean_e2e_ms_cuda_graph_vs_async_chunk.png differ diff --git a/docs/design/figures/tts/Mean_mean_rtf_baseline_vs_batch.png b/docs/design/figures/tts/Mean_mean_rtf_baseline_vs_batch.png new file mode 100644 index 00000000000..89ea30a8643 Binary files /dev/null and b/docs/design/figures/tts/Mean_mean_rtf_baseline_vs_batch.png differ diff --git a/docs/design/figures/tts/Mean_mean_rtf_batch_vs_cuda_graph.png b/docs/design/figures/tts/Mean_mean_rtf_batch_vs_cuda_graph.png new file mode 100644 index 00000000000..2b207b88987 Binary files /dev/null and b/docs/design/figures/tts/Mean_mean_rtf_batch_vs_cuda_graph.png differ diff --git a/docs/design/figures/tts/Mean_mean_rtf_cuda_graph_vs_async_chunk.png b/docs/design/figures/tts/Mean_mean_rtf_cuda_graph_vs_async_chunk.png new file mode 100644 index 00000000000..f5f7ad72c8f Binary files /dev/null and b/docs/design/figures/tts/Mean_mean_rtf_cuda_graph_vs_async_chunk.png differ diff --git a/docs/design/figures/tts/Mean_mean_ttfp_ms_baseline_vs_batch.png b/docs/design/figures/tts/Mean_mean_ttfp_ms_baseline_vs_batch.png new file mode 100644 index 00000000000..6f8c1da4a5b Binary files /dev/null and b/docs/design/figures/tts/Mean_mean_ttfp_ms_baseline_vs_batch.png differ diff --git a/docs/design/figures/tts/Mean_mean_ttfp_ms_batch_vs_cuda_graph.png b/docs/design/figures/tts/Mean_mean_ttfp_ms_batch_vs_cuda_graph.png new file mode 100644 index 00000000000..b0fe1d02a9d Binary files 
/dev/null and b/docs/design/figures/tts/Mean_mean_ttfp_ms_batch_vs_cuda_graph.png differ diff --git a/docs/design/figures/tts/Mean_mean_ttfp_ms_cuda_graph_vs_async_chunk.png b/docs/design/figures/tts/Mean_mean_ttfp_ms_cuda_graph_vs_async_chunk.png new file mode 100644 index 00000000000..008ba9bf78f Binary files /dev/null and b/docs/design/figures/tts/Mean_mean_ttfp_ms_cuda_graph_vs_async_chunk.png differ diff --git a/docs/design/figures/tts/Summary_mean_e2e_ms_vs_features.png b/docs/design/figures/tts/Summary_mean_e2e_ms_vs_features.png new file mode 100644 index 00000000000..7c65aa11770 Binary files /dev/null and b/docs/design/figures/tts/Summary_mean_e2e_ms_vs_features.png differ diff --git a/docs/design/figures/tts/Summary_mean_rtf_vs_features.png b/docs/design/figures/tts/Summary_mean_rtf_vs_features.png new file mode 100644 index 00000000000..71bb2c54680 Binary files /dev/null and b/docs/design/figures/tts/Summary_mean_rtf_vs_features.png differ diff --git a/docs/design/figures/tts/Summary_mean_ttfp_ms_vs_features.png b/docs/design/figures/tts/Summary_mean_ttfp_ms_vs_features.png new file mode 100644 index 00000000000..cef2546d6fe Binary files /dev/null and b/docs/design/figures/tts/Summary_mean_ttfp_ms_vs_features.png differ diff --git a/docs/design/qwen3_omni_tts_performance_optimization.md b/docs/design/qwen3_omni_tts_performance_optimization.md new file mode 100644 index 00000000000..2f18a1b1bc0 --- /dev/null +++ b/docs/design/qwen3_omni_tts_performance_optimization.md @@ -0,0 +1,539 @@ +# Speech Generation on vLLM-Omni: Performance Optimizations for Qwen3-Omni and Qwen3-TTS + +## Summary + +vLLM-Omni supports end-to-end serving for speech-generating models, including both **Qwen3-Omni** (multimodal understanding + speech) and **Qwen3-TTS** (text-to-speech). Despite their different architectures, both models share the same multi-stage pipeline design and benefit from the same set of stacked optimizations: + +1. 
**Batching** improves GPU utilization stage by stage and increases overall throughput. +2. **CUDA Graph** reduces CPU launch overhead and decode-time jitter on stable shapes. +3. **Async Chunk and Streaming Output** overlap compute and communication across stages and emit audio incrementally, sharply improving TTFP for both models and E2E latency for Qwen3-Omni (for Qwen3-TTS, chunking overhead trades some E2E latency for a much earlier first packet). + +### Model architectures + +**Qwen3-Omni** is a native multimodal model that understands text, audio, image, and video inputs, and generates both text and speech outputs. Its pipeline has three stages: + +- **Thinker**: multimodal understanding and text generation +- **Talker (+ Talker-MTP / code predictor path)**: converts semantic/text representations into codec tokens +- **Code2Wav**: decodes codec tokens into waveform audio + +**Qwen3-TTS** is a lightweight, high-quality text-to-speech model. Its pipeline has two stages: + +- **Talker (AR decoder)**: auto-regressively generates codec tokens from text input +- **Code2Wav (vocoder)**: decodes codec tokens into waveform audio + +The optimizations described in this post apply to both models. We present results for each side by side. + +### vLLM-Omni vs HF Transformers + +Compared with **HF Transformers** (offline, single request), vLLM-Omni with the full optimization stack delivers dramatically lower latency and higher efficiency for both models. + +**Qwen3-Omni** (A100): + + + + + +
Qwen3-Omni E2EL: vLLM vs HFQwen3-Omni TTFP: vLLM vs HFQwen3-Omni RTF: vLLM vs HF
+ +| Metric | vLLM-Omni | HF Transformers | Improvement | +| --- | --- | --- | --- | +| E2E latency (s) | 23.78 | 336.10 | ~93% reduction | +| TTFP (s) | 0.934 | 336.10 | ~99.7% reduction | +| RTF | 0.32 | 3.776 | ~91% reduction (~12× faster) | + +- **E2E latency**: 23.78 s vs 336.10 s - **~93%** reduction +- **TTFP**: 0.934 s vs 336.10 s - **~99.7%** reduction +- **RTF**: 0.32 vs 3.776 - **~91%** reduction (~12x faster) + +**Qwen3-TTS** (H200, concurrency 1): + + + + + +
Qwen3-TTS E2EL: vLLM vs HFQwen3-TTS TTFP: vLLM vs HFQwen3-TTS RTF: vLLM vs HF
+ +| Metric | vLLM-Omni | HF Transformers | Improvement | +| --- | --- | --- | --- | +| E2E latency (ms) | 941 | 15,513 | ~94% reduction | +| TTFP (ms) | 64 | 15,513 | ~99.6% reduction (242× faster) | +| RTF | 0.16 | 2.64 | ~94% reduction (~16.5× faster) | + +- **E2E latency**: 941 ms vs 15,513 ms - **~94%** reduction +- **TTFP**: 64 ms vs 15,513 ms - **~99.6%** reduction (242x faster) +- **RTF**: 0.16 vs 2.64 - **~94%** reduction (~16.5x faster) + +### Stacked optimization summary + +Each optimization stacks on the previous one. The summary plots below show the cumulative effect at each step, with one line per concurrency level (1, 4, 10). + +**Qwen3-Omni** (A100): + + + + + +
Qwen3-Omni E2EL: stacked optimizationQwen3-Omni TTFP: stacked optimizationQwen3-Omni RTF: stacked optimization
+ +- **E2EL reduction**: ~74% at concurrency 10 (410,054 ms -> 104,901 ms); ~90% at concurrency 1 (426,529 ms -> 41,216 ms) +- **TTFP reduction**: ~96% at concurrency 10 (409,705 ms -> 16,482 ms); ~99.7% at concurrency 1 (426,078 ms -> 1,164 ms) +- **RTF reduction**: ~74% at concurrency 10 (2.83 -> 0.74); ~90% at concurrency 1 (2.08 -> 0.21) + +**Qwen3-TTS** (H200): + + + + + +
Qwen3-TTS E2EL: stacked optimizationQwen3-TTS TTFP: stacked optimizationQwen3-TTS RTF: stacked optimization
+ +- **E2EL reduction**: ~85% at concurrency 10 (12,141 ms -> 1,767 ms); ~29% at concurrency 1 (1,323 ms -> 941 ms) +- **TTFP reduction**: ~96.5% at concurrency 10 (12,141 ms -> 425 ms); ~95% at concurrency 1 (1,323 ms -> 64 ms) +- **RTF reduction**: ~86% at concurrency 10 (2.19 -> 0.31); ~30% at concurrency 1 (0.23 -> 0.16) + +**Benchmark environment:** + +| | Qwen3-Omni | Qwen3-TTS | +| --- |-----------------------------| --- | +| **GPU** | A100 | H200 | +| **Model** | Qwen3-Omni-30B-A3B-Instruct | Qwen3-TTS-12Hz-1.7B-CustomVoice | +| **vLLM** | v0.17.0 | v0.18.0 | +| **vllm-omni** | commit 199f7832 | v0.18.0rc2 | +| **CUDA** | 12.9 | 12.8 | + +This post walks through each optimization in the same order they are typically enabled in practice, then ends with deployment playbooks for both models. + +--- + +## Pipeline Batching + +### How stage-wise batching works + +For both Qwen3-Omni and Qwen3-TTS, batching is a pipeline-level optimization: + +- Requests are grouped per stage using `runtime.max_batch_size` +- Each stage executes batch inference with its own scheduler/worker +- Stage outputs are routed to downstream stages with per-request mapping preserved + +**Batching strategy by stage:** The understanding and decode stages (Thinker for Omni, Talker for both) use **continuous batching**: requests can join and leave the batch over time. Code2Wav uses **static batching**: once a batch is formed, the stage runs the whole batch before starting the next. This matches the decode pattern of Code2Wav and keeps implementation simple while still improving throughput. + +### Batching results (Baseline vs. Batch) + +Batching alone reduces E2EL and RTF in nearly every configuration measured. The biggest gains appear at high concurrency, where requests share GPU resources; the one exception is Qwen3-TTS at concurrency 1, where a single request leaves nothing to batch and the numbers are essentially unchanged. + +**Qwen3-Omni** (A100): + + + + + +
Qwen3-Omni E2EL: Baseline vs BatchQwen3-Omni TTFP: Baseline vs BatchQwen3-Omni RTF: Baseline vs Batch
+ +| Metric | Concurrency | Baseline | + Batch | Improvement | +| --- | --- | --- | --- | --- | +| E2EL (ms) | 1 | 426,529 | 307,719 | 1.4× | +| E2EL (ms) | 4 | 407,213 | 376,934 | 1.1× | +| E2EL (ms) | 10 | 410,054 | 234,844 | 1.7× | +| TTFP (ms) | 1 | 426,078 | 307,262 | 1.4× | +| TTFP (ms) | 4 | 406,843 | 376,466 | 1.1× | +| TTFP (ms) | 10 | 409,705 | 234,557 | 1.7× | +| RTF | 1 | 2.08 | 1.51 | 1.4× | +| RTF | 4 | 2.55 | 1.83 | 1.4× | +| RTF | 10 | 2.83 | 2.28 | 1.2× | + +At concurrency 10, E2EL drops from ~410 s to ~235 s; at concurrency 1, from ~427 s to ~308 s. + +**Qwen3-TTS** (H200): + + + + + +
Qwen3-TTS E2EL: Baseline vs BatchQwen3-TTS TTFP: Baseline vs BatchQwen3-TTS RTF: Baseline vs Batch
+ +| Metric | Concurrency | Baseline | + Batch | Improvement | +| --- | --- | --- | --- | --- | +| E2EL (ms) | 1 | 1,323 | 1,339 | 1.0× | +| E2EL (ms) | 4 | 5,171 | 1,471 | 3.5× | +| E2EL (ms) | 10 | 12,141 | 1,705 | 7.1× | +| RTF | 1 | 0.230 | 0.234 | 1.0× | +| RTF | 4 | 0.908 | 0.255 | 3.6× | +| RTF | 10 | 2.186 | 0.292 | 7.5× | +| Throughput (audio-s/wall-s) | 10 | 3.99 | 33.53 | 8.4× | + +At concurrency 10, batching alone brings Qwen3-TTS RTF from 2.19 (slower than realtime) down to 0.29 (faster than realtime), and throughput from 4.0 to 33.5 audio-sec/wall-sec. + +--- + +## CUDA Graph on the Critical Decode Path + +### Why CUDA Graph helps here + +In decode-heavy serving, repeatedly launching many small kernels from CPU can become a visible overhead. CUDA Graph reduces this overhead by capturing and replaying stable execution graphs. + +In stage configs, this is represented by `enforce_eager: false` for stages where graph capture is desired (Thinker/Talker), while Code2Wav keeps eager mode depending on stage behavior. + +### CUDA Graph results on top of batching + +**Qwen3-Omni** (A100): + + + + + +
Qwen3-Omni E2EL: Batch vs CUDA GraphQwen3-Omni TTFP: Batch vs CUDA GraphQwen3-Omni RTF: Batch vs CUDA Graph
+ +| Metric | Concurrency | Batch | + CUDA Graph | Improvement | +| --- | --- | --- | --- | --- | +| E2EL (ms) | 1 | 307,719 | 61,613 | 5.0× | +| E2EL (ms) | 4 | 376,934 | 79,019 | 4.8× | +| E2EL (ms) | 10 | 234,844 | 126,867 | 1.9× | +| TTFP (ms) | 1 | 307,262 | 61,257 | 5.0× | +| TTFP (ms) | 4 | 376,466 | 78,634 | 4.8× | +| TTFP (ms) | 10 | 234,557 | 126,534 | 1.9× | +| RTF | 1 | 1.51 | 0.32 | 4.7× | +| RTF | 4 | 1.83 | 0.43 | 4.3× | +| RTF | 10 | 2.28 | 0.90 | 2.5× | + +For the larger Qwen3-Omni model (30B-A3B), CUDA Graph provides a significant improvement. At concurrency 1, E2EL drops from ~308 s to ~62 s; at concurrency 10, from ~235 s to ~127 s. + +**Qwen3-TTS** (H200): + + + + + +
TTS E2EL: Batch vs +CGTTS TTFP: Batch vs +CGTTS RTF: Batch vs +CG
+ +| Metric | Concurrency | Batch | + CUDA Graph | Improvement | +| --- | --- | --- | --- | --- | +| E2EL (ms) | 1 | 1,339 | 733 | 1.8× | +| E2EL (ms) | 4 | 1,471 | 987 | 1.5× | +| E2EL (ms) | 10 | 1,705 | 1,197 | 1.4× | +| RTF | 1 | 0.234 | 0.124 | 1.9× | +| RTF | 10 | 0.292 | 0.203 | 1.4× | +| Throughput (audio-s/wall-s) | 10 | 33.53 | 47.15 | 1.4× | + +At concurrency 1, CUDA Graph reduces E2EL from 1,339 ms to 733 ms and RTF from 0.234 to 0.124 - nearly a 2x improvement. The benefit is consistent across all concurrency levels. + +--- + +## Async Chunk and Streaming Output: Earlier Audio and Cross-Stage Overlap + +### Why this step matters for first-packet latency + +Two mechanisms work together to improve user-visible latency: + +- **Streaming output**: audio streaming emits audio chunks as soon as they are decoded (lower **TTFP**). Without streaming, the client waits for larger buffers or end-of-sequence. +- **Async chunk** is the main enabler for *earlier* audio: instead of handing off whole-request results between stages, each stage forwards **chunks** so the next stage can start as soon as the first chunk is ready. For Omni: Thinker -> Talker forwards hidden-state chunks; for both: Talker -> Code2Wav forwards codec chunks; Code2Wav decodes and emits packets incrementally. This **overlaps compute and communication** across stages and directly reduces time-to-first-audio-packet (TTFP) and end-to-end latency (E2EL). + +So in practice: streaming output defines *how* bytes are sent to the client; async chunk defines *when* the pipeline can produce the first bytes. + +**Dependency between the two:** Async chunk and audio streaming output are mutually dependent. Without async chunk, **audio streaming output cannot truly take effect**. Without audio streaming output, async chunk's **TTFP advantage is not fully realized**: the client would still wait for larger buffers or end-of-sequence instead of hearing the first packet as soon as it is ready. 
We therefore recommend enabling **both** on top of batching + CUDA Graph; the benchmarks in this post use both. + +### Results: Batch + CUDA Graph vs. Batch + CUDA Graph + Async Chunk + Streaming Output + +**Qwen3-Omni** (A100): + + + + + +
Qwen3-Omni E2EL: CG vs Async ChunkQwen3-Omni TTFP: CG vs Async ChunkQwen3-Omni RTF: CG vs Async Chunk
+ +| Metric | Concurrency | Batch + CG | + Async Chunk | Improvement | +| --- | --- | --- | --- | --- | +| E2EL (ms) | 1 | 61,613 | 41,216 | 1.5× | +| E2EL (ms) | 4 | 79,019 | 67,584 | 1.2× | +| E2EL (ms) | 10 | 126,867 | 104,901 | 1.2× | +| TTFP (ms) | 1 | 61,257 | 1,164 | 53× | +| TTFP (ms) | 4 | 78,634 | 3,152 | 24.9× | +| TTFP (ms) | 10 | 126,534 | 16,482 | 7.7× | +| RTF | 1 | 0.32 | 0.21 | 1.5× | +| RTF | 4 | 0.43 | 0.34 | 1.3× | +| RTF | 10 | 0.90 | 0.74 | 1.2× | + +Enabling both brings TTFP down sharply (concurrency 1: 61,257 ms -> 1,164 ms, **~98% reduction**; concurrency 4: 78,634 ms -> 3,152 ms, **~96% reduction**). E2EL and RTF also improve at every concurrency. + +**Qwen3-TTS** (H200): + + + + + +
Qwen3-TTS E2EL: CG vs Async ChunkQwen3-TTS TTFP: CG vs Async ChunkQwen3-TTS RTF: CG vs Async Chunk
+ +| Metric | Concurrency | Batch + CG | + Async Chunk | Improvement | +| --- | --- | --- | --- | --- | +| TTFP (ms) | 1 | 733 | **64** | **11.5×** | +| TTFP (ms) | 4 | 987 | **119** | **8.3×** | +| TTFP (ms) | 10 | 1,197 | **425** | **2.8×** | +| E2EL (ms) | 1 | 733 | 941 | 0.8× | +| E2EL (ms) | 10 | 1,197 | 1,767 | 0.7× | +| RTF | 1 | 0.124 | 0.160 | 0.8× | +| RTF | 10 | 0.203 | 0.314 | 0.6× | + +The TTFP improvement is the headline result for both models. For Qwen3-TTS at concurrency 1, users hear the first audio in **64 ms** instead of 733 ms - an **11.5x reduction**. For Qwen3-Omni at concurrency 1, TTFP drops from 61 s to 1.2 s - a **53x reduction**. + +### Why E2EL and RTF are higher with async chunk (TTS) + +The table above shows that enabling async chunk + streaming *increases* E2EL and RTF for TTS compared to CUDA Graph alone. This is expected - the two configurations optimize for fundamentally different metrics: + +- **CUDA Graph (no async chunk)** generates the entire audio end-to-end before returning. No chunking overhead, so total compute is minimized. +- **Async Chunk + Streaming** splits the pipeline into incremental chunks, adding overhead from chunked transport, context overlap in Code2Wav (`codec_left_context_frames=25`), and smaller effective batch sizes per chunk. + +**The tradeoff is intentional.** Async chunk trades ~30% higher total compute for **11x faster time-to-first-audio**. For interactive applications (voice assistants, chatbots), TTFP determines perceived responsiveness. For offline batch processing, CUDA Graph without async chunk is the better choice. + +--- + +## TTS-Specific: Code Predictor Re-prefill + `torch.compile` + +Qwen3-TTS has a **code predictor** - a small 5-layer transformer that generates residual codebook tokens (groups 1 through Q-1) autoregressively. Each AR step operates on very short sequences (2 to ~16 tokens). + +The naive approach uses a KV cache for this small transformer, similar to the main Talker. 
But the KV cache machinery (block tables, slot mappings, paged attention) introduces significant overhead relative to the tiny model. Two optimizations replace that: + +### Re-prefill (stateless forward, no KV cache) + +Instead of maintaining a KV cache across steps, the code predictor **re-feeds the full growing sequence** at each AR step using `F.scaled_dot_product_attention`. With sequences of at most ~16 tokens through 5 layers, the O(T^2) attention cost is negligible - and removing the KV cache machinery (block table management, `set_forward_context`, slot mapping) saves far more time than it costs. + +### `torch.compile` on the code predictor forward + +The 5-layer transformer forward pass launches ~60 small CUDA kernels per step. `torch.compile(mode="default", dynamic=True)` fuses these into fewer kernels via Inductor: + +```python +self._compiled_model_fwd = torch.compile( + self.model.forward, + mode="default", # no Inductor CUDA graphs, avoids conflict with vLLM's CUDAGraphWrapper + dynamic=True, # sequence length grows each step (2, 3, ..., num_groups+1) +) +``` + +`mode="default"` is used instead of `mode="reduce-overhead"` to avoid conflicts with vLLM's own CUDA graph capture on the main Talker model. `dynamic=True` handles the growing sequence length without recompilation. + +These optimizations are always-on in the current codebase - all Qwen3-TTS benchmark results in this post include them. + +--- + +## TTS-Specific: Dynamic Initial Chunk for Faster First Audio + +In the async chunk pipeline, the standard `codec_chunk_frames` is 25 (each chunk = ~2 seconds of audio at 12 Hz). Waiting for 25 frames before forwarding the first chunk to Code2Wav adds unnecessary TTFP. The **initial codec chunk** optimization sends a smaller first chunk so Code2Wav can start decoding earlier. + +**Dynamic initial chunk sizing (default behavior):** + +Rather than using a fixed initial chunk size, vLLM-Omni dynamically selects it based on current server load. 
The initial chunk size is chosen from power-of-2 steps [2, 4, 8, 16] based on load factor (`active_requests / max_batch_size`): + +| Server load | Initial chunk frames | Rationale | +| --- | --- | --- | +| Low (e.g. 1/10 active) | **2** (~167 ms of audio) | Minimize TTFP when there's headroom | +| Medium (e.g. 5/10 active) | **4-8** | Balance TTFP vs decode efficiency | +| High (e.g. 10/10 active) | **16** | Larger first chunk to amortize decode cost | + +After the initial chunk, all subsequent chunks use the standard `codec_chunk_frames` (25) size. + +**How it works in the pipeline:** + +1. Talker generates codec tokens auto-regressively +2. The stage input processor checks current load and picks an initial chunk size (e.g. **2 frames** at low load) +3. After that many frames, the first chunk is forwarded to Code2Wav +4. Code2Wav decodes this small chunk and emits the first audio packet +5. Subsequent chunks use the standard 25-frame size for efficient batch decoding + +**Per-request override:** Clients can also set a fixed initial chunk size via the API: + +```json +{"initial_codec_chunk_frames": 2} +``` + +This overrides the dynamic calculation for that request. + +**Config (server-side):** + +```yaml +runtime: + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + codec_streaming: true + codec_chunk_frames: 25 # standard chunk size (~2s of audio) + codec_left_context_frames: 25 + # initial chunk is computed dynamically by default + # set initial_codec_chunk_frames: 2 to force a fixed value +``` + +The 64 ms TTFP result reported above for Qwen3-TTS at concurrency 1 uses the dynamic initial chunk, which picks `initial_codec_chunk_frames=2` at low load. At higher concurrency the dynamic sizing increases the initial chunk to maintain decode efficiency. 
+ +--- + +## Live Demo: Streaming TTS over WebSocket + +vLLM-Omni supports real-time streaming audio output for Qwen3-TTS over WebSocket ([PR #1719](https://github.com/vllm-project/vllm-omni/pull/1719)). With `stream_audio: true`, the server sends chunked PCM audio frames as they are generated, so clients can start playback before full sentence synthesis completes. + +The WebSocket protocol uses `audio.start` / binary PCM chunks / `audio.done` framing per sentence: + +```json +// Client sends: +{"type":"session.config","voice":"Vivian","response_format":"pcm","stream_audio":true} +{"type":"input.text","text":"Hello world. This is a streaming demo."} +{"type":"input.done"} + +// Server streams back per sentence: +{"type":"audio.start","sentence_index":0,"sentence_text":"Hello world.","format":"pcm","sample_rate":24000} + + +... +{"type":"audio.done","sentence_index":0,"total_bytes":96000,"error":false} +{"type":"audio.start","sentence_index":1,"sentence_text":"This is a streaming demo.","format":"pcm","sample_rate":24000} + +... +{"type":"audio.done","sentence_index":1,"total_bytes":72000,"error":false} +{"type":"session.done","total_sentences":2} +``` + + + +--- + +## Deployment Playbook + +### Qwen3-Omni + +#### 1) Serve with the default 3-stage config + +```bash +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct \ + --omni \ + --port 8091 +``` + +Notes: + +- `runtime.max_batch_size` controls stage-level batching. +- Thinker/Talker commonly use `enforce_eager: false` for CUDA Graph paths. +- Code2Wav often remains eager (`enforce_eager: true`) depending on runtime behavior. 
+ +#### 2) Enable async chunk + +```bash +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct \ + --omni \ + --port 8091 \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml +``` + +#### 3) Key config knobs + +```yaml +async_chunk: true +stage_args: + - stage_id: 0 # thinker + runtime: + max_batch_size: 64 + engine_args: + enforce_eager: false + max_num_batched_tokens: 32768 + custom_process_next_stage_input_func: >- + vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk + + - stage_id: 1 # talker + runtime: + max_batch_size: 64 + engine_args: + enforce_eager: false + max_num_batched_tokens: 32768 + custom_process_next_stage_input_func: >- + vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk + + - stage_id: 2 # code2wav + runtime: + max_batch_size: 64 + engine_args: + enforce_eager: true + max_num_batched_tokens: 51200 +``` + +#### Reproduce Qwen3-Omni benchmarks + +```bash +vllm bench serve \ + --dataset-name random \ + --port ${PORT} \ + --model ${MODEL_PATH} \ + --endpoint /v1/chat/completions \ + --backend openai-chat-omni \ + --max-concurrency ${MAX_CONCURRENCY} \ + --num-prompts ${NUM_PROMPTS} \ + --random-input-len 2500 \ + --ignore-eos \ + --percentile-metrics ttft,tpot,itl,e2el,audio_ttfp,audio_rtf \ + --random-output-len 900 \ + --extra_body '{"modalities": ["text","audio"]}' +``` + +### Qwen3-TTS + +#### 1) Serve with async chunk (recommended) + +```bash +vllm-omni serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ + --omni \ + --port 8000 +``` + +The default config (`qwen3_tts.yaml`) enables the full optimization stack: + +- Batching with `max_batch_size: 10` on the Talker stage +- CUDA Graph on the Talker (`enforce_eager: false`) +- Async chunk with streaming transport + +#### 2) Serve without async chunk (for comparison) + +```bash +vllm-omni serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ + --omni \ + --port 8000 \ + --stage-configs-path 
vllm_omni/model_executor/stage_configs/qwen3_tts_no_async_chunk.yaml +``` + +#### 3) Key config knobs + +```yaml +async_chunk: true +stage_args: + - stage_id: 0 # Talker (AR decoder) + runtime: + max_batch_size: 10 + engine_args: + enforce_eager: false + max_num_batched_tokens: 512 + custom_process_next_stage_input_func: >- + vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk + + - stage_id: 1 # Code2Wav (vocoder) + runtime: + max_batch_size: 1 + engine_args: + enforce_eager: true + max_num_batched_tokens: 8192 + +runtime: + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + codec_streaming: true + codec_chunk_frames: 25 + codec_left_context_frames: 25 +``` + +#### Reproduce Qwen3-TTS benchmarks + +```bash +GPU_DEVICE=0 \ +MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ +NUM_PROMPTS=50 \ +CONCURRENCY="1 4 10" \ +bash benchmarks/qwen3-tts/vllm_omni/run_stacked_benchmark.sh +``` + +This cycles through four configs (Baseline -> + Batch -> + CUDA Graph -> + Async Chunk + Streaming), benchmarks each at the specified concurrency levels, and generates all comparison figures automatically. 
diff --git a/docs/serving/speech_api.md b/docs/serving/speech_api.md index ecbe8d9ac98..733811081a7 100644 --- a/docs/serving/speech_api.md +++ b/docs/serving/speech_api.md @@ -15,7 +15,7 @@ Each server instance runs a single model (specified at startup via `vllm serve < ```bash # Qwen3-TTS: CustomVoice model (predefined speakers) vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ @@ -300,7 +300,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \ ```bash # Start server with VoiceDesign model first vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ @@ -322,7 +322,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \ ```bash # Start server with Base model first vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-Base \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ @@ -517,15 +517,16 @@ for result in response.json()["results"]: All items are fanned out to `generate()` concurrently. The engine's stage worker automatically batches them up to the configured `max_batch_size` and queues the rest — no client-side throttling needed. 
-For best throughput, use a batch-optimized stage config with `max_batch_size > 1`: +For best throughput, set both stages' `max_num_seqs` to ≥4 via `--stage-overrides`: ```bash vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml \ - --omni --port 8091 --trust-remote-code --enforce-eager + --omni --port 8091 --trust-remote-code --enforce-eager \ + --stage-overrides '{"0":{"max_num_seqs":4,"gpu_memory_utilization":0.2}, + "1":{"max_num_seqs":4,"gpu_memory_utilization":0.2}}' ``` -The default `qwen3_tts.yaml` uses `max_batch_size: 1` (single request). The `qwen3_tts_batch.yaml` config sets `max_batch_size: 4` for ~4x throughput. +The bundled `qwen3_tts.yaml` uses `max_num_seqs: 1` (single request) on both stages. Bumping to 4 yields roughly 4× throughput on the talker and lets stage 1 batch chunks across in-flight requests. ## Supported Models @@ -617,7 +618,7 @@ Enable debug logging: ```bash vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ diff --git a/docs/source/architecture/async-chunk-architecture.png b/docs/source/architecture/async-chunk-architecture.png index 249de53bfe3..7b3e95e4df9 100644 Binary files a/docs/source/architecture/async-chunk-architecture.png and b/docs/source/architecture/async-chunk-architecture.png differ diff --git a/docs/source/architecture/vllm-omni-dataflow-between-stages.png b/docs/source/architecture/vllm-omni-dataflow-between-stages.png index cdbc9a8b7b3..74abc81ff07 100644 Binary files a/docs/source/architecture/vllm-omni-dataflow-between-stages.png and b/docs/source/architecture/vllm-omni-dataflow-between-stages.png differ diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index 4e7003cce37..7bdeede446a 100644 --- 
a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -115,8 +115,8 @@ The following tables show which models support each feature: | **FLUX.2-dev** | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **GLM-Image** | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **HunyuanImage3** | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | -| **LongCat-Image** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | -| **LongCat-Image-Edit** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | +| **LongCat-Image** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | +| **LongCat-Image-Edit** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | | **MagiHuman** | ❌ | ❌ | ❌ | ❓ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | | **MammothModa2(T2I)** | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | **Nextstep_1(T2I)** | ❓ | ❓ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | @@ -140,10 +140,10 @@ The following tables show which models support each feature: |-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:| | **Wan2.2** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (encode/decode) | ❌ | ❌ | | **Wan2.1-VACE** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ❌ | ❌ | -| **LTX-2** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **LTX-2** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | | **Helios** | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | | **HunyuanVideo-1.5 T2V I2V** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ❌ | -| **DreamID-Omni** | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **DreamID-Omni** | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | **Frame Interpolation Support** diff --git a/docs/user_guide/examples/offline_inference/bagel.md b/docs/user_guide/examples/offline_inference/bagel.md index e6266868722..1fb4d404578 100644 --- a/docs/user_guide/examples/offline_inference/bagel.md +++ b/docs/user_guide/examples/offline_inference/bagel.md @@ -176,8 +176,6 @@ Example configuration for TP=2 on GPUs 0 and 1: | Parameter | Value | Description | | 
:-------------------- | :------ | :------------------------------- | -| `window_size` | `-1` | Window size (-1 means unlimited) | -| `max_inflight` | `1` | Maximum inflight requests | | `shm_threshold_bytes` | `65536` | Shared memory threshold (64KB) | ## Using Mooncake Connector diff --git a/docs/user_guide/examples/offline_inference/qwen3_tts.md b/docs/user_guide/examples/offline_inference/qwen3_tts.md index 4ece5219d7f..7226ac1fe4b 100644 --- a/docs/user_guide/examples/offline_inference/qwen3_tts.md +++ b/docs/user_guide/examples/offline_inference/qwen3_tts.md @@ -144,13 +144,13 @@ completes. This demonstrates that audio data is available progressively rather t ## Batched Decoding -The Code2Wav stage (stage 1) supports batched decoding, where multiple requests are decoded in a single forward pass through the SpeechTokenizer. To use it, provide a stage config with `max_num_seqs > 1` and pass multiple prompts via `--txt-prompts` with a matching `--batch-size`. +The Code2Wav stage (stage 1) supports batched decoding, where multiple requests are decoded in a single forward pass through the SpeechTokenizer. To use it, set `max_num_seqs > 1` on both stages via `--stage-overrides` and pass multiple prompts via `--txt-prompts` with a matching `--batch-size`. ``` python end2end.py --query-type CustomVoice \ --txt-prompts benchmark_prompts.txt \ --batch-size 4 \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml + --stage-overrides '{"0":{"max_num_seqs":4,"gpu_memory_utilization":0.2},"1":{"max_num_seqs":4,"gpu_memory_utilization":0.2}}' ``` **Important:** `--batch-size` must match a CUDA graph capture size (1, 2, 4, 8, 16...) because the Talker's code predictor KV cache is sized to `max_num_seqs`, and CUDA graphs pad the batch to the next capture size. Both stages need `max_num_seqs >= batch_size` in the stage config for batching to take effect. 
If only stage 1 has a higher `max_num_seqs`, it won't help — stage 1 can only batch chunks from requests that are in-flight simultaneously, which requires stage 0 to also process multiple requests concurrently. diff --git a/docs/user_guide/examples/online_serving/qwen3_omni.md b/docs/user_guide/examples/online_serving/qwen3_omni.md index 6f6d9ae4a9d..611eb6fd3fc 100644 --- a/docs/user_guide/examples/online_serving/qwen3_omni.md +++ b/docs/user_guide/examples/online_serving/qwen3_omni.md @@ -18,12 +18,12 @@ vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 If you want to open async chunking for qwen3-omni, launch the server with command below ```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --deploy-config /vllm_omni/deploy/qwen3_omni_moe.yaml ``` If you have custom stage configs file, launch the server with command below ```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --deploy-config /path/to/deploy_config_file ``` ### Send Multi-modal Request @@ -187,7 +187,7 @@ The script supports the following arguments: - `--model`: Model name/path (default: Qwen/Qwen3-Omni-30B-A3B-Instruct) - `--server-port`: Port for vLLM server (default: 8091) - `--gradio-port`: Port for Gradio demo (default: 7861) -- `--stage-configs-path`: Path to custom stage configs YAML file (optional) +- `--deploy-config`: Path to custom deploy config YAML file (optional) - `--server-host`: Host for vLLM server (default: 0.0.0.0) - `--gradio-ip`: IP for Gradio demo (default: 127.0.0.1) - `--share`: Share Gradio demo publicly (creates a public link) @@ -202,7 +202,7 @@ vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 If you have custom stage configs file: ```bash 
-vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --deploy-config /path/to/deploy_config_file ``` **Step 2: Run the Gradio demo** diff --git a/docs/user_guide/examples/online_serving/qwen3_tts.md b/docs/user_guide/examples/online_serving/qwen3_tts.md index 4e632d4c288..95f234f02de 100644 --- a/docs/user_guide/examples/online_serving/qwen3_tts.md +++ b/docs/user_guide/examples/online_serving/qwen3_tts.md @@ -58,7 +58,7 @@ Then open http://localhost:7860 in your browser. ```bash # CustomVoice model (predefined speakers) vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ @@ -66,7 +66,7 @@ vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ # VoiceDesign model vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ @@ -74,7 +74,7 @@ vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign \ # Base model (voice cloning) vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-Base \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --omni \ --port 8091 \ --trust-remote-code \ diff --git a/examples/offline_inference/bagel/README.md b/examples/offline_inference/bagel/README.md index 48517b1cda0..3e653d0e3ab 100644 --- a/examples/offline_inference/bagel/README.md +++ b/examples/offline_inference/bagel/README.md @@ -173,8 +173,6 @@ Example configuration for TP=2 on GPUs 0 and 1: | Parameter | Value | Description | | :-------------------- | :------ | :------------------------------- | -| `window_size` | `-1` | Window size (-1 means unlimited) | -| `max_inflight` | 
`1` | Maximum inflight requests | | `shm_threshold_bytes` | `65536` | Shared memory threshold (64KB) | ## Using Mooncake Connector diff --git a/examples/offline_inference/ming_flash_omni/README.md b/examples/offline_inference/ming_flash_omni/README.md new file mode 100644 index 00000000000..7414163fc01 --- /dev/null +++ b/examples/offline_inference/ming_flash_omni/README.md @@ -0,0 +1,76 @@ +# Ming-flash-omni 2.0 + +[Ming-flash-omni-2.0](https://github.com/inclusionAI/Ming) is an omni-modal model supporting text, image, video, and audio understanding, with outputs in text, image, and audio. For now, vLLM-Omni supports only the thinker stage (multi-modal understanding) of Ming-flash-omni-2.0. + +## Setup + +Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup. + +## Run examples + +### Text-only +```bash +python examples/offline_inference/ming_flash_omni/end2end.py --query-type text +``` + +#### Reasoning (Thinking Mode) + +Reasoning (Thinking) mode is enabled by applying "detailed thinking on" when building the system prompt template (in `apply_chat_template`).
+ +In the end2end example, a default problem for thinking mode is provided, taken from the example usage in Ming's cookbook. +To use it, download the example figure from https://github.com/inclusionAI/Ming/blob/3954fcb880ff5e61ff128bcf7f1ec344d46a6fe3/figures/cases/3_0.png + +```bash +python examples/offline_inference/ming_flash_omni/end2end.py -q reasoning --image-path ./3_0.png +``` + +### Image understanding +```bash +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_image + +# With a local image +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_image --image-path /path/to/image.jpg +``` + +### Audio understanding +```bash +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_audio + +# With a local audio file +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_audio --audio-path /path/to/audio.wav +``` + +### Video understanding +```bash +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_video + +# With a local video and custom frame count +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_video --video-path /path/to/video.mp4 --num-frames 16 +``` + +### Mixed modalities (image + audio) +```bash +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_mixed_modalities \ + --image-path /path/to/image.jpg \ + --audio-path /path/to/audio.wav +``` + +If media file paths are not provided, the script uses built-in default assets. + +### Modality control +To control output modalities (e.g.
text-only output): +```bash +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_audio --modalities text +``` + +*For now, only text output is supported* + +### Custom stage config +```bash +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_image \ + --stage-configs-path /path/to/your_config.yaml +``` + +## Online serving + +For online serving via the OpenAI-compatible API, see [examples/online_serving/ming_flash_omni/README.md](../../online_serving/ming_flash_omni/README.md). diff --git a/examples/offline_inference/ming_flash_omni/end2end.py b/examples/offline_inference/ming_flash_omni/end2end.py new file mode 100644 index 00000000000..49cdbcc0186 --- /dev/null +++ b/examples/offline_inference/ming_flash_omni/end2end.py @@ -0,0 +1,485 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Partial example cases are referred from +# https://github.com/inclusionAI/Ming/blob/3954fcb880ff5e61ff128bcf7f1ec344d46a6fe3/cookbook.ipynb +import os +import time +from typing import NamedTuple + +import librosa +import numpy as np +import vllm +from PIL import Image +from transformers import AutoProcessor +from vllm import SamplingParams +from vllm.assets.audio import AudioAsset +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset, video_to_ndarrays +from vllm.multimodal.image import convert_image_mode +from vllm.utils.argparse_utils import FlexibleArgumentParser + +import vllm_omni +from vllm_omni.entrypoints.omni import Omni + +# Imports the processor also registers itself +from vllm_omni.transformers_utils.processors.ming import MingFlashOmniProcessor # noqa: F401 + +SEED = 42 +MODEL_NAME = "Jonathan1909/Ming-flash-omni-2.0" + + +class QueryResult(NamedTuple): + inputs: dict + limit_mm_per_prompt: dict[str, int] + + +def get_text_query(processor: MingFlashOmniProcessor, question: str | None = None) -> QueryResult: + if question is 
None: + question = "请详细介绍鹦鹉的生活习性。" + conversation = [{"role": "HUMAN", "content": question}] + prompt = processor.apply_chat_template(conversation, tokenize=False) + return QueryResult( + inputs={"prompt": prompt}, + limit_mm_per_prompt={}, + ) + + +def get_image_query( + processor: MingFlashOmniProcessor, + question: str | None = None, + image_path: str | None = None, +) -> QueryResult: + if question is None: + question = "Describe this image in detail." + + if image_path: + if not os.path.exists(image_path): + raise FileNotFoundError(f"Image file not found: {image_path}") + image_data = convert_image_mode(Image.open(image_path), "RGB") + else: + image_data = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") + + conversation = [ + { + "role": "HUMAN", + "content": [ + {"type": "image", "image": image_data}, + {"type": "text", "text": question}, + ], + } + ] + prompt = processor.apply_chat_template(conversation, tokenize=False) + + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": {"image": image_data}, + }, + limit_mm_per_prompt={"image": 1}, + ) + + +def get_audio_query( + processor: MingFlashOmniProcessor, + question: str | None = None, + audio_path: str | None = None, + sampling_rate: int = 16000, +) -> QueryResult: + if question is None: + question = "Please recognize the language of this speech and transcribe it. Format: oral." 
+ + if audio_path: + if not os.path.exists(audio_path): + raise FileNotFoundError(f"Audio file not found: {audio_path}") + audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) + audio_data = (audio_signal.astype(np.float32), sr) + else: + audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate + + # Use a string for "audio" so the processor counts it as 1 audio input + conversation = [ + { + "role": "HUMAN", + "content": [ + {"type": "audio", "audio": "input"}, + {"type": "text", "text": question}, + ], + } + ] + prompt = processor.apply_chat_template(conversation, tokenize=False) + + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": {"audio": audio_data}, + }, + limit_mm_per_prompt={"audio": 1}, + ) + + +def get_video_query( + processor: MingFlashOmniProcessor, + question: str | None = None, + video_path: str | None = None, + num_frames: int = 16, +) -> QueryResult: + if question is None: + question = "Describe what is happening in this video." + + if video_path: + if not os.path.exists(video_path): + raise FileNotFoundError(f"Video file not found: {video_path}") + video_frames = video_to_ndarrays(video_path, num_frames=num_frames) + else: + video_frames = VideoAsset(name="baby_reading", num_frames=num_frames).np_ndarrays + + conversation = [ + { + "role": "HUMAN", + "content": [ + {"type": "video"}, + {"type": "text", "text": question}, + ], + } + ] + prompt = processor.apply_chat_template(conversation, tokenize=False) + + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": {"video": video_frames}, + }, + limit_mm_per_prompt={"video": 1}, + ) + + +def get_mixed_modalities_query( + processor: MingFlashOmniProcessor, + image_path: str | None = None, + audio_path: str | None = None, + sampling_rate: int = 16000, +) -> QueryResult: + """Mixed image + audio understanding.""" + question = "Describe the image, and recognize the language of this speech and transcribe it. 
Format: oral" + + if image_path: + if not os.path.exists(image_path): + raise FileNotFoundError(f"Image file not found: {image_path}") + image_data = convert_image_mode(Image.open(image_path), "RGB") + else: + image_data = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") + + if audio_path: + if not os.path.exists(audio_path): + raise FileNotFoundError(f"Audio file not found: {audio_path}") + sig, sr = librosa.load(audio_path, sr=sampling_rate) + audio_data = (sig.astype(np.float32), sr) + else: + audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate + + conversation = [ + { + "role": "HUMAN", + "content": [ + {"type": "image", "image": image_data}, + {"type": "audio", "audio": "input"}, + {"type": "text", "text": question}, + ], + } + ] + prompt = processor.apply_chat_template(conversation, tokenize=False) + + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": {"image": image_data, "audio": audio_data}, + }, + limit_mm_per_prompt={"image": 1, "audio": 1}, + ) + + +def get_reasoning_query( + processor: MingFlashOmniProcessor, + question: str | None = None, + image_path: str | None = None, +) -> QueryResult: + if question is None: + # NOTE: To use the following default question, input with example figure provided by Ming + # https://github.com/inclusionAI/Ming/blob/3954fcb880ff5e61ff128bcf7f1ec344d46a6fe3/figures/cases/3_0.png + # E.g., + # python examples/offline_inference/ming_flash_omni/end2end.py -q reasoning --image-path ./3_0.png + # Otherwise, the problem solving might be false. 
+ question = ( + "Based on the following rules:\n•\tYou control the smiley face character\n" + "•\tYou can move up, down, left, and right, and only a single square at a time\n" + "•\tWalls are dark grey and cannot be moved into\n•\tThe brown square is a box\n•" + "\tThe box can be pushed by moving into it (i.e., if you are in the square " + "adjacent to the box to the left, and move onto the square with the box, " + "the box will move one square to the right).\n" + "•\tThe box cannot be pushed into walls\n" + "•\tThe blue door at the bottom is locked and cannot be passed through, " + "unless the box is placed on the blue square\n" + "•\tThe square beneath the blue door is the exit\n" + "•\tMoving from one square to another\n\n" + "Let's assume a coordinate system where the smiley face is " + "on the top left at (1,1) and the square below it is (1,2). " + "The smiley face performs the following moves: {down, right, right, right}, " + "such that the smiley face is at square (4,2) and the box is in square (5,2). " + "What are the next sequence of moves that must be done to move the box down to (5,3)? " + "Give your answer as a comma separated list." 
+ ) + + if image_path: + if not os.path.exists(image_path): + raise FileNotFoundError(f"Image file not found: {image_path}") + image_data = convert_image_mode(Image.open(image_path), "RGB") + conversation = [ + { + "role": "HUMAN", + "content": [ + {"type": "image", "image": image_data}, + {"type": "text", "text": question}, + ], + } + ] + prompt = processor.apply_chat_template(conversation, tokenize=False, use_cot_system_prompt=True) + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": {"image": image_data}, + }, + limit_mm_per_prompt={"image": 1}, + ) + + conversation = [{"role": "HUMAN", "content": question}] + prompt = processor.apply_chat_template(conversation, tokenize=False, use_cot_system_prompt=True) + return QueryResult( + inputs={"prompt": prompt}, + limit_mm_per_prompt={}, + ) + + +query_map = { + "text": get_text_query, + "use_audio": get_audio_query, + "use_image": get_image_query, + "use_video": get_video_query, + "use_mixed_modalities": get_mixed_modalities_query, + "reasoning": get_reasoning_query, +} + + +def main(args): + print( + "=" * 20, + "\n", + f"vllm version: {vllm.__version__}\n", + f"vllm-omni version: {vllm_omni.__version__}\n", + "=" * 20, + sep="", + ) + + processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True) + assert isinstance(processor, MingFlashOmniProcessor), f"Wrong processor type being used: {type(processor)}" + + query_func = query_map[args.query_type] + if args.query_type == "use_image": + query_result = query_func(processor, image_path=args.image_path) + elif args.query_type == "use_audio": + query_result = query_func(processor, audio_path=args.audio_path, sampling_rate=args.sampling_rate) + elif args.query_type == "use_video": + query_result = query_func(processor, video_path=args.video_path, num_frames=args.num_frames) + elif args.query_type == "use_mixed_modalities": + query_result = query_func( + processor, + image_path=args.image_path, + audio_path=args.audio_path, + 
sampling_rate=args.sampling_rate, + ) + elif args.query_type == "reasoning": + query_result = query_func(processor, image_path=args.image_path) + else: + query_result = query_func(processor) + + # Initialize Omni (with thinker-only stage config) + omni = Omni( + model=MODEL_NAME, + stage_configs_path=args.stage_configs_path, + log_stats=args.log_stats, + init_timeout=args.init_timeout, + stage_init_timeout=args.stage_init_timeout, + ) + + # Thinker sampling params + thinker_sampling_params = SamplingParams( + temperature=0.4, + top_p=0.9, + max_tokens=args.max_tokens, + repetition_penalty=1.05, + seed=SEED, + detokenize=True, + ) + sampling_params_list = [thinker_sampling_params] + + prompts = [query_result.inputs for _ in range(args.num_prompts)] + + if args.modalities is not None: + output_modalities = args.modalities.split(",") + for prompt in prompts: + prompt["modalities"] = output_modalities + + total_requests = len(prompts) + processed_count = 0 + print(f"Query type: {args.query_type}") + print(f"Number of prompts: {total_requests}") + + output_dir = args.output_dir + os.makedirs(output_dir, exist_ok=True) + + profiler_enabled = args.enable_profiler + if profiler_enabled: + omni.start_profile(stages=args.profiler_stages) + + for stage_outputs in omni.generate(prompts, sampling_params_list): + output = stage_outputs.request_output + if stage_outputs.final_output_type == "text": + request_id = output.request_id + text_output = output.outputs[0].text + lines = [] + lines.append("Prompt:\n") + lines.append(str(output.prompt) + "\n") + lines.append("Text Output:\n") + lines.append(str(text_output).strip() + "\n") + print(*lines, sep="") + + # Save to file + out_txt = os.path.join(output_dir, f"{request_id}.txt") + try: + with open(out_txt, "w", encoding="utf-8") as f: + f.writelines(lines) + print(f"Request ID: {request_id}, text saved to {out_txt}") + except Exception as e: + print(f"Failed to write output file {out_txt}: {e}") + + elif 
stage_outputs.final_output_type == "audio": + raise NotImplementedError("Add audio example after talker supported.") + + processed_count += 1 + if profiler_enabled and processed_count >= total_requests: + print(f"[Info] Processed {processed_count}/{total_requests}. Stopping profiler inside active loop...") + # Stop the profiler while workers are still alive + omni.stop_profile(stages=args.profiler_stages) + + print("[Info] Waiting 30s for workers to write trace files to disk...") + time.sleep(30) + print("[Info] Trace export wait time finished.") + + omni.close() + + +def parse_args(): + parser = FlexibleArgumentParser(description="Ming-flash-omni 2.0 offline inference example") + parser.add_argument( + "--query-type", + "-q", + type=str, + default="text", + choices=query_map.keys(), + help="Query type.", + ) + parser.add_argument( + "--stage-configs-path", + type=str, + default=None, + help="Path to a stage configs YAML file.", + ) + parser.add_argument( + "--log-stats", + action="store_true", + default=False, + help="Enable detailed statistics logging.", + ) + parser.add_argument("--init-timeout", type=int, default=2000, help="Timeout for initializing in seconds.") + parser.add_argument( + "--stage-init-timeout", + type=int, + default=2000, + help="Timeout for initializing a single stage in seconds.", + ) + parser.add_argument( + "--enable-profiler", + action="store_true", + default=False, + help="Enables profiling when set.", + ) + parser.add_argument( + "--profiler-stages", + type=int, + nargs="*", + default=[0], + help="List of stage IDs to profile. If not set, profiles all stages.", + ) + parser.add_argument( + "--image-path", + "-i", + type=str, + default=None, + help="Path to local image file. Uses default asset if not provided.", + ) + parser.add_argument( + "--audio-path", + "-a", + type=str, + default=None, + help="Path to local audio file. 
Uses default asset if not provided.", + ) + parser.add_argument( + "--video-path", + "-v", + type=str, + default=None, + help="Path to local video file. Uses default asset if not provided.", + ) + parser.add_argument( + "--num-frames", + type=int, + default=16, + help="Number of frames to extract from video.", + ) + parser.add_argument( + "--sampling-rate", + type=int, + default=16000, + help="Sampling rate for audio loading.", + ) + parser.add_argument( + "--max-tokens", + type=int, + default=16384, + help="Maximum tokens to generate.", + ) + parser.add_argument( + "--num-prompts", + type=int, + default=1, + help="Number of prompts to generate.", + ) + parser.add_argument( + "--modalities", + type=str, + default=None, + help="Output modalities (comma-separated).", + ) + parser.add_argument( + "--output-dir", + type=str, + default="output_ming", + help="Output directory for results.", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/offline_inference/qwen2_5_omni/end2end.py b/examples/offline_inference/qwen2_5_omni/end2end.py index d8f1898ec91..dfe124700de 100644 --- a/examples/offline_inference/qwen2_5_omni/end2end.py +++ b/examples/offline_inference/qwen2_5_omni/end2end.py @@ -320,14 +320,7 @@ def main(args): query_result = query_func(audio_path=audio_path, sampling_rate=sampling_rate) else: query_result = query_func() - omni = Omni( - model=model_name, - log_stats=args.log_stats, - stage_init_timeout=args.stage_init_timeout, - batch_timeout=args.batch_timeout, - init_timeout=args.init_timeout, - shm_threshold_bytes=args.shm_threshold_bytes, - ) + omni = Omni.from_cli_args(args, model=model_name) thinker_sampling_params = SamplingParams( temperature=0.0, # Deterministic - no randomness top_p=1.0, # Disable nucleus sampling diff --git a/examples/offline_inference/qwen3_omni/README.md b/examples/offline_inference/qwen3_omni/README.md index d69ad6abfc9..0710faa133c 100644 --- 
a/examples/offline_inference/qwen3_omni/README.md +++ b/examples/offline_inference/qwen3_omni/README.md @@ -70,8 +70,8 @@ For true stage-level concurrency -- where downstream stages (Talker, Code2Wav) start **before** the upstream stage (Thinker) finishes -- use the async_chunk example. This requires: -1. A stage config YAML with ``async_chunk: true`` (e.g. - ``qwen3_omni_moe_async_chunk.yaml``). +1. A deploy config YAML with ``async_chunk: true`` (e.g. + ``qwen3_omni_moe.yaml``). 2. Hardware that matches the config (e.g. 2x H100 for the default 3-stage config). @@ -101,7 +101,7 @@ python end2end_async_chunk.py --query-type text --modalities text ```bash python end2end_async_chunk.py \ --query-type use_audio \ - --stage-configs-path /path/to/your_async_chunk.yaml + --deploy-config /path/to/your_deploy_config.yaml ``` > **Note**: The synchronous ``end2end.py`` (using ``Omni``) is still the diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py index 056f820ff07..f028c32aa1b 100644 --- a/examples/offline_inference/qwen3_omni/end2end.py +++ b/examples/offline_inference/qwen3_omni/end2end.py @@ -294,14 +294,7 @@ def main(args): else: query_result = query_func() - omni = Omni( - model=model_name, - dtype=args.dtype, - stage_configs_path=args.stage_configs_path, - log_stats=args.log_stats, - stage_init_timeout=args.stage_init_timeout, - init_timeout=args.init_timeout, - ) + omni = Omni.from_cli_args(args, model=model_name) thinker_sampling_params = SamplingParams( temperature=0.9, diff --git a/examples/offline_inference/qwen3_omni/end2end_async_chunk.py b/examples/offline_inference/qwen3_omni/end2end_async_chunk.py index 07442631302..f38922e9437 100644 --- a/examples/offline_inference/qwen3_omni/end2end_async_chunk.py +++ b/examples/offline_inference/qwen3_omni/end2end_async_chunk.py @@ -14,7 +14,7 @@ Usage ----- python end2end_async_chunk.py --query-type use_audio \ - --stage-configs-path + --deploy-config See 
``--help`` for all options. """ @@ -179,20 +179,26 @@ def clone_prompt_for_request(template: dict) -> dict: return cloned -def _default_async_chunk_stage_configs_path() -> str | None: - """Best-effort default stage config for running Qwen3-Omni with async_chunk. +def _default_deploy_config_path() -> str | None: + """Best-effort default deploy config for running Qwen3-Omni with async_chunk. - When this example is executed from within the repository, we resolve the - default YAML path relative to this file. When installed elsewhere, the - file may not exist and callers should pass --stage-configs-path explicitly. + The default ``vllm_omni/deploy/qwen3_omni_moe.yaml`` ships with + ``async_chunk: true`` at the top level, so loading it is enough to + enable async-chunk semantics. To disable it, copy the YAML and set + ``async_chunk: false`` (or pass ``--deploy-config`` to a YAML that + overrides the flag). + + When this example is executed from within the repository, we resolve + the default YAML path relative to this file. When installed elsewhere, + the file may not exist and callers should pass ``--deploy-config`` + explicitly. 
""" repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")) candidate = os.path.join( repo_root, "vllm_omni", - "model_executor", - "stage_configs", - "qwen3_omni_moe_async_chunk.yaml", + "deploy", + "qwen3_omni_moe.yaml", ) return candidate if os.path.exists(candidate) else None @@ -374,15 +380,16 @@ async def run_all(args): prompt["modalities"] = output_modalities # Create AsyncOmni - print(f"[Info] Creating AsyncOmni with stage_configs_path={args.stage_configs_path}") + print(f"[Info] Creating AsyncOmni with deploy_config={args.deploy_config}") async_omni = None try: - async_omni = AsyncOmni( - model=args.model, - stage_configs_path=args.stage_configs_path, - log_stats=args.log_stats, - stage_init_timeout=args.stage_init_timeout, - ) + # ``from_cli_args`` expands vars(args) into kwargs and auto-captures + # ``_cli_explicit_keys`` from ``sys.argv[1:]`` so argparse defaults + # do not silently override deploy YAML values. Mirrors the + # ``EngineArgs.from_cli_args`` pattern used throughout vllm / + # vllm-omni. ``deploy_config=None`` (the default) falls through to + # the bundled ``vllm_omni/deploy/qwen3_omni_moe.yaml``. + async_omni = AsyncOmni.from_cli_args(args) # Use default sampling params from stage config (they are pre-configured # in the YAML for each stage). @@ -470,11 +477,11 @@ def parse_args(): help="Query type.", ) parser.add_argument( - "--stage-configs-path", + "--deploy-config", type=str, - default=_default_async_chunk_stage_configs_path(), + default=_default_deploy_config_path(), help=( - "Path to an async_chunk stage config YAML. " + "Path to a deploy config YAML. " "If not set, uses the model's default config " "(make sure it has async_chunk: true)." 
), diff --git a/examples/offline_inference/qwen3_omni/run_multiple_prompts_async_chunk.sh b/examples/offline_inference/qwen3_omni/run_multiple_prompts_async_chunk.sh index 809054867c3..2f2be20915a 100755 --- a/examples/offline_inference/qwen3_omni/run_multiple_prompts_async_chunk.sh +++ b/examples/offline_inference/qwen3_omni/run_multiple_prompts_async_chunk.sh @@ -17,7 +17,7 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" python "${SCRIPT_DIR}/end2end_async_chunk.py" \ --query-type text \ --txt-prompts "${SCRIPT_DIR}/text_prompts_10.txt" \ - --stage-configs-path "${REPO_ROOT}/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml" \ + --deploy-config "${REPO_ROOT}/vllm_omni/deploy/qwen3_omni_moe.yaml" \ --output-dir output_audio_async_chunk \ --max-in-flight 2 \ "$@" diff --git a/examples/offline_inference/qwen3_omni/run_single_prompt_async_chunk.sh b/examples/offline_inference/qwen3_omni/run_single_prompt_async_chunk.sh index 918c7ee4fd9..9ef69293cb5 100755 --- a/examples/offline_inference/qwen3_omni/run_single_prompt_async_chunk.sh +++ b/examples/offline_inference/qwen3_omni/run_single_prompt_async_chunk.sh @@ -6,13 +6,13 @@ # achieving true stage-level concurrency via chunk-level streaming. # # Prerequisites: -# - An async_chunk stage config YAML (e.g. qwen3_omni_moe_async_chunk.yaml) +# - A deploy config YAML (e.g. qwen3_omni_moe.yaml) # - Hardware matching the config (e.g. 2x H100 for the default 3-stage config) # # Usage: # bash run_single_prompt_async_chunk.sh # bash run_single_prompt_async_chunk.sh --query-type text --modalities text -# bash run_single_prompt_async_chunk.sh --stage-configs-path /path/to/custom.yaml +# bash run_single_prompt_async_chunk.sh --deploy-config /path/to/custom.yaml set -euo pipefail @@ -21,6 +21,6 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." 
&& pwd)" python "${SCRIPT_DIR}/end2end_async_chunk.py" \ --query-type use_audio \ - --stage-configs-path "${REPO_ROOT}/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml" \ + --deploy-config "${REPO_ROOT}/vllm_omni/deploy/qwen3_omni_moe.yaml" \ --output-dir output_audio_async_chunk \ "$@" diff --git a/examples/offline_inference/qwen3_tts/README.md b/examples/offline_inference/qwen3_tts/README.md index c38a2b462d1..2971ad716a2 100644 --- a/examples/offline_inference/qwen3_tts/README.md +++ b/examples/offline_inference/qwen3_tts/README.md @@ -104,13 +104,13 @@ completes. This demonstrates that audio data is available progressively rather t ## Batched Decoding -The Code2Wav stage (stage 1) supports batched decoding, where multiple requests are decoded in a single forward pass through the SpeechTokenizer. To use it, provide a stage config with `max_num_seqs > 1` and pass multiple prompts via `--txt-prompts` with a matching `--batch-size`. +The Code2Wav stage (stage 1) supports batched decoding, where multiple requests are decoded in a single forward pass through the SpeechTokenizer. To use it, set `max_num_seqs > 1` on both stages via `--stage-overrides` and pass multiple prompts via `--txt-prompts` with a matching `--batch-size`. ``` python end2end.py --query-type CustomVoice \ --txt-prompts benchmark_prompts.txt \ --batch-size 4 \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml + --stage-overrides '{"0":{"max_num_seqs":4,"gpu_memory_utilization":0.2},"1":{"max_num_seqs":4,"gpu_memory_utilization":0.2}}' ``` **Important:** `--batch-size` must match a CUDA graph capture size (1, 2, 4, 8, 16...) because the Talker's code predictor KV cache is sized to `max_num_seqs`, and CUDA graphs pad the batch to the next capture size. Both stages need `max_num_seqs >= batch_size` in the stage config for batching to take effect. 
If only stage 1 has a higher `max_num_seqs`, it won't help — stage 1 can only batch chunks from requests that are in-flight simultaneously, which requires stage 0 to also process multiple requests concurrently. diff --git a/examples/offline_inference/qwen3_tts/end2end.py b/examples/offline_inference/qwen3_tts/end2end.py index 901418c39b8..77da356b4f8 100644 --- a/examples/offline_inference/qwen3_tts/end2end.py +++ b/examples/offline_inference/qwen3_tts/end2end.py @@ -366,12 +366,7 @@ def main(args): output_dir = args.output_dir os.makedirs(output_dir, exist_ok=True) - omni = Omni( - model=model_name, - stage_configs_path=args.stage_configs_path, - log_stats=args.log_stats, - stage_init_timeout=args.stage_init_timeout, - ) + omni = Omni.from_cli_args(args, model=model_name) batch_size = args.batch_size for batch_start in range(0, len(inputs), batch_size): @@ -387,12 +382,7 @@ async def main_streaming(args): output_dir = args.output_dir os.makedirs(output_dir, exist_ok=True) - omni = AsyncOmni( - model=model_name, - stage_configs_path=args.stage_configs_path, - log_stats=args.log_stats, - stage_init_timeout=args.stage_init_timeout, - ) + omni = AsyncOmni.from_cli_args(args, model=model_name) for i, prompt in enumerate(inputs): request_id = str(i) diff --git a/examples/offline_inference/voxcpm2/end2end.py b/examples/offline_inference/voxcpm2/end2end.py index 687e596018c..6b6bf78ddf1 100644 --- a/examples/offline_inference/voxcpm2/end2end.py +++ b/examples/offline_inference/voxcpm2/end2end.py @@ -65,6 +65,12 @@ def parse_args(): default=None, help="Text matching --prompt-audio for continuation mode.", ) + parser.add_argument( + "--ref-text", + type=str, + default=None, + help="Optional transcript of --reference-audio (enables ref_continuation mode).", + ) return parser.parse_args() @@ -103,24 +109,40 @@ def main(): stage_configs_path=args.stage_configs_path, ) - additional: dict = {} - if args.reference_audio: - additional["reference_audio"] = args.reference_audio - if 
args.prompt_audio and args.prompt_text: - additional["prompt_audio"] = args.prompt_audio - additional["prompt_text"] = args.prompt_text + from transformers import AutoTokenizer - prompt: dict = {"prompt": args.text} - if additional: - prompt["additional_information"] = additional + from vllm_omni.model_executor.models.voxcpm2.voxcpm2_talker import ( + build_cjk_split_map, + build_voxcpm2_prompt, + ) + + tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) + split_map = build_cjk_split_map(tokenizer) + hf_config = engine.engine.stage_vllm_configs[0].model_config.hf_config + + ref_audio_arg = args.reference_audio or args.prompt_audio + ref_text_arg = args.ref_text or args.prompt_text + ref_wav, ref_sr = (None, None) + if ref_audio_arg: + ref_wav_arr, ref_sr = sf.read(ref_audio_arg) + ref_wav = ref_wav_arr.mean(axis=-1).tolist() if ref_wav_arr.ndim > 1 else ref_wav_arr.tolist() + + prompt = build_voxcpm2_prompt( + hf_config=hf_config, + tokenizer=tokenizer, + split_map=split_map, + text=args.text, + ref_audio=ref_wav, + ref_sr=ref_sr, + ref_text=ref_text_arg, + ) print(f"Model : {args.model}") print(f"Text : {args.text}") - if args.reference_audio: - print(f"Ref audio : {args.reference_audio}") - if args.prompt_audio: - print(f"Prompt audio: {args.prompt_audio}") - print(f"Prompt text : {args.prompt_text}") + if ref_audio_arg: + print(f"Ref audio : {ref_audio_arg}") + if ref_text_arg: + print(f"Ref text : {ref_text_arg}") print(f"Output dir : {output_dir}") t_start = time.perf_counter() diff --git a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py index 322b184e520..497284ceb96 100644 --- a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py +++ b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py @@ -58,6 +58,11 @@ def parse_args() -> argparse.Namespace: default=False, help="Enable CPU offloading for diffusion models.", ) + 
parser.add_argument( + "--enable-layerwise-offload", + action="store_true", + help="Enable layerwise (blockwise) offloading on DiT modules.", + ) return parser.parse_args() @@ -126,6 +131,7 @@ def main() -> None: parallel_config=parallel_config, model_type=args.model_type, enable_cpu_offload=args.enable_cpu_offload, + enable_layerwise_offload=args.enable_layerwise_offload, ) start = time.perf_counter() outputs = omni.generate(prompt, sampling_params) diff --git a/examples/online_serving/ming_flash_omni/README.md b/examples/online_serving/ming_flash_omni/README.md new file mode 100644 index 00000000000..502232725c2 --- /dev/null +++ b/examples/online_serving/ming_flash_omni/README.md @@ -0,0 +1,204 @@ +# Ming-flash-omni 2.0 + +## Installation + +Please refer to [README.md](../../../README.md) + +## Run examples (Ming-flash-omni 2.0) + +### Launch the Server + +```bash +vllm serve Jonathan1909/Ming-flash-omni-2.0 --omni --port 8091 +``` + +If you have a custom stage configs file, launch the server with the command below +```bash +vllm serve Jonathan1909/Ming-flash-omni-2.0 --omni --port 8091 --stage-configs-path /path/to/stage_configs_file +``` + +### Send Multi-modal Request + +#### Send request via python + +```bash +python examples/online_serving/openai_chat_completion_client_for_multimodal_generation.py --model Jonathan1909/Ming-flash-omni-2.0 --query-type use_mixed_modalities --port 8091 --host "localhost" --modalities text +``` + +The Python client supports the following command-line arguments: + +- `--query-type` (or `-q`): Query type. Options: `text`, `use_audio`, `use_image`, `use_video`, `use_mixed_modalities` +- `--video-path` (or `-v`): Path to local video file or URL. If not provided and query-type uses video, uses default video URL. Supports local file paths (automatically encoded to base64) or HTTP/HTTPS URLs.
Example: `--video-path /path/to/video.mp4` or `--video-path https://example.com/video.mp4` +- `--image-path` (or `-i`): Path to local image file or URL. If not provided and query-type uses image, uses default image URL. Supports local file paths (automatically encoded to base64) or HTTP/HTTPS URLs and common image formats: JPEG, PNG, GIF, WebP. Example: `--image-path /path/to/image.jpg` or `--image-path https://example.com/image.png` +- `--audio-path` (or `-a`): Path to local audio file or URL. If not provided and query-type uses audio, uses default audio URL. Supports local file paths (automatically encoded to base64) or HTTP/HTTPS URLs and common audio formats: MP3, WAV, OGG, FLAC, M4A. Example: `--audio-path /path/to/audio.wav` or `--audio-path https://example.com/audio.mp3` +- `--prompt` (or `-p`): Custom text prompt/question. If not provided, uses default prompt for the selected query type. Example: `--prompt "What are the main activities shown in this video?"` +- `--modalities`: Output modalities. For now, only `text` is supported. Example: `--modalities text` + + +#### Send request via curl + +```bash +bash run_curl_multimodal_generation.sh text +bash run_curl_multimodal_generation.sh use_image +bash run_curl_multimodal_generation.sh use_audio +bash run_curl_multimodal_generation.sh use_video +bash run_curl_multimodal_generation.sh use_mixed_modalities +``` + +## Modality control + +Ming-flash-omni 2.0 currently supports text output only (thinker stage). 
+ +| Modalities | Output | +|------------|--------| +| `["text"]` | Text only | +| Not specified | Text only (default) | + +### Using curl + +```bash +curl http://localhost:8091/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Jonathan1909/Ming-flash-omni-2.0", + "messages": [ + {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]}, + {"role": "user", "content": "请详细介绍鹦鹉的生活习性。"} + ], + "modalities": ["text"] + }' +``` + +### Using OpenAI Python SDK + +```python +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY") + +response = client.chat.completions.create( + model="Jonathan1909/Ming-flash-omni-2.0", + messages=[ + {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]}, + {"role": "user", "content": "请详细介绍鹦鹉的生活习性。"}, + ], + modalities=["text"], +) +print(response.choices[0].message.content) +``` + +### Multi-modal input with OpenAI Python SDK + +```python +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY") + +response = client.chat.completions.create( + model="Jonathan1909/Ming-flash-omni-2.0", + messages=[ + {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]}, + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg"}}, + {"type": "text", "text": "Describe this image in detail."}, + ], + }, + ], + modalities=["text"], +) +print(response.choices[0].message.content) +``` + +## Streaming Output + +To enable streaming output: + +```bash +python examples/online_serving/openai_chat_completion_client_for_multimodal_generation.py \ + --query-type use_image \ + --model Jonathan1909/Ming-flash-omni-2.0 \ + --modalities text \ + --stream +``` + +Or with the OpenAI Python SDK: + +```python +from 
openai import OpenAI + +client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY") + +response = client.chat.completions.create( + model="Jonathan1909/Ming-flash-omni-2.0", + messages=[ + {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]}, + {"role": "user", "content": "请详细介绍鹦鹉的生活习性。"}, + ], + modalities=["text"], + stream=True, +) +for chunk in response: + for choice in chunk.choices: + if hasattr(choice, "delta") and choice.delta.content: + print(choice.delta.content, end="", flush=True) +print() +``` + +Or using curl: + +```bash +curl http://localhost:8091/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Jonathan1909/Ming-flash-omni-2.0", + "messages": [ + {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]}, + {"role": "user", "content": "请详细介绍鹦鹉的生活习性。"} + ], + "modalities": ["text"], + "stream": true + }' +``` + + +## Reasoning (Thinking Mode) + +To enable reasoning/thinking mode, change `detailed thinking off` to `detailed thinking on` in the system prompt: + +### Using curl + +```bash +curl http://localhost:8091/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Jonathan1909/Ming-flash-omni-2.0", + "messages": [ + {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking on"}]}, + {"role": "user", "content": [ + {"type": "image_url", "image_url": {"url": "https://example.com/math_problem.png"}}, + {"type": "text", "text": "Solve this math problem step by step."} + ]} + ], + "modalities": ["text"] + }' +``` + +### Using OpenAI Python SDK + +```python +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY") + +response = client.chat.completions.create( + model="Jonathan1909/Ming-flash-omni-2.0", + messages=[ + {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking on"}]}, + {"role":
"user", "content": "If a train travels 120 km in 2 hours, what is its average speed?"}, + ], + modalities=["text"], +) +print(response.choices[0].message.content) +``` diff --git a/examples/online_serving/ming_flash_omni/run_curl_multimodal_generation.sh b/examples/online_serving/ming_flash_omni/run_curl_multimodal_generation.sh new file mode 100755 index 00000000000..768a424e451 --- /dev/null +++ b/examples/online_serving/ming_flash_omni/run_curl_multimodal_generation.sh @@ -0,0 +1,145 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Server port +PORT="${PORT:-8091}" +# Default query type +QUERY_TYPE="${1:-text}" + +# Validate query type +if [[ ! "$QUERY_TYPE" =~ ^(text|use_audio|use_image|use_video|use_mixed_modalities)$ ]]; then + echo "Error: Invalid query type '$QUERY_TYPE'" + echo "Usage: $0 [text|use_audio|use_image|use_video|use_mixed_modalities]" + echo " text: Text-only query" + echo " use_audio: Audio + Text query" + echo " use_image: Image + Text query" + echo " use_video: Video + Text query" + echo " use_mixed_modalities: Audio + Image + Video + Text query" + exit 1 +fi + +thinker_sampling_params='{ + "temperature": 0.4, + "top_p": 0.9, + "top_k": -1, + "max_tokens": 16384, + "seed": 42, + "detokenize": true, + "repetition_penalty": 1.05 +}' +# Above is optional, it has a default setting in stage_configs of the corresponding model. 
+ +# Define URLs for assets +MARY_HAD_LAMB_AUDIO_URL="https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/mary_had_lamb.ogg" +CHERRY_BLOSSOM_IMAGE_URL="https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg" +SAMPLE_VIDEO_URL="https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4" + +# Build user content based on query type +case "$QUERY_TYPE" in + text) + user_content='[ + { + "type": "text", + "text": "请详细介绍鹦鹉的生活习性。" + } + ]' + ;; + use_image) + user_content='[ + { + "type": "image_url", + "image_url": { + "url": "'"$CHERRY_BLOSSOM_IMAGE_URL"'" + } + }, + { + "type": "text", + "text": "Describe this image in detail." + } + ]' + ;; + use_audio) + user_content='[ + { + "type": "audio_url", + "audio_url": { + "url": "'"$MARY_HAD_LAMB_AUDIO_URL"'" + } + }, + { + "type": "text", + "text": "Please recognize the language of this speech and transcribe it. Format: oral." + } + ]' + ;; + use_video) + user_content='[ + { + "type": "video_url", + "video_url": { + "url": "'"$SAMPLE_VIDEO_URL"'" + } + }, + { + "type": "text", + "text": "Describe what is happening in this video." + } + ]' + ;; + use_mixed_modalities) + user_content='[ + { + "type": "image_url", + "image_url": { + "url": "'"$CHERRY_BLOSSOM_IMAGE_URL"'" + } + }, + { + "type": "audio_url", + "audio_url": { + "url": "'"$MARY_HAD_LAMB_AUDIO_URL"'" + } + }, + { + "type": "text", + "text": "Describe the image, and recognize the language of this speech and transcribe it. Format: oral" + } + ]' + ;; +esac + +echo "Running query type: $QUERY_TYPE" +echo "" + +request_body=$(cat < **Note on `--no-async-chunk`**: Flips the deploy yaml's `async_chunk:` +> bool. Pipelines that implement alternate processor functions for +> chunked vs end-to-end modes (e.g. qwen3_tts code2wav) dispatch +> automatically based on that bool — no extra flag or variant yaml is +> needed. 
+ +> ⚠️ **For multi-stage models that share GPUs (qwen3_omni_moe by default +> shares cuda:1 between stages 1 and 2), avoid using global memory flags.** +> A global `--gpu-memory-utilization 0.85` would apply to every stage and +> oversubscribe the shared device. Use per-stage overrides instead — see +> below. + +#### 2. Per-stage overrides via `--stage-overrides` (recommended for memory) + ```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file +# Lower stage 1's memory budget; leave others at the YAML default +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 \ + --stage-overrides '{ + "1": {"gpu_memory_utilization": 0.5}, + "2": {"max_num_batched_tokens": 65536} + }' +``` + +Per-stage values are always treated as explicit and beat YAML defaults for +the named stage. Other stages keep their YAML values. + +#### 3. Custom deploy YAML + +When per-stage overrides get long, write a small overlay YAML that inherits +from the bundled default: + +```yaml +# my_qwen3_omni_overrides.yaml +base_config: /path/to/vllm_omni/deploy/qwen3_omni_moe.yaml + +stages: + - stage_id: 0 + max_num_batched_tokens: 65536 + enforce_eager: true + - stage_id: 1 + gpu_memory_utilization: 0.5 + - stage_id: 2 + max_model_len: 8192 ``` +Then start the server with `--deploy-config my_qwen3_omni_overrides.yaml`. +The `base_config:` line tells the loader to inherit everything else (stages, +connectors, edges, platforms section) from the bundled production YAML, so +you only need to spell out the deltas. + +#### 4. Multi-node deployment (cross-host transfer connector) + +The bundled `qwen3_omni_moe.yaml` uses `SharedMemoryConnector` between stages, +which only works when all stages run on the same physical host. For +**cross-node** deployments, write a small overlay YAML that swaps in a +network-capable connector (e.g. `MooncakeStoreConnector`) and re-points each +stage's connector wiring at it. 
The connector spec carries your own server +addresses — there is no checked-in default because every cluster is +different. + +```yaml +# my_qwen3_omni_multinode.yaml +base_config: /path/to/vllm_omni/deploy/qwen3_omni_moe.yaml + +connectors: + mooncake_connector: + name: MooncakeStoreConnector + extra: + host: "127.0.0.1" + metadata_server: "http://YOUR_METADATA_HOST:8080/metadata" + master: "YOUR_MASTER_HOST:50051" + segment: 512000000 # 512 MB transfer segment + localbuf: 64000000 # 64 MB local buffer + proto: "tcp" + +stages: + - stage_id: 0 + output_connectors: + to_stage_1: mooncake_connector + - stage_id: 1 + input_connectors: + from_stage_0: mooncake_connector + output_connectors: + to_stage_2: mooncake_connector + - stage_id: 2 + input_connectors: + from_stage_1: mooncake_connector +``` + +Then launch with `--deploy-config my_qwen3_omni_multinode.yaml`. Same +pattern works for Qwen2.5-Omni — replace `base_config:` with the path to +`vllm_omni/deploy/qwen2_5_omni.yaml`. + +> ⚠️ Replace `YOUR_METADATA_HOST` / `YOUR_MASTER_HOST` with the actual +> mooncake server addresses for your cluster. The `base_config:` overlay +> inherits all stage budgets, devices, and edges from the bundled prod +> YAML — you only need to spell out the connector swap. + ### Send Multi-modal Request Get into the example folder @@ -38,36 +188,43 @@ python examples/online_serving/openai_chat_completion_client_for_multimodal_gene #### Realtime WebSocket client (`openai_realtime_client.py`) -[`openai_realtime_client.py`](./openai_realtime_client.py) connects to **`ws://<host>:<port>/v1/realtime`**, uploads a local audio file as **PCM16 mono @ 16 kHz** chunks (OpenAI-style `input_audio_buffer.append` / `commit`), and prints **streaming transcription** (`transcription.delta` / `transcription.done`).
+[`openai_realtime_client.py`](./openai_realtime_client.py) connects to **`ws://<host>:<port>/v1/realtime`**, streams a local WAV as **PCM16 mono @ 16 kHz** in fixed-size chunks (OpenAI-style `input_audio_buffer.append` / `commit`), and receives **`response.audio.delta`** (incremental PCM for the reply) plus **`transcription.*`** events. By default it concatenates audio deltas and writes **`--output-wav`** (model output is typically **24 kHz**). Optional **`--delta-dump-dir`** saves each delta as `delta_000001.wav`, … for debugging. + +Streaming input works well for translation-style use cases; if the Thinker runs while input is still incomplete, consider limiting **`max_tokens`** in your session / server defaults to avoid over-generation. **Dependencies:** ```bash -pip install websockets numpy +pip install websockets ``` **From this directory** (`examples/online_serving/qwen3_omni`): ```bash python openai_realtime_client.py \ - --host localhost \ - --port 8091 \ + --url ws://localhost:8091/v1/realtime \ --model Qwen/Qwen3-Omni-30B-A3B-Instruct \ - --audio_path /path/to/your.wav + --input-wav /path/to/input_16k_mono.wav \ + --output-wav realtime_output.wav \ + --delta-dump-dir ./rt_delta_wavs ``` -If `--audio_path` is omitted, the script uses a bundled default clip (`mary_had_lamb` via vLLM assets). - **Arguments:** | Flag | Default | Description | |------|---------|-------------| -| `--host` | `localhost` | API server host | -| `--port` | `8000` | API server port (match your `vllm serve` port, e.g.
`8091`) | -| `--model` | `Qwen/Qwen3-Omni-30B-A3B-Instruct` | Must match the served model (also sent in `session.update`) | -| `--audio_path` | *(optional)* | Path to input audio; resampled to 16 kHz mono inside the client | - -Ensure the vLLM-Omni server is running with realtime support for this endpoint, for example: +| `--url` | `ws://localhost:8091/v1/realtime` | Full WebSocket URL including path | +| `--model` | `Qwen/Qwen3-Omni-30B-A3B-Instruct` | Must match the served model (sent in `session.update`) | +| `--input-wav` | *(required)* | Input WAV: mono, 16-bit PCM, **16 kHz** | +| `--output-wav` | `realtime_output.wav` | Output path for concatenated reply audio | +| `--output-text` | *(optional)* | If set, write final transcription text to this path | +| `--chunk-ms` | `200` | Size of each uploaded audio chunk (milliseconds of audio) | +| `--send-delay-ms` | `0` | Delay between chunk sends (simulate realtime upload) | +| `--delta-dump-dir` | *(optional)* | Directory to write per-`response.audio.delta` WAV files | +| `--num-requests` | `1` | Number of sequential sessions (see `--concurrency`) | +| `--concurrency` | `1` | Max concurrent WebSocket sessions when `--num-requests` > 1 | + +Ensure the server is running **without** `async_chunk` if you use `/v1/realtime`, for example: ```bash vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 @@ -276,7 +433,7 @@ The script supports the following arguments: - `--model`: Model name/path (default: Qwen/Qwen3-Omni-30B-A3B-Instruct) - `--server-port`: Port for vLLM server (default: 8091) - `--gradio-port`: Port for Gradio demo (default: 7861) -- `--stage-configs-path`: Path to custom stage configs YAML file (optional) +- `--deploy-config`: Path to custom deploy config YAML file (optional) - `--server-host`: Host for vLLM server (default: 0.0.0.0) - `--gradio-ip`: IP for Gradio demo (default: 127.0.0.1) - `--share`: Share Gradio demo publicly (creates a public link) @@ -291,7 +448,7 @@ vllm serve 
Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 If you have custom stage configs file: ```bash -vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --stage-configs-path /path/to/stage_configs_file +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 --deploy-config /path/to/deploy_config_file ``` **Step 2: Run the Gradio demo** diff --git a/examples/online_serving/qwen3_omni/openai_realtime_client.py b/examples/online_serving/qwen3_omni/openai_realtime_client.py index 660e4ac336a..79e30a3f50b 100644 --- a/examples/online_serving/qwen3_omni/openai_realtime_client.py +++ b/examples/online_serving/qwen3_omni/openai_realtime_client.py @@ -1,81 +1,118 @@ -""" -This script demonstrates how to use the vLLM-Omni Realtime WebSocket API to perform -audio transcription by uploading an audio file. +"""Realtime client for vLLM-Omni /v1/realtime (audio + text events). + +This client: +1) Reads a local WAV file (must be mono, 16-bit PCM, 16kHz), +2) Streams PCM16 chunks to /v1/realtime with OpenAI-style events, +3) Receives response.audio.* and transcription.* events, +4) Saves synthesized audio to an output WAV file and optional text file. -Before running this script, you must start the vLLM-Omni server with a realtime-capable -model, for example: +By default each ``response.audio.delta`` is treated as an **incremental PCM** +chunk and all chunks are concatenated into the final ``--output-wav``. - vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni +Optional debugging: pass ``--delta-dump-dir DIR`` to write every +``response.audio.delta`` payload as ``delta_000001.wav``, ``delta_000002.wav``, … -Requirements: -- vllm with audio support -- websockets -- soundfile -- numpy +Usage: + python openai_realtime_client.py \ + --url ws://localhost:8091/v1/realtime \ + --model Qwen/Qwen3-Omni-30B-A3B-Instruct \ + --input-wav input_16k_mono.wav \ + --output-wav realtime_output.wav \ + --delta-dump-dir ./rt_delta_wavs -The script: -1. 
Connects to the Realtime WebSocket endpoint -2. Converts an audio file to PCM16 @ 16kHz -3. Sends audio chunks to the server -4. Receives and prints transcription as it streams +Dependencies: + pip install websockets """ +from __future__ import annotations + import argparse import asyncio import base64 import json +import wave +from pathlib import Path + +try: + import websockets +except ImportError: + print("Please install websockets: pip install websockets") + raise SystemExit(1) + + +def _read_wav_pcm16(path: Path) -> bytes: + with wave.open(str(path), "rb") as wf: + nchannels = wf.getnchannels() + sampwidth = wf.getsampwidth() + framerate = wf.getframerate() + comptype = wf.getcomptype() + nframes = wf.getnframes() + + if nchannels != 1: + raise ValueError(f"Input WAV must be mono (got {nchannels} channels).") + if sampwidth != 2: + raise ValueError(f"Input WAV must be 16-bit PCM (got sample width={sampwidth}).") + if framerate != 16000: + raise ValueError(f"Input WAV must be 16kHz (got {framerate} Hz).") + if comptype != "NONE": + raise ValueError(f"Input WAV must be uncompressed PCM (got comptype={comptype}).") + if nframes <= 0: + raise ValueError("Input WAV has no audio frames.") + + return wf.readframes(nframes) + + +def _write_wav_pcm16(path: Path, pcm16_bytes: bytes, sample_rate_hz: int) -> None: + with wave.open(str(path), "wb") as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(sample_rate_hz) + wf.writeframes(pcm16_bytes) + + +async def run_client( + url: str, + model: str, + input_wav: Path, + output_wav: Path, + output_text: Path | None, + chunk_ms: int, + send_delay_ms: int, + delta_dump_dir: Path | None, + request_idx: int = 1, + total_requests: int = 1, +) -> None: + log_prefix = f"[req {request_idx:02d}/{total_requests:02d}] " if total_requests > 1 else "" + pcm16 = _read_wav_pcm16(input_wav) + bytes_per_ms = 16000 * 2 // 1000 # mono PCM16 at 16kHz + chunk_bytes = max(bytes_per_ms * chunk_ms, 2) -import numpy as np -import 
websockets -from vllm.assets.audio import AudioAsset -from vllm.multimodal.media.audio import load_audio - - -def audio_to_pcm16_base64(audio_path: str) -> str: - """ - Load an audio file and convert it to base64-encoded PCM16 @ 16kHz. - """ - # Load audio and resample to 16kHz mono - audio, _ = load_audio(audio_path, sr=16000, mono=True) - # Convert to PCM16 - pcm16 = (audio * 32767).astype(np.int16) - # Encode as base64 - return base64.b64encode(pcm16.tobytes()).decode("utf-8") - - -async def realtime_transcribe(audio_path: str, host: str, port: int, model: str): - """ - Connect to the Realtime API and transcribe an audio file. - """ - uri = f"ws://{host}:{port}/v1/realtime" - - async with websockets.connect(uri) as ws: - # Wait for session.created - response = json.loads(await ws.recv()) - if response["type"] == "session.created": - print(f"Session created: {response['id']}") - else: - print(f"Unexpected response: {response}") - return - - # Validate model - await ws.send(json.dumps({"type": "session.update", "model": model})) - - # Signal ready to start - await ws.send(json.dumps({"type": "input_audio_buffer.commit"})) - - # Convert audio file to base64 PCM16 - print(f"Loading audio from: {audio_path}") - audio_base64 = audio_to_pcm16_base64(audio_path) - - # Send audio in chunks (4KB of raw audio = ~8KB base64) - chunk_size = 4096 - audio_bytes = base64.b64decode(audio_base64) - total_chunks = (len(audio_bytes) + chunk_size - 1) // chunk_size - - print(f"Sending {total_chunks} audio chunks...") - for i in range(0, len(audio_bytes), chunk_size): - chunk = audio_bytes[i : i + chunk_size] + incremental_pcm_parts: list[bytes] = [] + output_sample_rate = 24000 + delta_index = 0 + text_chunks: list[str] = [] + final_text: str = "" + + if delta_dump_dir is not None: + delta_dump_dir.mkdir(parents=True, exist_ok=True) + + async with websockets.connect(url, max_size=64 * 1024 * 1024) as ws: + # 1) Validate model. 
+ await ws.send( + json.dumps( + { + "type": "session.update", + "model": model, + } + ) + ) + + # 2) Start generation once (non-final commit). + await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": False})) + + # 3) Stream audio chunks. + for i in range(0, len(pcm16), chunk_bytes): + chunk = pcm16[i : i + chunk_bytes] await ws.send( json.dumps( { @@ -84,63 +121,212 @@ async def realtime_transcribe(audio_path: str, host: str, port: int, model: str) } ) ) + if send_delay_ms > 0: + await asyncio.sleep(send_delay_ms / 1000.0) - # Signal all audio is sent + # 4) Final commit closes input stream. await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": True})) - print("Audio sent. Waiting for transcription...\n") - # Receive transcription - print("Transcription: ", end="", flush=True) + # 5) Receive server events until audio done. while True: - response = json.loads(await ws.recv()) - if response["type"] == "transcription.delta": - print(response["delta"], end="", flush=True) - elif response["type"] == "transcription.done": - print(f"\n\nFinal transcription: {response['text']}") - if response.get("usage"): - print(f"Usage: {response['usage']}") - break - elif response["type"] == "error": - print(f"\nError: {response['error']}") + message = await ws.recv() + if isinstance(message, bytes): + # We only expect JSON text frames. 
+ continue + + event = json.loads(message) + event_type = event.get("type") + + if event_type == "session.created": + continue + + if event_type == "response.audio.delta": + sr = event.get("sample_rate_hz") + if isinstance(sr, int) and sr > 0: + output_sample_rate = sr + audio_b64 = event.get("audio", "") + if audio_b64: + pcm_delta = base64.b64decode(audio_b64) + incremental_pcm_parts.append(pcm_delta) + if delta_dump_dir is not None and pcm_delta: + delta_index += 1 + dump_path = delta_dump_dir / f"delta_{delta_index:06d}.wav" + _write_wav_pcm16(dump_path, pcm_delta, output_sample_rate) + print( + f"{log_prefix}delta dump #{delta_index}: {dump_path} " + f"(pcm bytes={len(pcm_delta)}, sr={output_sample_rate})" + ) + continue + + if event_type == "transcription.delta": + delta = event.get("delta", "") + if delta: + text_chunks.append(delta) + print(delta, end="", flush=True) + continue + + if event_type == "transcription.done": + final_text = event.get("text", "") or "".join(text_chunks) + usage = event.get("usage") + final_text_with_tag = f"Final transcription: {final_text}" + if text_chunks: + print() + print(f"{log_prefix}{final_text_with_tag}") + if usage: + print(f"{log_prefix}text usage: {usage}") + continue + + if event_type == "response.audio.done": break + if event_type == "error": + raise RuntimeError(f"Server error: {event}") -def main(args): - if args.audio_path: - audio_path = args.audio_path - else: - # Use default audio asset - audio_path = str(AudioAsset("mary_had_lamb").get_local_path()) - print(f"No audio path provided, using default: {audio_path}") + all_pcm16 = b"".join(incremental_pcm_parts) + if not all_pcm16: + raise RuntimeError("No audio received from server.") - asyncio.run(realtime_transcribe(audio_path, args.host, args.port, args.model)) + output_wav.parent.mkdir(parents=True, exist_ok=True) + _write_wav_pcm16(output_wav, all_pcm16, output_sample_rate) + print(f"{log_prefix}Saved realtime audio to: {output_wav} (incremental chunks 
joined)") + if output_text is not None: + text_to_save = final_text if final_text else "".join(text_chunks) + output_text.parent.mkdir(parents=True, exist_ok=True) + output_text.write_text(text_to_save, encoding="utf-8") + print(f"{log_prefix}Saved realtime text to: {output_text}") -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Realtime WebSocket Transcription Client") + +def _indexed_output_path(path: Path | None, index: int, total: int) -> Path | None: + if path is None or total <= 1: + return path + return path.with_name(f"{path.stem}_{index:02d}{path.suffix}") + + +async def run_clients_concurrent( + *, + url: str, + model: str, + input_wav: Path, + output_wav: Path, + output_text: Path | None, + chunk_ms: int, + send_delay_ms: int, + delta_dump_dir: Path | None, + num_requests: int, + concurrency: int, +) -> None: + sem = asyncio.Semaphore(concurrency) + + async def _run_one(index: int) -> tuple[int, bool, str | None]: + per_output_wav = _indexed_output_path(output_wav, index, num_requests) + per_output_text = _indexed_output_path(output_text, index, num_requests) + per_delta_dir = None + if delta_dump_dir is not None: + per_delta_dir = delta_dump_dir / f"req_{index:02d}" + async with sem: + try: + await run_client( + url=url, + model=model, + input_wav=input_wav, + output_wav=per_output_wav, + output_text=per_output_text, + chunk_ms=chunk_ms, + send_delay_ms=send_delay_ms, + delta_dump_dir=per_delta_dir, + request_idx=index, + total_requests=num_requests, + ) + return index, True, None + except Exception as exc: + return index, False, str(exc) + + tasks = [asyncio.create_task(_run_one(i), name=f"rt-client-{i}") for i in range(1, num_requests + 1)] + results = await asyncio.gather(*tasks) + + failed = [(idx, err) for idx, ok, err in results if not ok] + succeeded = num_requests - len(failed) + print(f"[summary] succeeded={succeeded}, failed={len(failed)}, total={num_requests}") + if failed: + for idx, err in failed: + 
print(f"[summary] req {idx:02d} failed: {err}") + raise RuntimeError(f"{len(failed)} concurrent request(s) failed") + + +def main() -> None: + parser = argparse.ArgumentParser(description="Realtime audio/text client for vLLM-Omni") + parser.add_argument("--url", default="ws://localhost:8091/v1/realtime", help="WebSocket URL") parser.add_argument( "--model", - type=str, default="Qwen/Qwen3-Omni-30B-A3B-Instruct", - help="Model that is served and should be pinged.", + help="Model name for session.update", ) + parser.add_argument("--input-wav", required=True, type=Path, help="Input WAV (mono, PCM16, 16kHz)") + parser.add_argument("--output-wav", default=Path("realtime_output.wav"), type=Path, help="Output WAV path") parser.add_argument( - "--audio_path", - type=str, + "--output-text", default=None, - help="Path to the audio file to transcribe.", + type=Path, + help="Optional output text path for final transcription", ) + parser.add_argument("--chunk-ms", type=int, default=200, help="Input chunk size in milliseconds") parser.add_argument( - "--host", - type=str, - default="localhost", - help="vLLM-Omni server host (default: localhost)", + "--send-delay-ms", + type=int, + default=0, + help="Delay between chunk sends; set >0 to simulate realtime upload", ) parser.add_argument( - "--port", + "--delta-dump-dir", + type=Path, + default=None, + help="If set, each response.audio.delta is saved as delta_NNNNNN.wav under this directory", + ) + parser.add_argument("--num-requests", type=int, default=1, help="Total number of requests to send") + parser.add_argument( + "--concurrency", type=int, - default=8000, - help="vLLM-Omni server port (default: 8000)", + default=1, + help="Maximum number of concurrent websocket requests", ) args = parser.parse_args() - main(args) + + if args.num_requests <= 0: + raise ValueError("--num-requests must be >= 1") + if args.concurrency <= 0: + raise ValueError("--concurrency must be >= 1") + concurrency = min(args.concurrency, args.num_requests) 
+ + if args.num_requests == 1: + asyncio.run( + run_client( + url=args.url, + model=args.model, + input_wav=args.input_wav, + output_wav=args.output_wav, + output_text=args.output_text, + chunk_ms=args.chunk_ms, + send_delay_ms=args.send_delay_ms, + delta_dump_dir=args.delta_dump_dir, + ) + ) + else: + asyncio.run( + run_clients_concurrent( + url=args.url, + model=args.model, + input_wav=args.input_wav, + output_wav=args.output_wav, + output_text=args.output_text, + chunk_ms=args.chunk_ms, + send_delay_ms=args.send_delay_ms, + delta_dump_dir=args.delta_dump_dir, + num_requests=args.num_requests, + concurrency=concurrency, + ) + ) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/qwen3_tts/README.md b/examples/online_serving/qwen3_tts/README.md index b48db9cf453..350fcb71cac 100644 --- a/examples/online_serving/qwen3_tts/README.md +++ b/examples/online_serving/qwen3_tts/README.md @@ -43,7 +43,7 @@ Then open http://localhost:7860 in your browser. ### Launch the Server -The default stage config is located at `vllm_omni/model_executor/stage_configs/qwen3_tts.yaml`. For other platforms (e.g., NPU), refer to `vllm_omni/platforms/npu/stage_configs/qwen3_tts.yaml`. +The default deploy config is located at `vllm_omni/deploy/qwen3_tts.yaml` and is loaded automatically by the model registry — no `--deploy-config` flag needed for default use. Platform-specific deltas (NPU, ROCm, XPU) are merged in automatically from the `platforms:` block of the same YAML based on the detected runtime. ```bash # CustomVoice model (predefined speakers) @@ -70,6 +70,22 @@ vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ --port 8091 ``` +#### Sync vs async-chunk mode + +Qwen3-TTS supports both **chunked streaming** (default, lower latency) and +**synchronous end-to-end** modes from the same deploy YAML. 
The bundled +`qwen3_tts.yaml` ships with `async_chunk: true`; flip with `--no-async-chunk` +and the pipeline automatically dispatches to the end-to-end codec processor: + +```bash +vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice --omni --port 8091 \ + --no-async-chunk +``` + +No variant YAML or extra flag is needed — the `StagePipelineConfig` on each +stage declares both processor functions and the runtime picks based on the +`async_chunk:` bool. + Alternatively, use the convenience script: ```bash ./run_server.sh # Default: CustomVoice model diff --git a/examples/online_serving/qwen3_tts/batch_speech_client.py b/examples/online_serving/qwen3_tts/batch_speech_client.py index 7d48e650f88..47fdc3691c7 100644 --- a/examples/online_serving/qwen3_tts/batch_speech_client.py +++ b/examples/online_serving/qwen3_tts/batch_speech_client.py @@ -5,11 +5,13 @@ batch level and generate many utterances in the cloned voice without repeating the reference for each item. -Start the server (with batch-optimized config for best throughput): +Start the server (with batch-optimized stage settings for best throughput): vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml \ - --trust-remote-code + --omni \ + --trust-remote-code \ + --stage-overrides '{"0":{"max_num_seqs":4,"gpu_memory_utilization":0.2}, + "1":{"max_num_seqs":4,"gpu_memory_utilization":0.2}}' Examples: # Batch with a predefined voice diff --git a/examples/online_serving/qwen3_tts/run_gradio_demo.sh b/examples/online_serving/qwen3_tts/run_gradio_demo.sh index bcc0ddb7cf5..d79be3c2abd 100644 --- a/examples/online_serving/qwen3_tts/run_gradio_demo.sh +++ b/examples/online_serving/qwen3_tts/run_gradio_demo.sh @@ -127,7 +127,7 @@ echo "Starting vLLM server..." 
LOG_FILE="/tmp/vllm_tts_server_${SERVER_PORT}.log" vllm-omni serve "$MODEL" \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --host "$SERVER_HOST" \ --port "$SERVER_PORT" \ --gpu-memory-utilization 0.9 \ diff --git a/examples/online_serving/qwen3_tts/run_server.sh b/examples/online_serving/qwen3_tts/run_server.sh index 6f4aa83a0b9..78dd2c305d3 100755 --- a/examples/online_serving/qwen3_tts/run_server.sh +++ b/examples/online_serving/qwen3_tts/run_server.sh @@ -31,7 +31,7 @@ esac echo "Starting Qwen3-TTS server with model: $MODEL" vllm-omni serve "$MODEL" \ - --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts.yaml \ + --deploy-config vllm_omni/deploy/qwen3_tts.yaml \ --host 0.0.0.0 \ --port 8091 \ --gpu-memory-utilization 0.9 \ diff --git a/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py b/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py index 38a2bdea929..7790fa51276 100644 --- a/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py +++ b/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py @@ -5,7 +5,7 @@ using SLERP and sends the result to the /v1/audio/speech API. 
Requirements: - pip install torch resampy soundfile numpy httpx + pip install torch soundfile numpy httpx Examples: # Extract and save an embedding @@ -143,11 +143,12 @@ def _load_speaker_encoder_weights(encoder: torch.nn.Module, model_path: str) -> def compute_mel_spectrogram(audio: np.ndarray, sr: int = 24000) -> torch.Tensor: """Compute 128-bin mel spectrogram matching Qwen3-TTS's extraction pipeline.""" - from vllm.multimodal.audio import resample_audio_resampy + from vllm.multimodal.audio import AudioResampler # Resample to 24kHz if needed if sr != 24000: - audio = resample_audio_resampy(audio.astype(np.float32), orig_sr=sr, target_sr=24000) + resampler = AudioResampler(target_sr=24000) + audio = resampler.resample(audio.astype(np.float32), orig_sr=sr) y = torch.from_numpy(audio).unsqueeze(0).float() diff --git a/recipes/Qwen/Qwen3-Omni.md b/recipes/Qwen/Qwen3-Omni.md new file mode 100644 index 00000000000..081e1453d37 --- /dev/null +++ b/recipes/Qwen/Qwen3-Omni.md @@ -0,0 +1,90 @@ +# Qwen3-Omni for multimodal chat on 1x A100 80GB + +## Summary + +- Vendor: Qwen +- Model: `Qwen/Qwen3-Omni-30B-A3B-Instruct` +- Task: Multimodal chat with text, image, audio, or video input +- Mode: Online serving with the OpenAI-compatible API +- Maintainer: Community + +## When to use this recipe + +Use this recipe when you want a known-good starting point for serving +`Qwen/Qwen3-Omni-30B-A3B-Instruct` with vLLM-Omni on a single 80 GB A100 and +validating the deployment with the existing multimodal client examples in this +repository.
+ +## References + +- Upstream or canonical docs: + [`docs/user_guide/examples/online_serving/qwen3_omni.md`](../../docs/user_guide/examples/online_serving/qwen3_omni.md) +- Related example under `examples/`: + [`examples/online_serving/qwen3_omni/README.md`](../../examples/online_serving/qwen3_omni/README.md) +- Related issue or discussion: + [RFC: add recipes folder](https://github.com/vllm-project/vllm-omni/issues/2645) + +## Hardware Support + +This recipe currently documents one tested-style reference configuration for +CUDA GPU serving. Add more sections for other hardware as community validation +lands. + +## GPU + +### 1x A100 80GB + +#### Environment + +- OS: Linux +- Python: 3.10+ +- Driver / runtime: NVIDIA CUDA environment with an A100 80 GB GPU +- vLLM version: Match the repository requirements for your checkout +- vLLM-Omni version or commit: Use the commit you are deploying from + +#### Command + +Start the server from the repository root: + +```bash +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 +``` + +To enable async chunking, use the bundled stage config: + +```bash +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct \ + --omni \ + --port 8091 \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml +``` + +#### Verification + +Run one of the existing example clients after the server is ready: + +```bash +python examples/online_serving/openai_chat_completion_client_for_multimodal_generation.py \ + --model Qwen/Qwen3-Omni-30B-A3B-Instruct \ + --query-type use_image \ + --port 8091 \ + --host localhost +``` + +For a quick API smoke test, request text-only output: + +```bash +curl http://localhost:8091/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", + "messages": [{"role": "user", "content": "Describe vLLM in brief."}], + "modalities": ["text"] + }' +``` + +#### Notes + +- Memory usage: Size depends on runtime options and output 
modalities; leave headroom for multimodal workloads. +- Key flags: `--omni` is required; `--stage-configs-path` is optional for custom or async-chunk stage configs. +- Known limitations: This starter recipe is intentionally narrow and focuses on the single-GPU online-serving path already documented in the repo examples. diff --git a/recipes/README.md b/recipes/README.md new file mode 100644 index 00000000000..5b3dfb5430b --- /dev/null +++ b/recipes/README.md @@ -0,0 +1,35 @@ +# Community Recipes + +This directory contains community-maintained recipes for answering a +practical user question: + +> How do I run model X on hardware Y for task Z? + +Add recipes for this repository under this in-repo `recipes/` directory. To +keep naming and layout consistent, organize recipes by model vendor in a way +that is aligned with +[`vllm-project/recipes`](https://github.com/vllm-project/recipes), but treat +that external repository as a reference for structure rather than the place to +add files for this repo. Use one Markdown file per model family by default. + +Example layout: + +```text +recipes/ + Qwen/ + Qwen3-Omni.md + Qwen3-TTS.md + Tencent-Hunyuan/ + HunyuanVideo.md +``` + +## Available Recipes + +- [`Qwen/Qwen3-Omni.md`](./Qwen/Qwen3-Omni.md): online serving recipe for + multimodal chat on `1x A100 80GB` + +Within a single recipe file, include different hardware support sections such +as `GPU`, `ROCm`, and `NPU`, and add concrete tested configurations like +`1x A100 80GB` or `2x L40S` inside those sections when applicable. + +See [TEMPLATE.md](./TEMPLATE.md) for the recommended format. 
diff --git a/recipes/TEMPLATE.md b/recipes/TEMPLATE.md new file mode 100644 index 00000000000..9bf8cb9c759 --- /dev/null +++ b/recipes/TEMPLATE.md @@ -0,0 +1,82 @@ +# Recipe Title + +> Example: Qwen3-Omni for speech chat on 1x A100 80GB + +## Summary + +- Vendor: +- Model: +- Task: +- Mode: +- Maintainer: + +## When to use this recipe + +Briefly describe the concrete scenario this recipe covers. + +## References + +- Upstream or canonical docs: +- Related example under `examples/`: +- Related issue or discussion: + +## Hardware Support + +Add one section per platform, such as `GPU`, `ROCm`, or `NPU`. Under each +platform section, document one or more tested hardware configurations. + +## GPU + +### 1x A100 80GB + +#### Environment + +- OS: +- Python: +- Driver / runtime: +- vLLM version: +- vLLM-Omni version or commit: + +#### Command + +```bash +# Add the exact command(s) here +``` + +#### Verification + +```bash +# Add a quick validation command or expected output here +``` + +#### Notes + +- Memory usage: +- Key flags: +- Known limitations: + +### 2x L40S + +Repeat the same structure for other hardware setups as needed. 
+ +## ROCm + +### Example hardware configuration + +Repeat the same nested structure for ROCm setups as needed: + +- `#### Environment` +- `#### Command` +- `#### Verification` +- `#### Notes` + +## NPU + +### Example hardware configuration + +Repeat the same nested structure for NPU setups as needed: + +- `#### Environment` +- `#### Command` +- `#### Verification` +- `#### Notes` diff --git a/requirements/common.txt b/requirements/common.txt index 1f44d343c62..63e16d580ff 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -1,7 +1,6 @@ # Common dependencies for all platforms av>=14.0.0 omegaconf>=2.3.0 -resampy>=0.4.3 diffusers>=0.36.0 accelerate==1.12.0 soundfile>=0.13.1 diff --git a/tests/comfyui/test_comfyui_integration.py b/tests/comfyui/test_comfyui_integration.py index 80e86d82412..5164f3b9acb 100644 --- a/tests/comfyui/test_comfyui_integration.py +++ b/tests/comfyui/test_comfyui_integration.py @@ -523,6 +523,7 @@ def run_server(): "Qwen/Qwen-Image-Edit", True, id="image-to-image-dalle-endpoint", + marks=pytest.mark.skip(reason="Temporarily disabled due to failure."), ), pytest.param( ServerCase( diff --git a/tests/config/__init__.py b/tests/config/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/config/test_pipeline_registry.py b/tests/config/test_pipeline_registry.py new file mode 100644 index 00000000000..3483d530c63 --- /dev/null +++ b/tests/config/test_pipeline_registry.py @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for the central pipeline registry (2.5/N).""" + +from __future__ import annotations + +import pytest + +from vllm_omni.config.pipeline_registry import ( + _DIFFUSION_PIPELINES, + _OMNI_PIPELINES, + _VLLM_OMNI_PIPELINES, +) +from vllm_omni.config.stage_config import ( + _PIPELINE_REGISTRY, + PipelineConfig, + StageExecutionType, + StagePipelineConfig, + register_pipeline, +) + + +class 
TestCentralRegistryDeclarations: + """Every in-tree pipeline must be declared exactly once in the central registry.""" + + def test_union_contains_all_omni(self): + for key in _OMNI_PIPELINES: + assert key in _VLLM_OMNI_PIPELINES + + def test_union_contains_all_diffusion(self): + for key in _DIFFUSION_PIPELINES: + assert key in _VLLM_OMNI_PIPELINES + + def test_no_duplicate_model_type_between_omni_and_diffusion(self): + overlap = set(_OMNI_PIPELINES) & set(_DIFFUSION_PIPELINES) + assert not overlap, f"Duplicate model_types across omni/diffusion: {overlap}" + + def test_expected_omni_pipelines_present(self): + # Guard against accidental removal during future refactors. + assert "qwen2_5_omni" in _OMNI_PIPELINES + assert "qwen2_5_omni_thinker_only" in _OMNI_PIPELINES + assert "qwen3_omni_moe" in _OMNI_PIPELINES + assert "qwen3_tts" in _OMNI_PIPELINES + + +class TestLazyLoading: + """Pipelines are imported only on first access.""" + + def test_contains_without_import(self): + # ``in`` hits the lazy map, not the loaded cache. 
+ assert "qwen3_omni_moe" in _PIPELINE_REGISTRY + + def test_getitem_loads_correct_pipeline(self): + pipeline = _PIPELINE_REGISTRY["qwen3_omni_moe"] + assert pipeline.model_type == "qwen3_omni_moe" + assert pipeline.model_arch == "Qwen3OmniMoeForConditionalGeneration" + + def test_unknown_model_type_returns_none_via_get(self): + assert _PIPELINE_REGISTRY.get("not_a_real_pipeline") is None + + def test_unknown_model_type_raises_keyerror_via_getitem(self): + with pytest.raises(KeyError): + _PIPELINE_REGISTRY["not_a_real_pipeline"] + + def test_iteration_yields_registered_pipelines(self): + keys = set(_PIPELINE_REGISTRY) + assert "qwen2_5_omni" in keys + assert "qwen3_omni_moe" in keys + + +class TestDynamicRegistration: + """``register_pipeline()`` still works for plugins and tests.""" + + def test_register_adds_to_registry(self): + custom = PipelineConfig( + model_type="_test_dynamic_registration", + model_arch="DynamicTestModel", + stages=( + StagePipelineConfig( + stage_id=0, + model_stage="test", + execution_type=StageExecutionType.LLM_AR, + input_sources=(), + final_output=True, + ), + ), + ) + register_pipeline(custom) + try: + assert "_test_dynamic_registration" in _PIPELINE_REGISTRY + assert _PIPELINE_REGISTRY["_test_dynamic_registration"] is custom + finally: + # Don't leak the test registration into other tests. + if "_test_dynamic_registration" in _PIPELINE_REGISTRY: + del _PIPELINE_REGISTRY["_test_dynamic_registration"] + + def test_dynamic_registration_overrides_lazy_entry(self): + # Build a substitute for qwen3_omni_moe that we can distinguish. + original = _PIPELINE_REGISTRY["qwen3_omni_moe"] + override = PipelineConfig( + model_type="qwen3_omni_moe", + model_arch="OverriddenArch", + stages=original.stages, + ) + register_pipeline(override) + try: + assert _PIPELINE_REGISTRY["qwen3_omni_moe"].model_arch == "OverriddenArch" + finally: + # Remove the dynamic override so later tests see the original. 
+ if "qwen3_omni_moe" in _PIPELINE_REGISTRY._loaded: + del _PIPELINE_REGISTRY["qwen3_omni_moe"] diff --git a/tests/conftest.py b/tests/conftest.py index ad1008b7263..77075f9525a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3429 +1,62 @@ -import atexit -import base64 -import datetime -import io -import json -import math -import os -import random -import re -import tempfile - -import requests - -os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -# Set CPU device for CI environments without GPU -if "VLLM_TARGET_DEVICE" not in os.environ: - os.environ["VLLM_TARGET_DEVICE"] = "cpu" - -import concurrent.futures -import contextlib -import gc -import multiprocessing -import socket -import subprocess -import sys -import threading -import time -import uuid -from collections.abc import Generator -from dataclasses import dataclass -from io import BytesIO -from pathlib import Path -from typing import Any, NamedTuple - -import cv2 -import numpy as np -import psutil -import pytest -import soundfile as sf -import torch -import yaml -from openai import OpenAI, omit -from PIL import Image -from transformers import pipeline -from vllm import TextPrompt -from vllm.distributed.parallel_state import cleanup_dist_env_and_memory -from vllm.logger import init_logger -from vllm.utils.network_utils import get_open_port - -from vllm_omni.entrypoints.omni import Omni -from vllm_omni.inputs.data import OmniSamplingParams -from vllm_omni.outputs import OmniRequestOutput -from vllm_omni.platforms import current_omni_platform - -logger = init_logger(__name__) - - -PromptAudioInput = list[tuple[Any, int]] | tuple[Any, int] | None -PromptImageInput = list[Any] | Any | None -PromptVideoInput = list[Any] | Any | None - -_GENDER_PIPELINE = None -# transformers.Pipeline is not thread-safe; concurrent e2e requests must serialize inference. -_GENDER_PIPELINE_LOCK = threading.Lock() - -# int16 mono PCM from /v1/audio/speech when response_format=pcm (Qwen3-TTS code2wav output rate). 
-_PCM_SPEECH_SAMPLE_RATE_HZ = 24_000 - - -class OmniServerParams(NamedTuple): - model: str - port: int | None = None - stage_config_path: str | None = None - server_args: list[str] | None = None - env_dict: dict[str, str] | None = None - use_omni: bool = True - use_stage_cli: bool = False - init_timeout: int | None = None - stage_init_timeout: int | None = None # None defers to the server's own default (300 s) - - -def assert_image_diffusion_response( - response, - request_config: dict[str, Any], - run_level: str = None, -) -> None: - """ - Validate image diffusion response. - - Expected request_config schema: - { - "request_type": "image", - "extra_body": { - "num_outputs_per_prompt": 1, - "width": ..., - "height": ..., - ... - } - } - """ - assert response.images is not None, "Image response is None" - assert len(response.images) > 0, "No images in response" - - extra_body = request_config.get("extra_body") or {} - - num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt") - if num_outputs_per_prompt is not None: - assert len(response.images) == num_outputs_per_prompt, ( - f"Expected {num_outputs_per_prompt} images, got {len(response.images)}" - ) - - if run_level == "advanced_model": - width = extra_body.get("width") - height = extra_body.get("height") - - if width is not None or height is not None: - for img in response.images: - assert_image_valid(img, width=width, height=height) - - -def assert_video_diffusion_response( - response, - request_config: dict[str, Any], - run_level: str = None, -) -> None: - """ - Validate video diffusion response. - - Expected request_config schema: - { - "request_type": "video", - "form_data": { - "prompt": "...", - "num_frames": ..., - "width": ..., - "height": ..., - "fps": ..., - ... 
- } - } - """ - form_data = request_config.get("form_data", {}) - - assert response.videos is not None, "Video response is None" - assert len(response.videos) > 0, "No videos in response" - - expected_frames = _maybe_int(form_data.get("num_frames")) - expected_width = _maybe_int(form_data.get("width")) - expected_height = _maybe_int(form_data.get("height")) - expected_fps = _maybe_int(form_data.get("fps")) - - for vid_bytes in response.videos: - assert_video_valid( - vid_bytes, - num_frames=expected_frames, - width=expected_width, - height=expected_height, - fps=expected_fps, - ) - - -def assert_audio_diffusion_response( - response, - request_config: dict[str, Any], - run_level: str = None, -) -> None: - """ - Validate audio diffusion response. - """ - raise NotImplementedError("Audio validation is not implemented yet") - - -def _maybe_int(value: Any) -> int | None: - if value is None: - return None - return int(value) - - -def assert_image_valid(image: Path | Image.Image, *, width: int | None = None, height: int | None = None): - """Assert the file is a loadable image with optional exact dimensions.""" - if isinstance(image, Path): - assert image.exists(), f"Image not found: {image}" - image = Image.open(image) - image.load() - assert image.width > 0 and image.height > 0 - if width is not None: - assert image.width == width, f"Expected width={width}, got {image.width}" - if height is not None: - assert image.height == height, f"Expected height={height}, got {image.height}" - return image - - -def assert_video_valid( - video: Path | bytes | BytesIO, - *, - num_frames: int | None = None, - width: int | None = None, - height: int | None = None, - fps: float | None = None, -) -> dict[str, int | float]: - """Assert the MP4 has the expected resolution and exact frame count.""" - temp_path = None - cap = None - try: - # Normalize input to file path - if isinstance(video, Path): - if not video.exists(): - raise AssertionError(f"Video file not found: {video}") - video_path 
= str(video) - else: - # Create temp file for bytes/BytesIO - suffix = ".mp4" - with tempfile.NamedTemporaryFile(delete=False, suffix=suffix, mode="wb") as tmp: - if isinstance(video, bytes): - tmp.write(video) - elif isinstance(video, BytesIO): - tmp.write(video.getvalue()) - else: - raise TypeError(f"Unsupported video type: {type(video)}") - temp_path = Path(tmp.name) - video_path = str(temp_path) - - # Open video capture - cap = cv2.VideoCapture(video_path) - if not cap.isOpened(): - raise AssertionError(f"Failed to open video: {video_path}") - - # Extract properties - actual_num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - actual_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - actual_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - actual_fps = cap.get(cv2.CAP_PROP_FPS) - - actual_num_frames = 0 - while True: - ok, _frame = cap.read() - if not ok: - break - actual_num_frames += 1 - - # Basic validity checks - if actual_num_frames <= 0: - raise AssertionError(f"Invalid frame count: {actual_num_frames} (must be > 0)") - if actual_width <= 0 or actual_height <= 0: - raise AssertionError(f"Invalid dimensions: {actual_width}x{actual_height} (must be > 0)") - if actual_fps <= 0: - raise AssertionError(f"Invalid FPS: {actual_fps} (must be > 0)") - - # Validate against expectations - if num_frames is not None: - expected_num_frames = (num_frames // 4) * 4 + 1 - assert actual_num_frames == expected_num_frames, ( - f"Frame count mismatch: expected {num_frames}, got {actual_num_frames}" - ) - if width is not None: - assert actual_width == width, f"Width mismatch: expected {width}px, got {actual_width}px" - if height is not None: - assert actual_height == height, f"Height mismatch: expected {height}px, got {actual_height}px" - if fps is not None: - # Use tolerance for float comparison (codec rounding) - assert abs(actual_fps - fps) < 0.5, f"FPS mismatch: expected {fps}, got {actual_fps:.2f}" - - return {"num_frames": actual_num_frames, "width": actual_width, 
"height": actual_height, "fps": actual_fps} - - except Exception as e: - print(f"ERROR: {type(e).__name__}: {e}", flush=True) - raise - - finally: - # Cleanup resources - if cap is not None: - cap.release() - if temp_path and temp_path.exists(): - try: - temp_path.unlink() - except OSError: - pass - - -def assert_audio_valid( - audio_or_path: Path | np.ndarray, - *, - sample_rate: int, - channels: int, - duration_s: float, -) -> None: - """Assert WAV file or (batch, channels, samples) ndarray matches expected audio format.""" - expected_samples = int(duration_s * sample_rate) - if isinstance(audio_or_path, np.ndarray): - audio = audio_or_path - assert audio.ndim == 3, f"Expected audio ndim=3 (batch, channels, samples), got shape {audio.shape}" - assert audio.shape[0] == 1, f"Expected batch size 1, got {audio.shape[0]}" - assert audio.shape[1] == channels, f"Expected {channels} channels, got {audio.shape[1]}" - assert audio.shape[2] == expected_samples, ( - f"Expected {expected_samples} samples ({duration_s}s @ {sample_rate} Hz), got {audio.shape[2]}" - ) - return - - path = audio_or_path - assert path.exists(), f"Audio not found: {path}" - info = sf.info(str(path)) - assert info.samplerate == sample_rate, f"Expected sample_rate={sample_rate}, got {info.samplerate}" - assert info.channels == channels, f"Expected {channels} channel(s), got {info.channels}" - assert info.frames == expected_samples, ( - f"Expected {expected_samples} frames ({duration_s}s @ {sample_rate} Hz), got {info.frames}" - ) - - -def decode_b64_image(b64: str): - img = Image.open(BytesIO(base64.b64decode(b64))) - img.load() - return img - - -@pytest.fixture(scope="session") -def model_prefix() -> str: - """Optional model-path prefix from MODEL_PREFIX env var. - Useful if models are downloaded to non-default local directories. 
- """ - prefix = os.environ.get("MODEL_PREFIX", "") - return f"{prefix.rstrip('/')}/" if prefix else "" - - -@pytest.fixture(autouse=True) -def default_vllm_config(): - """Set a default VllmConfig for all tests. - - This fixture is auto-used for all tests to ensure that any test - that directly instantiates vLLM CustomOps (e.g., RMSNorm, LayerNorm) - or model components has the required VllmConfig context. - - This fixture is required for vLLM 0.14.0+ where CustomOp initialization - requires a VllmConfig context set via set_current_vllm_config(). - """ - from vllm.config import DeviceConfig, VllmConfig, set_current_vllm_config - - # Use CPU device if no GPU is available (e.g., in CI environments) - has_gpu = torch.cuda.is_available() and torch.cuda.device_count() > 0 - device = "cuda" if has_gpu else "cpu" - device_config = DeviceConfig(device=device) - - with set_current_vllm_config(VllmConfig(device_config=device_config)): - yield - - -@pytest.fixture(autouse=True) -def clean_gpu_memory_between_tests(): - print("\n=== PRE-TEST GPU CLEANUP ===") - _run_pre_test_cleanup() - yield - _run_post_test_cleanup() - - -@pytest.fixture(autouse=True) -def log_test_name_before_test(request): - print(f"--- Running test: {request.node.name}") - yield - - -def _run_pre_test_cleanup(enable_force=False): - if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1" and not enable_force: - print("\nPre-test GPU cleanup skipped(Default off is typical when one worker/instance runs many tests.)\n") - return - - print("\nPre-test GPU status:") - - num_gpus = torch.cuda.device_count() - if num_gpus > 0: - try: - from tests.utils import wait_for_gpu_memory_to_clear - - wait_for_gpu_memory_to_clear( - devices=list(range(num_gpus)), - threshold_ratio=0.05, - ) - except Exception as e: - print(f"Pre-test cleanup note: {e}") - - -def _run_post_test_cleanup(enable_force=False): - if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1" and not enable_force: - print("GPU cleanup disabled") - return - 
- if torch.cuda.is_available(): - gc.collect() - torch.cuda.empty_cache() - - print("Post-test GPU status:") - _print_gpu_processes() - - -def _print_gpu_processes(): - """Print GPU information including nvidia-smi and system processes""" - - print("\n" + "=" * 80) - print("NVIDIA GPU Information (nvidia-smi)") - print("=" * 80) - - try: - nvidia_result = subprocess.run( - ["nvidia-smi"], - capture_output=True, - text=True, - timeout=5, - ) - - if nvidia_result.returncode == 0: - lines = nvidia_result.stdout.strip().split("\n") - for line in lines[:20]: - print(line) - - if len(lines) > 20: - print(f"... (showing first 20 of {len(lines)} lines)") - else: - print("nvidia-smi command failed") - - except (subprocess.TimeoutExpired, FileNotFoundError): - print("nvidia-smi not available or timed out") - except Exception as e: - print(f"Error running nvidia-smi: {e}") - - print("\n" + "=" * 80) - print("Detailed GPU Processes (nvidia-smi pmon)") - print("=" * 80) - - try: - pmon_result = subprocess.run( - ["nvidia-smi", "pmon", "-c", "1"], - capture_output=True, - text=True, - timeout=3, - ) - - if pmon_result.returncode == 0 and pmon_result.stdout.strip(): - print(pmon_result.stdout) - else: - print("No active GPU processes found via nvidia-smi pmon") - - except Exception: - print("nvidia-smi pmon not available") - - print("\n" + "=" * 80) - print("System Processes with GPU keywords") - print("=" * 80) - - -def dummy_messages_from_mix_data( - system_prompt: dict[str, Any] = None, - video_data_url: Any = None, - audio_data_url: Any = None, - image_data_url: Any = None, - content_text: str = None, -): - """Create messages with video、image、audio data URL for OpenAI API.""" - - if content_text is not None: - content = [{"type": "text", "text": content_text}] - else: - content = [] - - media_items = [] - if isinstance(video_data_url, list): - for video_url in video_data_url: - media_items.append((video_url, "video")) - else: - media_items.append((video_data_url, "video")) - 
- if isinstance(image_data_url, list): - for url in image_data_url: - media_items.append((url, "image")) - else: - media_items.append((image_data_url, "image")) - - if isinstance(audio_data_url, list): - for url in audio_data_url: - media_items.append((url, "audio")) - else: - media_items.append((audio_data_url, "audio")) - - content.extend( - {"type": f"{media_type}_url", f"{media_type}_url": {"url": url}} - for url, media_type in media_items - if url is not None - ) - messages = [{"role": "user", "content": content}] - if system_prompt is not None: - messages = [system_prompt] + messages - return messages - - -def generate_synthetic_audio( - duration: int, # seconds - num_channels: int, # 1:Mono,2:Stereo 5:5.1 surround sound - sample_rate: int = 48000, # Default use 48000Hz. - save_to_file: bool = False, -) -> dict[str, Any]: - """ - Generate TTS speech with pyttsx3 and return base64 string. - """ - - import pyttsx3 - import soundfile as sf - - def _pick_voice(engine: pyttsx3.Engine) -> str | None: - voices = engine.getProperty("voices") - if not voices: - return None - - preferred_tokens = ( - "natural", - "jenny", - "sonia", - "susan", - "zira", - "aria", - "hazel", - "samantha", - "ava", - "allison", - "female", - "woman", - "english-us", - "en-us", - "english", - ) - discouraged_tokens = ( - "espeak", - "robot", - "mbrola", - "microsoft david", - "male", - "man", - ) - - best_voice = voices[0] - best_score = float("-inf") - for voice in voices: - voice_text = f"{getattr(voice, 'id', '')} {getattr(voice, 'name', '')}".lower() - voice_languages = " ".join( - lang.decode(errors="ignore") if isinstance(lang, bytes) else str(lang) - for lang in getattr(voice, "languages", []) - ).lower() - combined_text = f"{voice_text} {voice_languages}" - score = 0 - for idx, token in enumerate(preferred_tokens): - if token in combined_text: - score += 20 - idx - for token in discouraged_tokens: - if token in combined_text: - score -= 10 - if "english" in combined_text or "en_" 
in combined_text or "en-" in combined_text: - score += 4 - if "en-us" in combined_text or "english-us" in combined_text: - score += 4 - if score > best_score: - best_score = score - best_voice = voice - - return best_voice.id - - def _resample_audio(audio: np.ndarray, src_sr: int, dst_sr: int) -> np.ndarray: - if src_sr == dst_sr or len(audio) == 0: - return audio.astype(np.float32) - - src_len = audio.shape[0] - dst_len = max(1, int(round(src_len * float(dst_sr) / float(src_sr)))) - src_idx = np.arange(src_len, dtype=np.float32) - dst_idx = np.linspace(0, src_len - 1, dst_len, dtype=np.float32) - - resampled_channels: list[np.ndarray] = [] - for ch in range(audio.shape[1]): - resampled_channels.append(np.interp(dst_idx, src_idx, audio[:, ch]).astype(np.float32)) - return np.stack(resampled_channels, axis=1) - - def _match_channels(audio: np.ndarray, target_channels: int) -> np.ndarray: - current_channels = audio.shape[1] - if current_channels == target_channels: - return audio.astype(np.float32) - if target_channels == 1: - return np.mean(audio, axis=1, keepdims=True, dtype=np.float32) - if current_channels == 1: - return np.repeat(audio, target_channels, axis=1).astype(np.float32) - - collapsed = np.mean(audio, axis=1, keepdims=True, dtype=np.float32) - return np.repeat(collapsed, target_channels, axis=1).astype(np.float32) - - def _trim_silence(audio: np.ndarray, threshold: float = 0.01) -> np.ndarray: - if len(audio) == 0: - return audio - energy = np.max(np.abs(audio), axis=1) - voiced = np.where(energy > threshold)[0] - if len(voiced) == 0: - return audio - start = max(0, int(voiced[0]) - int(sample_rate * 0.02)) - end = min(len(audio), int(voiced[-1]) + int(sample_rate * 0.04) + 1) - return audio[start:end] - - def _enhance_speech(audio: np.ndarray) -> np.ndarray: - if len(audio) == 0: - return audio.astype(np.float32) - enhanced = audio.astype(np.float32).copy() - enhanced -= np.mean(enhanced, axis=0, keepdims=True, dtype=np.float32) - if len(enhanced) > 1: 
- preemphasis = enhanced.copy() - preemphasis[1:] = enhanced[1:] - 0.94 * enhanced[:-1] - enhanced = 0.7 * enhanced + 0.3 * preemphasis - # Mild dynamic-range compression for ASR/TTS robustness. - enhanced = np.sign(enhanced) * np.sqrt(np.abs(enhanced)) - # Light fade to avoid clicks after trimming/repeating. - fade = min(len(enhanced) // 4, max(1, int(sample_rate * 0.01))) - if fade > 1: - ramp_in = np.linspace(0.0, 1.0, fade, dtype=np.float32) - ramp_out = np.linspace(1.0, 0.0, fade, dtype=np.float32) - enhanced[:fade] *= ramp_in[:, None] - enhanced[-fade:] *= ramp_out[:, None] - peak = float(np.max(np.abs(enhanced))) - if peak > 1e-8: - enhanced = enhanced / peak * 0.95 - return enhanced.astype(np.float32) - - phrase_text = "test" - num_samples = int(sample_rate * max(1, duration)) - audio_data = np.zeros((num_samples, num_channels), dtype=np.float32) - - engine = pyttsx3.init() - engine.setProperty("rate", 112) - engine.setProperty("volume", 1.0) - selected_voice = _pick_voice(engine) - if selected_voice is not None: - engine.setProperty("voice", selected_voice) - - temp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) - temp_wav.close() - - try: - engine.save_to_file(phrase_text, temp_wav.name) - engine.runAndWait() - engine.stop() - - ready = False - for _ in range(50): - if os.path.exists(temp_wav.name) and os.path.getsize(temp_wav.name) > 44: - ready = True - break - time.sleep(0.1) - - if not ready: - raise RuntimeError("pyttsx3 did not produce a WAV file in time.") - - tts_audio, tts_sr = sf.read(temp_wav.name, dtype="float32", always_2d=True) - finally: - if os.path.exists(temp_wav.name): - os.unlink(temp_wav.name) - - if len(tts_audio) == 0: - raise RuntimeError("pyttsx3 produced an empty WAV file.") - - tts_audio = _resample_audio(tts_audio, tts_sr, sample_rate) - tts_audio = _match_channels(tts_audio, num_channels) - tts_audio = _trim_silence(tts_audio, threshold=0.012) - tts_audio = _enhance_speech(tts_audio) - - lead_silence = 
min(int(sample_rate * 0.02), num_samples // 8) - pause_samples = int(sample_rate * 0.18) - start = lead_silence - phrase_len = tts_audio.shape[0] - - while start < num_samples: - take = min(phrase_len, num_samples - start) - audio_data[start : start + take] = tts_audio[:take] - start += phrase_len + pause_samples - - max_amp = float(np.max(np.abs(audio_data))) - if max_amp > 0: - audio_data = audio_data / max_amp * 0.95 - - audio_bytes: bytes | None = None - output_path: str | None = None - result: dict[str, Any] = { - "np_array": audio_data.copy(), - } - - if save_to_file: - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - output_path = f"audio_{num_channels}ch_{timestamp}.wav" - - try: - sf.write(output_path, audio_data, sample_rate, format="WAV", subtype="PCM_16") - print(f"Audio saved: {output_path}") - - with open(output_path, "rb") as f: - audio_bytes = f.read() - except Exception as e: - print(f"Save failed: {e}") - save_to_file = False - - # If not saving or save failed, create in memory - if not save_to_file or audio_bytes is None: - buffer = io.BytesIO() - sf.write(buffer, audio_data, sample_rate, format="WAV", subtype="PCM_16") - buffer.seek(0) - audio_bytes = buffer.read() - - # Return result - base64_audio = base64.b64encode(audio_bytes).decode("utf-8") - result["base64"] = base64_audio - # Always include file_path to avoid KeyError in callers. - result["file_path"] = output_path if save_to_file and output_path else None - - return result - - -def _mux_mp4_bytes_with_synthetic_audio( - video_mp4_bytes: bytes, - *, - num_frames: int, - fps: float = 30.0, - sample_rate: int = 48000, -) -> bytes: - """ - Mux a video-only MP4 with mono TTS audio from :func:`generate_synthetic_audio` (AAC). - - Audio length is at least the video duration in whole seconds (rounded up); ffmpeg - ``-shortest`` trims to the video when the WAV is longer. - - Uses ffmpeg from ``imageio_ffmpeg`` when available, else ``ffmpeg`` on PATH. 
- If TTS or mux fails, returns ``video_mp4_bytes`` unchanged. - - Mux subprocess does **not** use ``capture_output=True``: ffmpeg can block writing - to a full stderr pipe while :func:`subprocess.run` waits for exit (classic deadlock). - """ - duration_sec = num_frames / fps if fps > 0 else 0.0 - # generate_synthetic_audio(duration=int) uses at least 1s of buffer internally - duration_int = max(1, int(math.ceil(duration_sec))) - - try: - audio_result = generate_synthetic_audio( - duration=duration_int, - num_channels=1, - sample_rate=sample_rate, - save_to_file=False, - ) - audio_pcm = audio_result["np_array"] - except Exception as e: - logger.warning("Synthetic video: generate_synthetic_audio failed (%s); using video-only MP4.", e) - return video_mp4_bytes - - try: - import imageio_ffmpeg - - ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe() - except Exception: - ffmpeg_exe = "ffmpeg" - - import tempfile - - try: - with tempfile.TemporaryDirectory(prefix="syn_vid_mux_") as tmp: - vid_path = os.path.join(tmp, "video.mp4") - wav_path = os.path.join(tmp, "audio.wav") - out_path = os.path.join(tmp, "out.mp4") - with open(vid_path, "wb") as f: - f.write(video_mp4_bytes) - sf.write(wav_path, audio_pcm, sample_rate, format="WAV", subtype="PCM_16") - cmd = [ - ffmpeg_exe, - "-y", - "-nostdin", - "-hide_banner", - "-loglevel", - "error", - "-i", - vid_path, - "-i", - wav_path, - "-c:v", - "copy", - "-c:a", - "aac", - "-b:a", - "128k", - "-shortest", - "-movflags", - "+faststart", - out_path, - ] - subprocess.run( - cmd, - check=True, - stdin=subprocess.DEVNULL, - timeout=300, - ) - with open(out_path, "rb") as f: - return f.read() - except ( - FileNotFoundError, - subprocess.CalledProcessError, - subprocess.TimeoutExpired, - OSError, - ) as e: - logger.warning("Synthetic video: audio mux failed (%s); using video-only MP4.", e) - return video_mp4_bytes - - -def generate_synthetic_video( - width: int, - height: int, - num_frames: int, - save_to_file: bool = False, - *, - 
embed_audio: bool = False, -) -> dict[str, Any]: - """Generate synthetic video with bouncing balls and base64 MP4. - - When ``embed_audio`` is True, muxes mono AAC from :func:`generate_synthetic_audio` - (TTS + ffmpeg) into the MP4; otherwise returns video-only MP4 (faster when tests do - not need an audio track). - """ - - import cv2 - import imageio - - # Create random balls - num_balls = random.randint(3, 8) - balls = [] - - for _ in range(num_balls): - radius = min(width, height) // 8 - if radius < 1: - raise ValueError(f"Video dimensions ({width}x{height}) are too small for synthetic video generation") - x = random.randint(radius, width - radius) - y = random.randint(radius, height - radius) - - speed = random.uniform(3.0, 8.0) - angle = random.uniform(0, 2 * math.pi) - vx = speed * math.cos(angle) - vy = speed * math.sin(angle) - - # OpenCV uses BGR format, but imageio expects RGB - # We'll create in BGR first, then convert to RGB later - color_bgr = (random.randint(50, 255), random.randint(50, 255), random.randint(50, 255)) - - balls.append({"x": x, "y": y, "vx": vx, "vy": vy, "radius": radius, "color_bgr": color_bgr}) - - # Generate video frames - video_frames = [] - - for frame_idx in range(num_frames): - # Create black background (BGR format) - frame_bgr = np.zeros((height, width, 3), dtype=np.uint8) - - for ball in balls: - # Update position - ball["x"] += ball["vx"] - ball["y"] += ball["vy"] - - # Boundary collision detection - if ball["x"] - ball["radius"] <= 0 or ball["x"] + ball["radius"] >= width: - ball["vx"] = -ball["vx"] - ball["x"] = max(ball["radius"], min(width - ball["radius"], ball["x"])) - - if ball["y"] - ball["radius"] <= 0 or ball["y"] + ball["radius"] >= height: - ball["vy"] = -ball["vy"] - ball["y"] = max(ball["radius"], min(height - ball["radius"], ball["y"])) - - # Use cv2 to draw circle - x, y = int(ball["x"]), int(ball["y"]) - radius = ball["radius"] - - # Draw solid circle (main circle) - cv2.circle(frame_bgr, (x, y), radius, 
ball["color_bgr"], -1) - - # Add simple 3D effect: draw a brighter center - if radius > 3: # Only add highlight when radius is large enough - highlight_radius = max(1, radius // 2) - highlight_x = max(highlight_radius, min(x - radius // 4, width - highlight_radius)) - highlight_y = max(highlight_radius, min(y - radius // 4, height - highlight_radius)) - - # Create highlight color (brighter) - highlight_color = tuple(min(c + 40, 255) for c in ball["color_bgr"]) - cv2.circle(frame_bgr, (highlight_x, highlight_y), highlight_radius, highlight_color, -1) - - # Convert BGR to RGB for imageio - frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB) - video_frames.append(frame_rgb) - - video_array = np.array(video_frames) - result = { - "np_array": video_array, - } - saved_file_path = None - - fps = 30 - buffer = io.BytesIO() - writer_kwargs = { - "format": "mp4", - "fps": fps, - "codec": "libx264", - "quality": 7, - "pixelformat": "yuv420p", - "macro_block_size": 16, - "ffmpeg_params": [ - "-preset", - "medium", - "-crf", - "23", - "-movflags", - "+faststart", - "-pix_fmt", - "yuv420p", - "-vf", - f"scale={width}:{height}", - ], - } - - try: - with imageio.get_writer(buffer, **writer_kwargs) as writer: - for frame in video_frames: - writer.append_data(frame) - buffer.seek(0) - video_only_bytes = buffer.read() - except Exception as e: - print(f"Warning: Failed to encode synthetic video: {e}") - raise - - if embed_audio: - video_bytes = _mux_mp4_bytes_with_synthetic_audio(video_only_bytes, num_frames=num_frames, fps=float(fps)) - else: - video_bytes = video_only_bytes - - if save_to_file: - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - output_path = f"video_{width}x{height}_{timestamp}.mp4" - try: - with open(output_path, "wb") as f: - f.write(video_bytes) - saved_file_path = output_path - print(f"Video saved to: {saved_file_path}") - except Exception as e: - print(f"Warning: Failed to save video to file {output_path}: {e}") - - base64_video = 
base64.b64encode(video_bytes).decode("utf-8") - - result["base64"] = base64_video - if save_to_file and saved_file_path: - result["file_path"] = saved_file_path - - return result - - -def generate_synthetic_image(width: int, height: int, save_to_file: bool = False) -> dict[str, Any]: - """Generate synthetic image with randomly colored squares and return base64 string.""" - from PIL import Image, ImageDraw - - # Create white background - image = Image.new("RGB", (width, height), (255, 255, 255)) - draw = ImageDraw.Draw(image) - - # Generate random number of squares - num_squares = random.randint(3, 8) - - for _ in range(num_squares): - # Random square size - square_size = random.randint(min(width, height) // 8, min(width, height) // 4) - - # Random position - x = random.randint(0, width - square_size - 1) - y = random.randint(0, height - square_size - 1) - - # Random color - color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) - - # Random border width - border_width = random.randint(1, 5) - - # Draw square - draw.rectangle([x, y, x + square_size, y + square_size], fill=color, outline=(0, 0, 0), width=border_width) - - image_array = np.array(image) - result = {"np_array": image_array.copy()} - - # Handle file saving - image_bytes = None - saved_file_path = None - - if save_to_file: - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - output_path = f"image_{width}x{height}_{timestamp}.jpg" - - try: - # Save image to file - image.save(output_path, format="JPEG", quality=85, optimize=True) - saved_file_path = output_path - print(f"Image saved to: {saved_file_path}") - - # Read file for base64 encoding - with open(output_path, "rb") as f: - image_bytes = f.read() - - except Exception as e: - print(f"Warning: Failed to save image to file {output_path}: {e}") - save_to_file = False - - # If not saving or save failed, create in memory - if not save_to_file or image_bytes is None: - buffer = io.BytesIO() - image.save(buffer, 
format="JPEG", quality=85, optimize=True) - buffer.seek(0) - image_bytes = buffer.read() - - # Generate base64 - base64_image = base64.b64encode(image_bytes).decode("utf-8") - - # Return result - result["base64"] = base64_image - if save_to_file and saved_file_path: - result["file_path"] = saved_file_path - - return result - - -def preprocess_text(text): - import opencc - - word_to_num = { - "zero": "0", - "one": "1", - "two": "2", - "three": "3", - "four": "4", - "five": "5", - "six": "6", - "seven": "7", - "eight": "8", - "nine": "9", - "ten": "10", - } - - for word, num in word_to_num.items(): - pattern = r"\b" + re.escape(word) + r"\b" - text = re.sub(pattern, num, text, flags=re.IGNORECASE) - - text = re.sub(r"[^\w\s]", "", text) - text = re.sub(r"\s+", " ", text) - cc = opencc.OpenCC("t2s") - text = cc.convert(text) - - # Special handling for spaces between Chinese characters: - # - Keep single spaces between English words/numbers - # - Remove spaces only when surrounded by Chinese characters on both sides to prevent incorrect word segmentation - text = re.sub(r"(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])", "", text) - - return text.lower().strip() - - -def cosine_similarity_text(text1, text2, n: int = 3): - from collections import Counter - - if not text1 or not text2: - return 0.0 - - text1 = preprocess_text(text1) - text2 = preprocess_text(text2) - print(f"cosine similarity text1 is: {text1}, text2 is: {text2}") - - ngrams1 = [text1[i : i + n] for i in range(len(text1) - n + 1)] - ngrams2 = [text2[i : i + n] for i in range(len(text2) - n + 1)] - - counter1 = Counter(ngrams1) - counter2 = Counter(ngrams2) - - all_ngrams = set(counter1.keys()) | set(counter2.keys()) - vec1 = [counter1.get(ng, 0) for ng in all_ngrams] - vec2 = [counter2.get(ng, 0) for ng in all_ngrams] - - dot_product = sum(a * b for a, b in zip(vec1, vec2)) - norm1 = sum(a * a for a in vec1) ** 0.5 - norm2 = sum(b * b for b in vec2) ** 0.5 - - if norm1 == 0 or norm2 == 0: - return 0.0 - 
return dot_product / (norm1 * norm2) - - -def convert_audio_to_text(audio_data): - """ - Convert base64 encoded audio data to text using speech recognition. - """ - audio_data = base64.b64decode(audio_data) - output_path = f"./test_{uuid.uuid4().hex}.wav" - with open(output_path, "wb") as audio_file: - audio_file.write(audio_data) - - print(f"audio data is saved: {output_path}") - text = convert_audio_file_to_text(output_path=output_path) - return text - - -def _merge_base64_audio_to_segment(base64_list: list[str]): - """Merge a list of base64-encoded audio chunks into one pydub AudioSegment.""" - from pydub import AudioSegment - - merged = None - for b64 in base64_list: - raw = base64.b64decode(b64.split(",", 1)[-1]) - seg = AudioSegment.from_file(io.BytesIO(raw)) - merged = seg if merged is None else merged + seg - return merged - - -@contextlib.contextmanager -def _serialize_whisper_small_model_download(): - """Serialize Whisper ``small`` cache writes across processes (Linux; ``fcntl``).""" - import fcntl - - lock_path = Path.home() / ".cache" / "whisper" / ".small_model_download.lock" - lock_path.parent.mkdir(parents=True, exist_ok=True) - f = open(lock_path, "a+b") - try: - fcntl.flock(f.fileno(), fcntl.LOCK_EX) - yield - finally: - fcntl.flock(f.fileno(), fcntl.LOCK_UN) - f.close() - - -def _whisper_transcribe_in_current_process(output_path: str) -> str: - import whisper - - # Multi-GPU: use last visible device to avoid colliding with default device 0; single device uses 0. 
- device_index = None - if current_omni_platform.is_available(): - n = current_omni_platform.get_device_count() - if n == 1: - device_index = 0 - elif n > 1: - device_index = n - 1 - - if device_index is not None: - torch_device = current_omni_platform.get_torch_device(device_index) - current_omni_platform.set_device(torch_device) - device = str(torch_device) - use_accelerator = True - else: - use_accelerator = False - device = "cpu" - with _serialize_whisper_small_model_download(): - model = whisper.load_model("small", device=device) - try: - text = model.transcribe( - output_path, - temperature=0.0, - word_timestamps=True, - condition_on_previous_text=False, - )["text"] - finally: - del model - gc.collect() - if use_accelerator: - current_omni_platform.synchronize() - current_omni_platform.empty_cache() - - return text or "" - - -def convert_audio_file_to_text(output_path: str) -> str: - """Convert an audio file to text in an isolated subprocess (spawn).""" - ctx = multiprocessing.get_context("spawn") - with concurrent.futures.ProcessPoolExecutor(max_workers=1, mp_context=ctx) as executor: - future = executor.submit(_whisper_transcribe_in_current_process, output_path) - return future.result() - - -def convert_audio_bytes_to_text(raw_bytes: bytes) -> str: - """ - Write container audio bytes (WAV, etc.) to a temp WAV file suitable for Whisper/ffmpeg. - Normalizes with soundfile to PCM_16 WAV when possible to avoid codec issues. - """ - output_path = f"./test_{uuid.uuid4().hex}.wav" - data, samplerate = sf.read(io.BytesIO(raw_bytes)) - sf.write(output_path, data, samplerate, format="WAV", subtype="PCM_16") - text = convert_audio_file_to_text(output_path) - return text - - -def modify_stage_config( - yaml_path: str, - updates: dict[str, Any] = None, - deletes: dict[str, Any] = None, -) -> str: - """ - Modify configurations in a YAML file, supporting both top-level and stage-specific modifications, - including addition, modification, and deletion of configurations. 
- - Args: - yaml_path: Path to the YAML configuration file. - updates: Dictionary containing both top-level and stage-specific modifications to add or update. - Format: { - 'async_chunk': True, - 'stage_args': { - 0: {'engine_args.max_model_len': 5800}, - 1: {'engine_args.max_num_seqs': 2} - } - } - deletes: Dictionary containing configurations to delete. - Format: { - 'old_config': None, # Delete entire key - 'stage_args': { - 0: ['engine_args.old_param'], - 1: ['runtime.unused_setting'] - } - } - - Returns: - str: Path to the newly created modified YAML file with timestamp suffix. - """ - path = Path(yaml_path) - if not path.exists(): - raise FileNotFoundError(f"yaml does not exist: {path}") - - try: - with open(yaml_path, encoding="utf-8") as f: - config = yaml.safe_load(f) or {} - except Exception as e: - raise ValueError(f"Cannot parse YAML file: {e}") - - # Helper function to apply update - def apply_update(config_dict: dict, key_path: str, value: Any) -> None: - """Apply update to dictionary using dot-separated path.""" - # Handle direct list assignment (e.g., engine_input_source: [1, 2]) - if "." 
not in key_path: - # Simple key, set directly - config_dict[key_path] = value - return - - current = config_dict - keys = key_path.split(".") - - for i in range(len(keys) - 1): - key = keys[i] - - # Handle list indices - if key.isdigit() and isinstance(current, list): - index = int(key) - if index < 0: - raise ValueError(f"Negative list index not allowed: {index}") - if index >= len(current): - # Expand list if needed - while len(current) <= index: - # If we need to go deeper (more keys after this), create a dict - # Otherwise, create None placeholder - current.append({} if i < len(keys) - 2 else None) - current = current[index] - elif isinstance(current, dict): - # Handle dictionary keys - if key not in current: - # If there are more keys after this, create appropriate structure - if i < len(keys) - 1: - # Check if next key is a digit (list index) or string (dict key) - if keys[i + 1].isdigit(): - current[key] = [] - else: - current[key] = {} - else: - # This is the last key, create based on value type - current[key] = [] if isinstance(value, list) else {} - elif not isinstance(current[key], (dict, list)) and i < len(keys) - 1: - # If current value is not dict/list but we need to go deeper, replace it - if keys[i + 1].isdigit(): - current[key] = [] - else: - current[key] = {} - current = current[key] - else: - # Current is not a dict or list, cannot traverse further - raise TypeError( - f"Cannot access {'.'.join(keys[: i + 1])} as a dict/list. 
It's a {type(current).__name__}" - ) - - # Set the final value - last_key = keys[-1] - if isinstance(current, list) and last_key.isdigit(): - # Setting a value in a list by index - index = int(last_key) - if index < 0: - raise ValueError(f"Negative list index not allowed: {index}") - if index >= len(current): - # Expand list if needed - while len(current) <= index: - current.append(None) - current[index] = value - elif isinstance(current, dict): - # Special case: if the value is a list and we're setting a top-level key - # Example: updating engine_input_source with [1, 2] - current[last_key] = value - else: - # Current is not a dict, cannot set key - raise TypeError(f"Cannot set value at {key_path}. Current type is {type(current).__name__}, expected dict.") - - # Helper function to delete by path - def delete_by_path(config_dict: dict, path: str) -> None: - """Delete configuration by dot-separated path.""" - if not path: - return - - current = config_dict - keys = path.split(".") - - # Traverse to the parent - for i in range(len(keys) - 1): - key = keys[i] - - # Handle list indices - if key.isdigit() and isinstance(current, list): - index = int(key) - if index < 0 or index >= len(current): - raise KeyError(f"List index {index} out of bounds") - current = current[index] - elif isinstance(current, dict): - if key not in current: - raise KeyError(f"Path {'.'.join(keys[: i + 1])} does not exist") - current = current[key] - else: - raise TypeError( - f"Cannot access {'.'.join(keys[: i + 1])} as a dict/list. 
It's a {type(current).__name__}" - ) - - # Delete the item - last_key = keys[-1] - - if isinstance(current, list) and last_key.isdigit(): - index = int(last_key) - if index < 0 or index >= len(current): - raise KeyError(f"List index {index} out of bounds") - del current[index] - elif isinstance(current, dict) and last_key in current: - del current[last_key] - else: - print(f"Path {path} does not exist") - - # Apply deletions first - if deletes: - for key, value in deletes.items(): - if key == "stage_args": - if value and isinstance(value, dict): - stage_args = config.get("stage_args", []) - if not stage_args: - raise ValueError("stage_args does not exist in config") - - for stage_id, delete_paths in value.items(): - if not delete_paths: - continue - - # Find stage by ID - target_stage = None - for stage in stage_args: - if stage.get("stage_id") == int(stage_id): - target_stage = stage - break - - if target_stage is None: - continue - - # Delete specified paths in this stage - # Avoid shadowing the original YAML Path used for the output filename below. - for delete_path in delete_paths: - if delete_path: # Skip empty paths - delete_by_path(target_stage, delete_path) - elif "." 
in key: - # Delete using dot-separated path - delete_by_path(config, key) - elif value is None and key in config: - # Delete entire key - del config[key] - - # Apply updates - if updates: - for key, value in updates.items(): - if key == "stage_args": - if value and isinstance(value, dict): - stage_args = config.get("stage_args", []) - if not stage_args: - raise ValueError("stage_args does not exist in config") - - for stage_id, stage_updates in value.items(): - # Find stage by ID - target_stage = None - for stage in stage_args: - if stage.get("stage_id") == int(stage_id): - target_stage = stage - break - - if target_stage is None: - available_ids = [s.get("stage_id") for s in stage_args if "stage_id" in s] - raise KeyError(f"Stage ID {stage_id} not found, available: {available_ids}") - - # Apply updates to this stage - for update_path, val in stage_updates.items(): - # Check if this is a simple key (not dot-separated) - # Example: 'engine_input_source' vs 'engine_args.max_model_len' - if "." not in update_path: - # Direct key assignment (e.g., updating a list value) - target_stage[update_path] = val - else: - # Dot-separated path (e.g., nested dict access) - apply_update(target_stage, update_path, val) - elif "." in key: - # Apply using dot-separated path - apply_update(config, key, value) - else: - # Direct top-level key - config[key] = value - - # Unique suffix: multiple modify_stage_config calls in one process often run - # within the same second (e.g. test_qwen3_omni_expansion imports both - # get_chunk_config and get_batch_token_config). int(time.time()) would collide - # and the later write would overwrite the earlier YAML on disk. - # Keep generated configs outside the repo and delete them when pytest exits. 
- output_fd, output_path = tempfile.mkstemp(prefix=f"{path.stem}_", suffix=".yaml") - atexit.register(Path(output_path).unlink, missing_ok=True) - - with os.fdopen(output_fd, "w", encoding="utf-8") as f: - yaml.dump(config, f, default_flow_style=None, sort_keys=False, allow_unicode=True, indent=2) - - return str(output_path) - - -class OmniServer: - """Omniserver for vLLM-Omni tests.""" - - def __init__( - self, - model: str, - serve_args: list[str], - *, - port: int | None = None, - env_dict: dict[str, str] | None = None, - use_omni: bool = True, - ) -> None: - _run_pre_test_cleanup(enable_force=True) - _run_post_test_cleanup(enable_force=True) - cleanup_dist_env_and_memory() - self.model = model - self.serve_args = serve_args - self.env_dict = env_dict - self.use_omni = use_omni - self.proc: subprocess.Popen | None = None - self.host = "127.0.0.1" - if port is None: - self.port = get_open_port() - else: - self.port = port - - def _start_server(self) -> None: - """Start the vLLM-Omni server subprocess.""" - env = os.environ.copy() - env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - if self.env_dict is not None: - env.update(self.env_dict) - - cmd = [ - sys.executable, - "-m", - "vllm_omni.entrypoints.cli.main", - "serve", - self.model, - "--host", - self.host, - "--port", - str(self.port), - ] - if self.use_omni: - cmd.append("--omni") - cmd += self.serve_args - - print(f"Launching OmniServer with: {' '.join(cmd)}") - self.proc = subprocess.Popen( - cmd, - env=env, - cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))), # Set working directory to vllm-omni root - ) - - # Wait for server to be ready - max_wait = 1200 # 20 minutes - start_time = time.time() - while time.time() - start_time < max_wait: - # Check for process status - ret = self.proc.poll() - if ret is not None: - raise RuntimeError(f"Server processes exited with code {ret} before becoming ready.") - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.settimeout(1) - result 
= sock.connect_ex((self.host, self.port)) - if result == 0: - print(f"Server ready on {self.host}:{self.port}") - return - time.sleep(2) - - raise RuntimeError(f"Server failed to start within {max_wait} seconds") - - def _kill_process_tree(self, pid): - """kill process and its children with verification""" - try: - parent = psutil.Process(pid) - children = parent.children(recursive=True) - - # Get all PIDs first - all_pids = [pid] + [child.pid for child in children] - - # Terminate children - for child in children: - try: - child.terminate() - except psutil.NoSuchProcess: - pass - - # Wait for children - gone, still_alive = psutil.wait_procs(children, timeout=10) - - # Kill remaining children - for child in still_alive: - try: - child.kill() - except psutil.NoSuchProcess: - pass - - # Terminate parent - try: - parent.terminate() - parent.wait(timeout=10) - except (psutil.NoSuchProcess, psutil.TimeoutExpired): - try: - parent.kill() - except psutil.NoSuchProcess: - pass - - # VERIFICATION: Check if all processes are gone - time.sleep(1) # Give system time - alive_processes = [] - for check_pid in all_pids: - if psutil.pid_exists(check_pid): - alive_processes.append(check_pid) - - if alive_processes: - print(f"Warning: Processes still alive: {alive_processes}") - # Optional: Try system kill - import subprocess - - for alive_pid in alive_processes: - try: - subprocess.run(["kill", "-9", str(alive_pid)], timeout=2) - except Exception as e: - print(f"Cleanup failed: {e}") - - except psutil.NoSuchProcess: - pass - - def __enter__(self): - self._start_server() - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.proc: - self._kill_process_tree(self.proc.pid) - _run_pre_test_cleanup(enable_force=True) - _run_post_test_cleanup(enable_force=True) - cleanup_dist_env_and_memory() - - -class OmniServerStageCli(OmniServer): - """Omni server harness that exercises the stage CLI flow.""" - - def __init__( - self, - model: str, - stage_config_path: str, - 
serve_args: list[str] | None = None, - *, - stage_ids: list[int] | None = None, - port: int | None = None, - env_dict: dict[str, str] | None = None, - ) -> None: - super().__init__(model, serve_args or [], port=port, env_dict=env_dict, use_omni=True) - self.stage_config_path = stage_config_path - self.master_port = get_open_port() - self.visible_device_list = self._load_visible_device_list(env_dict) - self.stage_runtime_devices = self._load_stage_runtime_devices(stage_config_path) - self.stage_ids = stage_ids or self._load_stage_ids(stage_config_path) - if 0 not in self.stage_ids: - raise ValueError(f"Stage CLI test requires stage_id=0 in config: {stage_config_path}") - self.stage_procs: dict[int, subprocess.Popen] = {} - self.proc = None - - @staticmethod - def _load_stage_ids(stage_config_path: str) -> list[int]: - with open(stage_config_path, encoding="utf-8") as f: - cfg = yaml.safe_load(f) or {} - - stage_ids = [stage["stage_id"] for stage in cfg.get("stage_args", []) if "stage_id" in stage] - if not stage_ids: - raise ValueError(f"No stage IDs found in config: {stage_config_path}") - return stage_ids - - @staticmethod - def _load_stage_runtime_devices(stage_config_path: str) -> dict[int, str]: - with open(stage_config_path, encoding="utf-8") as f: - cfg = yaml.safe_load(f) or {} - - runtime_devices: dict[int, str] = {} - for stage in cfg.get("stage_args", []): - stage_id = stage.get("stage_id") - devices = stage.get("runtime", {}).get("devices") - if stage_id is not None and devices: - runtime_devices[int(stage_id)] = str(devices) - return runtime_devices - - @classmethod - def _parse_device_list(cls, devices: str | int) -> list[str]: - if isinstance(devices, int): - if devices < 0: - raise ValueError("Device IDs must be non-negative integers") - return [str(devices)] - return [token.strip() for token in str(devices).split(",") if token.strip()] - - @classmethod - def _load_visible_device_list(cls, env_dict: dict[str, str] | None) -> list[str] | None: - env = 
os.environ.copy() - if env_dict is not None: - env.update(env_dict) - - env_var = getattr(current_omni_platform, "device_control_env_var", None) - if env_var and env_var in env: - return [token.strip() for token in env[env_var].split(",") if token.strip()] - return None - - @classmethod - def _map_stage_devices(cls, stage_id: int, visible_device_list: list[str] | None, devices: str) -> str: - device_list = cls._parse_device_list(devices) - - if visible_device_list is None: - return ",".join(device_list) - - if not all(device.isdigit() for device in device_list): - raise ValueError("Logical devices must be non-negative integers") - - logical_ids = [int(device) for device in device_list] - if logical_ids and max(logical_ids) >= len(visible_device_list): - raise ValueError( - f"Stage {stage_id} has logical IDs {device_list}, one or more of which exceed the number of visible devices" - ) - - return ",".join(visible_device_list[idx] for idx in logical_ids) - - def _set_stage_device_env(self, stage_id: int, env: dict[str, str], devices: str) -> None: - mapped_devices = self._map_stage_devices(stage_id, self.visible_device_list, devices) - env_var = getattr(current_omni_platform, "device_control_env_var", None) - if env_var: - env[env_var] = mapped_devices - - def _build_stage_cmd(self, stage_id: int, *, headless: bool) -> list[str]: - cmd = [ - sys.executable, - "-m", - "vllm_omni.entrypoints.cli.main", - "serve", - self.model, - "--omni", - "--stage-configs-path", - self.stage_config_path, - "--stage-id", - str(stage_id), - "--omni-master-address", - self.host, - "--omni-master-port", - str(self.master_port), - ] - - if headless: - cmd.append("--headless") - else: - cmd += ["--host", self.host, "--port", str(self.port)] - - cmd += self.serve_args - return cmd - - def _launch_stage(self, stage_id: int, *, headless: bool) -> None: - env = os.environ.copy() - env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - if self.env_dict is not None: - env.update(self.env_dict) - - 
devices = self.stage_runtime_devices.get(stage_id) - if devices: - self._set_stage_device_env(stage_id, env, devices) - - cmd = self._build_stage_cmd(stage_id, headless=headless) - print(f"Launching OmniServerStageCli stage {stage_id}: {' '.join(cmd)}") - proc = subprocess.Popen( - cmd, - env=env, - cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))), - ) - self.stage_procs[stage_id] = proc - if stage_id == 0: - self.proc = proc - - def _ensure_stage_processes_alive(self) -> None: - for stage_id, proc in self.stage_procs.items(): - ret = proc.poll() - if ret is not None: - raise RuntimeError(f"Stage {stage_id} exited with code {ret} before API server became ready.") - - def _start_server(self) -> None: - ordered_stage_ids = [0, *[stage_id for stage_id in self.stage_ids if stage_id != 0]] - - self._launch_stage(0, headless=False) - time.sleep(2) - self._ensure_stage_processes_alive() - - for stage_id in ordered_stage_ids[1:]: - self._launch_stage(stage_id, headless=True) - - max_wait = 1200 - start_time = time.time() - while time.time() - start_time < max_wait: - self._ensure_stage_processes_alive() - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.settimeout(1) - result = sock.connect_ex((self.host, self.port)) - if result == 0: - print(f"OmniServerStageCli ready on {self.host}:{self.port}") - return - time.sleep(2) - - raise RuntimeError(f"OmniServerStageCli failed to start within {max_wait} seconds") - - def __exit__(self, exc_type, exc_val, exc_tb): - for stage_id in sorted(self.stage_procs, reverse=True): - proc = self.stage_procs[stage_id] - if proc.poll() is None: - self._kill_process_tree(proc.pid) - _run_pre_test_cleanup(enable_force=True) - _run_post_test_cleanup(enable_force=True) - cleanup_dist_env_and_memory() - - -def pytest_addoption(parser): - parser.addoption( - "--run-level", - action="store", - default="core_model", - choices=["core_model", "advanced_model"], - help="Test level to run: L2, L3", - ) - - 
-@pytest.fixture(scope="session") -def run_level(request) -> str: - """A command-line argument that specifies the level of tests to run in this session. - See https://docs.vllm.ai/projects/vllm-omni/en/latest/contributing/ci/CI_5levels/""" - return request.config.getoption("--run-level") - - -_omni_server_lock = threading.Lock() - - -@pytest.fixture(scope="module") -def omni_server(request: pytest.FixtureRequest, run_level: str, model_prefix: str) -> Generator[OmniServer, Any, None]: - """Start vLLM-Omni through the standard or stage-CLI launcher. - - The fixture stays module-scoped because multi-stage initialization is costly. - The ``use_stage_cli`` flag on ``OmniServerParams`` routes the setup through the - stage-CLI harness while still reusing the same fixture grouping semantics. - """ - with _omni_server_lock: - params: OmniServerParams = request.param - model = model_prefix + params.model - port = params.port - stage_config_path = params.stage_config_path - if run_level == "advanced_model" and stage_config_path is not None: - with open(stage_config_path, encoding="utf-8") as f: - cfg = yaml.safe_load(f) or {} - stage_ids = [stage["stage_id"] for stage in cfg.get("stage_args", []) if "stage_id" in stage] - stage_config_path = modify_stage_config( - stage_config_path, - deletes={"stage_args": {stage_id: ["engine_args.load_format"] for stage_id in stage_ids}}, - ) - - server_args = params.server_args or [] - if params.use_omni and params.stage_init_timeout is not None: - server_args = [*server_args, "--stage-init-timeout", str(params.stage_init_timeout)] - else: - server_args = [*server_args, "--stage-init-timeout", "600"] - if params.init_timeout is not None: - server_args = [*server_args, "--init-timeout", str(params.init_timeout)] - else: - server_args = [*server_args, "--init-timeout", "900"] - if params.use_stage_cli: - if not params.use_omni: - raise ValueError("omni_server with use_stage_cli=True requires use_omni=True") - if stage_config_path is None: - 
raise ValueError("omni_server with use_stage_cli=True requires a stage_config_path") - - with OmniServerStageCli( - model, - stage_config_path, - server_args, - port=port, - env_dict=params.env_dict, - ) as server: - print("OmniServer started successfully") - yield server - print("OmniServer stopping...") - else: - if stage_config_path is not None: - server_args += ["--stage-configs-path", stage_config_path] - - with ( - OmniServer( - model, - server_args, - port=port, - env_dict=params.env_dict, - use_omni=params.use_omni, - ) - if port - else OmniServer( - model, - server_args, - env_dict=params.env_dict, - use_omni=params.use_omni, - ) - ) as server: - print("OmniServer started successfully") - yield server - print("OmniServer stopping...") - - print("OmniServer stopped") - - -@dataclass -class OmniResponse: - text_content: str | None = None - audio_data: list[str] | None = None - audio_content: str | None = None - audio_format: str | None = None - audio_bytes: bytes | None = None - similarity: float | None = None - e2e_latency: float | None = None - success: bool = False - error_message: str | None = None - cached_tokens: int | None = None - - -@dataclass -class DiffusionResponse: - text_content: str | None = None - images: list[Image.Image] | None = None - audios: list[Any] | None = None - videos: list[Any] | None = None - e2e_latency: float | None = None - success: bool = False - error_message: str | None = None - - -def _load_gender_pipeline(): - """ - Lazy-load a cached audio-classification pipeline for gender. - - We prefer the pipeline wrapper because it encapsulates processor/model loading - and avoids direct AutoProcessor.from_pretrained call sites in this file. - """ - global _GENDER_PIPELINE - if _GENDER_PIPELINE is not None: - return _GENDER_PIPELINE - - model_name = "7wolf/wav2vec2-base-gender-classification" - try: - # device=-1 forces CPU for pipeline. 
- _GENDER_PIPELINE = pipeline( - task="audio-classification", - model=model_name, - device=-1, - ) - return _GENDER_PIPELINE - except Exception as exc: # pragma: no cover - best-effort fallback - print(f"Warning: failed to create gender pipeline '{model_name}': {exc}") - _GENDER_PIPELINE = None - return None - - -def _median_pitch_hz_from_autocorr(mono: np.ndarray, sr: int) -> float | None: - """ - Rough median F0 (Hz) over short-time frames. Used to debias wav2vec2 gender head on TTS, - which often labels lower-pitched synthetic speech as female under load or on clean signals. - Returns None if the clip is too short or mostly unvoiced. - """ - x = np.asarray(mono, dtype=np.float64) - x = x - np.mean(x) - if x.size < int(0.15 * sr): - return None - frame_len = int(0.04 * sr) - hop = max(frame_len // 2, 1) - f0_min_hz, f0_max_hz = 70.0, 400.0 - lag_min = max(1, int(sr / f0_max_hz)) - lag_max = min(frame_len - 2, int(sr / f0_min_hz)) - if lag_max <= lag_min: - return None - win = np.hamming(frame_len) - pitches: list[float] = [] - for start in range(0, int(x.shape[0]) - frame_len, hop): - frame = x[start : start + frame_len] * win - frame = frame - np.mean(frame) - if float(np.sqrt(np.mean(frame**2))) < 1e-4: - continue - ac = np.correlate(frame, frame, mode="full")[frame_len - 1 :] - ac = ac / (float(ac[0]) + 1e-12) - region = ac[lag_min : lag_max + 1] - peak_rel = int(np.argmax(region)) - peak_lag = peak_rel + lag_min - if peak_lag <= 0: - continue - f0 = float(sr) / float(peak_lag) - if f0_min_hz <= f0 <= f0_max_hz: - pitches.append(f0) - if len(pitches) < 4: - return None - return float(np.median(np.asarray(pitches, dtype=np.float64))) - - -def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str: - """ - Estimate voice gender from audio using a small pre-trained classification model. - - Uses a cached `audio-classification` pipeline to classify the clip. 
- Returns 'male' / 'female' when the model confidence is >= 0.9 and the label - maps to one of these; otherwise returns 'unknown'. If the model is unavailable - or inference fails, returns 'unknown' to keep tests stable. - - Under concurrent tests, a global lock serializes pipeline calls (the HF pipeline is not - thread-safe). A coarse F0 median can correct systematic "male -> female" errors on TTS audio. - """ - data, sr = sf.read(io.BytesIO(audio_bytes), dtype="float32", always_2d=True) - if data.size == 0: - raise ValueError("Empty audio") - mono = np.mean(data, axis=1) - - try: - target_sr = 16000 - if int(sr) != target_sr and mono.size > 1: - src_len = int(mono.shape[0]) - dst_len = max(1, int(round(src_len * float(target_sr) / float(sr)))) - src_idx = np.arange(src_len, dtype=np.float32) - dst_idx = np.linspace(0, src_len - 1, dst_len, dtype=np.float32) - mono = np.interp(dst_idx, src_idx, mono.astype(np.float32, copy=False)).astype(np.float32) - sr = target_sr - - median_f0 = _median_pitch_hz_from_autocorr(mono, sr) - - clf = _load_gender_pipeline() - if clf is None: - print("gender model not available, returning 'unknown'") - return "unknown" - - # transformers pipeline returns a list of {label, score} (highest score first). - with _GENDER_PIPELINE_LOCK: - outputs = clf(mono, sampling_rate=sr) - if not outputs: - return "unknown" - - top = outputs[0] - label = str(top.get("label", "")).lower() - conf = float(top.get("score", 0.0)) - - if conf < 0.5: - gender = "unknown" - # Some models use non-English labels (e.g., Russian). Normalize to 'male'/'female'. - elif ("female" in label) or ("жен" in label): - gender = "female" - elif ("male" in label) or ("муж" in label): - gender = "male" - else: - gender = "unknown" - - # Debias: wav2vec2 gender heads often call TTS / band-limited male speech "female". - # Low median F0 (~speech male range) + female label -> trust pitch when score is not overwhelming. 
- if gender == "female" and median_f0 is not None and median_f0 < 165.0 and conf < 0.88: - print(f"gender pitch assist: reclassifying female->male (median_f0={median_f0:.1f} Hz, conf={conf:.3f})") - gender = "male" - elif gender == "male" and median_f0 is not None and median_f0 > 230.0 and conf < 0.88: - print(f"gender pitch assist: reclassifying male->female (median_f0={median_f0:.1f} Hz, conf={conf:.3f})") - gender = "female" - - print( - f"gender classifier: label={label}, conf={conf:.3f}, gender={gender}" - + (f", median_f0={median_f0:.1f}Hz" if median_f0 is not None else "") - ) - return gender - except Exception as exc: # pragma: no cover - best-effort fallback - print(f"Warning: gender classification failed, returning 'unknown': {exc}") - return "unknown" - - -_PRESET_VOICE_GENDER_MAP: dict[str, str] = { - "serena": "female", - "uncle_fu": "male", - "chelsie": "female", - "clone": "female", - "ethan": "male", +""" +Root pytest entrypoint for the vLLM-Omni test suite. + +- `tests/conftest.py` stays thin: plugin registration + compatibility re-exports. +- Importable utilities live under `tests/helpers/`. +- Fixtures live under `tests/helpers/fixtures/` and are loaded via `pytest_plugins`. +""" + +from __future__ import annotations + +pytest_plugins = ( + "tests.helpers.fixtures.env", + "tests.helpers.fixtures.log", + "tests.helpers.fixtures.run_args", + "tests.helpers.fixtures.runtime", +) + + +def pytest_terminal_summary(terminalreporter, exitstatus, config): + # Marker for Buildkite log folding before pytest summary lines. + terminalreporter.write_line("--- Running Summary") + + +# Backward-compatible lazy re-exports. +# (Many tests still import from `tests.conftest`; migrate these imports to `tests.helpers.*` over time.) +# Keep these lazy so conftest import does not trigger heavy helper dependencies. 
+_ASSERTION_EXPORT_NAMES = ( + "assert_audio_speech_response", + "assert_diffusion_response", + "assert_image_diffusion_response", + "assert_image_valid", + "assert_omni_response", + "assert_video_diffusion_response", + "assert_video_valid", +) +_MEDIA_EXPORT_NAMES = ( + "convert_audio_bytes_to_text", + "convert_audio_file_to_text", + "cosine_similarity_text", + "decode_b64_image", + "generate_synthetic_audio", + "generate_synthetic_image", + "generate_synthetic_video", +) +_STAGE_CONFIG_EXPORT_NAMES = ("modify_stage_config",) +_RUNTIME_EXPORT_NAMES = ( + "DiffusionResponse", + "OmniResponse", + "OmniRunner", + "OmniRunnerHandler", + "OmniServer", + "OmniServerParams", + "OmniServerStageCli", + "OpenAIClientHandler", + "dummy_messages_from_mix_data", +) +_LAZY_EXPORT_MODULES = { + **{name: "tests.helpers.assertions" for name in _ASSERTION_EXPORT_NAMES}, + **{name: "tests.helpers.media" for name in _MEDIA_EXPORT_NAMES}, + **{name: "tests.helpers.stage_config" for name in _STAGE_CONFIG_EXPORT_NAMES}, + **{name: "tests.helpers.runtime" for name in _RUNTIME_EXPORT_NAMES}, } - - -def _assert_preset_voice_gender_from_audio( - audio_bytes: bytes | None, - voice_name: str | None, -) -> None: - """If ``voice_name`` matches a known preset, assert classifier gender matches (skip when unknown).""" - if not voice_name or not audio_bytes: - return - key = str(voice_name).lower() - expected_gender = _PRESET_VOICE_GENDER_MAP.get(key) - if expected_gender is None: - return - estimated_gender = _estimate_voice_gender_from_audio(audio_bytes) - print(f"Preset voice gender check: preset={key!r}, estimated={estimated_gender!r}, expected={expected_gender!r}") - if estimated_gender != "unknown": - assert estimated_gender == expected_gender, ( - f"{voice_name!r} is expected {expected_gender}, but estimated gender is {estimated_gender!r}" - ) - - -# Threshold aligned with _compute_pcm_hnr_db docstring (clean clone vs distorted). 
-_MIN_PCM_SPEECH_HNR_DB = 1.0 - - -def _compute_pcm_hnr_db(pcm_samples: np.ndarray, sr: int = _PCM_SPEECH_SAMPLE_RATE_HZ) -> float: - """Compute mean Harmonic-to-Noise Ratio (dB) for speech quality. - - Clean cloned speech has HNR > 1.2 dB; distorted speech (e.g. lost - ref_code decoder context) drops below 1.0 dB. - """ - frame_len = int(0.03 * sr) # 30ms frames - hop = frame_len // 2 - hnr_values: list[float] = [] - - for start in range(0, len(pcm_samples) - frame_len, hop): - frame = pcm_samples[start : start + frame_len].astype(np.float32, copy=False) - frame = frame - np.mean(frame) - if np.max(np.abs(frame)) < 0.01: - continue - ac = np.correlate(frame, frame, mode="full")[len(frame) - 1 :] - ac = ac / (ac[0] + 1e-10) - min_lag = int(sr / 400) - max_lag = min(int(sr / 80), len(ac)) - if min_lag >= max_lag: - continue - peak = float(np.max(ac[min_lag:max_lag])) - if 0 < peak < 1: - hnr_values.append(10 * np.log10(peak / (1 - peak + 1e-10))) - - return float(np.mean(hnr_values)) if hnr_values else 0.0 - - -def _assert_pcm_int16_speech_hnr(audio_bytes: bytes) -> None: - """Validate harmonic-to-noise ratio on raw int16 PCM from /v1/audio/speech.""" - assert audio_bytes is not None and len(audio_bytes) >= 2, "missing PCM bytes" - assert len(audio_bytes) % 2 == 0, "PCM byte length must be aligned to int16" - pcm_samples = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0 - hnr = _compute_pcm_hnr_db(pcm_samples) - print(f"PCM speech HNR: {hnr:.2f} dB (threshold: {_MIN_PCM_SPEECH_HNR_DB} dB)") - assert hnr >= _MIN_PCM_SPEECH_HNR_DB, ( - f"Audio distortion detected: HNR={hnr:.2f} dB < {_MIN_PCM_SPEECH_HNR_DB} dB. " - "Voice clone decoder may be losing ref_code speaker context on later chunks." - ) - - -def assert_omni_response(response: OmniResponse, request_config: dict[str, Any], run_level): - """ - Validate response results. 
- - Args: - response: OmniResponse object - - Raises: - AssertionError: When the response does not meet validation criteria - """ - assert response.success, "The request failed." - e2e_latency = response.e2e_latency - if e2e_latency is not None: - print(f"the e2e latency is: {e2e_latency}") - - modalities = request_config.get("modalities", ["text", "audio"]) - - if run_level == "advanced_model": - if "audio" in modalities: - assert response.audio_content is not None, "No audio output is generated" - print(f"audio content is: {response.audio_content}") - speaker = request_config.get("speaker") - if speaker: - _assert_preset_voice_gender_from_audio( - response.audio_bytes, - speaker, - ) - - if "text" in modalities: - assert response.text_content is not None, "No text output is generated" - print(f"text content is: {response.text_content}") - - # Verify image description - word_types = ["text", "image", "audio", "video"] - keywords_dict = request_config.get("key_words", {}) - for word_type in word_types: - keywords = keywords_dict.get(word_type) - if "text" in modalities: - if keywords: - text_lower = response.text_content.lower() - assert any(str(kw).lower() in text_lower for kw in keywords), ( - "The output does not contain any of the keywords." - ) - else: - if keywords: - audio_lower = response.audio_content.lower() - assert any(str(kw).lower() in audio_lower for kw in keywords), ( - "The output does not contain any of the keywords." 
- ) - - # Verify similarity (Whisper transcript vs streamed/detokenized text) - if "text" in modalities and "audio" in modalities: - assert response.similarity is not None and response.similarity > 0.9, ( - "The audio content is not same as the text" - ) - print(f"similarity is: {response.similarity}") - - -def assert_audio_speech_response( - response: OmniResponse, - request_config: dict[str, Any], - run_level: str, -) -> None: - """ - Validate /v1/audio/speech response: success, optional format check, transcription similarity - and gender (non-PCM only for advanced_model), and int16 PCM HNR when response_format is pcm. - """ - assert response.success, "The request failed." - - req_fmt = request_config.get("response_format") - - if req_fmt == "pcm" and response.audio_bytes: - _assert_pcm_int16_speech_hnr(response.audio_bytes) - if response.audio_format: - assert "pcm" in response.audio_format.lower(), ( - f"Expected audio/pcm content-type, got {response.audio_format!r}" - ) - - elif req_fmt == "wav" and response.audio_format: - assert req_fmt in response.audio_format, ( - f"The response audio format {response.audio_format} don't match the request audio format {req_fmt}" - ) - - e2e_latency = response.e2e_latency - if e2e_latency is not None: - print(f"the avg e2e latency is: {e2e_latency}") - - if run_level == "advanced_model" and req_fmt != "pcm": - # Text–audio semantic similarity check (skipped for raw PCM: no Whisper transcript). 
- expected_text = request_config.get("input") - if expected_text: - transcript = (response.audio_content or "").strip() - print(f"audio content is: {transcript}") - print(f"input text is: {expected_text}") - similarity = cosine_similarity_text(transcript.lower(), expected_text.lower()) - print(f"Cosine similarity: {similarity:.3f}") - assert similarity > 0.9, ( - f"Transcript doesn't match input: similarity={similarity:.2f}, transcript='{transcript}'" - ) - - # Voice gender consistency check (preset names in ``_PRESET_VOICE_GENDER_MAP``). - # When the estimator returns 'unknown', we treat it as inconclusive and do NOT fail the test. - _assert_preset_voice_gender_from_audio( - response.audio_bytes, - request_config.get("voice"), - ) - - -def assert_diffusion_response(response: DiffusionResponse, request_config: dict[str, Any], run_level: str = None): - """ - Validate diffusion response results. - - Dispatcher that routes validation to modality-specific assert functions. - - Args: - response: DiffusionResponse object. - request_config: Request configuration dictionary. - run_level: Test run level (e.g. "core_model", "advanced_model") - - Raises: - AssertionError: When the response does not meet validation criteria - KeyError: When the request_config does not contain necessary parameters for validation - """ - assert response.success, "The request failed." 
- - e2e_latency = response.e2e_latency - if e2e_latency is not None: - print(f"the avg e2e is: {e2e_latency}") - - has_any_content = any(content is not None for content in (response.images, response.videos, response.audios)) - assert has_any_content, "Response contains no images, videos, or audios" - - if response.images is not None: - assert_image_diffusion_response( - response=response, - request_config=request_config, - run_level=run_level, - ) - - if response.videos is not None: - assert_video_diffusion_response( - response=response, - request_config=request_config, - run_level=run_level, - ) - - if response.audios is not None: - assert_audio_diffusion_response( - response=response, - request_config=request_config, - run_level=run_level, - ) - - -class OpenAIClientHandler: - """ - OpenAI client handler class, encapsulating both streaming and non-streaming response processing logic. - - This class integrates OpenAI API request sending, response handling, and validation functionality, - supporting both single request and concurrent request modes. - """ - - def __init__( - self, host: str = "127.0.0.1", port: int = get_open_port(), api_key: str = "EMPTY", run_level: str = None - ): - """ - Initialize the OpenAI client. - - Args: - host: vLLM-Omni server host address - port: vLLM-Omni server port - api_key: API key (defaults to "EMPTY") - """ - self.base_url = f"http://{host}:{port}" - self.client = OpenAI(base_url=f"http://{host}:{port}/v1", api_key=api_key) - self.run_level = run_level - - def _process_stream_omni_response(self, chat_completion) -> OmniResponse: - """ - Process streaming responses. 
- - Args: - chat_completion: OpenAI streaming response object - request_config: Request configuration dictionary - - Returns: - OmniResponse: Processed response object - """ - result = OmniResponse() - start_time = time.perf_counter() - - try: - text_content = "" - audio_data = [] - - for chunk in chat_completion: - for choice in chunk.choices: - # Get content data - if hasattr(choice, "delta"): - content = getattr(choice.delta, "content", None) - else: - content = None - - # Get modality type - modality = getattr(chunk, "modality", None) - - # Process content based on modality type - if modality == "audio" and content: - audio_data.append(content) - elif modality == "text" and content: - text_content += content if content else "" - - # Calculate end-to-end latency - result.e2e_latency = time.perf_counter() - start_time - - # Process audio and text content - audio_content = None - similarity = None - - if audio_data or text_content: - if audio_data: - merged_seg = _merge_base64_audio_to_segment(audio_data) - wav_buf = BytesIO() - merged_seg.export(wav_buf, format="wav") - result.audio_bytes = wav_buf.getvalue() - audio_content = convert_audio_bytes_to_text(result.audio_bytes) - if audio_content and text_content: - similarity = cosine_similarity_text(audio_content.lower(), text_content.lower()) - - # Populate result object - result.text_content = text_content - result.audio_data = audio_data - result.audio_content = audio_content - result.similarity = similarity - result.success = True - - except Exception as e: - result.error_message = f"Stream processing error: {str(e)}" - print(f"Error: {result.error_message}") - - return result - - def _process_non_stream_omni_response(self, chat_completion) -> OmniResponse: - """ - Process non-streaming responses. 
- - Args: - chat_completion: OpenAI non-streaming response object - request_config: Request configuration dictionary - - Returns: - OmniResponse: Processed response object - """ - result = OmniResponse() - start_time = time.perf_counter() - - try: - audio_data = None - text_content = None - - # Iterate through all choices - for choice in chat_completion.choices: - # Process audio data - if hasattr(choice.message, "audio") and choice.message.audio is not None: - audio_message = choice.message - audio_data = audio_message.audio.data - - # Process text content - if hasattr(choice.message, "content") and choice.message.content is not None: - text_content = choice.message.content - - # Extract cached_tokens for prefix caching tests - usage = getattr(chat_completion, "usage", None) - if usage and (details := getattr(usage, "prompt_tokens_details", None)): - result.cached_tokens = details.cached_tokens - - # Calculate end-to-end latency - result.e2e_latency = time.perf_counter() - start_time - - # Process audio and text content - audio_content = None - similarity = None - - if audio_data or text_content: - if audio_data: - result.audio_bytes = base64.b64decode(audio_data) - audio_content = convert_audio_bytes_to_text(result.audio_bytes) - if audio_content and text_content: - similarity = cosine_similarity_text(audio_content.lower(), text_content.lower()) - - # Populate result object - result.text_content = text_content - result.audio_content = audio_content - result.similarity = similarity - result.success = True - - except Exception as e: - result.error_message = f"Non-stream processing error: {str(e)}" - print(f"Error: {result.error_message}") - - return result - - def _process_diffusion_response(self, chat_completion) -> DiffusionResponse: - """ - Process diffusion responses (image generation/editing). 
- - Args: - chat_completion: OpenAI response object - - Returns: - DiffusionResponse: Processed response object - """ - result = DiffusionResponse() - start_time = time.perf_counter() - - try: - images = [] - # [TODO] reading video and audio output from API response for later validation - - for choice in chat_completion.choices: - if hasattr(choice.message, "content") and choice.message.content is not None: - content = choice.message.content - if isinstance(content, list): - for item in content: - if isinstance(item, dict): - image_url = item.get("image_url", {}).get("url") - else: - image_url_obj = getattr(item, "image_url", None) - image_url = getattr(image_url_obj, "url", None) if image_url_obj else None - if image_url and image_url.startswith("data:image"): - b64_data = image_url.split(",", 1)[1] - img = decode_b64_image(b64_data) - images.append(img) - - result.e2e_latency = time.perf_counter() - start_time - result.images = images if images else None - result.success = True - - except Exception as e: - result.error_message = f"Diffusion response processing error: {str(e)}" - print(f"Error: {result.error_message}") - - return result - - def _process_stream_audio_speech_response(self, response, *, response_format: str | None = None) -> OmniResponse: - """ - Process streaming /v1/audio/speech responses into an OmniResponse. - - This mirrors _process_stream_omni_response but operates on low-level - audio bytes and produces an OmniResponse with audio_content filled - from Whisper transcription. - """ - result = OmniResponse() - start_time = time.perf_counter() - - try: - # Aggregate all audio bytes from the streaming response. - data = bytearray() - - # Preferred OpenAI helper. - if hasattr(response, "iter_bytes") and callable(getattr(response, "iter_bytes")): - for chunk in response.iter_bytes(): - if chunk: - data.extend(chunk) - else: - # Generic iterable-of-bytes fallback (e.g., generator or list of chunks). 
- try: - iterator = iter(response) - except TypeError: - iterator = None - - if iterator is not None: - for chunk in iterator: - if not chunk: - continue - if isinstance(chunk, (bytes, bytearray)): - data.extend(chunk) - elif hasattr(chunk, "data"): - data.extend(chunk.data) # type: ignore[arg-type] - elif hasattr(chunk, "content"): - data.extend(chunk.content) # type: ignore[arg-type] - else: - raise TypeError(f"Unsupported stream chunk type: {type(chunk)}") - else: - raise TypeError(f"Unsupported audio speech streaming response type: {type(response)}") - - raw_bytes = bytes(data) - if response_format == "pcm": - transcript = None - else: - transcript = convert_audio_bytes_to_text(raw_bytes) - - # Populate OmniResponse. - result.audio_bytes = raw_bytes - result.audio_content = transcript - result.e2e_latency = time.perf_counter() - start_time - result.success = True - result.audio_format = getattr(response, "response", None) - if result.audio_format is not None: - result.audio_format = result.audio_format.headers.get("content-type", "") - - except Exception as e: - result.error_message = f"Audio speech stream processing error: {str(e)}" - print(f"Error: {result.error_message}") - - return result - - def _process_non_stream_audio_speech_response( - self, response, *, response_format: str | None = None - ) -> OmniResponse: - """ - Process non-streaming /v1/audio/speech responses into an OmniResponse. - - This mirrors _process_non_stream_omni_response but for the binary - audio payload returned by audio.speech.create. 
- """ - result = OmniResponse() - start_time = time.perf_counter() - - try: - # OpenAI non-streaming audio.speech.create returns HttpxBinaryResponseContent (.read() or .content) - if hasattr(response, "read") and callable(getattr(response, "read")): - raw_bytes = response.read() - elif hasattr(response, "content"): - raw_bytes = response.content # type: ignore[assignment] - else: - raise TypeError(f"Unsupported audio speech response type: {type(response)}") - - if response_format == "pcm": - transcript = None - else: - transcript = convert_audio_bytes_to_text(raw_bytes) - - result.audio_bytes = raw_bytes - result.audio_content = transcript - result.e2e_latency = time.perf_counter() - start_time - result.success = True - result.audio_format = getattr(response, "response", None) - if result.audio_format is not None: - result.audio_format = result.audio_format.headers.get("content-type", "") - - except Exception as e: - result.error_message = f"Audio speech non-stream processing error: {str(e)}" - print(f"Error: {result.error_message}") - - return result - - def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[OmniResponse]: - """ - Send OpenAI requests. - - Args: - request_config: Request configuration dictionary containing parameters like model, messages, stream. - Optional ``use_audio_in_video`` (bool): when true, sets - ``extra_body["mm_processor_kwargs"] = {"use_audio_in_video": True}`` for Qwen-Omni video+audio - extraction. - Optional top-level ``speaker`` (str): Qwen3-Omni preset TTS speaker name; sent as - ``extra_body["speaker"]`` to ``chat.completions.create``. 
- request_num: Number of requests, defaults to 1 (single request) - - Returns: - List[OmniResponse]: List of response objects - """ - - responses = [] - stream = request_config.get("stream", False) - modalities = request_config.get("modalities", ["text", "audio"]) - - extra_body: dict[str, Any] = {} - if "speaker" in request_config: - extra_body["speaker"] = request_config["speaker"] - if request_config.get("use_audio_in_video"): - mm = dict(extra_body.get("mm_processor_kwargs") or {}) - mm["use_audio_in_video"] = True - extra_body["mm_processor_kwargs"] = mm - extra_body_arg: dict[str, Any] | None = extra_body if extra_body else None - - create_kwargs: dict[str, Any] = { - "model": request_config.get("model"), - "messages": request_config.get("messages"), - "stream": stream, - "modalities": modalities, - } - if extra_body_arg is not None: - create_kwargs["extra_body"] = extra_body_arg - - if request_num == 1: - # Send single request - chat_completion = self.client.chat.completions.create(**create_kwargs) - - if stream: - response = self._process_stream_omni_response(chat_completion) - else: - response = self._process_non_stream_omni_response(chat_completion) - - assert_omni_response(response, request_config, run_level=self.run_level) - responses.append(response) - - else: - # Send concurrent requests: run create + process in worker so e2e_latency includes full round-trip. 
- def _one_omni_request(): - start = time.perf_counter() - worker_kwargs: dict[str, Any] = { - "model": request_config.get("model"), - "messages": request_config.get("messages"), - "modalities": modalities, - "stream": stream, - } - if extra_body_arg is not None: - worker_kwargs["extra_body"] = extra_body_arg - chat_completion = self.client.chat.completions.create(**worker_kwargs) - if stream: - response = self._process_stream_omni_response(chat_completion) - else: - response = self._process_non_stream_omni_response(chat_completion) - response.e2e_latency = time.perf_counter() - start - return response - - with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor: - futures = [executor.submit(_one_omni_request) for _ in range(request_num)] - for future in concurrent.futures.as_completed(futures): - response = future.result() - assert_omni_response(response, request_config, run_level=self.run_level) - responses.append(response) - - return responses - - def send_audio_speech_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[OmniResponse]: - """ - Call the /v1/audio/speech endpoint using the same configuration-dict - style as send_omni_request, but via the OpenAI Python client's - audio.speech APIs. - - Expected keys in request_config: - - model: model name/path (required) - - input: text to synthesize (required) - - response_format: audio format such as "wav" or "pcm" (optional) - - task_type, ref_text, ref_audio: TTS-specific extras (optional, passed via extra_body) - - timeout: request timeout in seconds (float, optional, default 120.0) - - stream: whether to use streaming API (bool, optional, default False) - """ - timeout = float(request_config.get("timeout", 120.0)) - - model = request_config["model"] - text_input = request_config["input"] - stream = bool(request_config.get("stream", False)) - voice = request_config.get("voice", None) - - # Standard OpenAI param: use omit when not provided to keep default behavior. 
- response_format = request_config.get("response_format", omit) - - # Qwen3-TTS custom fields, forwarded via extra_body. - extra_body: dict[str, Any] = {} - # Keep this list aligned with vllm_omni.entrypoints.openai.protocol.audio params. - for key in ("task_type", "ref_text", "ref_audio", "language", "max_new_tokens"): - if key in request_config: - extra_body[key] = request_config[key] - - responses: list[OmniResponse] = [] - - speech_fmt: str | None = None if response_format is omit else str(response_format).lower() - - if request_num == 1: - if stream: - # Use streaming response helper. - with self.client.audio.speech.with_streaming_response.create( - model=model, - input=text_input, - response_format=response_format, - extra_body=extra_body or None, - timeout=timeout, - voice=voice, - ) as resp: - omni_resp = self._process_stream_audio_speech_response(resp, response_format=speech_fmt) - else: - # Non-streaming response. - resp = self.client.audio.speech.create( - model=model, - input=text_input, - response_format=response_format, - extra_body=extra_body or None, - timeout=timeout, - voice=voice, - ) - omni_resp = self._process_non_stream_audio_speech_response(resp, response_format=speech_fmt) - - assert_audio_speech_response(omni_resp, request_config, run_level=self.run_level) - responses.append(omni_resp) - return responses - else: - # request_num > 1: concurrent requests (use same params as single-request path) - - if stream: - - def _stream_task(): - with self.client.audio.speech.with_streaming_response.create( - model=model, - input=text_input, - response_format=response_format, - extra_body=extra_body or None, - timeout=timeout, - voice=voice, - ) as resp: - return self._process_stream_audio_speech_response(resp, response_format=speech_fmt) - - with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor: - futures = [executor.submit(_stream_task) for _ in range(request_num)] - for future in concurrent.futures.as_completed(futures): - 
omni_resp = future.result() - assert_audio_speech_response(omni_resp, request_config, run_level=self.run_level) - responses.append(omni_resp) - else: - with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor: - futures = [] - for _ in range(request_num): - future = executor.submit( - self.client.audio.speech.create, - model=model, - input=text_input, - response_format=response_format, - extra_body=extra_body or None, - timeout=timeout, - voice=voice, - ) - futures.append(future) - - for future in concurrent.futures.as_completed(futures): - resp = future.result() - omni_resp = self._process_non_stream_audio_speech_response(resp, response_format=speech_fmt) - assert_audio_speech_response(omni_resp, request_config, run_level=self.run_level) - responses.append(omni_resp) - - return responses - - def send_diffusion_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[DiffusionResponse]: - """ - Send OpenAI requests for diffusion models. - - Args: - request_config: Request configuration dictionary containing parameters like model, messages - request_num: Number of requests to send concurrently, defaults to 1 (single request) - Returns: - List[DiffusionResponse]: List of response objects - """ - responses: list[DiffusionResponse] = [] - stream = request_config.get("stream", False) - modalities = request_config.get("modalities", omit) # Most diffusion models don't require modalities param - extra_body = request_config.get("extra_body", None) - - if stream: - raise NotImplementedError("Streaming is not currently implemented for diffusion model e2e test") - - if request_num == 1: - # Send single request - chat_completion = self.client.chat.completions.create( - model=request_config.get("model"), - messages=request_config.get("messages"), - extra_body=extra_body, - modalities=modalities, - ) - - response = self._process_diffusion_response(chat_completion) - assert_diffusion_response(response, request_config, 
run_level=self.run_level) - responses.append(response) - - else: - # Send concurrent requests - with concurrent.futures.ThreadPoolExecutor(max_workers=request_num) as executor: - futures = [] - - # Submit all request tasks - for _ in range(request_num): - future = executor.submit( - self.client.chat.completions.create, - model=request_config.get("model"), - messages=request_config.get("messages"), - modalities=modalities, - extra_body=extra_body, - ) - futures.append(future) - - # Process completed tasks - for future in concurrent.futures.as_completed(futures): - chat_completion = future.result() - response = self._process_diffusion_response(chat_completion) - assert_diffusion_response(response, request_config, run_level=self.run_level) - responses.append(response) - - return responses - - def send_video_diffusion_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[OmniResponse]: - """ - Send native /v1/videos requests. - """ - if request_num != 1: - raise NotImplementedError("Concurrent video diffusion requests are not currently implemented") - - if request_config.get("stream", False): - raise NotImplementedError("Streaming is not currently implemented for video diffusion e2e test") - - form_data = request_config.get("form_data") - if not isinstance(form_data, dict): - raise ValueError("Video request_config must contain 'form_data'") - - if not form_data.get("prompt"): - raise ValueError("Video request_config['form_data'] must contain 'prompt'") - - normalized_form_data = {key: str(value) for key, value in form_data.items() if value is not None} - - files: dict[str, tuple[str, BytesIO, str]] = {} - image_reference = request_config.get("image_reference") - if image_reference: - if image_reference.startswith("data:image"): - header, encoded = image_reference.split(",", 1) - content_type = header.split(";")[0].removeprefix("data:") - extension = content_type.split("/")[-1] - file_data = base64.b64decode(encoded) - - files["input_reference"] = 
( - f"reference.{extension}", - BytesIO(file_data), - content_type, - ) - else: - normalized_form_data["image_reference"] = json.dumps({"image_url": image_reference}) - - result = DiffusionResponse() - start_time = time.perf_counter() - - try: - create_url = self._build_url("/v1/videos") - response = requests.post( - create_url, - data=normalized_form_data, - files=files, - headers={"Accept": "application/json"}, - timeout=60, - ) - response.raise_for_status() - - job_data = response.json() - video_id = job_data["id"] - - self._wait_until_video_completed(video_id) - - video_content = self._download_video_content(video_id) - - result.success = True - result.videos = [video_content] - result.e2e_latency = time.perf_counter() - start_time - - assert_diffusion_response(result, request_config, run_level=self.run_level) - - except Exception as e: - result.success = False - result.error_message = f"Diffusion response processing error: {e}" - assert False, result.error_message - - return [result] - - def _wait_until_video_completed( - self, - video_id: str, - poll_interval_seconds: int = 2, - timeout_seconds: int = 300, - ) -> None: - status_url = self._build_url(f"/v1/videos/{video_id}") - deadline = time.monotonic() + timeout_seconds - - while time.monotonic() < deadline: - status_resp = requests.get( - status_url, - headers={"Accept": "application/json"}, - timeout=30, - ) - status_resp.raise_for_status() - - status_data = status_resp.json() - current_status = status_data["status"] - - if current_status == "completed": - return - - if current_status == "failed": - error_msg = status_data.get("last_error", "Unknown error") - raise RuntimeError(f"Job failed: {error_msg}") - - time.sleep(poll_interval_seconds) - - raise TimeoutError(f"Video job {video_id} did not complete within {timeout_seconds}s") - - def _download_video_content(self, video_id: str) -> bytes: - download_url = self._build_url(f"/v1/videos/{video_id}/content") - video_resp = requests.get(download_url, 
stream=True, timeout=60) - video_resp.raise_for_status() - - video_bytes = BytesIO() - for chunk in video_resp.iter_content(chunk_size=8192): - if chunk: - video_bytes.write(chunk) - - return video_bytes.getvalue() - - def _build_url(self, path: str) -> str: - return f"{self.base_url.rstrip('/')}/{path.lstrip('/')}" - - -@pytest.fixture -def openai_client(request: pytest.FixtureRequest, run_level: str): - """Create OpenAIClientHandler fixture to facilitate communication with OmniServer - with encapsulated request sending, concurrent requests, response handling, and validation.""" - server = request.getfixturevalue("omni_server") - return OpenAIClientHandler(host=server.host, port=server.port, api_key="EMPTY", run_level=run_level) - - -class OmniRunner: - """ - Offline test runner for Omni models. - """ - - def __init__( - self, - model_name: str, - seed: int = 42, - stage_init_timeout: int = 600, - batch_timeout: int = 10, - init_timeout: int = 900, - shm_threshold_bytes: int = 65536, - log_stats: bool = False, - stage_configs_path: str | None = None, - **kwargs, - ) -> None: - """ - Initialize an OmniRunner for testing. 
- - Args: - model_name: The model name or path - seed: Random seed for reproducibility - stage_init_timeout: Timeout for initializing a single stage in seconds - batch_timeout: Timeout for batching in seconds - init_timeout: Timeout for initializing stages in seconds - shm_threshold_bytes: Threshold for using shared memory - log_stats: Enable detailed statistics logging - stage_configs_path: Optional path to YAML stage config file - **kwargs: Additional arguments passed to Omni - """ - cleanup_dist_env_and_memory() - _run_pre_test_cleanup(enable_force=True) - _run_post_test_cleanup(enable_force=True) - self.model_name = model_name - self.seed = seed - - self.omni = Omni( - model=model_name, - log_stats=log_stats, - stage_init_timeout=stage_init_timeout, - batch_timeout=batch_timeout, - init_timeout=init_timeout, - shm_threshold_bytes=shm_threshold_bytes, - stage_configs_path=stage_configs_path, - **kwargs, - ) - - def _estimate_prompt_len( - self, - additional_information: dict[str, Any], - model_name: str, - _cache: dict[str, Any] = {}, - ) -> int: - """Estimate prompt_token_ids placeholder length for the Talker stage. - - The AR Talker replaces all input embeddings via ``preprocess``, so the - placeholder values are irrelevant but the **length** must match the - embeddings that ``preprocess`` will produce. 
- """ - try: - from vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts import Qwen3TTSConfig - from vllm_omni.model_executor.models.qwen3_tts.qwen3_tts_talker import ( - Qwen3TTSTalkerForConditionalGeneration, - ) - - if model_name not in _cache: - from transformers import AutoTokenizer - - tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side="left") - cfg = Qwen3TTSConfig.from_pretrained(model_name, trust_remote_code=True) - _cache[model_name] = (tok, getattr(cfg, "talker_config", None)) - - tok, tcfg = _cache[model_name] - task_type = (additional_information.get("task_type") or ["CustomVoice"])[0] - return Qwen3TTSTalkerForConditionalGeneration.estimate_prompt_len_from_additional_information( - additional_information=additional_information, - task_type=task_type, - tokenize_prompt=lambda t: tok(t, padding=False)["input_ids"], - codec_language_id=getattr(tcfg, "codec_language_id", None), - spk_is_dialect=getattr(tcfg, "spk_is_dialect", None), - ) - except Exception as exc: - logger.warning("Failed to estimate prompt length, using fallback 2048: %s", exc) - return 2048 - - def get_default_sampling_params_list(self) -> list[OmniSamplingParams]: - """ - Get a list of default sampling parameters for all stages. - - Returns: - List of SamplingParams with default decoding for each stage - """ - if not hasattr(self.omni, "default_sampling_params_list"): - raise AttributeError("Omni.default_sampling_params_list is not available") - return list(self.omni.default_sampling_params_list) - - def get_omni_inputs( - self, - prompts: list[str] | str, - system_prompt: str | None = None, - audios: PromptAudioInput = None, - images: PromptImageInput = None, - videos: PromptVideoInput = None, - mm_processor_kwargs: dict[str, Any] | None = None, - modalities: list[str] | None = None, - ) -> list[TextPrompt]: - """ - Construct Omni input format from prompts and multimodal data. 
- - Args: - prompts: Text prompt(s) - either a single string or list of strings - system_prompt: Optional system prompt (defaults to Qwen system prompt) - audios: Audio input(s) - tuple of (audio_array, sample_rate) or list of tuples - images: Image input(s) - PIL Image or list of PIL Images - videos: Video input(s) - numpy array or list of numpy arrays - mm_processor_kwargs: Optional processor kwargs (e.g., use_audio_in_video) - - Returns: - List of prompt dictionaries suitable for Omni.generate() - """ - if system_prompt is None: - system_prompt = ( - "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " - "Group, capable of perceiving auditory and visual inputs, as well as " - "generating text and speech." - ) - - video_padding_token = "<|VIDEO|>" - image_padding_token = "<|IMAGE|>" - audio_padding_token = "<|AUDIO|>" - - if "Qwen3-Omni-30B-A3B-Instruct" in self.model_name: - video_padding_token = "<|video_pad|>" - image_padding_token = "<|image_pad|>" - audio_padding_token = "<|audio_pad|>" - - if isinstance(prompts, str): - prompts = [prompts] - - # Qwen-TTS: follow examples/offline_inference/qwen3_tts/end2end.py style. - # Stage 0 expects token placeholders + additional_information (text/speaker/task_type/...), - # and Talker replaces embeddings in preprocess based on additional_information only. 
- is_tts_model = "Qwen3-TTS" in self.model_name or "qwen3_tts" in self.model_name.lower() - if is_tts_model and modalities == ["audio"]: - tts_kw = mm_processor_kwargs or {} - task_type = tts_kw.get("task_type", "CustomVoice") - speaker = tts_kw.get("speaker", "Vivian") - language = tts_kw.get("language", "Auto") - max_new_tokens = int(tts_kw.get("max_new_tokens", 2048)) - ref_audio = tts_kw.get("ref_audio", None) - ref_text = tts_kw.get("ref_text", None) - - omni_inputs: list[TextPrompt] = [] - for prompt_text in prompts: - text_str = str(prompt_text).strip() or " " - additional_information: dict[str, Any] = { - "task_type": [task_type], - "text": [text_str], - "language": [language], - "speaker": [speaker], - "max_new_tokens": [max_new_tokens], - } - if ref_audio is not None: - additional_information["ref_audio"] = [ref_audio] - if ref_text is not None: - additional_information["ref_text"] = [ref_text] - # Use official helper to get correct placeholder length - plen = self._estimate_prompt_len(additional_information, self.model_name) - input_dict: TextPrompt = { - "prompt_token_ids": [0] * plen, - "additional_information": additional_information, - } - omni_inputs.append(input_dict) - return omni_inputs - - def _normalize_mm_input(mm_input, num_prompts): - if mm_input is None: - return [None] * num_prompts - if isinstance(mm_input, list): - if len(mm_input) != num_prompts: - raise ValueError( - f"Multimodal input list length ({len(mm_input)}) must match prompts length ({num_prompts})" - ) - return mm_input - return [mm_input] * num_prompts - - num_prompts = len(prompts) - audios_list = _normalize_mm_input(audios, num_prompts) - images_list = _normalize_mm_input(images, num_prompts) - videos_list = _normalize_mm_input(videos, num_prompts) - - omni_inputs = [] - for i, prompt_text in enumerate(prompts): - user_content = "" - multi_modal_data = {} - - audio = audios_list[i] - if audio is not None: - if isinstance(audio, list): - for _ in audio: - user_content += 
f"<|audio_bos|>{audio_padding_token}<|audio_eos|>" - multi_modal_data["audio"] = audio - else: - user_content += f"<|audio_bos|>{audio_padding_token}<|audio_eos|>" - multi_modal_data["audio"] = audio - - image = images_list[i] - if image is not None: - if isinstance(image, list): - for _ in image: - user_content += f"<|vision_bos|>{image_padding_token}<|vision_eos|>" - multi_modal_data["image"] = image - else: - user_content += f"<|vision_bos|>{image_padding_token}<|vision_eos|>" - multi_modal_data["image"] = image - - video = videos_list[i] - if video is not None: - if isinstance(video, list): - for _ in video: - user_content += f"<|vision_bos|>{video_padding_token}<|vision_eos|>" - multi_modal_data["video"] = video - else: - user_content += f"<|vision_bos|>{video_padding_token}<|vision_eos|>" - multi_modal_data["video"] = video - - user_content += prompt_text - - full_prompt = ( - f"<|im_start|>system\n{system_prompt}<|im_end|>\n" - f"<|im_start|>user\n{user_content}<|im_end|>\n" - f"<|im_start|>assistant\n" - ) - - input_dict: TextPrompt = {"prompt": full_prompt} - if multi_modal_data: - input_dict["multi_modal_data"] = multi_modal_data - if modalities: - input_dict["modalities"] = modalities - if mm_processor_kwargs: - input_dict["mm_processor_kwargs"] = mm_processor_kwargs - - omni_inputs.append(input_dict) - - return omni_inputs - - def generate( - self, - prompts: list[TextPrompt], - sampling_params_list: list[OmniSamplingParams] | None = None, - ) -> list[OmniRequestOutput]: - """ - Generate outputs for the given prompts. - - Args: - prompts: List of prompt dictionaries with 'prompt' and optionally - 'multi_modal_data' keys - sampling_params_list: List of sampling parameters for each stage. - If None, uses default parameters. 
- - Returns: - List of OmniRequestOutput objects from stages with final_output=True - """ - if sampling_params_list is None: - sampling_params_list = self.get_default_sampling_params_list() - - return self.omni.generate(prompts, sampling_params_list) - - def generate_multimodal( - self, - prompts: list[str] | str, - sampling_params_list: list[OmniSamplingParams] | None = None, - system_prompt: str | None = None, - audios: PromptAudioInput = None, - images: PromptImageInput = None, - videos: PromptVideoInput = None, - mm_processor_kwargs: dict[str, Any] | None = None, - modalities: list[str] | None = None, - ) -> list[OmniRequestOutput]: - """ - Convenience method to generate with multimodal inputs. - - Args: - prompts: Text prompt(s) - sampling_params_list: List of sampling parameters for each stage - system_prompt: Optional system prompt - audios: Audio input(s) - images: Image input(s) - videos: Video input(s) - mm_processor_kwargs: Optional processor kwargs - - Returns: - List of OmniRequestOutput objects from stages with final_output=True - """ - omni_inputs = self.get_omni_inputs( - prompts=prompts, - system_prompt=system_prompt, - audios=audios, - images=images, - videos=videos, - mm_processor_kwargs=mm_processor_kwargs, - modalities=modalities, - ) - return self.generate(omni_inputs, sampling_params_list) - - def start_profile( - self, - profile_prefix: str | None = None, - stages: list[int] | None = None, - ) -> list[Any]: - """Start profiling specified stages. - - Args: - profile_prefix: Optional prefix for the trace file names. - stages: List of stage IDs to profile. If None, profiles all stages. - - Returns: - List of results from each stage. - """ - return self.omni.start_profile(profile_prefix=profile_prefix, stages=stages) - - def stop_profile(self, stages: list[int] | None = None) -> list[Any]: - """Stop profiling specified stages. - - Args: - stages: List of stage IDs to profile. If None, stops all stages. 
- - Returns: - List of results from each stage. - """ - return self.omni.stop_profile(stages=stages) - - def _cleanup_process(self): - try: - keywords = ["enginecore"] - matched = [] - - for proc in psutil.process_iter(["pid", "name", "cmdline", "username"]): - try: - cmdline = " ".join(proc.cmdline()).lower() if proc.cmdline() else "" - name = proc.name().lower() - - is_process = any(keyword in cmdline for keyword in keywords) or any( - keyword in name for keyword in keywords - ) - - if is_process: - print(f"Found vllm process: PID={proc.pid}, cmd={cmdline[:100]}") - matched.append(proc) - except (psutil.NoSuchProcess, psutil.AccessDenied): - pass - - for proc in matched: - try: - proc.terminate() - except (psutil.NoSuchProcess, psutil.AccessDenied): - pass - - _, still_alive = psutil.wait_procs(matched, timeout=5) - for proc in still_alive: - try: - proc.kill() - except (psutil.NoSuchProcess, psutil.AccessDenied): - pass - - if still_alive: - _, stubborn = psutil.wait_procs(still_alive, timeout=3) - if stubborn: - print(f"Warning: failed to kill residual vllm pids: {[p.pid for p in stubborn]}") - else: - print(f"Force-killed residual vllm pids: {[p.pid for p in still_alive]}") - elif matched: - print(f"Terminated vllm pids: {[p.pid for p in matched]}") - - except Exception as e: - print(f"Error in psutil vllm cleanup: {e}") - - def __enter__(self): - """Context manager entry.""" - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - """Context manager exit - cleanup resources.""" - if hasattr(self.omni, "close"): - self.omni.close() - self._cleanup_process() - _run_pre_test_cleanup(enable_force=True) - _run_post_test_cleanup(enable_force=True) - cleanup_dist_env_and_memory() - - -@pytest.fixture(scope="module") -def omni_runner(request, model_prefix): - with _omni_server_lock: - model, stage_config_path = request.param - model = model_prefix + model - with OmniRunner(model, seed=42, stage_configs_path=stage_config_path) as runner: - print("OmniRunner 
started successfully") - yield runner - print("OmniRunner stopping...") - - print("OmniRunner stopped") - - -class OmniRunnerHandler: - def __init__(self, omni_runner): - self.runner = omni_runner - - def _process_output(self, outputs: list[Any]) -> OmniResponse: - result = OmniResponse() - try: - text_content = None - audio_content = None - for stage_output in outputs: - if getattr(stage_output, "final_output_type", None) == "text": - text_content = stage_output.request_output.outputs[0].text - if getattr(stage_output, "final_output_type", None) == "audio": - audio_content = stage_output.request_output.outputs[0].multimodal_output["audio"] - - result.audio_content = audio_content - result.text_content = text_content - result.success = True - - except Exception as e: - result.error_message = f"Output processing error: {str(e)}" - result.success = False - print(f"Error: {result.error_message}") - - return result - - def send_request(self, request_config: dict[str, Any] | None = None) -> OmniResponse: - if request_config is None: - request_config = {} - prompts = request_config.get("prompts") - videos = request_config.get("videos") - images = request_config.get("images") - audios = request_config.get("audios") - modalities = request_config.get("modalities", ["text", "audio"]) - outputs = self.runner.generate_multimodal( - prompts=prompts, videos=videos, images=images, audios=audios, modalities=modalities - ) - response = self._process_output(outputs) - assert_omni_response(response, request_config, run_level="core_model") - return response - - def send_audio_speech_request( - self, - request_config: dict[str, Any], - ) -> OmniResponse: - """ - Offline TTS: text -> audio via generate_multimodal, then validate with assert_audio_speech_response. - - request_config must contain: - - 'input' or 'prompts': text to synthesize. 
- Optional keys: - - 'voice' -> speaker (CustomVoice) - - 'task_type' -> task_type in additional_information (default: "CustomVoice") - - 'language' -> language in additional_information (default: "Auto") - - 'max_new_tokens' -> max_new_tokens in additional_information (default: 2048) - - 'response_format' -> desired audio format (used only for assertion) - """ - input_text = request_config.get("input") or request_config.get("prompts") - if input_text is None: - raise ValueError("request_config must contain 'input' or 'prompts' for TTS") - if isinstance(input_text, list): - input_text = input_text[0] if input_text else "" - - # Build TTS-specific kwargs passed through to get_omni_inputs for Qwen3-TTS, - # matching examples/offline_inference/qwen3_tts/end2end.py. - mm_processor_kwargs: dict[str, Any] = {} - if "voice" in request_config: - mm_processor_kwargs["speaker"] = request_config["voice"] - if "task_type" in request_config: - mm_processor_kwargs["task_type"] = request_config["task_type"] - if "ref_audio" in request_config: - mm_processor_kwargs["ref_audio"] = request_config["ref_audio"] - if "ref_text" in request_config: - mm_processor_kwargs["ref_text"] = request_config["ref_text"] - if "language" in request_config: - mm_processor_kwargs["language"] = request_config["language"] - if "max_new_tokens" in request_config: - mm_processor_kwargs["max_new_tokens"] = request_config["max_new_tokens"] - - outputs = self.runner.generate_multimodal( - prompts=input_text, - modalities=["audio"], - mm_processor_kwargs=mm_processor_kwargs or None, - ) - mm_out: dict[str, Any] | None = None - for stage_out in outputs: - if getattr(stage_out, "final_output_type", None) == "audio": - mm_out = stage_out.request_output.outputs[0].multimodal_output - break - if mm_out is None: - result = OmniResponse(success=False, error_message="No audio output from pipeline") - assert result.success, result.error_message - return result - - audio_data = mm_out.get("audio") - if audio_data is 
None: - result = OmniResponse(success=False, error_message="No audio tensor in multimodal output") - assert result.success, result.error_message - return result - - sr_raw = mm_out.get("sr") - sr_val = sr_raw[-1] if isinstance(sr_raw, list) and sr_raw else sr_raw - sr = int(sr_val.item() if hasattr(sr_val, "item") else sr_val) - wav_tensor = torch.cat(audio_data, dim=-1) if isinstance(audio_data, list) else audio_data - wav_buf = io.BytesIO() - sf.write( - wav_buf, - wav_tensor.float().cpu().numpy().reshape(-1), - samplerate=sr, - format="WAV", - subtype="PCM_16", - ) - result = OmniResponse(success=True, audio_bytes=wav_buf.getvalue(), audio_format="audio/wav") - assert_audio_speech_response(result, request_config, run_level="core_model") - return result - - def start_profile( - self, - profile_prefix: str | None = None, - stages: list[int] | None = None, - ) -> list[Any]: - """Start profiling specified stages.""" - return self.runner.start_profile(profile_prefix=profile_prefix, stages=stages) - - def stop_profile(self, stages: list[int] | None = None) -> list[Any]: - """Stop profiling specified stages.""" - return self.runner.stop_profile(stages=stages) - - -@pytest.fixture -def omni_runner_handler(omni_runner): - return OmniRunnerHandler(omni_runner) diff --git a/tests/core/sched/test_omni_scheduler_mixin.py b/tests/core/sched/test_omni_scheduler_mixin.py new file mode 100644 index 00000000000..e04a9c39fbc --- /dev/null +++ b/tests/core/sched/test_omni_scheduler_mixin.py @@ -0,0 +1,129 @@ +"""Unit tests for OmniSchedulerMixin streaming session replacement. + +These tests pin the behavior of `_replace_session_with_streaming_update` against +current vLLM `Request` / `StreamingUpdate` (and Omni patches). When upgrading +vLLM, failures here should highlight incompatible changes to request state or +update payloads early. 
+""" + +from __future__ import annotations + +from dataclasses import replace + +import pytest + +# Imports must run in this order: vllm_omni applies patches to vllm.v1.request before +# Request / StreamingUpdate are bound in this module. Ruff isort would reorder them. +# isort: off +import vllm_omni # noqa: F401 - import for side effects (patch vLLM) +from vllm.sampling_params import SamplingParams +from vllm.v1.engine import EngineCoreEventType +from vllm.v1.request import Request, RequestStatus, StreamingUpdate +from vllm_omni.core.sched.omni_scheduler_mixin import OmniSchedulerMixin + +# isort: on + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +class _SchedulerStub(OmniSchedulerMixin): + """Minimal scheduler surface required by OmniSchedulerMixin.""" + + def __init__(self, *, log_stats: bool = False) -> None: + self.num_waiting_for_streaming_input = 0 + self.log_stats = log_stats + + +def _make_request(**kwargs) -> Request: + sp = SamplingParams(max_tokens=8) + defaults = dict( + request_id="req-mixin-test", + prompt_token_ids=[1, 2, 3], + sampling_params=sp, + pooling_params=None, + arrival_time=100.0, + block_hasher=None, + ) + defaults.update(kwargs) + return Request(**defaults) + + +def _make_update(**kwargs) -> StreamingUpdate: + sp_new = SamplingParams(max_tokens=16) + defaults = dict( + mm_features=None, + prompt_token_ids=[10, 20], + max_tokens=32, + arrival_time=200.0, + sampling_params=sp_new, + ) + defaults.update(kwargs) + return StreamingUpdate(**defaults) + + +class TestReplaceSessionWithStreamingUpdate: + def test_resets_tokens_and_prompt_from_update(self) -> None: + sched = _SchedulerStub() + session = _make_request() + session.append_output_token_ids([7, 8]) + session.num_computed_tokens = 99 + session.status = RequestStatus.WAITING_FOR_STREAMING_REQ + + update = _make_update(prompt_token_ids=[40, 41, 42]) + sched.num_waiting_for_streaming_input = 3 + sched._replace_session_with_streaming_update(session, update) + + assert 
session._output_token_ids == [] + assert list(session._all_token_ids) == [40, 41, 42] + assert session.prompt_token_ids == [40, 41, 42] + assert session.num_computed_tokens == 0 + assert session.num_prompt_tokens == 3 + assert session.arrival_time == 200.0 + assert session.sampling_params is update.sampling_params + assert session.status == RequestStatus.WAITING + assert sched.num_waiting_for_streaming_input == 2 + + def test_none_prompt_token_ids_becomes_empty(self) -> None: + sched = _SchedulerStub() + session = _make_request() + session.status = RequestStatus.RUNNING + update = _make_update(prompt_token_ids=None) + sched._replace_session_with_streaming_update(session, update) + + assert session.prompt_token_ids == () + assert list(session._all_token_ids) == [] + assert session.num_prompt_tokens == 0 + assert sched.num_waiting_for_streaming_input == 0 + + def test_additional_information_cleared_when_update_omits_it(self) -> None: + sched = _SchedulerStub() + session = _make_request() + if not hasattr(session, "additional_information"): + pytest.skip("Request has no additional_information (Omni patch inactive?)") + session.additional_information = {"keep": True} + session.status = RequestStatus.RUNNING + + base = _make_update() + if not hasattr(base, "additional_information"): + pytest.skip("StreamingUpdate has no additional_information (Omni patch inactive?)") + update = replace(base, additional_information=None) + + sched._replace_session_with_streaming_update(session, update) + assert session.additional_information is None + + def test_does_not_decrement_waiting_when_not_streaming_status(self) -> None: + sched = _SchedulerStub() + session = _make_request() + session.status = RequestStatus.RUNNING + sched.num_waiting_for_streaming_input = 5 + sched._replace_session_with_streaming_update(session, _make_update()) + assert sched.num_waiting_for_streaming_input == 5 + + def test_records_queued_event_when_log_stats_enabled(self) -> None: + sched = 
_SchedulerStub(log_stats=True) + session = _make_request() + session.status = RequestStatus.WAITING_FOR_STREAMING_REQ + sched._replace_session_with_streaming_update(session, _make_update()) + + assert session.events + assert session.events[-1].type == EngineCoreEventType.QUEUED diff --git a/tests/core/test_prefix_cache.py b/tests/core/test_prefix_cache.py index c3d8c1ff928..b5d0e96d305 100644 --- a/tests/core/test_prefix_cache.py +++ b/tests/core/test_prefix_cache.py @@ -1,5 +1,3 @@ -from unittest.mock import Mock, patch - import pytest import torch @@ -19,10 +17,14 @@ def __init__(self, num_computed_tokens_cpu): self.req_ids = ["req1", "req2"] self.req_id_to_index = {req_id: i for i, req_id in enumerate(self.req_ids)} self.num_computed_tokens_cpu = num_computed_tokens_cpu + # Block table is only mocked for validation of length; # we don't actually need to add valid values here since # we patch the table when testing. - self.block_table = Mock() + class _DummyBlockTable: + pass + + self.block_table = _DummyBlockTable() self.block_table.block_tables = [None] @@ -186,7 +188,7 @@ def fake_get_cached_block_ids(self, req_idx, *args, **kwargs): @pytest.mark.parametrize("num_tokens_padded", [None, 16]) -def test_get_merged_hidden_states(num_tokens_padded): +def test_get_merged_hidden_states(num_tokens_padded, mocker): """Ensure that hidden states are merged correctly.""" cache = get_omni_pcache() @@ -221,16 +223,16 @@ def test_get_merged_hidden_states(num_tokens_padded): input_batch = MockInputBatch(num_computed_tokens_cpu=torch.Tensor([orig_num_tokens_unpadded, 0])) - with patch( + mocker.patch( "vllm_omni.core.prefix_cache.OmniTensorPrefixCache._get_cached_block_ids", new=fake_get_cached_block_ids, - ): - merged_states = cache.get_merged_hidden_states( - query_start_loc=[0, num_new_toks_req1], - input_batch=input_batch, - hidden_states=new_hidden_states, - num_scheduled_tokens=num_scheduled_tokens, - ) + ) + merged_states = cache.get_merged_hidden_states( + 
query_start_loc=[0, num_new_toks_req1], + input_batch=input_batch, + hidden_states=new_hidden_states, + num_scheduled_tokens=num_scheduled_tokens, + ) assert "req1" in merged_states and "req2" in merged_states req1_merged_states = merged_states["req1"] @@ -255,7 +257,7 @@ def test_get_merged_hidden_states(num_tokens_padded): {"foo": 100, "bar": 50, "baz": 10}, ], ) -def test_get_merged_multimodal_outputs(feat_dims, num_tokens_padded): +def test_get_merged_multimodal_outputs(feat_dims, num_tokens_padded, mocker): cache = get_omni_pcache_with_mm_tensors(feat_dims, seq_len=DEFAULT_SEQ_LEN) orig_num_tokens_unpadded = 8 @@ -298,16 +300,16 @@ def test_get_merged_multimodal_outputs(feat_dims, num_tokens_padded): input_batch = MockInputBatch(num_computed_tokens_cpu=torch.Tensor([orig_num_tokens_unpadded, 0])) - with patch( + mocker.patch( "vllm_omni.core.prefix_cache.OmniTensorPrefixCache._get_cached_block_ids", new=fake_get_cached_block_ids, - ): - merged_mm_outputs = cache.get_merged_multimodal_states( - query_start_loc=[0, num_new_toks_req1], - input_batch=input_batch, - multimodal_outputs=new_mm_outputs, - num_scheduled_tokens=num_scheduled_tokens, - ) + ) + merged_mm_outputs = cache.get_merged_multimodal_states( + query_start_loc=[0, num_new_toks_req1], + input_batch=input_batch, + multimodal_outputs=new_mm_outputs, + num_scheduled_tokens=num_scheduled_tokens, + ) # Ensure the passthrough data wasn't dropped assert "passthrough_data" in merged_mm_outputs diff --git a/tests/dfx/conftest.py b/tests/dfx/conftest.py index 997f25e6e54..c3f6d0a15d8 100644 --- a/tests/dfx/conftest.py +++ b/tests/dfx/conftest.py @@ -4,7 +4,7 @@ import pytest -from tests.conftest import modify_stage_config +from tests.helpers.stage_config import modify_stage_config def load_configs(config_path: str) -> list[dict[str, Any]]: @@ -40,22 +40,32 @@ def modify_stage(default_path, updates, deletes): def create_unique_server_params( configs: list[dict[str, Any]], stage_configs_dir: Path, -) -> 
list[tuple[str, str, str]]: +) -> list[tuple[str, str, str | None, str | None, tuple[str, ...]]]: unique_params = [] seen = set() for config in configs: test_name = config["test_name"] - model = config["server_params"]["model"] - stage_config_name = config["server_params"].get("stage_config_name") + server_params = config["server_params"] + model = server_params["model"] + stage_config_name = server_params.get("stage_config_name") if stage_config_name: stage_config_path = str(stage_configs_dir / stage_config_name) - delete = config["server_params"].get("delete", None) - update = config["server_params"].get("update", None) + delete = server_params.get("delete", None) + update = server_params.get("update", None) stage_config_path = modify_stage(stage_config_path, update, delete) else: stage_config_path = None - server_param = (test_name, model, stage_config_path) + stage_overrides = server_params.get("stage_overrides") + stage_overrides_json = json.dumps(stage_overrides) if stage_overrides else None + + # ``extra_cli_args`` passes raw CLI flags straight through to + # ``vllm_omni.entrypoints.cli.main serve`` — used for flags that + # don't map to stage-level overrides, e.g. ``--async-chunk`` / + # ``--no-async-chunk`` toggling the deploy-level async_chunk bool. 
+ extra_cli_args = tuple(server_params.get("extra_cli_args") or ()) + + server_param = (test_name, model, stage_config_path, stage_overrides_json, extra_cli_args) if server_param not in seen: seen.add(server_param) unique_params.append(server_param) diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py index bea46f684be..13011b4bdab 100644 --- a/tests/dfx/perf/scripts/run_benchmark.py +++ b/tests/dfx/perf/scripts/run_benchmark.py @@ -8,7 +8,6 @@ import pytest -from tests.conftest import OmniServer from tests.dfx.conftest import ( create_benchmark_indices, create_test_parameter_mapping, @@ -16,6 +15,7 @@ get_benchmark_params_for_server, load_configs, ) +from tests.helpers.runtime import OmniServer os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" @@ -48,8 +48,8 @@ def _get_config_file_from_argv() -> str | None: OMNI_RESULT_TEMPLATE_PATH = Path(__file__).parent / "result_omni_template.json" -STAGE_CONFIGS_DIR = Path(__file__).parent.parent / "stage_configs" -test_params = create_unique_server_params(BENCHMARK_CONFIGS, STAGE_CONFIGS_DIR) +DEPLOY_CONFIGS_DIR = Path(__file__).parent.parent / "deploy" +test_params = create_unique_server_params(BENCHMARK_CONFIGS, DEPLOY_CONFIGS_DIR) server_to_benchmark_mapping = create_test_parameter_mapping(BENCHMARK_CONFIGS) _omni_server_lock = threading.Lock() @@ -62,13 +62,19 @@ def omni_server(request): Multi-stage initialization can take 10-20+ minutes. """ with _omni_server_lock: - test_name, model, stage_config_path = request.param + test_name, model, stage_config_path, stage_overrides, extra_cli_args = request.param print(f"Starting OmniServer with test: {test_name}, model: {model}") server_args = ["--stage-init-timeout", "600", "--init-timeout", "900"] + # --deploy-config and --stage-overrides compose at the CLI (see vllm_omni/entrypoints/utils.py): + # deploy-config sets the base; stage-overrides are applied on top. Both can be set. 
if stage_config_path: - server_args = ["--stage-configs-path", stage_config_path] + server_args + server_args = ["--deploy-config", stage_config_path] + server_args + if stage_overrides: + server_args = ["--stage-overrides", stage_overrides] + server_args + if extra_cli_args: + server_args = list(extra_cli_args) + server_args with OmniServer(model, server_args) as server: server.test_name = test_name print("OmniServer started successfully") diff --git a/tests/dfx/perf/stage_configs/qwen3_omni.yaml b/tests/dfx/perf/stage_configs/qwen3_omni.yaml deleted file mode 100644 index 2add22b8732..00000000000 --- a/tests/dfx/perf/stage_configs/qwen3_omni.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# Stage config for running Qwen3-Omni-MoE with 3-stage architecture -# Stage 0: Thinker (multimodal understanding + text generation) -# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes) -# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) - -# The following config has been verified on 2x H100-80G GPUs. 
-async_chunk: false -stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "0" - engine_args: - model_stage: thinker - max_num_seqs: 64 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - hf_config_name: thinker_config - tensor_parallel_size: 1 - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - - - stage_id: 1 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "1" - engine_args: - model_stage: talker - max_num_seqs: 64 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output codec codes for code2wav - enable_prefix_caching: false - max_num_batched_tokens: 32768 - distributed_executor_backend: "mp" - hf_config_name: talker_config - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker - # final_output: true - # final_output_type: text - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: False - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 2 - stage_type: llm # Use llm stage type for AR stages - runtime: - devices: "1" - engine_args: - model_stage: code2wav - max_num_seqs: 64 - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: generation - 
scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.1 - distributed_executor_backend: "mp" - max_num_batched_tokens: 100000 - hf_config_name: thinker_config - engine_input_source: [1] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: True - repetition_penalty: 1.1 diff --git a/tests/dfx/perf/stage_configs/qwen3_tts.yaml b/tests/dfx/perf/stage_configs/qwen3_tts.yaml deleted file mode 100644 index 97b30905603..00000000000 --- a/tests/dfx/perf/stage_configs/qwen3_tts.yaml +++ /dev/null @@ -1,96 +0,0 @@ -# Stage config for running Qwen3-TTS with 2-stage architecture -# Stage 0: Talker (text -> 8-layer RVQ codec codes) -# Stage 1: Code2Wav (codec codes -> audio waveform) -# -# The following config has been verified on 1x H100-80G GPU. 
-async_chunk: true -stage_args: - - stage_id: 0 - stage_type: llm - is_comprehension: true - runtime: - devices: "0" - engine_args: - max_num_seqs: 4 - model_stage: qwen3_tts - model_arch: Qwen3TTSTalkerForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - enforce_eager: false - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: latent - gpu_memory_utilization: 0.3 - distributed_executor_backend: "mp" - max_num_batched_tokens: 512 - max_model_len: 4096 - custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk - output_connectors: - to_stage_1: connector_of_shared_memory - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: false - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 1 - stage_type: llm - runtime: - devices: "0" - engine_args: - max_num_seqs: 4 - model_stage: code2wav - model_arch: Qwen3TTSCode2Wav - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: audio - gpu_memory_utilization: 0.2 - distributed_executor_backend: "mp" - max_num_batched_tokens: 8192 - max_model_len: 32768 - engine_input_source: [0] - final_output: true - final_output_type: audio - input_connectors: - from_stage_0: connector_of_shared_memory - tts_args: - max_instructions_length: 500 - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: true - repetition_penalty: 1.0 - -runtime: - enabled: true - defaults: - window_size: -1 - max_inflight: 4 - - connectors: - connector_of_shared_memory: - name: SharedMemoryConnector - extra: - shm_threshold_bytes: 65536 - codec_streaming: true - 
connector_get_sleep_s: 0.01 - connector_get_max_wait_first_chunk: 3000 - connector_get_max_wait: 300 - codec_chunk_frames: 25 - codec_left_context_frames: 72 - - edges: - - from: 0 - to: 1 - window_size: -1 diff --git a/tests/dfx/perf/tests/test_qwen_omni.json b/tests/dfx/perf/tests/test_qwen_omni.json index 4662f8c0c71..ca3eb555708 100644 --- a/tests/dfx/perf/tests/test_qwen_omni.json +++ b/tests/dfx/perf/tests/test_qwen_omni.json @@ -3,7 +3,7 @@ "test_name": "test_qwen3_omni", "server_params": { "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "stage_config_name": "qwen3_omni.yaml" + "extra_cli_args": ["--no-async-chunk"] }, "benchmark_params": [ { @@ -109,25 +109,7 @@ "test_name": "test_qwen3_omni_chunk", "server_params": { "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "stage_config_name": "qwen3_omni.yaml", - "update": { - "async_chunk": true, - "stage_args": { - "0": { - "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk" - }, - "1": { - "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk" - } - } - }, - "delete": { - "stage_args": { - "2": [ - "custom_process_input_func" - ] - } - } + "extra_cli_args": ["--async-chunk"] }, "benchmark_params": [ { diff --git a/tests/dfx/stability/conftest.py b/tests/dfx/stability/conftest.py index 3a0aee7608f..e36c88b9aa6 100644 --- a/tests/dfx/stability/conftest.py +++ b/tests/dfx/stability/conftest.py @@ -1,125 +1,17 @@ -""" -Stability-specific conftest: when pytest is executed under this directory, -resource monitoring is started before each test and finalized after each test, -so each stability test case gets its own HTML report (one report per case). -No need to wrap pytest with `bash resource_monitor.sh run -- pytest ...`. 
-""" +"""Stability pytest hooks and fixtures.""" -import os import subprocess import sys import threading -import time -from pathlib import Path import pytest -STABILITY_DIR = Path(__file__).resolve().parent -RESOURCE_MONITOR_SCRIPT = STABILITY_DIR / "scripts" / "resource_monitor.sh" -REPO_ROOT = STABILITY_DIR.parent.parent.parent - - -def _start_resource_monitor(): - """Start `resource_monitor.sh start` in the background and return `Popen` or `None`.""" - if not RESOURCE_MONITOR_SCRIPT.is_file(): - return None - try: - proc = subprocess.Popen( - ["bash", str(RESOURCE_MONITOR_SCRIPT), "start", "--backend", "gpu"], - cwd=str(REPO_ROOT), - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - start_new_session=True, - ) - try: - proc.wait(timeout=2) - if proc.returncode != 0: - stderr = proc.stderr.read().decode("utf-8", errors="ignore") if proc.stderr else "" - if stderr.strip(): - sys.stderr.write(f"[Stability] Resource monitor failed to start: {stderr.strip()}\n") - return None - except subprocess.TimeoutExpired: - pass - return proc - except (FileNotFoundError, OSError): - return None - - -def _get_monitor_data_root() -> Path: - data_root = os.environ.get("RESOURCE_MONITOR_DATA_ROOT") or os.environ.get("GPU_MONITOR_DATA_ROOT") - if data_root: - return Path(data_root) - return STABILITY_DIR / "gpu_monitor_data" - - -def _wait_for_run_dir(timeout_sec: int = 10) -> Path | None: - data_root = _get_monitor_data_root() - run_id_file = data_root / "current_run_id" - deadline = time.time() + timeout_sec - while time.time() < deadline: - if run_id_file.is_file(): - run_id = run_id_file.read_text(encoding="utf-8").strip() - if run_id: - run_dir = data_root / run_id - if run_dir.is_dir(): - return run_dir - time.sleep(0.5) - return None - - -def _report_latest_gpu_samples(stop_event: threading.Event) -> None: - """Periodically print the latest sampled GPU line.""" - log_interval = int( - os.environ.get("RESOURCE_MONITOR_LOG_INTERVAL") or 
os.environ.get("GPU_MONITOR_LOG_INTERVAL") or "15" - ) - log_interval = max(log_interval, 1) - last_line = "" - - time.sleep(min(log_interval, 5)) - while not stop_event.wait(log_interval): - run_dir = _wait_for_run_dir(timeout_sec=1) - if run_dir is None: - continue - csv_file = run_dir / "gpu_metrics.csv" - if not csv_file.is_file(): - continue - try: - lines = csv_file.read_text(encoding="utf-8").splitlines() - except OSError: - continue - if len(lines) <= 1: - continue - latest = lines[-1].strip() - if latest and latest != last_line: - last_line = latest - sys.stderr.write(f"[GPU] {latest}\n") - - -def _finalize_resource_monitor() -> str | None: - """ - Run `resource_monitor.sh finalize` for the current run and generate the report. - Returns the bundle dir path (for this test case's report) if successful, else None. - """ - if not RESOURCE_MONITOR_SCRIPT.is_file(): - return None - try: - result = subprocess.run( - ["bash", str(RESOURCE_MONITOR_SCRIPT), "finalize", "--backend", "gpu"], - cwd=str(REPO_ROOT), - capture_output=True, - text=True, - timeout=60, - check=False, - ) - if result.returncode != 0: - return None - for line in (result.stdout or "").splitlines(): - if line.startswith("GPU_MONITOR_BUNDLE_DIR=") or line.startswith("RESOURCE_MONITOR_BUNDLE_DIR="): - _, _, value = line.partition("=") - return value.strip() if value else None - return None - except (FileNotFoundError, OSError, subprocess.TimeoutExpired): - return None +from tests.dfx.stability.helpers import ( + finalize_resource_monitor, + report_latest_gpu_samples, + start_resource_monitor, + wait_for_run_dir, +) @pytest.fixture(autouse=True) @@ -128,19 +20,19 @@ def stability_resource_monitor_per_test(request: pytest.FixtureRequest): For each test under this directory: start GPU monitor before the test, then finalize after the test so this case gets its own report.html. 
""" - proc = _start_resource_monitor() + proc = start_resource_monitor() stop_event = threading.Event() reporter: threading.Thread | None = None if proc is not None: reporter = threading.Thread( - target=_report_latest_gpu_samples, + target=report_latest_gpu_samples, args=(stop_event,), name="stability-resource-monitor-reporter", daemon=True, ) reporter.start() - run_dir = _wait_for_run_dir(timeout_sec=5) + run_dir = wait_for_run_dir(timeout_sec=5) node_name = request.node.name if run_dir is not None: sys.stderr.write(f"[Stability] Resource monitor started for test: {node_name} | run dir: {run_dir}\n") @@ -161,7 +53,7 @@ def stability_resource_monitor_per_test(request: pytest.FixtureRequest): except subprocess.TimeoutExpired: proc.kill() proc.wait() - bundle_dir = _finalize_resource_monitor() + bundle_dir = finalize_resource_monitor() node_name = request.node.name if bundle_dir: sys.stderr.write(f"[Stability] Report for test «{node_name}»: {bundle_dir}/report.html\n") diff --git a/tests/dfx/stability/helpers.py b/tests/dfx/stability/helpers.py new file mode 100644 index 00000000000..3956bd21304 --- /dev/null +++ b/tests/dfx/stability/helpers.py @@ -0,0 +1,117 @@ +"""Stability resource monitor helpers.""" + +from __future__ import annotations + +import os +import subprocess +import sys +import threading +import time +from pathlib import Path + +STABILITY_DIR = Path(__file__).resolve().parent +RESOURCE_MONITOR_SCRIPT = STABILITY_DIR / "scripts" / "resource_monitor.sh" +REPO_ROOT = STABILITY_DIR.parent.parent.parent + + +def start_resource_monitor(): + """Start `resource_monitor.sh start` in the background and return `Popen` or `None`.""" + if not RESOURCE_MONITOR_SCRIPT.is_file(): + return None + try: + proc = subprocess.Popen( + ["bash", str(RESOURCE_MONITOR_SCRIPT), "start", "--backend", "gpu"], + cwd=str(REPO_ROOT), + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + start_new_session=True, + ) + try: + proc.wait(timeout=2) + if proc.returncode != 0: + stderr 
= proc.stderr.read().decode("utf-8", errors="ignore") if proc.stderr else "" + if stderr.strip(): + sys.stderr.write(f"[Stability] Resource monitor failed to start: {stderr.strip()}\n") + return None + except subprocess.TimeoutExpired: + pass + return proc + except (FileNotFoundError, OSError): + return None + + +def get_monitor_data_root() -> Path: + data_root = os.environ.get("RESOURCE_MONITOR_DATA_ROOT") or os.environ.get("GPU_MONITOR_DATA_ROOT") + if data_root: + return Path(data_root) + return STABILITY_DIR / "gpu_monitor_data" + + +def wait_for_run_dir(timeout_sec: int = 10) -> Path | None: + data_root = get_monitor_data_root() + run_id_file = data_root / "current_run_id" + deadline = time.time() + timeout_sec + while time.time() < deadline: + if run_id_file.is_file(): + run_id = run_id_file.read_text(encoding="utf-8").strip() + if run_id: + run_dir = data_root / run_id + if run_dir.is_dir(): + return run_dir + time.sleep(0.5) + return None + + +def report_latest_gpu_samples(stop_event: threading.Event) -> None: + """Periodically print the latest sampled GPU line.""" + log_interval = int( + os.environ.get("RESOURCE_MONITOR_LOG_INTERVAL") or os.environ.get("GPU_MONITOR_LOG_INTERVAL") or "15" + ) + log_interval = max(log_interval, 1) + last_line = "" + + time.sleep(min(log_interval, 5)) + while not stop_event.wait(log_interval): + run_dir = wait_for_run_dir(timeout_sec=1) + if run_dir is None: + continue + csv_file = run_dir / "gpu_metrics.csv" + if not csv_file.is_file(): + continue + try: + lines = csv_file.read_text(encoding="utf-8").splitlines() + except OSError: + continue + if len(lines) <= 1: + continue + latest = lines[-1].strip() + if latest and latest != last_line: + last_line = latest + sys.stderr.write(f"[GPU] {latest}\n") + + +def finalize_resource_monitor() -> str | None: + """ + Run `resource_monitor.sh finalize` for the current run and generate the report. + Returns the bundle dir path (for this test case's report) if successful, else None. 
+ """ + if not RESOURCE_MONITOR_SCRIPT.is_file(): + return None + try: + result = subprocess.run( + ["bash", str(RESOURCE_MONITOR_SCRIPT), "finalize", "--backend", "gpu"], + cwd=str(REPO_ROOT), + capture_output=True, + text=True, + timeout=60, + check=False, + ) + if result.returncode != 0: + return None + for line in (result.stdout or "").splitlines(): + if line.startswith("GPU_MONITOR_BUNDLE_DIR=") or line.startswith("RESOURCE_MONITOR_BUNDLE_DIR="): + _, _, value = line.partition("=") + return value.strip() if value else None + return None + except (FileNotFoundError, OSError, subprocess.TimeoutExpired): + return None diff --git a/tests/dfx/stability/scripts/test_benchmark_stability.py b/tests/dfx/stability/scripts/test_benchmark_stability.py index a9faae8ab84..620241762d3 100644 --- a/tests/dfx/stability/scripts/test_benchmark_stability.py +++ b/tests/dfx/stability/scripts/test_benchmark_stability.py @@ -24,7 +24,6 @@ import pytest -from tests.conftest import OmniServer from tests.dfx.conftest import ( create_benchmark_indices, create_test_parameter_mapping, @@ -33,9 +32,10 @@ load_configs, ) from tests.dfx.perf.scripts.run_benchmark import run_benchmark +from tests.helpers.runtime import OmniServer STABILITY_DIR = Path(__file__).resolve().parent.parent -STAGE_CONFIGS_DIR = STABILITY_DIR / "stage_configs" +DEPLOY_CONFIGS_DIR = STABILITY_DIR / "deploy" CONFIG_FILE_PATH = str(STABILITY_DIR / "tests" / "test.json") DEFAULT_NUM_PROMPTS_PER_BATCH = 20 @@ -45,7 +45,7 @@ except FileNotFoundError: BENCHMARK_CONFIGS = [] -test_params = create_unique_server_params(BENCHMARK_CONFIGS, STAGE_CONFIGS_DIR) if BENCHMARK_CONFIGS else [] +test_params = create_unique_server_params(BENCHMARK_CONFIGS, DEPLOY_CONFIGS_DIR) if BENCHMARK_CONFIGS else [] server_to_benchmark_mapping = create_test_parameter_mapping(BENCHMARK_CONFIGS) if BENCHMARK_CONFIGS else {} _omni_server_lock = threading.Lock() @@ -219,11 +219,20 @@ def omni_server(request): Multi-stage initialization can take 10-20+ 
minutes. """ with _omni_server_lock: - test_name, model, stage_config_path = request.param + test_name, model, stage_config_path, stage_overrides, extra_cli_args = request.param print(f"Starting OmniServer with test: {test_name}, model: {model}") - with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "120"]) as server: + server_args = ["--stage-init-timeout", "120"] + # --deploy-config and --stage-overrides compose at the CLI (see vllm_omni/entrypoints/utils.py): + # deploy-config sets the base; stage-overrides are applied on top. Both can be set. + if stage_config_path: + server_args = ["--deploy-config", stage_config_path] + server_args + if stage_overrides: + server_args = ["--stage-overrides", stage_overrides] + server_args + if extra_cli_args: + server_args = list(extra_cli_args) + server_args + with OmniServer(model, server_args) as server: server.test_name = test_name print("OmniServer started successfully") yield server diff --git a/tests/dfx/stability/stage_configs/qwen3_omni.yaml b/tests/dfx/stability/stage_configs/qwen3_omni.yaml deleted file mode 100644 index 802f8dd2494..00000000000 --- a/tests/dfx/stability/stage_configs/qwen3_omni.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# Stage config for running Qwen3-Omni-MoE with 3-stage architecture -# Stage 0: Thinker (multimodal understanding + text generation) -# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes) -# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) - -# The following config has been verified on 2x H100-80G GPUs. 
-async_chunk: false -stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type to launch OmniLLM - runtime: - devices: "0" - max_batch_size: 64 - engine_args: - model_stage: thinker - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.9 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output hidden states for talker - distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - hf_config_name: thinker_config - tensor_parallel_size: 1 - final_output: true - final_output_type: text - is_comprehension: true - default_sampling_params: - temperature: 0.4 - top_p: 0.9 - top_k: 1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - - - stage_id: 1 - stage_type: llm # Use llm stage type to launch OmniLLM - runtime: - devices: "1" - max_batch_size: 64 - engine_args: - model_stage: talker - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.6 - enforce_eager: false - trust_remote_code: true - engine_output_type: latent # Output codec codes for code2wav - enable_prefix_caching: false - max_num_batched_tokens: 32768 - distributed_executor_backend: "mp" - hf_config_name: talker_config - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker - # final_output: true - # final_output_type: text - default_sampling_params: - temperature: 0.9 - top_k: 50 - max_tokens: 4096 - seed: 42 - detokenize: False - repetition_penalty: 1.05 - stop_token_ids: [2150] - - - stage_id: 2 - stage_type: llm # Use llm stage type to launch OmniLLM - runtime: - devices: "1" - max_batch_size: 64 - engine_args: - model_stage: code2wav - model_arch: Qwen3OmniMoeForConditionalGeneration - worker_type: 
generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - enforce_eager: true - trust_remote_code: true - async_scheduling: false - enable_prefix_caching: false - engine_output_type: audio # Final output: audio waveform - gpu_memory_utilization: 0.1 - distributed_executor_backend: "mp" - max_num_batched_tokens: 1000000 - hf_config_name: thinker_config - engine_input_source: [1] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 65536 - seed: 42 - detokenize: True - repetition_penalty: 1.1 diff --git a/tests/dfx/stability/tests/test.json b/tests/dfx/stability/tests/test.json index 95993c9c556..255cd5b1091 100644 --- a/tests/dfx/stability/tests/test.json +++ b/tests/dfx/stability/tests/test.json @@ -3,7 +3,11 @@ "test_name": "test_qwen3_omni_stability", "server_params": { "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "stage_config_name": "qwen3_omni.yaml" + "stage_overrides": { + "2": { + "max_num_batched_tokens": 1000000 + } + } }, "benchmark_params": [ { @@ -36,25 +40,12 @@ "test_name": "test_qwen3_omni_stability_async_chunk", "server_params": { "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "stage_config_name": "qwen3_omni.yaml", - "update": { - "async_chunk": true, - "stage_args": { - "0": { - "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk" - }, - "1": { - "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk" - } + "stage_overrides": { + "2": { + "max_num_batched_tokens": 1000000 } }, - "delete": { - "stage_args": { - "2": [ - "custom_process_input_func" - ] - } - } + "extra_cli_args": ["--async-chunk"] }, "benchmark_params": [ { diff --git 
a/tests/diffusion/cache/test_cache_dit.py b/tests/diffusion/cache/test_cache_dit.py new file mode 100644 index 00000000000..0b7ef723585 --- /dev/null +++ b/tests/diffusion/cache/test_cache_dit.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Model specific tests for CacheDiT enablement. +""" + +from unittest.mock import Mock, patch + +import pytest + +import vllm_omni.diffusion.cache.cache_dit_backend as cd_backend +from vllm_omni.diffusion.data import DiffusionCacheConfig + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + +SEPARATE_CFG_ENABLERS = [ + cd_backend.enable_cache_for_ltx2, + cd_backend.enable_cache_for_wan22, + cd_backend.enable_cache_for_longcat_image, +] + +SAMPLE_CACHE_CONFIG = DiffusionCacheConfig() + + +@pytest.mark.parametrize("enabler", SEPARATE_CFG_ENABLERS) +@patch("vllm_omni.diffusion.cache.cache_dit_backend.BlockAdapter") +@patch("vllm_omni.diffusion.cache.cache_dit_backend.cache_dit") +def test_separate_cfg(mock_cache_dit, mock_block_adapter, enabler): + """Ensure that custom enablers for models with separate CFG pass + the param through to cache_dit correctly. 
+ + Regression test for: https://github.com/vllm-project/vllm-omni/pull/2860 + """ + mock_pipeline = Mock() + enabler(mock_pipeline, SAMPLE_CACHE_CONFIG) + + mock_cache_dit.enable_cache.assert_called_once() + adapter_kwargs = mock_block_adapter.call_args.kwargs + assert adapter_kwargs["has_separate_cfg"] is True diff --git a/tests/diffusion/cache/test_teacache_extractors.py b/tests/diffusion/cache/test_teacache_extractors.py index c22a60e227e..4bb958a36c1 100644 --- a/tests/diffusion/cache/test_teacache_extractors.py +++ b/tests/diffusion/cache/test_teacache_extractors.py @@ -21,7 +21,7 @@ import pytest import torch -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test from vllm_omni.diffusion.cache.teacache.extractors import extract_flux2_context, extract_flux2_klein_context from vllm_omni.diffusion.models.flux2_klein.flux2_klein_transformer import ( Flux2Transformer2DModel, diff --git a/tests/diffusion/distributed/test_ulysses_uaa_perf.py b/tests/diffusion/distributed/test_ulysses_uaa_perf.py index 04bbf5ee863..2a16a9ae578 100644 --- a/tests/diffusion/distributed/test_ulysses_uaa_perf.py +++ b/tests/diffusion/distributed/test_ulysses_uaa_perf.py @@ -17,7 +17,7 @@ import torch import torch.distributed as dist -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test from vllm_omni.diffusion.attention.parallel.ulysses import ( _all_gather_int, _ulysses_all_to_all_any_o, diff --git a/tests/diffusion/layers/test_rotary_emb_equivalence.py b/tests/diffusion/layers/test_rotary_emb_equivalence.py new file mode 100644 index 00000000000..2fbb7a31f5a --- /dev/null +++ b/tests/diffusion/layers/test_rotary_emb_equivalence.py @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Numerical equivalence tests for rotary embedding implementations (#2436). 
+ +Verifies that the optimized stack+flatten RoPE produces bit-identical results +to the original strided-slice implementation across various tensor shapes and +dtypes, ensuring the refactor is safe. +""" + +from __future__ import annotations + +import pytest +import torch + + +def _apply_rotary_emb_helios_original( + hidden_states: torch.Tensor, + freqs_cis: torch.Tensor, +) -> torch.Tensor: + """Original Helios RoPE using strided slice assignment (pre-#2436).""" + x_1, x_2 = hidden_states.unflatten(-1, (-1, 2)).unbind(-1) + cos, sin = freqs_cis.unsqueeze(-2).chunk(2, dim=-1) + out = torch.empty_like(hidden_states) + out[..., 0::2] = x_1 * cos[..., 0::2] - x_2 * sin[..., 1::2] + out[..., 1::2] = x_1 * sin[..., 1::2] + x_2 * cos[..., 0::2] + return out.type_as(hidden_states) + + +def _apply_rotary_emb_helios_optimized( + hidden_states: torch.Tensor, + freqs_cis: torch.Tensor, +) -> torch.Tensor: + """Optimized Helios RoPE using stack+flatten (post-#2436).""" + x_1, x_2 = hidden_states.unflatten(-1, (-1, 2)).unbind(-1) + cos, sin = freqs_cis.unsqueeze(-2).chunk(2, dim=-1) + rotated = torch.stack( + ( + x_1 * cos[..., 0::2] - x_2 * sin[..., 1::2], + x_1 * sin[..., 1::2] + x_2 * cos[..., 0::2], + ), + dim=-1, + ) + return rotated.flatten(-2, -1).type_as(hidden_states) + + +def _make_inputs( + batch: int, + seq_len: int, + num_heads: int, + head_dim: int, + dtype: torch.dtype = torch.float32, +) -> tuple[torch.Tensor, torch.Tensor]: + """Generate random hidden_states and freqs_cis for testing.""" + torch.manual_seed(42) + hidden_states = torch.randn(batch, seq_len, num_heads, head_dim, dtype=dtype) + # freqs_cis: [B, seq, head_dim*2] — cos and sin concatenated along last dim + freqs_cis = torch.randn(batch, seq_len, head_dim * 2, dtype=dtype) + return hidden_states, freqs_cis + + +class TestHeliosRoPEEquivalence: + """Verify optimized Helios RoPE is numerically identical to original.""" + + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, 
torch.bfloat16]) + def test_equivalence_across_dtypes(self, dtype: torch.dtype) -> None: + """Optimized output must be bit-identical to original across dtypes.""" + hidden, freqs = _make_inputs(2, 16, 8, 64, dtype=dtype) + original = _apply_rotary_emb_helios_original(hidden, freqs) + optimized = _apply_rotary_emb_helios_optimized(hidden, freqs) + torch.testing.assert_close(optimized, original, atol=0, rtol=0) + + @pytest.mark.parametrize( + "batch,seq_len,num_heads,head_dim", + [ + (1, 8, 1, 32), # minimal: single batch, single head + (2, 16, 8, 64), # typical transformer config + (1, 8192, 4, 64), # video-scale patch tokens (720p DiT) + (4, 32, 16, 128), # large head_dim + ], + ) + def test_equivalence_across_shapes(self, batch: int, seq_len: int, num_heads: int, head_dim: int) -> None: + """Equivalence must hold across different tensor shapes.""" + hidden, freqs = _make_inputs(batch, seq_len, num_heads, head_dim) + original = _apply_rotary_emb_helios_original(hidden, freqs) + optimized = _apply_rotary_emb_helios_optimized(hidden, freqs) + torch.testing.assert_close(optimized, original, atol=0, rtol=0) + + def test_output_contiguous(self) -> None: + """Optimized output should be contiguous in memory.""" + hidden, freqs = _make_inputs(2, 16, 8, 64) + optimized = _apply_rotary_emb_helios_optimized(hidden, freqs) + assert optimized.is_contiguous() + + def test_output_shape_preserved(self) -> None: + """Output shape must match input shape.""" + hidden, freqs = _make_inputs(2, 16, 8, 64) + optimized = _apply_rotary_emb_helios_optimized(hidden, freqs) + assert optimized.shape == hidden.shape + + def test_output_dtype_preserved(self) -> None: + """Output dtype must match input dtype.""" + hidden, freqs = _make_inputs(2, 16, 8, 64, dtype=torch.float16) + optimized = _apply_rotary_emb_helios_optimized(hidden, freqs) + assert optimized.dtype == hidden.dtype + + def test_odd_head_dim_raises(self) -> None: + """Odd head_dim should fail at unflatten (not a valid RoPE 
config).""" + hidden = torch.randn(1, 4, 2, 63) + freqs = torch.randn(1, 4, 126) + with pytest.raises(RuntimeError): + _apply_rotary_emb_helios_optimized(hidden, freqs) diff --git a/tests/diffusion/lora/conftest.py b/tests/diffusion/lora/helpers.py similarity index 100% rename from tests/diffusion/lora/conftest.py rename to tests/diffusion/lora/helpers.py diff --git a/tests/diffusion/lora/test_lora_manager.py b/tests/diffusion/lora/test_lora_manager.py index 83ac7a1144b..785f5d84217 100644 --- a/tests/diffusion/lora/test_lora_manager.py +++ b/tests/diffusion/lora/test_lora_manager.py @@ -8,7 +8,7 @@ from vllm.lora.lora_weights import LoRALayerWeights from vllm.lora.utils import get_supported_lora_modules -from tests.diffusion.lora.conftest import ( +from tests.diffusion.lora.helpers import ( DummyBaseLayerWithLoRA, FakeLinearBase, fake_replace_submodule, diff --git a/tests/diffusion/models/bagel/test_bagel_lora.py b/tests/diffusion/models/bagel/test_bagel_lora.py index 8cb3446ed53..c285758fe86 100644 --- a/tests/diffusion/models/bagel/test_bagel_lora.py +++ b/tests/diffusion/models/bagel/test_bagel_lora.py @@ -11,7 +11,7 @@ import torch from safetensors.torch import save_file -from tests.diffusion.lora.conftest import ( +from tests.diffusion.lora.helpers import ( DummyBaseLayerWithLoRA, FakeLinearBase, fake_replace_submodule, diff --git a/tests/diffusion/models/flux2/test_flux2_transformer_tp.py b/tests/diffusion/models/flux2/test_flux2_transformer_tp.py index 54dda1dd07e..c613bb0b4c8 100644 --- a/tests/diffusion/models/flux2/test_flux2_transformer_tp.py +++ b/tests/diffusion/models/flux2/test_flux2_transformer_tp.py @@ -2,7 +2,7 @@ import torch from pytest_mock import MockerFixture -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test from vllm_omni.diffusion.models.flux2.flux2_transformer import ( Flux2PosEmbed, Flux2Transformer2DModel, diff --git a/tests/diffusion/models/glm_image/test_glm_image_sp.py 
b/tests/diffusion/models/glm_image/test_glm_image_sp.py index 1b1c8d7a75b..40d1c873070 100644 --- a/tests/diffusion/models/glm_image/test_glm_image_sp.py +++ b/tests/diffusion/models/glm_image/test_glm_image_sp.py @@ -2,27 +2,26 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for GLM-Image Sequence Parallelism support.""" -from unittest.mock import MagicMock, patch - import pytest from vllm_omni.diffusion.data import DiffusionParallelConfig @pytest.fixture(scope="function", autouse=True) -def setup_sp_groups(): +def setup_sp_groups(mocker): """Set up SP and TP groups for each test function.""" - with patch("vllm_omni.diffusion.distributed.parallel_state.get_sp_group") as mock_get_sp_group: - with patch("vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", return_value=1): - with patch("vllm.distributed.parallel_state.get_tp_group") as mock_get_tp_group: - mock_sp_group = MagicMock() - mock_sp_group.world_size = 4 - mock_get_sp_group.return_value = mock_sp_group - - mock_tp_group = MagicMock() - mock_tp_group.world_size = 1 - mock_get_tp_group.return_value = mock_tp_group - yield + mock_get_sp_group = mocker.patch("vllm_omni.diffusion.distributed.parallel_state.get_sp_group") + mocker.patch("vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", return_value=1) + mock_get_tp_group = mocker.patch("vllm.distributed.parallel_state.get_tp_group") + + mock_sp_group = mocker.MagicMock() + mock_sp_group.world_size = 4 + mock_get_sp_group.return_value = mock_sp_group + + mock_tp_group = mocker.MagicMock() + mock_tp_group.world_size = 1 + mock_get_tp_group.return_value = mock_tp_group + yield pytestmark = [pytest.mark.core_model, pytest.mark.cpu] diff --git a/tests/diffusion/models/qwen_image/test_qwen_image_edit_plus.py b/tests/diffusion/models/qwen_image/test_qwen_image_edit_plus.py new file mode 100644 index 00000000000..873b52bf7a6 --- /dev/null +++ 
b/tests/diffusion/models/qwen_image/test_qwen_image_edit_plus.py @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: Apache-2.0 + +import json +from pathlib import Path +from types import SimpleNamespace + +import numpy as np +import pytest +from PIL import Image + +from vllm_omni.diffusion.models.qwen_image.pipeline_qwen_image_edit_plus import ( + get_qwen_image_edit_plus_pre_process_func, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] + + +def test_qwen_image_edit_plus_rejects_too_many_input_images(tmp_path: Path): + vae_dir = tmp_path / "vae" + vae_dir.mkdir() + # Keep the mock config intentionally minimal: this test only needs the + # fields touched during pre-process initialization. + (vae_dir / "config.json").write_text(json.dumps({"z_dim": 16})) + + pre_process = get_qwen_image_edit_plus_pre_process_func(SimpleNamespace(model=str(tmp_path))) + image = Image.fromarray(np.zeros((32, 32, 3), dtype=np.uint8)) + request = SimpleNamespace( + prompts=[ + { + "prompt": "combine", + "multi_modal_data": {"image": [image, image, image, image, image]}, + } + ], + sampling_params=SimpleNamespace(height=None, width=None), + ) + + with pytest.raises(ValueError, match=r"At most 4 images are supported by this model"): + pre_process(request) diff --git a/tests/diffusion/models/wan2_2/conftest.py b/tests/diffusion/models/wan2_2/conftest.py new file mode 100644 index 00000000000..f836fa545fd --- /dev/null +++ b/tests/diffusion/models/wan2_2/conftest.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +from contextlib import contextmanager +from types import SimpleNamespace + +import torch +from torch import nn + + +class StubTransformer(nn.Module): + def __init__(self, *, name: str = "transformer", in_channels: int = 4, out_channels: int = 4) -> None: + super().__init__() + self.name = name + self.config = SimpleNamespace( + patch_size=(1, 2, 2), + in_channels=in_channels, + out_channels=out_channels, + image_dim=None, + ) + + @property + def 
dtype(self) -> torch.dtype: + return torch.float32 + + def forward(self, **kwargs): + hidden_states = kwargs["hidden_states"] + return (torch.zeros_like(hidden_states[:, : self.config.out_channels]),) + + +class StubScheduler: + def __init__(self, timesteps: list[int]) -> None: + self.timesteps = torch.tensor(timesteps, dtype=torch.int64) + self.config = SimpleNamespace(num_train_timesteps=1000) + self.set_timesteps_calls: list[tuple[int, torch.device]] = [] + + def set_timesteps(self, num_steps: int, device: torch.device) -> None: + self.set_timesteps_calls.append((num_steps, device)) + + +class StubVAE: + dtype = torch.float32 + + def __init__(self, z_dim: int = 4) -> None: + self.config = SimpleNamespace( + z_dim=z_dim, + scale_factor_temporal=4, + scale_factor_spatial=8, + latents_mean=[0.0] * z_dim, + latents_std=[1.0] * z_dim, + ) + + def encode(self, video: torch.Tensor): + latent_frames = (video.shape[2] + self.config.scale_factor_temporal - 1) // self.config.scale_factor_temporal + latent_height = video.shape[-2] // self.config.scale_factor_spatial + latent_width = video.shape[-1] // self.config.scale_factor_spatial + latents = torch.ones( + video.shape[0], + self.config.z_dim, + latent_frames, + latent_height, + latent_width, + dtype=video.dtype, + device=video.device, + ) + return SimpleNamespace(latents=latents) + + def decode(self, latents: torch.Tensor, return_dict: bool = False): + del return_dict + return (latents,) + + +@contextmanager +def noop_progress_bar(*args, **kwargs): + del args, kwargs + + class Bar: + def update(self) -> None: + return None + + yield Bar() diff --git a/tests/diffusion/models/wan2_2/test_wan22_i2v_pipeline.py b/tests/diffusion/models/wan2_2/test_wan22_i2v_pipeline.py new file mode 100644 index 00000000000..04e834ac47c --- /dev/null +++ b/tests/diffusion/models/wan2_2/test_wan22_i2v_pipeline.py @@ -0,0 +1,126 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + 
+from types import SimpleNamespace + +import pytest +import torch +from PIL import Image +from torch import nn + +from tests.diffusion.models.wan2_2.conftest import StubTransformer, StubVAE, noop_progress_bar +from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_i2v import ( + Wan22I2VPipeline, + get_wan22_i2v_pre_process_func, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion] + + +def _make_i2v_pipeline(*, expand_timesteps: bool) -> Wan22I2VPipeline: + pipeline = object.__new__(Wan22I2VPipeline) + nn.Module.__init__(pipeline) + pipeline.device = torch.device("cpu") + pipeline.transformer = StubTransformer(name="high", in_channels=8, out_channels=4) + pipeline.transformer_2 = StubTransformer(name="low", in_channels=8, out_channels=4) + pipeline.vae = StubVAE(z_dim=4) + pipeline.vae_scale_factor_temporal = 4 + pipeline.vae_scale_factor_spatial = 8 + pipeline.expand_timesteps = expand_timesteps + pipeline.progress_bar = noop_progress_bar + return pipeline + + +def test_i2v_preprocess_requires_image_and_resizes_to_480p_aspect() -> None: + preprocess = get_wan22_i2v_pre_process_func(SimpleNamespace()) + request = SimpleNamespace( + prompts=[{"prompt": "p", "multi_modal_data": {"image": Image.new("RGB", (320, 160), "red")}}], + sampling_params=SimpleNamespace(height=None, width=None), + ) + + result = preprocess(request) + prompt = result.prompts[0] + + assert result.sampling_params.height == 432 + assert result.sampling_params.width == 880 + assert prompt["multi_modal_data"]["image"].size == (880, 432) + assert prompt["additional_information"]["preprocessed_image"].shape[-2:] == (432, 880) + + missing_image = SimpleNamespace( + prompts=[{"prompt": "p", "multi_modal_data": {}}], + sampling_params=SimpleNamespace(height=None, width=None), + ) + with pytest.raises(ValueError, match="No image is provided"): + preprocess(missing_image) + + +def test_i2v_diffuse_selects_stage_guidance_and_expands_timesteps() -> None: + pipeline = 
_make_i2v_pipeline(expand_timesteps=True) + latents = torch.zeros(1, 4, 2, 4, 4) + condition = torch.ones_like(latents) + first_frame_mask = torch.ones(1, 1, 2, 4, 4) + first_frame_mask[:, :, 0] = 0 + timesteps = torch.tensor([900, 100]) + + calls = [] + + def fake_predict_noise_maybe_with_cfg(**kwargs): + positive = kwargs["positive_kwargs"] + calls.append( + { + "model": positive["current_model"].name, + "scale": kwargs["true_cfg_scale"], + "timestep_shape": tuple(positive["timestep"].shape), + "timestep_values": positive["timestep"].clone(), + "hidden_states": positive["hidden_states"].clone(), + } + ) + return torch.ones_like(latents) + + pipeline.predict_noise_maybe_with_cfg = fake_predict_noise_maybe_with_cfg # type: ignore[method-assign] + pipeline.scheduler_step_maybe_with_cfg = lambda noise, t, current, cfg: current + noise # type: ignore[method-assign] + + result = pipeline.diffuse( + latents=latents, + timesteps=timesteps, + prompt_embeds=torch.zeros(1, 2, 3), + negative_prompt_embeds=None, + image_embeds=None, + guidance_low=1.0, + guidance_high=2.0, + boundary_timestep=500.0, + dtype=torch.float32, + attention_kwargs={}, + condition=condition, + first_frame_mask=first_frame_mask, + ) + + assert [call["model"] for call in calls] == ["high", "low"] + assert [call["scale"] for call in calls] == [1.0, 2.0] + assert calls[0]["timestep_shape"] == (1, 8) + timestep_dtype = calls[0]["timestep_values"].dtype + torch.testing.assert_close(calls[0]["timestep_values"][0, :4], torch.zeros(4, dtype=timestep_dtype)) + torch.testing.assert_close(calls[0]["timestep_values"][0, 4:], torch.full((4,), 900, dtype=timestep_dtype)) + torch.testing.assert_close(calls[0]["hidden_states"][:, :, 0], torch.ones(1, 4, 4, 4)) + torch.testing.assert_close(result, torch.full_like(latents, 2.0)) + + +def test_i2v_prepare_latents_builds_expand_condition_and_first_frame_mask() -> None: + pipeline = _make_i2v_pipeline(expand_timesteps=True) + latents, condition, first_frame_mask = 
pipeline.prepare_latents( + image=torch.zeros(1, 3, 16, 16), + batch_size=1, + num_channels_latents=4, + height=16, + width=16, + num_frames=5, + dtype=torch.float32, + device=torch.device("cpu"), + generator=torch.Generator(device="cpu").manual_seed(0), + ) + + assert latents.shape == (1, 4, 2, 2, 2) + assert condition.shape == (1, 4, 1, 2, 2) + assert first_frame_mask.shape == (1, 1, 2, 2, 2) + assert first_frame_mask[:, :, 0].sum() == 0 + assert first_frame_mask[:, :, 1].sum() == 4 diff --git a/tests/diffusion/models/wan2_2/test_wan22_pipeline_diffuse.py b/tests/diffusion/models/wan2_2/test_wan22_pipeline_diffuse.py new file mode 100644 index 00000000000..54bb672ef81 --- /dev/null +++ b/tests/diffusion/models/wan2_2/test_wan22_pipeline_diffuse.py @@ -0,0 +1,155 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from contextlib import contextmanager +from types import SimpleNamespace + +import pytest +import torch +from torch import nn + +from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import Wan22Pipeline + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion] + + +class _StubTransformer(nn.Module): + @property + def dtype(self) -> torch.dtype: + return torch.float32 + + +class _StubScheduler: + def __init__(self, timesteps: list[int]) -> None: + self.timesteps = torch.tensor(timesteps, dtype=torch.int64) + self.config = SimpleNamespace(num_train_timesteps=1000) + self.set_timesteps_calls: list[tuple[int, torch.device]] = [] + + def set_timesteps(self, num_steps: int, device: torch.device) -> None: + self.set_timesteps_calls.append((num_steps, device)) + + +@contextmanager +def _noop_progress_bar(*args, **kwargs): + del args, kwargs + + class _Bar: + def update(self) -> None: + return None + + yield _Bar() + + +def _make_pipeline() -> Wan22Pipeline: + pipeline = object.__new__(Wan22Pipeline) + nn.Module.__init__(pipeline) + pipeline.device = torch.device("cpu") + 
pipeline.transformer = _StubTransformer() + pipeline.transformer_2 = None + pipeline.transformer_config = SimpleNamespace(patch_size=(1, 2, 2), in_channels=4, out_channels=4) + pipeline.scheduler = _StubScheduler([9, 5]) + pipeline.od_config = SimpleNamespace(flow_shift=5.0) + pipeline._sample_solver = "unipc" + pipeline._flow_shift = 5.0 + pipeline.vae_scale_factor_temporal = 4 + pipeline.vae_scale_factor_spatial = 8 + pipeline.boundary_ratio = 0.875 + pipeline.expand_timesteps = False + pipeline._guidance_scale = None + pipeline._guidance_scale_2 = None + pipeline._num_timesteps = None + pipeline._current_timestep = None + pipeline.check_inputs = lambda **kwargs: None + pipeline.prepare_latents = lambda **kwargs: torch.zeros((1, 4, 1, 8, 8), dtype=torch.float32) + pipeline.progress_bar = _noop_progress_bar + return pipeline + + +def test_forward_delegates_denoising_to_diffuse(monkeypatch) -> None: + pipeline = _make_pipeline() + + prompt_embeds = torch.randn(1, 8) + captured: dict[str, object] = {} + + def _fake_diffuse(**kwargs): + captured.update(kwargs) + return kwargs["latents"] + 1 + + pipeline.diffuse = _fake_diffuse # type: ignore[method-assign] + + req = SimpleNamespace( + prompts=["prompt"], + sampling_params=SimpleNamespace( + height=None, + width=None, + num_frames=1, + num_inference_steps=2, + guidance_scale_provided=False, + guidance_scale=None, + guidance_scale_2=None, + boundary_ratio=None, + generator=None, + seed=None, + num_outputs_per_prompt=1, + max_sequence_length=32, + latents=None, + extra_args={}, + ), + ) + + output = pipeline.forward(req, prompt_embeds=prompt_embeds, output_type="latent", guidance_scale=1.0) + + assert torch.equal(output.output, torch.ones((1, 4, 1, 8, 8))) + assert torch.equal(captured["timesteps"], pipeline.scheduler.timesteps) + assert captured["guidance_low"] == 1.0 + assert captured["guidance_high"] == 1.0 + assert captured["boundary_timestep"] == pytest.approx(875.0) + assert captured["latent_condition"] is None + 
assert captured["first_frame_mask"] is None + assert pipeline.scheduler.set_timesteps_calls == [(2, torch.device("cpu"))] + + +def test_diffuse_runs_prediction_and_scheduler_for_each_timestep() -> None: + pipeline = _make_pipeline() + latents = torch.zeros((1, 1, 1, 2, 2), dtype=torch.float32) + timesteps = torch.tensor([7, 3], dtype=torch.int64) + prompt_embeds = torch.randn(1, 8) + + predict_calls: list[dict[str, object]] = [] + scheduler_calls: list[tuple[float, int, float, bool]] = [] + + def _fake_predict_noise_maybe_with_cfg(**kwargs): + predict_calls.append(kwargs) + timestep = kwargs["positive_kwargs"]["timestep"] + assert isinstance(timestep, torch.Tensor) + return torch.full_like(latents, float(timestep[0].item())) + + def _fake_scheduler_step_maybe_with_cfg(noise_pred, t, current_latents, do_true_cfg): + scheduler_calls.append( + (float(noise_pred[0, 0, 0, 0, 0]), int(t.item()), float(current_latents.sum()), do_true_cfg) + ) + return current_latents + noise_pred + + pipeline.predict_noise_maybe_with_cfg = _fake_predict_noise_maybe_with_cfg # type: ignore[method-assign] + pipeline.scheduler_step_maybe_with_cfg = _fake_scheduler_step_maybe_with_cfg # type: ignore[method-assign] + + result = pipeline.diffuse( + latents=latents, + timesteps=timesteps, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=None, + guidance_low=1.0, + guidance_high=2.0, + boundary_timestep=5.0, + dtype=torch.float32, + attention_kwargs={}, + ) + + assert len(predict_calls) == 2 + assert predict_calls[0]["true_cfg_scale"] == 1.0 + assert predict_calls[1]["true_cfg_scale"] == 2.0 + assert scheduler_calls == [ + (7.0, 7, 0.0, False), + (3.0, 3, 28.0, False), + ] + assert torch.equal(result, torch.full_like(latents, 10.0)) diff --git a/tests/diffusion/models/wan2_2/test_wan22_pipeline_helpers.py b/tests/diffusion/models/wan2_2/test_wan22_pipeline_helpers.py new file mode 100644 index 00000000000..31471786976 --- /dev/null +++ 
b/tests/diffusion/models/wan2_2/test_wan22_pipeline_helpers.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +from types import SimpleNamespace + +import pytest +import torch + +import vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 as wan22_module +from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import ( + create_transformer_from_config, + load_transformer_config, + retrieve_latents, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion] + + +class _LatentDist: + def sample(self, generator): + assert isinstance(generator, torch.Generator) + return torch.tensor([1.0]) + + def mode(self): + return torch.tensor([2.0]) + + +def test_retrieve_latents_supports_sample_mode_argmax_and_direct_latents() -> None: + generator = torch.Generator(device="cpu") + + assert retrieve_latents(SimpleNamespace(latent_dist=_LatentDist()), generator).item() == 1.0 + assert retrieve_latents(SimpleNamespace(latent_dist=_LatentDist()), sample_mode="argmax").item() == 2.0 + torch.testing.assert_close(retrieve_latents(SimpleNamespace(latents=torch.tensor([3.0]))), torch.tensor([3.0])) + + +def test_retrieve_latents_rejects_unknown_encoder_output() -> None: + with pytest.raises(AttributeError, match="Could not access latents"): + retrieve_latents(SimpleNamespace()) + + +def test_load_transformer_config_reads_local_subfolder_config(tmp_path) -> None: + config_dir = tmp_path / "transformer_2" + config_dir.mkdir(parents=True) + (config_dir / "config.json").write_text(json.dumps({"patch_size": [1, 2, 2], "num_layers": 2})) + + assert load_transformer_config(str(tmp_path), "transformer_2") == {"patch_size": [1, 2, 2], "num_layers": 2} + assert load_transformer_config(str(tmp_path), "missing") == {} + + +def test_create_transformer_from_config_maps_supported_keys(monkeypatch) -> None: + captured = {} + + class FakeTransformer: + def __init__(self, **kwargs) -> None: + 
captured.update(kwargs) + + monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer) + + transformer = create_transformer_from_config( + { + "patch_size": [1, 2, 2], + "num_attention_heads": 8, + "attention_head_dim": 128, + "in_channels": 16, + "out_channels": 16, + "text_dim": 4096, + "vace_layers": [0], + "ignored": "value", + } + ) + + assert isinstance(transformer, FakeTransformer) + assert captured == { + "patch_size": (1, 2, 2), + "num_attention_heads": 8, + "attention_head_dim": 128, + "in_channels": 16, + "out_channels": 16, + "text_dim": 4096, + } diff --git a/tests/diffusion/models/wan2_2/test_wan22_ti2v_pipeline.py b/tests/diffusion/models/wan2_2/test_wan22_ti2v_pipeline.py new file mode 100644 index 00000000000..983350c4cf9 --- /dev/null +++ b/tests/diffusion/models/wan2_2/test_wan22_ti2v_pipeline.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from types import SimpleNamespace + +import pytest +import torch +from PIL import Image +from torch import nn + +from tests.diffusion.models.wan2_2.conftest import StubTransformer, StubVAE, noop_progress_bar +from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_ti2v import ( + Wan22TI2VPipeline, + get_wan22_ti2v_pre_process_func, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion] + + +def _make_ti2v_pipeline() -> Wan22TI2VPipeline: + pipeline = object.__new__(Wan22TI2VPipeline) + nn.Module.__init__(pipeline) + pipeline.device = torch.device("cpu") + pipeline.transformer = StubTransformer(in_channels=4, out_channels=4) + pipeline.vae = StubVAE(z_dim=4) + pipeline.vae_scale_factor_temporal = 4 + pipeline.vae_scale_factor_spatial = 8 + pipeline.progress_bar = noop_progress_bar + return pipeline + + +def test_ti2v_preprocess_uses_720p_area_for_image_condition() -> None: + preprocess = get_wan22_ti2v_pre_process_func(SimpleNamespace()) + request = SimpleNamespace( + 
prompts=[{"prompt": "p", "multi_modal_data": {"image": Image.new("RGB", (320, 160), "blue")}}], + sampling_params=SimpleNamespace(height=None, width=None), + ) + + result = preprocess(request) + + assert result.sampling_params.height == 672 + assert result.sampling_params.width == 1344 + assert result.prompts[0]["multi_modal_data"]["image"].size == (1344, 672) + assert result.prompts[0]["additional_information"]["preprocessed_image"].shape[-2:] == (672, 1344) + + +def test_ti2v_diffuse_without_image_condition_expands_patch_timesteps() -> None: + pipeline = _make_ti2v_pipeline() + latents = torch.zeros(1, 4, 2, 4, 4) + calls = [] + + def fake_predict_noise_maybe_with_cfg(**kwargs): + calls.append(kwargs) + return torch.ones_like(latents) + + pipeline.predict_noise_maybe_with_cfg = fake_predict_noise_maybe_with_cfg # type: ignore[method-assign] + pipeline.scheduler_step_maybe_with_cfg = lambda noise, t, current, cfg: current + noise # type: ignore[method-assign] + + result = pipeline.diffuse( + latents=latents, + timesteps=torch.tensor([7]), + prompt_embeds=torch.zeros(1, 2, 3), + negative_prompt_embeds=torch.zeros(1, 2, 3), + guidance_scale=3.0, + dtype=torch.float32, + attention_kwargs={"a": "b"}, + num_latent_frames=2, + latent_height=4, + latent_width=4, + ) + + positive = calls[0]["positive_kwargs"] + assert calls[0]["do_true_cfg"] is True + assert positive["timestep"].shape == (1, 8) + torch.testing.assert_close(positive["timestep"], torch.full((1, 8), 7, dtype=positive["timestep"].dtype)) + torch.testing.assert_close(positive["hidden_states"], latents) + torch.testing.assert_close(result, torch.ones_like(latents)) + + +def test_ti2v_prepare_i2v_latents_encodes_condition_and_masks_first_frame() -> None: + pipeline = _make_ti2v_pipeline() + latents, latent_condition, first_frame_mask = pipeline.prepare_i2v_latents( + image=torch.zeros(1, 3, 16, 16), + batch_size=1, + num_channels_latents=4, + height=16, + width=16, + num_frames=5, + dtype=torch.float32, + 
device=torch.device("cpu"), + generator=None, + latents=torch.zeros(1, 4, 2, 2, 2), + ) + + torch.testing.assert_close(latents, torch.zeros(1, 4, 2, 2, 2)) + assert latent_condition.shape == (1, 4, 1, 2, 2) + assert first_frame_mask[:, :, 0].sum() == 0 + assert first_frame_mask[:, :, 1].sum() == 4 diff --git a/tests/diffusion/models/wan2_2/test_wan22_vace_pipeline.py b/tests/diffusion/models/wan2_2/test_wan22_vace_pipeline.py new file mode 100644 index 00000000000..9fa9b67c499 --- /dev/null +++ b/tests/diffusion/models/wan2_2/test_wan22_vace_pipeline.py @@ -0,0 +1,137 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from types import SimpleNamespace + +import pytest +import torch +from PIL import Image +from torch import nn + +from tests.diffusion.models.wan2_2.conftest import StubTransformer, StubVAE, noop_progress_bar +from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_vace import ( + Wan22VACEPipeline, + create_vace_transformer_from_config, + get_wan22_vace_pre_process_func, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion] + + +def _make_vace_pipeline() -> Wan22VACEPipeline: + pipeline = object.__new__(Wan22VACEPipeline) + nn.Module.__init__(pipeline) + pipeline.device = torch.device("cpu") + pipeline.transformer = StubTransformer(in_channels=4, out_channels=4) + pipeline.transformer_config = pipeline.transformer.config + pipeline.vae = StubVAE(z_dim=4) + pipeline.vae_scale_factor_temporal = 4 + pipeline.vae_scale_factor_spatial = 8 + pipeline.progress_bar = noop_progress_bar + return pipeline + + +def test_vace_preprocess_collects_reference_video_and_mask_inputs() -> None: + preprocess = get_wan22_vace_pre_process_func(SimpleNamespace()) + ref = Image.new("RGB", (320, 160), "green") + frame = Image.new("RGB", (64, 64), "black") + mask = Image.new("L", (64, 64), 255) + request = SimpleNamespace( + prompts=[ + { + "prompt": "p", + "multi_modal_data": { + 
"image": ref, + "video": [frame], + "mask": mask, + }, + } + ], + sampling_params=SimpleNamespace(height=None, width=None), + ) + + result = preprocess(request) + additional_info = result.prompts[0]["additional_information"] + + assert result.sampling_params.height == 432 + assert result.sampling_params.width == 880 + assert additional_info["reference_images"] == [ref] + assert additional_info["source_video"] == [frame] + assert additional_info["mask"] == [mask] + + +def test_create_vace_transformer_from_config_maps_vace_specific_keys(monkeypatch) -> None: + captured = {} + + class FakeVACETransformer: + def __init__(self, **kwargs) -> None: + captured.update(kwargs) + + monkeypatch.setattr( + "vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_vace.WanVACETransformer3DModel", + FakeVACETransformer, + ) + + transformer = create_vace_transformer_from_config( + { + "patch_size": [1, 2, 2], + "in_channels": 96, + "out_channels": 16, + "vace_layers": [0, 1, 2], + "vace_in_channels": 132, + "unknown": "ignored", + } + ) + + assert isinstance(transformer, FakeVACETransformer) + assert captured == { + "patch_size": (1, 2, 2), + "in_channels": 96, + "out_channels": 16, + "vace_layers": [0, 1, 2], + "vace_in_channels": 132, + } + + +def test_vace_prepare_masks_encodes_spatial_stride_and_reference_padding() -> None: + pipeline = _make_vace_pipeline() + mask = torch.ones(1, 3, 5, 16, 16) + reference_images = [[torch.zeros(3, 16, 16), torch.zeros(3, 16, 16)]] + + encoded = pipeline.prepare_masks(mask, reference_images) + + assert encoded.shape == (1, 64, 4, 2, 2) + torch.testing.assert_close(encoded[:, :, :2], torch.zeros(1, 64, 2, 2, 2)) + torch.testing.assert_close(encoded[:, :, 2:], torch.ones(1, 64, 2, 2, 2)) + + +def test_vace_diffuse_passes_context_and_scale_to_cfg_branches() -> None: + pipeline = _make_vace_pipeline() + latents = torch.zeros(1, 4, 1, 2, 2) + vace_context = torch.ones(1, 12, 1, 2, 2) + calls = [] + + def fake_predict_noise_maybe_with_cfg(**kwargs): + 
calls.append(kwargs) + return torch.ones_like(latents) + + pipeline.predict_noise_maybe_with_cfg = fake_predict_noise_maybe_with_cfg # type: ignore[method-assign] + pipeline.scheduler_step_maybe_with_cfg = lambda noise, t, current, cfg: current + noise # type: ignore[method-assign] + + result = pipeline.diffuse( + latents=latents, + timesteps=torch.tensor([5]), + prompt_embeds=torch.zeros(1, 2, 3), + negative_prompt_embeds=torch.zeros(1, 2, 3), + guidance_scale=4.0, + dtype=torch.float32, + attention_kwargs={}, + vace_context=vace_context, + vace_context_scale=0.75, + ) + + assert calls[0]["do_true_cfg"] is True + assert calls[0]["true_cfg_scale"] == 4.0 + assert calls[0]["positive_kwargs"]["vace_context"] is vace_context + assert calls[0]["negative_kwargs"]["vace_context_scale"] == 0.75 + torch.testing.assert_close(result, torch.ones_like(latents)) diff --git a/tests/diffusion/quantization/test_quantization_quality.py b/tests/diffusion/quantization/test_quantization_quality.py index 3d8f1873698..ba6a150c4bb 100644 --- a/tests/diffusion/quantization/test_quantization_quality.py +++ b/tests/diffusion/quantization/test_quantization_quality.py @@ -32,7 +32,7 @@ import pytest import torch -from tests.utils import hardware_marks +from tests.helpers.mark import hardware_marks # --------------------------------------------------------------------------- # Configuration — add new quantization methods / models here diff --git a/tests/diffusion/test_diffusion_model_runner.py b/tests/diffusion/test_diffusion_model_runner.py index 8768986f01d..b63f6d8887f 100644 --- a/tests/diffusion/test_diffusion_model_runner.py +++ b/tests/diffusion/test_diffusion_model_runner.py @@ -8,7 +8,7 @@ import torch import vllm_omni.diffusion.worker.diffusion_model_runner as model_runner_module -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test from vllm_omni.diffusion.worker.diffusion_model_runner import DiffusionModelRunner pytestmark = [pytest.mark.diffusion] 
diff --git a/tests/diffusion/test_diffusion_step_pipeline.py b/tests/diffusion/test_diffusion_step_pipeline.py index 42687d4a1ed..06f8cd14dc8 100644 --- a/tests/diffusion/test_diffusion_step_pipeline.py +++ b/tests/diffusion/test_diffusion_step_pipeline.py @@ -13,7 +13,7 @@ from pytest_mock import MockerFixture import vllm_omni.diffusion.worker.diffusion_model_runner as model_runner_module -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test from vllm_omni.diffusion.data import DiffusionOutput from vllm_omni.diffusion.diffusion_engine import DiffusionEngine from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin diff --git a/tests/e2e/accuracy/conftest.py b/tests/e2e/accuracy/conftest.py index 3d614b8cdc1..709fdf345ec 100644 --- a/tests/e2e/accuracy/conftest.py +++ b/tests/e2e/accuracy/conftest.py @@ -1,7 +1,6 @@ from __future__ import annotations import os -import shutil import subprocess from contextlib import contextmanager from dataclasses import dataclass @@ -13,7 +12,7 @@ import torch from PIL import Image -from tests.conftest import OmniServer, OmniServerParams +from tests.helpers.runtime import OmniServer, OmniServerParams def pytest_addoption(parser): @@ -208,18 +207,6 @@ def rabbit_image(accuracy_artifact_root: Path) -> Image.Image: return image -def reset_artifact_dir(path: Path) -> Path: - if path.exists(): - shutil.rmtree(path) - path.mkdir(parents=True, exist_ok=True) - return path - - -def infer_model_label(model: str) -> str: - label = Path(model.rstrip("/\\")).name or "model" - return "".join(char if char.isalnum() or char in {"-", "_"} else "_" for char in label) - - def _build_accuracy_server_config( *, generate_model: str, diff --git a/tests/e2e/accuracy/utils.py b/tests/e2e/accuracy/helpers.py similarity index 91% rename from tests/e2e/accuracy/utils.py rename to tests/e2e/accuracy/helpers.py index d722b69b011..382d3ea9b5f 100644 --- a/tests/e2e/accuracy/utils.py +++ 
b/tests/e2e/accuracy/helpers.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from pathlib import Path import numpy as np @@ -9,6 +7,20 @@ from torchmetrics.image import PeakSignalNoiseRatio, StructuralSimilarityIndexMeasure +def reset_artifact_dir(path: Path) -> Path: + import shutil + + if path.exists(): + shutil.rmtree(path) + path.mkdir(parents=True, exist_ok=True) + return path + + +def infer_model_label(model: str) -> str: + label = Path(model.rstrip("/\\")).name or "model" + return "".join(char if char.isalnum() or char in {"-", "_"} else "_" for char in label) + + def model_output_dir(parent_dir: Path, model: str) -> Path: safe_model_name = model.split("/")[-1].replace(".", "_") path = parent_dir / safe_model_name diff --git a/tests/e2e/accuracy/test_gebench_h100_smoke.py b/tests/e2e/accuracy/test_gebench_h100_smoke.py index b4b83187135..74891926910 100644 --- a/tests/e2e/accuracy/test_gebench_h100_smoke.py +++ b/tests/e2e/accuracy/test_gebench_h100_smoke.py @@ -6,8 +6,8 @@ import pytest from benchmarks.accuracy.text_to_image.gbench import main as gbench_main -from tests.e2e.accuracy.conftest import infer_model_label, reset_artifact_dir -from tests.utils import hardware_test +from tests.e2e.accuracy.helpers import infer_model_label, reset_artifact_dir +from tests.helpers.mark import hardware_test @pytest.mark.advanced_model diff --git a/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py b/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py index 960ea57960c..6227d636863 100644 --- a/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py +++ b/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py @@ -7,8 +7,8 @@ from benchmarks.accuracy.image_to_image.gedit_bench import GROUPS from benchmarks.accuracy.image_to_image.gedit_bench import main as gedit_main -from tests.e2e.accuracy.conftest import infer_model_label, reset_artifact_dir -from tests.utils import hardware_test +from tests.e2e.accuracy.helpers import infer_model_label, reset_artifact_dir +from 
tests.helpers.mark import hardware_test @pytest.mark.advanced_model diff --git a/tests/e2e/accuracy/test_qwen_image.py b/tests/e2e/accuracy/test_qwen_image.py index e73195017aa..8922d9d1044 100644 --- a/tests/e2e/accuracy/test_qwen_image.py +++ b/tests/e2e/accuracy/test_qwen_image.py @@ -12,13 +12,10 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline from PIL import Image -from tests.conftest import ( - OmniServer, - _run_post_test_cleanup, - _run_pre_test_cleanup, -) -from tests.e2e.accuracy.utils import assert_similarity, model_output_dir -from tests.utils import hardware_test +from tests.e2e.accuracy.helpers import assert_similarity, model_output_dir +from tests.helpers.env import run_post_test_cleanup, run_pre_test_cleanup +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniServer MODEL_ID = "Qwen/Qwen-Image" MODEL_ENV_VAR = "QWEN_IMAGE_MODEL" @@ -70,7 +67,7 @@ def _run_vllm_omni_qwen_image(*, model: str, output_path: Path) -> Image.Image: def _run_diffusers_qwen_image(*, model: str, output_path: Path) -> Image.Image: - _run_pre_test_cleanup(enable_force=True) + run_pre_test_cleanup(enable_force=True) pipe: DiffusionPipeline | None = None try: pipe = DiffusionPipeline.from_pretrained( @@ -99,7 +96,7 @@ def _run_diffusers_qwen_image(*, model: str, output_path: Path) -> Image.Image: gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() - _run_post_test_cleanup(enable_force=True) + run_post_test_cleanup(enable_force=True) @pytest.mark.advanced_model diff --git a/tests/e2e/accuracy/test_qwen_image_edit.py b/tests/e2e/accuracy/test_qwen_image_edit.py index 9a970103438..e17aca6e99b 100644 --- a/tests/e2e/accuracy/test_qwen_image_edit.py +++ b/tests/e2e/accuracy/test_qwen_image_edit.py @@ -10,13 +10,10 @@ from PIL import Image from benchmarks.accuracy.common import decode_base64_image, pil_to_png_bytes -from tests.conftest import ( - OmniServer, - _run_post_test_cleanup, - _run_pre_test_cleanup, -) -from 
tests.e2e.accuracy.utils import assert_similarity, model_output_dir -from tests.utils import hardware_test +from tests.e2e.accuracy.helpers import assert_similarity, model_output_dir +from tests.helpers.env import run_post_test_cleanup, run_pre_test_cleanup +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniServer SINGLE_MODEL = "Qwen/Qwen-Image-Edit" MULTIPLE_MODEL = "Qwen/Qwen-Image-Edit-2509" @@ -77,7 +74,7 @@ def _run_diffusers_image_edit( input_images: list[Image.Image], output_path: Path, ) -> Image.Image: - _run_pre_test_cleanup(enable_force=True) + run_pre_test_cleanup(enable_force=True) pipe: QwenImageEditPipeline | QwenImageEditPlusPipeline | None = None device = torch.device("cuda:0") torch.cuda.set_device(device) @@ -110,7 +107,7 @@ def _run_diffusers_image_edit( gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() - _run_post_test_cleanup(enable_force=True) + run_post_test_cleanup(enable_force=True) def _vllm_omni_output_single_image( diff --git a/tests/e2e/accuracy/test_qwen_image_layered.py b/tests/e2e/accuracy/test_qwen_image_layered.py index 04b13df3bb2..0ab9cb32363 100644 --- a/tests/e2e/accuracy/test_qwen_image_layered.py +++ b/tests/e2e/accuracy/test_qwen_image_layered.py @@ -12,13 +12,10 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline from PIL import Image -from tests.conftest import ( - OmniServer, - _run_post_test_cleanup, - _run_pre_test_cleanup, -) -from tests.e2e.accuracy.utils import assert_image_sequence_similarity, model_output_dir -from tests.utils import hardware_test +from tests.e2e.accuracy.helpers import assert_image_sequence_similarity, model_output_dir +from tests.helpers.env import run_post_test_cleanup, run_pre_test_cleanup +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniServer MODEL_ID = "Qwen/Qwen-Image-Layered" MODEL_ENV_VAR = "QWEN_IMAGE_LAYERED_MODEL" @@ -93,7 +90,7 @@ def _run_vllm_omni_qwen_image_layered(*, model: 
str, input_image: Image.Image, o def _run_diffusers_qwen_image_layered(*, model: str, input_image: Image.Image, output_dir: Path) -> list[Image.Image]: - _run_pre_test_cleanup(enable_force=True) + run_pre_test_cleanup(enable_force=True) pipe: DiffusionPipeline | None = None try: pipe = DiffusionPipeline.from_pretrained( @@ -126,7 +123,7 @@ def _run_diffusers_qwen_image_layered(*, model: str, input_image: Image.Image, o gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() - _run_post_test_cleanup(enable_force=True) + run_post_test_cleanup(enable_force=True) @pytest.mark.advanced_model diff --git a/tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py b/tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py index 3cdda1f9ffa..24daa8ccf54 100644 --- a/tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py +++ b/tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py @@ -22,7 +22,6 @@ from diffusers import UniPCMultistepScheduler from PIL import Image -from tests.conftest import OmniServerParams from tests.e2e.accuracy.wan22_i2v.run_wan22_i2v_diffusers_cp import ( _configure_scheduler, _ensure_wan_ftfy_fallback, @@ -48,7 +47,8 @@ SSIM_THRESHOLD, WIDTH, ) -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniServerParams def test_parse_video_metadata_extracts_dimensions_and_fps() -> None: @@ -567,6 +567,7 @@ def test_wan22_i2v_diffusers_offline_generates_video( @pytest.mark.benchmark @pytest.mark.diffusion @hardware_test(res={"cuda": "H100"}, num_cards=2) +@pytest.mark.skip(reason="issue: #2874") @pytest.mark.parametrize("omni_server", SERVER_CASES, indirect=True) def test_wan22_i2v_online_serving_generates_video( omni_server, diff --git a/tests/e2e/offline_inference/custom_pipeline/test_async_omni_collective_rpc.py b/tests/e2e/offline_inference/custom_pipeline/test_async_omni_collective_rpc.py index 57743d62bf6..bd3f2e09975 100644 --- 
a/tests/e2e/offline_inference/custom_pipeline/test_async_omni_collective_rpc.py +++ b/tests/e2e/offline_inference/custom_pipeline/test_async_omni_collective_rpc.py @@ -26,7 +26,7 @@ import pytest -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test from vllm_omni.entrypoints.async_omni import AsyncOmni from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput diff --git a/tests/e2e/offline_inference/custom_pipeline/test_async_omni_qwen_image_generate.py b/tests/e2e/offline_inference/custom_pipeline/test_async_omni_qwen_image_generate.py index 03bd12efae2..0681687fe73 100644 --- a/tests/e2e/offline_inference/custom_pipeline/test_async_omni_qwen_image_generate.py +++ b/tests/e2e/offline_inference/custom_pipeline/test_async_omni_qwen_image_generate.py @@ -19,7 +19,7 @@ import pytest from transformers import AutoTokenizer -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test from vllm_omni.entrypoints.async_omni import AsyncOmni from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput diff --git a/tests/e2e/offline_inference/custom_pipeline/test_worker_extension.py b/tests/e2e/offline_inference/custom_pipeline/test_worker_extension.py index ffbe703ca78..653b35d7e2f 100644 --- a/tests/e2e/offline_inference/custom_pipeline/test_worker_extension.py +++ b/tests/e2e/offline_inference/custom_pipeline/test_worker_extension.py @@ -10,7 +10,7 @@ from tests.e2e.offline_inference.custom_pipeline.worker_extension import ( vLLMOmniColocateWorkerExtensionForTest, ) -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test from vllm_omni.diffusion.worker.diffusion_worker import CustomPipelineWorkerExtension from vllm_omni.entrypoints.async_omni import AsyncOmni diff --git a/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml 
b/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml index 1f0d06cb8c0..b7768c071f6 100644 --- a/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml +++ b/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml @@ -64,9 +64,6 @@ stage_args: # Top-level runtime config with Mooncake connector runtime: enabled: true - defaults: - window_size: -1 - max_inflight: 1 connectors: mooncake_connector: name: MooncakeConnector @@ -80,4 +77,3 @@ runtime: edges: - from: 0 to: 1 - window_size: -1 diff --git a/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml b/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml index 36b1d2bbe48..504f3c98e92 100644 --- a/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml +++ b/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml @@ -62,10 +62,6 @@ stage_args: # Runtime edges runtime: enabled: true - defaults: - window_size: -1 - max_inflight: 1 - # Distributed connectors configuration (optional) # More connectors will be supported in the future. connectors: @@ -78,4 +74,3 @@ runtime: edges: - from: 0 to: 1 - window_size: -1 diff --git a/tests/e2e/offline_inference/stage_configs/npu/qwen2_5_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/npu/qwen2_5_omni_ci.yaml deleted file mode 100644 index f93a6c71473..00000000000 --- a/tests/e2e/offline_inference/stage_configs/npu/qwen2_5_omni_ci.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# stage config for running qwen2.5-omni for multi-stage omni runtime. - -# This config is optimized for CI e2e tests. 
-stage_args: - - stage_id: 0 - runtime: - process: true # Run this stage in a separate process - devices: "0" - engine_args: - model_stage: thinker - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - max_model_len: 896 - max_num_batched_tokens: 896 - max_num_seqs: 1 - gpu_memory_utilization: 0.8 - skip_mm_profiling: true - enforce_eager: true # Now we only support eager mode - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - mm_processor_cache_gb: 0 - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 128 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - - stage_id: 1 - runtime: - process: true - devices: "1" - engine_args: - model_stage: talker - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: ar - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - max_model_len: 896 - max_num_batched_tokens: 896 - max_num_seqs: 1 - gpu_memory_utilization: 0.8 - skip_mm_profiling: true - enforce_eager: true - trust_remote_code: true - enable_prefix_caching: false - engine_output_type: latent - engine_input_source: [0] - custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker - default_sampling_params: - temperature: 0.9 - top_p: 0.8 - top_k: 40 - max_tokens: 128 - seed: 42 - detokenize: True - repetition_penalty: 1.05 - stop_token_ids: [8294] - - stage_id: 2 - runtime: - process: true - devices: "0" # Example: use a different GPU than the previous stage; use "0" if single GPU - engine_args: - model_stage: code2wav - max_num_seqs: 1 - model_arch: Qwen2_5OmniForConditionalGeneration - worker_type: generation - scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler - gpu_memory_utilization: 0.15 - enforce_eager: true - trust_remote_code: 
true - enable_prefix_caching: false - engine_output_type: audio - engine_input_source: [1] - final_output: true - final_output_type: audio - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 128 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - -# Top-level runtime config (concise): default windows and stage edges -runtime: - enabled: true - defaults: - window_size: -1 # Simplified: trigger downstream only after full upstream completion - max_inflight: 1 # Simplified: process serially within each stage - edges: - - from: 0 # thinker → talker: trigger only after receiving full input (-1) - to: 1 - window_size: -1 - - from: 1 # talker → code2wav: trigger only after receiving full input (-1) - to: 2 - window_size: -1 diff --git a/tests/e2e/offline_inference/test_bagel_img2img.py b/tests/e2e/offline_inference/test_bagel_img2img.py index 63d2a37da79..66aec80c7c4 100644 --- a/tests/e2e/offline_inference/test_bagel_img2img.py +++ b/tests/e2e/offline_inference/test_bagel_img2img.py @@ -22,9 +22,10 @@ from PIL import Image from vllm.assets.image import ImageAsset -from tests.conftest import OmniRunner, modify_stage_config -from tests.utils import hardware_test -from vllm_omni import Omni +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniRunner +from tests.helpers.stage_config import modify_stage_config +from vllm_omni.entrypoints.omni import Omni from vllm_omni.platforms import current_omni_platform # Reference pixel data extracted from the known-good output image @@ -32,30 +33,30 @@ # prompt='Change the grass color to red', # input image: 2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg REFERENCE_PIXELS = [ - {"position": (100, 100), "rgb": (157, 172, 217)}, - {"position": (400, 50), "rgb": (105, 144, 218)}, - {"position": (700, 100), "rgb": (118, 159, 233)}, - {"position": (150, 400), "rgb": (195, 34, 60)}, - {"position": (512, 336), "rgb": (222, 214, 193)}, - {"position": (700, 400), "rgb": 
(197, 15, 43)}, - {"position": (100, 600), "rgb": (105, 13, 18)}, - {"position": (400, 600), "rgb": (169, 33, 44)}, - {"position": (700, 600), "rgb": (101, 86, 93)}, - {"position": (256, 256), "rgb": (181, 202, 222)}, + {"position": (100, 100), "rgb": (156, 172, 217)}, + {"position": (400, 50), "rgb": (105, 144, 217)}, + {"position": (700, 100), "rgb": (118, 159, 232)}, + {"position": (150, 400), "rgb": (180, 22, 52)}, + {"position": (512, 336), "rgb": (221, 211, 194)}, + {"position": (700, 400), "rgb": (192, 10, 46)}, + {"position": (100, 600), "rgb": (102, 12, 22)}, + {"position": (400, 600), "rgb": (161, 28, 47)}, + {"position": (700, 600), "rgb": (100, 87, 94)}, + {"position": (256, 256), "rgb": (181, 201, 221)}, ] if current_omni_platform.is_rocm(): REFERENCE_PIXELS = [ - {"position": (100, 100), "rgb": (156, 172, 215)}, - {"position": (400, 50), "rgb": (106, 144, 216)}, - {"position": (700, 100), "rgb": (118, 158, 231)}, - {"position": (150, 400), "rgb": (183, 23, 48)}, - {"position": (512, 336), "rgb": (218, 215, 191)}, - {"position": (700, 400), "rgb": (194, 14, 42)}, - {"position": (100, 600), "rgb": (105, 10, 16)}, - {"position": (400, 600), "rgb": (167, 33, 46)}, - {"position": (700, 600), "rgb": (102, 86, 92)}, - {"position": (256, 256), "rgb": (181, 201, 220)}, + {"position": (100, 100), "rgb": (156, 172, 217)}, + {"position": (400, 50), "rgb": (105, 144, 217)}, + {"position": (700, 100), "rgb": (118, 159, 232)}, + {"position": (150, 400), "rgb": (180, 22, 52)}, + {"position": (512, 336), "rgb": (221, 211, 194)}, + {"position": (700, 400), "rgb": (192, 10, 46)}, + {"position": (100, 600), "rgb": (102, 12, 22)}, + {"position": (400, 600), "rgb": (161, 28, 47)}, + {"position": (700, 600), "rgb": (100, 87, 94)}, + {"position": (256, 256), "rgb": (181, 201, 221)}, ] PIXEL_TOLERANCE = 10 diff --git a/tests/e2e/offline_inference/test_bagel_lora.py b/tests/e2e/offline_inference/test_bagel_lora.py index 501d23eaa88..75f41f9beea 100644 --- 
a/tests/e2e/offline_inference/test_bagel_lora.py +++ b/tests/e2e/offline_inference/test_bagel_lora.py @@ -31,9 +31,10 @@ from PIL import Image from safetensors.torch import save_file -from tests.conftest import OmniRunner, modify_stage_config -from tests.utils import hardware_test -from vllm_omni import Omni +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniRunner +from tests.helpers.stage_config import modify_stage_config +from vllm_omni.entrypoints.omni import Omni from vllm_omni.lora.request import LoRARequest from vllm_omni.lora.utils import stable_lora_int_id diff --git a/tests/e2e/offline_inference/test_bagel_text2img.py b/tests/e2e/offline_inference/test_bagel_text2img.py index e45d64f2ac5..0819f103a0a 100644 --- a/tests/e2e/offline_inference/test_bagel_text2img.py +++ b/tests/e2e/offline_inference/test_bagel_text2img.py @@ -27,9 +27,10 @@ import pytest from PIL import Image -from tests.conftest import OmniRunner, modify_stage_config -from tests.utils import hardware_test -from vllm_omni import Omni +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniRunner +from tests.helpers.stage_config import modify_stage_config +from vllm_omni.entrypoints.omni import Omni from vllm_omni.platforms import current_omni_platform # Reference pixel data extracted from the known-good output image @@ -37,30 +38,30 @@ # "Generated with seed=52, num_inference_steps=15, # prompt='A futuristic city skyline at twilight, cyberpunk style'" REFERENCE_PIXELS = [ - {"position": (100, 100), "rgb": (121, 118, 100)}, - {"position": (400, 50), "rgb": (163, 162, 143)}, - {"position": (700, 100), "rgb": (170, 156, 127)}, - {"position": (150, 400), "rgb": (129, 127, 112)}, - {"position": (512, 512), "rgb": (135, 61, 59)}, - {"position": (700, 400), "rgb": (205, 107, 43)}, - {"position": (100, 700), "rgb": (197, 177, 157)}, - {"position": (400, 700), "rgb": (139, 107, 86)}, - {"position": (700, 700), "rgb": (247, 205, 146)}, - 
{"position": (256, 256), "rgb": (171, 160, 153)}, + {"position": (100, 100), "rgb": (115, 113, 94)}, + {"position": (400, 50), "rgb": (159, 160, 144)}, + {"position": (700, 100), "rgb": (164, 151, 123)}, + {"position": (150, 400), "rgb": (120, 121, 107)}, + {"position": (512, 512), "rgb": (165, 133, 127)}, + {"position": (700, 400), "rgb": (217, 130, 66)}, + {"position": (100, 700), "rgb": (191, 168, 152)}, + {"position": (400, 700), "rgb": (130, 96, 77)}, + {"position": (700, 700), "rgb": (247, 203, 140)}, + {"position": (256, 256), "rgb": (167, 156, 150)}, ] if current_omni_platform.is_rocm(): REFERENCE_PIXELS = [ - {"position": (100, 100), "rgb": (123, 119, 100)}, - {"position": (400, 50), "rgb": (162, 161, 142)}, - {"position": (700, 100), "rgb": (171, 156, 127)}, - {"position": (150, 400), "rgb": (131, 128, 112)}, - {"position": (512, 512), "rgb": (134, 61, 59)}, - {"position": (700, 400), "rgb": (204, 107, 43)}, - {"position": (100, 700), "rgb": (201, 180, 165)}, - {"position": (400, 700), "rgb": (140, 108, 87)}, - {"position": (700, 700), "rgb": (247, 205, 145)}, - {"position": (256, 256), "rgb": (171, 160, 153)}, + {"position": (100, 100), "rgb": (115, 113, 94)}, + {"position": (400, 50), "rgb": (159, 160, 144)}, + {"position": (700, 100), "rgb": (164, 151, 123)}, + {"position": (150, 400), "rgb": (120, 121, 107)}, + {"position": (512, 512), "rgb": (165, 133, 127)}, + {"position": (700, 400), "rgb": (217, 130, 66)}, + {"position": (100, 700), "rgb": (191, 168, 152)}, + {"position": (400, 700), "rgb": (130, 96, 77)}, + {"position": (700, 700), "rgb": (247, 203, 140)}, + {"position": (256, 256), "rgb": (167, 156, 150)}, ] # Maximum allowed difference per color channel diff --git a/tests/e2e/offline_inference/test_bagel_understanding.py b/tests/e2e/offline_inference/test_bagel_understanding.py index bbee3298079..c3ed97b42bd 100644 --- a/tests/e2e/offline_inference/test_bagel_understanding.py +++ b/tests/e2e/offline_inference/test_bagel_understanding.py @@ 
-26,8 +26,9 @@ import pytest from vllm.assets.image import ImageAsset -from tests.conftest import OmniRunner, modify_stage_config -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniRunner +from tests.helpers.stage_config import modify_stage_config MODEL_NAME = "ByteDance-Seed/BAGEL-7B-MoT" STAGE_CONFIG = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") diff --git a/tests/e2e/offline_inference/test_cache_dit.py b/tests/e2e/offline_inference/test_cache_dit.py index fc08da7bedf..1577dd9f6db 100644 --- a/tests/e2e/offline_inference/test_cache_dit.py +++ b/tests/e2e/offline_inference/test_cache_dit.py @@ -11,8 +11,8 @@ import pytest import torch -from tests.conftest import OmniRunner -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniRunner from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform diff --git a/tests/e2e/offline_inference/test_cosyvoice3.py b/tests/e2e/offline_inference/test_cosyvoice3.py index 8c88d972d5e..db5debac828 100644 --- a/tests/e2e/offline_inference/test_cosyvoice3.py +++ b/tests/e2e/offline_inference/test_cosyvoice3.py @@ -26,8 +26,8 @@ from huggingface_hub import snapshot_download from vllm.sampling_params import SamplingParams -from tests.conftest import OmniRunner -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniRunner from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config from vllm_omni.model_executor.models.cosyvoice3.tokenizer import get_qwen_tokenizer diff --git a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py index 257755ef8b9..d7fd6f72f5b 100644 --- 
a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py +++ b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py @@ -4,8 +4,9 @@ import torch from vllm.distributed.parallel_state import cleanup_dist_env_and_memory -from tests.conftest import OmniRunner -from tests.utils import DeviceMemoryMonitor, hardware_test +from tests.helpers.env import DeviceMemoryMonitor +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniRunner from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform diff --git a/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py b/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py index bdfd594c774..4f19c100476 100644 --- a/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py +++ b/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py @@ -2,8 +2,8 @@ import torch from vllm.distributed.parallel_state import cleanup_dist_env_and_memory -from tests.conftest import OmniRunner -from tests.utils import DeviceMemoryMonitor +from tests.helpers.env import DeviceMemoryMonitor +from tests.helpers.runtime import OmniRunner from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform diff --git a/tests/e2e/offline_inference/test_diffusion_lora.py b/tests/e2e/offline_inference/test_diffusion_lora.py index 7edd03f20d1..027dadb3f4e 100644 --- a/tests/e2e/offline_inference/test_diffusion_lora.py +++ b/tests/e2e/offline_inference/test_diffusion_lora.py @@ -7,7 +7,7 @@ import torch from safetensors.torch import save_file -from tests.conftest import OmniRunner +from tests.helpers.runtime import OmniRunner from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform diff --git a/tests/e2e/offline_inference/test_dynin_omni.py b/tests/e2e/offline_inference/test_dynin_omni.py 
index 5388ac67468..f891fc4f12e 100644 --- a/tests/e2e/offline_inference/test_dynin_omni.py +++ b/tests/e2e/offline_inference/test_dynin_omni.py @@ -18,7 +18,7 @@ import torch from transformers import AutoTokenizer -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" diff --git a/tests/e2e/offline_inference/test_expert_parallel.py b/tests/e2e/offline_inference/test_expert_parallel.py index 29d84d7a3e2..f11646b300d 100644 --- a/tests/e2e/offline_inference/test_expert_parallel.py +++ b/tests/e2e/offline_inference/test_expert_parallel.py @@ -18,8 +18,8 @@ import torch.distributed as dist from PIL import Image -from tests.conftest import OmniRunner -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniRunner from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform diff --git a/tests/e2e/offline_inference/test_flux_autoround_w4a16.py b/tests/e2e/offline_inference/test_flux_autoround_w4a16.py index cbcd1009dd5..ef5d6f9e051 100644 --- a/tests/e2e/offline_inference/test_flux_autoround_w4a16.py +++ b/tests/e2e/offline_inference/test_flux_autoround_w4a16.py @@ -14,8 +14,9 @@ import torch from vllm.distributed.parallel_state import cleanup_dist_env_and_memory -from tests.conftest import OmniRunner -from tests.utils import DeviceMemoryMonitor, hardware_test +from tests.helpers.env import DeviceMemoryMonitor +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniRunner from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform diff --git a/tests/e2e/offline_inference/test_flux_kontext.py 
b/tests/e2e/offline_inference/test_flux_kontext.py index cd711d6b818..057319c855f 100644 --- a/tests/e2e/offline_inference/test_flux_kontext.py +++ b/tests/e2e/offline_inference/test_flux_kontext.py @@ -13,7 +13,7 @@ from PIL import Image from vllm.assets.image import ImageAsset -from tests.conftest import OmniRunner +from tests.helpers.runtime import OmniRunner from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.inputs.data import OmniDiffusionSamplingParams diff --git a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py index ec4f4693d75..bd0d132d093 100644 --- a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py +++ b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py @@ -8,7 +8,7 @@ from PIL import Image from transformers import CLIPModel, CLIPProcessor -from tests.conftest import OmniRunner +from tests.helpers.runtime import OmniRunner from vllm_omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform diff --git a/tests/e2e/offline_inference/test_ltx2_cfg_parallel_parity.py b/tests/e2e/offline_inference/test_ltx2_cfg_parallel_parity.py index 659040929e2..07aa5a647be 100644 --- a/tests/e2e/offline_inference/test_ltx2_cfg_parallel_parity.py +++ b/tests/e2e/offline_inference/test_ltx2_cfg_parallel_parity.py @@ -11,7 +11,7 @@ import pytest from PIL import Image -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test REPO_ROOT = Path(__file__).resolve().parents[3] T2V_EXAMPLE = REPO_ROOT / "examples" / "offline_inference" / "text_to_video" / "text_to_video.py" diff --git a/tests/e2e/offline_inference/test_magi_human.py b/tests/e2e/offline_inference/test_magi_human.py index abb7f9c163c..6d46141729e 100644 --- a/tests/e2e/offline_inference/test_magi_human.py +++ b/tests/e2e/offline_inference/test_magi_human.py @@ -8,8 +8,8 @@ import numpy as np import 
pytest -from tests.conftest import OmniRunner -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniRunner from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes from vllm_omni.inputs.data import OmniDiffusionSamplingParams diff --git a/tests/e2e/offline_inference/test_mammoth_moda2.py b/tests/e2e/offline_inference/test_mammoth_moda2.py index ff744c86e1e..c3d95844c11 100644 --- a/tests/e2e/offline_inference/test_mammoth_moda2.py +++ b/tests/e2e/offline_inference/test_mammoth_moda2.py @@ -23,8 +23,10 @@ import torch from vllm.sampling_params import SamplingParams -from tests.conftest import OmniRunner -from tests.utils import hardware_test +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniRunner + +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" # --------------------------------------------------------------------------- # Constants diff --git a/tests/e2e/offline_inference/test_ming_flash_omni.py b/tests/e2e/offline_inference/test_ming_flash_omni.py new file mode 100644 index 00000000000..c591e910ac3 --- /dev/null +++ b/tests/e2e/offline_inference/test_ming_flash_omni.py @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" + +from pathlib import Path + +import pytest + +from tests.helpers.mark import hardware_test +from tests.helpers.media import ( + generate_synthetic_audio, + generate_synthetic_image, + generate_synthetic_video, +) +from tests.helpers.stage_config import modify_stage_config + +models = ["Jonathan1909/Ming-flash-omni-2.0"] + +# Ming-specific +SYSTEM_PROMPT = "你是一个友好的AI助手。\n\ndetailed thinking off" +EOS_TOKEN = "<|role_end|>" +IMAGE_TOKEN = "" +VIDEO_TOKEN = "