vllm-project · hsliuustc0106 · Mar 5, 2026 · Mar 1, 2026 · Mar 1, 2026 · Mar 1, 2026
@@ -19,7 +19,7 @@ The `async_chunk` feature enables asynchronous, chunked processing of data acros
 
 For qwen3-omni:
 - **Thinker → Talker**: Per decode step (typically chunk_size=1)
-- **Talker → Code2Wav**: Accumulated to code2wav chunk_size(default=25, currently only support default, will support chunk_size soon) before sending
+- **Talker → Code2Wav**: Accumulated to `codec_chunk_frames` frames (default=25) before sending. Set `initial_codec_chunk_frames` to emit smaller chunks during the initial phase, reducing time to first audio (TTFA).
 - **Code2Wav**: Streaming decode with code2wav chunk_size
 
 With `async_chunk`:

@@ -96,6 +96,7 @@ Content-Type: application/json
 | `language` | string | "Auto" | Language (see supported languages below) |
 | `instructions` | string | "" | Voice style/emotion instructions |
 | `max_new_tokens` | integer | 2048 | Maximum tokens to generate |
+| `initial_codec_chunk_frames` | integer | null | Initial chunk size, in codec frames, for reduced TTFA (overrides stage config) |
 
 **Supported languages:** Auto, Chinese, English, Japanese, Korean, German, French, Russian, Portuguese, Spanish, Italian
 

@@ -98,7 +98,8 @@ Add `--streaming` to stream audio chunks progressively via `AsyncOmni` (requires
 python end2end.py --query-type CustomVoice --streaming --output-dir /tmp/out_stream
 ```
 
-Each 25-frame Code2Wav chunk is logged as it arrives. The final WAV file is written once generation
+Each Code2Wav chunk is logged as it arrives (default 25 frames; configurable via `codec_chunk_frames`
+and `initial_codec_chunk_frames` in the stage config). The final WAV file is written once generation
 completes. This demonstrates that audio data is available progressively rather than only at the end.
 
 > **Note:** Streaming uses `AsyncOmni` internally. The non-streaming path (`Omni`) is unchanged.

@@ -268,7 +268,7 @@ Returns binary audio data with appropriate `Content-Type` header (e.g., `audio/w
 ## Streaming
 
 Set `stream=true` with `response_format="pcm"` to receive raw PCM audio chunks as they are decoded
-(one chunk per 25-frame Code2Wav window):
+(one chunk per Code2Wav window, default 25 frames; configurable via `codec_chunk_frames` in the stage config):
 
 ```bash
 curl -X POST http://localhost:8091/v1/audio/speech \

@@ -95,7 +95,8 @@ Add `--streaming` to stream audio chunks progressively via `AsyncOmni` (requires
 python end2end.py --query-type CustomVoice --streaming --output-dir /tmp/out_stream
 ```
 
-Each 25-frame Code2Wav chunk is logged as it arrives. The final WAV file is written once generation
+Each Code2Wav chunk is logged as it arrives (default 25 frames; configurable via `codec_chunk_frames`
+and `initial_codec_chunk_frames` in the stage config). The final WAV file is written once generation
 completes. This demonstrates that audio data is available progressively rather than only at the end.
 
 > **Note:** Streaming uses `AsyncOmni` internally. The non-streaming path (`Omni`) is unchanged.

@@ -7,6 +7,7 @@
 import asyncio
 import logging
 import os
+import time
 from typing import Any, NamedTuple
 
 import soundfile as sf
@@ -337,13 +338,27 @@ async def main_streaming(args):
 
     for i, prompt in enumerate(inputs):
         request_id = str(i)
+        t_start = time.perf_counter()
+        t_prev = t_start
+        chunk_idx = 0
         async for stage_output in omni.generate(prompt, request_id=request_id):
             mm = stage_output.request_output.outputs[0].multimodal_output
             if not stage_output.finished:
+                t_now = time.perf_counter()
                 audio = mm.get("audio")
                 n = len(audio) if isinstance(audio, list) else (0 if audio is None else 1)
-                logger.info(f"Request {request_id}: received chunk {n}")
+                dt_ms = (t_now - t_prev) * 1000
+                ttfa_ms = (t_now - t_start) * 1000
+                if chunk_idx == 0:
+                    logger.info(f"Request {request_id}: chunk {chunk_idx} samples={n} TTFA={ttfa_ms:.1f}ms")
+                else:
+                    logger.info(f"Request {request_id}: chunk {chunk_idx} samples={n} inter_chunk={dt_ms:.1f}ms")
+                t_prev = t_now
+                chunk_idx += 1
             else:
+                t_end = time.perf_counter()
+                total_ms = (t_end - t_start) * 1000
+                logger.info(f"Request {request_id}: done total={total_ms:.1f}ms chunks={chunk_idx}")
                 _save_wav(output_dir, request_id, mm)
 
 

@@ -250,6 +250,7 @@ Returns binary audio data with appropriate `Content-Type` header (e.g., `audio/w
 | `language`       | string | "Auto"        | Language (see supported languages below)                                     |
 | `instructions`   | string | ""            | Voice style/emotion instructions                                             |
 | `max_new_tokens` | int    | 2048          | Maximum tokens to generate                                                   |
+| `initial_codec_chunk_frames` | int | null | Initial chunk size, in codec frames, for reduced TTFA (overrides stage config) |
 | `stream`         | bool   | false         | Stream raw PCM chunks as they are decoded (requires `response_format="pcm"`) |
 
 **Supported languages:** Auto, Chinese, English, Japanese, Korean, German, French, Russian, Portuguese, Spanish, Italian
@@ -265,7 +266,7 @@ Returns binary audio data with appropriate `Content-Type` header (e.g., `audio/w
 ## Streaming
 
 Set `stream=true` with `response_format="pcm"` to receive raw PCM audio chunks as they are decoded
-(one chunk per 25-frame Code2Wav window):
+(one chunk per Code2Wav window, default 25 frames; configurable via `codec_chunk_frames` in the stage config):
 
 ```bash
 curl -X POST http://localhost:8091/v1/audio/speech \

@@ -4,64 +4,90 @@
 from collections import defaultdict
 from types import SimpleNamespace
 
+import pytest
 import torch
 
 from vllm_omni.model_executor.stage_input_processors.qwen3_tts import talker2code2wav_async_chunk
 
+_FRAME = [1, 2, 3, 4]  # 4-codebook frame
+_Q = len(_FRAME)  # num quantizers
 
-def _req(external_req_id: str, *, finished: bool):
+
+def _req(rid: str, *, finished: bool, initial_codec_chunk_frames: int | None = None):
+    ai = None
+    if initial_codec_chunk_frames is not None:
+        entry = SimpleNamespace(list_data=[initial_codec_chunk_frames])
+        ai = SimpleNamespace(entries={"initial_codec_chunk_frames": entry})
     return SimpleNamespace(
-        external_req_id=external_req_id,
+        external_req_id=rid,
         is_finished=lambda: finished,
+        additional_information=ai,
     )
 
 
-def test_talker2code2wav_async_chunk_does_not_emit_empty_chunk_when_not_finished():
-    transfer_manager = SimpleNamespace(
+def _tm(*, chunk_frames=25, left_context=25, initial_chunk=0):
+    return SimpleNamespace(
         code_prompt_token_ids=defaultdict(list),
-        connector=SimpleNamespace(config={"extra": {"codec_chunk_frames": 25, "codec_left_context_frames": 25}}),
+        put_req_chunk=defaultdict(int),
+        connector=SimpleNamespace(
+            config={
+                "extra": {
+                    "codec_chunk_frames": chunk_frames,
+                    "codec_left_context_frames": left_context,
+                    "initial_codec_chunk_frames": initial_chunk,
+                }
+            }
+        ),
+    )
+
+
+def _call(tm, rid, *, n_frames, put_req=0, finished=False, req_ic=None):
+    tm.code_prompt_token_ids[rid] = [_FRAME[:] for _ in range(n_frames)]
+    tm.put_req_chunk[rid] = put_req
+    return talker2code2wav_async_chunk(
+        transfer_manager=tm,
+        pooling_output={"audio_codes": torch.zeros((0,))},
+        request=_req(rid, finished=finished, initial_codec_chunk_frames=req_ic),
+        is_finished=finished,
     )
+
+
+def test_does_not_emit_empty_chunk_when_not_finished():
+    tm = _tm()
     request = _req("rid-empty", finished=False)
 
     payload = talker2code2wav_async_chunk(
-        transfer_manager=transfer_manager,
+        transfer_manager=tm,
         pooling_output={"audio_codes": torch.zeros((0,))},
         request=request,
     )
 
     assert payload is None
 
 
-def test_talker2code2wav_async_chunk_flushes_tail_when_finished_without_pooler_output():
-    transfer_manager = SimpleNamespace(
-        code_prompt_token_ids=defaultdict(list),
-        connector=SimpleNamespace(config={"extra": {"codec_chunk_frames": 25, "codec_left_context_frames": 25}}),
-    )
-    request_id = "rid-tail"
-    transfer_manager.code_prompt_token_ids[request_id] = [[1, 2, 3, 4] for _ in range(24)]
-    request = _req(request_id, finished=True)
+def test_flushes_tail_when_finished_without_pooler_output():
+    tm = _tm()
+    rid = "rid-tail"
+    tm.code_prompt_token_ids[rid] = [_FRAME[:] for _ in range(24)]
+    request = _req(rid, finished=True)
 
     payload = talker2code2wav_async_chunk(
-        transfer_manager=transfer_manager,
-        pooling_output=None,  # e.g. EOS step with no audio_codes
+        transfer_manager=tm,
+        pooling_output=None,
         request=request,
     )
 
     assert payload is not None
     assert payload["finished"].item() is True
-    # ctx_frames header + flat codes
-    assert len(payload["code_predictor_codes"]) == 1 + 4 * 24
+    assert len(payload["code_predictor_codes"]) == _Q * 24
 
 
-def test_talker2code2wav_async_chunk_emits_eof_marker_when_finished_with_no_frames():
-    transfer_manager = SimpleNamespace(
-        code_prompt_token_ids=defaultdict(list),
-        connector=SimpleNamespace(config={"extra": {"codec_chunk_frames": 25, "codec_left_context_frames": 25}}),
-    )
+def test_emits_eof_marker_when_finished_with_no_frames():
+    tm = _tm()
     request = _req("rid-eof", finished=True)
 
     payload = talker2code2wav_async_chunk(
-        transfer_manager=transfer_manager,
+        transfer_manager=tm,
         pooling_output=None,
         request=request,
     )
@@ -70,3 +96,59 @@ def test_talker2code2wav_async_chunk_emits_eof_marker_when_finished_with_no_fram
         "code_predictor_codes": [],
         "finished": torch.tensor(True, dtype=torch.bool),
     }
+
+
+_CASES = [
+    # Normal path (initial=0): emit at chunk_size boundaries
+    ((25, 25, 0), (24, 0, False), None),
+    ((25, 25, 0), (25, 0, False), (0, 25)),
+    # Initial-chunk phase: hold, first emit, second emit
+    ((25, 25, 10), (9, 0, False), None),
+    ((25, 25, 10), (10, 0, False), (0, 10)),
+    ((25, 25, 10), (20, 1, False), (10, 20)),
+    # Non-divisible: holds at chunk boundary
+    ((25, 25, 12), (25, 2, False), None),
+    # Normal phase: offset by initial_coverage (chunk//initial * initial)
+    ((25, 25, 10), (45, 2, False), (20, 45)),
+    # Second normal emit (offset must stay stable)
+    ((25, 25, 10), (70, 3, False), (25, 50)),
+    # initial >= chunk clamps to chunk_size (behaves as normal)
+    ((25, 25, 30), (25, 0, False), (0, 25)),
+    # finished=True flushes IC tail
+    ((25, 25, 10), (5, 0, True), (0, 5)),
+    # finished=True flushes non-divisible IC residual
+    ((25, 25, 12), (25, 2, True), (24, 25)),
+    # finished=True flushes normal phase tail
+    ((25, 25, 10), (30, 2, True), (20, 30)),
+]
+
+
+@pytest.mark.parametrize("config, state, expected", _CASES)
+def test_streaming_decoding_with_variable_initial(config, state, expected):
+    chunk_frames, left_context, initial_chunk = config
+    n_frames, put_req, finished = state
+
+    tm = _tm(chunk_frames=chunk_frames, left_context=left_context, initial_chunk=initial_chunk)
+    payload = _call(tm, "r", n_frames=n_frames, put_req=put_req, finished=finished)
+
+    if expected is None:
+        assert payload is None
+    else:
+        exp_ctx, exp_window = expected
+        assert payload is not None
+        assert payload["left_context_size"] == exp_ctx
+        assert len(payload["code_predictor_codes"]) == _Q * exp_window
+
+
+def test_per_request_override_activates_initial_phase():
+    tm = _tm(initial_chunk=0)
+    payload = _call(tm, "r-override", n_frames=10, req_ic=10)
+    assert payload is not None
+    assert payload["left_context_size"] == 0
+    assert len(payload["code_predictor_codes"]) == _Q * 10
+
+
+def test_per_request_override_wins_over_stage_config():
+    tm = _tm(initial_chunk=5)
+    payload = _call(tm, "r-override2", n_frames=10, put_req=0, req_ic=15)
+    assert payload is None
@@ -55,6 +55,11 @@ class OpenAICreateSpeechRequest(BaseModel):
         default=None,
         description="Maximum tokens to generate",
     )
+    initial_codec_chunk_frames: int | None = Field(
+        default=None,
+        ge=0,
+        description="Initial chunk size for reduced TTFA. Overrides stage config for this request.",
+    )
 
     @field_validator("stream_format")
     @classmethod

@@ -468,6 +468,9 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any
         else:
             params["max_new_tokens"] = [2048]
 
+        if request.initial_codec_chunk_frames is not None:
+            params["initial_codec_chunk_frames"] = [request.initial_codec_chunk_frames]
+
         # VoiceDesign requires non_streaming_mode (match offline script behaviour).
         # CustomVoice and Base rely on the model default (True and False respectively).
         if params["task_type"][0] == "VoiceDesign":

@@ -96,6 +96,10 @@ runtime:
         # Align with Omni: small chunks with sufficient context overlap.
         codec_chunk_frames: 25
         codec_left_context_frames: 25
+        # Chunk size (in frames) used during the initial phase to reduce TTFA (0 = disabled).
+        # When > 0, a small chunk is emitted every N frames during the initial phase;
+        # emission then switches to the codec_chunk_frames cadence.
+        initial_codec_chunk_frames: 0
 
   edges:
     - from: 0

@@ -98,6 +98,10 @@ runtime:
         # Align with Omni: small chunks with sufficient context overlap.
         codec_chunk_frames: 25
         codec_left_context_frames: 25
+        # Chunk size (in frames) used during the initial phase to reduce TTFA (0 = disabled).
+        # When > 0, a small chunk is emitted every N frames during the initial phase;
+        # emission then switches to the codec_chunk_frames cadence.
+        initial_codec_chunk_frames: 0
 
   edges:
     - from: 0