vllm-project · hsliuustc0106 · May 22, 2026 · May 22, 2026 · May 22, 2026 · May 22, 2026
@@ -549,13 +549,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         # do_sample=True here without also routing vLLM's sampled token back into
         # this gate check would cause the two to diverge and corrupt KV-cache state.
         self.global_sampler = MiMoSampler(do_sample=False, temperature=0.6, top_p=0.95)
-        # local_sampler drives audio-code generation inside local_forward.  It is
-        # entirely internal and NOT subject to vLLM's SamplingParams, so stochastic
-        # sampling is safe here and is required to produce natural, varied speech.
-        # Setting do_sample=True also disables the CUDA-graph path (use_cg gate in
-        # local_forward checks `do_sample is False`), preventing MiMoLocalSamplerTensor
-        # from silently forcing argmax even when temperature > 0.
-        self.local_sampler = MiMoSampler(do_sample=True, temperature=0.9, top_p=0.95)
+        # local_sampler drives audio-code generation inside local_forward.  Keep
+        # it greedy (do_sample=False) so the CUDA-graph path (use_cg gate in
+        # local_forward) stays active AND so the audio codes — and therefore the
+        # `new_audio_emb` written into `_cached_new_audio_emb_by_req` — are
+        # deterministic.  That cache is fed back into `inputs_embeds` on the next
+        # decode step (see `_prepare_multimodal_embeddings_with_cache`), so any
+        # stochasticity here propagates into subsequent *text* logits via the
+        # audio-embedding feedback path and destabilises text continuations even
+        # though `global_sampler` is greedy.  Voice diversity must be tackled in
+        # the codec/vocoder path (stage-1), not by randomising local_sampler.
+        self.local_sampler = MiMoSampler(do_sample=False, temperature=0.9, top_p=0.95)
         self.removed_tokens = None
 
         self.speech_vocab_sizes = config.parsed_speech_vocab_sizes()
@@ -818,7 +822,7 @@ def base_local_forward(
             device=tokens_device,
         )
         if local_sampler is None:
-            local_sampler = MiMoSampler(do_sample=True, temperature=0.9, top_p=0.95)
+            local_sampler = MiMoSampler(do_sample=False, temperature=0.9, top_p=0.95)
 
         past_key_values = DynamicCache()
         for t in range(delay_iters):
@@ -864,7 +868,7 @@ def local_forward(
         local_sampler: MiMoSampler | None = None,
     ):
         if local_sampler is None:
-            local_sampler = MiMoSampler(do_sample=True, temperature=0.9, top_p=0.95)
+            local_sampler = MiMoSampler(do_sample=False, temperature=0.9, top_p=0.95)
 
         b = int(local_embeds.shape[0])
         use_cg = (local_sampler.do_sample is None or local_sampler.do_sample is False) and bool(

@@ -173,7 +173,12 @@ def llm2code2wav_async_chunk(
 
     request_id = getattr(request, "external_req_id", None)
 
-    po_codes = pooling_output.get("codes", {})
+    # Text-only paths (e.g. modalities=["text"]) yield no codec pooling output;
+    # stage-0 still drives the chunk transfer adapter, so treat a None `pooling_output`
+    # as "no codes this step" rather than letting `.get()` raise AttributeError.  An
+    # unhandled error here drops the chunk, starves stage-1 of the finished payload,
+    # and kills the stage subprocesses before the final token is emitted.
+    po_codes = pooling_output.get("codes", {}) if pooling_output is not None else {}
     if "audio" not in po_codes:
         if is_finished:
             return _flush_remaining_codes(transfer_manager, request_id, chunk_size, left_context_size)