Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,13 +549,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
# do_sample=True here without also routing vLLM's sampled token back into
# this gate check would cause the two to diverge and corrupt KV-cache state.
self.global_sampler = MiMoSampler(do_sample=False, temperature=0.6, top_p=0.95)
- # local_sampler drives audio-code generation inside local_forward. It is
- # entirely internal and NOT subject to vLLM's SamplingParams, so stochastic
- # sampling is safe here and is required to produce natural, varied speech.
- # Setting do_sample=True also disables the CUDA-graph path (use_cg gate in
- # local_forward checks `do_sample is False`), preventing MiMoLocalSamplerTensor
- # from silently forcing argmax even when temperature > 0.
- self.local_sampler = MiMoSampler(do_sample=True, temperature=0.9, top_p=0.95)
+ # local_sampler drives audio-code generation inside local_forward. Keep
+ # it greedy (do_sample=False) so the CUDA-graph path (use_cg gate in
+ # local_forward) stays active AND so the audio codes — and therefore the
+ # `new_audio_emb` written into `_cached_new_audio_emb_by_req` — are
+ # deterministic. That cache is fed back into `inputs_embeds` on the next
+ # decode step (see `_prepare_multimodal_embeddings_with_cache`), so any
+ # stochasticity here propagates into subsequent *text* logits via the
+ # audio-embedding feedback path and destabilises text continuations even
+ # though `global_sampler` is greedy. Voice diversity must be tackled in
+ # the codec/vocoder path (stage-1), not by randomising local_sampler.
+ self.local_sampler = MiMoSampler(do_sample=False, temperature=0.9, top_p=0.95)
self.removed_tokens = None

self.speech_vocab_sizes = config.parsed_speech_vocab_sizes()
Expand Down Expand Up @@ -818,7 +822,7 @@ def base_local_forward(
device=tokens_device,
)
if local_sampler is None:
- local_sampler = MiMoSampler(do_sample=True, temperature=0.9, top_p=0.95)
+ local_sampler = MiMoSampler(do_sample=False, temperature=0.9, top_p=0.95)

past_key_values = DynamicCache()
for t in range(delay_iters):
Expand Down Expand Up @@ -864,7 +868,7 @@ def local_forward(
local_sampler: MiMoSampler | None = None,
):
if local_sampler is None:
- local_sampler = MiMoSampler(do_sample=True, temperature=0.9, top_p=0.95)
+ local_sampler = MiMoSampler(do_sample=False, temperature=0.9, top_p=0.95)

b = int(local_embeds.shape[0])
use_cg = (local_sampler.do_sample is None or local_sampler.do_sample is False) and bool(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,12 @@ def llm2code2wav_async_chunk(

request_id = getattr(request, "external_req_id", None)

- po_codes = pooling_output.get("codes", {})
+ # Text-only paths (e.g. modalities=["text"]) yield no codec pooling output;
+ # stage-0 still drives the chunk transfer adapter, so treat None as "no codes
+ # this step" rather than letting `.get()` raise AttributeError — an unhandled
+ # error here drops the chunk, starves stage-1 of the finished payload, and
+ # the stage subprocesses die before the final token is emitted.
+ po_codes = pooling_output.get("codes", {}) if pooling_output is not None else {}
if "audio" not in po_codes:
if is_finished:
return _flush_remaining_codes(transfer_manager, request_id, chunk_size, left_context_size)
Expand Down
Loading