diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index b8b5e893d154..de8027c434cc 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -750,18 +750,20 @@ def pad_to_hop_length(x: np.ndarray, hop_length: int) -> np.ndarray: # https://github.com/huggingface/transformers/pull/41473 mm_kwargs = dict(mm_kwargs) tok_kwargs = dict(tok_kwargs) + mm_kwargs["audio_kwargs"] = dict(mm_kwargs.get("audio_kwargs") or {}) + mm_kwargs["text_kwargs"] = dict(mm_kwargs.get("text_kwargs") or {}) if Version(TRANSFORMERS_VERSION) < Version("4.58.0"): # Extract audio_sample_rate before restructuring audio_sample_rate = mm_kwargs.pop("audio_sample_rate", None) # move truncation to audio_kwargs level to avoid conflict # with tok_kwargs - mm_kwargs["audio_kwargs"] = { - "truncation": mm_kwargs.pop("truncation", False) - } - mm_kwargs["text_kwargs"] = { - "truncation": tok_kwargs.pop("truncation", False) - } + mm_kwargs["audio_kwargs"].setdefault( + "truncation", mm_kwargs.pop("truncation", False) + ) + mm_kwargs["text_kwargs"].setdefault( + "truncation", tok_kwargs.pop("truncation", False) + ) # Validate and conditionally pass audio_sample_rate # WhisperFeatureExtractor has a fixed sampling rate, and vLLM's