From 8d700397db352e6e7078da28f27ecbc5ca0c4dbb Mon Sep 17 00:00:00 2001 From: "lvyuanjun.lyj" Date: Thu, 9 Oct 2025 19:46:34 +0800 Subject: [PATCH 1/2] Set `truncation` to `False` in Qwen3Omni to avoid default truncation --- .../models/qwen3_omni_moe/modular_qwen3_omni_moe.py | 3 ++- .../models/qwen3_omni_moe/processing_qwen3_omni_moe.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index 504cbb2f3689..058efd24e7b1 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -2716,7 +2716,8 @@ def __call__( fps = output_kwargs["videos_kwargs"].get("fps", 1.0) if audio is not None: - output_kwargs["audio_kwargs"]["padding"] = True # Setting to True to avoid default truncation + output_kwargs["audio_kwargs"]["padding"] = True + output_kwargs["audio_kwargs"]["truncation"] = False audio_inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"]) audio_inputs["feature_attention_mask"] = audio_inputs.pop( "attention_mask" diff --git a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py index df5629931fa3..f4c70aa4b558 100644 --- a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py @@ -166,7 +166,8 @@ def __call__( fps = output_kwargs["videos_kwargs"].get("fps", 1.0) if audio is not None: - output_kwargs["audio_kwargs"]["padding"] = True # Setting to True to avoid default truncation + output_kwargs["audio_kwargs"]["padding"] = True + output_kwargs["audio_kwargs"]["truncation"] = False audio_inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"]) audio_inputs["feature_attention_mask"] = audio_inputs.pop( "attention_mask" From 335806b55c33c78e31ff4e16060a3c46aa033bfc Mon Sep 17 00:00:00 2001 From: "lvyuanjun.lyj" Date: Fri, 10 Oct 2025 17:17:38 +0800 Subject: [PATCH 2/2] move `padding` and `truncation` to audio default args --- .../models/qwen3_omni_moe/modular_qwen3_omni_moe.py | 3 +-- .../models/qwen3_omni_moe/processing_qwen3_omni_moe.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index 058efd24e7b1..682e1224bfaf 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -2585,6 +2585,7 @@ class Qwen3OmniMoeProcessorKwargs(Qwen2_5OmniProcessorKwargs): "audio_kwargs": { "sampling_rate": 16000, "padding": True, + "truncation": False, "return_attention_mask": True, }, } @@ -2716,8 +2717,6 @@ def __call__( fps = output_kwargs["videos_kwargs"].get("fps", 1.0) if audio is not None: - output_kwargs["audio_kwargs"]["padding"] = True - output_kwargs["audio_kwargs"]["truncation"] = False audio_inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"]) audio_inputs["feature_attention_mask"] = audio_inputs.pop( "attention_mask" diff --git a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py index f4c70aa4b558..d74554de6624 100644 --- a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py @@ -66,6 +66,7 @@ class Qwen3OmniMoeProcessorKwargs(ProcessingKwargs, total=False): "audio_kwargs": { "sampling_rate": 16000, "padding": True, + "truncation": False, "return_attention_mask": True, }, } @@ -166,8 +167,6 @@ def __call__( fps = output_kwargs["videos_kwargs"].get("fps", 1.0) if audio is not None: - output_kwargs["audio_kwargs"]["padding"] = True - output_kwargs["audio_kwargs"]["truncation"] = False audio_inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"]) audio_inputs["feature_attention_mask"] = audio_inputs.pop( "attention_mask"