def _apply_hf_processor_mm_only(
    self,
    mm_items: MultiModalDataItems,
    hf_processor_mm_kwargs: Mapping[str, object],
    tokenization_kwargs: Mapping[str, object],
) -> BatchFeature:
    """Process only the multi-modal items and return their features.

    Why this override exists: Voxtral TTS uses the Mistral tokenizer
    together with a custom audio encoder. That processor does not inherit
    from the Transformers ``ProcessorMixin``, so ``call_hf_processor_mm_only``
    cannot be used for it.

    How it works: the combined text + multi-modal path
    (``_apply_hf_processor_text_mm``) is invoked with dummy prompt text
    generated for the item counts found in ``mm_items``; only the
    multi-modal part of its result is kept.

    Args:
        mm_items: The multi-modal data items to process.
        hf_processor_mm_kwargs: Passed through unchanged to
            ``_apply_hf_processor_text_mm``.
        tokenization_kwargs: Passed through unchanged to
            ``_apply_hf_processor_text_mm``.

    Returns:
        The second element of the 3-tuple returned by
        ``_apply_hf_processor_text_mm`` (the processed multi-modal data).
    """
    # The text itself is irrelevant here; it only has to be consistent
    # with the number of items per modality so the combined path can run.
    placeholder_prompt = self.dummy_inputs.get_dummy_text(
        mm_items.get_all_counts()
    )

    # Strict 3-way unpacking is kept on purpose: the first and last
    # elements are discarded, but an unexpected arity should still fail.
    _unused_prompt, processed_mm, _unused_flag = self._apply_hf_processor_text_mm(
        prompt_text=placeholder_prompt,
        mm_items=mm_items,
        hf_processor_mm_kwargs=hf_processor_mm_kwargs,
        tokenization_kwargs=tokenization_kwargs,
    )
    return processed_mm