diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py index 478a0a7ea9e8..396edba1155d 100644 --- a/examples/online_serving/openai_transcription_client.py +++ b/examples/online_serving/openai_transcription_client.py @@ -27,7 +27,12 @@ def sync_openai( - audio_path: str, client: OpenAI, model: str, *, repetition_penalty: float = 1.3 + audio_path: str, + client: OpenAI, + model: str, + *, + repetition_penalty: float = 1.3, + hotwords: str = None, ): """ Perform synchronous transcription using OpenAI-compatible API. @@ -43,12 +48,15 @@ def sync_openai( extra_body=dict( seed=4419, repetition_penalty=repetition_penalty, + hotwords=hotwords, ), ) print("transcription result [sync]:", transcription.text) -async def stream_openai_response(audio_path: str, client: AsyncOpenAI, model: str): +async def stream_openai_response( + audio_path: str, client: AsyncOpenAI, model: str, hotwords: str = None +): """ Perform asynchronous transcription using OpenAI-compatible API. 
""" @@ -64,6 +72,7 @@ async def stream_openai_response(audio_path: str, client: AsyncOpenAI, model: st extra_body=dict( seed=420, top_p=0.6, + hotwords=hotwords, ), stream=True, ) @@ -136,6 +145,7 @@ def main(args): client=client, model=model, repetition_penalty=args.repetition_penalty, + hotwords=args.hotwords, ) # Run the asynchronous function @@ -146,7 +156,10 @@ def main(args): ) asyncio.run( stream_openai_response( - args.audio_path if args.audio_path else winning_call, client, model + args.audio_path if args.audio_path else winning_call, + client, + model, + hotwords=args.hotwords, ) ) else: @@ -174,5 +187,11 @@ def main(args): default=1.3, help="repetition penalty", ) + parser.add_argument( + "--hotwords", + type=str, + default=None, + help="hotwords", + ) args = parser.parse_args() main(args) diff --git a/vllm/entrypoints/openai/speech_to_text/protocol.py b/vllm/entrypoints/openai/speech_to_text/protocol.py index a8d978e33eb2..97bbc366d4a4 100644 --- a/vllm/entrypoints/openai/speech_to_text/protocol.py +++ b/vllm/entrypoints/openai/speech_to_text/protocol.py @@ -65,12 +65,19 @@ class TranscriptionRequest(OpenAIBaseModel): language: str | None = None """The language of the input audio. + Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format will improve accuracy and latency. """ + hotwords: str | None = None + """ + hotwords refers to a list of important words or phrases that the model + should pay extra attention to during transcription. + """ + prompt: str = Field(default="") """An optional text to guide the model's style or continue a previous audio segment. @@ -446,6 +453,12 @@ class TranslationRequest(OpenAIBaseModel): will improve accuracy. """ + hotwords: str | None = None + """ + hotwords refers to a list of important words or phrases that the model + should pay extra attention to during translation. + """ + to_language: str | None = None """The language of the input audio we translate to. 
diff --git a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py index 4a6030d71b63..68124ff439ee 100644 --- a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py @@ -200,6 +200,8 @@ async def _preprocess_speech_to_text( else None ) + hotwords = request.hotwords if request.hotwords else None + if len(audio_data) / 1024**2 > self.max_audio_filesize_mb: raise VLLMValidationError( "Maximum file size exceeded", @@ -277,6 +279,7 @@ async def _preprocess_speech_to_text( task_type=self.task_type, request_prompt=request.prompt, to_language=to_language, + hotwords=hotwords, ) parsed_prompt: DictPrompt diff --git a/vllm/model_executor/models/cohere_asr.py b/vllm/model_executor/models/cohere_asr.py index 21b38f37fd82..283469eebd28 100644 --- a/vllm/model_executor/models/cohere_asr.py +++ b/vllm/model_executor/models/cohere_asr.py @@ -2024,6 +2024,7 @@ def get_generation_prompt( task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, + hotwords: str | None, ) -> PromptType: if language is None: raise ValueError( diff --git a/vllm/model_executor/models/fireredasr2.py b/vllm/model_executor/models/fireredasr2.py index 0aae13997c57..e996843cc1bd 100644 --- a/vllm/model_executor/models/fireredasr2.py +++ b/vllm/model_executor/models/fireredasr2.py @@ -710,6 +710,7 @@ def get_generation_prompt( task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, + hotwords: str | None, ) -> PromptType: if language is None: raise ValueError( diff --git a/vllm/model_executor/models/funasr.py b/vllm/model_executor/models/funasr.py index 78acca3c2a46..75b5af599602 100644 --- a/vllm/model_executor/models/funasr.py +++ b/vllm/model_executor/models/funasr.py @@ -884,13 +884,20 @@ def get_generation_prompt( task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, + 
hotwords: str | None, ) -> PromptType: if language is None: raise ValueError( "Language must be specified when creating the funasr prompt" ) - funasr_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n语音转写:<|AUDIO|><|im_end|>\n<|im_start|>assistant\n" # noqa: E501 + if hotwords is not None: + funasr_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n请结合上下文信息,更加准确地完成语音转写任务。如果没有相关信息,我们会留空。\n\n\n**上下文信息:**\n\n\n热词列表:[{}]\n语音转写:<|AUDIO|><|im_end|>\n<|im_start|>assistant\n".format( # noqa: E501 + hotwords + ) + else: + funasr_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n语音转写:<|AUDIO|><|im_end|>\n<|im_start|>assistant\n" # noqa: E501 + prompt = { "prompt": funasr_prompt, "multi_modal_data": { diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 4b6f53788183..e40f2eb07c05 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -779,6 +779,7 @@ def get_generation_prompt( task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, + hotwords: str | None, ) -> PromptType: """ Gemma3n supports "free-form" transcription. 
diff --git a/vllm/model_executor/models/glmasr.py b/vllm/model_executor/models/glmasr.py index fd47a014a8c1..598790f90fe9 100644 --- a/vllm/model_executor/models/glmasr.py +++ b/vllm/model_executor/models/glmasr.py @@ -1140,6 +1140,7 @@ def get_generation_prompt( task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, + hotwords: str | None, ) -> PromptType: """Get the generation prompt to be used for transcription requests.""" tokenizer = cached_tokenizer_from_config(model_config) diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 1209f1cbef02..e4f0517dcd6d 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -858,6 +858,7 @@ def get_generation_prompt( task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, + hotwords: str | None, ) -> PromptType: """Get the generation prompt to be used for transcription requests.""" # Audio placeholders don't use an index, so value doesn't matter diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 094887530f17..0cc613c93c71 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1113,6 +1113,7 @@ def get_generation_prompt( task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, + hotwords: str | None, ) -> PromptType: """Get the prompt for the ASR model. 
The model has control over the construction, as long as it diff --git a/vllm/model_executor/models/kimi_audio.py b/vllm/model_executor/models/kimi_audio.py index 651144683f19..bbc27e3ac04c 100644 --- a/vllm/model_executor/models/kimi_audio.py +++ b/vllm/model_executor/models/kimi_audio.py @@ -645,6 +645,7 @@ def get_generation_prompt( task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, + hotwords: str | None, ) -> PromptType: tokenizer = cached_get_tokenizer( model_config.tokenizer, diff --git a/vllm/model_executor/models/qwen3_asr.py b/vllm/model_executor/models/qwen3_asr.py index 5c7b4a567ef8..89c0d5462401 100644 --- a/vllm/model_executor/models/qwen3_asr.py +++ b/vllm/model_executor/models/qwen3_asr.py @@ -536,6 +536,7 @@ def get_generation_prompt( task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, + hotwords: str | None, ) -> PromptType: """Get the generation prompt to be used for transcription requests.""" tokenizer = cached_tokenizer_from_config(model_config) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index fc097ffddbfe..16626ad1f1ba 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -2195,6 +2195,7 @@ def get_generation_prompt( task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, + hotwords: str | None, ) -> PromptType: """ Construct a transcription/translation prompt for Qwen3-Omni. 
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index dba52d106ef1..568a4d24d539 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -439,6 +439,7 @@ def get_generation_prompt( task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, + hotwords: str | None, ) -> PromptType: tokenizer = cached_tokenizer_from_config(model_config) audio = Audio(audio, int(stt_config.sample_rate), format="wav") # lossless diff --git a/vllm/model_executor/models/voxtral_realtime.py b/vllm/model_executor/models/voxtral_realtime.py index bb2c701e9190..223f3aae0c64 100644 --- a/vllm/model_executor/models/voxtral_realtime.py +++ b/vllm/model_executor/models/voxtral_realtime.py @@ -474,6 +474,7 @@ def get_generation_prompt( task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, + hotwords: str | None, ) -> PromptType: tokenizer = cached_tokenizer_from_config(model_config) audio = Audio(audio, int(stt_config.sample_rate), format="wav") # lossless diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 631a829cf4f6..5a70cb65fbbc 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -833,6 +833,7 @@ def get_generation_prompt( task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, + hotwords: str | None, ) -> PromptType: if language is None: raise ValueError(