Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions examples/online_serving/openai_transcription_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,12 @@


def sync_openai(
audio_path: str, client: OpenAI, model: str, *, repetition_penalty: float = 1.3
audio_path: str,
client: OpenAI,
model: str,
*,
repetition_penalty: float = 1.3,
hotwords: str = None,
):
"""
Perform synchronous transcription using OpenAI-compatible API.
Expand All @@ -43,12 +48,15 @@ def sync_openai(
extra_body=dict(
seed=4419,
repetition_penalty=repetition_penalty,
hotwords=hotwords,
),
)
print("transcription result [sync]:", transcription.text)


async def stream_openai_response(audio_path: str, client: AsyncOpenAI, model: str):
async def stream_openai_response(
audio_path: str, client: AsyncOpenAI, model: str, hotwords: str = None
):
"""
Perform asynchronous transcription using OpenAI-compatible API.
"""
Expand All @@ -64,6 +72,7 @@ async def stream_openai_response(audio_path: str, client: AsyncOpenAI, model: st
extra_body=dict(
seed=420,
top_p=0.6,
hotwords=hotwords,
),
stream=True,
)
Expand Down Expand Up @@ -136,6 +145,7 @@ def main(args):
client=client,
model=model,
repetition_penalty=args.repetition_penalty,
hotwords=args.hotwords,
)

# Run the asynchronous function
Expand All @@ -146,7 +156,10 @@ def main(args):
)
asyncio.run(
stream_openai_response(
args.audio_path if args.audio_path else winning_call, client, model
args.audio_path if args.audio_path else winning_call,
client,
model,
hotwords=args.hotwords,
)
)
else:
Expand Down Expand Up @@ -174,5 +187,11 @@ def main(args):
default=1.3,
help="repetition penalty",
)
parser.add_argument(
"--hotwords",
type=str,
default=None,
help="hotwords",
)
args = parser.parse_args()
main(args)
13 changes: 13 additions & 0 deletions vllm/entrypoints/openai/speech_to_text/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,19 @@ class TranscriptionRequest(OpenAIBaseModel):

language: str | None = None
"""The language of the input audio.

Supplying the input language in
[ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
will improve accuracy and latency.
"""

hotwords: str | None = None
"""
hotwords refers to a list of important words or phrases that the model
should pay extra attention to during transcription.
"""

prompt: str = Field(default="")
"""An optional text to guide the model's style or continue a previous audio
segment.
Expand Down Expand Up @@ -446,6 +453,12 @@ class TranslationRequest(OpenAIBaseModel):
will improve accuracy.
"""

hotwords: str | None = None
"""
hotwords refers to a list of important words or phrases that the model
should pay extra attention to during transcription.
"""

to_language: str | None = None
"""The language of the input audio we translate to.

Expand Down
3 changes: 3 additions & 0 deletions vllm/entrypoints/openai/speech_to_text/speech_to_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,8 @@ async def _preprocess_speech_to_text(
else None
)

hotwords = request.hotwords if request.hotwords else None
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Accessing request.hotwords directly will raise an AttributeError when the request is a TranslationRequest, as the hotwords field is currently only defined in TranscriptionRequest within protocol.py. Since _preprocess_speech_to_text is shared between transcription and translation tasks, you should use getattr to safely access this field.

Suggested change
hotwords = request.hotwords if request.hotwords else None
hotwords = getattr(request, "hotwords", None) or None


if len(audio_data) / 1024**2 > self.max_audio_filesize_mb:
raise VLLMValidationError(
"Maximum file size exceeded",
Expand Down Expand Up @@ -277,6 +279,7 @@ async def _preprocess_speech_to_text(
task_type=self.task_type,
request_prompt=request.prompt,
to_language=to_language,
hotwords=hotwords,
)

parsed_prompt: DictPrompt
Expand Down
1 change: 1 addition & 0 deletions vllm/model_executor/models/cohere_asr.py
Original file line number Diff line number Diff line change
Expand Up @@ -2024,6 +2024,7 @@ def get_generation_prompt(
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: str | None,
hotwords: str | None,
) -> PromptType:
if language is None:
raise ValueError(
Expand Down
1 change: 1 addition & 0 deletions vllm/model_executor/models/fireredasr2.py
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,7 @@ def get_generation_prompt(
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: str | None,
hotwords: str | None,
) -> PromptType:
if language is None:
raise ValueError(
Expand Down
9 changes: 8 additions & 1 deletion vllm/model_executor/models/funasr.py
Original file line number Diff line number Diff line change
Expand Up @@ -884,13 +884,20 @@ def get_generation_prompt(
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: str | None,
hotwords: str | None,
) -> PromptType:
if language is None:
raise ValueError(
"Language must be specified when creating the funasr prompt"
)

funasr_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n语音转写:<|AUDIO|><|im_end|>\n<|im_start|>assistant\n" # noqa: E501
if hotwords is not None:
funasr_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n请结合上下文信息,更加准确地完成语音转写任务。如果没有相关信息,我们会留空。\n\n\n**上下文信息:**\n\n\n热词列表:[{}]\n语音转写:<|AUDIO|><|im_end|>\n<|im_start|>assistant\n".format( # noqa: E501
hotwords
)
else:
funasr_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n语音转写:<|AUDIO|><|im_end|>\n<|im_start|>assistant\n" # noqa: E501

prompt = {
"prompt": funasr_prompt,
"multi_modal_data": {
Expand Down
1 change: 1 addition & 0 deletions vllm/model_executor/models/gemma3n_mm.py
Original file line number Diff line number Diff line change
Expand Up @@ -779,6 +779,7 @@ def get_generation_prompt(
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: str | None,
hotwords: str | None,
) -> PromptType:
"""
Gemma3n supports "free-form" transcription.
Expand Down
1 change: 1 addition & 0 deletions vllm/model_executor/models/glmasr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1140,6 +1140,7 @@ def get_generation_prompt(
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: str | None,
hotwords: str | None,
) -> PromptType:
"""Get the generation prompt to be used for transcription requests."""
tokenizer = cached_tokenizer_from_config(model_config)
Expand Down
1 change: 1 addition & 0 deletions vllm/model_executor/models/granite_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -858,6 +858,7 @@ def get_generation_prompt(
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: str | None,
hotwords: str | None,
) -> PromptType:
"""Get the generation prompt to be used for transcription requests."""
# Audio placeholders don't use an index, so value doesn't matter
Expand Down
1 change: 1 addition & 0 deletions vllm/model_executor/models/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -1113,6 +1113,7 @@ def get_generation_prompt(
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: str | None,
hotwords: str | None,
) -> PromptType:
"""Get the prompt for the ASR model.
The model has control over the construction, as long as it
Expand Down
1 change: 1 addition & 0 deletions vllm/model_executor/models/kimi_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,7 @@ def get_generation_prompt(
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: str | None,
hotwords: str | None,
) -> PromptType:
tokenizer = cached_get_tokenizer(
model_config.tokenizer,
Expand Down
1 change: 1 addition & 0 deletions vllm/model_executor/models/qwen3_asr.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,7 @@ def get_generation_prompt(
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: str | None,
hotwords: str | None,
) -> PromptType:
"""Get the generation prompt to be used for transcription requests."""
tokenizer = cached_tokenizer_from_config(model_config)
Expand Down
1 change: 1 addition & 0 deletions vllm/model_executor/models/qwen3_omni_moe_thinker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2195,6 +2195,7 @@ def get_generation_prompt(
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: str | None,
hotwords: str | None,
) -> PromptType:
"""
Construct a transcription/translation prompt for Qwen3-Omni.
Expand Down
1 change: 1 addition & 0 deletions vllm/model_executor/models/voxtral.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,7 @@ def get_generation_prompt(
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: str | None,
hotwords: str | None,
) -> PromptType:
tokenizer = cached_tokenizer_from_config(model_config)
audio = Audio(audio, int(stt_config.sample_rate), format="wav") # lossless
Expand Down
1 change: 1 addition & 0 deletions vllm/model_executor/models/voxtral_realtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,7 @@ def get_generation_prompt(
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: str | None,
hotwords: str | None,
) -> PromptType:
tokenizer = cached_tokenizer_from_config(model_config)
audio = Audio(audio, int(stt_config.sample_rate), format="wav") # lossless
Expand Down
1 change: 1 addition & 0 deletions vllm/model_executor/models/whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -833,6 +833,7 @@ def get_generation_prompt(
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: str | None,
hotwords: str | None,
) -> PromptType:
if language is None:
raise ValueError(
Expand Down
Loading