From ca656abf2d76f3a41e656b786a254d8918807c53 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 1 Apr 2025 16:06:59 -0700 Subject: [PATCH 01/18] initial Signed-off-by: Roger Wang --- vllm/entrypoints/openai/api_server.py | 33 ++- vllm/entrypoints/openai/protocol.py | 193 ++++++++++++++++ .../openai/serving_transcription.py | 214 ++++++++++++++---- 3 files changed, 392 insertions(+), 48 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index a23736470f66..faced5204b6a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -73,6 +73,8 @@ TokenizeResponse, TranscriptionRequest, TranscriptionResponse, + TranslationRequest, + TranslationResponse, UnloadLoRAAdapterRequest) # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat @@ -88,7 +90,7 @@ from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) from vllm.entrypoints.openai.serving_transcription import ( - OpenAIServingTranscription) + OpenAIServingTranscription, OpenAIServingTranslation) from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.entrypoints.utils import (cli_env_setup, load_aware_call, with_cancellation) @@ -401,6 +403,10 @@ def transcription(request: Request) -> OpenAIServingTranscription: return request.app.state.openai_serving_transcription +def translation(request: Request) -> OpenAIServingTranslation: + return request.app.state.openai_serving_translation + + def engine_client(request: Request) -> EngineClient: return request.app.state.engine_client @@ -774,6 +780,31 @@ async def create_transcriptions(raw_request: Request, return StreamingResponse(content=generator, media_type="text/event-stream") +@router.post("/v1/audio/translations") +@with_cancellation +@load_aware_call +async def create_translations(request: Annotated[TranslationRequest, + Form()], + raw_request: Request): + handler = translation(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Translations API") + + audio_data = await request.file.read() + generator = await handler.create_translation(audio_data, request, + raw_request) + + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + + elif isinstance(generator, TranslationResponse): + return JSONResponse(content=generator.model_dump()) + + return StreamingResponse(content=generator, media_type="text/event-stream") + + @router.post("/rerank", dependencies=[Depends(validate_json_request)], responses={ diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index b278d0d00586..0f92b5393610 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1947,3 +1947,196 @@ class TranscriptionResponseVerbose(OpenAIBaseModel): words: Optional[list[TranscriptionWord]] = None """Extracted words and their corresponding timestamps.""" + + +class TranslationResponseStreamChoice(OpenAIBaseModel): + delta: DeltaMessage + finish_reason: Optional[str] = None + stop_reason: Optional[Union[int, str]] = None + + +class TranslationStreamResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}") + object: Literal["translation.chunk"] = "translation.chunk" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + choices: list[TranslationResponseStreamChoice] + usage: Optional[UsageInfo] = 
Field(default=None)
+
+
+class TranslationRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/audio/createTranslation
+
+    file: UploadFile
+    """
+    The audio file object (not file name) to translate, in one of these
+    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+    """
+
+    model: Optional[str] = None
+    """ID of the model to use.
+    """
+
+    language: Optional[str] = None
+    """The language of the input audio.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+    will improve accuracy and latency.
+    """
+
+    prompt: str = Field(default="")
+    """An optional text to guide the model's style or continue a previous audio
+    segment.
+
+    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+    should match the audio language.
+    """
+
+    response_format: AudioResponseFormat = Field(default="json")
+    """
+    The format of the output, in one of these options: `json`, `text`, `srt`,
+    `verbose_json`, or `vtt`.
+    """
+
+    ## TODO (varun) : Support if set to 0, certain thresholds are met !!
+    temperature: float = Field(default=0.0)
+    """The sampling temperature, between 0 and 1.
+
+    Higher values like 0.8 will make the output more random, while lower values
+    like 0.2 will make it more focused / deterministic. If set to 0, the model
+    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+    to automatically increase the temperature until certain thresholds are hit.
+    """
+
+    timestamp_granularities: list[Literal["word", "segment"]] = Field(
+        alias="timestamp_granularities[]", default=[])
+    """The timestamp granularities to populate for this translation.
+
+    `response_format` must be set to `verbose_json` to use timestamp
+    granularities. Either or both of these options are supported: `word`, or
+    `segment`. Note: There is no additional latency for segment timestamps,
+    but generating word timestamps incurs additional latency.
+    """
+
+    stream: Optional[bool] = False
+    """Custom field not present in the original OpenAI definition. When set,
+    it will enable output to be streamed in a similar fashion as the Chat
+    Completion endpoint.
+    """
+    # Flattened stream option to simplify form data.
+    stream_include_usage: Optional[bool] = False
+    stream_continuous_usage_stats: Optional[bool] = False
+
+    # Default sampling parameters for translation requests.
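+    # Fallback applied by to_sampling_params() when neither the request nor
+    # the server-side defaults provide a temperature.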
+ _DEFAULT_SAMPLING_PARAMS: dict = { + "temperature": 0, + } + + def to_sampling_params( + self, + default_max_tokens: int, + default_sampling_params: Optional[dict] = None) -> SamplingParams: + # TODO(#9845): remove max_tokens when field is removed from OpenAI API + max_tokens = default_max_tokens + + if default_sampling_params is None: + default_sampling_params = {} + # Default parameters + if (temperature := self.temperature) is None: + temperature = default_sampling_params.get( + "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]) + + return SamplingParams.from_optional(temperature=temperature, + max_tokens=max_tokens, + output_kind=RequestOutputKind.DELTA + if self.stream \ + else RequestOutputKind.FINAL_ONLY) + + @model_validator(mode="before") + @classmethod + def validate_stream_options(cls, data): + stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"] + stream = data.get("stream", False) + if any(bool(data.get(so, False)) for so in stream_opts) and not stream: + raise ValueError( + "Stream options can only be defined when `stream=True`.") + + return data + + +# Translation response objects +class TranslationResponse(OpenAIBaseModel): + text: str + """The translated text.""" + + +class TranslationWord(OpenAIBaseModel): + end: float + """End time of the word in seconds.""" + + start: float + """Start time of the word in seconds.""" + + word: str + """The text content of the word.""" + + +class TranslationSegment(OpenAIBaseModel): + id: int + """Unique identifier of the segment.""" + + avg_logprob: float + """Average logprob of the segment. + + If the value is lower than -1, consider the logprobs failed. + """ + + compression_ratio: float + """Compression ratio of the segment. + + If the value is greater than 2.4, consider the compression failed. + """ + + end: float + """End time of the segment in seconds.""" + + no_speech_prob: float + """Probability of no speech in the segment. + + If the value is higher than 1.0 and the `avg_logprob` is below -1, consider + this segment silent. 
+ """ + + seek: int + """Seek offset of the segment.""" + + start: float + """Start time of the segment in seconds.""" + + temperature: float + """Temperature parameter used for generating the segment.""" + + text: str + """Text content of the segment.""" + + tokens: list[int] + """Array of token IDs for the text content.""" + + +class TranslationResponseVerbose(OpenAIBaseModel): + duration: str + """The duration of the input audio.""" + + language: str + """The language of the input audio.""" + + text: str + """The translated text.""" + + segments: Optional[list[TranslationSegment]] = None + """Segments of the translated text and their corresponding details.""" + + words: Optional[list[TranslationWord]] = None + """Extracted words and their corresponding timestamps.""" diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 60d66434ea5a..8ffd579a1487 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -6,7 +6,7 @@ import time from collections.abc import AsyncGenerator from math import ceil -from typing import Final, Optional, Union, cast +from typing import Callable, Optional, Union, cast import numpy as np from fastapi import Request @@ -17,7 +17,8 @@ from vllm.entrypoints.openai.protocol import ( DeltaMessage, ErrorResponse, RequestResponseMetadata, TranscriptionRequest, TranscriptionResponse, TranscriptionResponseStreamChoice, - TranscriptionStreamResponse, UsageInfo) + TranscriptionStreamResponse, TranslationRequest, TranslationResponse, + TranslationResponseStreamChoice, TranslationStreamResponse, UsageInfo) from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.inputs.data import PromptType @@ -33,7 +34,7 @@ logger = init_logger(__name__) -# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages#supported-languages +# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages # TODO these configs should live somewhere with the model so we can support # additional ones @@ -149,16 +150,19 @@ MIN_ENERGY_WINDOW_SIZE = 1600 # 1600 ~ 100ms for 16000 Hz audio -class OpenAIServingTranscription(OpenAIServing): +class OpenAISpeechToText(OpenAIServing): + """Base class for speech-to-text operations like transcription and + translation.""" def __init__( - self, - engine_client: EngineClient, - model_config: ModelConfig, - models: OpenAIServingModels, - *, - request_logger: Optional[RequestLogger], - return_tokens_as_token_ids: bool = False, + self, + engine_client: EngineClient, + model_config: ModelConfig, + models: OpenAIServingModels, + *, + request_logger: Optional[RequestLogger], + return_tokens_as_token_ids: bool = False, + task_type: str = "transcribe", # or "translate" ): super().__init__(engine_client=engine_client, model_config=model_config, @@ -172,15 +176,16 @@ def __init__( self.max_audio_clip_s = processor.feature_extractor.chunk_length self.model_sr = processor.feature_extractor.sampling_rate self.hop_length = processor.feature_extractor.hop_length + self.task_type = task_type if self.default_sampling_params: logger.info( "Overwriting default completion sampling param with: %s", self.default_sampling_params) - async def _preprocess_transcription( + async def _preprocess_speech_to_text( self, - request: TranscriptionRequest, + request: Union[TranscriptionRequest, TranslationRequest], audio_data: bytes, ) -> 
tuple[list[PromptType], float]: # Validate request @@ -221,23 +226,24 @@ async def _preprocess_transcription( }, }, "decoder_prompt": - f"<|startoftranscript|>{lang_token}<|transcribe|><|notimestamps|>{request.prompt}" + (f"<|startoftranscript|>{lang_token}" + f"<|{self.task_type}|><|notimestamps|>{request.prompt}") if i == 0 else "" } prompts.append(cast(PromptType, prompt)) return prompts, duration - # TODO (varun) : Make verbose response work ! - async def create_transcription( - self, audio_data: bytes, request: TranscriptionRequest, - raw_request: Request - ) -> Union[TranscriptionResponse, AsyncGenerator[str, None], - ErrorResponse]: - """Transcription API similar to OpenAI's API. - - See https://platform.openai.com/docs/api-reference/audio/createTranscription - for the API specification. This API mimics the OpenAI transcription API. - """ + async def _create_speech_to_text( + self, + audio_data: bytes, + request: Union[TranscriptionRequest, TranslationRequest], + raw_request: Request, + response_class: Union[TranscriptionResponse, TranslationResponse], + stream_generator_method: Callable, + ) -> Union[Union[TranscriptionResponse, TranslationResponse], + AsyncGenerator[str, None], ErrorResponse]: + """Base method for speech-to-text operations like transcription and + translation.""" error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret @@ -252,7 +258,7 @@ async def create_transcription( return self.create_error_response( "Currently only support response_format `text` or `json`") - request_id = f"trsc-{self._base_request_id(raw_request)}" + request_id = f"{self.task_type}-{self._base_request_id(raw_request)}" request_metadata = RequestResponseMetadata(request_id=request_id) if raw_request: @@ -266,13 +272,14 @@ async def create_transcription( if lora_request: return self.create_error_response( - "Currently do not support LoRA for Transcription.") + "Currently do not support LoRA for " + f"{self.task_type.title()}.") if prompt_adapter_request: return self.create_error_response( - "Currently do not support PromptAdapter for Transcription." - ) + f"Currently do not support PromptAdapter for " + f"{self.task_type.title()}.") - prompts, duration_s = await self._preprocess_transcription( + prompts, duration_s = await self._preprocess_speech_to_text( request=request, audio_data=audio_data, ) @@ -310,11 +317,11 @@ async def create_transcription( return self.create_error_response(str(e)) if request.stream: - return self.transcription_stream_generator(request, - list_result_generator, - request_id, - request_metadata, - duration_s) + return stream_generator_method(request, + list_result_generator, + request_id, + request_metadata, + duration_s) # Non-streaming response. 
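+        # Drain each chunk's generator in order and concatenate the partial
+        # texts into a single response.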
try: assert list_result_generator is not None @@ -322,21 +329,28 @@ async def create_transcription( for result_generator in list_result_generator: async for op in result_generator: text += op.outputs[0].text - return TranscriptionResponse(text=text) + return response_class(text=text) except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - async def transcription_stream_generator( - self, request: TranscriptionRequest, - list_result_generator: list[AsyncGenerator[RequestOutput, None]], - request_id: str, request_metadata: RequestResponseMetadata, - audio_duration_s: float) -> AsyncGenerator[str, None]: + async def _speech_to_text_stream_generator( + self, + request: Union[TranscriptionRequest, TranslationRequest], + list_result_generator: list[AsyncGenerator[RequestOutput, None]], + request_id: str, + request_metadata: RequestResponseMetadata, + audio_duration_s: float, + chunk_object_type: str, + response_stream_choice_class: Union[TranscriptionResponseStreamChoice, + TranslationResponseStreamChoice], + stream_response_class: Union[TranscriptionStreamResponse, + TranslationStreamResponse], + ) -> AsyncGenerator[str, None]: created_time = int(time.time()) model_name = request.model - chunk_object_type: Final = "transcription.chunk" completion_tokens = 0 num_prompt_tokens = 0 @@ -377,16 +391,16 @@ async def transcription_stream_generator( if output.finish_reason is None: # Still generating, send delta update. - choice_data = TranscriptionResponseStreamChoice( + choice_data = response_stream_choice_class( delta=delta_message) else: # Model is finished generating. - choice_data = TranscriptionResponseStreamChoice( + choice_data = response_stream_choice_class( delta=delta_message, finish_reason=output.finish_reason, stop_reason=output.stop_reason) - chunk = TranscriptionStreamResponse( + chunk = stream_response_class( id=request_id, object=chunk_object_type, created=created_time, @@ -412,7 +426,7 @@ async def transcription_stream_generator( total_tokens=num_prompt_tokens + completion_tokens) - final_usage_chunk = TranscriptionStreamResponse( + final_usage_chunk = stream_response_class( id=request_id, object=chunk_object_type, created=created_time, @@ -431,7 +445,7 @@ async def transcription_stream_generator( except Exception as e: # TODO: Use a vllm-specific Validation Error - logger.exception("Error in chat completion stream generator.") + logger.exception("Error in %s stream generator.", self.task_type) data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" # Send the final done message after all response.n are finished @@ -485,3 +499,109 @@ def _find_split_point(self, wav: np.ndarray, start_idx: int, quietest_idx = i + start_idx min_energy = energy return quietest_idx + +class OpenAIServingTranscription(OpenAISpeechToText): + """Handles transcription requests.""" + + def __init__( + self, + engine_client: EngineClient, + model_config: ModelConfig, + models: OpenAIServingModels, + *, + request_logger: Optional[RequestLogger], + return_tokens_as_token_ids: bool = False, + ): + super().__init__(engine_client=engine_client, + model_config=model_config, + models=models, + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids, + task_type="transcribe") + + async def create_transcription( + self, audio_data: bytes, request: TranscriptionRequest, + raw_request: Request + ) -> 
Union[TranscriptionResponse, AsyncGenerator[str, None], + ErrorResponse]: + """Transcription API similar to OpenAI's API. + + See https://platform.openai.com/docs/api-reference/audio/createTranscription + for the API specification. This API mimics the OpenAI transcription API. + """ + return await self._create_speech_to_text( + audio_data=audio_data, + request=request, + raw_request=raw_request, + response_class=TranscriptionResponse, + stream_generator_method=self.transcription_stream_generator, + ) + + async def transcription_stream_generator( + self, request: TranscriptionRequest, + result_generator: AsyncGenerator[RequestOutput, None], + request_id: str, request_metadata: RequestResponseMetadata, + audio_duration_s: float) -> AsyncGenerator[str, None]: + return await self._speech_to_text_stream_generator( + request=request, + result_generator=result_generator, + request_id=request_id, + request_metadata=request_metadata, + audio_duration_s=audio_duration_s, + chunk_object_type="transcription.chunk", + response_stream_choice_class=TranscriptionResponseStreamChoice, + stream_response_class=TranscriptionStreamResponse, + ) + + +class OpenAIServingTranslation(OpenAISpeechToText): + """Handles translation requests.""" + + def __init__( + self, + engine_client: EngineClient, + model_config: ModelConfig, + models: OpenAIServingModels, + *, + request_logger: Optional[RequestLogger], + return_tokens_as_token_ids: bool = False, + ): + super().__init__(engine_client=engine_client, + model_config=model_config, + models=models, + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids, + task_type="translate") + + async def create_translation( + self, audio_data: bytes, request: TranslationRequest, + raw_request: Request + ) -> Union[TranslationResponse, AsyncGenerator[str, None], ErrorResponse]: + """Translation API similar to OpenAI's API. + + See https://platform.openai.com/docs/api-reference/audio/createTranslation + for the API specification. This API mimics the OpenAI translation API. 
+ """ + return await self._create_speech_to_text( + audio_data=audio_data, + request=request, + raw_request=raw_request, + response_class=TranslationResponse, + stream_generator_method=self.translation_stream_generator, + ) + + async def translation_stream_generator( + self, request: TranslationRequest, + result_generator: AsyncGenerator[RequestOutput, None], + request_id: str, request_metadata: RequestResponseMetadata, + audio_duration_s: float) -> AsyncGenerator[str, None]: + return await self._speech_to_text_stream_generator( + request=request, + result_generator=result_generator, + request_id=request_id, + request_metadata=request_metadata, + audio_duration_s=audio_duration_s, + chunk_object_type="translation.chunk", + response_stream_choice_class=TranslationResponseStreamChoice, + stream_response_class=TranslationStreamResponse, + ) From 9bd4692cdd786ecaa44d1dc5bd3dab9f72f54f32 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Thu, 12 Jun 2025 10:40:06 +0000 Subject: [PATCH 02/18] separate openai s2t Signed-off-by: NickLucche --- vllm/entrypoints/openai/api_server.py | 24 +- .../openai/serving_transcription.py | 482 +----------------- vllm/entrypoints/openai/speech_to_text.py | 436 ++++++++++++++++ 3 files changed, 463 insertions(+), 479 deletions(-) create mode 100644 vllm/entrypoints/openai/speech_to_text.py diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index faced5204b6a..681633a2aff7 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -780,7 +780,23 @@ async def create_transcriptions(raw_request: Request, return StreamingResponse(content=generator, media_type="text/event-stream") -@router.post("/v1/audio/translations") +@router.post("/v1/audio/translations", + responses={ + HTTPStatus.OK.value: { + "content": { + "text/event-stream": {} + } + }, + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.UNPROCESSABLE_ENTITY.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation @load_aware_call async def create_translations(request: Annotated[TranslationRequest, @@ -1279,6 +1295,12 @@ async def init_app_state( state.openai_serving_models, request_logger=request_logger, ) if model_config.runner_type == "transcription" else None + state.openai_serving_translation = OpenAIServingTranslation( + engine_client, + model_config, + state.openai_serving_models, + request_logger=request_logger, + ) if model_config.runner_type == "transcription" else None state.task = model_config.task state.enable_server_load_tracking = args.enable_server_load_tracking diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 8ffd579a1487..30dcbd0f6a3b 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -5,8 +5,7 @@ import math import time from collections.abc import AsyncGenerator -from math import ceil -from typing import Callable, Optional, Union, cast +from typing import Optional, Union import numpy as np from fastapi import Request @@ -15,490 +14,17 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( - DeltaMessage, ErrorResponse, RequestResponseMetadata, TranscriptionRequest, + ErrorResponse, RequestResponseMetadata, TranscriptionRequest, TranscriptionResponse, TranscriptionResponseStreamChoice, 
TranscriptionStreamResponse, TranslationRequest, TranslationResponse, - TranslationResponseStreamChoice, TranslationStreamResponse, UsageInfo) -from vllm.entrypoints.openai.serving_engine import OpenAIServing + TranslationResponseStreamChoice, TranslationStreamResponse) from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from vllm.inputs.data import PromptType +from vllm.entrypoints.openai.speech_to_text import OpenAISpeechToText from vllm.logger import init_logger from vllm.outputs import RequestOutput -from vllm.transformers_utils.processor import cached_get_processor -from vllm.utils import PlaceholderModule - -try: - import librosa -except ImportError: - librosa = PlaceholderModule("librosa") # type: ignore[assignment] logger = init_logger(__name__) -# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages -# TODO these configs should live somewhere with the model so we can support -# additional ones - -ISO639_1_SUPPORTED_LANGS = { - "af": "Afrikaans", - "ar": "Arabic", - "hy": "Armenian", - "az": "Azerbaijani", - "be": "Belarusian", - "bs": "Bosnian", - "bg": "Bulgarian", - "ca": "Catalan", - "zh": "Chinese", - "hr": "Croatian", - "cs": "Czech", - "da": "Danish", - "nl": "Dutch", - "en": "English", - "et": "Estonian", - "fi": "Finnish", - "fr": "French", - "gl": "Galician", - "de": "German", - "el": "Greek", - "he": "Hebrew", - "hi": "Hindi", - "hu": "Hungarian", - "is": "Icelandic", - "id": "Indonesian", - "it": "Italian", - "ja": "Japanese", - "kn": "Kannada", - "kk": "Kazakh", - "ko": "Korean", - "lv": "Latvian", - "lt": "Lithuanian", - "mk": "Macedonian", - "ms": "Malay", - "mr": "Marathi", - "mi": "Maori", - "ne": "Nepali", - "no": "Norwegian", - "fa": "Persian", - "pl": "Polish", - "pt": "Portuguese", - "ro": "Romanian", - "ru": "Russian", - "sr": "Serbian", - "sk": "Slovak", - "sl": "Slovenian", - "es": "Spanish", - "sw": "Swahili", - "sv": "Swedish", - "tl": "Tagalog", - "ta": "Tamil", - "th": "Thai", - "tr": "Turkish", - "uk": "Ukrainian", - "ur": "Urdu", - "vi": "Vietnamese", - "cy": "Welsh" -} -ISO639_1_OTHER_LANGS = { - "lo": "Lao", - "jw": "Javanese", - "tk": "Turkmen", - "yi": "Yiddish", - "so": "Somali", - "bn": "Bengali", - "nn": "Norwegian Nynorsk", - "si": "Sinhala", - "yo": "Yoruba", - "sa": "Sanskrit", - "mi": "Māori", - "fo": "Faroese", # codespell:ignore - "mt": "Maltese", - "tg": "Tajik", - "mg": "Malagasy", - "haw": "Hawaiian", - "km": "Khmer", - "br": "Breton", - "ps": "Pashto", - "ln": "Lingala", - "la": "Latin", - "ml": "Malayalam", - "sq": "Albanian", - "su": "Sundanese", - "eu": "Basque", - "ka": "Georgian", - "uz": "Uzbek", - "sn": "Shona", - "ht": "Haitian", - "as": "Assamese", - "mn": "Mongolian", - "te": "Telugu", - "pa": "Panjabi", - "tt": "Tatar", - "gu": "Gujarati", - "oc": "Occitan", - "ha": "Hausa", - "ba": "Bashkir", - "my": "Burmese", - "sd": "Sindhi", - "am": "Amharic", - "lb": "Luxembourgish", - "bo": "Tibetan" -} - -# As per https://platform.openai.com/docs/guides/speech-to-text#overview. 
-# TODO configurable -MAX_AUDIO_CLIP_FILESIZE_MB = 25 -OVERLAP_CHUNK_SECOND = 1 -MIN_ENERGY_WINDOW_SIZE = 1600 # 1600 ~ 100ms for 16000 Hz audio - - -class OpenAISpeechToText(OpenAIServing): - """Base class for speech-to-text operations like transcription and - translation.""" - - def __init__( - self, - engine_client: EngineClient, - model_config: ModelConfig, - models: OpenAIServingModels, - *, - request_logger: Optional[RequestLogger], - return_tokens_as_token_ids: bool = False, - task_type: str = "transcribe", # or "translate" - ): - super().__init__(engine_client=engine_client, - model_config=model_config, - models=models, - request_logger=request_logger, - return_tokens_as_token_ids=return_tokens_as_token_ids) - - self.default_sampling_params = ( - self.model_config.get_diff_sampling_param()) - processor = cached_get_processor(model_config.model) - self.max_audio_clip_s = processor.feature_extractor.chunk_length - self.model_sr = processor.feature_extractor.sampling_rate - self.hop_length = processor.feature_extractor.hop_length - self.task_type = task_type - - if self.default_sampling_params: - logger.info( - "Overwriting default completion sampling param with: %s", - self.default_sampling_params) - - async def _preprocess_speech_to_text( - self, - request: Union[TranscriptionRequest, TranslationRequest], - audio_data: bytes, - ) -> tuple[list[PromptType], float]: - # Validate request - # TODO language should be optional and can be guessed. - # For now we default to en. See - # https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/generation_whisper.py#L1520 - lang_token = f"<|{request.language}|>" if request.language else "<|en|>" - if request.language: - if request.language in ISO639_1_SUPPORTED_LANGS: - pass - elif request.language in ISO639_1_OTHER_LANGS: - logger.warning( - "The selected language %s has limited accuracy with" - " reported WER>=0.5. Results may be less accurate " - "for this choice.", request.language) - else: - raise ValueError( - f"Unsupported language: {request.language}." - "Language should be one of:" + - f" {list(ISO639_1_SUPPORTED_LANGS.values())}" + - f"or {list(ISO639_1_OTHER_LANGS.values())}") - - if len(audio_data) / 1024**2 > MAX_AUDIO_CLIP_FILESIZE_MB: - raise ValueError("Maximum file size exceeded.") - - with io.BytesIO(audio_data) as bytes_: - y, sr = librosa.load(bytes_) - - duration = librosa.get_duration(y=y, sr=sr) - chunks = [y] if duration < 30 else self._split_audio(y, sr) - prompts = [] - for i, chunk in enumerate(chunks): - prompt = { - "encoder_prompt": { - "prompt": "", - "multi_modal_data": { - "audio": (chunk, sr), - }, - }, - "decoder_prompt": - (f"<|startoftranscript|>{lang_token}" - f"<|{self.task_type}|><|notimestamps|>{request.prompt}") - if i == 0 else "" - } - prompts.append(cast(PromptType, prompt)) - return prompts, duration - - async def _create_speech_to_text( - self, - audio_data: bytes, - request: Union[TranscriptionRequest, TranslationRequest], - raw_request: Request, - response_class: Union[TranscriptionResponse, TranslationResponse], - stream_generator_method: Callable, - ) -> Union[Union[TranscriptionResponse, TranslationResponse], - AsyncGenerator[str, None], ErrorResponse]: - """Base method for speech-to-text operations like transcription and - translation.""" - error_check_ret = await self._check_model(request) - if error_check_ret is not None: - return error_check_ret - - # If the engine is dead, raise the engine's DEAD_ERROR. 
- # This is required for the streaming case, where we return a - # success status before we actually start generating text :). - if self.engine_client.errored: - raise self.engine_client.dead_error - - if request.response_format not in ['text', 'json']: - return self.create_error_response( - "Currently only support response_format `text` or `json`") - - request_id = f"{self.task_type}-{self._base_request_id(raw_request)}" - - request_metadata = RequestResponseMetadata(request_id=request_id) - if raw_request: - raw_request.state.request_metadata = request_metadata - - try: - ( - lora_request, - prompt_adapter_request, - ) = self._maybe_get_adapters(request) - - if lora_request: - return self.create_error_response( - "Currently do not support LoRA for " - f"{self.task_type.title()}.") - if prompt_adapter_request: - return self.create_error_response( - f"Currently do not support PromptAdapter for " - f"{self.task_type.title()}.") - - prompts, duration_s = await self._preprocess_speech_to_text( - request=request, - audio_data=audio_data, - ) - - except ValueError as e: - logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(str(e)) - - list_result_generator: Optional[list[AsyncGenerator[RequestOutput, - None]]] = None - try: - # Unlike most decoder-only models, whisper generation length is not - # constrained by the size of the input audio, which is mapped to a - # fixed-size log-mel-spectogram. - default_max_tokens = self.model_config.max_model_len - sampling_params = request.to_sampling_params( - default_max_tokens, self.default_sampling_params) - - self._log_inputs( - request_id, - prompts[0]['decoder_prompt'], # type: ignore - params=sampling_params, - lora_request=None, - prompt_adapter_request=None) - - list_result_generator = [ - self.engine_client.generate( - prompt, - sampling_params, - request_id, - ) for prompt in prompts - ] - except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) - - if request.stream: - return stream_generator_method(request, - list_result_generator, - request_id, - request_metadata, - duration_s) - # Non-streaming response. 
- try: - assert list_result_generator is not None - text = "" - for result_generator in list_result_generator: - async for op in result_generator: - text += op.outputs[0].text - return response_class(text=text) - except asyncio.CancelledError: - return self.create_error_response("Client disconnected") - except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) - - async def _speech_to_text_stream_generator( - self, - request: Union[TranscriptionRequest, TranslationRequest], - list_result_generator: list[AsyncGenerator[RequestOutput, None]], - request_id: str, - request_metadata: RequestResponseMetadata, - audio_duration_s: float, - chunk_object_type: str, - response_stream_choice_class: Union[TranscriptionResponseStreamChoice, - TranslationResponseStreamChoice], - stream_response_class: Union[TranscriptionStreamResponse, - TranslationStreamResponse], - ) -> AsyncGenerator[str, None]: - created_time = int(time.time()) - model_name = request.model - - completion_tokens = 0 - num_prompt_tokens = 0 - - include_usage = request.stream_include_usage \ - if request.stream_include_usage else False - include_continuous_usage = request.stream_continuous_usage_stats\ - if include_usage and request.stream_continuous_usage_stats\ - else False - - try: - for result_generator in list_result_generator: - async for res in result_generator: - # On first result. - if res.prompt_token_ids is not None: - # Do not account the 4-tokens `<|startoftranscript|>..` - # Could be negative when language token - # is not specified. - num_prompt_tokens = max( - len(res.prompt_token_ids) - 4, 0) - # NOTE(NickLucche) user can't pass encoder - # prompts directly at least not to Whisper. - # One indicator of the encoder amount of processing - # is the log-mel spectogram length. - num_prompt_tokens += ceil( - audio_duration_s * self.model_sr / self.hop_length) - - # We need to do it here, because if there are exceptions in - # the result_generator, it needs to be sent as the FIRST - # response (by the try...catch). - - # Just one output (n=1) supported. - assert len(res.outputs) == 1 - output = res.outputs[0] - - delta_message = DeltaMessage(content=output.text) - completion_tokens += len(output.token_ids) - - if output.finish_reason is None: - # Still generating, send delta update. - choice_data = response_stream_choice_class( - delta=delta_message) - else: - # Model is finished generating. - choice_data = response_stream_choice_class( - delta=delta_message, - finish_reason=output.finish_reason, - stop_reason=output.stop_reason) - - chunk = stream_response_class( - id=request_id, - object=chunk_object_type, - created=created_time, - choices=[choice_data], - model=model_name) - - # handle usage stats if requested & if continuous - if include_continuous_usage: - chunk.usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=num_prompt_tokens + completion_tokens, - ) - - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" - - # Once the final token is handled, if stream_options.include_usage - # is sent, send the usage. 
- if include_usage: - final_usage = UsageInfo(prompt_tokens=num_prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=num_prompt_tokens + - completion_tokens) - - final_usage_chunk = stream_response_class( - id=request_id, - object=chunk_object_type, - created=created_time, - choices=[], - model=model_name, - usage=final_usage) - final_usage_data = (final_usage_chunk.model_dump_json( - exclude_unset=True, exclude_none=True)) - yield f"data: {final_usage_data}\n\n" - - # report to FastAPI middleware aggregate usage across all choices - request_metadata.final_usage_info = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=num_prompt_tokens + completion_tokens) - - except Exception as e: - # TODO: Use a vllm-specific Validation Error - logger.exception("Error in %s stream generator.", self.task_type) - data = self.create_streaming_error_response(str(e)) - yield f"data: {data}\n\n" - # Send the final done message after all response.n are finished - yield "data: [DONE]\n\n" - - def _split_audio(self, audio_data: np.ndarray, - sample_rate: int) -> list[np.ndarray]: - chunk_size = sample_rate * self.max_audio_clip_s - overlap_size = sample_rate * OVERLAP_CHUNK_SECOND - chunks = [] - i = 0 - while i < audio_data.shape[-1]: - if i + chunk_size >= audio_data.shape[-1]: - # handle last chunk - chunks.append(audio_data[..., i:]) - break - - # Find the best split point in the overlap region - search_start = i + chunk_size - overlap_size - search_end = min(i + chunk_size, audio_data.shape[-1]) - split_point = self._find_split_point(audio_data, search_start, - search_end) - - # Extract chunk up to the split point - chunks.append(audio_data[..., i:split_point]) - i = split_point - return chunks - - def _find_split_point(self, wav: np.ndarray, start_idx: int, - end_idx: int) -> int: - """Find the best point to split audio by - looking for silence or low amplitude. 
- Args: - wav: Audio tensor [1, T] - start_idx: Start index of search region - end_idx: End index of search region - Returns: - Index of best splitting point - """ - segment = wav[start_idx:end_idx] - - # Calculate RMS energy in small windows - min_energy = math.inf - quietest_idx = 0 - for i in range(0, - len(segment) - MIN_ENERGY_WINDOW_SIZE, - MIN_ENERGY_WINDOW_SIZE): - window = segment[i:i + MIN_ENERGY_WINDOW_SIZE] - energy = (window**2).mean()**0.5 - if energy < min_energy: - quietest_idx = i + start_idx - min_energy = energy - return quietest_idx class OpenAIServingTranscription(OpenAISpeechToText): """Handles transcription requests.""" diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py new file mode 100644 index 000000000000..9553fcbf52eb --- /dev/null +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -0,0 +1,436 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +import io +import time +from collections.abc import AsyncGenerator +from math import ceil +from typing import Callable, Optional, Union, cast + +from fastapi import Request + +from vllm.config import ModelConfig +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import ( + DeltaMessage, ErrorResponse, RequestResponseMetadata, TranscriptionRequest, + TranscriptionResponse, TranscriptionResponseStreamChoice, + TranscriptionStreamResponse, TranslationRequest, TranslationResponse, + TranslationResponseStreamChoice, TranslationStreamResponse, UsageInfo) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.inputs.data import PromptType +from vllm.logger import init_logger +from vllm.outputs import RequestOutput +from vllm.transformers_utils.processor import cached_get_processor +from vllm.utils import PlaceholderModule + +try: + import librosa +except ImportError: + librosa = PlaceholderModule("librosa") # type: ignore[assignment] + +logger = init_logger(__name__) + +# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages +# TODO these configs should live somewhere with the model so we can support +# additional ones + +ISO639_1_SUPPORTED_LANGS = { + "af": "Afrikaans", + "ar": "Arabic", + "hy": "Armenian", + "az": "Azerbaijani", + "be": "Belarusian", + "bs": "Bosnian", + "bg": "Bulgarian", + "ca": "Catalan", + "zh": "Chinese", + "hr": "Croatian", + "cs": "Czech", + "da": "Danish", + "nl": "Dutch", + "en": "English", + "et": "Estonian", + "fi": "Finnish", + "fr": "French", + "gl": "Galician", + "de": "German", + "el": "Greek", + "he": "Hebrew", + "hi": "Hindi", + "hu": "Hungarian", + "is": "Icelandic", + "id": "Indonesian", + "it": "Italian", + "ja": "Japanese", + "kn": "Kannada", + "kk": "Kazakh", + "ko": "Korean", + "lv": "Latvian", + "lt": "Lithuanian", + "mk": "Macedonian", + "ms": "Malay", + "mr": "Marathi", + "mi": "Maori", + "ne": "Nepali", + "no": "Norwegian", + "fa": "Persian", + "pl": "Polish", + "pt": "Portuguese", + "ro": "Romanian", + "ru": "Russian", + "sr": "Serbian", + "sk": "Slovak", + "sl": "Slovenian", + "es": "Spanish", + "sw": "Swahili", + "sv": "Swedish", + "tl": "Tagalog", + "ta": "Tamil", + "th": "Thai", + "tr": "Turkish", + "uk": "Ukrainian", + "ur": "Urdu", + "vi": "Vietnamese", + "cy": "Welsh" +} +ISO639_1_OTHER_LANGS = { + "lo": "Lao", + "jw": "Javanese", + "tk": "Turkmen", + 
"yi": "Yiddish", + "so": "Somali", + "bn": "Bengali", + "nn": "Norwegian Nynorsk", + "si": "Sinhala", + "yo": "Yoruba", + "sa": "Sanskrit", + "mi": "Māori", + "fo": "Faroese", # codespell:ignore + "mt": "Maltese", + "tg": "Tajik", + "mg": "Malagasy", + "haw": "Hawaiian", + "km": "Khmer", + "br": "Breton", + "ps": "Pashto", + "ln": "Lingala", + "la": "Latin", + "ml": "Malayalam", + "sq": "Albanian", + "su": "Sundanese", + "eu": "Basque", + "ka": "Georgian", + "uz": "Uzbek", + "sn": "Shona", + "ht": "Haitian", + "as": "Assamese", + "mn": "Mongolian", + "te": "Telugu", + "pa": "Panjabi", + "tt": "Tatar", + "gu": "Gujarati", + "oc": "Occitan", + "ha": "Hausa", + "ba": "Bashkir", + "my": "Burmese", + "sd": "Sindhi", + "am": "Amharic", + "lb": "Luxembourgish", + "bo": "Tibetan" +} + +# As per https://platform.openai.com/docs/guides/speech-to-text#overview. +# TODO configurable +MAX_AUDIO_CLIP_FILESIZE_MB = 25 + + +class OpenAISpeechToText(OpenAIServing): + """Base class for speech-to-text operations like transcription and + translation.""" + + def __init__( + self, + engine_client: EngineClient, + model_config: ModelConfig, + models: OpenAIServingModels, + *, + request_logger: Optional[RequestLogger], + return_tokens_as_token_ids: bool = False, + task_type: str = "transcribe", # or "translate" + ): + super().__init__(engine_client=engine_client, + model_config=model_config, + models=models, + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids) + + self.default_sampling_params = ( + self.model_config.get_diff_sampling_param()) + processor = cached_get_processor(model_config.model) + self.max_audio_clip_s = processor.feature_extractor.chunk_length + self.model_sr = processor.feature_extractor.sampling_rate + self.hop_length = processor.feature_extractor.hop_length + self.task_type = task_type + + if self.default_sampling_params: + logger.info( + "Overwriting default completion sampling param with: %s", + self.default_sampling_params) + + async def _preprocess_speech_to_text( + self, + request: Union[TranscriptionRequest, TranslationRequest], + audio_data: bytes, + ) -> tuple[PromptType, float]: + # Validate request + # TODO language should be optional and can be guessed. + # For now we default to en. See + # https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/generation_whisper.py#L1520 + lang_token = f"<|{request.language}|>" if request.language else "<|en|>" + if request.language: + if request.language in ISO639_1_SUPPORTED_LANGS: + pass + elif request.language in ISO639_1_OTHER_LANGS: + logger.warning( + "The selected language %s has limited accuracy with" + " reported WER>=0.5. Results may be less accurate " + "for this choice.", request.language) + else: + raise ValueError( + f"Unsupported language: {request.language}." 
+ "Language should be one of:" + + f" {list(ISO639_1_SUPPORTED_LANGS.values())}" + + f"or {list(ISO639_1_OTHER_LANGS.values())}") + + if len(audio_data) / 1024**2 > MAX_AUDIO_CLIP_FILESIZE_MB: + raise ValueError("Maximum file size exceeded.") + + with io.BytesIO(audio_data) as bytes_: + y, sr = librosa.load(bytes_) + + duration = librosa.get_duration(y=y, sr=sr) + if duration > self.max_audio_clip_s: + raise ValueError( + f"Maximum clip duration ({self.max_audio_clip_s}s) " + "exceeded.") + + prompt = { + "encoder_prompt": { + "prompt": "", + "multi_modal_data": { + "audio": (y, sr), + }, + }, + "decoder_prompt": + (f"<|startoftranscript|>{lang_token}" + f"<|{self.task_type}|><|notimestamps|>{request.prompt}") + } + return cast(PromptType, prompt), duration + + async def _create_speech_to_text( + self, + audio_data: bytes, + request: Union[TranscriptionRequest, TranslationRequest], + raw_request: Request, + response_class: Union[TranscriptionResponse, TranslationResponse], + stream_generator_method: Callable, + ) -> Union[Union[TranscriptionResponse, TranslationResponse], + AsyncGenerator[str, None], ErrorResponse]: + """Base method for speech-to-text operations like transcription and + translation.""" + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + # If the engine is dead, raise the engine's DEAD_ERROR. + # This is required for the streaming case, where we return a + # success status before we actually start generating text :). + if self.engine_client.errored: + raise self.engine_client.dead_error + + if request.response_format not in ['text', 'json']: + return self.create_error_response( + "Currently only support response_format `text` or `json`") + + request_id = f"{self.task_type}-{self._base_request_id(raw_request)}" + + request_metadata = RequestResponseMetadata(request_id=request_id) + if raw_request: + raw_request.state.request_metadata = request_metadata + + try: + ( + lora_request, + prompt_adapter_request, + ) = self._maybe_get_adapters(request) + + if lora_request: + return self.create_error_response( + "Currently do not support LoRA for " + f"{self.task_type.title()}.") + if prompt_adapter_request: + return self.create_error_response( + f"Currently do not support PromptAdapter for " + f"{self.task_type.title()}.") + + prompt, duration_s = await self._preprocess_speech_to_text( + request=request, + audio_data=audio_data, + ) + + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) + + result_generator: Optional[AsyncGenerator[RequestOutput, None]] = None + try: + # Unlike most decoder-only models, whisper generation length is not + # constrained by the size of the input audio, which is mapped to a + # fixed-size log-mel-spectogram. + default_max_tokens = self.model_config.max_model_len + sampling_params = request.to_sampling_params( + default_max_tokens, self.default_sampling_params) + + self._log_inputs( + request_id, + prompt['decoder_prompt'], # type: ignore + params=sampling_params, + lora_request=None, + prompt_adapter_request=None) + + result_generator = self.engine_client.generate( + prompt, + sampling_params, + request_id, + ) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + if request.stream: + return stream_generator_method(request, result_generator, + request_id, request_metadata, + duration_s) + # Non-streaming response. 
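+        # stream=False selects RequestOutputKind.FINAL_ONLY, so the generator
+        # yields a single RequestOutput carrying the complete text.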
+ try: + assert result_generator is not None + async for op in result_generator: + result = op + return response_class(text=result.outputs[0].text) + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + async def _speech_to_text_stream_generator( + self, + request: Union[TranscriptionRequest, TranslationRequest], + result_generator: AsyncGenerator[RequestOutput, None], + request_id: str, + request_metadata: RequestResponseMetadata, + audio_duration_s: float, + chunk_object_type: str, + response_stream_choice_class: Union[TranscriptionResponseStreamChoice, + TranslationResponseStreamChoice], + stream_response_class: Union[TranscriptionStreamResponse, + TranslationStreamResponse], + ) -> AsyncGenerator[str, None]: + created_time = int(time.time()) + model_name = request.model + + completion_tokens = 0 + num_prompt_tokens = 0 + + include_usage = request.stream_include_usage \ + if request.stream_include_usage else False + include_continuous_usage = request.stream_continuous_usage_stats\ + if include_usage and request.stream_continuous_usage_stats\ + else False + + try: + async for res in result_generator: + # On first result. + if res.prompt_token_ids is not None: + # Do not account the 4-tokens `<|startoftranscript|>..` + # Could be negative when language token is not specified. + num_prompt_tokens = max(len(res.prompt_token_ids) - 4, 0) + # NOTE(NickLucche) user can't pass encoder prompts directly + # at least not to Whisper. One indicator of the encoder + # amount of processing is the log-mel spectogram length. + num_prompt_tokens += ceil(audio_duration_s * + self.model_sr / self.hop_length) + + # We need to do it here, because if there are exceptions in + # the result_generator, it needs to be sent as the FIRST + # response (by the try...catch). + + # Just one output (n=1) supported. + assert len(res.outputs) == 1 + output = res.outputs[0] + + delta_message = DeltaMessage(content=output.text) + completion_tokens += len(output.token_ids) + + if output.finish_reason is None: + # Still generating, send delta update. + choice_data = response_stream_choice_class( + delta=delta_message) + else: + # Model is finished generating. + choice_data = response_stream_choice_class( + delta=delta_message, + finish_reason=output.finish_reason, + stop_reason=output.stop_reason) + + chunk = stream_response_class(id=request_id, + object=chunk_object_type, + created=created_time, + choices=[choice_data], + model=model_name) + + # handle usage stats if requested & if continuous + if include_continuous_usage: + chunk.usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=num_prompt_tokens + completion_tokens, + ) + + data = chunk.model_dump_json(exclude_unset=True) + yield f"data: {data}\n\n" + + # Once the final token is handled, if stream_options.include_usage + # is sent, send the usage. 
+ if include_usage: + final_usage = UsageInfo(prompt_tokens=num_prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=num_prompt_tokens + + completion_tokens) + + final_usage_chunk = stream_response_class( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[], + model=model_name, + usage=final_usage) + final_usage_data = (final_usage_chunk.model_dump_json( + exclude_unset=True, exclude_none=True)) + yield f"data: {final_usage_data}\n\n" + + # report to FastAPI middleware aggregate usage across all choices + request_metadata.final_usage_info = UsageInfo( + prompt_tokens=num_prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=num_prompt_tokens + completion_tokens) + + except Exception as e: + # TODO: Use a vllm-specific Validation Error + logger.exception("Error in %s stream generator.", self.task_type) + data = self.create_streaming_error_response(str(e)) + yield f"data: {data}\n\n" + # Send the final done message after all response.n are finished + yield "data: [DONE]\n\n" \ No newline at end of file From 13e25026122c6331b48e432cddc521d60d8058d9 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Fri, 13 Jun 2025 09:27:21 +0000 Subject: [PATCH 03/18] minor Signed-off-by: NickLucche --- vllm/entrypoints/openai/protocol.py | 1 + vllm/entrypoints/openai/speech_to_text.py | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 0f92b5393610..ea21200af1cd 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1984,6 +1984,7 @@ class TranslationRequest(OpenAIBaseModel): Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format will improve accuracy and latency. + This is a custom field not present in the openai specification. 
""" prompt: str = Field(default="") diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index 9553fcbf52eb..d6c346c71b5f 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -151,15 +151,16 @@ class OpenAISpeechToText(OpenAIServing): translation.""" def __init__( - self, - engine_client: EngineClient, - model_config: ModelConfig, - models: OpenAIServingModels, - *, - request_logger: Optional[RequestLogger], - return_tokens_as_token_ids: bool = False, - task_type: str = "transcribe", # or "translate" + self, + engine_client: EngineClient, + model_config: ModelConfig, + models: OpenAIServingModels, + *, + request_logger: Optional[RequestLogger], + return_tokens_as_token_ids: bool = False, + task_type: str = "transcribe", ): + assert task_type in ["transcribe", "translate"] super().__init__(engine_client=engine_client, model_config=model_config, models=models, @@ -433,4 +434,4 @@ async def _speech_to_text_stream_generator( data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" # Send the final done message after all response.n are finished - yield "data: [DONE]\n\n" \ No newline at end of file + yield "data: [DONE]\n\n" From 60bbae1417cb97596cdaf50286106033452d43c4 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Fri, 13 Jun 2025 15:45:02 +0000 Subject: [PATCH 04/18] fix streaming Signed-off-by: NickLucche --- vllm/entrypoints/openai/serving_transcription.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 30dcbd0f6a3b..ae31b8ba44e3 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -68,7 +68,7 @@ async def transcription_stream_generator( result_generator: AsyncGenerator[RequestOutput, None], request_id: str, request_metadata: RequestResponseMetadata, audio_duration_s: float) -> AsyncGenerator[str, None]: - return await self._speech_to_text_stream_generator( + generator = self._speech_to_text_stream_generator( request=request, result_generator=result_generator, request_id=request_id, @@ -78,6 +78,8 @@ async def transcription_stream_generator( response_stream_choice_class=TranscriptionResponseStreamChoice, stream_response_class=TranscriptionStreamResponse, ) + async for chunk in generator: + yield chunk class OpenAIServingTranslation(OpenAISpeechToText): @@ -121,7 +123,7 @@ async def translation_stream_generator( result_generator: AsyncGenerator[RequestOutput, None], request_id: str, request_metadata: RequestResponseMetadata, audio_duration_s: float) -> AsyncGenerator[str, None]: - return await self._speech_to_text_stream_generator( + generator = self._speech_to_text_stream_generator( request=request, result_generator=result_generator, request_id=request_id, @@ -131,3 +133,5 @@ async def translation_stream_generator( response_stream_choice_class=TranslationResponseStreamChoice, stream_response_class=TranslationStreamResponse, ) + async for chunk in generator: + yield chunk From 04454610d149a7c016dcf1aec115ceb03345f463 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Fri, 13 Jun 2025 16:08:55 +0000 Subject: [PATCH 05/18] docs Signed-off-by: NickLucche --- docs/serving/openai_compatible_server.md | 30 ++++++++ .../openai_transcription_client.py | 2 +- .../openai_translation_client.py | 75 +++++++++++++++++++ vllm/entrypoints/openai/protocol.py | 27 +++---- 4 files changed, 116 
insertions(+), 18 deletions(-)
 create mode 100644 examples/online_serving/openai_translation_client.py

diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index 7862778464dd..c161d32cae14 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -57,6 +57,8 @@ We currently support the following OpenAI APIs:
   - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`).
 - [Transcriptions API][transcriptions-api] (`/v1/audio/transcriptions`)
   - Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`).
+- [Translations API][translations-api] (`/v1/audio/translations`)
+  - Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`).
 
 In addition, we have the following custom APIs:
 
@@ -374,6 +376,34 @@ The following extra parameters are supported:
 ```python
 --8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
 ```
+
+[](){ #translations-api }
+
+### Translations API
+
+Our Translations API is compatible with [OpenAI's Translations API](https://platform.openai.com/docs/api-reference/audio/createTranslation);
+you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
+Whisper models can translate audio from any of the 55 supported non-English languages into English.
+Note that the popular `openai/whisper-large-v3-turbo` model does not support translation.
+
+!!! note
+    To use the Translations API, please install with extra audio dependencies using `pip install vllm[audio]`.
+
+Code example: <gh-file:examples/online_serving/openai_translation_client.py>
+
+#### Extra Parameters
+
+The following [sampling parameters][sampling-params] are supported.
+
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:translation-sampling-params"
+```
+
+The following extra parameters are supported:
+
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:translation-extra-params"
+```
 
 [](){ #tokenizer-api }
 
diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py
index ae43cb5da790..e2d2ad8908b8 100644
--- a/examples/online_serving/openai_transcription_client.py
+++ b/examples/online_serving/openai_transcription_client.py
@@ -66,7 +66,7 @@ async def stream_openai_response():
     data = {
         "language": "en",
         "stream": True,
-        "model": "openai/whisper-large-v3",
+        "model": "openai/whisper-large-v3-turbo",
     }
     url = openai_api_base + "/audio/transcriptions"
     headers = {"Authorization": f"Bearer {openai_api_key}"}
diff --git a/examples/online_serving/openai_translation_client.py b/examples/online_serving/openai_translation_client.py
new file mode 100644
index 000000000000..44a0dfde1ead
--- /dev/null
+++ b/examples/online_serving/openai_translation_client.py
@@ -0,0 +1,75 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import json
+
+import httpx
+from openai import OpenAI
+
+from vllm.assets.audio import AudioAsset
+
+# TODO get actual asset
+mary_had_lamb = AudioAsset("mary_had_lamb").get_local_path()
+winning_call = AudioAsset("winning_call").get_local_path()
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
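+# vLLM only checks the API key when the server is started with --api-key;
+# "EMPTY" is a conventional placeholder.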
+openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + + +def sync_openai(): + with open(str(mary_had_lamb), "rb") as f: + translation = client.audio.translations.create( + file=f, + model="openai/whisper-large-v3", + response_format="json", + temperature=0.0, + # Additional sampling params not provided by OpenAI API. + extra_body=dict( + language="it", + seed=4419, + repetition_penalty=1.3, + ), + ) + print("translation result:", translation.text) + + +sync_openai() + + +# OpenAI translation API client does not support streaming. +async def stream_openai_response(): + data = { + "language": "it", + "stream": True, + "model": "openai/whisper-large-v3", + } + url = openai_api_base + "/audio/translations" + headers = {"Authorization": f"Bearer {openai_api_key}"} + print("translation result:", end=" ") + async with httpx.AsyncClient() as client: + with open(str(winning_call), "rb") as f: + async with client.stream( + "POST", url, files={"file": f}, data=data, headers=headers + ) as response: + async for line in response.aiter_lines(): + # Each line is a JSON object prefixed with 'data: ' + if line: + if line.startswith("data: "): + line = line[len("data: ") :] + # Last chunk, stream ends + if line.strip() == "[DONE]": + break + # Parse the JSON response + chunk = json.loads(line) + # Extract and print the content + content = chunk["choices"][0].get("delta", {}).get("content") + print(content, end="") + + +# Run the asynchronous function +asyncio.run(stream_openai_response()) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index ea21200af1cd..60f0e11a0216 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1978,15 +1978,6 @@ class TranslationRequest(OpenAIBaseModel): """ID of the model to use. """ - language: Optional[str] = None - """The language of the input audio. - - Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format - will improve accuracy and latency. - This is a custom field not present in the openai specification. - """ - prompt: str = Field(default="") """An optional text to guide the model's style or continue a previous audio segment. @@ -2001,7 +1992,8 @@ class TranslationRequest(OpenAIBaseModel): `verbose_json`, or `vtt`. """ - ## TODO (varun) : Support if set to 0, certain thresholds are met !! + # TODO support additional sampling parameters + # --8<-- [start:transcription-sampling-params] temperature: float = Field(default=0.0) """The sampling temperature, between 0 and 1. @@ -2010,15 +2002,15 @@ class TranslationRequest(OpenAIBaseModel): will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit. """ + # --8<-- [end:transcription-sampling-params] - timestamp_granularities: list[Literal["word", "segment"]] = Field( - alias="timestamp_granularities[]", default=[]) - """The timestamp granularities to populate for this translation. + # --8<-- [start:translation-extra-params] + language: Optional[str] = None + """The language of the input audio we translate from. - `response_format` must be set `verbose_json` to use timestamp granularities. - Either or both of these options are supported: `word`, or `segment`. Note: - There is no additional latency for segment timestamps, but generating word - timestamps incurs additional latency. 
+ Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format + will improve accuracy. """ stream: Optional[bool] = False @@ -2029,6 +2021,7 @@ class TranslationRequest(OpenAIBaseModel): # Flattened stream option to simplify form data. stream_include_usage: Optional[bool] = False stream_continuous_usage_stats: Optional[bool] = False + # --8<-- [end:translation-extra-params] # Default sampling parameters for translation requests. _DEFAULT_SAMPLING_PARAMS: dict = { From 40dfecbeab116426f37eb9845c7f17480a115ea2 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Sat, 14 Jun 2025 18:15:49 +0000 Subject: [PATCH 06/18] revert example change Signed-off-by: NickLucche --- examples/online_serving/openai_transcription_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py index e2d2ad8908b8..ae43cb5da790 100644 --- a/examples/online_serving/openai_transcription_client.py +++ b/examples/online_serving/openai_transcription_client.py @@ -66,7 +66,7 @@ async def stream_openai_response(): data = { "language": "en", "stream": True, - "model": "openai/whisper-large-v3-turbo", + "model": "openai/whisper-large-v3", } url = openai_api_base + "/audio/transcriptions" headers = {"Authorization": f"Bearer {openai_api_key}"} From 5691476cd757dee5d87a05455adc54cae62fe7b9 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Sat, 14 Jun 2025 18:37:44 +0000 Subject: [PATCH 07/18] type for passing class Signed-off-by: NickLucche --- vllm/entrypoints/openai/speech_to_text.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index d6c346c71b5f..1bd36c3cd421 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -235,7 +235,8 @@ async def _create_speech_to_text( audio_data: bytes, request: Union[TranscriptionRequest, TranslationRequest], raw_request: Request, - response_class: Union[TranscriptionResponse, TranslationResponse], + response_class: Union[type[TranscriptionResponse], + type[TranslationResponse]], stream_generator_method: Callable, ) -> Union[Union[TranscriptionResponse, TranslationResponse], AsyncGenerator[str, None], ErrorResponse]: @@ -334,10 +335,11 @@ async def _speech_to_text_stream_generator( request_metadata: RequestResponseMetadata, audio_duration_s: float, chunk_object_type: str, - response_stream_choice_class: Union[TranscriptionResponseStreamChoice, - TranslationResponseStreamChoice], - stream_response_class: Union[TranscriptionStreamResponse, - TranslationStreamResponse], + response_stream_choice_class: Union[ + type[TranscriptionResponseStreamChoice], + type[TranslationResponseStreamChoice]], + stream_response_class: Union[type[TranscriptionStreamResponse], + type[TranslationStreamResponse]], ) -> AsyncGenerator[str, None]: created_time = int(time.time()) model_name = request.model From a7966613f026b189dd48f17bec7e87b06af59ced Mon Sep 17 00:00:00 2001 From: NickLucche Date: Mon, 16 Jun 2025 14:26:36 +0000 Subject: [PATCH 08/18] types Signed-off-by: NickLucche --- vllm/entrypoints/openai/serving_engine.py | 8 ++++---- vllm/entrypoints/openai/serving_transcription.py | 10 ++++++++-- vllm/entrypoints/openai/speech_to_text.py | 4 ++-- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py 
b/vllm/entrypoints/openai/serving_engine.py index 4bf790bbb298..cf2b738ba55e 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -58,7 +58,8 @@ TokenizeCompletionRequest, TokenizeResponse, TranscriptionRequest, - TranscriptionResponse) + TranscriptionResponse, + TranslationRequest) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser # yapf: enable @@ -89,9 +90,8 @@ ChatLikeRequest = Union[ChatCompletionRequest, EmbeddingChatRequest, TokenizeChatRequest] - -AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest, - TranscriptionRequest] +SpeechToTextRequest = Union[TranscriptionRequest, TranslationRequest] +AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest, SpeechToTextRequest] AnyResponse = Union[ CompletionResponse, diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index ae31b8ba44e3..8bee247455b5 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -55,13 +55,16 @@ async def create_transcription( See https://platform.openai.com/docs/api-reference/audio/createTranscription for the API specification. This API mimics the OpenAI transcription API. """ - return await self._create_speech_to_text( + response = await self._create_speech_to_text( audio_data=audio_data, request=request, raw_request=raw_request, response_class=TranscriptionResponse, stream_generator_method=self.transcription_stream_generator, ) + # Make mypy happy + assert not isinstance(response, TranslationResponse) + return response async def transcription_stream_generator( self, request: TranscriptionRequest, @@ -110,13 +113,16 @@ async def create_translation( See https://platform.openai.com/docs/api-reference/audio/createTranslation for the API specification. This API mimics the OpenAI translation API. 
""" - return await self._create_speech_to_text( + response = await self._create_speech_to_text( audio_data=audio_data, request=request, raw_request=raw_request, response_class=TranslationResponse, stream_generator_method=self.translation_stream_generator, ) + # Make mypy happy + assert not isinstance(response, TranscriptionResponse) + return response async def translation_stream_generator( self, request: TranslationRequest, diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index 1bd36c3cd421..f7f98795613b 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -30,6 +30,7 @@ except ImportError: librosa = PlaceholderModule("librosa") # type: ignore[assignment] +SpeechToTextResponse = Union[TranscriptionResponse, TranslationResponse] logger = init_logger(__name__) # From https://platform.openai.com/docs/guides/speech-to-text/supported-languages @@ -238,8 +239,7 @@ async def _create_speech_to_text( response_class: Union[type[TranscriptionResponse], type[TranslationResponse]], stream_generator_method: Callable, - ) -> Union[Union[TranscriptionResponse, TranslationResponse], - AsyncGenerator[str, None], ErrorResponse]: + ) -> Union[SpeechToTextResponse, AsyncGenerator[str, None], ErrorResponse]: """Base method for speech-to-text operations like transcription and translation.""" error_check_ret = await self._check_model(request) From 313a063b11d59e11a78ca06b5515d5be3b4b7b28 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Wed, 18 Jun 2025 10:21:44 +0000 Subject: [PATCH 09/18] test translation Signed-off-by: NickLucche --- .../openai/test_translation_validation.py | 143 ++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 tests/entrypoints/openai/test_translation_validation.py diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py new file mode 100644 index 000000000000..4f26fc243fcd --- /dev/null +++ b/tests/entrypoints/openai/test_translation_validation.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# imports for guided decoding tests +import json +from unittest.mock import patch + +import pytest +from openai._base_client import AsyncAPIClient + +from vllm.assets.audio import AudioAsset + +from ...utils import RemoteOpenAIServer + + +@pytest.fixture +def foscolo(): + # Test translation it->en + path = AudioAsset('azacinto_foscolo').get_local_path() + with open(str(path), "rb") as f: + yield f + + +# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation! 
+@pytest.mark.asyncio
+async def test_basic_audio(foscolo):
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        translation = await client.audio.translations.create(
+            model=model_name,
+            file=foscolo,
+            response_format="text",
+            # TODO remove once language detection is implemented
+            extra_body=dict(language="it"),
+            temperature=0.0)
+        out = json.loads(translation)['text'].strip()
+        assert "Nor will I ever touch the sacred" in out
+
+
+@pytest.mark.asyncio
+async def test_audio_prompt(foscolo):
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    # Condition Whisper on the starting text
+    prompt = "Nor have I ever"
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        translation = await client.audio.translations.create(
+            model=model_name,
+            file=foscolo,
+            prompt=prompt,
+            extra_body=dict(language="it"),
+            response_format="text",
+            temperature=0.0)
+        out = json.loads(translation)['text']
+        assert "Nor will I ever touch the sacred" not in out
+        assert prompt not in out
+
+
+@pytest.mark.asyncio
+async def test_non_asr_model(foscolo):
+    # text-to-text model
+    model_name = "JackFram/llama-68m"
+    server_args = ["--enforce-eager"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        res = await client.audio.translations.create(model=model_name,
+                                                     file=foscolo,
+                                                     temperature=0.0)
+        assert res.code == 400 and not res.text
+        assert res.message == "The model does not support Translations API"
+
+
+@pytest.mark.asyncio
+async def test_streaming_response(foscolo):
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    translation = ""
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        res_no_stream = await client.audio.translations.create(
+            model=model_name,
+            file=foscolo,
+            response_format="json",
+            extra_body=dict(language="it"),
+            temperature=0.0)
+        # Unfortunately this only works when the openai client is patched
+        # to use streaming mode, not exposed in the translation api.
+ original_post = AsyncAPIClient.post + + async def post_with_stream(*args, **kwargs): + kwargs['stream'] = True + return await original_post(*args, **kwargs) + + with patch.object(AsyncAPIClient, "post", new=post_with_stream): + client = remote_server.get_async_client() + res = await client.audio.translations.create(model=model_name, + file=foscolo, + temperature=0.0, + extra_body=dict( + stream=True, + language="it")) + # Reconstruct from chunks and validate + async for chunk in res: + # just a chunk + text = chunk.choices[0]['delta']['content'] + translation += text + + assert translation == res_no_stream.text + + +@pytest.mark.asyncio +async def test_stream_options(foscolo): + model_name = "openai/whisper-small" + server_args = ["--enforce-eager"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + original_post = AsyncAPIClient.post + + async def post_with_stream(*args, **kwargs): + kwargs['stream'] = True + return await original_post(*args, **kwargs) + + with patch.object(AsyncAPIClient, "post", new=post_with_stream): + client = remote_server.get_async_client() + res = await client.audio.translations.create( + model=model_name, + file=foscolo, + temperature=0.0, + extra_body=dict(language="it", + stream=True, + stream_include_usage=True, + stream_continuous_usage_stats=True)) + final = False + continuous = True + async for chunk in res: + if not len(chunk.choices): + # final usage sent + final = True + else: + continuous = continuous and hasattr(chunk, 'usage') + assert final and continuous From 21684fc8d071707bb12a80d6531c3538de34f91a Mon Sep 17 00:00:00 2001 From: NickLucche Date: Wed, 18 Jun 2025 12:27:57 +0000 Subject: [PATCH 10/18] rebase changes Signed-off-by: NickLucche --- .../openai/serving_transcription.py | 13 +- vllm/entrypoints/openai/speech_to_text.py | 259 +++++++++++------- 2 files changed, 164 insertions(+), 108 deletions(-) diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 8bee247455b5..fe89de5fd9e3 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -1,13 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import asyncio -import io -import math -import time from collections.abc import AsyncGenerator from typing import Optional, Union -import numpy as np from fastapi import Request from vllm.config import ModelConfig @@ -68,12 +63,12 @@ async def create_transcription( async def transcription_stream_generator( self, request: TranscriptionRequest, - result_generator: AsyncGenerator[RequestOutput, None], + result_generator: list[AsyncGenerator[RequestOutput, None]], request_id: str, request_metadata: RequestResponseMetadata, audio_duration_s: float) -> AsyncGenerator[str, None]: generator = self._speech_to_text_stream_generator( request=request, - result_generator=result_generator, + list_result_generator=result_generator, request_id=request_id, request_metadata=request_metadata, audio_duration_s=audio_duration_s, @@ -126,12 +121,12 @@ async def create_translation( async def translation_stream_generator( self, request: TranslationRequest, - result_generator: AsyncGenerator[RequestOutput, None], + result_generator: list[AsyncGenerator[RequestOutput, None]], request_id: str, request_metadata: RequestResponseMetadata, audio_duration_s: float) -> AsyncGenerator[str, None]: generator = self._speech_to_text_stream_generator( request=request, - 
result_generator=result_generator, + list_result_generator=result_generator, request_id=request_id, request_metadata=request_metadata, audio_duration_s=audio_duration_s, diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index f7f98795613b..227bc9760639 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -2,11 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import io +import math import time from collections.abc import AsyncGenerator from math import ceil from typing import Callable, Optional, Union, cast +import numpy as np from fastapi import Request from vllm.config import ModelConfig @@ -145,6 +147,8 @@ # As per https://platform.openai.com/docs/guides/speech-to-text#overview. # TODO configurable MAX_AUDIO_CLIP_FILESIZE_MB = 25 +OVERLAP_CHUNK_SECOND = 1 +MIN_ENERGY_WINDOW_SIZE = 1600 # 1600 ~ 100ms for 16000 Hz audio class OpenAISpeechToText(OpenAIServing): @@ -152,14 +156,14 @@ class OpenAISpeechToText(OpenAIServing): translation.""" def __init__( - self, - engine_client: EngineClient, - model_config: ModelConfig, - models: OpenAIServingModels, - *, - request_logger: Optional[RequestLogger], - return_tokens_as_token_ids: bool = False, - task_type: str = "transcribe", + self, + engine_client: EngineClient, + model_config: ModelConfig, + models: OpenAIServingModels, + *, + request_logger: Optional[RequestLogger], + return_tokens_as_token_ids: bool = False, + task_type: str = "transcribe", # or "translate" ): assert task_type in ["transcribe", "translate"] super().__init__(engine_client=engine_client, @@ -185,7 +189,7 @@ async def _preprocess_speech_to_text( self, request: Union[TranscriptionRequest, TranslationRequest], audio_data: bytes, - ) -> tuple[PromptType, float]: + ) -> tuple[list[PromptType], float]: # Validate request # TODO language should be optional and can be guessed. # For now we default to en. 
See @@ -213,33 +217,33 @@ async def _preprocess_speech_to_text( y, sr = librosa.load(bytes_) duration = librosa.get_duration(y=y, sr=sr) - if duration > self.max_audio_clip_s: - raise ValueError( - f"Maximum clip duration ({self.max_audio_clip_s}s) " - "exceeded.") - - prompt = { - "encoder_prompt": { - "prompt": "", - "multi_modal_data": { - "audio": (y, sr), + chunks = [y] if duration < 30 else self._split_audio(y, sr) + prompts = [] + for i, chunk in enumerate(chunks): + prompt = { + "encoder_prompt": { + "prompt": "", + "multi_modal_data": { + "audio": (chunk, sr), + }, }, - }, - "decoder_prompt": - (f"<|startoftranscript|>{lang_token}" - f"<|{self.task_type}|><|notimestamps|>{request.prompt}") - } - return cast(PromptType, prompt), duration + "decoder_prompt": + (f"<|startoftranscript|>{lang_token}" + f"<|{self.task_type}|><|notimestamps|>{request.prompt}") + if i == 0 else "" + } + prompts.append(cast(PromptType, prompt)) + return prompts, duration async def _create_speech_to_text( self, audio_data: bytes, request: Union[TranscriptionRequest, TranslationRequest], raw_request: Request, - response_class: Union[type[TranscriptionResponse], - type[TranslationResponse]], + response_class: Union[TranscriptionResponse, TranslationResponse], stream_generator_method: Callable, - ) -> Union[SpeechToTextResponse, AsyncGenerator[str, None], ErrorResponse]: + ) -> Union[Union[TranscriptionResponse, TranslationResponse], + AsyncGenerator[str, None], ErrorResponse]: """Base method for speech-to-text operations like transcription and translation.""" error_check_ret = await self._check_model(request) @@ -277,7 +281,7 @@ async def _create_speech_to_text( f"Currently do not support PromptAdapter for " f"{self.task_type.title()}.") - prompt, duration_s = await self._preprocess_speech_to_text( + prompts, duration_s = await self._preprocess_speech_to_text( request=request, audio_data=audio_data, ) @@ -286,7 +290,8 @@ async def _create_speech_to_text( logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) - result_generator: Optional[AsyncGenerator[RequestOutput, None]] = None + list_result_generator: Optional[list[AsyncGenerator[RequestOutput, + None]]] = None try: # Unlike most decoder-only models, whisper generation length is not # constrained by the size of the input audio, which is mapped to a @@ -297,30 +302,34 @@ async def _create_speech_to_text( self._log_inputs( request_id, - prompt['decoder_prompt'], # type: ignore + prompts[0]['decoder_prompt'], # type: ignore params=sampling_params, lora_request=None, prompt_adapter_request=None) - result_generator = self.engine_client.generate( - prompt, - sampling_params, - request_id, - ) + list_result_generator = [ + self.engine_client.generate( + prompt, + sampling_params, + request_id, + ) for prompt in prompts + ] except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) if request.stream: - return stream_generator_method(request, result_generator, + return stream_generator_method(request, list_result_generator, request_id, request_metadata, duration_s) # Non-streaming response. 
try: - assert result_generator is not None - async for op in result_generator: - result = op - return response_class(text=result.outputs[0].text) + assert list_result_generator is not None + text = "" + for result_generator in list_result_generator: + async for op in result_generator: + text += op.outputs[0].text + return response_class(text=text) except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: @@ -330,16 +339,15 @@ async def _create_speech_to_text( async def _speech_to_text_stream_generator( self, request: Union[TranscriptionRequest, TranslationRequest], - result_generator: AsyncGenerator[RequestOutput, None], + list_result_generator: list[AsyncGenerator[RequestOutput, None]], request_id: str, request_metadata: RequestResponseMetadata, audio_duration_s: float, chunk_object_type: str, - response_stream_choice_class: Union[ - type[TranscriptionResponseStreamChoice], - type[TranslationResponseStreamChoice]], - stream_response_class: Union[type[TranscriptionStreamResponse], - type[TranslationStreamResponse]], + response_stream_choice_class: Union[TranscriptionResponseStreamChoice, + TranslationResponseStreamChoice], + stream_response_class: Union[TranscriptionStreamResponse, + TranslationStreamResponse], ) -> AsyncGenerator[str, None]: created_time = int(time.time()) model_name = request.model @@ -350,60 +358,64 @@ async def _speech_to_text_stream_generator( include_usage = request.stream_include_usage \ if request.stream_include_usage else False include_continuous_usage = request.stream_continuous_usage_stats\ - if include_usage and request.stream_continuous_usage_stats\ - else False + if include_usage and request.stream_continuous_usage_stats\ + else False try: - async for res in result_generator: - # On first result. - if res.prompt_token_ids is not None: - # Do not account the 4-tokens `<|startoftranscript|>..` - # Could be negative when language token is not specified. - num_prompt_tokens = max(len(res.prompt_token_ids) - 4, 0) - # NOTE(NickLucche) user can't pass encoder prompts directly - # at least not to Whisper. One indicator of the encoder - # amount of processing is the log-mel spectogram length. - num_prompt_tokens += ceil(audio_duration_s * - self.model_sr / self.hop_length) - - # We need to do it here, because if there are exceptions in - # the result_generator, it needs to be sent as the FIRST - # response (by the try...catch). - - # Just one output (n=1) supported. - assert len(res.outputs) == 1 - output = res.outputs[0] - - delta_message = DeltaMessage(content=output.text) - completion_tokens += len(output.token_ids) - - if output.finish_reason is None: - # Still generating, send delta update. - choice_data = response_stream_choice_class( - delta=delta_message) - else: - # Model is finished generating. - choice_data = response_stream_choice_class( - delta=delta_message, - finish_reason=output.finish_reason, - stop_reason=output.stop_reason) - - chunk = stream_response_class(id=request_id, - object=chunk_object_type, - created=created_time, - choices=[choice_data], - model=model_name) - - # handle usage stats if requested & if continuous - if include_continuous_usage: - chunk.usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=num_prompt_tokens + completion_tokens, - ) - - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" + for result_generator in list_result_generator: + async for res in result_generator: + # On first result. 
+                    if res.prompt_token_ids is not None:
+                        # Do not count the 4-token prefix `<|startoftranscript|>..`
+                        # Could be negative when the language token
+                        # is not specified.
+                        num_prompt_tokens = max(
+                            len(res.prompt_token_ids) - 4, 0)
+                        # NOTE(NickLucche) users can't pass encoder
+                        # prompts directly, at least not to Whisper.
+                        # One indicator of the amount of encoder processing
+                        # is the log-mel spectrogram length.
+                        num_prompt_tokens += ceil(
+                            audio_duration_s * self.model_sr / self.hop_length)
+
+                    # We need to do it here, because if there are exceptions in
+                    # the result_generator, it needs to be sent as the FIRST
+                    # response (by the try...except).
+
+                    # Just one output (n=1) supported.
+                    assert len(res.outputs) == 1
+                    output = res.outputs[0]
+
+                    delta_message = DeltaMessage(content=output.text)
+                    completion_tokens += len(output.token_ids)
+
+                    if output.finish_reason is None:
+                        # Still generating, send delta update.
+                        choice_data = response_stream_choice_class(
+                            delta=delta_message)
+                    else:
+                        # Model is finished generating.
+                        choice_data = response_stream_choice_class(
+                            delta=delta_message,
+                            finish_reason=output.finish_reason,
+                            stop_reason=output.stop_reason)
+
+                    chunk = stream_response_class(id=request_id,
+                                                  object=chunk_object_type,
+                                                  created=created_time,
+                                                  choices=[choice_data],
+                                                  model=model_name)
+
+                    # handle usage stats if requested & if continuous
+                    if include_continuous_usage:
+                        chunk.usage = UsageInfo(
+                            prompt_tokens=num_prompt_tokens,
+                            completion_tokens=completion_tokens,
+                            total_tokens=num_prompt_tokens + completion_tokens,
+                        )
+
+                    data = chunk.model_dump_json(exclude_unset=True)
+                    yield f"data: {data}\n\n"

         # Once the final token is handled, if stream_options.include_usage
         # is sent, send the usage.
@@ -437,3 +449,52 @@ async def _speech_to_text_stream_generator(
             yield f"data: {data}\n\n"
         # Send the final done message after all response.n are finished
         yield "data: [DONE]\n\n"
+
+    def _split_audio(self, audio_data: np.ndarray,
+                     sample_rate: int) -> list[np.ndarray]:
+        chunk_size = sample_rate * self.max_audio_clip_s
+        overlap_size = sample_rate * OVERLAP_CHUNK_SECOND
+        chunks = []
+        i = 0
+        while i < audio_data.shape[-1]:
+            if i + chunk_size >= audio_data.shape[-1]:
+                # handle last chunk
+                chunks.append(audio_data[..., i:])
+                break
+
+            # Find the best split point in the overlap region
+            search_start = i + chunk_size - overlap_size
+            search_end = min(i + chunk_size, audio_data.shape[-1])
+            split_point = self._find_split_point(audio_data, search_start,
+                                                 search_end)
+
+            # Extract chunk up to the split point
+            chunks.append(audio_data[..., i:split_point])
+            i = split_point
+        return chunks
+
+    def _find_split_point(self, wav: np.ndarray, start_idx: int,
+                          end_idx: int) -> int:
+        """Find the best point to split audio by
+        looking for silence or low amplitude.
+        Args:
+            wav: 1-D audio array [T]
+            start_idx: Start index of search region
+            end_idx: End index of search region
+        Returns:
+            Index of the best split point
+        """
+        segment = wav[start_idx:end_idx]
+
+        # Calculate RMS energy in small windows
+        min_energy = math.inf
+        quietest_idx = 0
+        for i in range(0,
+                       len(segment) - MIN_ENERGY_WINDOW_SIZE,
+                       MIN_ENERGY_WINDOW_SIZE):
+            window = segment[i:i + MIN_ENERGY_WINDOW_SIZE]
+            energy = (window**2).mean()**0.5
+            if energy < min_energy:
+                quietest_idx = i + start_idx
+                min_energy = energy
+        return quietest_idx

From 382a2579c926f73bc54ab440a7ae0be94e45df36 Mon Sep 17 00:00:00 2001
From: NickLucche
Date: Wed, 18 Jun 2025 13:26:11 +0000
Subject: [PATCH 11/18] fix splitting: subsequent chunks still need preamble

Signed-off-by: NickLucche
---
 vllm/entrypoints/openai/speech_to_text.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index 227bc9760639..abdabeba9e9c 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -214,12 +214,14 @@ async def _preprocess_speech_to_text(
             raise ValueError("Maximum file size exceeded.")

         with io.BytesIO(audio_data) as bytes_:
-            y, sr = librosa.load(bytes_)
+            # NOTE resample to model SR here for efficiency. This is also a
+            # prerequisite for chunking, as it assumes Whisper SR.
+            y, sr = librosa.load(bytes_, sr=self.model_sr)
             duration = librosa.get_duration(y=y, sr=sr)

             chunks = [y] if duration < 30 else self._split_audio(y, sr)
             prompts = []
-            for i, chunk in enumerate(chunks):
+            for chunk in chunks:
                 prompt = {
                     "encoder_prompt": {
                         "prompt": "",
@@ -230,7 +232,6 @@ async def _preprocess_speech_to_text(
                     "decoder_prompt":
                     (f"<|startoftranscript|>{lang_token}"
                      f"<|{self.task_type}|><|notimestamps|>{request.prompt}")
-                    if i == 0 else ""
                 }
                 prompts.append(cast(PromptType, prompt))
             return prompts, duration

From 5df1274a28eec11aeaf5812427157097ccf47a74 Mon Sep 17 00:00:00 2001
From: NickLucche
Date: Wed, 18 Jun 2025 13:35:02 +0000
Subject: [PATCH 12/18] long audio test

Signed-off-by: NickLucche
---
 .../openai/test_transcription_validation.py   |  2 ++
 .../openai/test_translation_validation.py     | 29 +++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py
index 8117e774951e..dab14f1d7d03 100644
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -82,6 +82,8 @@ async def test_long_audio_request(mary_had_lamb):

     mary_had_lamb.seek(0)
     audio, sr = librosa.load(mary_had_lamb)
+    # Add a small silence after the audio for repeatable split points
+    audio = np.pad(audio, (0, 1600))
     repeated_audio = np.tile(audio, 10)
     # Repeated audio to buffer
     buffer = io.BytesIO()

diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py
index 4f26fc243fcd..0c2cb367f330 100644
--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@@ -1,11 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import io

 # imports for guided decoding tests
 import json
 from unittest.mock import patch

+import librosa
+import numpy as np
 import pytest
+import soundfile as sf
 from openai._base_client import 
AsyncAPIClient

 from vllm.assets.audio import AudioAsset

@@ -141,3 +145,28 @@ async def post_with_stream(*args, **kwargs):
         else:
             continuous = continuous and hasattr(chunk, 'usage')
     assert final and continuous
+
+
+@pytest.mark.asyncio
+async def test_long_audio_request(foscolo):
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+
+    foscolo.seek(0)
+    audio, sr = librosa.load(foscolo)
+    repeated_audio = np.tile(audio, 2)
+    # Repeated audio to buffer
+    buffer = io.BytesIO()
+    sf.write(buffer, repeated_audio, sr, format='WAV')
+    buffer.seek(0)
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        translation = await client.audio.translations.create(
+            model=model_name,
+            file=buffer,
+            extra_body=dict(language="it"),
+            response_format="text",
+            temperature=0.0)
+        out = json.loads(translation)['text'].strip().lower()
+        # TODO investigate higher model uncertainty for longer translations.
+        assert out.count("nor will i ever") == 2

From fbf8f86db97a9d7a13260e516ccbcd54b0895aa4 Mon Sep 17 00:00:00 2001
From: NickLucche
Date: Wed, 18 Jun 2025 14:36:39 +0000
Subject: [PATCH 13/18] types

Signed-off-by: NickLucche
---
 vllm/entrypoints/openai/speech_to_text.py | 27 +++++++++++++----------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index abdabeba9e9c..88683a605d9b 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -15,11 +15,12 @@
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
-    DeltaMessage, ErrorResponse, RequestResponseMetadata, TranscriptionRequest,
+    DeltaMessage, ErrorResponse, RequestResponseMetadata,
     TranscriptionResponse, TranscriptionResponseStreamChoice,
-    TranscriptionStreamResponse, TranslationRequest, TranslationResponse,
+    TranscriptionStreamResponse, TranslationResponse,
     TranslationResponseStreamChoice, TranslationStreamResponse, UsageInfo)
-from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_engine import (OpenAIServing,
+                                                    SpeechToTextRequest)
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
@@ -187,7 +188,7 @@ def __init__(

     async def _preprocess_speech_to_text(
         self,
-        request: Union[TranscriptionRequest, TranslationRequest],
+        request: SpeechToTextRequest,
         audio_data: bytes,
     ) -> tuple[list[PromptType], float]:
         # Validate request
@@ -219,7 +220,7 @@ async def _preprocess_speech_to_text(
             y, sr = librosa.load(bytes_, sr=self.model_sr)
             duration = librosa.get_duration(y=y, sr=sr)

-            chunks = [y] if duration < 30 else self._split_audio(y, sr)
+            chunks = [y] if duration < 30 else self._split_audio(y, int(sr))
             prompts = []
             for chunk in chunks:
                 prompt = {
@@ -239,9 +240,10 @@ async def 
_create_speech_to_text( async def _speech_to_text_stream_generator( self, - request: Union[TranscriptionRequest, TranslationRequest], + request: SpeechToTextRequest, list_result_generator: list[AsyncGenerator[RequestOutput, None]], request_id: str, request_metadata: RequestResponseMetadata, audio_duration_s: float, chunk_object_type: str, - response_stream_choice_class: Union[TranscriptionResponseStreamChoice, - TranslationResponseStreamChoice], - stream_response_class: Union[TranscriptionStreamResponse, - TranslationStreamResponse], + response_stream_choice_class: Union[ + type[TranscriptionResponseStreamChoice], + type[TranslationResponseStreamChoice]], + stream_response_class: Union[type[TranscriptionStreamResponse], + type[TranslationStreamResponse]], ) -> AsyncGenerator[str, None]: created_time = int(time.time()) model_name = request.model From 5b771617de677e85b15d1c7b95f4354097da9ac1 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Wed, 25 Jun 2025 08:43:00 +0000 Subject: [PATCH 14/18] generic create_speech_to_text Signed-off-by: NickLucche --- vllm/entrypoints/openai/serving_transcription.py | 10 ++-------- vllm/entrypoints/openai/speech_to_text.py | 14 +++++++------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index fe89de5fd9e3..0d6989fe91bf 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -50,16 +50,13 @@ async def create_transcription( See https://platform.openai.com/docs/api-reference/audio/createTranscription for the API specification. This API mimics the OpenAI transcription API. """ - response = await self._create_speech_to_text( + return await self._create_speech_to_text( audio_data=audio_data, request=request, raw_request=raw_request, response_class=TranscriptionResponse, stream_generator_method=self.transcription_stream_generator, ) - # Make mypy happy - assert not isinstance(response, TranslationResponse) - return response async def transcription_stream_generator( self, request: TranscriptionRequest, @@ -108,16 +105,13 @@ async def create_translation( See https://platform.openai.com/docs/api-reference/audio/createTranslation for the API specification. This API mimics the OpenAI translation API. 
""" - response = await self._create_speech_to_text( + return await self._create_speech_to_text( audio_data=audio_data, request=request, raw_request=raw_request, response_class=TranslationResponse, stream_generator_method=self.translation_stream_generator, ) - # Make mypy happy - assert not isinstance(response, TranscriptionResponse) - return response async def translation_stream_generator( self, request: TranslationRequest, diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index 68ad91b19558..88683a605d9b 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -6,7 +6,7 @@ import time from collections.abc import AsyncGenerator from math import ceil -from typing import Callable, Optional, Union, cast +from typing import Callable, Literal, Optional, TypeVar, Union, cast import numpy as np from fastapi import Request @@ -34,6 +34,8 @@ librosa = PlaceholderModule("librosa") # type: ignore[assignment] SpeechToTextResponse = Union[TranscriptionResponse, TranslationResponse] +T = TypeVar("T", bound=SpeechToTextResponse) + logger = init_logger(__name__) # From https://platform.openai.com/docs/guides/speech-to-text/supported-languages @@ -242,11 +244,9 @@ async def _create_speech_to_text( audio_data: bytes, request: SpeechToTextRequest, raw_request: Request, - response_class: Union[type[TranscriptionResponse], - type[TranslationResponse]], + response_class: type[T], stream_generator_method: Callable, - ) -> Union[Union[TranscriptionResponse, TranslationResponse], - AsyncGenerator[str, None], ErrorResponse]: + ) -> Union[T, AsyncGenerator[str, None], ErrorResponse]: """Base method for speech-to-text operations like transcription and translation.""" error_check_ret = await self._check_model(request) @@ -332,7 +332,7 @@ async def _create_speech_to_text( for result_generator in list_result_generator: async for op in result_generator: text += op.outputs[0].text - return response_class(text=text) + return cast(T, response_class(text=text)) except asyncio.CancelledError: return self.create_error_response("Client disconnected") except ValueError as e: @@ -346,7 +346,7 @@ async def _speech_to_text_stream_generator( request_id: str, request_metadata: RequestResponseMetadata, audio_duration_s: float, - chunk_object_type: str, + chunk_object_type: Literal["translation.chunk", "transcription.chunk"], response_stream_choice_class: Union[ type[TranscriptionResponseStreamChoice], type[TranslationResponseStreamChoice]], From 29fbaf81d7c35dd42b7380566c6b7dcce5c45508 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Wed, 25 Jun 2025 08:45:34 +0000 Subject: [PATCH 15/18] examples main guard Signed-off-by: NickLucche --- .../openai_transcription_client.py | 41 ++++++++------- .../openai_translation_client.py | 50 +++++++++---------- 2 files changed, 45 insertions(+), 46 deletions(-) diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py index ae43cb5da790..755038a76139 100644 --- a/examples/online_serving/openai_transcription_client.py +++ b/examples/online_serving/openai_transcription_client.py @@ -26,23 +26,12 @@ from vllm.assets.audio import AudioAsset -mary_had_lamb = AudioAsset("mary_had_lamb").get_local_path() -winning_call = AudioAsset("winning_call").get_local_path() -# Modify OpenAI's API key and API base to use vLLM's API server. 
-openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - - -def sync_openai(): +def sync_openai(audio_path: str, client: OpenAI): """ Perform synchronous transcription using OpenAI-compatible API. """ - with open(str(mary_had_lamb), "rb") as f: + with open(audio_path, "rb") as f: transcription = client.audio.transcriptions.create( file=f, model="openai/whisper-large-v3", @@ -58,8 +47,7 @@ def sync_openai(): print("transcription result:", transcription.text) -# OpenAI Transcription API client does not support streaming. -async def stream_openai_response(): +async def stream_openai_response(audio_path: str, base_url: str, api_key: str): """ Perform streaming transcription using vLLM's raw HTTP streaming API. """ @@ -68,11 +56,12 @@ async def stream_openai_response(): "stream": True, "model": "openai/whisper-large-v3", } - url = openai_api_base + "/audio/transcriptions" - headers = {"Authorization": f"Bearer {openai_api_key}"} + url = base_url + "/audio/transcriptions" + headers = {"Authorization": f"Bearer {api_key}"} print("transcription result:", end=" ") + # OpenAI Transcription API client does not support streaming. async with httpx.AsyncClient() as client: - with open(str(winning_call), "rb") as f: + with open(audio_path, "rb") as f: async with client.stream( "POST", url, files={"file": f}, data=data, headers=headers ) as response: @@ -93,10 +82,20 @@ async def stream_openai_response(): def main(): - sync_openai() - + mary_had_lamb = str(AudioAsset("mary_had_lamb").get_local_path()) + winning_call = str(AudioAsset("winning_call").get_local_path()) + + # Modify OpenAI's API key and API base to use vLLM's API server. + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + + sync_openai(mary_had_lamb, client) # Run the asynchronous function - asyncio.run(stream_openai_response()) + asyncio.run(stream_openai_response(winning_call, openai_api_base, openai_api_key)) if __name__ == "__main__": diff --git a/examples/online_serving/openai_translation_client.py b/examples/online_serving/openai_translation_client.py index 44a0dfde1ead..6f7253e2a789 100644 --- a/examples/online_serving/openai_translation_client.py +++ b/examples/online_serving/openai_translation_client.py @@ -8,27 +8,15 @@ from vllm.assets.audio import AudioAsset -# TODO get actual asset -mary_had_lamb = AudioAsset("mary_had_lamb").get_local_path() -winning_call = AudioAsset("winning_call").get_local_path() -# Modify OpenAI's API key and API base to use vLLM's API server. -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) - - -def sync_openai(): - with open(str(mary_had_lamb), "rb") as f: +def sync_openai(audio_path: str, client: OpenAI): + with open(audio_path, "rb") as f: translation = client.audio.translations.create( file=f, model="openai/whisper-large-v3", response_format="json", temperature=0.0, - # Additional sampling params not provided by OpenAI API. + # Additional params not provided by OpenAI API. extra_body=dict( language="it", seed=4419, @@ -38,21 +26,18 @@ def sync_openai(): print("translation result:", translation.text) -sync_openai() - - -# OpenAI translation API client does not support streaming. 
-async def stream_openai_response(): +async def stream_openai_response(audio_path: str, base_url: str, api_key: str): data = { "language": "it", "stream": True, "model": "openai/whisper-large-v3", } - url = openai_api_base + "/audio/translations" - headers = {"Authorization": f"Bearer {openai_api_key}"} + url = base_url + "/audio/translations" + headers = {"Authorization": f"Bearer {api_key}"} print("translation result:", end=" ") + # OpenAI translation API client does not support streaming. async with httpx.AsyncClient() as client: - with open(str(winning_call), "rb") as f: + with open(audio_path, "rb") as f: async with client.stream( "POST", url, files={"file": f}, data=data, headers=headers ) as response: @@ -71,5 +56,20 @@ async def stream_openai_response(): print(content, end="") -# Run the asynchronous function -asyncio.run(stream_openai_response()) +def main(): + foscolo = str(AudioAsset("azacinto_foscolo").get_local_path()) + + # Modify OpenAI's API key and API base to use vLLM's API server. + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + sync_openai(foscolo, client) + # Run the asynchronous function + asyncio.run(stream_openai_response(foscolo, openai_api_base, openai_api_key)) + + +if __name__ == "__main__": + main() From f8377471cdee78eec4d18d66f7db45c59bc44a7c Mon Sep 17 00:00:00 2001 From: NickLucche Date: Wed, 25 Jun 2025 09:20:56 +0000 Subject: [PATCH 16/18] fix docs Signed-off-by: NickLucche --- docs/serving/openai_compatible_server.md | 6 +++--- vllm/entrypoints/openai/protocol.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index c161d32cae14..00756e719992 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -57,7 +57,7 @@ We currently support the following OpenAI APIs: - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`). - [Transcriptions API][transcriptions-api] (`/v1/audio/transcriptions`) - Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`). -- [Translation API][translation-api] (`/v1/audio/translations`) +- [Translation API][translations-api] (`/v1/audio/translations`) - Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`). In addition, we have the following custom APIs: @@ -396,13 +396,13 @@ Code example: The following [sampling parameters][sampling-params] are supported. ```python ---8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params" +--8<-- "vllm/entrypoints/openai/protocol.py:translation-sampling-params" ``` The following extra parameters are supported: ```python ---8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params" +--8<-- "vllm/entrypoints/openai/protocol.py:translation-extra-params" ``` [](){ #tokenizer-api } diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 60f0e11a0216..3b5281962b2d 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1993,7 +1993,7 @@ class TranslationRequest(OpenAIBaseModel): """ # TODO support additional sampling parameters - # --8<-- [start:transcription-sampling-params] + # --8<-- [start:translation-sampling-params] temperature: float = Field(default=0.0) """The sampling temperature, between 0 and 1. 
@@ -2002,7 +2002,7 @@ class TranslationRequest(OpenAIBaseModel): will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit. """ - # --8<-- [end:transcription-sampling-params] + # --8<-- [end:translation-sampling-params] # --8<-- [start:translation-extra-params] language: Optional[str] = None From 90159fc96621ad0009229d0004ea3d5ac16f2440 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Wed, 25 Jun 2025 12:23:17 +0000 Subject: [PATCH 17/18] types Signed-off-by: NickLucche --- vllm/commit_id.py | 3 +++ vllm/entrypoints/openai/speech_to_text.py | 19 +++++++++---------- 2 files changed, 12 insertions(+), 10 deletions(-) create mode 100644 vllm/commit_id.py diff --git a/vllm/commit_id.py b/vllm/commit_id.py new file mode 100644 index 000000000000..391ae0b29747 --- /dev/null +++ b/vllm/commit_id.py @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +__commit__ = "933dc175653650d405b1e344822a57dad241c075" diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index 88683a605d9b..b23cf6cab097 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -159,16 +159,15 @@ class OpenAISpeechToText(OpenAIServing): translation.""" def __init__( - self, - engine_client: EngineClient, - model_config: ModelConfig, - models: OpenAIServingModels, - *, - request_logger: Optional[RequestLogger], - return_tokens_as_token_ids: bool = False, - task_type: str = "transcribe", # or "translate" + self, + engine_client: EngineClient, + model_config: ModelConfig, + models: OpenAIServingModels, + *, + request_logger: Optional[RequestLogger], + return_tokens_as_token_ids: bool = False, + task_type: Literal["transcribe", "translate"] = "transcribe", ): - assert task_type in ["transcribe", "translate"] super().__init__(engine_client=engine_client, model_config=model_config, models=models, @@ -245,7 +244,7 @@ async def _create_speech_to_text( request: SpeechToTextRequest, raw_request: Request, response_class: type[T], - stream_generator_method: Callable, + stream_generator_method: Callable[..., AsyncGenerator[str, None]], ) -> Union[T, AsyncGenerator[str, None], ErrorResponse]: """Base method for speech-to-text operations like transcription and translation.""" From 7c7c10712c8315558838c10208167d8997aff0d9 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Wed, 25 Jun 2025 13:02:12 +0000 Subject: [PATCH 18/18] cruft Signed-off-by: NickLucche --- vllm/commit_id.py | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 vllm/commit_id.py diff --git a/vllm/commit_id.py b/vllm/commit_id.py deleted file mode 100644 index 391ae0b29747..000000000000 --- a/vllm/commit_id.py +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -__commit__ = "933dc175653650d405b1e344822a57dad241c075"
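
For reviewers who want to exercise the long-audio chunking heuristic from PATCH 10 outside of vLLM, below is a minimal, self-contained sketch of the same energy-based split-point search. The `split_audio`/`find_split_point` helpers and the 30 s / 1 s / 1600-sample constants mirror `_split_audio`/`_find_split_point` in vllm/entrypoints/openai/speech_to_text.py; the synthetic test signal and the numpy-only harness are illustrative assumptions, not part of this series.

```python
# Standalone sketch of the energy-based audio chunking added in this series.
# Only numpy is required; the constants mirror MAX_AUDIO_CLIP_S,
# OVERLAP_CHUNK_SECOND and MIN_ENERGY_WINDOW_SIZE from the patch.
import math

import numpy as np

MAX_CLIP_S = 30    # Whisper's receptive field, in seconds
OVERLAP_S = 1      # trailing window searched for a quiet split point
ENERGY_WIN = 1600  # ~100 ms at 16 kHz


def find_split_point(wav: np.ndarray, start: int, end: int) -> int:
    """Return the start index of the lowest-RMS-energy window in wav[start:end]."""
    segment = wav[start:end]
    min_energy, quietest = math.inf, start
    for i in range(0, len(segment) - ENERGY_WIN, ENERGY_WIN):
        window = segment[i:i + ENERGY_WIN]
        energy = float(np.sqrt((window ** 2).mean()))
        if energy < min_energy:
            min_energy, quietest = energy, start + i
    return quietest


def split_audio(wav: np.ndarray, sr: int) -> list[np.ndarray]:
    """Split wav into chunks of at most MAX_CLIP_S seconds, cutting at the
    quietest point inside the trailing OVERLAP_S seconds of each chunk."""
    chunk, overlap = sr * MAX_CLIP_S, sr * OVERLAP_S
    chunks, i = [], 0
    while i < len(wav):
        if i + chunk >= len(wav):
            # Remainder already fits in one clip.
            chunks.append(wav[i:])
            break
        split = find_split_point(wav, i + chunk - overlap, min(i + chunk, len(wav)))
        chunks.append(wav[i:split])
        i = split
    return chunks


if __name__ == "__main__":
    sr = 16000
    # 70 s of noise with half a second of silence every 10 s; split points
    # inside the search window should land in the silent regions.
    block = np.random.default_rng(0).standard_normal(10 * sr).astype(np.float32)
    block[-sr // 2:] = 0.0
    wav = np.tile(block, 7)
    for n, c in enumerate(split_audio(wav, sr)):
        print(f"chunk {n}: {len(c) / sr:.2f}s")
        assert len(c) <= MAX_CLIP_S * sr
```

By construction every chunk stays within Whisper's 30 s limit while the cut is biased toward silence, which is also why the tests in PATCH 12 pad the audio with zeros to make the split points repeatable.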