From 280b8b1cc892478f86d2a33e867cc515e955e313 Mon Sep 17 00:00:00 2001 From: David Gao Date: Fri, 30 May 2025 02:34:48 -0400 Subject: [PATCH 01/28] [feat]: add transcription API endpoint using OpenAI Whisper-small Signed-off-by: David Gao --- src/vllm_router/routers/main_router.py | 151 ++++++++++++++++++++++++- src/vllm_router/run-router.sh | 42 ++++--- src/vllm_router/utils.py | 6 + 3 files changed, 182 insertions(+), 17 deletions(-) diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index e4e77b018..e458ae812 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -12,19 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. import json - -from fastapi import APIRouter, BackgroundTasks, Request +import time + +import httpx +from fastapi import ( + APIRouter, + BackgroundTasks, + File, + Form, + HTTPException, + Request, + UploadFile, +) from fastapi.responses import JSONResponse, Response from vllm_router.dynamic_config import get_dynamic_config_watcher from vllm_router.log import init_logger from vllm_router.protocols import ModelCard, ModelList +from vllm_router.routers.routing_logic import get_routing_logic from vllm_router.service_discovery import get_service_discovery from vllm_router.services.request_service.request import ( route_general_request, route_sleep_wakeup_request, ) from vllm_router.stats.engine_stats import get_engine_stats_scraper +from vllm_router.stats.request_stats import RequestStatsMonitor from vllm_router.version import __version__ try: @@ -198,8 +210,7 @@ async def get_engine_instances(): @main_router.get("/health") async def health() -> Response: - """ - Endpoint to check the health status of various components. + """Endpoint to check the health status of various components. This function verifies the health of the service discovery module and the engine stats scraper. 
If either component is down, it returns a @@ -232,3 +243,135 @@ async def health() -> Response: ) else: return JSONResponse(content={"status": "healthy"}, status_code=200) + + +@main_router.post("/v1/audio/transcriptions") +async def audio_transcriptions( + file: UploadFile = File(...), + model: str = Form(...), + prompt: str | None = Form(None), + response_format: str | None = Form("json"), + temperature: float | None = Form(None), + language: str = Form("en"), +): + + logger.debug("==== Enter audio_transcriptions ====") + logger.debug("Received upload: %s (%s)", file.filename, file.content_type) + logger.debug( + "Params: model=%s prompt=%r response_format=%r temperature=%r language=%s", + model, + prompt, + response_format, + temperature, + language, + ) + + # read file bytes + payload_bytes = await file.read() + files = { + "file": (file.filename, payload_bytes, file.content_type), + } + # logger.debug("=========files=========") + # logger.debug(files) + # logger.debug("=========files=========") + + data = { + "model": model, + "language": language, + } + + if prompt: + data["prompt"] = prompt + + if response_format: + data["response_format"] = response_format + + if temperature is not None: + data["temperature"] = str(temperature) + + logger.debug("==== data payload keys ====") + logger.debug(list(data.keys())) + logger.debug("==== data payload keys ====") + + # get the backend url + endpoints = get_service_discovery().get_endpoint_info() + + logger.debug("==== Total endpoints ====") + logger.debug(endpoints) + logger.debug("==== Total endpoints ====") + + # TODO: right now is skipping label check in code for local testing + endpoints = [ + ep + for ep in endpoints + if model in ep.model_names # that actually serve your model + ] + + logger.debug("==== Discovered endpoints after filtering ====") + logger.debug(endpoints) + logger.debug("==== Discovered endpoints after filtering ====") + + # filter the endpoints url for transcriptions + transcription_endpoints = [ep for ep in endpoints if model in ep.model_names] + + logger.debug("====List of transcription endpoints====") + logger.debug(transcription_endpoints) + logger.debug("====List of transcription endpoints====") + + if not transcription_endpoints: + logger.error("No transcription backend available for model %s", model) + raise HTTPException( + status_code=503, detail=f"No transcription backend for model {model}" + ) + + # grab the current engin and request stats + engine_stats = get_engine_stats_scraper().get_engine_stats() + request_stats = RequestStatsMonitor().get_request_stats(time.time()) + router = get_routing_logic() + + # pick one using the router's configured logic (roundrobin, least-loaded, etc.) 
+ chosen_url = router.route_request( + transcription_endpoints, + engine_stats, + request_stats, + # we don’t need to pass the original FastAPI Request object here, + # but you can if your routing logic looks at headers or body + None, + ) + + logger.info("Proxying transcription request to %s", chosen_url) + + # proxy the request + # by default httpx will only wait for 5 seconds, large audio transcriptions generally + # take longer than that + async with httpx.AsyncClient( + base_url=chosen_url, + timeout=httpx.Timeout( + connect=60.0, # connect timeout + read=300.0, # read timeout + write=30.0, # if you’re streaming uploads + pool=None, # no pool timeout + ), + ) as client: + logger.debug("Sending multipart to %s/v1/audio/transcriptions …", chosen_url) + proxied = await client.post("/v1/audio/transcriptions", data=data, files=files) + logger.info("Received %d from whisper backend", proxied.status_code) + + # return the whisper response unmodified + resp = proxied.json() + logger.debug("==== Whisper response payload ====") + logger.debug(resp) + logger.debug("==== Whisper response payload ====") + + logger.debug("Backend response headers: %s", proxied.headers) + logger.debug("Backend response body (truncated): %r", proxied.content[:200]) + + return JSONResponse( + content=resp, + status_code=proxied.status_code, + headers={ + k: v + for k, v in proxied.headers.items() + if k.lower() not in ("content-encoding", "transfer-encoding", "connection") + }, + ) diff --git a/src/vllm_router/run-router.sh b/src/vllm_router/run-router.sh index 4f908948d..cb2bf0385 100755 --- a/src/vllm_router/run-router.sh +++ b/src/vllm_router/run-router.sh @@ -1,6 +1,6 @@ #!/bin/bash -if [[ $# -ne 1 ]]; then - echo "Usage $0 " +if [[ $# -ne 2 ]]; then + echo "Usage $0 " exit 1 fi @@ -15,17 +15,17 @@ fi # --log-stats # Use this command when testing with static service discovery -python3 -m vllm_router.app --port "$1" \ - --service-discovery static \ - --static-backends "http://localhost:8000" \ - --static-models "facebook/opt-125m" \ - --static-model-types "chat" \ - --log-stats \ - --log-stats-interval 10 \ - --engine-stats-interval 10 \ - --request-stats-window 10 \ - --request-stats-window 10 \ - --routing-logic roundrobin +# python3 -m vllm_router.app --port "$1" \ +# --service-discovery static \ +# --static-backends "http://localhost:8000" \ +# --static-models "facebook/opt-125m" \ +# --static-model-types "chat" \ +# --log-stats \ +# --log-stats-interval 10 \ +# --engine-stats-interval 10 \ +# --request-stats-window 10 \ +# --request-stats-window 10 \ +# --routing-logic roundrobin # Use this command when testing with roundrobin routing logic #python3 router.py --port "$1" \ @@ -35,3 +35,19 @@ python3 -m vllm_router.app --port "$1" \ # --engine-stats-interval 10 \ # --log-stats # + +# Use this command when testing with whisper transcription +ROUTER_PORT=$1 +BACKEND_URL=$2 + +python3 -m vllm_router.app \ + --host 0.0.0.0 \ + --port "${ROUTER_PORT}" \ + --service-discovery static \ + --static-backends "${BACKEND_URL}" \ + --static-models "openai/whisper-small" \ + --static-model-types "transcription" \ + --routing-logic roundrobin \ + --log-stats \ + --engine-stats-interval 10 \ + --request-stats-window 10 diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index 762d34afe..c1ca1cb31 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -51,6 +51,7 @@ class ModelType(enum.Enum): embeddings = "/v1/embeddings" rerank = "/v1/rerank" score = "/v1/score" + transcription = 
"/v1/audio/transcriptions" @staticmethod def get_test_payload(model_type: str): @@ -75,6 +76,11 @@ def get_test_payload(model_type: str): return {"query": "Hello", "documents": ["Test"]} case ModelType.score: return {"encoding_format": "float", "text_1": "Test", "test_2": "Test2"} + case ModelType.transcription: + return { + "file": "", + "model": "openai/whisper-small", + } @staticmethod def get_all_fields(): From 63cbd15a7f1778f624278d957f969d5309eb47ac Mon Sep 17 00:00:00 2001 From: David Gao Date: Tue, 3 Jun 2025 16:45:23 -0400 Subject: [PATCH 02/28] remove the whisper payload response log Signed-off-by: David Gao --- src/vllm_router/routers/main_router.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index e458ae812..a14f01ae8 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -359,10 +359,7 @@ async def audio_transcriptions( # return the whisper response unmodified resp = proxied.json() - logger.debug("==== Whisper response payload ====") - logger.debug(resp) - logger.debug("==== Whisper response payload ====") - + logger.debug("Backend response headers: %s", proxied.headers) logger.debug("Backend response body (truncated): %r", proxied.content[:200]) From 7954fc939dc29cbe48d5b20dd72688d352394300 Mon Sep 17 00:00:00 2001 From: David Gao Date: Tue, 3 Jun 2025 17:22:44 -0400 Subject: [PATCH 03/28] [docs]: add tutorial for transcription v1 api Signed-off-by: David Gao --- tutorials/17-whisper-api-transcription.md | 99 +++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 tutorials/17-whisper-api-transcription.md diff --git a/tutorials/17-whisper-api-transcription.md b/tutorials/17-whisper-api-transcription.md new file mode 100644 index 000000000..e3834bc69 --- /dev/null +++ b/tutorials/17-whisper-api-transcription.md @@ -0,0 +1,99 @@ +# Tutorial: Whisper Transcription API in vLLM Production Stack + +## Overview + +This tutorial introduces the newly added `/v1/audio/transcriptions` endpoint in the `vllm-router`, enabling users to transcribe `.wav` audio files using OpenAI’s `whisper-small` model. + +## Prerequisites + +* Access to a machine with a GPU (e.g. via [RunPod](https://runpod.io/)) +* Python 3.12 environment (recommended with `uv`) +* `vllm` and `production-stack` cloned and installed +* `vllm` installed with audio support: + + ```bash + pip install vllm[audio] + ``` + +## 1. Serving the Whisper Model + +Start a vLLM backend with the `whisper-small` model: + +```bash +vllm serve \ + --task transcription openai/whisper-small \ + --host 0.0.0.0 --port 8002 +``` + +## 2. Running the Router + +Create and run a router connected to the Whisper backend: + +```bash +#!/bin/bash +if [[ $# -ne 2 ]]; then + echo "Usage: $0 " + exit 1 +fi + +uv run python3 -m vllm_router.app \ + --host 0.0.0.0 --port "$1" \ + --service-discovery static \ + --static-backends "$2" \ + --static-models "openai/whisper-small" \ + --static-model-types "transcription" \ + --routing-logic roundrobin \ + --log-stats \ + --engine-stats-interval 10 \ + --request-stats-window 10 +``` + +Example usage: + +```bash +./run-router.sh 8000 http://localhost:8002 +``` + +## 3. Sending a Transcription Request + +Use `curl` to send a `.wav` file to the transcription endpoint: + +* You can test with any `.wav` audio file of your choice. 
+ +```bash +curl -v http://localhost:8000/v1/audio/transcriptions \ + -F 'file=@/path/to/audio.wav;type=audio/wav' \ + -F 'model=openai/whisper-small' \ + -F 'response_format=json' \ + -F 'language=en' +``` + +### Supported Parameters + +| Parameter | Description | +| ----------------- | ------------------------------------------------------ | +| `file` | Path to a `.wav` audio file | +| `model` | Whisper model to use (e.g., `openai/whisper-small`) | +| `prompt` | *(Optional)* Text prompt to guide the transcription | +| `response_format` | One of `json`, `text`, `srt`, `verbose_json`, or `vtt` | +| `temperature` | *(Optional)* Sampling temperature as a float | +| `language` | ISO 639-1 code (e.g., `en`, `fr`, `zh`) | + +## 4. Sample Output + +```json +{ + "text": "Testing testing testing the whisper small model testing testing testing the audio transcription function testing testing testing the whisper small model" +} +``` + +## 5. Notes + +* Router uses extended HTTPX timeouts to support long transcription jobs. +* This implementation dynamically discovers valid transcription backends and routes requests accordingly. + +## 6. Resources + +* [PR #469 – Add Whisper Transcription API](https://github.com/vllm-project/production-stack/pull/469) +* [OpenAI Whisper GitHub](https://github.com/openai/whisper) +* [Blog: vLLM Whisper Transcription Walkthrough](https://davidgao7.github.io/posts/vllm-v1-whisper-transcription/) From 7ef270bb5eca9012bd6783497ad06f0db8b9f6b9 Mon Sep 17 00:00:00 2001 From: David Gao Date: Tue, 3 Jun 2025 17:44:27 -0400 Subject: [PATCH 04/28] [chore] align example router running script with main new script will be mentioned in `tutorials/17-whisper-api-transcription.md` Signed-off-by: David Gao --- src/vllm_router/run-router.sh | 42 +++++++++++------------------------ 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/src/vllm_router/run-router.sh b/src/vllm_router/run-router.sh index cb2bf0385..4f908948d 100755 --- a/src/vllm_router/run-router.sh +++ b/src/vllm_router/run-router.sh @@ -1,6 +1,6 @@ #!/bin/bash -if [[ $# -ne 2 ]]; then - echo "Usage $0 " +if [[ $# -ne 1 ]]; then + echo "Usage $0 " exit 1 fi @@ -15,17 +15,17 @@ fi # --log-stats # Use this command when testing with static service discovery -# python3 -m vllm_router.app --port "$1" \ -# --service-discovery static \ -# --static-backends "http://localhost:8000" \ -# --static-models "facebook/opt-125m" \ -# --static-model-types "chat" \ -# --log-stats \ -# --log-stats-interval 10 \ -# --engine-stats-interval 10 \ -# --request-stats-window 10 \ -# --request-stats-window 10 \ -# --routing-logic roundrobin +python3 -m vllm_router.app --port "$1" \ + --service-discovery static \ + --static-backends "http://localhost:8000" \ + --static-models "facebook/opt-125m" \ + --static-model-types "chat" \ + --log-stats \ + --log-stats-interval 10 \ + --engine-stats-interval 10 \ + --request-stats-window 10 \ + --request-stats-window 10 \ + --routing-logic roundrobin # Use this command when testing with roundrobin routing logic #python3 router.py --port "$1" \ @@ -35,19 +35,3 @@ fi # --engine-stats-interval 10 \ # --log-stats # - -# Use this command when testing with whisper transcription -ROUTER_PORT=$1 -BACKEND_URL=$2 - -python3 -m vllm_router.app \ - --host 0.0.0.0 \ - --port "${ROUTER_PORT}" \ - --service-discovery static \ - --static-backends "${BACKEND_URL}" \ - --static-models "openai/whisper-small" \ - --static-model-types "transcription" \ - --routing-logic roundrobin \ - --log-stats \ - 
--engine-stats-interval 10 \ - --request-stats-window 10 From 292fc628258ac9dda718c04ffaf25b80fb3aef7b Mon Sep 17 00:00:00 2001 From: David Gao Date: Thu, 12 Jun 2025 17:04:03 +0800 Subject: [PATCH 05/28] omit model field since backend already knows which model to run Signed-off-by: David Gao --- src/vllm_router/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index c1ca1cb31..bc43a1351 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -79,7 +79,6 @@ def get_test_payload(model_type: str): case ModelType.transcription: return { "file": "", - "model": "openai/whisper-small", } @staticmethod From 4413d83b935685ff7951ba7345cb5851a84770b6 Mon Sep 17 00:00:00 2001 From: David Gao Date: Wed, 18 Jun 2025 17:36:28 +0800 Subject: [PATCH 06/28] generate a silent audio file if no audio file appears Signed-off-by: David Gao --- src/vllm_router/routers/main_router.py | 22 +++++++--------------- src/vllm_router/utils.py | 16 +++++++++++++++- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index a14f01ae8..ed2a4ccf0 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -210,7 +210,8 @@ async def get_engine_instances(): @main_router.get("/health") async def health() -> Response: - """Endpoint to check the health status of various components. + """ + Endpoint to check the health status of various components. This function verifies the health of the service discovery module and the engine stats scraper. If either component is down, it returns a @@ -247,7 +248,7 @@ async def health() -> Response: @main_router.post("/v1/audio/transcriptions") async def audio_transcriptions( - file: UploadFile = File(...), + file: UploadFile | None = File(None), model: str = Form(...), prompt: str | None = Form(None), response_format: str | None = Form("json"), @@ -276,7 +277,6 @@ async def audio_transcriptions( # logger.debug("=========files=========") data = { - "model": model, "language": language, } @@ -300,20 +300,12 @@ async def audio_transcriptions( logger.debug(endpoints) logger.debug("==== Total endpoints ====") - # TODO: right now is skipping label check in code for local testing - endpoints = [ - ep - for ep in endpoints - if model in ep.model_names # that actually serve your model + # filter the endpoints url by model name for transcriptions + transcription_endpoints = [ + ep for ep in endpoints + if model == ep.model_name and ep.model_label == "transcription" ] - logger.debug("==== Discovered endpoints after filtering ====") - logger.debug(endpoints) - logger.debug("==== Discovered endpoints after filtering ====") - - # filter the endpoints url for transcriptions - transcription_endpoints = [ep for ep in endpoints if model in ep.model_names] - logger.debug("====List of transcription endpoints====") logger.debug(transcription_endpoints) logger.debug("====List of transcription endpoints====") diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index bc43a1351..3be5ccb4b 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -7,6 +7,8 @@ import requests from fastapi.requests import Request from starlette.datastructures import MutableHeaders +import io +import wave from vllm_router.log import init_logger @@ -77,10 +79,22 @@ def get_test_payload(model_type: str): case ModelType.score: return {"encoding_format": "float", "text_1": "Test", "test_2": "Test2"} case 
ModelType.transcription: + # Generate a 0.1 second silent audio file + with io.BytesIO() as wav_buffer: + with wave.open(wav_buffer, "wb") as wf: + wf.setnchannels(1) # mono audio channel, standard configuration + wf.setsampwidth(2) # 16 bit audio, common bit depth for wav file + wf.setframerate(16000) # 16 kHz sample rate + wf.writeframes(b"\x00\x00" * 1600) # 0.1 second of silence + + # retrieves the generated wav bytes, return + wav_bytes = wav_buffer.getvalue() + return { - "file": "", + "file": ("empty.wav", wav_bytes, "audio/wav"), } + @staticmethod def get_all_fields(): return [model_type.name for model_type in ModelType] From a34a431f0c23f111874730020e69c969953ac908 Mon Sep 17 00:00:00 2001 From: David Gao Date: Thu, 19 Jun 2025 02:00:50 +0800 Subject: [PATCH 07/28] put wav creation at the module level to prevent being recreated every time Signed-off-by: David Gao --- src/vllm_router/utils.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index 3be5ccb4b..f2d37fa2f 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -14,6 +14,19 @@ logger = init_logger(__name__) +# prepare a WAV byte to prevent repeatedly generating it +# Generate a 0.1 second silent audio file +_SILENT_WAV_BYTES = None +with io.BytesIO() as wav_buffer: + with wave.open(wav_buffer, "wb") as wf: + wf.setnchannels(1) # mono audio channel, standard configuration + wf.setsampwidth(2) # 16 bit audio, common bit depth for wav file + wf.setframerate(16000) # 16 kHz sample rate + wf.writeframes(b"\x00\x00" * 1600) # 0.1 second of silence + + # retrieves the generated wav bytes, return + _SILENT_WAV_BYTES = wav_buffer.getvalue() + class SingletonMeta(type): _instances = {} @@ -80,19 +93,10 @@ def get_test_payload(model_type: str): return {"encoding_format": "float", "text_1": "Test", "test_2": "Test2"} case ModelType.transcription: # Generate a 0.1 second silent audio file - with io.BytesIO() as wav_buffer: - with wave.open(wav_buffer, "wb") as wf: - wf.setnchannels(1) # mono audio channel, standard configuration - wf.setsampwidth(2) # 16 bit audio, common bit depth for wav file - wf.setframerate(16000) # 16 kHz sample rate - wf.writeframes(b"\x00\x00" * 1600) # 0.1 second of silence - - # retrieves the generated wav bytes, return - wav_bytes = wav_buffer.getvalue() - - return { - "file": ("empty.wav", wav_bytes, "audio/wav"), - } + if _SILENT_WAV_BYTES is not None: + return { + "file": ("empty.wav", _SILENT_WAV_BYTES, "audio/wav"), + } @staticmethod From b1562547cc925061204b1e48dd71bccdc27a1fed Mon Sep 17 00:00:00 2001 From: David Gao Date: Thu, 19 Jun 2025 14:59:24 +0800 Subject: [PATCH 08/28] [Test] test frequency of silent audio creation Signed-off-by: David Gao --- src/vllm_router/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index f2d37fa2f..53bbd0071 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -26,6 +26,7 @@ # retrieves the generated wav bytes, return _SILENT_WAV_BYTES = wav_buffer.getvalue() + logger.debug("======A default silent WAV file has been stored in memory within py application process====") class SingletonMeta(type): @@ -94,6 +95,7 @@ def get_test_payload(model_type: str): case ModelType.transcription: # Generate a 0.1 second silent audio file if _SILENT_WAV_BYTES is not None: + logger.debug("=====Slient WAV Bytes is being used=====") return { "file": ("empty.wav", _SILENT_WAV_BYTES, 
"audio/wav"), } From 518e453b1e974700af57ee0929bfe2da0c9258d9 Mon Sep 17 00:00:00 2001 From: David Gao Date: Sun, 22 Jun 2025 00:03:50 +0800 Subject: [PATCH 09/28] send multipart/form-data for transcription model's health check Signed-off-by: David Gao --- src/vllm_router/utils.py | 46 +++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index 53bbd0071..c9c2abd11 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -4,8 +4,8 @@ import re import resource -import requests from fastapi.requests import Request +import requests from starlette.datastructures import MutableHeaders import io import wave @@ -184,14 +184,40 @@ def update_content_length(request: Request, request_body: str): def is_model_healthy(url: str, model: str, model_type: str) -> bool: model_details = ModelType[model_type] + try: - response = requests.post( - f"{url}{model_details.value}", - headers={"Content-Type": "application/json"}, - json={"model": model} | model_details.get_test_payload(model_type), - timeout=30, - ) - except Exception as e: - logger.error(e) + if model_type == "transcription": + + # for transcription, the backend expects multipart/form-data with a file + # we will use pre-generated silent wav bytes + files = { + "file": ("empty.wav", _SILENT_WAV_BYTES, "audio/wav") + } + data = {"model":model} + response = requests.post( + f"{url}{model_details.value}", + files=files, # multipart/form-data + data=data + ) + else: + # for other model types (chat, completion, etc.) + response = requests.post( + f"{url}{model_details.value}", + headers={"Content-Type": "application/json"}, + json={"model":model} | model_details.get_test_payload(model_type) + ) + + response.raise_for_status() + + if model_type == "transcription": + return True + else: + response.json() # verify it's valid json for other model types + + except requests.exceptions.RequestException as e: + logger.warning(f"{model_type} model {model} at {url} not healthy: {e}") + return False + + except json.JSONDecodeError as e: + logger.error(f"Failed to decode JSON from {model_type} model {model} at {url}: {e}") return False - return response.status_code == 200 From d7cc2a3cb32c0cd409be2b6c8354e68112fbc8ef Mon Sep 17 00:00:00 2001 From: David Gao Date: Mon, 23 Jun 2025 16:55:31 +0800 Subject: [PATCH 10/28] fix pre-commit issue Signed-off-by: David Gao --- src/vllm_router/routers/main_router.py | 5 +++-- src/vllm_router/utils.py | 29 +++++++++++++------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index ed2a4ccf0..1f13fc6f8 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -302,7 +302,8 @@ async def audio_transcriptions( # filter the endpoints url by model name for transcriptions transcription_endpoints = [ - ep for ep in endpoints + ep + for ep in endpoints if model == ep.model_name and ep.model_label == "transcription" ] @@ -351,7 +352,7 @@ async def audio_transcriptions( # return the whisper response unmodified resp = proxied.json() - + logger.debug("Backend response headers: %s", proxied.headers) logger.debug("Backend response body (truncated): %r", proxied.content[:200]) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index c9c2abd11..907c82867 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -1,14 +1,14 @@ import abc import enum +import io 
import json import re import resource +import wave -from fastapi.requests import Request import requests +from fastapi.requests import Request from starlette.datastructures import MutableHeaders -import io -import wave from vllm_router.log import init_logger @@ -26,7 +26,9 @@ # retrieves the generated wav bytes, return _SILENT_WAV_BYTES = wav_buffer.getvalue() - logger.debug("======A default silent WAV file has been stored in memory within py application process====") + logger.debug( + "======A default silent WAV file has been stored in memory within py application process====" + ) class SingletonMeta(type): @@ -95,12 +97,11 @@ def get_test_payload(model_type: str): case ModelType.transcription: # Generate a 0.1 second silent audio file if _SILENT_WAV_BYTES is not None: - logger.debug("=====Slient WAV Bytes is being used=====") + logger.debug("=====Silent WAV Bytes is being used=====") return { "file": ("empty.wav", _SILENT_WAV_BYTES, "audio/wav"), } - @staticmethod def get_all_fields(): return [model_type.name for model_type in ModelType] @@ -184,27 +185,25 @@ def update_content_length(request: Request, request_body: str): def is_model_healthy(url: str, model: str, model_type: str) -> bool: model_details = ModelType[model_type] - + try: if model_type == "transcription": # for transcription, the backend expects multipart/form-data with a file # we will use pre-generated silent wav bytes - files = { - "file": ("empty.wav", _SILENT_WAV_BYTES, "audio/wav") - } - data = {"model":model} + files = {"file": ("empty.wav", _SILENT_WAV_BYTES, "audio/wav")} + data = {"model": model} response = requests.post( f"{url}{model_details.value}", files=files, # multipart/form-data - data=data + data=data, ) else: # for other model types (chat, completion, etc.) response = requests.post( f"{url}{model_details.value}", headers={"Content-Type": "application/json"}, - json={"model":model} | model_details.get_test_payload(model_type) + json={"model": model} | model_details.get_test_payload(model_type), ) response.raise_for_status() @@ -219,5 +218,7 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool: return False except json.JSONDecodeError as e: - logger.error(f"Failed to decode JSON from {model_type} model {model} at {url}: {e}") + logger.error( + f"Failed to decode JSON from {model_type} model {model} at {url}: {e}" + ) return False From cf11af69625c0ab4a0a19c03f5cc747d537ae1ec Mon Sep 17 00:00:00 2001 From: David Gao Date: Sat, 28 Jun 2025 21:03:22 +0800 Subject: [PATCH 11/28] Moves the implementation for the `/v1/audio/transcriptions` endpoint from `main_router.py` into `request.py`, align architectural pattern. Signed-off-by: David Gao --- src/vllm_router/routers/main_router.py | 132 +-------------- .../services/request_service/request.py | 150 ++++++++++++++++-- 2 files changed, 146 insertions(+), 136 deletions(-) diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index 1f13fc6f8..32777fabb 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -12,31 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import json -import time -import httpx from fastapi import ( APIRouter, BackgroundTasks, - File, - Form, - HTTPException, Request, - UploadFile, ) from fastapi.responses import JSONResponse, Response from vllm_router.dynamic_config import get_dynamic_config_watcher from vllm_router.log import init_logger from vllm_router.protocols import ModelCard, ModelList -from vllm_router.routers.routing_logic import get_routing_logic from vllm_router.service_discovery import get_service_discovery from vllm_router.services.request_service.request import ( route_general_request, route_sleep_wakeup_request, + route_general_transcriptions, ) from vllm_router.stats.engine_stats import get_engine_stats_scraper -from vllm_router.stats.request_stats import RequestStatsMonitor from vllm_router.version import __version__ try: @@ -138,7 +131,7 @@ async def show_version(): @main_router.get("/v1/models") async def show_models(): """ - Returns a list of all models available in the stack + Returns a list of all models available in the stack. Args: None @@ -247,121 +240,8 @@ async def health() -> Response: @main_router.post("/v1/audio/transcriptions") -async def audio_transcriptions( - file: UploadFile | None = File(None), - model: str = Form(...), - prompt: str | None = Form(None), - response_format: str | None = Form("json"), - temperature: float | None = Form(None), - language: str = Form("en"), +async def route_v1_audio_transcriptions( + request: Request, background_tasks: BackgroundTasks ): - - logger.debug("==== Enter audio_transcriptions ====") - logger.debug("Received upload: %s (%s)", file.filename, file.content_type) - logger.debug( - "Params: model=%s prompt=%r response_format=%r temperature=%r language=%s", - model, - prompt, - response_format, - temperature, - language, - ) - - # read file bytes - payload_bytes = await file.read() - files = { - "file": (file.filename, payload_bytes, file.content_type), - } - # logger.debug("=========files=========") - # logger.debug(files) - # logger.debug("=========files=========") - - data = { - "language": language, - } - - if prompt: - data["prompt"] = prompt - - if response_format: - data["response_format"] = response_format - - if temperature is not None: - data["temperature"] = str(temperature) - - logger.debug("==== data payload keys ====") - logger.debug(list(data.keys())) - logger.debug("==== data payload keys ====") - - # get the backend url - endpoints = get_service_discovery().get_endpoint_info() - - logger.debug("==== Total endpoints ====") - logger.debug(endpoints) - logger.debug("==== Total endpoints ====") - - # filter the endpoints url by model name for transcriptions - transcription_endpoints = [ - ep - for ep in endpoints - if model == ep.model_name and ep.model_label == "transcription" - ] - - logger.debug("====List of transcription endpoints====") - logger.debug(transcription_endpoints) - logger.debug("====List of transcription endpoints====") - - if not transcription_endpoints: - logger.error("No transcription backend available for model %s", model) - raise HTTPException( - status_code=503, detail=f"No transcription backend for model {model}" - ) - - # grab the current engin and request stats - engine_stats = get_engine_stats_scraper().get_engine_stats() - request_stats = RequestStatsMonitor().get_request_stats(time.time()) - router = get_routing_logic() - - # pick one using the router's configured logic (roundrobin, least-loaded, etc.) 
- chosen_url = router.route_request( - transcription_endpoints, - engine_stats, - request_stats, - # we don’t need to pass the original FastAPI Request object here, - # but you can if your routing logic looks at headers or body - None, - ) - - logger.info("Proxying transcription request to %s", chosen_url) - - # proxy the request - # by default httpx will only wait for 5 seconds, large audio transcriptions generally - # take longer than that - async with httpx.AsyncClient( - base_url=chosen_url, - timeout=httpx.Timeout( - connect=60.0, # connect timeout - read=300.0, # read timeout - write=30.0, # if you’re streaming uploads - pool=None, # no pool timeout - ), - ) as client: - logger.debug("Sending multipart to %s/v1/audio/transcriptions …", chosen_url) - proxied = await client.post("/v1/audio/transcriptions", data=data, files=files) - logger.info("Received %d from whisper backend", proxied.status_code) - - # return the whisper response unmodified - resp = proxied.json() - - logger.debug("Backend response headers: %s", proxied.headers) - logger.debug("Backend response body (truncated): %r", proxied.content[:200]) - - return JSONResponse( - content=resp, - status_code=proxied.status_code, - headers={ - k: v - for k, v in proxied.headers.items() - if k.lower() not in ("content-encoding", "transfer-encoding", "connection") - }, - ) + """Handles audio transcription requests.""" + return await route_general_transcriptions(request, "/v1/audio/transcriptions", background_tasks) diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py index 9842d1821..a3b05c61a 100644 --- a/src/vllm_router/services/request_service/request.py +++ b/src/vllm_router/services/request_service/request.py @@ -17,23 +17,24 @@ import os import time import uuid +from typing import Optional import httpx -from fastapi import BackgroundTasks, Request +from fastapi import BackgroundTasks, Request, UploadFile from fastapi.responses import JSONResponse, StreamingResponse from vllm_router.log import init_logger from vllm_router.routers.routing_logic import ( DisaggregatedPrefillRouter, KvawareRouter, - PrefixAwareRouter, + PrefixAwareRouter ) from vllm_router.service_discovery import get_service_discovery from vllm_router.services.request_service.rewriter import ( get_request_rewriter, is_request_rewriter_initialized, ) -from vllm_router.utils import replace_model_in_request_body, update_content_length +from vllm_router.utils import replace_model_in_request_body,update_content_length try: # Semantic cache integration @@ -314,9 +315,7 @@ async def route_general_request( async def send_request_to_prefiller( client: httpx.AsyncClient, endpoint: str, req_data: dict, request_id: str ): - """ - Send a request to a prefiller service. - """ + """Send a request to a prefiller service.""" req_data = req_data.copy() req_data["max_tokens"] = 1 if "max_completion_tokens" in req_data: @@ -335,9 +334,7 @@ async def send_request_to_prefiller( async def send_request_to_decode( client: httpx.AsyncClient, endpoint: str, req_data: dict, request_id: str ): - """ - Asynchronously stream the response from a service using a persistent client. 
- """ + """Asynchronously stream the response from a service using a persistent client.""" headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", "X-Request-Id": request_id, @@ -428,7 +425,7 @@ async def route_sleep_wakeup_request( } if VLLM_API_KEY := os.getenv("VLLM_API_KEY"): - logger.info(f"Using vllm server authentication") + logger.info("Using vllm server authentication") headers["Authorization"] = f"Bearer {VLLM_API_KEY}" url = server_url + endpoint @@ -458,3 +455,136 @@ async def route_sleep_wakeup_request( content={"status": "success"}, headers={"X-Request-Id": request_id}, ) + +async def route_general_transcriptions( + request: Request, + endpoint: str, # "/v1/audio/transcriptions" + background_tasks: BackgroundTasks, +): + """Handles audio transcription requests by parsing form data and proxying to backend.""" + + request_id = request.headers.get("X-Request-Id", str(uuid.uuid4())) + in_router_time = time.time() + + # --- 1. Form parsing --- + try: + form = await request.form() + + # Extract parameters from the form data + file: UploadFile = form["file"] + model: str = form["model"] + prompt: Optional[str] = form.get("prompt", None) + response_format: Optional[str] = form.get("response_format", "json") + temperature_str: Optional[str] = form.get("temperature", None) + temperature: Optional[float] = float(temperature_str) if temperature_str is not None else None + language: Optional[str] = form.get("language", "en") + except KeyError as e: + return JSONResponse(status_code=400, content={"error": f"Invalid request: missing '{e.args[0]}' in form data."}) + + logger.debug("==== Enter audio_transcriptions ====") + logger.debug("Received upload: %s (%s)", file.filename, file.content_type) + logger.debug( + "Params: model=%s prompt=%r response_format=%r temperature=%r language=%s", + model, + prompt, + response_format, + temperature, + language, + ) + + # --- 2. Service Discovery and Routing --- + # Access singletons via request.app.state for consistent style + service_discovery = get_service_discovery() # This one is often still accessed directly via its get function + router = request.app.state.router # Access router from app.state + engine_stats_scraper = request.app.state.engine_stats_scraper # Access engine_stats_scraper from app.state + request_stats_monitor = request.app.state.request_stats_monitor # Access request_stats_monitor from app.state + + + endpoints = service_discovery.get_endpoint_info() + + logger.debug("==== Total endpoints ====") + logger.debug(endpoints) + logger.debug("==== Total endpoints ====") + + # filter the endpoints url by model name and label for transcriptions + transcription_endpoints = [ + ep + for ep in endpoints + if model == ep.model_name and ep.model_label == "transcription" and not ep.sleep # Added ep.sleep == False + ] + + logger.debug("====List of transcription endpoints====") + logger.debug(transcription_endpoints) + logger.debug("====List of transcription endpoints====") + + if not transcription_endpoints: + logger.error("No transcription backend available for model %s", model) + return JSONResponse( + status_code=404, content={"error": f"No transcription backend for model {model}"} + ) + + # grab the current engine and request stats + engine_stats = engine_stats_scraper.get_engine_stats() + request_stats = request_stats_monitor.get_request_stats(time.time()) + + # pick one using the router's configured logic (roundrobin, least-loaded, etc.) 
+ chosen_url = router.route_request( + transcription_endpoints, + engine_stats, + request_stats, + request, + ) + + logger.info("Proxying transcription request to %s", chosen_url) + + # --- 3. Prepare and Proxy the Request --- + payload_bytes = await file.read() + files = {"file": (file.filename, payload_bytes, file.content_type)} + + data = {"model": model,"language": language} + + if prompt: + data["prompt"] = prompt + + if response_format: + data["response_format"] = response_format + + if temperature is not None: + data["temperature"] = str(temperature) + + logger.info("Proxying transcription request for model %s to %s", model, chosen_url) + + logger.debug("==== data payload keys ====") + logger.debug(list(data.keys())) + logger.debug("==== data payload keys ====") + + try: + async with request.app.state.httpx_client_wrapper() as client: + backend_response = await client.post( + f"{chosen_url}{endpoint}", + data=data, + files=files, + timeout=300.0 + ) + backend_response.raise_for_status() + + # --- 4. Return the response --- + response_content = backend_response.json() + headers = { + k: v + for k, v in backend_response.headers.items() + if k.lower() not in ("content-encoding", "transfer-encoding", "connection") + } + + headers["X-Request-Id"] = request_id + + return JSONResponse( + content=response_content, + status_code=backend_response.status_code, + headers=headers, + ) + except httpx.HTTPStatusError as e: + error_content = e.response.json() if "json" in e.response.headers.get("content-type", "") else e.response.text + return JSONResponse(status_code=e.response.status_code, content=error_content) + except httpx.RequestError as e: + return JSONResponse(status_code=503, content={"error": f"Failed to connect to backend: {e}"}) From 769e4f4166ca0b3862c217df7732e9259133da6f Mon Sep 17 00:00:00 2001 From: David Gao Date: Thu, 24 Jul 2025 22:26:51 +0800 Subject: [PATCH 12/28] add timeout to ensure health check will not hang indefinitely if a backend model becomes unresponsive Signed-off-by: David Gao --- src/vllm_router/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index 907c82867..a882681a5 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -197,6 +197,7 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool: f"{url}{model_details.value}", files=files, # multipart/form-data data=data, + timeout=10 ) else: # for other model types (chat, completion, etc.) 
@@ -204,6 +205,7 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool: f"{url}{model_details.value}", headers={"Content-Type": "application/json"}, json={"model": model} | model_details.get_test_payload(model_type), + timeout=10 ) response.raise_for_status() From dc0f2d2e82c98f361675767d077c968efe8c67d6 Mon Sep 17 00:00:00 2001 From: David Gao Date: Thu, 24 Jul 2025 23:04:27 +0800 Subject: [PATCH 13/28] add boolean model health check return for non-transcription model Signed-off-by: David Gao --- src/vllm_router/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index a882681a5..2299c9872 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -214,6 +214,7 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool: return True else: response.json() # verify it's valid json for other model types + return True # validation passed except requests.exceptions.RequestException as e: logger.warning(f"{model_type} model {model} at {url} not healthy: {e}") From a57de3d71d3671eb3acf31d817791f892d3ffe9b Mon Sep 17 00:00:00 2001 From: David Gao Date: Thu, 24 Jul 2025 23:37:05 +0800 Subject: [PATCH 14/28] remove redundant warning log since handled in outer 'StaticServiceDiscovery.get_unhealthy_endpoint_hashes' Signed-off-by: David Gao --- src/vllm_router/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index 2299c9872..8b596d3ea 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -217,7 +217,6 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool: return True # validation passed except requests.exceptions.RequestException as e: - logger.warning(f"{model_type} model {model} at {url} not healthy: {e}") return False except json.JSONDecodeError as e: From adcc64f28b334b2e15742891028e5f9d685575e8 Mon Sep 17 00:00:00 2001 From: David Gao Date: Fri, 25 Jul 2025 00:24:13 +0800 Subject: [PATCH 15/28] remove redundant JSONDecodeError catch and downgrade RequestException log to debug, align with service discovery's warning Signed-off-by: David Gao --- src/vllm_router/utils.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index 8b596d3ea..bdd3af02e 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -217,10 +217,5 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool: return True # validation passed except requests.exceptions.RequestException as e: - return False - - except json.JSONDecodeError as e: - logger.error( - f"Failed to decode JSON from {model_type} model {model} at {url}: {e}" - ) + logger.debug(f"{model_type} Model {model} at {url} is not healthy: {e}") return False From f1522fc28f92bdeaecded0bbffc6720ebb7b474a Mon Sep 17 00:00:00 2001 From: David Gao Date: Thu, 31 Jul 2025 01:11:09 +0800 Subject: [PATCH 16/28] Chore: Apply auto-formatting and linting fixes via pre-commit Signed-off-by: David Gao --- src/vllm_router/routers/main_router.py | 6 +- .../services/request_service/request.py | 66 ++++++++++++------- src/vllm_router/utils.py | 4 +- 3 files changed, 47 insertions(+), 29 deletions(-) diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index 999868e03..5d77124dd 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -26,8 +26,8 @@ from vllm_router.service_discovery import get_service_discovery from 
vllm_router.services.request_service.request import ( route_general_request, - route_sleep_wakeup_request, route_general_transcriptions, + route_sleep_wakeup_request, ) from vllm_router.stats.engine_stats import get_engine_stats_scraper from vllm_router.version import __version__ @@ -241,4 +241,6 @@ async def route_v1_audio_transcriptions( request: Request, background_tasks: BackgroundTasks ): """Handles audio transcription requests.""" - return await route_general_transcriptions(request, "/v1/audio/transcriptions", background_tasks) + return await route_general_transcriptions( + request, "/v1/audio/transcriptions", background_tasks + ) diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py index e5565b4a3..9ae8c9e26 100644 --- a/src/vllm_router/services/request_service/request.py +++ b/src/vllm_router/services/request_service/request.py @@ -20,8 +20,7 @@ from typing import Optional import httpx - -from fastapi import BackgroundTasks, Request, UploadFile, HTTPException +from fastapi import BackgroundTasks, HTTPException, Request, UploadFile from fastapi.responses import JSONResponse, StreamingResponse from requests import JSONDecodeError @@ -29,14 +28,14 @@ from vllm_router.routers.routing_logic import ( DisaggregatedPrefillRouter, KvawareRouter, - PrefixAwareRouter + PrefixAwareRouter, ) from vllm_router.service_discovery import get_service_discovery from vllm_router.services.request_service.rewriter import ( get_request_rewriter, is_request_rewriter_initialized, ) -from vllm_router.utils import replace_model_in_request_body,update_content_length +from vllm_router.utils import replace_model_in_request_body, update_content_length try: # Semantic cache integration @@ -511,8 +510,9 @@ async def route_sleep_wakeup_request( headers={"X-Request-Id": request_id}, ) + async def route_general_transcriptions( - request: Request, + request: Request, endpoint: str, # "/v1/audio/transcriptions" background_tasks: BackgroundTasks, ): @@ -531,10 +531,15 @@ async def route_general_transcriptions( prompt: Optional[str] = form.get("prompt", None) response_format: Optional[str] = form.get("response_format", "json") temperature_str: Optional[str] = form.get("temperature", None) - temperature: Optional[float] = float(temperature_str) if temperature_str is not None else None + temperature: Optional[float] = ( + float(temperature_str) if temperature_str is not None else None + ) language: Optional[str] = form.get("language", "en") except KeyError as e: - return JSONResponse(status_code=400, content={"error": f"Invalid request: missing '{e.args[0]}' in form data."}) + return JSONResponse( + status_code=400, + content={"error": f"Invalid request: missing '{e.args[0]}' in form data."}, + ) logger.debug("==== Enter audio_transcriptions ====") logger.debug("Received upload: %s (%s)", file.filename, file.content_type) @@ -549,11 +554,16 @@ async def route_general_transcriptions( # --- 2. 
Service Discovery and Routing --- # Access singletons via request.app.state for consistent style - service_discovery = get_service_discovery() # This one is often still accessed directly via its get function - router = request.app.state.router # Access router from app.state - engine_stats_scraper = request.app.state.engine_stats_scraper # Access engine_stats_scraper from app.state - request_stats_monitor = request.app.state.request_stats_monitor # Access request_stats_monitor from app.state - + service_discovery = ( + get_service_discovery() + ) # This one is often still accessed directly via its get function + router = request.app.state.router # Access router from app.state + engine_stats_scraper = ( + request.app.state.engine_stats_scraper + ) # Access engine_stats_scraper from app.state + request_stats_monitor = ( + request.app.state.request_stats_monitor + ) # Access request_stats_monitor from app.state endpoints = service_discovery.get_endpoint_info() @@ -565,7 +575,9 @@ async def route_general_transcriptions( transcription_endpoints = [ ep for ep in endpoints - if model == ep.model_name and ep.model_label == "transcription" and not ep.sleep # Added ep.sleep == False + if model == ep.model_name + and ep.model_label == "transcription" + and not ep.sleep # Added ep.sleep == False ] logger.debug("====List of transcription endpoints====") @@ -575,7 +587,8 @@ async def route_general_transcriptions( if not transcription_endpoints: logger.error("No transcription backend available for model %s", model) return JSONResponse( - status_code=404, content={"error": f"No transcription backend for model {model}"} + status_code=404, + content={"error": f"No transcription backend for model {model}"}, ) # grab the current engine and request stats @@ -596,7 +609,7 @@ async def route_general_transcriptions( payload_bytes = await file.read() files = {"file": (file.filename, payload_bytes, file.content_type)} - data = {"model": model,"language": language} + data = {"model": model, "language": language} if prompt: data["prompt"] = prompt @@ -616,19 +629,16 @@ async def route_general_transcriptions( try: async with request.app.state.httpx_client_wrapper() as client: backend_response = await client.post( - f"{chosen_url}{endpoint}", - data=data, - files=files, - timeout=300.0 - ) + f"{chosen_url}{endpoint}", data=data, files=files, timeout=300.0 + ) backend_response.raise_for_status() # --- 4. 
Return the response ---
         response_content = backend_response.json()
         headers = {
-            k: v
-            for k, v in backend_response.headers.items()
-            if k.lower() not in ("content-encoding", "transfer-encoding", "connection")
+            k: v
+            for k, v in backend_response.headers.items()
+            if k.lower() not in ("content-encoding", "transfer-encoding", "connection")
         }
         headers["X-Request-Id"] = request_id
@@ -639,7 +649,13 @@ async def route_general_transcriptions(
             headers=headers,
         )
     except httpx.HTTPStatusError as e:
-        error_content = e.response.json() if "json" in e.response.headers.get("content-type", "") else e.response.text
+        error_content = (
+            e.response.json()
+            if "json" in e.response.headers.get("content-type", "")
+            else e.response.text
+        )
         return JSONResponse(status_code=e.response.status_code, content=error_content)
     except httpx.RequestError as e:
-        return JSONResponse(status_code=503, content={"error": f"Failed to connect to backend: {e}"})
+        return JSONResponse(
+            status_code=503, content={"error": f"Failed to connect to backend: {e}"}
+        )
diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py
index b831c5ca3..7f580c0c0 100644
--- a/src/vllm_router/utils.py
+++ b/src/vllm_router/utils.py
@@ -200,7 +200,7 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool:
                 f"{url}{model_details.value}",
                 files=files,  # multipart/form-data
                 data=data,
-                timeout=10
+                timeout=10,
             )
         else:  # for other model types (chat, completion, etc.)
@@ -208,7 +208,7 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool:
                 f"{url}{model_details.value}",
                 headers={"Content-Type": "application/json"},
                 json={"model": model} | model_details.get_test_payload(model_type),
-                timeout=10
+                timeout=10,
             )
         response.raise_for_status()

From 73d58179d176afe9b560e40f34a0b0301d4ca4dd Mon Sep 17 00:00:00 2001
From: David Gao
Date: Thu, 31 Jul 2025 01:16:07 +0800
Subject: [PATCH 17/28] refactor: use more meaningful comments for silent WAV bytes generation

Signed-off-by: David Gao
---
 src/vllm_router/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py
index 7f580c0c0..d071cd952 100644
--- a/src/vllm_router/utils.py
+++ b/src/vllm_router/utils.py
@@ -15,8 +15,8 @@
 logger = init_logger(__name__)

-# prepare a WAV byte to prevent repeatedly generating it
 # Generate a 0.1 second silent audio file
+# This will be used for the /v1/audio/transcriptions endpoint
 _SILENT_WAV_BYTES = None
 with io.BytesIO() as wav_buffer:
     with wave.open(wav_buffer, "wb") as wf:
@@ -96,7 +96,6 @@ def get_test_payload(model_type: str):
         case ModelType.score:
             return {"encoding_format": "float", "text_1": "Test", "test_2": "Test2"}
         case ModelType.transcription:
-            # Generate a 0.1 second silent audio file
             if _SILENT_WAV_BYTES is not None:
                 logger.debug("=====Silent WAV Bytes is being used=====")
                 return {

From ba769ee53a6975f301b4968bc43d4739f2e18f50 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Thu, 31 Jul 2025 02:05:04 +0800
Subject: [PATCH 18/28] refactor: keep the comment explaining why the silent WAV bytes are generated

Signed-off-by: David Gao
---
 src/vllm_router/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py
index d071cd952..6b121cb6e 100644
--- a/src/vllm_router/utils.py
+++ b/src/vllm_router/utils.py
@@ -15,6 +15,7 @@
 logger = init_logger(__name__)

+# prepare a WAV byte to prevent repeatedly generating it
 # Generate a 0.1 second silent audio file
 # This will be used for the /v1/audio/transcriptions endpoint
 _SILENT_WAV_BYTES = None

From 996c653e9131354b38a444fb735fe1b47e61d3e7 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Tue, 12 Aug 2025 00:31:34 +0800
Subject: [PATCH 19/28] fix(tests): improve mock in model health check test

The mock for `requests.post` in `test_is_model_healthy` did not correctly
simulate an `HTTPError` on a non-200 response. This change configures the
mock's `raise_for_status` method to raise the appropriate exception, ensuring
the test now accurately validates the function's error handling logic.

Signed-off-by: David Gao
---
 src/tests/test_utils.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/tests/test_utils.py b/src/tests/test_utils.py
index dc352b930..5737beda6 100644
--- a/src/tests/test_utils.py
+++ b/src/tests/test_utils.py
@@ -104,6 +104,14 @@ def test_is_model_healthy_when_requests_raises_exception_returns_false(
 def test_is_model_healthy_when_requests_status_with_status_code_not_200_returns_false(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    request_mock = MagicMock(return_value=MagicMock(status_code=500))
+
+    # Mock an internal server error response
+    mock_response = MagicMock(status_code=500)
+
+    # Tell the mock to raise an HTTP Error when raise_for_status() is called
+    mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError
+
+    request_mock = MagicMock(return_value=mock_response)
     monkeypatch.setattr("requests.post", request_mock)
+
     assert utils.is_model_healthy("http://localhost", "test", "chat") is False

From f7ef2eb067835a443e944d96a898b7c914eb0ac6 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Tue, 12 Aug 2025 16:15:35 +0800
Subject: [PATCH 20/28] chore: apply auto-formatting and linting fixes via pre-commit

Signed-off-by: David Gao
---
 src/tests/test_utils.py                             | 4 ++--
 src/vllm_router/services/request_service/request.py | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/tests/test_utils.py b/src/tests/test_utils.py
index 5737beda6..fb5e0afa2 100644
--- a/src/tests/test_utils.py
+++ b/src/tests/test_utils.py
@@ -104,10 +104,10 @@ def test_is_model_healthy_when_requests_status_with_status_code_not_200_returns_false(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-
+
     # Mock an internal server error response
     mock_response = MagicMock(status_code=500)
-
+
     # Tell the mock to raise an HTTP Error when raise_for_status() is called
     mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError

diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py
index 58caaecb7..b7f99f637 100644
--- a/src/vllm_router/services/request_service/request.py
+++ b/src/vllm_router/services/request_service/request.py
@@ -19,10 +19,9 @@
 import uuid
 from typing import Optional

-
+import aiohttp
 import httpx
 from fastapi import BackgroundTasks, HTTPException, Request, UploadFile
-import aiohttp
 from fastapi.responses import JSONResponse, StreamingResponse
 from requests import JSONDecodeError

From 3c66f96a64560108fb282e9154405d295b901d42 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Wed, 13 Aug 2025 15:24:33 +0800
Subject: [PATCH 21/28] chore: remove unused var `in_router_time`

Signed-off-by: David Gao
---
 src/vllm_router/services/request_service/request.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py
index b7f99f637..13abdd81c 100644
--- a/src/vllm_router/services/request_service/request.py
+++ b/src/vllm_router/services/request_service/request.py
@@ -520,7 +520,6 @@ async def route_general_transcriptions(
     """Handles audio transcription requests by parsing form data and proxying to backend."""
     request_id = request.headers.get("X-Request-Id", str(uuid.uuid4()))
-    in_router_time = time.time()

     # --- 1. Form parsing ---
     try:

From 6ac4661427f46cb6990e6bf7a391a35820edd077 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Fri, 15 Aug 2025 01:23:52 +0800
Subject: [PATCH 22/28] fix(deps): add httpx as an explicit dependency

The CI/CD workflow was failing with a `ModuleNotFoundError` because `httpx`
was not an explicit dependency in `pyproject.toml` and was not being installed
in the clean Docker environment.

Signed-off-by: David Gao
---
 pyproject.toml                   | 1 +
 src/vllm_router/requirements.txt | 1 +
 2 files changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index dfc923313..b44455930 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
     "xxhash==3.5.0",
     "psutil==7.0.0",
     "pyyaml>=6.0.2",
+    "httpx==0.28.1",
 ]

 [project.scripts]
diff --git a/src/vllm_router/requirements.txt b/src/vllm_router/requirements.txt
index 4861ba4be..a0f40ffcd 100644
--- a/src/vllm_router/requirements.txt
+++ b/src/vllm_router/requirements.txt
@@ -10,3 +10,4 @@ sentry-sdk[fastapi]==2.27.0
 uhashring==2.3
 uvicorn==0.34.0
 xxhash==3.5.0
+httpx==0.28.1

From 37e33caf9c94a609af61a32bbe663f73beab3e98 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Fri, 15 Aug 2025 01:56:08 +0800
Subject: [PATCH 23/28] chore: reorder dependencies after running pre-commit

Signed-off-by: David Gao
---
 src/vllm_router/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/vllm_router/requirements.txt b/src/vllm_router/requirements.txt
index a0f40ffcd..02a88aca9 100644
--- a/src/vllm_router/requirements.txt
+++ b/src/vllm_router/requirements.txt
@@ -1,6 +1,7 @@
 aiofiles==24.1.0
 aiohttp==3.9.5
 fastapi==0.115.8
+httpx==0.28.1
 kubernetes==32.0.0
 numpy==1.26.4
 prometheus_client==0.21.1
@@ -10,4 +11,3 @@ sentry-sdk[fastapi]==2.27.0
 uhashring==2.3
 uvicorn==0.34.0
 xxhash==3.5.0
-httpx==0.28.1

From 114895bef18e05f5cc9c643f4369a98126123b5c Mon Sep 17 00:00:00 2001
From: David Gao
Date: Fri, 22 Aug 2025 16:11:29 +0800
Subject: [PATCH 24/28] refactor: migrate from httpx to aiohttp for improved concurrency

Replaced httpx with aiohttp for better asynchronous performance and resource
utilization. Fixed a JSON syntax error in error response handling.
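
A rough sketch of the new request path (simplified from the diff below; `client`
stands for the shared session returned by `request.app.state.aiohttp_client_wrapper()`):

```python
form_data = aiohttp.FormData()
form_data.add_field("file", payload_bytes, filename=file.filename, content_type=file.content_type)
form_data.add_field("model", model)

backend_response = await client.post(
    f"{chosen_url}{endpoint}",
    data=form_data,
    timeout=aiohttp.ClientTimeout(total=300),  # generous timeout for long transcriptions
)
response_content = await backend_response.json()  # aiohttp's json() is a coroutine
```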
Signed-off-by: David Gao
---
 pyproject.toml                                      |  1 -
 src/vllm_router/requirements.txt                    |  1 -
 .../services/request_service/request.py             | 47 +++++----
 tutorials/23-whisper-api-transcription.md           | 99 +++++++++++++++++++
 4 files changed, 129 insertions(+), 19 deletions(-)
 create mode 100644 tutorials/23-whisper-api-transcription.md

diff --git a/pyproject.toml b/pyproject.toml
index b44455930..dfc923313 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,6 @@ dependencies = [
     "xxhash==3.5.0",
     "psutil==7.0.0",
     "pyyaml>=6.0.2",
-    "httpx==0.28.1",
 ]

 [project.scripts]
diff --git a/src/vllm_router/requirements.txt b/src/vllm_router/requirements.txt
index 02a88aca9..4861ba4be 100644
--- a/src/vllm_router/requirements.txt
+++ b/src/vllm_router/requirements.txt
@@ -1,7 +1,6 @@
 aiofiles==24.1.0
 aiohttp==3.9.5
 fastapi==0.115.8
-httpx==0.28.1
 kubernetes==32.0.0
 numpy==1.26.4
 prometheus_client==0.21.1
diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py
index 1debc28f7..9b44fe7a7 100644
--- a/src/vllm_router/services/request_service/request.py
+++ b/src/vllm_router/services/request_service/request.py
@@ -20,7 +20,6 @@
 from typing import Optional

 import aiohttp
-import httpx
 from fastapi import BackgroundTasks, HTTPException, Request, UploadFile
 from fastapi.responses import JSONResponse, StreamingResponse
 from requests import JSONDecodeError
@@ -627,14 +626,26 @@ async def route_general_transcriptions(
     logger.debug("==== data payload keys ====")

     try:
-        async with request.app.state.httpx_client_wrapper() as client:
-            backend_response = await client.post(
-                f"{chosen_url}{endpoint}", data=data, files=files, timeout=300.0
-            )
-            backend_response.raise_for_status()
+        client = request.app.state.aiohttp_client_wrapper()
+
+        form_data = aiohttp.FormData()
+
+        # add file data
+        for key, (filename, content, content_type) in files.items():
+            form_data.add_field(key, content, filename=filename, content_type=content_type)
+
+        # add form data
+        for key, value in data.items():
+            form_data.add_field(key,value)
+
+        backend_response = await client.post(
+            f"{chosen_url}{endpoint}",
+            data=form_data,
+            timeout=aiohttp.ClientTimeout(total=300)
+        )

         # --- 4. Return the response ---
-        response_content = backend_response.json()
+        response_content = await backend_response.json()
         headers = {
             k: v
             for k, v in backend_response.headers.items()
@@ -645,17 +656,19 @@ async def route_general_transcriptions(
         return JSONResponse(
             content=response_content,
-            status_code=backend_response.status_code,
+            status_code=backend_response.status,
             headers=headers,
         )
-    except httpx.HTTPStatusError as e:
-        error_content = (
-            e.response.json()
-            if "json" in e.response.headers.get("content-type", "")
-            else e.response.text
-        )
-        return JSONResponse(status_code=e.response.status_code, content=error_content)
-    except httpx.RequestError as e:
+    except aiohttp.ClientResponseError as e:
+        if hasattr(e, "response") and e.response is not None:
+            try:
+                error_content = await e.response.json()
+            except:
+                error_content = await e.response.text()
+        else:
+            error_content = {"error": f"HTTP {e.status}: {e.message}"}
+        return JSONResponse(status_code=e.status, content=error_content)
+    except aiohttp.ClientError as e:
         return JSONResponse(
-            status_code=503, content={"error": f"Failed to connect to backend: {e}"}
+            status_code=503, content={"error": f"Failed to connect to backend: {str(e)}"}
         )
diff --git a/tutorials/23-whisper-api-transcription.md b/tutorials/23-whisper-api-transcription.md
new file mode 100644
index 000000000..a09281916
--- /dev/null
+++ b/tutorials/23-whisper-api-transcription.md
@@ -0,0 +1,99 @@
+# Tutorial: Whisper Transcription API in vLLM Production Stack
+
+## Overview
+
+This tutorial introduces the newly added `/v1/audio/transcriptions` endpoint in the `vllm-router`, enabling users to transcribe `.wav` audio files using OpenAI's `whisper-small` model.
+
+## Prerequisites
+
+* Access to a machine with a GPU (e.g., via [RunPod](https://runpod.io/))
+* Python 3.12 environment (recommended with `uv`)
+* `vllm` and `production-stack` cloned and installed
+* `vllm` installed with audio support:
+
+  ```bash
+  pip install vllm[audio]
+  ```
+
+## 1. Serving the Whisper Model
+
+Start a vLLM backend with the `whisper-small` model:
+
+```bash
+vllm serve \
+  --task transcription openai/whisper-small \
+  --host 0.0.0.0 --port 8002
+```
+
+## 2. Running the Router
+
+Create and run a router connected to the Whisper backend:
+
+```bash
+#!/bin/bash
+if [[ $# -ne 2 ]]; then
+  echo "Usage: $0 <port> <backend_url>"
+  exit 1
+fi
+
+uv run python3 -m vllm_router.app \
+  --host 0.0.0.0 --port "$1" \
+  --service-discovery static \
+  --static-backends "$2" \
+  --static-models "openai/whisper-small" \
+  --static-model-types "transcription" \
+  --routing-logic roundrobin \
+  --log-stats \
+  --engine-stats-interval 10 \
+  --request-stats-window 10
+```
+
+Example usage:
+
+```bash
+./run-router.sh 8000 http://localhost:8002
+```
+
+## 3. Sending a Transcription Request
+
+Use `curl` to send a `.wav` file to the transcription endpoint:
+
+* You can test with any `.wav` audio file of your choice.
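+
+If you do not have a recording handy, the short sketch below uses only Python's
+standard `wave` module to create a silent test clip (the router's own health check
+sends a similar silent WAV to transcription backends):
+
+```python
+import wave
+
+# Write 0.5 seconds of silence as a 16 kHz, mono, 16-bit PCM WAV file
+with wave.open("audio.wav", "wb") as wf:
+    wf.setnchannels(1)        # mono
+    wf.setsampwidth(2)        # 16-bit samples
+    wf.setframerate(16000)    # 16 kHz sample rate
+    wf.writeframes(b"\x00\x00" * 8000)  # 8000 frames of silence = 0.5 s
+```
+
+Then send the file to the router: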
+
+```bash
+curl -v http://localhost:8000/v1/audio/transcriptions \
+  -F 'file=@/path/to/audio.wav;type=audio/wav' \
+  -F 'model=openai/whisper-small' \
+  -F 'response_format=json' \
+  -F 'language=en'
+```
+
+### Supported Parameters
+
+| Parameter         | Description                                             |
+| ----------------- | ------------------------------------------------------- |
+| `file`            | Path to a `.wav` audio file                             |
+| `model`           | Whisper model to use (e.g., `openai/whisper-small`)     |
+| `prompt`          | *(Optional)* Text prompt to guide the transcription     |
+| `response_format` | One of `json`, `text`, `srt`, `verbose_json`, or `vtt`  |
+| `temperature`     | *(Optional)* Sampling temperature as a float            |
+| `language`        | ISO 639-1 code (e.g., `en`, `fr`, `zh`)                 |
+
+## 4. Sample Output
+
+```json
+{
+  "text": "Testing testing testing the whisper small model testing testing testing the audio transcription function testing testing testing the whisper small model"
+}
+```
+
+## 5. Notes
+
+* Router uses extended aiohttp timeouts to support long transcription jobs.
+* This implementation dynamically discovers valid transcription backends and routes requests accordingly.
+
+## 6. Resources
+
+* [PR #469 – Add Whisper Transcription API](https://github.com/vllm-project/production-stack/pull/469)
+* [OpenAI Whisper GitHub](https://github.com/openai/whisper)
+* [Blog: vLLM Whisper Transcription Walkthrough](https://davidgao7.github.io/posts/vllm-v1-whisper-transcription/)

From 12fcb2a277c188dd9576ece9a588b44162a4344c Mon Sep 17 00:00:00 2001
From: David Gao
Date: Fri, 22 Aug 2025 16:33:21 +0800
Subject: [PATCH 25/28] chore: remove wrong tutorial file

Signed-off-by: David Gao
---
 tutorials/17-whisper-api-transcription.md | 99 -----------------------
 1 file changed, 99 deletions(-)
 delete mode 100644 tutorials/17-whisper-api-transcription.md

diff --git a/tutorials/17-whisper-api-transcription.md b/tutorials/17-whisper-api-transcription.md
deleted file mode 100644
index e3834bc69..000000000
--- a/tutorials/17-whisper-api-transcription.md
+++ /dev/null
@@ -1,99 +0,0 @@
-# Tutorial: Whisper Transcription API in vLLM Production Stack
-
-## Overview
-
-This tutorial introduces the newly added `/v1/audio/transcriptions` endpoint in the `vllm-router`, enabling users to transcribe `.wav` audio files using OpenAI's `whisper-small` model.
-
-## Prerequisites
-
-* Access to a machine with a GPU (e.g. via [RunPod](https://runpod.io/))
-* Python 3.12 environment (recommended with `uv`)
-* `vllm` and `production-stack` cloned and installed
-* `vllm` installed with audio support:
-
-  ```bash
-  pip install vllm[audio]
-  ```
-
-## 1. Serving the Whisper Model
-
-Start a vLLM backend with the `whisper-small` model:
-
-```bash
-vllm serve \
-  --task transcription openai/whisper-small \
-  --host 0.0.0.0 --port 8002
-```
-
-## 2. Running the Router
-
-Create and run a router connected to the Whisper backend:
-
-```bash
-#!/bin/bash
-if [[ $# -ne 2 ]]; then
-  echo "Usage: $0 "
-  exit 1
-fi
-
-uv run python3 -m vllm_router.app \
-  --host 0.0.0.0 --port "$1" \
-  --service-discovery static \
-  --static-backends "$2" \
-  --static-models "openai/whisper-small" \
-  --static-model-types "transcription" \
-  --routing-logic roundrobin \
-  --log-stats \
-  --engine-stats-interval 10 \
-  --request-stats-window 10
-```
-
-Example usage:
-
-```bash
-./run-router.sh 8000 http://localhost:8002
-```
-
-## 3. Sending a Transcription Request
-
-Use `curl` to send a `.wav` file to the transcription endpoint:
-
-* You can test with any `.wav` audio file of your choice.
-
-```bash
-curl -v http://localhost:8000/v1/audio/transcriptions \
-  -F 'file=@/path/to/audio.wav;type=audio/wav' \
-  -F 'model=openai/whisper-small' \
-  -F 'response_format=json' \
-  -F 'language=en'
-```
-
-### Supported Parameters
-
-| Parameter         | Description                                             |
-| ----------------- | ------------------------------------------------------- |
-| `file`            | Path to a `.wav` audio file                             |
-| `model`           | Whisper model to use (e.g., `openai/whisper-small`)     |
-| `prompt`          | *(Optional)* Text prompt to guide the transcription     |
-| `response_format` | One of `json`, `text`, `srt`, `verbose_json`, or `vtt`  |
-| `temperature`     | *(Optional)* Sampling temperature as a float            |
-| `language`        | ISO 639-1 code (e.g., `en`, `fr`, `zh`)                 |
-
-## 4. Sample Output
-
-```json
-{
-  "text": "Testing testing testing the whisper small model testing testing testing the audio transcription function testing testing testing the whisper small model"
-}
-```
-
-## 5. Notes
-
-* Router uses extended HTTPX timeouts to support long transcription jobs.
-* This implementation dynamically discovers valid transcription backends and routes requests accordingly.
-
-## 6. Resources
-
-* [PR #469 – Add Whisper Transcription API](https://github.com/vllm-project/production-stack/pull/469)
-* [OpenAI Whisper GitHub](https://github.com/openai/whisper)
-* [Blog: vLLM Whisper Transcription Walkthrough](https://davidgao7.github.io/posts/vllm-v1-whisper-transcription/)

From 6c72f81d693b10225044ad514c387f0b52ad97d5 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Fri, 22 Aug 2025 17:05:29 +0800
Subject: [PATCH 26/28] chore: apply pre-commit

Signed-off-by: David Gao
---
 src/vllm_router/services/request_service/request.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py
index 9b44fe7a7..53654cf85 100644
--- a/src/vllm_router/services/request_service/request.py
+++ b/src/vllm_router/services/request_service/request.py
@@ -632,16 +632,18 @@ async def route_general_transcriptions(
         # add file data
         for key, (filename, content, content_type) in files.items():
-            form_data.add_field(key, content, filename=filename, content_type=content_type)
+            form_data.add_field(
+                key, content, filename=filename, content_type=content_type
+            )

         # add form data
         for key, value in data.items():
-            form_data.add_field(key,value)
+            form_data.add_field(key, value)

         backend_response = await client.post(
             f"{chosen_url}{endpoint}",
             data=form_data,
-            timeout=aiohttp.ClientTimeout(total=300)
+            timeout=aiohttp.ClientTimeout(total=300),
         )

         # --- 4. Return the response ---
@@ -670,5 +672,6 @@ async def route_general_transcriptions(
         return JSONResponse(status_code=e.status, content=error_content)
     except aiohttp.ClientError as e:
         return JSONResponse(
-            status_code=503, content={"error": f"Failed to connect to backend: {str(e)}"}
+            status_code=503,
+            content={"error": f"Failed to connect to backend: {str(e)}"},
         )

From 1a1c985c605021500f4a2adeef9e638f6b51a213 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Sat, 23 Aug 2025 01:08:14 +0800
Subject: [PATCH 27/28] chore: use debug-level logging for the proxying message

Signed-off-by: David Gao
---
 src/vllm_router/services/request_service/request.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py
index 53654cf85..f4ea337fa 100644
--- a/src/vllm_router/services/request_service/request.py
+++ b/src/vllm_router/services/request_service/request.py
@@ -602,7 +602,7 @@ async def route_general_transcriptions(
         request,
     )

-    logger.info("Proxying transcription request to %s", chosen_url)
+    logger.debug("Proxying transcription request to %s", chosen_url)

     # --- 3. Prepare and Proxy the Request ---
     payload_bytes = await file.read()

From 1d1b8288806299f59e73aa865935cddcc8d09bd8 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Sat, 23 Aug 2025 01:54:40 +0800
Subject: [PATCH 28/28] chore: use more specific exception handling for aiohttp

Signed-off-by: David Gao
---
 .../services/request_service/request.py | 31 +++++++++++++------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py
index f4ea337fa..1c4147d4d 100644
--- a/src/vllm_router/services/request_service/request.py
+++ b/src/vllm_router/services/request_service/request.py
@@ -661,17 +661,30 @@ async def route_general_transcriptions(
             status_code=backend_response.status,
             headers=headers,
         )
-    except aiohttp.ClientResponseError as e:
-        if hasattr(e, "response") and e.response is not None:
+    except aiohttp.ClientResponseError as response_error:
+        if response_error.response is not None:
             try:
-                error_content = await e.response.json()
-            except:
-                error_content = await e.response.text()
+                error_content = await response_error.response.json()
+            except (
+                aiohttp.ContentTypeError,
+                json.JSONDecodeError,
+                aiohttp.ClientError,
+            ):
+                # If JSON parsing fails, get text content
+                try:
+                    text_content = await response_error.response.text()
+                    error_content = {"error": text_content}
+                except aiohttp.ClientError:
+                    error_content = {
+                        "error": f"HTTP {response_error.status}: {response_error.message}"
+                    }
         else:
-            error_content = {"error": f"HTTP {e.status}: {e.message}"}
-        return JSONResponse(status_code=e.status, content=error_content)
-    except aiohttp.ClientError as e:
+            error_content = {
+                "error": f"HTTP {response_error.status}: {response_error.message}"
+            }
+        return JSONResponse(status_code=response_error.status, content=error_content)
+    except aiohttp.ClientError as client_error:
         return JSONResponse(
             status_code=503,
-            content={"error": f"Failed to connect to backend: {str(e)}"},
+            content={"error": f"Failed to connect to backend: {str(client_error)}"},
        )