From 280b8b1cc892478f86d2a33e867cc515e955e313 Mon Sep 17 00:00:00 2001 From: David Gao Date: Fri, 30 May 2025 02:34:48 -0400 Subject: [PATCH 01/28] [feat]: add transcription API endpoint using OpenAI Whisper-small Signed-off-by: David Gao --- src/vllm_router/routers/main_router.py | 151 ++++++++++++++++++++++++- src/vllm_router/run-router.sh | 42 ++++--- src/vllm_router/utils.py | 6 + 3 files changed, 182 insertions(+), 17 deletions(-) diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index e4e77b018..e458ae812 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -12,19 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. import json - -from fastapi import APIRouter, BackgroundTasks, Request +import time + +import httpx +from fastapi import ( + APIRouter, + BackgroundTasks, + File, + Form, + HTTPException, + Request, + UploadFile, +) from fastapi.responses import JSONResponse, Response from vllm_router.dynamic_config import get_dynamic_config_watcher from vllm_router.log import init_logger from vllm_router.protocols import ModelCard, ModelList +from vllm_router.routers.routing_logic import get_routing_logic from vllm_router.service_discovery import get_service_discovery from vllm_router.services.request_service.request import ( route_general_request, route_sleep_wakeup_request, ) from vllm_router.stats.engine_stats import get_engine_stats_scraper +from vllm_router.stats.request_stats import RequestStatsMonitor from vllm_router.version import __version__ try: @@ -198,8 +210,7 @@ async def get_engine_instances(): @main_router.get("/health") async def health() -> Response: - """ - Endpoint to check the health status of various components. + """Endpoint to check the health status of various components. This function verifies the health of the service discovery module and the engine stats scraper. 
If either component is down, it returns a @@ -232,3 +243,135 @@ async def health() -> Response: ) else: return JSONResponse(content={"status": "healthy"}, status_code=200) + + +@main_router.post("/v1/audio/transcriptions") +async def audio_transcriptions( + file: UploadFile = File(...), + model: str = Form(...), + prompt: str | None = Form(None), + response_format: str | None = Form("json"), + temperature: float | None = Form(None), + language: str = Form("en"), +): + + logger.debug("==== Enter audio_transcriptions ====") + logger.debug("Received upload: %s (%s)", file.filename, file.content_type) + logger.debug( + "Params: model=%s prompt=%r response_format=%r temperature=%r language=%s", + model, + prompt, + response_format, + temperature, + language, + ) + + # read file bytes + payload_bytes = await file.read() + files = { + "file": (file.filename, payload_bytes, file.content_type), + } + # logger.debug("=========files=========") + # logger.debug(files) + # logger.debug("=========files=========") + + data = { + "model": model, + "language": language, + } + + if prompt: + data["prompt"] = prompt + + if response_format: + data["response_format"] = response_format + + if temperature is not None: + data["temperature"] = str(temperature) + + logger.debug("==== data payload keys ====") + logger.debug(list(data.keys())) + logger.debug("==== data payload keys ====") + + # get the backend url + endpoints = get_service_discovery().get_endpoint_info() + + logger.debug("==== Total endpoints ====") + logger.debug(endpoints) + logger.debug("==== Total endpoints ====") + + # TODO: right now is skipping label check in code for local testing + endpoints = [ + ep + for ep in endpoints + if model in ep.model_names # that actually serve your model + ] + + logger.debug("==== Discovered endpoints after filtering ====") + logger.debug(endpoints) + logger.debug("==== Discovered endpoints after filtering ====") + + # filter the endpoints url for transcriptions + transcription_endpoints = [ep for ep in endpoints if model in ep.model_names] + + logger.debug("====List of transcription endpoints====") + logger.debug(transcription_endpoints) + logger.debug("====List of transcription endpoints====") + + if not transcription_endpoints: + logger.error("No transcription backend available for model %s", model) + raise HTTPException( + status_code=503, detail=f"No transcription backend for model {model}" + ) + + # grab the current engin and request stats + engine_stats = get_engine_stats_scraper().get_engine_stats() + request_stats = RequestStatsMonitor().get_request_stats(time.time()) + router = get_routing_logic() + + # pick one using the router's configured logic (roundrobin, least-loaded, etc.) 
+ chosen_url = router.route_request( + transcription_endpoints, + engine_stats, + request_stats, + # we don’t need to pass the original FastAPI Request object here, + # but you can if your routing logic looks at headers or body + None, + ) + + logger.info("Proxying transcription request to %s", chosen_url) + + # proxy the request + # by default httpx will only wait for 5 seconds, large audio transcriptions generally + # take longer than that + async with httpx.AsyncClient( + base_url=chosen_url, + timeout=httpx.Timeout( + connect=60.0, # connect timeout + read=300.0, # read timeout + write=30.0, # if you’re streaming uploads + pool=None, # no pool timeout + ), + ) as client: + logger.debug("Sending multipart to %s/v1/audio/transcriptions …", chosen_url) + proxied = await client.post("/v1/audio/transcriptions", data=data, files=files) + logger.info("Received %d from whisper backend", proxied.status_code) + + # return the whisper response unmodified + resp = proxied.json() + logger.debug("==== Whisper response payload ====") + logger.debug(resp) + logger.debug("==== Whisper response payload ====") + + logger.debug("Backend response headers: %s", proxied.headers) + logger.debug("Backend response body (truncated): %r", proxied.content[:200]) + + return JSONResponse( + content=resp, + status_code=proxied.status_code, + headers={ + k: v + for k, v in proxied.headers.items() + if k.lower() not in ("content-encoding", "transfer-encoding", "connection") + }, + ) diff --git a/src/vllm_router/run-router.sh b/src/vllm_router/run-router.sh index 4f908948d..cb2bf0385 100755 --- a/src/vllm_router/run-router.sh +++ b/src/vllm_router/run-router.sh @@ -1,6 +1,6 @@ #!/bin/bash -if [[ $# -ne 1 ]]; then - echo "Usage $0 " +if [[ $# -ne 2 ]]; then + echo "Usage $0 " exit 1 fi @@ -15,17 +15,17 @@ fi # --log-stats # Use this command when testing with static service discovery -python3 -m vllm_router.app --port "$1" \ - --service-discovery static \ - --static-backends "http://localhost:8000" \ - --static-models "facebook/opt-125m" \ - --static-model-types "chat" \ - --log-stats \ - --log-stats-interval 10 \ - --engine-stats-interval 10 \ - --request-stats-window 10 \ - --request-stats-window 10 \ - --routing-logic roundrobin +# python3 -m vllm_router.app --port "$1" \ +# --service-discovery static \ +# --static-backends "http://localhost:8000" \ +# --static-models "facebook/opt-125m" \ +# --static-model-types "chat" \ +# --log-stats \ +# --log-stats-interval 10 \ +# --engine-stats-interval 10 \ +# --request-stats-window 10 \ +# --request-stats-window 10 \ +# --routing-logic roundrobin # Use this command when testing with roundrobin routing logic #python3 router.py --port "$1" \ @@ -35,3 +35,19 @@ python3 -m vllm_router.app --port "$1" \ # --engine-stats-interval 10 \ # --log-stats # + +# Use this command when testing with whisper transcription +ROUTER_PORT=$1 +BACKEND_URL=$2 + +python3 -m vllm_router.app \ + --host 0.0.0.0 \ + --port "${ROUTER_PORT}" \ + --service-discovery static \ + --static-backends "${BACKEND_URL}" \ + --static-models "openai/whisper-small" \ + --static-model-types "transcription" \ + --routing-logic roundrobin \ + --log-stats \ + --engine-stats-interval 10 \ + --request-stats-window 10 diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index 762d34afe..c1ca1cb31 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -51,6 +51,7 @@ class ModelType(enum.Enum): embeddings = "/v1/embeddings" rerank = "/v1/rerank" score = "/v1/score" + transcription = 
"/v1/audio/transcriptions" @staticmethod def get_test_payload(model_type: str): @@ -75,6 +76,11 @@ def get_test_payload(model_type: str): return {"query": "Hello", "documents": ["Test"]} case ModelType.score: return {"encoding_format": "float", "text_1": "Test", "test_2": "Test2"} + case ModelType.transcription: + return { + "file": "", + "model": "openai/whisper-small", + } @staticmethod def get_all_fields(): From 63cbd15a7f1778f624278d957f969d5309eb47ac Mon Sep 17 00:00:00 2001 From: David Gao Date: Tue, 3 Jun 2025 16:45:23 -0400 Subject: [PATCH 02/28] remove the whisper payload response log Signed-off-by: David Gao --- src/vllm_router/routers/main_router.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index e458ae812..a14f01ae8 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -359,10 +359,7 @@ async def audio_transcriptions( # return the whisper response unmodified resp = proxied.json() - logger.debug("==== Whisper response payload ====") - logger.debug(resp) - logger.debug("==== Whisper response payload ====") - + logger.debug("Backend response headers: %s", proxied.headers) logger.debug("Backend response body (truncated): %r", proxied.content[:200]) From 7954fc939dc29cbe48d5b20dd72688d352394300 Mon Sep 17 00:00:00 2001 From: David Gao Date: Tue, 3 Jun 2025 17:22:44 -0400 Subject: [PATCH 03/28] [docs]: add tutorial for transcription v1 api Signed-off-by: David Gao --- tutorials/17-whisper-api-transcription.md | 99 +++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 tutorials/17-whisper-api-transcription.md diff --git a/tutorials/17-whisper-api-transcription.md b/tutorials/17-whisper-api-transcription.md new file mode 100644 index 000000000..e3834bc69 --- /dev/null +++ b/tutorials/17-whisper-api-transcription.md @@ -0,0 +1,99 @@ +# Tutorial: Whisper Transcription API in vLLM Production Stack + +## Overview + +This tutorial introduces the newly added `/v1/audio/transcriptions` endpoint in the `vllm-router`, enabling users to transcribe `.wav` audio files using OpenAI’s `whisper-small` model. + +## Prerequisites + +* Access to a machine with a GPU (e.g. via [RunPod](https://runpod.io/)) +* Python 3.12 environment (recommended with `uv`) +* `vllm` and `production-stack` cloned and installed +* `vllm` installed with audio support: + + ```bash + pip install vllm[audio] + ``` + +## 1. Serving the Whisper Model + +Start a vLLM backend with the `whisper-small` model: + +```bash +vllm serve \ + --task transcription openai/whisper-small \ + --host 0.0.0.0 --port 8002 +``` + +## 2. Running the Router + +Create and run a router connected to the Whisper backend: + +```bash +#!/bin/bash +if [[ $# -ne 2 ]]; then + echo "Usage: $0 " + exit 1 +fi + +uv run python3 -m vllm_router.app \ + --host 0.0.0.0 --port "$1" \ + --service-discovery static \ + --static-backends "$2" \ + --static-models "openai/whisper-small" \ + --static-model-types "transcription" \ + --routing-logic roundrobin \ + --log-stats \ + --engine-stats-interval 10 \ + --request-stats-window 10 +``` + +Example usage: + +```bash +./run-router.sh 8000 http://localhost:8002 +``` + +## 3. Sending a Transcription Request + +Use `curl` to send a `.wav` file to the transcription endpoint: + +* You can test with any `.wav` audio file of your choice. 
+ +```bash +curl -v http://localhost:8000/v1/audio/transcriptions \ + -F 'file=@/path/to/audio.wav;type=audio/wav' \ + -F 'model=openai/whisper-small' \ + -F 'response_format=json' \ + -F 'language=en' +``` + +### Supported Parameters + +| Parameter | Description | +| ----------------- | ------------------------------------------------------ | +| `file` | Path to a `.wav` audio file | +| `model` | Whisper model to use (e.g., `openai/whisper-small`) | +| `prompt` | *(Optional)* Text prompt to guide the transcription | +| `response_format` | One of `json`, `text`, `srt`, `verbose_json`, or `vtt` | +| `temperature` | *(Optional)* Sampling temperature as a float | +| `language` | ISO 639-1 code (e.g., `en`, `fr`, `zh`) | + +## 4. Sample Output + +```json +{ + "text": "Testing testing testing the whisper small model testing testing testing the audio transcription function testing testing testing the whisper small model" +} +``` + +## 5. Notes + +* Router uses extended HTTPX timeouts to support long transcription jobs. +* This implementation dynamically discovers valid transcription backends and routes requests accordingly. + +## 6. Resources + +* [PR #469 – Add Whisper Transcription API](https://github.com/vllm-project/production-stack/pull/469) +* [OpenAI Whisper GitHub](https://github.com/openai/whisper) +* [Blog: vLLM Whisper Transcription Walkthrough](https://davidgao7.github.io/posts/vllm-v1-whisper-transcription/) From 7ef270bb5eca9012bd6783497ad06f0db8b9f6b9 Mon Sep 17 00:00:00 2001 From: David Gao Date: Tue, 3 Jun 2025 17:44:27 -0400 Subject: [PATCH 04/28] [chore] align example router running script with main new script will be mentioned in `tutorials/17-whisper-api-transcription.md` Signed-off-by: David Gao --- src/vllm_router/run-router.sh | 42 +++++++++++------------------------ 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/src/vllm_router/run-router.sh b/src/vllm_router/run-router.sh index cb2bf0385..4f908948d 100755 --- a/src/vllm_router/run-router.sh +++ b/src/vllm_router/run-router.sh @@ -1,6 +1,6 @@ #!/bin/bash -if [[ $# -ne 2 ]]; then - echo "Usage $0 " +if [[ $# -ne 1 ]]; then + echo "Usage $0 " exit 1 fi @@ -15,17 +15,17 @@ fi # --log-stats # Use this command when testing with static service discovery -# python3 -m vllm_router.app --port "$1" \ -# --service-discovery static \ -# --static-backends "http://localhost:8000" \ -# --static-models "facebook/opt-125m" \ -# --static-model-types "chat" \ -# --log-stats \ -# --log-stats-interval 10 \ -# --engine-stats-interval 10 \ -# --request-stats-window 10 \ -# --request-stats-window 10 \ -# --routing-logic roundrobin +python3 -m vllm_router.app --port "$1" \ + --service-discovery static \ + --static-backends "http://localhost:8000" \ + --static-models "facebook/opt-125m" \ + --static-model-types "chat" \ + --log-stats \ + --log-stats-interval 10 \ + --engine-stats-interval 10 \ + --request-stats-window 10 \ + --request-stats-window 10 \ + --routing-logic roundrobin # Use this command when testing with roundrobin routing logic #python3 router.py --port "$1" \ @@ -35,19 +35,3 @@ fi # --engine-stats-interval 10 \ # --log-stats # - -# Use this command when testing with whisper transcription -ROUTER_PORT=$1 -BACKEND_URL=$2 - -python3 -m vllm_router.app \ - --host 0.0.0.0 \ - --port "${ROUTER_PORT}" \ - --service-discovery static \ - --static-backends "${BACKEND_URL}" \ - --static-models "openai/whisper-small" \ - --static-model-types "transcription" \ - --routing-logic roundrobin \ - --log-stats \ - 
--engine-stats-interval 10 \ - --request-stats-window 10 From 292fc628258ac9dda718c04ffaf25b80fb3aef7b Mon Sep 17 00:00:00 2001 From: David Gao Date: Thu, 12 Jun 2025 17:04:03 +0800 Subject: [PATCH 05/28] omit model field since backend already knows which model to run Signed-off-by: David Gao --- src/vllm_router/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index c1ca1cb31..bc43a1351 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -79,7 +79,6 @@ def get_test_payload(model_type: str): case ModelType.transcription: return { "file": "", - "model": "openai/whisper-small", } @staticmethod From 4413d83b935685ff7951ba7345cb5851a84770b6 Mon Sep 17 00:00:00 2001 From: David Gao Date: Wed, 18 Jun 2025 17:36:28 +0800 Subject: [PATCH 06/28] generate a silent audio file if no audio file appears Signed-off-by: David Gao --- src/vllm_router/routers/main_router.py | 22 +++++++--------------- src/vllm_router/utils.py | 16 +++++++++++++++- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index a14f01ae8..ed2a4ccf0 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -210,7 +210,8 @@ async def get_engine_instances(): @main_router.get("/health") async def health() -> Response: - """Endpoint to check the health status of various components. + """ + Endpoint to check the health status of various components. This function verifies the health of the service discovery module and the engine stats scraper. If either component is down, it returns a @@ -247,7 +248,7 @@ async def health() -> Response: @main_router.post("/v1/audio/transcriptions") async def audio_transcriptions( - file: UploadFile = File(...), + file: UploadFile | None = File(None), model: str = Form(...), prompt: str | None = Form(None), response_format: str | None = Form("json"), @@ -276,7 +277,6 @@ async def audio_transcriptions( # logger.debug("=========files=========") data = { - "model": model, "language": language, } @@ -300,20 +300,12 @@ async def audio_transcriptions( logger.debug(endpoints) logger.debug("==== Total endpoints ====") - # TODO: right now is skipping label check in code for local testing - endpoints = [ - ep - for ep in endpoints - if model in ep.model_names # that actually serve your model + # filter the endpoints url by model name for transcriptions + transcription_endpoints = [ + ep for ep in endpoints + if model == ep.model_name and ep.model_label == "transcription" ] - logger.debug("==== Discovered endpoints after filtering ====") - logger.debug(endpoints) - logger.debug("==== Discovered endpoints after filtering ====") - - # filter the endpoints url for transcriptions - transcription_endpoints = [ep for ep in endpoints if model in ep.model_names] - logger.debug("====List of transcription endpoints====") logger.debug(transcription_endpoints) logger.debug("====List of transcription endpoints====") diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index bc43a1351..3be5ccb4b 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -7,6 +7,8 @@ import requests from fastapi.requests import Request from starlette.datastructures import MutableHeaders +import io +import wave from vllm_router.log import init_logger @@ -77,10 +79,22 @@ def get_test_payload(model_type: str): case ModelType.score: return {"encoding_format": "float", "text_1": "Test", "test_2": "Test2"} case 
ModelType.transcription: + # Generate a 0.1 second silent audio file + with io.BytesIO() as wav_buffer: + with wave.open(wav_buffer, "wb") as wf: + wf.setnchannels(1) # mono audio channel, standard configuration + wf.setsampwidth(2) # 16 bit audio, common bit depth for wav file + wf.setframerate(16000) # 16 kHz sample rate + wf.writeframes(b"\x00\x00" * 1600) # 0.1 second of silence + + # retrieves the generated wav bytes, return + wav_bytes = wav_buffer.getvalue() + return { - "file": "", + "file": ("empty.wav", wav_bytes, "audio/wav"), } + @staticmethod def get_all_fields(): return [model_type.name for model_type in ModelType] From a34a431f0c23f111874730020e69c969953ac908 Mon Sep 17 00:00:00 2001 From: David Gao Date: Thu, 19 Jun 2025 02:00:50 +0800 Subject: [PATCH 07/28] put wav creation at the module level to prevent being recreated every time Signed-off-by: David Gao --- src/vllm_router/utils.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index 3be5ccb4b..f2d37fa2f 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -14,6 +14,19 @@ logger = init_logger(__name__) +# prepare a WAV byte to prevent repeatedly generating it +# Generate a 0.1 second silent audio file +_SILENT_WAV_BYTES = None +with io.BytesIO() as wav_buffer: + with wave.open(wav_buffer, "wb") as wf: + wf.setnchannels(1) # mono audio channel, standard configuration + wf.setsampwidth(2) # 16 bit audio, common bit depth for wav file + wf.setframerate(16000) # 16 kHz sample rate + wf.writeframes(b"\x00\x00" * 1600) # 0.1 second of silence + + # retrieves the generated wav bytes, return + _SILENT_WAV_BYTES = wav_buffer.getvalue() + class SingletonMeta(type): _instances = {} @@ -80,19 +93,10 @@ def get_test_payload(model_type: str): return {"encoding_format": "float", "text_1": "Test", "test_2": "Test2"} case ModelType.transcription: # Generate a 0.1 second silent audio file - with io.BytesIO() as wav_buffer: - with wave.open(wav_buffer, "wb") as wf: - wf.setnchannels(1) # mono audio channel, standard configuration - wf.setsampwidth(2) # 16 bit audio, common bit depth for wav file - wf.setframerate(16000) # 16 kHz sample rate - wf.writeframes(b"\x00\x00" * 1600) # 0.1 second of silence - - # retrieves the generated wav bytes, return - wav_bytes = wav_buffer.getvalue() - - return { - "file": ("empty.wav", wav_bytes, "audio/wav"), - } + if _SILENT_WAV_BYTES is not None: + return { + "file": ("empty.wav", _SILENT_WAV_BYTES, "audio/wav"), + } @staticmethod From b1562547cc925061204b1e48dd71bccdc27a1fed Mon Sep 17 00:00:00 2001 From: David Gao Date: Thu, 19 Jun 2025 14:59:24 +0800 Subject: [PATCH 08/28] [Test] test frequency of silent audio creation Signed-off-by: David Gao --- src/vllm_router/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index f2d37fa2f..53bbd0071 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -26,6 +26,7 @@ # retrieves the generated wav bytes, return _SILENT_WAV_BYTES = wav_buffer.getvalue() + logger.debug("======A default silent WAV file has been stored in memory within py application process====") class SingletonMeta(type): @@ -94,6 +95,7 @@ def get_test_payload(model_type: str): case ModelType.transcription: # Generate a 0.1 second silent audio file if _SILENT_WAV_BYTES is not None: + logger.debug("=====Slient WAV Bytes is being used=====") return { "file": ("empty.wav", _SILENT_WAV_BYTES, 
"audio/wav"), } From 518e453b1e974700af57ee0929bfe2da0c9258d9 Mon Sep 17 00:00:00 2001 From: David Gao Date: Sun, 22 Jun 2025 00:03:50 +0800 Subject: [PATCH 09/28] send multipart/form-data for transcription model's health check Signed-off-by: David Gao --- src/vllm_router/utils.py | 46 +++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index 53bbd0071..c9c2abd11 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -4,8 +4,8 @@ import re import resource -import requests from fastapi.requests import Request +import requests from starlette.datastructures import MutableHeaders import io import wave @@ -184,14 +184,40 @@ def update_content_length(request: Request, request_body: str): def is_model_healthy(url: str, model: str, model_type: str) -> bool: model_details = ModelType[model_type] + try: - response = requests.post( - f"{url}{model_details.value}", - headers={"Content-Type": "application/json"}, - json={"model": model} | model_details.get_test_payload(model_type), - timeout=30, - ) - except Exception as e: - logger.error(e) + if model_type == "transcription": + + # for transcription, the backend expects multipart/form-data with a file + # we will use pre-generated silent wav bytes + files = { + "file": ("empty.wav", _SILENT_WAV_BYTES, "audio/wav") + } + data = {"model":model} + response = requests.post( + f"{url}{model_details.value}", + files=files, # multipart/form-data + data=data + ) + else: + # for other model types (chat, completion, etc.) + response = requests.post( + f"{url}{model_details.value}", + headers={"Content-Type": "application/json"}, + json={"model":model} | model_details.get_test_payload(model_type) + ) + + response.raise_for_status() + + if model_type == "transcription": + return True + else: + response.json() # verify it's valid json for other model types + + except requests.exceptions.RequestException as e: + logger.warning(f"{model_type} model {model} at {url} not healthy: {e}") + return False + + except json.JSONDecodeError as e: + logger.error(f"Failed to decode JSON from {model_type} model {model} at {url}: {e}") return False - return response.status_code == 200 From d7cc2a3cb32c0cd409be2b6c8354e68112fbc8ef Mon Sep 17 00:00:00 2001 From: David Gao Date: Mon, 23 Jun 2025 16:55:31 +0800 Subject: [PATCH 10/28] fix pre-commit issue Signed-off-by: David Gao --- src/vllm_router/routers/main_router.py | 5 +++-- src/vllm_router/utils.py | 29 +++++++++++++------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index ed2a4ccf0..1f13fc6f8 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -302,7 +302,8 @@ async def audio_transcriptions( # filter the endpoints url by model name for transcriptions transcription_endpoints = [ - ep for ep in endpoints + ep + for ep in endpoints if model == ep.model_name and ep.model_label == "transcription" ] @@ -351,7 +352,7 @@ async def audio_transcriptions( # return the whisper response unmodified resp = proxied.json() - + logger.debug("Backend response headers: %s", proxied.headers) logger.debug("Backend response body (truncated): %r", proxied.content[:200]) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index c9c2abd11..907c82867 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -1,14 +1,14 @@ import abc import enum +import io 
import json import re import resource +import wave -from fastapi.requests import Request import requests +from fastapi.requests import Request from starlette.datastructures import MutableHeaders -import io -import wave from vllm_router.log import init_logger @@ -26,7 +26,9 @@ # retrieves the generated wav bytes, return _SILENT_WAV_BYTES = wav_buffer.getvalue() - logger.debug("======A default silent WAV file has been stored in memory within py application process====") + logger.debug( + "======A default silent WAV file has been stored in memory within py application process====" + ) class SingletonMeta(type): @@ -95,12 +97,11 @@ def get_test_payload(model_type: str): case ModelType.transcription: # Generate a 0.1 second silent audio file if _SILENT_WAV_BYTES is not None: - logger.debug("=====Slient WAV Bytes is being used=====") + logger.debug("=====Silent WAV Bytes is being used=====") return { "file": ("empty.wav", _SILENT_WAV_BYTES, "audio/wav"), } - @staticmethod def get_all_fields(): return [model_type.name for model_type in ModelType] @@ -184,27 +185,25 @@ def update_content_length(request: Request, request_body: str): def is_model_healthy(url: str, model: str, model_type: str) -> bool: model_details = ModelType[model_type] - + try: if model_type == "transcription": # for transcription, the backend expects multipart/form-data with a file # we will use pre-generated silent wav bytes - files = { - "file": ("empty.wav", _SILENT_WAV_BYTES, "audio/wav") - } - data = {"model":model} + files = {"file": ("empty.wav", _SILENT_WAV_BYTES, "audio/wav")} + data = {"model": model} response = requests.post( f"{url}{model_details.value}", files=files, # multipart/form-data - data=data + data=data, ) else: # for other model types (chat, completion, etc.) response = requests.post( f"{url}{model_details.value}", headers={"Content-Type": "application/json"}, - json={"model":model} | model_details.get_test_payload(model_type) + json={"model": model} | model_details.get_test_payload(model_type), ) response.raise_for_status() @@ -219,5 +218,7 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool: return False except json.JSONDecodeError as e: - logger.error(f"Failed to decode JSON from {model_type} model {model} at {url}: {e}") + logger.error( + f"Failed to decode JSON from {model_type} model {model} at {url}: {e}" + ) return False From cf11af69625c0ab4a0a19c03f5cc747d537ae1ec Mon Sep 17 00:00:00 2001 From: David Gao Date: Sat, 28 Jun 2025 21:03:22 +0800 Subject: [PATCH 11/28] Moves the implementation for the `/v1/audio/transcriptions` endpoint from `main_router.py` into `request.py`, align architectural pattern. Signed-off-by: David Gao --- src/vllm_router/routers/main_router.py | 132 +-------------- .../services/request_service/request.py | 150 ++++++++++++++++-- 2 files changed, 146 insertions(+), 136 deletions(-) diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index 1f13fc6f8..32777fabb 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -12,31 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import json -import time -import httpx from fastapi import ( APIRouter, BackgroundTasks, - File, - Form, - HTTPException, Request, - UploadFile, ) from fastapi.responses import JSONResponse, Response from vllm_router.dynamic_config import get_dynamic_config_watcher from vllm_router.log import init_logger from vllm_router.protocols import ModelCard, ModelList -from vllm_router.routers.routing_logic import get_routing_logic from vllm_router.service_discovery import get_service_discovery from vllm_router.services.request_service.request import ( route_general_request, route_sleep_wakeup_request, + route_general_transcriptions, ) from vllm_router.stats.engine_stats import get_engine_stats_scraper -from vllm_router.stats.request_stats import RequestStatsMonitor from vllm_router.version import __version__ try: @@ -138,7 +131,7 @@ async def show_version(): @main_router.get("/v1/models") async def show_models(): """ - Returns a list of all models available in the stack + Returns a list of all models available in the stack. Args: None @@ -247,121 +240,8 @@ async def health() -> Response: @main_router.post("/v1/audio/transcriptions") -async def audio_transcriptions( - file: UploadFile | None = File(None), - model: str = Form(...), - prompt: str | None = Form(None), - response_format: str | None = Form("json"), - temperature: float | None = Form(None), - language: str = Form("en"), +async def route_v1_audio_transcriptions( + request: Request, background_tasks: BackgroundTasks ): - - logger.debug("==== Enter audio_transcriptions ====") - logger.debug("Received upload: %s (%s)", file.filename, file.content_type) - logger.debug( - "Params: model=%s prompt=%r response_format=%r temperature=%r language=%s", - model, - prompt, - response_format, - temperature, - language, - ) - - # read file bytes - payload_bytes = await file.read() - files = { - "file": (file.filename, payload_bytes, file.content_type), - } - # logger.debug("=========files=========") - # logger.debug(files) - # logger.debug("=========files=========") - - data = { - "language": language, - } - - if prompt: - data["prompt"] = prompt - - if response_format: - data["response_format"] = response_format - - if temperature is not None: - data["temperature"] = str(temperature) - - logger.debug("==== data payload keys ====") - logger.debug(list(data.keys())) - logger.debug("==== data payload keys ====") - - # get the backend url - endpoints = get_service_discovery().get_endpoint_info() - - logger.debug("==== Total endpoints ====") - logger.debug(endpoints) - logger.debug("==== Total endpoints ====") - - # filter the endpoints url by model name for transcriptions - transcription_endpoints = [ - ep - for ep in endpoints - if model == ep.model_name and ep.model_label == "transcription" - ] - - logger.debug("====List of transcription endpoints====") - logger.debug(transcription_endpoints) - logger.debug("====List of transcription endpoints====") - - if not transcription_endpoints: - logger.error("No transcription backend available for model %s", model) - raise HTTPException( - status_code=503, detail=f"No transcription backend for model {model}" - ) - - # grab the current engin and request stats - engine_stats = get_engine_stats_scraper().get_engine_stats() - request_stats = RequestStatsMonitor().get_request_stats(time.time()) - router = get_routing_logic() - - # pick one using the router's configured logic (roundrobin, least-loaded, etc.) 
- chosen_url = router.route_request( - transcription_endpoints, - engine_stats, - request_stats, - # we don’t need to pass the original FastAPI Request object here, - # but you can if your routing logic looks at headers or body - None, - ) - - logger.info("Proxying transcription request to %s", chosen_url) - - # proxy the request - # by default httpx will only wait for 5 seconds, large audio transcriptions generally - # take longer than that - async with httpx.AsyncClient( - base_url=chosen_url, - timeout=httpx.Timeout( - connect=60.0, # connect timeout - read=300.0, # read timeout - write=30.0, # if you’re streaming uploads - pool=None, # no pool timeout - ), - ) as client: - logger.debug("Sending multipart to %s/v1/audio/transcriptions …", chosen_url) - proxied = await client.post("/v1/audio/transcriptions", data=data, files=files) - logger.info("Received %d from whisper backend", proxied.status_code) - - # return the whisper response unmodified - resp = proxied.json() - - logger.debug("Backend response headers: %s", proxied.headers) - logger.debug("Backend response body (truncated): %r", proxied.content[:200]) - - return JSONResponse( - content=resp, - status_code=proxied.status_code, - headers={ - k: v - for k, v in proxied.headers.items() - if k.lower() not in ("content-encoding", "transfer-encoding", "connection") - }, - ) + """Handles audio transcription requests.""" + return await route_general_transcriptions(request, "/v1/audio/transcriptions", background_tasks) diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py index 9842d1821..a3b05c61a 100644 --- a/src/vllm_router/services/request_service/request.py +++ b/src/vllm_router/services/request_service/request.py @@ -17,23 +17,24 @@ import os import time import uuid +from typing import Optional import httpx -from fastapi import BackgroundTasks, Request +from fastapi import BackgroundTasks, Request, UploadFile from fastapi.responses import JSONResponse, StreamingResponse from vllm_router.log import init_logger from vllm_router.routers.routing_logic import ( DisaggregatedPrefillRouter, KvawareRouter, - PrefixAwareRouter, + PrefixAwareRouter ) from vllm_router.service_discovery import get_service_discovery from vllm_router.services.request_service.rewriter import ( get_request_rewriter, is_request_rewriter_initialized, ) -from vllm_router.utils import replace_model_in_request_body, update_content_length +from vllm_router.utils import replace_model_in_request_body,update_content_length try: # Semantic cache integration @@ -314,9 +315,7 @@ async def route_general_request( async def send_request_to_prefiller( client: httpx.AsyncClient, endpoint: str, req_data: dict, request_id: str ): - """ - Send a request to a prefiller service. - """ + """Send a request to a prefiller service.""" req_data = req_data.copy() req_data["max_tokens"] = 1 if "max_completion_tokens" in req_data: @@ -335,9 +334,7 @@ async def send_request_to_prefiller( async def send_request_to_decode( client: httpx.AsyncClient, endpoint: str, req_data: dict, request_id: str ): - """ - Asynchronously stream the response from a service using a persistent client. 
- """ + """Asynchronously stream the response from a service using a persistent client.""" headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", "X-Request-Id": request_id, @@ -428,7 +425,7 @@ async def route_sleep_wakeup_request( } if VLLM_API_KEY := os.getenv("VLLM_API_KEY"): - logger.info(f"Using vllm server authentication") + logger.info("Using vllm server authentication") headers["Authorization"] = f"Bearer {VLLM_API_KEY}" url = server_url + endpoint @@ -458,3 +455,136 @@ async def route_sleep_wakeup_request( content={"status": "success"}, headers={"X-Request-Id": request_id}, ) + +async def route_general_transcriptions( + request: Request, + endpoint: str, # "/v1/audio/transcriptions" + background_tasks: BackgroundTasks, +): + """Handles audio transcription requests by parsing form data and proxying to backend.""" + + request_id = request.headers.get("X-Request-Id", str(uuid.uuid4())) + in_router_time = time.time() + + # --- 1. Form parsing --- + try: + form = await request.form() + + # Extract parameters from the form data + file: UploadFile = form["file"] + model: str = form["model"] + prompt: Optional[str] = form.get("prompt", None) + response_format: Optional[str] = form.get("response_format", "json") + temperature_str: Optional[str] = form.get("temperature", None) + temperature: Optional[float] = float(temperature_str) if temperature_str is not None else None + language: Optional[str] = form.get("language", "en") + except KeyError as e: + return JSONResponse(status_code=400, content={"error": f"Invalid request: missing '{e.args[0]}' in form data."}) + + logger.debug("==== Enter audio_transcriptions ====") + logger.debug("Received upload: %s (%s)", file.filename, file.content_type) + logger.debug( + "Params: model=%s prompt=%r response_format=%r temperature=%r language=%s", + model, + prompt, + response_format, + temperature, + language, + ) + + # --- 2. Service Discovery and Routing --- + # Access singletons via request.app.state for consistent style + service_discovery = get_service_discovery() # This one is often still accessed directly via its get function + router = request.app.state.router # Access router from app.state + engine_stats_scraper = request.app.state.engine_stats_scraper # Access engine_stats_scraper from app.state + request_stats_monitor = request.app.state.request_stats_monitor # Access request_stats_monitor from app.state + + + endpoints = service_discovery.get_endpoint_info() + + logger.debug("==== Total endpoints ====") + logger.debug(endpoints) + logger.debug("==== Total endpoints ====") + + # filter the endpoints url by model name and label for transcriptions + transcription_endpoints = [ + ep + for ep in endpoints + if model == ep.model_name and ep.model_label == "transcription" and not ep.sleep # Added ep.sleep == False + ] + + logger.debug("====List of transcription endpoints====") + logger.debug(transcription_endpoints) + logger.debug("====List of transcription endpoints====") + + if not transcription_endpoints: + logger.error("No transcription backend available for model %s", model) + return JSONResponse( + status_code=404, content={"error": f"No transcription backend for model {model}"} + ) + + # grab the current engine and request stats + engine_stats = engine_stats_scraper.get_engine_stats() + request_stats = request_stats_monitor.get_request_stats(time.time()) + + # pick one using the router's configured logic (roundrobin, least-loaded, etc.) 
+ chosen_url = router.route_request( + transcription_endpoints, + engine_stats, + request_stats, + request, + ) + + logger.info("Proxying transcription request to %s", chosen_url) + + # --- 3. Prepare and Proxy the Request --- + payload_bytes = await file.read() + files = {"file": (file.filename, payload_bytes, file.content_type)} + + data = {"model": model,"language": language} + + if prompt: + data["prompt"] = prompt + + if response_format: + data["response_format"] = response_format + + if temperature is not None: + data["temperature"] = str(temperature) + + logger.info("Proxying transcription request for model %s to %s", model, chosen_url) + + logger.debug("==== data payload keys ====") + logger.debug(list(data.keys())) + logger.debug("==== data payload keys ====") + + try: + async with request.app.state.httpx_client_wrapper() as client: + backend_response = await client.post( + f"{chosen_url}{endpoint}", + data=data, + files=files, + timeout=300.0 + ) + backend_response.raise_for_status() + + # --- 4. Return the response --- + response_content = backend_response.json() + headers = { + k: v + for k, v in backend_response.headers.items() + if k.lower() not in ("content-encoding", "transfer-encoding", "connection") + } + + headers["X-Request-Id"] = request_id + + return JSONResponse( + content=response_content, + status_code=backend_response.status_code, + headers=headers, + ) + except httpx.HTTPStatusError as e: + error_content = e.response.json() if "json" in e.response.headers.get("content-type", "") else e.response.text + return JSONResponse(status_code=e.response.status_code, content=error_content) + except httpx.RequestError as e: + return JSONResponse(status_code=503, content={"error": f"Failed to connect to backend: {e}"}) From 769e4f4166ca0b3862c217df7732e9259133da6f Mon Sep 17 00:00:00 2001 From: David Gao Date: Thu, 24 Jul 2025 22:26:51 +0800 Subject: [PATCH 12/28] add timeout to ensure health check will not hang indefinitely if a backend model becomes unresponsive Signed-off-by: David Gao --- src/vllm_router/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index 907c82867..a882681a5 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -197,6 +197,7 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool: f"{url}{model_details.value}", files=files, # multipart/form-data data=data, + timeout=10 ) else: # for other model types (chat, completion, etc.) 
@@ -204,6 +205,7 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool: f"{url}{model_details.value}", headers={"Content-Type": "application/json"}, json={"model": model} | model_details.get_test_payload(model_type), + timeout=10 ) response.raise_for_status() From dc0f2d2e82c98f361675767d077c968efe8c67d6 Mon Sep 17 00:00:00 2001 From: David Gao Date: Thu, 24 Jul 2025 23:04:27 +0800 Subject: [PATCH 13/28] add boolean model health check return for non-transcription model Signed-off-by: David Gao --- src/vllm_router/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index a882681a5..2299c9872 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -214,6 +214,7 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool: return True else: response.json() # verify it's valid json for other model types + return True # validation passed except requests.exceptions.RequestException as e: logger.warning(f"{model_type} model {model} at {url} not healthy: {e}") From a57de3d71d3671eb3acf31d817791f892d3ffe9b Mon Sep 17 00:00:00 2001 From: David Gao Date: Thu, 24 Jul 2025 23:37:05 +0800 Subject: [PATCH 14/28] remove redundant warning log since handled in outer 'StaticServiceDiscovery.get_unhealthy_endpoint_hashes' Signed-off-by: David Gao --- src/vllm_router/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index 2299c9872..8b596d3ea 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -217,7 +217,6 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool: return True # validation passed except requests.exceptions.RequestException as e: - logger.warning(f"{model_type} model {model} at {url} not healthy: {e}") return False except json.JSONDecodeError as e: From adcc64f28b334b2e15742891028e5f9d685575e8 Mon Sep 17 00:00:00 2001 From: David Gao Date: Fri, 25 Jul 2025 00:24:13 +0800 Subject: [PATCH 15/28] remove redundant JSONDecodeError catch and downgrade RequestException log to debug, align with service discovery's warning Signed-off-by: David Gao --- src/vllm_router/utils.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py index 8b596d3ea..bdd3af02e 100644 --- a/src/vllm_router/utils.py +++ b/src/vllm_router/utils.py @@ -217,10 +217,5 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool: return True # validation passed except requests.exceptions.RequestException as e: - return False - - except json.JSONDecodeError as e: - logger.error( - f"Failed to decode JSON from {model_type} model {model} at {url}: {e}" - ) + logger.debug(f"{model_type} Model {model} at {url} is not healthy: {e}") return False From f1522fc28f92bdeaecded0bbffc6720ebb7b474a Mon Sep 17 00:00:00 2001 From: David Gao Date: Thu, 31 Jul 2025 01:11:09 +0800 Subject: [PATCH 16/28] Chore: Apply auto-formatting and linting fixes via pre-commit Signed-off-by: David Gao --- src/vllm_router/routers/main_router.py | 6 +- .../services/request_service/request.py | 66 ++++++++++++------- src/vllm_router/utils.py | 4 +- 3 files changed, 47 insertions(+), 29 deletions(-) diff --git a/src/vllm_router/routers/main_router.py b/src/vllm_router/routers/main_router.py index 999868e03..5d77124dd 100644 --- a/src/vllm_router/routers/main_router.py +++ b/src/vllm_router/routers/main_router.py @@ -26,8 +26,8 @@ from vllm_router.service_discovery import get_service_discovery from 
vllm_router.services.request_service.request import ( route_general_request, - route_sleep_wakeup_request, route_general_transcriptions, + route_sleep_wakeup_request, ) from vllm_router.stats.engine_stats import get_engine_stats_scraper from vllm_router.version import __version__ @@ -241,4 +241,6 @@ async def route_v1_audio_transcriptions( request: Request, background_tasks: BackgroundTasks ): """Handles audio transcription requests.""" - return await route_general_transcriptions(request, "/v1/audio/transcriptions", background_tasks) + return await route_general_transcriptions( + request, "/v1/audio/transcriptions", background_tasks + ) diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py index e5565b4a3..9ae8c9e26 100644 --- a/src/vllm_router/services/request_service/request.py +++ b/src/vllm_router/services/request_service/request.py @@ -20,8 +20,7 @@ from typing import Optional import httpx - -from fastapi import BackgroundTasks, Request, UploadFile, HTTPException +from fastapi import BackgroundTasks, HTTPException, Request, UploadFile from fastapi.responses import JSONResponse, StreamingResponse from requests import JSONDecodeError @@ -29,14 +28,14 @@ from vllm_router.routers.routing_logic import ( DisaggregatedPrefillRouter, KvawareRouter, - PrefixAwareRouter + PrefixAwareRouter, ) from vllm_router.service_discovery import get_service_discovery from vllm_router.services.request_service.rewriter import ( get_request_rewriter, is_request_rewriter_initialized, ) -from vllm_router.utils import replace_model_in_request_body,update_content_length +from vllm_router.utils import replace_model_in_request_body, update_content_length try: # Semantic cache integration @@ -511,8 +510,9 @@ async def route_sleep_wakeup_request( headers={"X-Request-Id": request_id}, ) + async def route_general_transcriptions( - request: Request, + request: Request, endpoint: str, # "/v1/audio/transcriptions" background_tasks: BackgroundTasks, ): @@ -531,10 +531,15 @@ async def route_general_transcriptions( prompt: Optional[str] = form.get("prompt", None) response_format: Optional[str] = form.get("response_format", "json") temperature_str: Optional[str] = form.get("temperature", None) - temperature: Optional[float] = float(temperature_str) if temperature_str is not None else None + temperature: Optional[float] = ( + float(temperature_str) if temperature_str is not None else None + ) language: Optional[str] = form.get("language", "en") except KeyError as e: - return JSONResponse(status_code=400, content={"error": f"Invalid request: missing '{e.args[0]}' in form data."}) + return JSONResponse( + status_code=400, + content={"error": f"Invalid request: missing '{e.args[0]}' in form data."}, + ) logger.debug("==== Enter audio_transcriptions ====") logger.debug("Received upload: %s (%s)", file.filename, file.content_type) @@ -549,11 +554,16 @@ async def route_general_transcriptions( # --- 2. 
Service Discovery and Routing --- # Access singletons via request.app.state for consistent style - service_discovery = get_service_discovery() # This one is often still accessed directly via its get function - router = request.app.state.router # Access router from app.state - engine_stats_scraper = request.app.state.engine_stats_scraper # Access engine_stats_scraper from app.state - request_stats_monitor = request.app.state.request_stats_monitor # Access request_stats_monitor from app.state - + service_discovery = ( + get_service_discovery() + ) # This one is often still accessed directly via its get function + router = request.app.state.router # Access router from app.state + engine_stats_scraper = ( + request.app.state.engine_stats_scraper + ) # Access engine_stats_scraper from app.state + request_stats_monitor = ( + request.app.state.request_stats_monitor + ) # Access request_stats_monitor from app.state endpoints = service_discovery.get_endpoint_info() @@ -565,7 +575,9 @@ async def route_general_transcriptions( transcription_endpoints = [ ep for ep in endpoints - if model == ep.model_name and ep.model_label == "transcription" and not ep.sleep # Added ep.sleep == False + if model == ep.model_name + and ep.model_label == "transcription" + and not ep.sleep # Added ep.sleep == False ] logger.debug("====List of transcription endpoints====") @@ -575,7 +587,8 @@ async def route_general_transcriptions( if not transcription_endpoints: logger.error("No transcription backend available for model %s", model) return JSONResponse( - status_code=404, content={"error": f"No transcription backend for model {model}"} + status_code=404, + content={"error": f"No transcription backend for model {model}"}, ) # grab the current engine and request stats @@ -596,7 +609,7 @@ async def route_general_transcriptions( payload_bytes = await file.read() files = {"file": (file.filename, payload_bytes, file.content_type)} - data = {"model": model,"language": language} + data = {"model": model, "language": language} if prompt: data["prompt"] = prompt @@ -616,19 +629,16 @@ async def route_general_transcriptions( try: async with request.app.state.httpx_client_wrapper() as client: backend_response = await client.post( - f"{chosen_url}{endpoint}", - data=data, - files=files, - timeout=300.0 - ) + f"{chosen_url}{endpoint}", data=data, files=files, timeout=300.0 + ) backend_response.raise_for_status() # --- 4. 
Return the response ---
         response_content = backend_response.json()
         headers = {
-            k: v
-            for k, v in backend_response.headers.items()
-            if k.lower() not in ("content-encoding", "transfer-encoding", "connection")
+            k: v
+            for k, v in backend_response.headers.items()
+            if k.lower() not in ("content-encoding", "transfer-encoding", "connection")
         }
         headers["X-Request-Id"] = request_id
@@ -639,7 +649,13 @@ async def route_general_transcriptions(
             headers=headers,
         )
     except httpx.HTTPStatusError as e:
-        error_content = e.response.json() if "json" in e.response.headers.get("content-type", "") else e.response.text
+        error_content = (
+            e.response.json()
+            if "json" in e.response.headers.get("content-type", "")
+            else e.response.text
+        )
         return JSONResponse(status_code=e.response.status_code, content=error_content)
     except httpx.RequestError as e:
-        return JSONResponse(status_code=503, content={"error": f"Failed to connect to backend: {e}"})
+        return JSONResponse(
+            status_code=503, content={"error": f"Failed to connect to backend: {e}"}
+        )
diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py
index b831c5ca3..7f580c0c0 100644
--- a/src/vllm_router/utils.py
+++ b/src/vllm_router/utils.py
@@ -200,7 +200,7 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool:
                 f"{url}{model_details.value}",
                 files=files,  # multipart/form-data
                 data=data,
-                timeout=10
+                timeout=10,
             )
         else:  # for other model types (chat, completion, etc.)
@@ -208,7 +208,7 @@ def is_model_healthy(url: str, model: str, model_type: str) -> bool:
                 f"{url}{model_details.value}",
                 headers={"Content-Type": "application/json"},
                 json={"model": model} | model_details.get_test_payload(model_type),
-                timeout=10
+                timeout=10,
             )
         response.raise_for_status()

From 73d58179d176afe9b560e40f34a0b0301d4ca4dd Mon Sep 17 00:00:00 2001
From: David Gao
Date: Thu, 31 Jul 2025 01:16:07 +0800
Subject: [PATCH 17/28] refactor: use more meaningful comments for silent WAV bytes generation

Signed-off-by: David Gao
---
 src/vllm_router/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py
index 7f580c0c0..d071cd952 100644
--- a/src/vllm_router/utils.py
+++ b/src/vllm_router/utils.py
@@ -15,8 +15,8 @@
 logger = init_logger(__name__)

-# prepare a WAV byte to prevent repeatedly generating it
 # Generate a 0.1 second silent audio file
+# This will be used for the /v1/audio/transcriptions endpoint
 _SILENT_WAV_BYTES = None
 with io.BytesIO() as wav_buffer:
     with wave.open(wav_buffer, "wb") as wf:
@@ -96,7 +96,6 @@ def get_test_payload(model_type: str):
         case ModelType.score:
             return {"encoding_format": "float", "text_1": "Test", "test_2": "Test2"}
         case ModelType.transcription:
-            # Generate a 0.1 second silent audio file
             if _SILENT_WAV_BYTES is not None:
                 logger.debug("=====Silent WAV Bytes is being used=====")
                 return {

From ba769ee53a6975f301b4968bc43d4739f2e18f50 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Thu, 31 Jul 2025 02:05:04 +0800
Subject: [PATCH 18/28] refactor: keep the comment explaining why the silent WAV bytes are generated

Signed-off-by: David Gao
---
 src/vllm_router/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/vllm_router/utils.py b/src/vllm_router/utils.py
index d071cd952..6b121cb6e 100644
--- a/src/vllm_router/utils.py
+++ b/src/vllm_router/utils.py
@@ -15,6 +15,7 @@
 logger = init_logger(__name__)

+# prepare a WAV byte to prevent repeatedly generating it
 # Generate a 0.1 second silent audio file
 # This will be used for the /v1/audio/transcriptions endpoint
 _SILENT_WAV_BYTES = None

From 996c653e9131354b38a444fb735fe1b47e61d3e7 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Tue, 12 Aug 2025 00:31:34 +0800
Subject: [PATCH 19/28] fix(tests): improve mock in model health check test

The mock for `requests.post` in `test_is_model_healthy` did not correctly
simulate an `HTTPError` on a non-200 response. This change configures the
mock's `raise_for_status` method to raise the appropriate exception, ensuring
the test now accurately validates the function's error handling logic.

Signed-off-by: David Gao
---
 src/tests/test_utils.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/tests/test_utils.py b/src/tests/test_utils.py
index dc352b930..5737beda6 100644
--- a/src/tests/test_utils.py
+++ b/src/tests/test_utils.py
@@ -104,6 +104,14 @@ def test_is_model_healthy_when_requests_raises_exception_returns_false(
 def test_is_model_healthy_when_requests_status_with_status_code_not_200_returns_false(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    request_mock = MagicMock(return_value=MagicMock(status_code=500))
+
+    # Mock an internal server error response
+    mock_response = MagicMock(status_code=500)
+
+    # Tell the mock to raise an HTTP Error when raise_for_status() is called
+    mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError
+
+    request_mock = MagicMock(return_value=mock_response)
     monkeypatch.setattr("requests.post", request_mock)
+
     assert utils.is_model_healthy("http://localhost", "test", "chat") is False

From f7ef2eb067835a443e944d96a898b7c914eb0ac6 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Tue, 12 Aug 2025 16:15:35 +0800
Subject: [PATCH 20/28] chore: apply auto-formatting and linting fixes via pre-commit

Signed-off-by: David Gao
---
 src/tests/test_utils.py                             | 4 ++--
 src/vllm_router/services/request_service/request.py | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/tests/test_utils.py b/src/tests/test_utils.py
index 5737beda6..fb5e0afa2 100644
--- a/src/tests/test_utils.py
+++ b/src/tests/test_utils.py
@@ -104,10 +104,10 @@ def test_is_model_healthy_when_requests_status_with_status_code_not_200_returns_false(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-
+
     # Mock an internal server error response
     mock_response = MagicMock(status_code=500)
-
+
     # Tell the mock to raise an HTTP Error when raise_for_status() is called
     mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError

diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py
index 58caaecb7..b7f99f637 100644
--- a/src/vllm_router/services/request_service/request.py
+++ b/src/vllm_router/services/request_service/request.py
@@ -19,10 +19,9 @@
 import uuid
 from typing import Optional

-
+import aiohttp
 import httpx
 from fastapi import BackgroundTasks, HTTPException, Request, UploadFile
-import aiohttp
 from fastapi.responses import JSONResponse, StreamingResponse
 from requests import JSONDecodeError

From 3c66f96a64560108fb282e9154405d295b901d42 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Wed, 13 Aug 2025 15:24:33 +0800
Subject: [PATCH 21/28] chore: remove unused var `in_router_time`

Signed-off-by: David Gao
---
 src/vllm_router/services/request_service/request.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py
index b7f99f637..13abdd81c 100644
--- a/src/vllm_router/services/request_service/request.py
+++ b/src/vllm_router/services/request_service/request.py
@@ -520,7 +520,6 @@ async def route_general_transcriptions(
     """Handles audio transcription requests by parsing form data and proxying to backend."""
     request_id = request.headers.get("X-Request-Id", str(uuid.uuid4()))
-    in_router_time = time.time()

     # --- 1. Form parsing ---
     try:

From 6ac4661427f46cb6990e6bf7a391a35820edd077 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Fri, 15 Aug 2025 01:23:52 +0800
Subject: [PATCH 22/28] fix(deps): add httpx as an explicit dependency

The CI/CD workflow was failing with a `ModuleNotFoundError` because `httpx`
was not an explicit dependency in `pyproject.toml` and was not being installed
in the clean Docker environment.

Signed-off-by: David Gao
---
 pyproject.toml                   | 1 +
 src/vllm_router/requirements.txt | 1 +
 2 files changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index dfc923313..b44455930 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
     "xxhash==3.5.0",
     "psutil==7.0.0",
     "pyyaml>=6.0.2",
+    "httpx==0.28.1",
 ]

 [project.scripts]
diff --git a/src/vllm_router/requirements.txt b/src/vllm_router/requirements.txt
index 4861ba4be..a0f40ffcd 100644
--- a/src/vllm_router/requirements.txt
+++ b/src/vllm_router/requirements.txt
@@ -10,3 +10,4 @@ sentry-sdk[fastapi]==2.27.0
 uhashring==2.3
 uvicorn==0.34.0
 xxhash==3.5.0
+httpx==0.28.1

From 37e33caf9c94a609af61a32bbe663f73beab3e98 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Fri, 15 Aug 2025 01:56:08 +0800
Subject: [PATCH 23/28] chore: reorder dependencies after running pre-commit

Signed-off-by: David Gao
---
 src/vllm_router/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/vllm_router/requirements.txt b/src/vllm_router/requirements.txt
index a0f40ffcd..02a88aca9 100644
--- a/src/vllm_router/requirements.txt
+++ b/src/vllm_router/requirements.txt
@@ -1,6 +1,7 @@
 aiofiles==24.1.0
 aiohttp==3.9.5
 fastapi==0.115.8
+httpx==0.28.1
 kubernetes==32.0.0
 numpy==1.26.4
 prometheus_client==0.21.1
@@ -10,4 +11,3 @@ sentry-sdk[fastapi]==2.27.0
 uhashring==2.3
 uvicorn==0.34.0
 xxhash==3.5.0
-httpx==0.28.1

From 114895bef18e05f5cc9c643f4369a98126123b5c Mon Sep 17 00:00:00 2001
From: David Gao
Date: Fri, 22 Aug 2025 16:11:29 +0800
Subject: [PATCH 24/28] refactor: migrate from httpx to aiohttp for improved concurrency

Replaced httpx with aiohttp for better asynchronous performance and resource
utilization. Fixed a JSON syntax error in error response handling.
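
A rough sketch of the new request path (simplified from the diff below; `client`
stands for the shared session returned by `request.app.state.aiohttp_client_wrapper()`):

```python
form_data = aiohttp.FormData()
form_data.add_field("file", payload_bytes, filename=file.filename, content_type=file.content_type)
form_data.add_field("model", model)

backend_response = await client.post(
    f"{chosen_url}{endpoint}",
    data=form_data,
    timeout=aiohttp.ClientTimeout(total=300),  # generous timeout for long transcriptions
)
response_content = await backend_response.json()  # aiohttp's json() is a coroutine
```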
Signed-off-by: David Gao
---
 pyproject.toml                                      |  1 -
 src/vllm_router/requirements.txt                    |  1 -
 .../services/request_service/request.py             | 47 +++++----
 tutorials/23-whisper-api-transcription.md           | 99 +++++++++++++++++++
 4 files changed, 129 insertions(+), 19 deletions(-)
 create mode 100644 tutorials/23-whisper-api-transcription.md

diff --git a/pyproject.toml b/pyproject.toml
index b44455930..dfc923313 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,6 @@ dependencies = [
     "xxhash==3.5.0",
     "psutil==7.0.0",
     "pyyaml>=6.0.2",
-    "httpx==0.28.1",
 ]

 [project.scripts]
diff --git a/src/vllm_router/requirements.txt b/src/vllm_router/requirements.txt
index 02a88aca9..4861ba4be 100644
--- a/src/vllm_router/requirements.txt
+++ b/src/vllm_router/requirements.txt
@@ -1,7 +1,6 @@
 aiofiles==24.1.0
 aiohttp==3.9.5
 fastapi==0.115.8
-httpx==0.28.1
 kubernetes==32.0.0
 numpy==1.26.4
 prometheus_client==0.21.1
diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py
index 1debc28f7..9b44fe7a7 100644
--- a/src/vllm_router/services/request_service/request.py
+++ b/src/vllm_router/services/request_service/request.py
@@ -20,7 +20,6 @@
 from typing import Optional

 import aiohttp
-import httpx
 from fastapi import BackgroundTasks, HTTPException, Request, UploadFile
 from fastapi.responses import JSONResponse, StreamingResponse
 from requests import JSONDecodeError
@@ -627,14 +626,26 @@ async def route_general_transcriptions(
     logger.debug("==== data payload keys ====")

     try:
-        async with request.app.state.httpx_client_wrapper() as client:
-            backend_response = await client.post(
-                f"{chosen_url}{endpoint}", data=data, files=files, timeout=300.0
-            )
-            backend_response.raise_for_status()
+        client = request.app.state.aiohttp_client_wrapper()
+
+        form_data = aiohttp.FormData()
+
+        # add file data
+        for key, (filename, content, content_type) in files.items():
+            form_data.add_field(key, content, filename=filename, content_type=content_type)
+
+        # add form data
+        for key, value in data.items():
+            form_data.add_field(key,value)
+
+        backend_response = await client.post(
+            f"{chosen_url}{endpoint}",
+            data=form_data,
+            timeout=aiohttp.ClientTimeout(total=300)
+        )

         # --- 4. Return the response ---
-        response_content = backend_response.json()
+        response_content = await backend_response.json()
         headers = {
             k: v
             for k, v in backend_response.headers.items()
@@ -645,17 +656,19 @@ async def route_general_transcriptions(
         return JSONResponse(
             content=response_content,
-            status_code=backend_response.status_code,
+            status_code=backend_response.status,
             headers=headers,
         )
-    except httpx.HTTPStatusError as e:
-        error_content = (
-            e.response.json()
-            if "json" in e.response.headers.get("content-type", "")
-            else e.response.text
-        )
-        return JSONResponse(status_code=e.response.status_code, content=error_content)
-    except httpx.RequestError as e:
+    except aiohttp.ClientResponseError as e:
+        if hasattr(e, "response") and e.response is not None:
+            try:
+                error_content = await e.response.json()
+            except:
+                error_content = await e.response.text()
+        else:
+            error_content = {"error": f"HTTP {e.status}: {e.message}"}
+        return JSONResponse(status_code=e.status, content=error_content)
+    except aiohttp.ClientError as e:
         return JSONResponse(
-            status_code=503, content={"error": f"Failed to connect to backend: {e}"}
+            status_code=503, content={"error": f"Failed to connect to backend: {str(e)}"}
         )
diff --git a/tutorials/23-whisper-api-transcription.md b/tutorials/23-whisper-api-transcription.md
new file mode 100644
index 000000000..a09281916
--- /dev/null
+++ b/tutorials/23-whisper-api-transcription.md
@@ -0,0 +1,99 @@
+# Tutorial: Whisper Transcription API in vLLM Production Stack
+
+## Overview
+
+This tutorial introduces the newly added `/v1/audio/transcriptions` endpoint in the `vllm-router`, enabling users to transcribe `.wav` audio files using OpenAI's `whisper-small` model.
+
+## Prerequisites
+
+* Access to a machine with a GPU (e.g., via [RunPod](https://runpod.io/))
+* Python 3.12 environment (recommended with `uv`)
+* `vllm` and `production-stack` cloned and installed
+* `vllm` installed with audio support:
+
+  ```bash
+  pip install vllm[audio]
+  ```
+
+## 1. Serving the Whisper Model
+
+Start a vLLM backend with the `whisper-small` model:
+
+```bash
+vllm serve \
+  --task transcription openai/whisper-small \
+  --host 0.0.0.0 --port 8002
+```
+
+## 2. Running the Router
+
+Create and run a router connected to the Whisper backend:
+
+```bash
+#!/bin/bash
+if [[ $# -ne 2 ]]; then
+  echo "Usage: $0 <port> <backend_url>"
+  exit 1
+fi
+
+uv run python3 -m vllm_router.app \
+  --host 0.0.0.0 --port "$1" \
+  --service-discovery static \
+  --static-backends "$2" \
+  --static-models "openai/whisper-small" \
+  --static-model-types "transcription" \
+  --routing-logic roundrobin \
+  --log-stats \
+  --engine-stats-interval 10 \
+  --request-stats-window 10
+```
+
+Example usage:
+
+```bash
+./run-router.sh 8000 http://localhost:8002
+```
+
+## 3. Sending a Transcription Request
+
+Use `curl` to send a `.wav` file to the transcription endpoint:
+
+* You can test with any `.wav` audio file of your choice.
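+
+If you do not have a recording handy, the short sketch below uses only Python's
+standard `wave` module to create a silent test clip (the router's own health check
+sends a similar silent WAV to transcription backends):
+
+```python
+import wave
+
+# Write 0.5 seconds of silence as a 16 kHz, mono, 16-bit PCM WAV file
+with wave.open("audio.wav", "wb") as wf:
+    wf.setnchannels(1)        # mono
+    wf.setsampwidth(2)        # 16-bit samples
+    wf.setframerate(16000)    # 16 kHz sample rate
+    wf.writeframes(b"\x00\x00" * 8000)  # 8000 frames of silence = 0.5 s
+```
+
+Then send the file to the router: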
+
+```bash
+curl -v http://localhost:8000/v1/audio/transcriptions \
+  -F 'file=@/path/to/audio.wav;type=audio/wav' \
+  -F 'model=openai/whisper-small' \
+  -F 'response_format=json' \
+  -F 'language=en'
+```
+
+### Supported Parameters
+
+| Parameter         | Description                                             |
+| ----------------- | ------------------------------------------------------- |
+| `file`            | Path to a `.wav` audio file                             |
+| `model`           | Whisper model to use (e.g., `openai/whisper-small`)     |
+| `prompt`          | *(Optional)* Text prompt to guide the transcription     |
+| `response_format` | One of `json`, `text`, `srt`, `verbose_json`, or `vtt`  |
+| `temperature`     | *(Optional)* Sampling temperature as a float            |
+| `language`        | ISO 639-1 code (e.g., `en`, `fr`, `zh`)                 |
+
+## 4. Sample Output
+
+```json
+{
+  "text": "Testing testing testing the whisper small model testing testing testing the audio transcription function testing testing testing the whisper small model"
+}
+```
+
+## 5. Notes
+
+* Router uses extended aiohttp timeouts to support long transcription jobs.
+* This implementation dynamically discovers valid transcription backends and routes requests accordingly.
+
+## 6. Resources
+
+* [PR #469 – Add Whisper Transcription API](https://github.com/vllm-project/production-stack/pull/469)
+* [OpenAI Whisper GitHub](https://github.com/openai/whisper)
+* [Blog: vLLM Whisper Transcription Walkthrough](https://davidgao7.github.io/posts/vllm-v1-whisper-transcription/)

From 12fcb2a277c188dd9576ece9a588b44162a4344c Mon Sep 17 00:00:00 2001
From: David Gao
Date: Fri, 22 Aug 2025 16:33:21 +0800
Subject: [PATCH 25/28] chore: remove wrong tutorial file

Signed-off-by: David Gao
---
 tutorials/17-whisper-api-transcription.md | 99 -----------------------
 1 file changed, 99 deletions(-)
 delete mode 100644 tutorials/17-whisper-api-transcription.md

diff --git a/tutorials/17-whisper-api-transcription.md b/tutorials/17-whisper-api-transcription.md
deleted file mode 100644
index e3834bc69..000000000
--- a/tutorials/17-whisper-api-transcription.md
+++ /dev/null
@@ -1,99 +0,0 @@
-# Tutorial: Whisper Transcription API in vLLM Production Stack
-
-## Overview
-
-This tutorial introduces the newly added `/v1/audio/transcriptions` endpoint in the `vllm-router`, enabling users to transcribe `.wav` audio files using OpenAI's `whisper-small` model.
-
-## Prerequisites
-
-* Access to a machine with a GPU (e.g. via [RunPod](https://runpod.io/))
-* Python 3.12 environment (recommended with `uv`)
-* `vllm` and `production-stack` cloned and installed
-* `vllm` installed with audio support:
-
-  ```bash
-  pip install vllm[audio]
-  ```
-
-## 1. Serving the Whisper Model
-
-Start a vLLM backend with the `whisper-small` model:
-
-```bash
-vllm serve \
-  --task transcription openai/whisper-small \
-  --host 0.0.0.0 --port 8002
-```
-
-## 2. Running the Router
-
-Create and run a router connected to the Whisper backend:
-
-```bash
-#!/bin/bash
-if [[ $# -ne 2 ]]; then
-  echo "Usage: $0 "
-  exit 1
-fi
-
-uv run python3 -m vllm_router.app \
-  --host 0.0.0.0 --port "$1" \
-  --service-discovery static \
-  --static-backends "$2" \
-  --static-models "openai/whisper-small" \
-  --static-model-types "transcription" \
-  --routing-logic roundrobin \
-  --log-stats \
-  --engine-stats-interval 10 \
-  --request-stats-window 10
-```
-
-Example usage:
-
-```bash
-./run-router.sh 8000 http://localhost:8002
-```
-
-## 3. Sending a Transcription Request
-
-Use `curl` to send a `.wav` file to the transcription endpoint:
-
-* You can test with any `.wav` audio file of your choice.
-
-```bash
-curl -v http://localhost:8000/v1/audio/transcriptions \
-  -F 'file=@/path/to/audio.wav;type=audio/wav' \
-  -F 'model=openai/whisper-small' \
-  -F 'response_format=json' \
-  -F 'language=en'
-```
-
-### Supported Parameters
-
-| Parameter         | Description                                             |
-| ----------------- | ------------------------------------------------------- |
-| `file`            | Path to a `.wav` audio file                             |
-| `model`           | Whisper model to use (e.g., `openai/whisper-small`)     |
-| `prompt`          | *(Optional)* Text prompt to guide the transcription     |
-| `response_format` | One of `json`, `text`, `srt`, `verbose_json`, or `vtt`  |
-| `temperature`     | *(Optional)* Sampling temperature as a float            |
-| `language`        | ISO 639-1 code (e.g., `en`, `fr`, `zh`)                 |
-
-## 4. Sample Output
-
-```json
-{
-  "text": "Testing testing testing the whisper small model testing testing testing the audio transcription function testing testing testing the whisper small model"
-}
-```
-
-## 5. Notes
-
-* Router uses extended HTTPX timeouts to support long transcription jobs.
-* This implementation dynamically discovers valid transcription backends and routes requests accordingly.
-
-## 6. Resources
-
-* [PR #469 – Add Whisper Transcription API](https://github.com/vllm-project/production-stack/pull/469)
-* [OpenAI Whisper GitHub](https://github.com/openai/whisper)
-* [Blog: vLLM Whisper Transcription Walkthrough](https://davidgao7.github.io/posts/vllm-v1-whisper-transcription/)

From 6c72f81d693b10225044ad514c387f0b52ad97d5 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Fri, 22 Aug 2025 17:05:29 +0800
Subject: [PATCH 26/28] chore: apply pre-commit

Signed-off-by: David Gao
---
 src/vllm_router/services/request_service/request.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py
index 9b44fe7a7..53654cf85 100644
--- a/src/vllm_router/services/request_service/request.py
+++ b/src/vllm_router/services/request_service/request.py
@@ -632,16 +632,18 @@ async def route_general_transcriptions(
         # add file data
         for key, (filename, content, content_type) in files.items():
-            form_data.add_field(key, content, filename=filename, content_type=content_type)
+            form_data.add_field(
+                key, content, filename=filename, content_type=content_type
+            )

         # add form data
         for key, value in data.items():
-            form_data.add_field(key,value)
+            form_data.add_field(key, value)

         backend_response = await client.post(
             f"{chosen_url}{endpoint}",
             data=form_data,
-            timeout=aiohttp.ClientTimeout(total=300)
+            timeout=aiohttp.ClientTimeout(total=300),
         )

         # --- 4. Return the response ---
@@ -670,5 +672,6 @@ async def route_general_transcriptions(
         return JSONResponse(status_code=e.status, content=error_content)
     except aiohttp.ClientError as e:
         return JSONResponse(
-            status_code=503, content={"error": f"Failed to connect to backend: {str(e)}"}
+            status_code=503,
+            content={"error": f"Failed to connect to backend: {str(e)}"},
         )

From 1a1c985c605021500f4a2adeef9e638f6b51a213 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Sat, 23 Aug 2025 01:08:14 +0800
Subject: [PATCH 27/28] chore: use debug-level logging for the proxying message

Signed-off-by: David Gao
---
 src/vllm_router/services/request_service/request.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py
index 53654cf85..f4ea337fa 100644
--- a/src/vllm_router/services/request_service/request.py
+++ b/src/vllm_router/services/request_service/request.py
@@ -602,7 +602,7 @@ async def route_general_transcriptions(
         request,
     )

-    logger.info("Proxying transcription request to %s", chosen_url)
+    logger.debug("Proxying transcription request to %s", chosen_url)

     # --- 3. Prepare and Proxy the Request ---
     payload_bytes = await file.read()

From 1d1b8288806299f59e73aa865935cddcc8d09bd8 Mon Sep 17 00:00:00 2001
From: David Gao
Date: Sat, 23 Aug 2025 01:54:40 +0800
Subject: [PATCH 28/28] chore: use more specific exception handling for aiohttp

Signed-off-by: David Gao
---
 .../services/request_service/request.py | 31 +++++++++++++------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py
index f4ea337fa..1c4147d4d 100644
--- a/src/vllm_router/services/request_service/request.py
+++ b/src/vllm_router/services/request_service/request.py
@@ -661,17 +661,30 @@ async def route_general_transcriptions(
             status_code=backend_response.status,
             headers=headers,
         )
-    except aiohttp.ClientResponseError as e:
-        if hasattr(e, "response") and e.response is not None:
+    except aiohttp.ClientResponseError as response_error:
+        if response_error.response is not None:
             try:
-                error_content = await e.response.json()
-            except:
-                error_content = await e.response.text()
+                error_content = await response_error.response.json()
+            except (
+                aiohttp.ContentTypeError,
+                json.JSONDecodeError,
+                aiohttp.ClientError,
+            ):
+                # If JSON parsing fails, get text content
+                try:
+                    text_content = await response_error.response.text()
+                    error_content = {"error": text_content}
+                except aiohttp.ClientError:
+                    error_content = {
+                        "error": f"HTTP {response_error.status}: {response_error.message}"
+                    }
         else:
-            error_content = {"error": f"HTTP {e.status}: {e.message}"}
-        return JSONResponse(status_code=e.status, content=error_content)
-    except aiohttp.ClientError as e:
+            error_content = {
+                "error": f"HTTP {response_error.status}: {response_error.message}"
+            }
+        return JSONResponse(status_code=response_error.status, content=error_content)
+    except aiohttp.ClientError as client_error:
         return JSONResponse(
             status_code=503,
-            content={"error": f"Failed to connect to backend: {str(e)}"},
+            content={"error": f"Failed to connect to backend: {str(client_error)}"},
        )