diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py new file mode 100644 index 000000000000..aed2e567146e --- /dev/null +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -0,0 +1,106 @@ +""" +This file serves as a documentation example and CI test. + +Structure: +1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. +2. Docs example (between __transcription_example_start/end__): Embedded in Sphinx docs via literalinclude. +3. Test validation (deployment status polling + cleanup) +""" + +import time +import openai +import requests +from ray import serve +from ray.serve.schema import ApplicationStatus +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME +from ray.serve import llm + +_original_serve_run = serve.run +_original_build_openai_app = llm.build_openai_app + + +def _non_blocking_serve_run(app, **kwargs): + """Forces blocking=False for testing""" + kwargs["blocking"] = False + return _original_serve_run(app, **kwargs) + + +def _testing_build_openai_app(llm_serving_args): + """Removes accelerator requirements for testing""" + for config in llm_serving_args["llm_configs"]: + config.accelerator_type = None + + return _original_build_openai_app(llm_serving_args) + + +serve.run = _non_blocking_serve_run +llm.build_openai_app = _testing_build_openai_app + +# __transcription_example_start__ +from ray import serve +from ray.serve.llm import LLMConfig, build_openai_app + +llm_config = LLMConfig( + model_loading_config={ + "model_id": "voxtral-mini", + "model_source": "mistralai/Voxtral-Mini-3B-2507", + }, + deployment_config={ + "autoscaling_config": { + "min_replicas": 1, + "max_replicas": 4, + } + }, + accelerator_type="A10G", + # You can customize the engine arguments (e.g. vLLM engine kwargs) + engine_kwargs={ + "tokenizer_mode": "mistral", + "config_format": "mistral", + "load_format": "mistral", + }, + log_engine_metrics=True, +) + +app = build_openai_app({"llm_configs": [llm_config]}) +serve.run(app, blocking=True) +# __transcription_example_end__ + +status = ApplicationStatus.NOT_STARTED +timeout_seconds = 300 +start_time = time.time() + +while ( + status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds +): + status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status + + if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: + raise AssertionError(f"Deployment failed with status: {status}") + + time.sleep(1) + +if status != ApplicationStatus.RUNNING: + raise AssertionError( + f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" + ) + +response = requests.get("https://voiceage.com/wbsamples/in_stereo/Sports.wav") +with open("audio.wav", "wb") as f: + f.write(response.content) + +client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key") + +with open("audio.wav", "rb") as f: + try: + response = client.audio.transcriptions.create( + model="voxtral-mini", + file=f, + temperature=0.0, + language="en", + ) + except Exception as e: + raise AssertionError( + f"Error while querying models: {e}. Check the logs for more details." 
+ ) + +serve.shutdown() diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md index 846fc79720c3..4ec9a44b6ad4 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -80,6 +80,66 @@ curl -X POST http://localhost:8000/v1/embeddings \ :::: + +## Transcriptions + +You can generate audio transcriptions using Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. Models supporting this use case are listed in the [vLLM transcription models documentation](https://docs.vllm.ai/en/stable/models/supported_models.html). + + +### Deploy a transcription model + +::::{tab-set} + +:::{tab-item} Server +:sync: server + +```{literalinclude} ../../../llm/doc_code/serve/transcription/transcription_example.py +:language: python +:start-after: __transcription_example_start__ +:end-before: __transcription_example_end__ +``` +::: + +:::{tab-item} Python Client +:sync: client + +```python +from openai import OpenAI + +# Initialize client +client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key") + +# Open audio file +with open("/path/to/audio.wav", "rb") as f: + # Make a request to the transcription model + response = client.audio.transcriptions.create( + model="whisper-large", + file=f, + temperature=0.0, + language="en", + ) + + print(response.text) +``` +::: + +:::{tab-item} cURL +:sync: curl + +```bash +curl http://localhost:8000/v1/audio/transcriptions \ + -X POST \ + -H "Authorization: Bearer fake-key" \ + -F "file=@/path/to/audio.wav" \ + -F "model=whisper-large" \ + -F "temperature=0.0" \ + -F "language=en" +``` +::: + +:::: + + ## Structured output You can request structured JSON output similar to OpenAI's API using JSON mode or JSON schema validation with Pydantic models. 
@@ -179,7 +239,6 @@ response = client.chat.completions.create( response_format={ "type": "json_schema", "json_schema": Color.model_json_schema() - }, messages=[ { diff --git a/python/deplocks/llm/rayllm_py311_cpu.lock b/python/deplocks/llm/rayllm_py311_cpu.lock index 9461ae88b62b..9ad44ad117b2 100644 --- a/python/deplocks/llm/rayllm_py311_cpu.lock +++ b/python/deplocks/llm/rayllm_py311_cpu.lock @@ -149,6 +149,12 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa billiard==4.2.1 \ --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb @@ -572,6 +578,12 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # ray +decorator==5.1.1 \ + --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ + --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa depyf==0.19.0 \ --hash=sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5 \ --hash=sha256:afed0916b32d141cc90fa6220df01885eda442ca43b297d5050eeb90b4a5cb44 @@ -1229,6 +1241,13 @@ jiter==0.8.2 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # openai +joblib==1.5.2 \ + --hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa + # scikit-learn jsonref==1.1.0 \ --hash=sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552 \ --hash=sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9 @@ -1267,7 +1286,14 @@ lazy-loader==0.4 \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa # scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # vllm llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ --hash=sha256:5e6f6cec9c6648164062f0347262b3ec7c39f54d1be5c5347d6446bc7fdba115 \ @@ -1544,6 +1570,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -1746,6 +1773,7 @@ numba==0.61.2 \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ @@ -1791,12 +1819,14 @@ numpy==1.26.4 \ # gguf # gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -1944,6 +1974,7 @@ 
packaging==23.0 \ # kombu # lazy-loader # lm-format-enforcer + # pooch # ray # scikit-image # tensorboardx @@ -2067,7 +2098,14 @@ platformdirs==3.11.0 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # pooch # virtualenv +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 @@ -2919,6 +2957,7 @@ requests==2.32.3 \ # google-api-core # huggingface-hub # mistral-common + # pooch # ray # tiktoken # transformers @@ -3089,6 +3128,41 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + --hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + --hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + --hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + --hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + --hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + 
--hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -3118,7 +3192,9 @@ scipy==1.11.4 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm sentencepiece==0.2.0 \ --hash=sha256:0461324897735512a32d222e3d886e24ad6a499761952b6bda2a9ee6e4313ea5 \ @@ -3317,7 +3393,9 @@ soundfile==0.13.1 \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa # mistral-common + # vllm soxr==0.5.0.post1 \ --hash=sha256:39e0f791ba178d69cd676485dbee37e75a34f20daa478d90341ecb7f6d9d690f \ --hash=sha256:4704ba6b13a3f1e41d12acf192878384c1c31f71ce606829c64abdf64a8d7d32 \ @@ -3342,6 +3420,7 @@ soxr==0.5.0.post1 \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa # mistral-common starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ @@ -3363,6 +3442,12 @@ tensorboardx==2.6.2.2 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt +threadpoolctl==3.6.0 \ + --hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -3518,6 +3603,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api diff --git a/python/deplocks/llm/rayllm_py311_cu128.lock b/python/deplocks/llm/rayllm_py311_cu128.lock index 8445dd9c5354..a7f14ffe377e 100644 --- a/python/deplocks/llm/rayllm_py311_cu128.lock +++ b/python/deplocks/llm/rayllm_py311_cu128.lock @@ -149,6 +149,12 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa billiard==4.2.1 \ --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb @@ -572,6 +578,12 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # ray +decorator==5.1.1 \ + --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ + --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa depyf==0.19.0 \ --hash=sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5 \ --hash=sha256:afed0916b32d141cc90fa6220df01885eda442ca43b297d5050eeb90b4a5cb44 @@ -1230,6 +1242,13 @@ jiter==0.10.0 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # openai +joblib==1.5.2 \ + 
--hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa + # scikit-learn jsonref==1.1.0 \ --hash=sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552 \ --hash=sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9 @@ -1268,7 +1287,14 @@ lazy-loader==0.4 \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa # scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # vllm llguidance==0.7.29 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:17fd439957d6ca5f459d0dec755a2d040c2dc946ed7e3c332b469ef6861292f8 \ --hash=sha256:1d30a76b30b646ac7f9025d262665f62bdbf2d43698115eeb1119c6ee062a36f \ @@ -1509,6 +1535,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -1710,6 +1737,7 @@ numba==0.61.2 \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ @@ -1755,12 +1783,14 @@ numpy==1.26.4 \ # gguf # gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -1984,6 +2014,7 @@ packaging==23.0 \ # kombu # lazy-loader # lm-format-enforcer + # pooch # ray # scikit-image # tensorboardx @@ -2107,7 +2138,14 @@ platformdirs==3.11.0 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # pooch # virtualenv +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 @@ -2959,6 +2997,7 @@ requests==2.32.3 \ # google-api-core # huggingface-hub # mistral-common + # pooch # ray # tiktoken # transformers @@ -3129,6 +3168,41 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + --hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + 
--hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + --hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + --hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + --hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + --hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -3158,7 +3232,9 @@ scipy==1.11.4 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm sentencepiece==0.2.0 \ --hash=sha256:0461324897735512a32d222e3d886e24ad6a499761952b6bda2a9ee6e4313ea5 \ @@ -3357,7 +3433,9 @@ soundfile==0.13.1 \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa # mistral-common + # vllm soxr==0.5.0.post1 \ --hash=sha256:39e0f791ba178d69cd676485dbee37e75a34f20daa478d90341ecb7f6d9d690f \ --hash=sha256:4704ba6b13a3f1e41d12acf192878384c1c31f71ce606829c64abdf64a8d7d32 \ @@ -3382,6 +3460,7 @@ soxr==0.5.0.post1 \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa # mistral-common starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ @@ -3403,6 +3482,12 @@ tensorboardx==2.6.2.2 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt +threadpoolctl==3.6.0 \ + 
--hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -3547,6 +3632,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api diff --git a/python/deplocks/llm/rayllm_test_py311_cpu.lock b/python/deplocks/llm/rayllm_test_py311_cpu.lock index 06eec0f1fbf6..d5f4c289ab74 100644 --- a/python/deplocks/llm/rayllm_test_py311_cpu.lock +++ b/python/deplocks/llm/rayllm_test_py311_cpu.lock @@ -219,6 +219,10 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via librosa azure-common==1.1.28 \ --hash=sha256:4ac0cd3214e36b6a1b6a442686722a5d8cc449603aa833f3f0f40bda836704a3 \ --hash=sha256:5c12d3dcf4ec20599ca6b0d3e09e86e146353d443e7fcc050c9a19c1f9df20ad @@ -766,6 +770,7 @@ decorator==5.1.1 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipython + # librosa defusedxml==0.7.1 \ --hash=sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69 \ --hash=sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61 @@ -1678,6 +1683,12 @@ jmespath==1.0.1 \ # -c python/deplocks/llm/ray_test_py311_cpu.lock # boto3 # botocore +joblib==1.5.2 \ + --hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # librosa + # scikit-learn json5==0.9.14 \ --hash=sha256:740c7f1b9e584a468dbb2939d8d458db3427f2c93ae2139d05f47e453eae964f \ --hash=sha256:9ed66c3a6ca3510a976a9ef9b8c0787de24802724ab1860bc0153c7fdd589b02 @@ -1821,7 +1832,12 @@ lazy-loader==0.4 \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/ray_test_py311_cpu.lock + # librosa # scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via vllm llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ --hash=sha256:5e6f6cec9c6648164062f0347262b3ec7c39f54d1be5c5347d6446bc7fdba115 \ @@ -2223,6 +2239,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -2472,7 +2489,9 @@ numba==0.61.2 \ --hash=sha256:cf9f9fc00d6eca0c23fc840817ce9f439b9f03c8f03d6246c0e7f0cb15b7162a \ --hash=sha256:ea0247617edcb5dd61f6106a56255baab031acc4257bddaeddb3a1003b4ca3fd \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 - # via vllm + # via + # librosa + # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ @@ -2517,12 +2536,14 @@ numpy==1.26.4 \ # gguf # 
gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -2680,6 +2701,7 @@ packaging==23.0 \ # lazy-loader # lm-format-enforcer # nbconvert + # pooch # pytest # ray # scikit-image @@ -2835,6 +2857,7 @@ platformdirs==3.11.0 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-core + # pooch # virtualenv pluggy==1.3.0 \ --hash=sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12 \ @@ -2842,6 +2865,10 @@ pluggy==1.3.0 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # pytest +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via librosa portalocker==2.8.2 \ --hash=sha256:2b035aa7828e46c58e9b31390ee1f169b98e1066ab10b9a6a861fe7e25ee4f33 \ --hash=sha256:cfb86acc09b9aa7c3b43594e19be1345b9d16af3feb08bf92f23d4dce513a28e @@ -3804,6 +3831,7 @@ requests==2.32.3 \ # jupyterlab-server # mistral-common # msal + # pooch # ray # smart-open # sphinx @@ -3996,6 +4024,39 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + --hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + --hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + --hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + --hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + 
--hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + --hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -4025,7 +4086,9 @@ scipy==1.11.4 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm send2trash==1.8.3 \ --hash=sha256:0c31227e0bd08961c7665474a3d1ef7193929fedda4233843689baa056be46c9 \ @@ -4246,7 +4309,10 @@ soundfile==0.13.1 \ --hash=sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445 \ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 - # via mistral-common + # via + # librosa + # mistral-common + # vllm soupsieve==2.5 \ --hash=sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690 \ --hash=sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7 @@ -4275,7 +4341,9 @@ soxr==0.5.0.post1 \ --hash=sha256:fa0a382fb8d8e2afed2c1642723b2d2d1b9a6728ff89f77f3524034c8885b8c9 \ --hash=sha256:fcc049b0a151a65aa75b92f0ac64bb2dba785d16b78c31c2b94e68c141751d6d \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 - # via mistral-common + # via + # librosa + # mistral-common sphinx==6.2.1 \ --hash=sha256:6d56a34697bb749ffa0152feafc4b19836c755d90a7c59b72bc7dfd371b9cc6b \ --hash=sha256:97787ff1fa3256a3eef9eda523a63dbf299f7b47e053cfcf684a1c2a8380c912 @@ -4354,6 +4422,10 @@ terminado==0.18.1 \ # jupyter-server # nbclassic # notebook +threadpoolctl==3.6.0 \ + --hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -4585,6 +4657,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api diff --git a/python/deplocks/llm/rayllm_test_py311_cu128.lock b/python/deplocks/llm/rayllm_test_py311_cu128.lock index 34a7a94ed149..ab9931f03ebf 100644 --- a/python/deplocks/llm/rayllm_test_py311_cu128.lock +++ b/python/deplocks/llm/rayllm_test_py311_cu128.lock @@ -219,6 +219,10 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via librosa azure-common==1.1.28 \ --hash=sha256:4ac0cd3214e36b6a1b6a442686722a5d8cc449603aa833f3f0f40bda836704a3 \ --hash=sha256:5c12d3dcf4ec20599ca6b0d3e09e86e146353d443e7fcc050c9a19c1f9df20ad @@ -765,6 +769,7 @@ decorator==5.1.1 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipython + # librosa defusedxml==0.7.1 \ --hash=sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69 \ --hash=sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61 @@ -1678,6 +1683,12 @@ jmespath==1.0.1 \ # -c 
python/deplocks/llm/ray_test_py311_cu128.lock # boto3 # botocore +joblib==1.5.2 \ + --hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # librosa + # scikit-learn json5==0.9.14 \ --hash=sha256:740c7f1b9e584a468dbb2939d8d458db3427f2c93ae2139d05f47e453eae964f \ --hash=sha256:9ed66c3a6ca3510a976a9ef9b8c0787de24802724ab1860bc0153c7fdd589b02 @@ -1821,7 +1832,12 @@ lazy-loader==0.4 \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/ray_test_py311_cu128.lock + # librosa # scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via vllm llguidance==0.7.29 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:17fd439957d6ca5f459d0dec755a2d040c2dc946ed7e3c332b469ef6861292f8 \ --hash=sha256:1d30a76b30b646ac7f9025d262665f62bdbf2d43698115eeb1119c6ee062a36f \ @@ -2187,6 +2203,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -2435,7 +2452,9 @@ numba==0.61.2 \ --hash=sha256:cf9f9fc00d6eca0c23fc840817ce9f439b9f03c8f03d6246c0e7f0cb15b7162a \ --hash=sha256:ea0247617edcb5dd61f6106a56255baab031acc4257bddaeddb3a1003b4ca3fd \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 - # via vllm + # via + # librosa + # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ @@ -2480,12 +2499,14 @@ numpy==1.26.4 \ # gguf # gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -2694,6 +2715,7 @@ packaging==23.0 \ # lazy-loader # lm-format-enforcer # nbconvert + # pooch # pytest # ray # scikit-image @@ -2849,6 +2871,7 @@ platformdirs==3.11.0 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-core + # pooch # virtualenv pluggy==1.3.0 \ --hash=sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12 \ @@ -2856,6 +2879,10 @@ pluggy==1.3.0 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # pytest +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via librosa portalocker==2.8.2 \ --hash=sha256:2b035aa7828e46c58e9b31390ee1f169b98e1066ab10b9a6a861fe7e25ee4f33 \ --hash=sha256:cfb86acc09b9aa7c3b43594e19be1345b9d16af3feb08bf92f23d4dce513a28e @@ -3818,6 +3845,7 @@ requests==2.32.3 \ # jupyterlab-server # mistral-common # msal + # pooch # ray # smart-open # sphinx @@ -4010,6 +4038,39 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + 
--hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + --hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + --hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + --hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + --hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + --hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -4039,7 +4100,9 @@ scipy==1.11.4 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm send2trash==1.8.3 \ --hash=sha256:0c31227e0bd08961c7665474a3d1ef7193929fedda4233843689baa056be46c9 \ @@ -4260,7 +4323,10 @@ soundfile==0.13.1 \ --hash=sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445 \ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 - # via mistral-common + # via + # librosa + # mistral-common + # vllm soupsieve==2.5 \ --hash=sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690 \ --hash=sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7 @@ -4289,7 +4355,9 @@ soxr==0.5.0.post1 \ --hash=sha256:fa0a382fb8d8e2afed2c1642723b2d2d1b9a6728ff89f77f3524034c8885b8c9 \ --hash=sha256:fcc049b0a151a65aa75b92f0ac64bb2dba785d16b78c31c2b94e68c141751d6d \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 - # via mistral-common + # via + 
# librosa + # mistral-common sphinx==6.2.1 \ --hash=sha256:6d56a34697bb749ffa0152feafc4b19836c755d90a7c59b72bc7dfd371b9cc6b \ --hash=sha256:97787ff1fa3256a3eef9eda523a63dbf299f7b47e053cfcf684a1c2a8380c912 @@ -4368,6 +4436,10 @@ terminado==0.18.1 \ # jupyter-server # nbclassic # notebook +threadpoolctl==3.6.0 \ + --hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -4589,6 +4661,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index ed2adf153d2c..9fc708ce0bc6 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -25,6 +25,9 @@ ErrorResponse as vLLMErrorResponse, ScoreRequest as vLLMScoreRequest, ScoreResponse as vLLMScoreResponse, + TranscriptionRequest as vLLMTranscriptionRequest, + TranscriptionResponse as vLLMTranscriptionResponse, + TranscriptionStreamResponse as vLLMTranscriptionStreamResponse, ) from vllm.utils import random_uuid @@ -96,6 +99,27 @@ class EmbeddingResponse(vLLMEmbeddingResponse): model_config = ConfigDict(arbitrary_types_allowed=True) +class TranscriptionRequest(vLLMTranscriptionRequest): + model_config = ConfigDict(arbitrary_types_allowed=True) + + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "throughout the inference process and returned in the response." 
+ ), + ) + + +class TranscriptionResponse(vLLMTranscriptionResponse): + model_config = ConfigDict(arbitrary_types_allowed=True) + + +class TranscriptionStreamResponse(vLLMTranscriptionStreamResponse): + model_config = ConfigDict(arbitrary_types_allowed=True) + + class ScoreRequest(vLLMScoreRequest): model_config = ConfigDict(arbitrary_types_allowed=True) @@ -115,15 +139,26 @@ class ScoreResponse(vLLMScoreResponse): ] LLMChatResponse = Union[ - AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None], + AsyncGenerator[ + Union[str, ChatCompletionStreamResponse, ChatCompletionResponse, ErrorResponse], + None, + ], ] LLMCompletionsResponse = Union[ AsyncGenerator[ - Union[CompletionStreamResponse, CompletionResponse, ErrorResponse], None + Union[str, CompletionStreamResponse, CompletionResponse, ErrorResponse], None + ], +] + +LLMTranscriptionResponse = Union[ + AsyncGenerator[ + Union[str, TranscriptionStreamResponse, TranscriptionResponse, ErrorResponse], + None, ], ] + # TODO: remove this class class OpenAIHTTPException(Exception): def __init__( diff --git a/python/ray/llm/_internal/serve/core/engine/protocol.py b/python/ray/llm/_internal/serve/core/engine/protocol.py index 56bcc5acf827..c36b8073d0da 100644 --- a/python/ray/llm/_internal/serve/core/engine/protocol.py +++ b/python/ray/llm/_internal/serve/core/engine/protocol.py @@ -15,6 +15,8 @@ EmbeddingRequest, EmbeddingResponse, ErrorResponse, + TranscriptionRequest, + TranscriptionResponse, ) @@ -118,6 +120,35 @@ async def embeddings( """ pass + @abc.abstractmethod + async def transcriptions( + self, request: "TranscriptionRequest" + ) -> AsyncGenerator[Union[str, "TranscriptionResponse", "ErrorResponse"], None]: + """Run a transcription with the engine. + + Similar to chat and completion, this method is an async generator, + so it yields chunks of the response, and when it is done, it returns None. + We have the following convention: + + * In case of streaming, yield a string representing data: + \n\n for each chunk. This should already be OpenAI-compatible, + so the higher level can just yield it to the client. + * In case of non-streaming, yield a single object of type TranscriptionResponse. + * In case of error, yield a single object of type ErrorResponse. + + Args: + request: The transcription request. + + Yields: + Union[str, TranscriptionResponse, ErrorResponse]: A string + representing a chunk of the response, a TranscriptionResponse object, + or an ErrorResponse object. + + Returns: + None when the generator is done. + """ + pass + async def check_health(self) -> None: """Check the health of the engine. 
diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index cb61e1ab7a22..29a9e17ada4d 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -2,7 +2,9 @@ import json import sys from contextlib import asynccontextmanager +from enum import Enum from typing import ( + Annotated, Any, AsyncGenerator, Awaitable, @@ -16,7 +18,7 @@ Union, ) -from fastapi import FastAPI, HTTPException, status +from fastapi import FastAPI, Form, HTTPException, status from fastapi.middleware.cors import CORSMiddleware from starlette.responses import JSONResponse, Response, StreamingResponse @@ -45,11 +47,15 @@ LLMCompletionsResponse, LLMEmbeddingsResponse, LLMScoreResponse, + LLMTranscriptionResponse, ModelCard, ModelList, OpenAIHTTPException, ScoreRequest, ScoreResponse, + TranscriptionRequest, + TranscriptionResponse, + TranscriptionStreamResponse, to_model_metadata, ) from ray.llm._internal.serve.core.ingress.middleware import ( @@ -83,6 +89,19 @@ "max_ongoing_requests": DEFAULT_MAX_ONGOING_REQUESTS, } +# These methods correspond to functions defined in the LLMEngine class in python/ray/llm/_internal/serve/deployments/llm/llm_engine.py +class CallMethod(Enum): + CHAT = "chat" + COMPLETIONS = "completions" + TRANSCRIPTIONS = "transcriptions" + + +NON_STREAMING_RESPONSE_TYPES = ( + ChatCompletionResponse, + CompletionResponse, + TranscriptionResponse, +) + def _sanitize_chat_completion_request( request: ChatCompletionRequest, @@ -108,8 +127,7 @@ def _sanitize_chat_completion_request( StreamResponseType = Union[ - ChatCompletionStreamResponse, - CompletionStreamResponse, + ChatCompletionStreamResponse, CompletionStreamResponse, TranscriptionStreamResponse ] BatchedStreamResponseType = List[StreamResponseType] @@ -122,6 +140,9 @@ def _sanitize_chat_completion_request( "completions": lambda app: app.post("/v1/completions"), "chat": lambda app: app.post("/v1/chat/completions"), "embeddings": lambda app: app.post("/v1/embeddings"), + "transcriptions": lambda app: app.post( + "/v1/audio/transcriptions", + ), "score": lambda app: app.post("/v1/score"), } @@ -227,7 +248,7 @@ def make_fastapi_ingress( def _apply_openai_json_format( - response: Union[StreamResponseType, BatchedStreamResponseType] + response: Union[StreamResponseType, BatchedStreamResponseType], ) -> str: """Converts the stream response to OpenAI format. 
@@ -256,7 +277,7 @@ def _apply_openai_json_format( async def _peek_at_generator( - gen: AsyncGenerator[T, None] + gen: AsyncGenerator[T, None], ) -> Tuple[T, AsyncGenerator[T, None]]: # Peek at the first element first_item = await gen.__anext__() @@ -403,7 +424,11 @@ async def _get_response( self, *, body: Union[ - CompletionRequest, ChatCompletionRequest, EmbeddingRequest, ScoreRequest + CompletionRequest, + ChatCompletionRequest, + EmbeddingRequest, + TranscriptionRequest, + ScoreRequest, ], call_method: str, ) -> AsyncGenerator[ @@ -411,6 +436,7 @@ LLMChatResponse, LLMCompletionsResponse, LLMEmbeddingsResponse, + LLMTranscriptionResponse, LLMScoreResponse, ], None, @@ -497,12 +523,10 @@ async def model_data(self, model: str) -> ModelCard: return model_data async def _process_llm_request( - self, body: Union[CompletionRequest, ChatCompletionRequest], is_chat: bool + self, + body: Union[CompletionRequest, ChatCompletionRequest, TranscriptionRequest], + call_method: str, ) -> Response: - NoneStreamingResponseType = ( - ChatCompletionResponse if is_chat else CompletionResponse - ) - call_method = "chat" if is_chat else "completions" async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): @@ -523,7 +547,7 @@ type=first_chunk.error.type, ) - if isinstance(first_chunk, NoneStreamingResponseType): + if isinstance(first_chunk, NON_STREAMING_RESPONSE_TYPES): # Not streaming, first chunk should be a single response return JSONResponse(content=first_chunk.model_dump()) @@ -544,7 +568,9 @@ async def completions(self, body: CompletionRequest) -> Response: Returns: A response object with completions. """ - return await self._process_llm_request(body, is_chat=False) + return await self._process_llm_request( + body, call_method=CallMethod.COMPLETIONS.value + ) async def chat(self, body: ChatCompletionRequest) -> Response: """Given a prompt, the model will return one or more predicted completions, @@ -557,7 +583,7 @@ A response object with completions. """ - return await self._process_llm_request(body, is_chat=True) + return await self._process_llm_request(body, call_method=CallMethod.CHAT.value) async def embeddings(self, body: EmbeddingRequest) -> Response: """Create embeddings for the provided input. @@ -581,6 +607,24 @@ if isinstance(result, EmbeddingResponse): return JSONResponse(content=result.model_dump()) + # Annotated[..., Form()] is a wrapper used to handle multipart form data, which is how audio is sent in transcription requests. + # vLLM implementation for handling transcription requests: https://github.com/vllm-project/vllm/blob/0825197bee8dea547f2ab25f48afd8aea0cd2578/vllm/entrypoints/openai/api_server.py#L839. + async def transcriptions( + self, body: Annotated[TranscriptionRequest, Form()] + ) -> Response: + """Create a transcription for the provided audio input. + + Args: + body: The TranscriptionRequest object. + + Returns: + A response object with transcriptions. + """ + + return await self._process_llm_request( + body, call_method=CallMethod.TRANSCRIPTIONS.value + ) + async def score(self, body: ScoreRequest) -> Response: """Create scores for the provided text pairs. 
diff --git a/python/ray/llm/_internal/serve/core/server/llm_server.py b/python/ray/llm/_internal/serve/core/server/llm_server.py index 263d934f0020..0da17151cc12 100644 --- a/python/ray/llm/_internal/serve/core/server/llm_server.py +++ b/python/ray/llm/_internal/serve/core/server/llm_server.py @@ -52,6 +52,8 @@ ErrorResponse, ScoreRequest, ScoreResponse, + TranscriptionRequest, + TranscriptionResponse, ) logger = get_logger(__name__) @@ -251,7 +253,10 @@ def _get_batch_interval_ms(self, stream: bool = True) -> int: async def _maybe_add_request_id_to_request( self, request: Union[ - "ChatCompletionRequest", "CompletionRequest", "EmbeddingRequest" + "ChatCompletionRequest", + "CompletionRequest", + "EmbeddingRequest", + "TranscriptionRequest", ], ): """Add the request id to the request.""" @@ -282,6 +287,7 @@ async def _run_request( "ChatCompletionRequest", "CompletionRequest", "EmbeddingRequest", + "TranscriptionRequest", "ScoreRequest", ], *, @@ -355,7 +361,7 @@ async def embeddings( ) -> AsyncGenerator[Union[List["ErrorResponse"], "EmbeddingResponse"], None]: """Runs an embeddings request to the engine and returns the response. - Returns an AsyncGenerator over the EmbeddingResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, and embeddings. + Returns an AsyncGenerator over the EmbeddingResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, embeddings, and transcriptions. Args: request: An EmbeddingRequest object. @@ -365,7 +371,30 @@ """ # NOTE: Embeddings does not need batching. return await self._run_request( - request, engine_method="embeddings", batch_output_stream=False + request, + engine_method="embeddings", + batch_output_stream=False, + ) + + async def transcriptions( + self, request: "TranscriptionRequest" + ) -> AsyncGenerator[ + Union[List[Union[str, "ErrorResponse"]], "TranscriptionResponse"], None + ]: + """Runs a transcription request to the engine and returns the response. + + Returns an AsyncGenerator over the TranscriptionResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, embeddings, and transcriptions. + + Args: + request: A TranscriptionRequest object. + + Returns: + An AsyncGenerator over the TranscriptionResponse object. 
+ """ + return await self._run_request( + request, + engine_method="transcriptions", + batch_output_stream=True, ) async def score( diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index 6c422f38a48b..13be7465f885 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -26,6 +26,8 @@ ErrorResponse, ScoreRequest, ScoreResponse, + TranscriptionRequest, + TranscriptionResponse, ) from ray.llm._internal.serve.core.engine.protocol import LLMEngine from ray.llm._internal.serve.engines.vllm.vllm_models import ( @@ -46,6 +48,7 @@ from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_score import ServingScores + from vllm.entrypoints.openai.serving_transcription import OpenAIServingTranscription vllm = try_import("vllm") logger = get_logger(__name__) @@ -147,6 +150,7 @@ def __init__( self._oai_serving_chat: Optional["OpenAIServingChat"] = None self._oai_serving_completion: Optional["OpenAIServingCompletion"] = None self._oai_serving_embedding: Optional["OpenAIServingEmbedding"] = None + self._oai_serving_transcription: Optional["OpenAIServingTranscription"] = None self._oai_serving_scores: Optional["ServingScores"] = None async def start(self) -> None: @@ -208,6 +212,7 @@ async def start(self) -> None: self._oai_serving_chat = state.openai_serving_chat self._oai_serving_completion = state.openai_serving_completion self._oai_serving_embedding = state.openai_serving_embedding + self._oai_serving_transcription = state.openai_serving_transcription self._oai_serving_scores = state.openai_serving_scores self._validate_openai_serving_models() @@ -241,6 +246,11 @@ def _validate_openai_serving_embedding(self): self._oai_serving_embedding, "create_embedding" ), "oai_serving_embedding must have a create_embedding attribute" + def _validate_openai_serving_transcription(self): + assert hasattr( + self._oai_serving_transcription, "create_transcription" + ), "oai_serving_transcription must have a create_transcription attribute" + def _validate_openai_serving_scores(self): assert hasattr( self._oai_serving_scores, "create_score" @@ -351,7 +361,11 @@ async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): def _create_raw_request( self, request: Union[ - CompletionRequest, ChatCompletionRequest, EmbeddingRequest, ScoreRequest + CompletionRequest, + ChatCompletionRequest, + EmbeddingRequest, + TranscriptionRequest, + ScoreRequest, ], path: str, ) -> Request: @@ -383,7 +397,7 @@ async def chat( async for response in chat_response: if not isinstance(response, str): raise ValueError( - f"Expected create_chat_completion to return a stream of strings, got and item with type {type(response)}" + f"Expected create_chat_completion to return a stream of strings, got an item with type {type(response)}" ) yield response else: @@ -412,7 +426,7 @@ async def completions( async for response in completion_response: if not isinstance(response, str): raise ValueError( - f"Expected create_completion to return a stream of strings, got and item with type {type(response)}" + f"Expected create_completion to return a stream of strings, got an item with type {type(response)}" ) yield response else: @@ -444,6 +458,41 @@ async def embeddings( else: yield EmbeddingResponse(**embedding_response.model_dump()) + async def transcriptions( + 
self, request: TranscriptionRequest + ) -> AsyncGenerator[Union[str, TranscriptionResponse, ErrorResponse], None]: + self._validate_openai_serving_transcription() + + # TODO (Kourosh): Remove when we upstream request_id attribute to vLLM. + # PR: https://github.com/vllm-project/vllm/pull/21009 + # Create a fake starlette.Request object with the x-request-id header + # so that the create_transcription API can assign the request_id properly. + raw_request = self._create_raw_request(request, "/audio/transcriptions") + + # Extract audio data from the request file + audio_data = await request.file.read() + + transcription_response = await self._oai_serving_transcription.create_transcription( # type: ignore[attr-defined] + audio_data, + request, + raw_request=raw_request, + ) + + if isinstance(transcription_response, AsyncGenerator): + async for response in transcription_response: + if not isinstance(response, str): + raise ValueError( + f"Expected create_transcription to return a stream of strings, got an item with type {type(response)}" + ) + yield response + else: + if isinstance(transcription_response, VLLMErrorResponse): + yield ErrorResponse( + error=ErrorInfo(**transcription_response.error.model_dump()) + ) + else: + yield TranscriptionResponse(**transcription_response.model_dump()) + async def score( self, request: ScoreRequest ) -> AsyncGenerator[Union[ScoreResponse, ErrorResponse], None]: diff --git a/python/ray/llm/tests/serve/conftest.py b/python/ray/llm/tests/serve/conftest.py index 6598fe1dff1d..071e572a06f4 100644 --- a/python/ray/llm/tests/serve/conftest.py +++ b/python/ray/llm/tests/serve/conftest.py @@ -16,6 +16,7 @@ CompletionRequest, EmbeddingCompletionRequest, ScoreRequest, + TranscriptionRequest, ) from ray.llm._internal.serve.engines.vllm.vllm_models import ( VLLMEngineConfig, @@ -113,6 +114,31 @@ def mock_embedding_request(dimensions): return request +@pytest.fixture +def mock_transcription_request(stream, temperature, language): + """Fixture for creating transcription requests for mock testing.""" + # Create a mock audio file for testing + from io import BytesIO + + from fastapi import UploadFile + + # Create a simple mock audio file (WAV format) + mock_audio_data = b"RIFF\x00\x00\x00\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00\x44\xac\x00\x00\x88X\x01\x00\x02\x00\x10\x00data\x00\x00\x00\x00" # random byte string to test the transcription API + mock_file = UploadFile( + file=BytesIO(mock_audio_data), + filename="test_audio.wav", + ) + + return TranscriptionRequest( + file=mock_file, + model=MOCK_MODEL_ID, + language=language, + temperature=temperature, + stream=stream, + prompt="", + ) + + @pytest.fixture def mock_score_request(): """Fixture for creating score requests for mock testing.""" diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py index 4b259756aae6..5025b9d1d37b 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py @@ -83,6 +83,48 @@ async def test_embedding_mock_engine( async for response in engine.embeddings(request): LLMResponseValidator.validate_embedding_response(response, dimensions) + @pytest.mark.parametrize("stream", [False, True]) + @pytest.mark.parametrize("temperature", [0.0]) + @pytest.mark.parametrize("language", ["en", "hi"]) + @pytest.mark.asyncio + async def test_transcription_mock_engine( + self, + mock_llm_config, + mock_transcription_request, + stream: 
bool, + temperature: float, + language: Optional[str], + ): + """Test transcription API with different language and temperature, streaming and non-streaming.""" + + engine = MockVLLMEngine(mock_llm_config) + await engine.start() + + request = mock_transcription_request + response_generator = engine.transcriptions(request) + + print( + f"\n\n_____ TRANSCRIPTION ({'STREAMING' if stream else 'NON-STREAMING'}) language={language} temperature={temperature} _____\n\n" + ) + + if stream: + # Collect streaming chunks + chunks = [] + async for chunk in response_generator: + assert isinstance(chunk, str) + chunks.append(chunk) + + # Validate streaming response + LLMResponseValidator.validate_transcription_response( + chunks, temperature, language + ) + else: + # Validate non-streaming response + async for response in response_generator: + LLMResponseValidator.validate_transcription_response( + response, temperature, language + ) + @pytest.mark.asyncio async def test_score_mock_engine(self, mock_llm_config, mock_score_request): """Test score API for text similarity.""" diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py index 26814d6260f9..de74530d3e35 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py @@ -156,6 +156,61 @@ async def test_embedding_llm_server( # Validate embedding response LLMResponseValidator.validate_embedding_response(chunks[0], dimensions) + @pytest.mark.parametrize("stream", [False, True]) + @pytest.mark.parametrize("temperature", [0.0]) + @pytest.mark.parametrize("language", ["en", "hi"]) + @pytest.mark.asyncio + async def test_transcription_llm_server( + self, + serve_handle, + mock_llm_config, + mock_transcription_request, + stream: bool, + temperature: float, + language: Optional[str], + ): + """Test transcription API from LLMServer perspective.""" + + # Create transcription request + request = mock_transcription_request + + print( + f"\n\n_____ TRANSCRIPTION SERVER ({'STREAMING' if stream else 'NON-STREAMING'}) language={language} temperature={temperature} _____\n\n" + ) + + # Get the response + batched_chunks = serve_handle.transcriptions.remote(request) + + if stream: + # Collect streaming responses + chunks = [] + async for batch in batched_chunks: + if isinstance(batch, list): + chunks.extend(batch) + else: + chunks.append(batch) + + # Check that we got responses + assert len(chunks) > 0 + + # Validate streaming response + LLMResponseValidator.validate_transcription_response( + chunks, temperature, language + ) + else: + # Collect non-streaming response + chunks = [] + async for batch in batched_chunks: + chunks.append(batch) + + # Check that we got one response + assert len(chunks) == 1 + + # Validate non-streaming response + LLMResponseValidator.validate_transcription_response( + chunks[0], temperature, language + ) + @pytest.mark.asyncio async def test_score_llm_server( self, diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index 4300b4859b91..c23e56b5e088 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -19,6 +19,8 @@ ErrorResponse, ScoreRequest, ScoreResponse, + TranscriptionRequest, + TranscriptionResponse, ) from ray.llm._internal.serve.core.engine.protocol import LLMEngine from ray.llm._internal.serve.utils.lora_serve_utils 
import LoraModelLoader @@ -137,6 +139,23 @@ async def embeddings( ) yield response + async def transcriptions( + self, request: TranscriptionRequest + ) -> AsyncGenerator[Union[str, TranscriptionResponse, ErrorResponse], None]: + """Mock transcription generation.""" + if not self.started: + raise RuntimeError("Engine not started") + + # Extract audio file info + language = getattr(request, "language", "en") + temperature = getattr(request, "temperature", 0.0) + + # Generate transcription response + async for response in self._generate_transcription_response( + request=request, language=language, temperature=temperature + ): + yield response + async def score( self, request: ScoreRequest ) -> AsyncGenerator[Union[str, ScoreResponse, ErrorResponse], None]: @@ -314,6 +333,95 @@ async def _generate_completion_response( yield response + async def _generate_transcription_response( + self, + request: TranscriptionRequest, + language: str, + temperature: float, + ) -> AsyncGenerator[Union[str, TranscriptionResponse], None]: + """Generate mock transcription response.""" + + request_id = request.request_id or f"transcribe-{random.randint(1000, 9999)}" + lora_prefix = ( + "" + if request.model not in self._current_lora_model + else f"[lora_model] {request.model}: " + ) + + # Generate mock transcription text with LoRA prefix + mock_transcription_text = ( + f"Mock transcription in {language} language with temperature {temperature}" + ) + if lora_prefix: + mock_transcription_text = f"{lora_prefix}{mock_transcription_text}" + + if request.stream: + # Streaming response - return SSE formatted strings + created_time = int(asyncio.get_event_loop().time()) + model_name = getattr(request, "model", "mock-model") + + # Split transcription into words for streaming + words = mock_transcription_text.split() + + for i, word in enumerate(words): + # Create streaming chunk + choice = { + "delta": { + "content": word + (" " if i < len(words) - 1 else ""), + }, + } + + chunk_data = { + "delta": None, + "type": None, + "logprobs": None, + "id": request_id, + "object": "transcription.chunk", + "created": created_time, + "model": model_name, + "choices": [choice], + } + + # Format as SSE + yield f"data: {json.dumps(chunk_data)}\n\n" + await asyncio.sleep(0.01) # Simulate processing time + + # Send final chunk with finish_reason + final_choice = { + "delta": { + "content": "", + "finish_reason": "stop", + "stop_reason": None, + }, + } + + final_chunk_data = { + "delta": None, + "type": None, + "logprobs": None, + "id": request_id, + "object": "transcription.chunk", + "created": created_time, + "model": model_name, + "choices": [final_choice], + } + + yield f"data: {json.dumps(final_chunk_data)}\n\n" + + # Send final [DONE] message + yield "data: [DONE]\n\n" + else: + # Non-streaming response - return response object + response = TranscriptionResponse( + text=mock_transcription_text, + logprobs=None, + usage={ + "seconds": 5.0, + "type": "duration", + }, + ) + yield response + class FakeLoraModelLoader(LoraModelLoader): """Fake LoRA model loader for testing that bypasses S3 entirely.""" diff --git a/python/ray/llm/tests/serve/utils/testing_utils.py b/python/ray/llm/tests/serve/utils/testing_utils.py index c63c92921b6c..0a8b4a95ad56 100644 --- a/python/ray/llm/tests/serve/utils/testing_utils.py +++ b/python/ray/llm/tests/serve/utils/testing_utils.py @@ -12,6 +12,7 @@ CompletionResponse, EmbeddingResponse, ScoreResponse, + TranscriptionResponse, ) @@ -108,3 +109,88 @@ def validate_score_response(response: ScoreResponse): 
assert score_data.object == "score" assert isinstance(score_data.score, float) assert score_data.index == i # Index should match position in list + + @staticmethod + def validate_transcription_response( + response: Union[TranscriptionResponse, List[str]], + temperature: float, + language: Optional[str] = None, + lora_model_id: str = "", + ): + """Validate transcription responses for both streaming and non-streaming.""" + if isinstance(response, list): + # Streaming response - validate chunks + LLMResponseValidator.validate_transcription_streaming_chunks( + response, temperature, language, lora_model_id + ) + else: + # Non-streaming response + assert isinstance(response, TranscriptionResponse) + assert hasattr(response, "text") + assert isinstance(response.text, str) + assert len(response.text) > 0 + + # Check that the response contains expected language and temperature info + expected_text = f"Mock transcription in {language} language with temperature {temperature}" + if lora_model_id: + expected_text = f"[lora_model] {lora_model_id}: {expected_text}" + assert response.text == expected_text + + # Validate usage information + if hasattr(response, "usage"): + assert hasattr(response.usage, "seconds") + assert hasattr(response.usage, "type") + assert response.usage.seconds > 0 + assert response.usage.type == "duration" + + @staticmethod + def validate_transcription_streaming_chunks( + chunks: List[str], + temperature: float, + language: Optional[str] = None, + lora_model_id: str = "", + ): + """Validate streaming transcription response chunks.""" + # Should have at least one chunk (transcription text) + final chunk + [DONE] + assert len(chunks) >= 3 + + # Validate each chunk except the last [DONE] chunk + transcription_chunks = [] + for chunk in chunks[:-1]: # Exclude the final [DONE] chunk + pattern = r"data: (.*)\n\n" + match = re.match(pattern, chunk) + assert match is not None + chunk_data = json.loads(match.group(1)) + + # Validate chunk structure + assert "id" in chunk_data + assert "object" in chunk_data + assert chunk_data["object"] == "transcription.chunk" + assert "delta" in chunk_data + assert chunk_data["delta"] is None + assert "type" in chunk_data + assert chunk_data["type"] is None + assert "logprobs" in chunk_data + assert chunk_data["logprobs"] is None + assert "choices" in chunk_data + assert len(chunk_data["choices"]) == 1 + + choice = chunk_data["choices"][0] + assert "delta" in choice + assert "content" in choice["delta"] + + # Collect text for final validation + if choice["delta"]["content"]: + transcription_chunks.append(choice["delta"]["content"]) + + # Validate final transcription text + full_transcription = "".join(transcription_chunks) + expected_text = ( + f"Mock transcription in {language} language with temperature {temperature}" + ) + if lora_model_id: + expected_text = f"[lora_model] {lora_model_id}: {expected_text}" + assert full_transcription.strip() == expected_text.strip() + + # Validate final [DONE] chunk + assert chunks[-1] == "data: [DONE]\n\n" diff --git a/python/ray/serve/llm/openai_api_models.py b/python/ray/serve/llm/openai_api_models.py index 4b04d54dbfdd..18603ac3deb0 100644 --- a/python/ray/serve/llm/openai_api_models.py +++ b/python/ray/serve/llm/openai_api_models.py @@ -8,6 +8,9 @@ EmbeddingRequest as _EmbeddingRequest, EmbeddingResponse as _EmbeddingResponse, ErrorResponse as _ErrorResponse, + TranscriptionRequest as _TranscriptionRequest, + TranscriptionResponse as _TranscriptionResponse, + TranscriptionStreamResponse as 
_TranscriptionStreamResponse,
 )
 from ray.util.annotations import PublicAPI
@@ -85,6 +88,36 @@ class EmbeddingResponse(_EmbeddingResponse):
     pass


+@PublicAPI(stability="alpha")
+class TranscriptionRequest(_TranscriptionRequest):
+    """TranscriptionRequest is the request body for the transcription API.
+
+    This model is compatible with vLLM's OpenAI API models.
+    """
+
+    pass
+
+
+@PublicAPI(stability="alpha")
+class TranscriptionResponse(_TranscriptionResponse):
+    """TranscriptionResponse is the response body for the transcription API.
+
+    This model is compatible with vLLM's OpenAI API models.
+    """
+
+    pass
+
+
+@PublicAPI(stability="alpha")
+class TranscriptionStreamResponse(_TranscriptionStreamResponse):
+    """TranscriptionStreamResponse is the streaming response body for the transcription API.
+
+    This model is compatible with vLLM's OpenAI API models.
+    """
+
+    pass
+
+
 @PublicAPI(stability="alpha")
 class ErrorResponse(_ErrorResponse):
     """The returned response in case of an error."""
diff --git a/python/requirements/llm/llm-requirements.txt b/python/requirements/llm/llm-requirements.txt
index fe3543757e4f..d32e70d23f89 100644
--- a/python/requirements/llm/llm-requirements.txt
+++ b/python/requirements/llm/llm-requirements.txt
@@ -2,7 +2,7 @@
 # constraining to a maximum version (i.e. <=) to temporarily work around a bug.
 # Those pins for the sake of workarounds should not be advertised as constraints
 # on future releases in setup.py.
-vllm>=0.11.0
+vllm[audio]>=0.11.0
 nixl>=0.6.1
 # For json mode
 jsonref>=1.1.0
diff --git a/python/setup.py b/python/setup.py
index 8799f262f1fb..869d5dfabecf 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -374,7 +374,7 @@ def get_packages(self):
 setup_spec.extras["llm"] = list(
     set(
         [
-            "vllm>=0.11.0",
+            "vllm[audio]>=0.11.0",
            "nixl>=0.6.1",
            "jsonref>=1.1.0",
            "jsonschema",
@@ -382,6 +382,8 @@
            # async-timeout is a backport of asyncio.timeout for python < 3.11
            "async-timeout; python_version < '3.11'",
            "typer",
+            "meson",
+            "pybind11",
            "hf_transfer",
        ]
        + setup_spec.extras["data"]
diff --git a/release/llm_tests/serve/test_llm_serve_integration.py b/release/llm_tests/serve/test_llm_serve_integration.py
index 8d1b423ba4b9..03e01dc1766e 100644
--- a/release/llm_tests/serve/test_llm_serve_integration.py
+++ b/release/llm_tests/serve/test_llm_serve_integration.py
@@ -156,6 +156,36 @@ def test_deepseek_model(model_name):
     time.sleep(1)


+@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
+def test_transcription_model(model_name):
+    """
+    Test that transcription models can be loaded successfully.
+    """
+    llm_config = LLMConfig(
+        model_loading_config=dict(
+            model_id=model_name,
+            model_source=model_name,
+        ),
+        deployment_config=dict(
+            autoscaling_config=dict(min_replicas=1, max_replicas=4),
+        ),
+        engine_kwargs=dict(
+            trust_remote_code=True,
+            gpu_memory_utilization=0.9,
+            enable_prefix_caching=True,
+            max_model_len=2048,
+            tokenizer_mode="mistral",
+            config_format="mistral",
+            load_format="mistral",
+        ),
+    )
+    app = build_openai_app({"llm_configs": [llm_config]})
+    serve.run(app, blocking=False)
+    wait_for_condition(is_default_app_running, timeout=180)
+    serve.shutdown()
+    time.sleep(1)
+
+
 @pytest.mark.asyncio(scope="function")
 @pytest.fixture
 def remote_model_app(request):