Merged
41 commits
8c48511 initial commit for transcriptions api integration (Oct 4, 2025)
1c793b3 naming fixes (Oct 4, 2025)
0d4039c ci tests for transcriptions api and docs for transcription (Oct 7, 2025)
863de39 type error fix (Oct 8, 2025)
fd611a5 formatting updated and added engine transcription function def (Oct 8, 2025)
c55fdc9 naming updates (Oct 8, 2025)
7b62802 lora prefix updates and code formatting (Oct 8, 2025)
77d162a request_id added in transcription request (Oct 8, 2025)
8294a33 modified docs for ci tests and added release test (Oct 9, 2025)
c5134d5 enum fix (Oct 9, 2025)
2cd0ac9 enum fix (Oct 9, 2025)
b248c90 router updates (Oct 10, 2025)
92d4fdb router fix (Oct 10, 2025)
fff6dba pre commit hooks run and bazel build (Oct 11, 2025)
7485e36 enum fixes (Oct 11, 2025)
bea6209 inconsistency fixes (Oct 12, 2025)
7d80528 updates (Oct 12, 2025)
fa48092 query server doc test added and router updates (Oct 16, 2025)
cf20ea5 fix (Oct 18, 2025)
2910796 create_transcription and release test fixes (Oct 18, 2025)
6dc2d41 requirements updates (Oct 18, 2025)
4d97377 lock updates (Oct 19, 2025)
5f8edde doc updates (Oct 19, 2025)
b2f92d9 doc fix (Oct 19, 2025)
d108753 docs fix (Oct 19, 2025)
53b500d docs fix (Oct 19, 2025)
29b7c34 Code review updates and fixes (Oct 20, 2025)
6d10b03 lock updates (Oct 20, 2025)
6df59eb yaml tests for bazel (Oct 20, 2025)
795cf28 Merge branch 'master' into master (Blaze-DSP, Oct 20, 2025)
b59bcab Merge branch 'master' into master (Blaze-DSP, Oct 21, 2025)
288ff91 removed .yaml doc code example and tests (Oct 21, 2025)
1405c2a Merge branch 'master' into master (Blaze-DSP, Oct 21, 2025)
897ce85 Merge branch 'master' into master (Blaze-DSP, Oct 22, 2025)
5f6fa73 Merge branch 'master' into master (Blaze-DSP, Oct 23, 2025)
ea3b762 Merge branch 'master' into master (Blaze-DSP, Oct 23, 2025)
1773359 Merge branch 'master' into master (Blaze-DSP, Oct 23, 2025)
4095f75 review updates (Oct 23, 2025)
57e323a test fix (Oct 24, 2025)
d6f4183 Merge branch 'master' into master (Blaze-DSP, Oct 24, 2025)
05cf83e doc updates (Oct 24, 2025)
106 changes: 106 additions & 0 deletions doc/source/llm/doc_code/serve/transcription/transcription_example.py
@@ -0,0 +1,106 @@
"""
This file serves as a documentation example and CI test.

Structure:
1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing.
2. Docs example (between __transcription_example_start/end__): Embedded in Sphinx docs via literalinclude.
3. Test validation (deployment status polling + cleanup)
"""

import time
import openai
import requests
from ray import serve
from ray.serve.schema import ApplicationStatus
from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME
from ray.serve import llm

_original_serve_run = serve.run
_original_build_openai_app = llm.build_openai_app


def _non_blocking_serve_run(app, **kwargs):
    """Forces blocking=False for testing."""
    kwargs["blocking"] = False
    return _original_serve_run(app, **kwargs)


def _testing_build_openai_app(llm_serving_args):
    """Removes accelerator requirements for testing."""
    for config in llm_serving_args["llm_configs"]:
        config.accelerator_type = None

    return _original_build_openai_app(llm_serving_args)


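# Apply the CI-only patches so the docs example below runs without blocking
# the process and without requiring a GPU.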
serve.run = _non_blocking_serve_run
llm.build_openai_app = _testing_build_openai_app

# __transcription_example_start__
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

llm_config = LLMConfig(
    model_loading_config={
        "model_id": "voxtral-mini",
        "model_source": "mistralai/Voxtral-Mini-3B-2507",
    },
    deployment_config={
        "autoscaling_config": {
            "min_replicas": 1,
            "max_replicas": 4,
        }
    },
    accelerator_type="A10G",
    # You can customize the engine arguments (e.g., vLLM engine kwargs).
    engine_kwargs={
        "tokenizer_mode": "mistral",
        "config_format": "mistral",
        "load_format": "mistral",
    },
    log_engine_metrics=True,
)

app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, blocking=True)
# __transcription_example_end__

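# Poll the Serve application status until it reaches RUNNING, failing fast
# if the deployment becomes DEPLOY_FAILED or UNHEALTHY.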
status = ApplicationStatus.NOT_STARTED
timeout_seconds = 300
start_time = time.time()

while (
    status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds
):
    status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status

    if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]:
        raise AssertionError(f"Deployment failed with status: {status}")

    time.sleep(1)

if status != ApplicationStatus.RUNNING:
    raise AssertionError(
        f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}"
    )

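# Download a short, publicly hosted sample WAV file to transcribe.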
response = requests.get("https://voiceage.com/wbsamples/in_stereo/Sports.wav")
with open("audio.wav", "wb") as f:
f.write(response.content)

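# The local Serve endpoint doesn't validate the API key, so any placeholder works.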
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

with open("audio.wav", "rb") as f:
try:
response = client.audio.transcriptions.create(
model="voxtral-mini",
file=f,
temperature=0.0,
language="en",
)
except Exception as e:
raise AssertionError(
f"Error while querying models: {e}. Check the logs for more details."
)

serve.shutdown()
61 changes: 60 additions & 1 deletion doc/source/serve/llm/user-guides/vllm-compatibility.md
@@ -80,6 +80,66 @@ curl -X POST http://localhost:8000/v1/embeddings \

::::


## Transcriptions

You can generate audio transcriptions using speech-to-text (STT) models trained for automatic speech recognition (ASR). Models that support this use case are listed in the [vLLM transcription models documentation](https://docs.vllm.ai/en/stable/models/supported_models.html).


### Deploy a transcription model

::::{tab-set}

:::{tab-item} Server
:sync: server

```{literalinclude} ../../../llm/doc_code/serve/transcription/transcription_example.py
:language: python
:start-after: __transcription_example_start__
:end-before: __transcription_example_end__
```
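
To try the example end to end, you can save the snippet as `transcription_example.py` and run `python transcription_example.py`. This sketch assumes a node with an A10G GPU is available; adjust or remove `accelerator_type` for other hardware.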
:::

:::{tab-item} Python Client
:sync: client

```python
from openai import OpenAI

# Initialize client
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

# Open audio file
with open("/path/to/audio.wav", "rb") as f:
# Make a request to the transcription model
response = client.audio.transcriptions.create(
model="whisper-large",
file=f,
temperature=0.0,
language="en",
)

print(response.text)
```
:::

:::{tab-item} cURL
:sync: curl

```bash
curl http://localhost:8000/v1/audio/transcriptions \
  -X POST \
  -H "Authorization: Bearer fake-key" \
  -F "file=@/path/to/audio.wav" \
  -F "model=voxtral-mini" \
  -F "temperature=0.0" \
  -F "language=en"
```
:::

::::

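A successful request returns the transcription. With the default JSON response format, the body is an object whose `text` field holds the transcribed audio. The following is an illustrative response, not output captured from a real run:

```json
{
  "text": "The home team takes the lead in the final minutes of the match."
}
```
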

## Structured output

You can request structured JSON output similar to OpenAI's API using JSON mode or JSON schema validation with Pydantic models.
@@ -179,7 +239,6 @@ response = client.chat.completions.create(
    response_format={
        "type": "json_schema",
        "json_schema": Color.model_json_schema()
    },
    messages=[
        {
86 changes: 86 additions & 0 deletions python/deplocks/llm/rayllm_py311_cpu.lock
@@ -149,6 +149,12 @@ attrs==25.1.0 \
# aiohttp
# jsonschema
# referencing
audioread==3.0.1 \
--hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \
--hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# librosa
billiard==4.2.1 \
--hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \
--hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb
@@ -572,6 +578,12 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# -r python/requirements.txt
# ray
decorator==5.1.1 \
--hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \
--hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# librosa
depyf==0.19.0 \
--hash=sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5 \
--hash=sha256:afed0916b32d141cc90fa6220df01885eda442ca43b297d5050eeb90b4a5cb44
@@ -1229,6 +1241,13 @@ jiter==0.8.2 \
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# openai
joblib==1.5.2 \
--hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \
--hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# librosa
# scikit-learn
jsonref==1.1.0 \
--hash=sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552 \
--hash=sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9
@@ -1267,7 +1286,14 @@ lazy-loader==0.4 \
--hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# librosa
# scikit-image
librosa==0.11.0 \
--hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \
--hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# vllm
llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \
--hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \
--hash=sha256:5e6f6cec9c6648164062f0347262b3ec7c39f54d1be5c5347d6446bc7fdba115 \
@@ -1544,6 +1570,7 @@ msgpack==1.0.7 \
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# -r python/requirements.txt
# librosa
# ray
msgspec==0.19.0 \
--hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \
@@ -1746,6 +1773,7 @@ numba==0.61.2 \
--hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# librosa
# vllm
numpy==1.26.4 \
--hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \
@@ -1791,12 +1819,14 @@ numpy==1.26.4 \
# gguf
# gymnasium
# imageio
# librosa
# mistral-common
# nixl
# numba
# opencv-python-headless
# pandas
# scikit-image
# scikit-learn
# scipy
# soundfile
# soxr
@@ -1944,6 +1974,7 @@ packaging==23.0 \
# kombu
# lazy-loader
# lm-format-enforcer
# pooch
# ray
# scikit-image
# tensorboardx
@@ -2067,7 +2098,14 @@ platformdirs==3.11.0 \
--hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# pooch
# virtualenv
pooch==1.8.2 \
--hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \
--hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# librosa
prometheus-client==0.19.0 \
--hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \
--hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92
@@ -2919,6 +2957,7 @@ requests==2.32.3 \
# google-api-core
# huggingface-hub
# mistral-common
# pooch
# ray
# tiktoken
# transformers
@@ -3089,6 +3128,41 @@ scikit-image==0.24.0 \
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# -r python/requirements.txt
scikit-learn==1.7.2 \
--hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \
--hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \
--hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \
--hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \
--hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \
--hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \
--hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \
--hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \
--hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \
--hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \
--hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \
--hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \
--hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \
--hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \
--hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \
--hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \
--hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \
--hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \
--hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \
--hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \
--hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \
--hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \
--hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \
--hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \
--hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \
--hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \
--hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \
--hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \
--hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \
--hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \
--hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# librosa
scipy==1.11.4 \
--hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \
--hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \
@@ -3118,7 +3192,9 @@
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# -r python/requirements.txt
# librosa
# scikit-image
# scikit-learn
# vllm
sentencepiece==0.2.0 \
--hash=sha256:0461324897735512a32d222e3d886e24ad6a499761952b6bda2a9ee6e4313ea5 \
@@ -3317,7 +3393,9 @@ soundfile==0.13.1 \
--hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# librosa
# mistral-common
# vllm
soxr==0.5.0.post1 \
--hash=sha256:39e0f791ba178d69cd676485dbee37e75a34f20daa478d90341ecb7f6d9d690f \
--hash=sha256:4704ba6b13a3f1e41d12acf192878384c1c31f71ce606829c64abdf64a8d7d32 \
@@ -3342,6 +3420,7 @@
--hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# librosa
# mistral-common
starlette==0.46.2 \
--hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \
@@ -3363,6 +3442,12 @@
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# -r python/requirements.txt
threadpoolctl==3.6.0 \
--hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \
--hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e
# via
# -c python/deplocks/llm/rayllm_test_py311_cpu.lock
# scikit-learn
tifffile==2024.7.21 \
--hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \
--hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483
@@ -3518,6 +3603,7 @@ typing-extensions==4.12.2 \
# fastapi
# gymnasium
# huggingface-hub
# librosa
# mistral-common
# openai
# opentelemetry-api