[serve][llm][transcription] Add support for Transcription in vLLM engine backend #57194
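This pull request adds audio transcription support (the OpenAI-style audio.transcriptions API) to the vLLM engine backend of Ray Serve LLM, together with a documentation example and CI test coverage.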
Merged
Changes from all commits (41 commits)
8c48511  initial commit for transcriptions api integration
1c793b3  naming fixes
0d4039c  ci tests for transcriptions api and docs for transcription
863de39  type error fix
fd611a5  formatting updated and added engine transcription function def
c55fdc9  naming updates
7b62802  lora prefix updates and code formatting
77d162a  request_id added in transcription request
8294a33  modified docs for ci tests and added release test
c5134d5  enum fix
2cd0ac9  enum fix
b248c90  router updates
92d4fdb  router fix
fff6dba  pre commit hooks run and bazel build
7485e36  enum fixes
bea6209  inconsistency fixes
7d80528  updates
fa48092  query server doc test added and router updates
cf20ea5  fix
2910796  create_transcription and release test fixes
6dc2d41  requirements updates
4d97377  lock updates
5f8edde  doc updates
b2f92d9  doc fix
d108753  docs fix
53b500d  docs fix
29b7c34  Code review updates and fixes
6d10b03  lock updates
6df59eb  yaml tests for bazel
795cf28  Merge branch 'master' into master
b59bcab (Blaze-DSP)  Merge branch 'master' into master
288ff91 (Blaze-DSP)  removed .yaml doc code example and tests
1405c2a  Merge branch 'master' into master
897ce85 (Blaze-DSP)  Merge branch 'master' into master
5f6fa73 (Blaze-DSP)  Merge branch 'master' into master
ea3b762 (Blaze-DSP)  Merge branch 'master' into master
1773359 (Blaze-DSP)  Merge branch 'master' into master
4095f75 (Blaze-DSP)  review updates
57e323a  test fix
d6f4183  Merge branch 'master' into master
05cf83e (Blaze-DSP)  doc updates
doc/source/llm/doc_code/serve/transcription/transcription_example.py (106 additions, 0 deletions)
| """ | ||
| This file serves as a documentation example and CI test. | ||
|
|
||
| Structure: | ||
| 1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. | ||
| 2. Docs example (between __transcription_example_start/end__): Embedded in Sphinx docs via literalinclude. | ||
| 3. Test validation (deployment status polling + cleanup) | ||
| """ | ||
|
|
||
| import time | ||
| import openai | ||
| import requests | ||
| from ray import serve | ||
| from ray.serve.schema import ApplicationStatus | ||
| from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME | ||
| from ray.serve import llm | ||
|
|
||
| _original_serve_run = serve.run | ||
| _original_build_openai_app = llm.build_openai_app | ||
|
|
||
|
|
||
| def _non_blocking_serve_run(app, **kwargs): | ||
| """Forces blocking=False for testing""" | ||
| kwargs["blocking"] = False | ||
| return _original_serve_run(app, **kwargs) | ||
|
|
||
|
|
||
| def _testing_build_openai_app(llm_serving_args): | ||
| """Removes accelerator requirements for testing""" | ||
| for config in llm_serving_args["llm_configs"]: | ||
| config.accelerator_type = None | ||
|
|
||
| return _original_build_openai_app(llm_serving_args) | ||
|
|
||
|
|
||
| serve.run = _non_blocking_serve_run | ||
| llm.build_openai_app = _testing_build_openai_app | ||
|
|
||
| # __transcription_example_start__ | ||
| from ray import serve | ||
| from ray.serve.llm import LLMConfig, build_openai_app | ||
|
|
||
| llm_config = LLMConfig( | ||
| model_loading_config={ | ||
| "model_id": "voxtral-mini", | ||
| "model_source": "mistralai/Voxtral-Mini-3B-2507", | ||
| }, | ||
| deployment_config={ | ||
| "autoscaling_config": { | ||
| "min_replicas": 1, | ||
| "max_replicas": 4, | ||
| } | ||
| }, | ||
| accelerator_type="A10G", | ||
| # You can customize the engine arguments (e.g. vLLM engine kwargs) | ||
| engine_kwargs={ | ||
| "tokenizer_mode": "mistral", | ||
| "config_format": "mistral", | ||
| "load_format": "mistral", | ||
| }, | ||
| log_engine_metrics=True, | ||
| ) | ||
|
|
||
| app = build_openai_app({"llm_configs": [llm_config]}) | ||
| serve.run(app, blocking=True) | ||
| # __transcription_example_end__ | ||
|
|
||
| status = ApplicationStatus.NOT_STARTED | ||
| timeout_seconds = 300 | ||
| start_time = time.time() | ||
|
|
||
| while ( | ||
| status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds | ||
| ): | ||
| status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status | ||
|
|
||
| if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: | ||
| raise AssertionError(f"Deployment failed with status: {status}") | ||
|
|
||
| time.sleep(1) | ||
|
|
||
| if status != ApplicationStatus.RUNNING: | ||
| raise AssertionError( | ||
| f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" | ||
| ) | ||
|
|
||
| response = requests.get("https://voiceage.com/wbsamples/in_stereo/Sports.wav") | ||
| with open("audio.wav", "wb") as f: | ||
| f.write(response.content) | ||
|
|
||
| client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key") | ||
|
|
||
| with open("audio.wav", "rb") as f: | ||
| try: | ||
| response = client.audio.transcriptions.create( | ||
| model="voxtral-mini", | ||
| file=f, | ||
| temperature=0.0, | ||
| language="en", | ||
| ) | ||
| except Exception as e: | ||
| raise AssertionError( | ||
| f"Error while querying models: {e}. Check the logs for more details." | ||
| ) | ||
|
|
||
| serve.shutdown() | ||
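For querying a deployment like this outside the CI harness, here is a minimal standalone client sketch. It assumes the Serve app from the example above is running on localhost:8000, and that the endpoint behaves like the standard OpenAI Python SDK, where audio.transcriptions.create returns a Transcription object whose text field holds the transcript; audio.wav and the voxtral-mini model id are carried over from the example.

import openai

# Minimal client sketch; assumes the Serve app above is running on localhost:8000.
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

# audio.wav is the file downloaded in the example above.
with open("audio.wav", "rb") as f:
    transcription = client.audio.transcriptions.create(
        model="voxtral-mini",  # matches model_id in the LLMConfig above
        file=f,
        language="en",
    )

# In the OpenAI SDK, the Transcription object exposes the result as .text.
print(transcription.text)

As in the example, api_key is a placeholder; the doc example passes "fake-key", which suggests the local endpoint does not enforce authentication.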