diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py new file mode 100644 index 000000000000..aed2e567146e --- /dev/null +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -0,0 +1,106 @@ +""" +This file serves as a documentation example and CI test. + +Structure: +1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. +2. Docs example (between __transcription_example_start/end__): Embedded in Sphinx docs via literalinclude. +3. Test validation (deployment status polling + cleanup) +""" + +import time +import openai +import requests +from ray import serve +from ray.serve.schema import ApplicationStatus +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME +from ray.serve import llm + +_original_serve_run = serve.run +_original_build_openai_app = llm.build_openai_app + + +def _non_blocking_serve_run(app, **kwargs): + """Forces blocking=False for testing""" + kwargs["blocking"] = False + return _original_serve_run(app, **kwargs) + + +def _testing_build_openai_app(llm_serving_args): + """Removes accelerator requirements for testing""" + for config in llm_serving_args["llm_configs"]: + config.accelerator_type = None + + return _original_build_openai_app(llm_serving_args) + + +serve.run = _non_blocking_serve_run +llm.build_openai_app = _testing_build_openai_app + +# __transcription_example_start__ +from ray import serve +from ray.serve.llm import LLMConfig, build_openai_app + +llm_config = LLMConfig( + model_loading_config={ + "model_id": "voxtral-mini", + "model_source": "mistralai/Voxtral-Mini-3B-2507", + }, + deployment_config={ + "autoscaling_config": { + "min_replicas": 1, + "max_replicas": 4, + } + }, + accelerator_type="A10G", + # You can customize the engine arguments (e.g. vLLM engine kwargs) + engine_kwargs={ + "tokenizer_mode": "mistral", + "config_format": "mistral", + "load_format": "mistral", + }, + log_engine_metrics=True, +) + +app = build_openai_app({"llm_configs": [llm_config]}) +serve.run(app, blocking=True) +# __transcription_example_end__ + +status = ApplicationStatus.NOT_STARTED +timeout_seconds = 300 +start_time = time.time() + +while ( + status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds +): + status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status + + if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: + raise AssertionError(f"Deployment failed with status: {status}") + + time.sleep(1) + +if status != ApplicationStatus.RUNNING: + raise AssertionError( + f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" + ) + +response = requests.get("https://voiceage.com/wbsamples/in_stereo/Sports.wav") +with open("audio.wav", "wb") as f: + f.write(response.content) + +client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key") + +with open("audio.wav", "rb") as f: + try: + response = client.audio.transcriptions.create( + model="voxtral-mini", + file=f, + temperature=0.0, + language="en", + ) + except Exception as e: + raise AssertionError( + f"Error while querying models: {e}. Check the logs for more details." 
+ ) + +serve.shutdown() diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md index 846fc79720c3..4ec9a44b6ad4 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -80,6 +80,66 @@ curl -X POST http://localhost:8000/v1/embeddings \ :::: + +## Transcriptions + +You can generate audio transcriptions using Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. Models supporting this use case are listed in the [vLLM transcription models documentation](https://docs.vllm.ai/en/stable/models/supported_models.html). + + +### Deploy a transcription model + +::::{tab-set} + +:::{tab-item} Server +:sync: server + +```{literalinclude} ../../../llm/doc_code/serve/transcription/transcription_example.py +:language: python +:start-after: __transcription_example_start__ +:end-before: __transcription_example_end__ +``` +::: + +:::{tab-item} Python Client +:sync: client + +```python +from openai import OpenAI + +# Initialize client +client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key") + +# Open audio file +with open("/path/to/audio.wav", "rb") as f: + # Make a request to the transcription model + response = client.audio.transcriptions.create( + model="whisper-large", + file=f, + temperature=0.0, + language="en", + ) + + print(response.text) +``` +::: + +:::{tab-item} cURL +:sync: curl + +```bash +curl http://localhost:8000/v1/audio/transcriptions \ + -X POST \ + -H "Authorization: Bearer fake-key" \ + -F "file=@/path/to/audio.wav" \ + -F "model=whisper-large" \ + -F "temperature=0.0" \ + -F "language=en" +``` +::: + +:::: + + ## Structured output You can request structured JSON output similar to OpenAI's API using JSON mode or JSON schema validation with Pydantic models. 
@@ -179,7 +239,6 @@ response = client.chat.completions.create( response_format={ "type": "json_schema", "json_schema": Color.model_json_schema() - }, messages=[ { diff --git a/python/deplocks/llm/rayllm_py311_cpu.lock b/python/deplocks/llm/rayllm_py311_cpu.lock index 9461ae88b62b..9ad44ad117b2 100644 --- a/python/deplocks/llm/rayllm_py311_cpu.lock +++ b/python/deplocks/llm/rayllm_py311_cpu.lock @@ -149,6 +149,12 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa billiard==4.2.1 \ --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb @@ -572,6 +578,12 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # ray +decorator==5.1.1 \ + --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ + --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa depyf==0.19.0 \ --hash=sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5 \ --hash=sha256:afed0916b32d141cc90fa6220df01885eda442ca43b297d5050eeb90b4a5cb44 @@ -1229,6 +1241,13 @@ jiter==0.8.2 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # openai +joblib==1.5.2 \ + --hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa + # scikit-learn jsonref==1.1.0 \ --hash=sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552 \ --hash=sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9 @@ -1267,7 +1286,14 @@ lazy-loader==0.4 \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa # scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # vllm llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ --hash=sha256:5e6f6cec9c6648164062f0347262b3ec7c39f54d1be5c5347d6446bc7fdba115 \ @@ -1544,6 +1570,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -1746,6 +1773,7 @@ numba==0.61.2 \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ @@ -1791,12 +1819,14 @@ numpy==1.26.4 \ # gguf # gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -1944,6 +1974,7 @@ 
packaging==23.0 \ # kombu # lazy-loader # lm-format-enforcer + # pooch # ray # scikit-image # tensorboardx @@ -2067,7 +2098,14 @@ platformdirs==3.11.0 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # pooch # virtualenv +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 @@ -2919,6 +2957,7 @@ requests==2.32.3 \ # google-api-core # huggingface-hub # mistral-common + # pooch # ray # tiktoken # transformers @@ -3089,6 +3128,41 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + --hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + --hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + --hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + --hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + --hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + 
--hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -3118,7 +3192,9 @@ scipy==1.11.4 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm sentencepiece==0.2.0 \ --hash=sha256:0461324897735512a32d222e3d886e24ad6a499761952b6bda2a9ee6e4313ea5 \ @@ -3317,7 +3393,9 @@ soundfile==0.13.1 \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa # mistral-common + # vllm soxr==0.5.0.post1 \ --hash=sha256:39e0f791ba178d69cd676485dbee37e75a34f20daa478d90341ecb7f6d9d690f \ --hash=sha256:4704ba6b13a3f1e41d12acf192878384c1c31f71ce606829c64abdf64a8d7d32 \ @@ -3342,6 +3420,7 @@ soxr==0.5.0.post1 \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa # mistral-common starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ @@ -3363,6 +3442,12 @@ tensorboardx==2.6.2.2 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt +threadpoolctl==3.6.0 \ + --hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -3518,6 +3603,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api diff --git a/python/deplocks/llm/rayllm_py311_cu128.lock b/python/deplocks/llm/rayllm_py311_cu128.lock index 8445dd9c5354..a7f14ffe377e 100644 --- a/python/deplocks/llm/rayllm_py311_cu128.lock +++ b/python/deplocks/llm/rayllm_py311_cu128.lock @@ -149,6 +149,12 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa billiard==4.2.1 \ --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb @@ -572,6 +578,12 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # ray +decorator==5.1.1 \ + --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ + --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa depyf==0.19.0 \ --hash=sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5 \ --hash=sha256:afed0916b32d141cc90fa6220df01885eda442ca43b297d5050eeb90b4a5cb44 @@ -1230,6 +1242,13 @@ jiter==0.10.0 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # openai +joblib==1.5.2 \ + 
--hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa + # scikit-learn jsonref==1.1.0 \ --hash=sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552 \ --hash=sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9 @@ -1268,7 +1287,14 @@ lazy-loader==0.4 \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa # scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # vllm llguidance==0.7.29 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:17fd439957d6ca5f459d0dec755a2d040c2dc946ed7e3c332b469ef6861292f8 \ --hash=sha256:1d30a76b30b646ac7f9025d262665f62bdbf2d43698115eeb1119c6ee062a36f \ @@ -1509,6 +1535,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -1710,6 +1737,7 @@ numba==0.61.2 \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ @@ -1755,12 +1783,14 @@ numpy==1.26.4 \ # gguf # gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -1984,6 +2014,7 @@ packaging==23.0 \ # kombu # lazy-loader # lm-format-enforcer + # pooch # ray # scikit-image # tensorboardx @@ -2107,7 +2138,14 @@ platformdirs==3.11.0 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # pooch # virtualenv +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 @@ -2959,6 +2997,7 @@ requests==2.32.3 \ # google-api-core # huggingface-hub # mistral-common + # pooch # ray # tiktoken # transformers @@ -3129,6 +3168,41 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + --hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + 
--hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + --hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + --hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + --hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + --hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -3158,7 +3232,9 @@ scipy==1.11.4 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm sentencepiece==0.2.0 \ --hash=sha256:0461324897735512a32d222e3d886e24ad6a499761952b6bda2a9ee6e4313ea5 \ @@ -3357,7 +3433,9 @@ soundfile==0.13.1 \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa # mistral-common + # vllm soxr==0.5.0.post1 \ --hash=sha256:39e0f791ba178d69cd676485dbee37e75a34f20daa478d90341ecb7f6d9d690f \ --hash=sha256:4704ba6b13a3f1e41d12acf192878384c1c31f71ce606829c64abdf64a8d7d32 \ @@ -3382,6 +3460,7 @@ soxr==0.5.0.post1 \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa # mistral-common starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ @@ -3403,6 +3482,12 @@ tensorboardx==2.6.2.2 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt +threadpoolctl==3.6.0 \ + 
--hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -3547,6 +3632,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api diff --git a/python/deplocks/llm/rayllm_test_py311_cpu.lock b/python/deplocks/llm/rayllm_test_py311_cpu.lock index 06eec0f1fbf6..d5f4c289ab74 100644 --- a/python/deplocks/llm/rayllm_test_py311_cpu.lock +++ b/python/deplocks/llm/rayllm_test_py311_cpu.lock @@ -219,6 +219,10 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via librosa azure-common==1.1.28 \ --hash=sha256:4ac0cd3214e36b6a1b6a442686722a5d8cc449603aa833f3f0f40bda836704a3 \ --hash=sha256:5c12d3dcf4ec20599ca6b0d3e09e86e146353d443e7fcc050c9a19c1f9df20ad @@ -766,6 +770,7 @@ decorator==5.1.1 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipython + # librosa defusedxml==0.7.1 \ --hash=sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69 \ --hash=sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61 @@ -1678,6 +1683,12 @@ jmespath==1.0.1 \ # -c python/deplocks/llm/ray_test_py311_cpu.lock # boto3 # botocore +joblib==1.5.2 \ + --hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # librosa + # scikit-learn json5==0.9.14 \ --hash=sha256:740c7f1b9e584a468dbb2939d8d458db3427f2c93ae2139d05f47e453eae964f \ --hash=sha256:9ed66c3a6ca3510a976a9ef9b8c0787de24802724ab1860bc0153c7fdd589b02 @@ -1821,7 +1832,12 @@ lazy-loader==0.4 \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/ray_test_py311_cpu.lock + # librosa # scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via vllm llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ --hash=sha256:5e6f6cec9c6648164062f0347262b3ec7c39f54d1be5c5347d6446bc7fdba115 \ @@ -2223,6 +2239,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -2472,7 +2489,9 @@ numba==0.61.2 \ --hash=sha256:cf9f9fc00d6eca0c23fc840817ce9f439b9f03c8f03d6246c0e7f0cb15b7162a \ --hash=sha256:ea0247617edcb5dd61f6106a56255baab031acc4257bddaeddb3a1003b4ca3fd \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 - # via vllm + # via + # librosa + # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ @@ -2517,12 +2536,14 @@ numpy==1.26.4 \ # gguf # 
gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -2680,6 +2701,7 @@ packaging==23.0 \ # lazy-loader # lm-format-enforcer # nbconvert + # pooch # pytest # ray # scikit-image @@ -2835,6 +2857,7 @@ platformdirs==3.11.0 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-core + # pooch # virtualenv pluggy==1.3.0 \ --hash=sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12 \ @@ -2842,6 +2865,10 @@ pluggy==1.3.0 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # pytest +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via librosa portalocker==2.8.2 \ --hash=sha256:2b035aa7828e46c58e9b31390ee1f169b98e1066ab10b9a6a861fe7e25ee4f33 \ --hash=sha256:cfb86acc09b9aa7c3b43594e19be1345b9d16af3feb08bf92f23d4dce513a28e @@ -3804,6 +3831,7 @@ requests==2.32.3 \ # jupyterlab-server # mistral-common # msal + # pooch # ray # smart-open # sphinx @@ -3996,6 +4024,39 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + --hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + --hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + --hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + --hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + 
--hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + --hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -4025,7 +4086,9 @@ scipy==1.11.4 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm send2trash==1.8.3 \ --hash=sha256:0c31227e0bd08961c7665474a3d1ef7193929fedda4233843689baa056be46c9 \ @@ -4246,7 +4309,10 @@ soundfile==0.13.1 \ --hash=sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445 \ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 - # via mistral-common + # via + # librosa + # mistral-common + # vllm soupsieve==2.5 \ --hash=sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690 \ --hash=sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7 @@ -4275,7 +4341,9 @@ soxr==0.5.0.post1 \ --hash=sha256:fa0a382fb8d8e2afed2c1642723b2d2d1b9a6728ff89f77f3524034c8885b8c9 \ --hash=sha256:fcc049b0a151a65aa75b92f0ac64bb2dba785d16b78c31c2b94e68c141751d6d \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 - # via mistral-common + # via + # librosa + # mistral-common sphinx==6.2.1 \ --hash=sha256:6d56a34697bb749ffa0152feafc4b19836c755d90a7c59b72bc7dfd371b9cc6b \ --hash=sha256:97787ff1fa3256a3eef9eda523a63dbf299f7b47e053cfcf684a1c2a8380c912 @@ -4354,6 +4422,10 @@ terminado==0.18.1 \ # jupyter-server # nbclassic # notebook +threadpoolctl==3.6.0 \ + --hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -4585,6 +4657,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api diff --git a/python/deplocks/llm/rayllm_test_py311_cu128.lock b/python/deplocks/llm/rayllm_test_py311_cu128.lock index 34a7a94ed149..ab9931f03ebf 100644 --- a/python/deplocks/llm/rayllm_test_py311_cu128.lock +++ b/python/deplocks/llm/rayllm_test_py311_cu128.lock @@ -219,6 +219,10 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via librosa azure-common==1.1.28 \ --hash=sha256:4ac0cd3214e36b6a1b6a442686722a5d8cc449603aa833f3f0f40bda836704a3 \ --hash=sha256:5c12d3dcf4ec20599ca6b0d3e09e86e146353d443e7fcc050c9a19c1f9df20ad @@ -765,6 +769,7 @@ decorator==5.1.1 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipython + # librosa defusedxml==0.7.1 \ --hash=sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69 \ --hash=sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61 @@ -1678,6 +1683,12 @@ jmespath==1.0.1 \ # -c 
python/deplocks/llm/ray_test_py311_cu128.lock # boto3 # botocore +joblib==1.5.2 \ + --hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # librosa + # scikit-learn json5==0.9.14 \ --hash=sha256:740c7f1b9e584a468dbb2939d8d458db3427f2c93ae2139d05f47e453eae964f \ --hash=sha256:9ed66c3a6ca3510a976a9ef9b8c0787de24802724ab1860bc0153c7fdd589b02 @@ -1821,7 +1832,12 @@ lazy-loader==0.4 \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/ray_test_py311_cu128.lock + # librosa # scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via vllm llguidance==0.7.29 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:17fd439957d6ca5f459d0dec755a2d040c2dc946ed7e3c332b469ef6861292f8 \ --hash=sha256:1d30a76b30b646ac7f9025d262665f62bdbf2d43698115eeb1119c6ee062a36f \ @@ -2187,6 +2203,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -2435,7 +2452,9 @@ numba==0.61.2 \ --hash=sha256:cf9f9fc00d6eca0c23fc840817ce9f439b9f03c8f03d6246c0e7f0cb15b7162a \ --hash=sha256:ea0247617edcb5dd61f6106a56255baab031acc4257bddaeddb3a1003b4ca3fd \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 - # via vllm + # via + # librosa + # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ @@ -2480,12 +2499,14 @@ numpy==1.26.4 \ # gguf # gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -2694,6 +2715,7 @@ packaging==23.0 \ # lazy-loader # lm-format-enforcer # nbconvert + # pooch # pytest # ray # scikit-image @@ -2849,6 +2871,7 @@ platformdirs==3.11.0 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-core + # pooch # virtualenv pluggy==1.3.0 \ --hash=sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12 \ @@ -2856,6 +2879,10 @@ pluggy==1.3.0 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # pytest +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via librosa portalocker==2.8.2 \ --hash=sha256:2b035aa7828e46c58e9b31390ee1f169b98e1066ab10b9a6a861fe7e25ee4f33 \ --hash=sha256:cfb86acc09b9aa7c3b43594e19be1345b9d16af3feb08bf92f23d4dce513a28e @@ -3818,6 +3845,7 @@ requests==2.32.3 \ # jupyterlab-server # mistral-common # msal + # pooch # ray # smart-open # sphinx @@ -4010,6 +4038,39 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + 
--hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + --hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + --hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + --hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + --hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + --hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -4039,7 +4100,9 @@ scipy==1.11.4 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm send2trash==1.8.3 \ --hash=sha256:0c31227e0bd08961c7665474a3d1ef7193929fedda4233843689baa056be46c9 \ @@ -4260,7 +4323,10 @@ soundfile==0.13.1 \ --hash=sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445 \ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 - # via mistral-common + # via + # librosa + # mistral-common + # vllm soupsieve==2.5 \ --hash=sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690 \ --hash=sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7 @@ -4289,7 +4355,9 @@ soxr==0.5.0.post1 \ --hash=sha256:fa0a382fb8d8e2afed2c1642723b2d2d1b9a6728ff89f77f3524034c8885b8c9 \ --hash=sha256:fcc049b0a151a65aa75b92f0ac64bb2dba785d16b78c31c2b94e68c141751d6d \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 - # via mistral-common + # via + 
# librosa + # mistral-common sphinx==6.2.1 \ --hash=sha256:6d56a34697bb749ffa0152feafc4b19836c755d90a7c59b72bc7dfd371b9cc6b \ --hash=sha256:97787ff1fa3256a3eef9eda523a63dbf299f7b47e053cfcf684a1c2a8380c912 @@ -4368,6 +4436,10 @@ terminado==0.18.1 \ # jupyter-server # nbclassic # notebook +threadpoolctl==3.6.0 \ + --hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -4589,6 +4661,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index ed2adf153d2c..9fc708ce0bc6 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -25,6 +25,9 @@ ErrorResponse as vLLMErrorResponse, ScoreRequest as vLLMScoreRequest, ScoreResponse as vLLMScoreResponse, + TranscriptionRequest as vLLMTranscriptionRequest, + TranscriptionResponse as vLLMTranscriptionResponse, + TranscriptionStreamResponse as vLLMTranscriptionStreamResponse, ) from vllm.utils import random_uuid @@ -96,6 +99,27 @@ class EmbeddingResponse(vLLMEmbeddingResponse): model_config = ConfigDict(arbitrary_types_allowed=True) +class TranscriptionRequest(vLLMTranscriptionRequest): + model_config = ConfigDict(arbitrary_types_allowed=True) + + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "throughout the inference process and returned in the response." 
+ ), + ) + + +class TranscriptionResponse(vLLMTranscriptionResponse): + model_config = ConfigDict(arbitrary_types_allowed=True) + + +class TranscriptionStreamResponse(vLLMTranscriptionStreamResponse): + model_config = ConfigDict(arbitrary_types_allowed=True) + + class ScoreRequest(vLLMScoreRequest): model_config = ConfigDict(arbitrary_types_allowed=True) @@ -115,15 +139,26 @@ class ScoreResponse(vLLMScoreResponse): ] LLMChatResponse = Union[ - AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None], + AsyncGenerator[ + Union[str, ChatCompletionStreamResponse, ChatCompletionResponse, ErrorResponse], + None, + ], ] LLMCompletionsResponse = Union[ AsyncGenerator[ - Union[CompletionStreamResponse, CompletionResponse, ErrorResponse], None + Union[str, CompletionStreamResponse, CompletionResponse, ErrorResponse], None + ], +] + +LLMTranscriptionResponse = Union[ + AsyncGenerator[ + Union[str, TranscriptionStreamResponse, TranscriptionResponse, ErrorResponse], + None, ], ] + # TODO: remove this class class OpenAIHTTPException(Exception): def __init__( diff --git a/python/ray/llm/_internal/serve/core/engine/protocol.py b/python/ray/llm/_internal/serve/core/engine/protocol.py index 56bcc5acf827..c36b8073d0da 100644 --- a/python/ray/llm/_internal/serve/core/engine/protocol.py +++ b/python/ray/llm/_internal/serve/core/engine/protocol.py @@ -15,6 +15,8 @@ EmbeddingRequest, EmbeddingResponse, ErrorResponse, + TranscriptionRequest, + TranscriptionResponse, ) @@ -118,6 +120,35 @@ async def embeddings( """ pass + @abc.abstractmethod + async def transcriptions( + self, request: "TranscriptionRequest" + ) -> AsyncGenerator[Union[str, "TranscriptionResponse", "ErrorResponse"], None]: + """Run a transcription with the engine. + + Similar to chat and completion, this method is an async generator, + so it yields chunks of the response, and when it is done, it returns None. + We have the following convention: + + * In case of streaming, yield a string representing data: + \n\n for each chunk. This should already be OpenAI-compatible, + so the higher level can just yield it to the client. + * In case of non-streaming, yield a single object of type TranscriptionResponse. + * In case of error, yield a single object of type ErrorResponse. + + Args: + request: The transcription request. + + Yields: + Union[str, TranscriptionResponse, ErrorResponse]: A string + representing a chunk of the response, a TranscriptionResponse object, + or an ErrorResponse object. + + Returns: + None when the generator is done. + """ + pass + async def check_health(self) -> None: """Check the health of the engine. 
diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index cb61e1ab7a22..29a9e17ada4d 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -2,7 +2,9 @@ import json import sys from contextlib import asynccontextmanager +from enum import Enum from typing import ( + Annotated, Any, AsyncGenerator, Awaitable, @@ -16,7 +18,7 @@ Union, ) -from fastapi import FastAPI, HTTPException, status +from fastapi import FastAPI, Form, HTTPException, status from fastapi.middleware.cors import CORSMiddleware from starlette.responses import JSONResponse, Response, StreamingResponse @@ -45,11 +47,15 @@ LLMCompletionsResponse, LLMEmbeddingsResponse, LLMScoreResponse, + LLMTranscriptionResponse, ModelCard, ModelList, OpenAIHTTPException, ScoreRequest, ScoreResponse, + TranscriptionRequest, + TranscriptionResponse, + TranscriptionStreamResponse, to_model_metadata, ) from ray.llm._internal.serve.core.ingress.middleware import ( @@ -83,6 +89,19 @@ "max_ongoing_requests": DEFAULT_MAX_ONGOING_REQUESTS, } +# These methods correspond to functions defined in the LLMEngine class in python/ray/llm/_internal/serve/deployments/llm/llm_engine.py +class CallMethod(Enum): + CHAT = "chat" + COMPLETIONS = "completions" + TRANSCRIPTIONS = "transcriptions" + + +NON_STREAMING_RESPONSE_TYPES = ( + ChatCompletionResponse, + CompletionResponse, + TranscriptionResponse, +) + def _sanitize_chat_completion_request( request: ChatCompletionRequest, @@ -108,8 +127,7 @@ def _sanitize_chat_completion_request( StreamResponseType = Union[ - ChatCompletionStreamResponse, - CompletionStreamResponse, + ChatCompletionStreamResponse, CompletionStreamResponse, TranscriptionStreamResponse ] BatchedStreamResponseType = List[StreamResponseType] @@ -122,6 +140,9 @@ def _sanitize_chat_completion_request( "completions": lambda app: app.post("/v1/completions"), "chat": lambda app: app.post("/v1/chat/completions"), "embeddings": lambda app: app.post("/v1/embeddings"), + "transcriptions": lambda app: app.post( + "/v1/audio/transcriptions", + ), "score": lambda app: app.post("/v1/score"), } @@ -227,7 +248,7 @@ def make_fastapi_ingress( def _apply_openai_json_format( - response: Union[StreamResponseType, BatchedStreamResponseType] + response: Union[StreamResponseType, BatchedStreamResponseType], ) -> str: """Converts the stream response to OpenAI format. 
@@ -256,7 +277,7 @@ def _apply_openai_json_format( async def _peek_at_generator( - gen: AsyncGenerator[T, None] + gen: AsyncGenerator[T, None], ) -> Tuple[T, AsyncGenerator[T, None]]: # Peek at the first element first_item = await gen.__anext__() @@ -403,7 +424,11 @@ async def _get_response( self, *, body: Union[ - CompletionRequest, ChatCompletionRequest, EmbeddingRequest, ScoreRequest + CompletionRequest, + ChatCompletionRequest, + EmbeddingRequest, + TranscriptionRequest, + ScoreRequest, ], call_method: str, ) -> AsyncGenerator[ @@ -411,6 +436,7 @@ LLMChatResponse, LLMCompletionsResponse, LLMEmbeddingsResponse, + LLMTranscriptionResponse, LLMScoreResponse, ], None, @@ -497,12 +523,10 @@ async def model_data(self, model: str) -> ModelCard: return model_data async def _process_llm_request( - self, body: Union[CompletionRequest, ChatCompletionRequest], is_chat: bool + self, + body: Union[CompletionRequest, ChatCompletionRequest, TranscriptionRequest], + call_method: str, ) -> Response: - NoneStreamingResponseType = ( - ChatCompletionResponse if is_chat else CompletionResponse - ) - call_method = "chat" if is_chat else "completions" async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): @@ -523,7 +547,7 @@ type=first_chunk.error.type, ) - if isinstance(first_chunk, NoneStreamingResponseType): + if isinstance(first_chunk, NON_STREAMING_RESPONSE_TYPES): # Not streaming, first chunk should be a single response return JSONResponse(content=first_chunk.model_dump()) @@ -544,7 +568,9 @@ async def completions(self, body: CompletionRequest) -> Response: Returns: A response object with completions. """ - return await self._process_llm_request(body, is_chat=False) + return await self._process_llm_request( + body, call_method=CallMethod.COMPLETIONS.value + ) async def chat(self, body: ChatCompletionRequest) -> Response: """Given a prompt, the model will return one or more predicted completions, @@ -557,7 +583,7 @@ A response object with completions. """ - return await self._process_llm_request(body, is_chat=True) + return await self._process_llm_request(body, call_method=CallMethod.CHAT.value) async def embeddings(self, body: EmbeddingRequest) -> Response: """Create embeddings for the provided input. @@ -581,6 +607,24 @@ if isinstance(result, EmbeddingResponse): return JSONResponse(content=result.model_dump()) + # Annotated[..., Form()] is a wrapper used to handle multipart form data, which is how audio is sent in transcription requests. + # vLLM implementation for handling transcription requests: https://github.com/vllm-project/vllm/blob/0825197bee8dea547f2ab25f48afd8aea0cd2578/vllm/entrypoints/openai/api_server.py#L839. + async def transcriptions( + self, body: Annotated[TranscriptionRequest, Form()] + ) -> Response: + """Create a transcription for the provided audio input. + + Args: + body: The TranscriptionRequest object. + + Returns: + A response object with transcriptions. + """ + + return await self._process_llm_request( + body, call_method=CallMethod.TRANSCRIPTIONS.value + ) + async def score(self, body: ScoreRequest) -> Response: """Create scores for the provided text pairs. 
diff --git a/python/ray/llm/_internal/serve/core/server/llm_server.py b/python/ray/llm/_internal/serve/core/server/llm_server.py index 263d934f0020..0da17151cc12 100644 --- a/python/ray/llm/_internal/serve/core/server/llm_server.py +++ b/python/ray/llm/_internal/serve/core/server/llm_server.py @@ -52,6 +52,8 @@ ErrorResponse, ScoreRequest, ScoreResponse, + TranscriptionRequest, + TranscriptionResponse, ) logger = get_logger(__name__) @@ -251,7 +253,10 @@ def _get_batch_interval_ms(self, stream: bool = True) -> int: async def _maybe_add_request_id_to_request( self, request: Union[ - "ChatCompletionRequest", "CompletionRequest", "EmbeddingRequest" + "ChatCompletionRequest", + "CompletionRequest", + "EmbeddingRequest", + "TranscriptionRequest", ], ): """Add the request id to the request.""" @@ -282,6 +287,7 @@ async def _run_request( "ChatCompletionRequest", "CompletionRequest", "EmbeddingRequest", + "TranscriptionRequest", "ScoreRequest", ], *, @@ -355,7 +361,7 @@ async def embeddings( ) -> AsyncGenerator[Union[List["ErrorResponse"], "EmbeddingResponse"], None]: """Runs an embeddings request to the engine and returns the response. - Returns an AsyncGenerator over the EmbeddingResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, and embeddings. + Returns an AsyncGenerator over the EmbeddingResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, embeddings, and transcriptions. Args: request: An EmbeddingRequest object. @@ -365,7 +371,30 @@ """ # NOTE: Embeddings does not need batching. return await self._run_request( - request, engine_method="embeddings", batch_output_stream=False + request, + engine_method="embeddings", + batch_output_stream=False, + ) + + async def transcriptions( + self, request: "TranscriptionRequest" + ) -> AsyncGenerator[ + Union[List[Union[str, "ErrorResponse"]], "TranscriptionResponse"], None + ]: + """Runs a transcription request to the engine and returns the response. + + Returns an AsyncGenerator over the TranscriptionResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, embeddings, and transcriptions. + + Args: + request: A TranscriptionRequest object. + + Returns: + An AsyncGenerator over the TranscriptionResponse object. 
+ """ + return await self._run_request( + request, + engine_method="transcriptions", + batch_output_stream=True, ) async def score( diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index 6c422f38a48b..13be7465f885 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -26,6 +26,8 @@ ErrorResponse, ScoreRequest, ScoreResponse, + TranscriptionRequest, + TranscriptionResponse, ) from ray.llm._internal.serve.core.engine.protocol import LLMEngine from ray.llm._internal.serve.engines.vllm.vllm_models import ( @@ -46,6 +48,7 @@ from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_score import ServingScores + from vllm.entrypoints.openai.serving_transcription import OpenAIServingTranscription vllm = try_import("vllm") logger = get_logger(__name__) @@ -147,6 +150,7 @@ def __init__( self._oai_serving_chat: Optional["OpenAIServingChat"] = None self._oai_serving_completion: Optional["OpenAIServingCompletion"] = None self._oai_serving_embedding: Optional["OpenAIServingEmbedding"] = None + self._oai_serving_transcription: Optional["OpenAIServingTranscription"] = None self._oai_serving_scores: Optional["ServingScores"] = None async def start(self) -> None: @@ -208,6 +212,7 @@ async def start(self) -> None: self._oai_serving_chat = state.openai_serving_chat self._oai_serving_completion = state.openai_serving_completion self._oai_serving_embedding = state.openai_serving_embedding + self._oai_serving_transcription = state.openai_serving_transcription self._oai_serving_scores = state.openai_serving_scores self._validate_openai_serving_models() @@ -241,6 +246,11 @@ def _validate_openai_serving_embedding(self): self._oai_serving_embedding, "create_embedding" ), "oai_serving_embedding must have a create_embedding attribute" + def _validate_openai_serving_transcription(self): + assert hasattr( + self._oai_serving_transcription, "create_transcription" + ), "oai_serving_transcription must have a create_transcription attribute" + def _validate_openai_serving_scores(self): assert hasattr( self._oai_serving_scores, "create_score" @@ -351,7 +361,11 @@ async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): def _create_raw_request( self, request: Union[ - CompletionRequest, ChatCompletionRequest, EmbeddingRequest, ScoreRequest + CompletionRequest, + ChatCompletionRequest, + EmbeddingRequest, + TranscriptionRequest, + ScoreRequest, ], path: str, ) -> Request: @@ -383,7 +397,7 @@ async def chat( async for response in chat_response: if not isinstance(response, str): raise ValueError( - f"Expected create_chat_completion to return a stream of strings, got and item with type {type(response)}" + f"Expected create_chat_completion to return a stream of strings, got an item with type {type(response)}" ) yield response else: @@ -412,7 +426,7 @@ async def completions( async for response in completion_response: if not isinstance(response, str): raise ValueError( - f"Expected create_completion to return a stream of strings, got and item with type {type(response)}" + f"Expected create_completion to return a stream of strings, got an item with type {type(response)}" ) yield response else: @@ -444,6 +458,41 @@ async def embeddings( else: yield EmbeddingResponse(**embedding_response.model_dump()) + async def transcriptions( + 
self, request: TranscriptionRequest + ) -> AsyncGenerator[Union[str, TranscriptionResponse, ErrorResponse], None]: + self._validate_openai_serving_transcription() + + # TODO (Kourosh): Remove when we upstream request_id attribute to vLLM. + # PR: https://github.com/vllm-project/vllm/pull/21009 + # Create a fake starlette.Request object with the x-request-id header + # so that the create_transcription API can assign the request_id properly. + raw_request = self._create_raw_request(request, "/audio/transcriptions") + + # Extract audio data from the request file + audio_data = await request.file.read() + + transcription_response = await self._oai_serving_transcription.create_transcription( # type: ignore[attr-defined] + audio_data, + request, + raw_request=raw_request, + ) + + if isinstance(transcription_response, AsyncGenerator): + async for response in transcription_response: + if not isinstance(response, str): + raise ValueError( + f"Expected create_transcription to return a stream of strings, got an item with type {type(response)}" + ) + yield response + else: + if isinstance(transcription_response, VLLMErrorResponse): + yield ErrorResponse( + error=ErrorInfo(**transcription_response.error.model_dump()) + ) + else: + yield TranscriptionResponse(**transcription_response.model_dump()) + async def score( self, request: ScoreRequest ) -> AsyncGenerator[Union[ScoreResponse, ErrorResponse], None]: diff --git a/python/ray/llm/tests/serve/conftest.py b/python/ray/llm/tests/serve/conftest.py index 6598fe1dff1d..071e572a06f4 100644 --- a/python/ray/llm/tests/serve/conftest.py +++ b/python/ray/llm/tests/serve/conftest.py @@ -16,6 +16,7 @@ CompletionRequest, EmbeddingCompletionRequest, ScoreRequest, + TranscriptionRequest, ) from ray.llm._internal.serve.engines.vllm.vllm_models import ( VLLMEngineConfig, @@ -113,6 +114,31 @@ def mock_embedding_request(dimensions): return request +@pytest.fixture +def mock_transcription_request(stream, temperature, language): + """Fixture for creating transcription requests for mock testing.""" + # Create a mock audio file for testing + from io import BytesIO + + from fastapi import UploadFile + + # Create a simple mock audio file (WAV format) + mock_audio_data = b"RIFF\x00\x00\x00\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00\x44\xac\x00\x00\x88X\x01\x00\x02\x00\x10\x00data\x00\x00\x00\x00" # random byte string to test the transcription API + mock_file = UploadFile( + file=BytesIO(mock_audio_data), + filename="test_audio.wav", + ) + + return TranscriptionRequest( + file=mock_file, + model=MOCK_MODEL_ID, + language=language, + temperature=temperature, + stream=stream, + prompt="", + ) + + @pytest.fixture def mock_score_request(): """Fixture for creating score requests for mock testing.""" diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py index 4b259756aae6..5025b9d1d37b 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py @@ -83,6 +83,48 @@ async def test_embedding_mock_engine( async for response in engine.embeddings(request): LLMResponseValidator.validate_embedding_response(response, dimensions) + @pytest.mark.parametrize("stream", [False, True]) + @pytest.mark.parametrize("temperature", [0.0]) + @pytest.mark.parametrize("language", ["en", "hi"]) + @pytest.mark.asyncio + async def test_transcription_mock_engine( + self, + mock_llm_config, + mock_transcription_request, + stream: 
bool, + temperature: float, + language: Optional[str], + ): + """Test transcription API with different language and temperature, streaming and non-streaming.""" + + engine = MockVLLMEngine(mock_llm_config) + await engine.start() + + request = mock_transcription_request + response_generator = engine.transcriptions(request) + + print( + f"\n\n_____ TRANSCRIPTION ({'STREAMING' if stream else 'NON-STREAMING'}) language={language} temperature={temperature} _____\n\n" + ) + + if stream: + # Collect streaming chunks + chunks = [] + async for chunk in response_generator: + assert isinstance(chunk, str) + chunks.append(chunk) + + # Validate streaming response + LLMResponseValidator.validate_transcription_response( + chunks, temperature, language + ) + else: + # Validate non-streaming response + async for response in response_generator: + LLMResponseValidator.validate_transcription_response( + response, temperature, language + ) + @pytest.mark.asyncio async def test_score_mock_engine(self, mock_llm_config, mock_score_request): """Test score API for text similarity.""" diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py index 26814d6260f9..de74530d3e35 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py @@ -156,6 +156,61 @@ async def test_embedding_llm_server( # Validate embedding response LLMResponseValidator.validate_embedding_response(chunks[0], dimensions) + @pytest.mark.parametrize("stream", [False, True]) + @pytest.mark.parametrize("temperature", [0.0]) + @pytest.mark.parametrize("language", ["en", "hi"]) + @pytest.mark.asyncio + async def test_transcription_llm_server( + self, + serve_handle, + mock_llm_config, + mock_transcription_request, + stream: bool, + temperature: float, + language: Optional[str], + ): + """Test transcription API from LLMServer perspective.""" + + # Create transcription request + request = mock_transcription_request + + print( + f"\n\n_____ TRANSCRIPTION SERVER ({'STREAMING' if stream else 'NON-STREAMING'}) language={language} temperature={temperature} _____\n\n" + ) + + # Get the response + batched_chunks = serve_handle.transcriptions.remote(request) + + if stream: + # Collect streaming responses + chunks = [] + async for batch in batched_chunks: + if isinstance(batch, list): + chunks.extend(batch) + else: + chunks.append(batch) + + # Check that we got responses + assert len(chunks) > 0 + + # Validate streaming response + LLMResponseValidator.validate_transcription_response( + chunks, temperature, language + ) + else: + # Collect non-streaming response + chunks = [] + async for batch in batched_chunks: + chunks.append(batch) + + # Check that we got one response + assert len(chunks) == 1 + + # Validate non-streaming response + LLMResponseValidator.validate_transcription_response( + chunks[0], temperature, language + ) + @pytest.mark.asyncio async def test_score_llm_server( self, diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index 4300b4859b91..c23e56b5e088 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -19,6 +19,8 @@ ErrorResponse, ScoreRequest, ScoreResponse, + TranscriptionRequest, + TranscriptionResponse, ) from ray.llm._internal.serve.core.engine.protocol import LLMEngine from ray.llm._internal.serve.utils.lora_serve_utils 
import LoraModelLoader @@ -137,6 +139,23 @@ async def embeddings( ) yield response + async def transcriptions( + self, request: TranscriptionRequest + ) -> AsyncGenerator[Union[str, TranscriptionResponse, ErrorResponse], None]: + """Mock transcription generation.""" + if not self.started: + raise RuntimeError("Engine not started") + + # Extract audio file info + language = getattr(request, "language", "en") + temperature = getattr(request, "temperature", 0.0) + + # Generate transcription response + async for response in self._generate_transcription_response( + request=request, language=language, temperature=temperature + ): + yield response + async def score( self, request: ScoreRequest ) -> AsyncGenerator[Union[str, ScoreResponse, ErrorResponse], None]: @@ -314,6 +333,95 @@ async def _generate_completion_response( yield response + async def _generate_transcription_response( + self, + request: TranscriptionRequest, + language: str, + temperature: float, + ) -> AsyncGenerator[Union[str, TranscriptionResponse], None]: + """Generate mock transcription response.""" + + request_id = request.request_id or f"transcribe-{random.randint(1000, 9999)}" + lora_prefix = ( + "" + if request.model not in self._current_lora_model + else f"[lora_model] {request.model}: " + ) + + # Generate mock transcription text with LoRA prefix + mock_transcription_text = ( + f"Mock transcription in {language} language with temperature {temperature}" + ) + if lora_prefix: + mock_transcription_text = f"{lora_prefix}{mock_transcription_text}" + + if request.stream: + # Streaming response - return SSE formatted strings + created_time = int(asyncio.get_event_loop().time()) + model_name = getattr(request, "model", "mock-model") + + # Split transcription into words for streaming + words = mock_transcription_text.split() + + for i, word in enumerate(words): + # Create streaming chunk + choice = { + "delta": { + "content": word + (" " if i < len(words) - 1 else ""), + }, + } + + chunk_data = { + "delta": None, + "type": None, + "logprobs": None, + "id": request_id, + "object": "transcription.chunk", + "created": created_time, + "model": model_name, + "choices": [choice], + } + + # Format as SSE + yield f"data: {json.dumps(chunk_data)}\n\n" + await asyncio.sleep(0.01) # Simulate processing time + + # Send final chunk with finish_reason + final_choice = { + "delta": { + "content": "", + "finish_reason": "stop", + "stop_reason": None, + }, + } + + final_chunk_data = { + "delta": None, + "type": None, + "logprobs": None, + "id": request_id, + "object": "transcription.chunk", + "created": created_time, + "model": model_name, + "choices": [final_choice], + } + + yield f"data: {json.dumps(final_chunk_data)}\n\n" + + # Send final [DONE] message + yield "data: [DONE]\n\n" + else: + # Non-streaming response - return response object + response = TranscriptionResponse( + text=mock_transcription_text, + logprobs=None, + usage={ + "seconds": 5.0, + "type": "duration", + }, + ) + yield response + class FakeLoraModelLoader(LoraModelLoader): """Fake LoRA model loader for testing that bypasses S3 entirely.""" diff --git a/python/ray/llm/tests/serve/utils/testing_utils.py b/python/ray/llm/tests/serve/utils/testing_utils.py index c63c92921b6c..0a8b4a95ad56 100644 --- a/python/ray/llm/tests/serve/utils/testing_utils.py +++ b/python/ray/llm/tests/serve/utils/testing_utils.py @@ -12,6 +12,7 @@ CompletionResponse, EmbeddingResponse, ScoreResponse, + TranscriptionResponse, ) @@ -108,3 +109,88 @@ def validate_score_response(response: ScoreResponse): 
assert score_data.object == "score" assert isinstance(score_data.score, float) assert score_data.index == i # Index should match position in list + + @staticmethod + def validate_transcription_response( + response: Union[TranscriptionResponse, List[str]], + temperature: float, + language: Optional[str] = None, + lora_model_id: str = "", + ): + """Validate transcription responses for both streaming and non-streaming.""" + if isinstance(response, list): + # Streaming response - validate chunks + LLMResponseValidator.validate_transcription_streaming_chunks( + response, temperature, language, lora_model_id + ) + else: + # Non-streaming response + assert isinstance(response, TranscriptionResponse) + assert hasattr(response, "text") + assert isinstance(response.text, str) + assert len(response.text) > 0 + + # Check that the response contains expected language and temperature info + expected_text = f"Mock transcription in {language} language with temperature {temperature}" + if lora_model_id: + expected_text = f"[lora_model] {lora_model_id}: {expected_text}" + assert response.text == expected_text + + # Validate usage information + if hasattr(response, "usage"): + assert hasattr(response.usage, "seconds") + assert hasattr(response.usage, "type") + assert response.usage.seconds > 0 + assert response.usage.type == "duration" + + @staticmethod + def validate_transcription_streaming_chunks( + chunks: List[str], + temperature: float, + language: Optional[str] = None, + lora_model_id: str = "", + ): + """Validate streaming transcription response chunks.""" + # Should have at least one chunk (transcription text) + final chunk + [DONE] + assert len(chunks) >= 3 + + # Validate each chunk except the last [DONE] chunk + transcription_chunks = [] + for chunk in chunks[:-1]: # Exclude the final [DONE] chunk + pattern = r"data: (.*)\n\n" + match = re.match(pattern, chunk) + assert match is not None + chunk_data = json.loads(match.group(1)) + + # Validate chunk structure + assert "id" in chunk_data + assert "object" in chunk_data + assert chunk_data["object"] == "transcription.chunk" + assert "delta" in chunk_data + assert chunk_data["delta"] is None + assert "type" in chunk_data + assert chunk_data["type"] is None + assert "logprobs" in chunk_data + assert chunk_data["logprobs"] is None + assert "choices" in chunk_data + assert len(chunk_data["choices"]) == 1 + + choice = chunk_data["choices"][0] + assert "delta" in choice + assert "content" in choice["delta"] + + # Collect text for final validation + if choice["delta"]["content"]: + transcription_chunks.append(choice["delta"]["content"]) + + # Validate final transcription text + full_transcription = "".join(transcription_chunks) + expected_text = ( + f"Mock transcription in {language} language with temperature {temperature}" + ) + if lora_model_id: + expected_text = f"[lora_model] {lora_model_id}: {expected_text}" + assert full_transcription.strip() == expected_text.strip() + + # Validate final [DONE] chunk + assert chunks[-1] == "data: [DONE]\n\n" diff --git a/python/ray/serve/llm/openai_api_models.py b/python/ray/serve/llm/openai_api_models.py index 4b04d54dbfdd..18603ac3deb0 100644 --- a/python/ray/serve/llm/openai_api_models.py +++ b/python/ray/serve/llm/openai_api_models.py @@ -8,6 +8,9 @@ EmbeddingRequest as _EmbeddingRequest, EmbeddingResponse as _EmbeddingResponse, ErrorResponse as _ErrorResponse, + TranscriptionRequest as _TranscriptionRequest, + TranscriptionResponse as _TranscriptionResponse, + TranscriptionStreamResponse as 
_TranscriptionStreamResponse,
 )
 from ray.util.annotations import PublicAPI
@@ -85,6 +88,36 @@ class EmbeddingResponse(_EmbeddingResponse):
     pass


+@PublicAPI(stability="alpha")
+class TranscriptionRequest(_TranscriptionRequest):
+    """TranscriptionRequest is the request body for the transcription API.
+
+    This model is compatible with vLLM's OpenAI API models.
+    """
+
+    pass
+
+
+@PublicAPI(stability="alpha")
+class TranscriptionResponse(_TranscriptionResponse):
+    """TranscriptionResponse is the response body for the transcription API.
+
+    This model is compatible with vLLM's OpenAI API models.
+    """
+
+    pass
+
+
+@PublicAPI(stability="alpha")
+class TranscriptionStreamResponse(_TranscriptionStreamResponse):
+    """TranscriptionStreamResponse is the streaming response body for the transcription API.
+
+    This model is compatible with vLLM's OpenAI API models.
+    """
+
+    pass
+
+
 @PublicAPI(stability="alpha")
 class ErrorResponse(_ErrorResponse):
     """The returned response in case of an error."""
diff --git a/python/requirements/llm/llm-requirements.txt b/python/requirements/llm/llm-requirements.txt
index fe3543757e4f..d32e70d23f89 100644
--- a/python/requirements/llm/llm-requirements.txt
+++ b/python/requirements/llm/llm-requirements.txt
@@ -2,7 +2,7 @@
 # constraining to a maximum version (i.e. <=) to temporarily work around a bug.
 # Those pins for the sake of workarounds should not be advertised as constraints
 # on future releases in setup.py.
-vllm>=0.11.0
+vllm[audio]>=0.11.0
 nixl>=0.6.1
 # For json mode
 jsonref>=1.1.0
diff --git a/python/setup.py b/python/setup.py
index 8799f262f1fb..869d5dfabecf 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -374,7 +374,7 @@ def get_packages(self):
 setup_spec.extras["llm"] = list(
     set(
         [
-            "vllm>=0.11.0",
+            "vllm[audio]>=0.11.0",
            "nixl>=0.6.1",
            "jsonref>=1.1.0",
            "jsonschema",
@@ -382,6 +382,8 @@
            # async-timeout is a backport of asyncio.timeout for python < 3.11
            "async-timeout; python_version < '3.11'",
            "typer",
+            "meson",
+            "pybind11",
            "hf_transfer",
        ]
        + setup_spec.extras["data"]
diff --git a/release/llm_tests/serve/test_llm_serve_integration.py b/release/llm_tests/serve/test_llm_serve_integration.py
index 8d1b423ba4b9..03e01dc1766e 100644
--- a/release/llm_tests/serve/test_llm_serve_integration.py
+++ b/release/llm_tests/serve/test_llm_serve_integration.py
@@ -156,6 +156,36 @@ def test_deepseek_model(model_name):
     time.sleep(1)


+@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
+def test_transcription_model(model_name):
+    """
+    Test that transcription models can be loaded successfully.
+    """
+    llm_config = LLMConfig(
+        model_loading_config=dict(
+            model_id=model_name,
+            model_source=model_name,
+        ),
+        deployment_config=dict(
+            autoscaling_config=dict(min_replicas=1, max_replicas=4),
+        ),
+        engine_kwargs=dict(
+            trust_remote_code=True,
+            gpu_memory_utilization=0.9,
+            enable_prefix_caching=True,
+            max_model_len=2048,
+            tokenizer_mode="mistral",
+            config_format="mistral",
+            load_format="mistral",
+        ),
+    )
+    app = build_openai_app({"llm_configs": [llm_config]})
+    serve.run(app, blocking=False)
+    wait_for_condition(is_default_app_running, timeout=180)
+    serve.shutdown()
+    time.sleep(1)
+
+
 @pytest.mark.asyncio(scope="function")
 @pytest.fixture
 def remote_model_app(request):