From 8c48511aecded92b6954b9181e4eeee458c191ae Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 5 Oct 2025 02:14:41 +0530 Subject: [PATCH 01/33] initial commit for transcriptions api integration Signed-off-by: DPatel_7 --- .../serve/core/configs/openai_api_models.py | 23 +++++++++- .../_internal/serve/core/ingress/ingress.py | 35 +++++++++++---- .../_internal/serve/core/server/llm_server.py | 31 +++++++++++-- .../serve/engines/vllm/vllm_engine.py | 43 ++++++++++++++++++- python/ray/serve/llm/openai_api_models.py | 33 ++++++++++++++ 5 files changed, 152 insertions(+), 13 deletions(-) diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index ed2adf153d2c..073f5753c9de 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -21,6 +21,9 @@ EmbeddingChatRequest as vLLMEmbeddingChatRequest, EmbeddingCompletionRequest as vLLMEmbeddingCompletionRequest, EmbeddingResponse as vLLMEmbeddingResponse, + TranscriptionRequest as vLLMTranscriptionRequest, + TranscriptionResponse as vLLMTranscriptionResponse, + TranscriptionStreamResponse as vLLMTranscriptionStreamResponse, ErrorInfo as vLLMErrorInfo, ErrorResponse as vLLMErrorResponse, ScoreRequest as vLLMScoreRequest, @@ -96,6 +99,18 @@ class EmbeddingResponse(vLLMEmbeddingResponse): model_config = ConfigDict(arbitrary_types_allowed=True) +class TranscriptionRequest(vLLMTranscriptionRequest): + model_config = ConfigDict(arbitrary_types_allowed=True) + + +class TranscriptionResponse(vLLMTranscriptionResponse): + model_config = ConfigDict(arbitrary_types_allowed=True) + + +class TranscriptionStreamResponse(vLLMTranscriptionStreamResponse): + model_config = ConfigDict(arbitrary_types_allowed=True) + + class ScoreRequest(vLLMScoreRequest): model_config = ConfigDict(arbitrary_types_allowed=True) @@ -115,7 +130,7 @@ class ScoreResponse(vLLMScoreResponse): ] LLMChatResponse = Union[ - AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None], + AsyncGenerator[Union[str, ChatCompletionStreamResponse, ChatCompletionResponse, ErrorResponse], None], ] LLMCompletionsResponse = Union[ @@ -124,6 +139,12 @@ class ScoreResponse(vLLMScoreResponse): ], ] +LLMTranscriptionResponse = Union[ + AsyncGenerator[ + Union[TranscriptionStreamResponse, TranscriptionResponse, ErrorResponse], None + ], +] + # TODO: remove this class class OpenAIHTTPException(Exception): def __init__( diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index cb61e1ab7a22..f6ae23c43a2d 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -40,10 +40,14 @@ CompletionStreamResponse, EmbeddingRequest, EmbeddingResponse, + TranscriptionRequest, + TranscriptionResponse, + TranscriptionStreamResponse, ErrorResponse, LLMChatResponse, LLMCompletionsResponse, LLMEmbeddingsResponse, + LLMTranscriptionResponse, LLMScoreResponse, ModelCard, ModelList, @@ -110,6 +114,7 @@ def _sanitize_chat_completion_request( StreamResponseType = Union[ ChatCompletionStreamResponse, CompletionStreamResponse, + TranscriptionStreamResponse ] BatchedStreamResponseType = List[StreamResponseType] @@ -403,7 +408,7 @@ async def _get_response( self, *, body: Union[ - CompletionRequest, ChatCompletionRequest, EmbeddingRequest, ScoreRequest + CompletionRequest, ChatCompletionRequest, 
EmbeddingRequest, TranscriptionRequest, ScoreRequest ], call_method: str, ) -> AsyncGenerator[ @@ -411,6 +416,7 @@ async def _get_response( LLMChatResponse, LLMCompletionsResponse, LLMEmbeddingsResponse, + LLMTranscriptionResponse, LLMScoreResponse, ], None, @@ -497,12 +503,15 @@ async def model_data(self, model: str) -> ModelCard: return model_data async def _process_llm_request( - self, body: Union[CompletionRequest, ChatCompletionRequest], is_chat: bool + self, body: Union[CompletionRequest, ChatCompletionRequest, TranscriptionRequest], call_method: str ) -> Response: - NoneStreamingResponseType = ( - ChatCompletionResponse if is_chat else CompletionResponse - ) - call_method = "chat" if is_chat else "completions" + + if call_method == "chat": + NoneStreamingResponseType = ChatCompletionResponse + elif call_method == "completions": + NoneStreamingResponseType = CompletionResponse + elif call_method == "transcriptions": + NoneStreamingResponseType = TranscriptionResponse async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): @@ -544,7 +553,7 @@ async def completions(self, body: CompletionRequest) -> Response: Returns: A response object with completions. """ - return await self._process_llm_request(body, is_chat=False) + return await self._process_llm_request(body, call_method="completions") async def chat(self, body: ChatCompletionRequest) -> Response: """Given a prompt, the model will return one or more predicted completions, @@ -557,7 +566,7 @@ async def chat(self, body: ChatCompletionRequest) -> Response: A response object with completions. """ - return await self._process_llm_request(body, is_chat=True) + return await self._process_llm_request(body, call_method="chat") async def embeddings(self, body: EmbeddingRequest) -> Response: """Create embeddings for the provided input. @@ -580,6 +589,16 @@ async def embeddings(self, body: EmbeddingRequest) -> Response: if isinstance(result, EmbeddingResponse): return JSONResponse(content=result.model_dump()) + + @fastapi_router_app.post("/v1/audio/transcriptions") + async def transcriptions(self, body: TranscriptionRequest) -> Response: + """Create transcription for the provided audio input. + + Returns: + A response object with transcriptins. + """ + + return await self._process_llm_request(body, call_method="transcriptions") async def score(self, body: ScoreRequest) -> Response: """Create scores for the provided text pairs. 
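As a point of reference while reviewing the new `/v1/audio/transcriptions` route added above, here is a minimal client-side sketch of calling it once a transcription-capable model is deployed behind this router. It is illustrative only and not part of the patch: the base URL, API key, model id (`whisper-large`), and local `audio.wav` file are assumptions, and it presumes the route accepts the standard OpenAI-compatible transcription request.

from openai import OpenAI

# Illustrative only: base_url, api_key, model id, and audio file are assumptions.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

with open("audio.wav", "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model="whisper-large",  # model_id configured in the Serve LLM app
        file=audio_file,
        language="en",
        temperature=0.0,
    )

print(transcription.text)
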
diff --git a/python/ray/llm/_internal/serve/core/server/llm_server.py b/python/ray/llm/_internal/serve/core/server/llm_server.py index 263d934f0020..c3390828b947 100644 --- a/python/ray/llm/_internal/serve/core/server/llm_server.py +++ b/python/ray/llm/_internal/serve/core/server/llm_server.py @@ -49,6 +49,8 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, + TranscriptionRequest, + TranscriptionResponse, ErrorResponse, ScoreRequest, ScoreResponse, @@ -251,7 +253,7 @@ def _get_batch_interval_ms(self, stream: bool = True) -> int: async def _maybe_add_request_id_to_request( self, request: Union[ - "ChatCompletionRequest", "CompletionRequest", "EmbeddingRequest" + "ChatCompletionRequest", "CompletionRequest", "EmbeddingRequest", "TranscriptionRequest" ], ): """Add the request id to the request.""" @@ -282,6 +284,7 @@ async def _run_request( "ChatCompletionRequest", "CompletionRequest", "EmbeddingRequest", + "TranscriptionRequest" "ScoreRequest", ], *, @@ -355,7 +358,7 @@ async def embeddings( ) -> AsyncGenerator[Union[List["ErrorResponse"], "EmbeddingResponse"], None]: """Runs an embeddings request to the engine and returns the response. - Returns an AsyncGenerator over the EmbeddingResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, and embeddings. + Returns an AsyncGenerator over the EmbeddingResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, embeddings and transcriptions. Args: request: An EmbeddingRequest object. @@ -365,7 +368,29 @@ async def embeddings( """ # NOTE: Embeddings does not need batching. return await self._run_request( - request, engine_method="embeddings", batch_output_stream=False + request, + engine_method="embeddings", + batch_output_stream=False, + ) + + async def transcriptions( + self, request: "TranscriptionRequest" + ) -> AsyncGenerator[Union[List["ErrorResponse"], "TranscriptionResponse"], None]: + """Runs an transcriptions request to the engine and returns the response. + + Returns an AsyncGenerator over the TranscriptionResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, embeddings and transcriptions. + + Args: + request: An TranscriptionRequest object. + + Returns: + An AsyncGenerator over the TranscriptionResponse object. + """ + # NOTE: Embeddings does not need batching. 
+ return await self._run_request( + request, + engine_method="transcriptions", + batch_output_stream=True, ) async def score( diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index 6c422f38a48b..2acac244345f 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -22,6 +22,8 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, + TranscriptionRequest, + TranscriptionResponse, ErrorInfo, ErrorResponse, ScoreRequest, @@ -44,6 +46,7 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding + from vllm.entrypoints.openai.serving_transcription import OpenAIServingTranscription from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_score import ServingScores @@ -147,6 +150,7 @@ def __init__( self._oai_serving_chat: Optional["OpenAIServingChat"] = None self._oai_serving_completion: Optional["OpenAIServingCompletion"] = None self._oai_serving_embedding: Optional["OpenAIServingEmbedding"] = None + self._oai_serving_transcription: Optional["OpenAIServingTranscription"] = None self._oai_serving_scores: Optional["ServingScores"] = None async def start(self) -> None: @@ -208,6 +212,7 @@ async def start(self) -> None: self._oai_serving_chat = state.openai_serving_chat self._oai_serving_completion = state.openai_serving_completion self._oai_serving_embedding = state.openai_serving_embedding + self._oai_serving_transcription = state.openai_serving_transcription self._oai_serving_scores = state.openai_serving_scores self._validate_openai_serving_models() @@ -241,6 +246,11 @@ def _validate_openai_serving_embedding(self): self._oai_serving_embedding, "create_embedding" ), "oai_serving_embedding must have a create_embedding attribute" + def _validate_openai_serving_transcription(self): + assert hasattr( + self._oai_serving_transcription, "create_transcription" + ), "oai_serving_transcription must have a create_transcription attribute" + def _validate_openai_serving_scores(self): assert hasattr( self._oai_serving_scores, "create_score" @@ -351,7 +361,7 @@ async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): def _create_raw_request( self, request: Union[ - CompletionRequest, ChatCompletionRequest, EmbeddingRequest, ScoreRequest + CompletionRequest, ChatCompletionRequest, EmbeddingRequest, TranscriptionRequest, ScoreRequest ], path: str, ) -> Request: @@ -444,6 +454,37 @@ async def embeddings( else: yield EmbeddingResponse(**embedding_response.model_dump()) + async def transcription( + self, request: TranscriptionRequest + ) -> AsyncGenerator[Union[str, TranscriptionResponse, ErrorResponse], None]: + self._validate_openai_serving_transcription() + + # TODO (Kourosh): Remove when we upstream request_id attribute to vLLM. + # PR: https://github.com/vllm-project/vllm/pull/21009 + # Create a fake starlette.Request object with the x-request-id header + # so that the create_transcription API can assign the request_id properly. 
+ raw_request = self._create_raw_request(request, "/audio/transcriptions") + + transcription_response = await self._oai_serving_transcription.create_transcription( # type: ignore[attr-defined] + request, + raw_request=raw_request, + ) + + if isinstance(transcription_response, AsyncGenerator): + async for response in transcription_response: + if not isinstance(response, str): + raise ValueError( + f"Expected create_transcription to return a stream of strings, got and item with type {type(response)}" + ) + yield response + else: + if isinstance(transcription_response, VLLMErrorResponse): + yield ErrorResponse( + error=ErrorInfo(**transcription_response.error.model_dump()) + ) + else: + yield TranscriptionResponse(**transcription_response.model_dump()) + async def score( self, request: ScoreRequest ) -> AsyncGenerator[Union[ScoreResponse, ErrorResponse], None]: diff --git a/python/ray/serve/llm/openai_api_models.py b/python/ray/serve/llm/openai_api_models.py index 4b04d54dbfdd..48fe7d3bee2a 100644 --- a/python/ray/serve/llm/openai_api_models.py +++ b/python/ray/serve/llm/openai_api_models.py @@ -7,6 +7,9 @@ CompletionStreamResponse as _CompletionStreamResponse, EmbeddingRequest as _EmbeddingRequest, EmbeddingResponse as _EmbeddingResponse, + TranscriptionRequest as _TranscriptionRequest, + TranscriptionResponse as _TranscriptionResponse, + TranscriptionStreamResponse as _TranscriptionStreamResponse, ErrorResponse as _ErrorResponse, ) from ray.util.annotations import PublicAPI @@ -85,6 +88,36 @@ class EmbeddingResponse(_EmbeddingResponse): pass +@PublicAPI(stability="alpha") +class TranscriptionRequest(_TranscriptionRequest): + """TranscriptionRequest is the request body for the transcription API. + + This model is compatible with vLLM's OpenAI API models. + """ + + pass + + +@PublicAPI(stability="alpha") +class TranscriptionResponse(_TranscriptionResponse): + """TranscriptionResponse is the response body for the transcription API. + + This model is compatible with vLLM's OpenAI API models. + """ + + pass + + +@PublicAPI(stability="alpha") +class TranscriptionStreamResponse(_TranscriptionStreamResponse): + """TranscriptionStreamResponse is the response body for the transcription API. + + This model is compatible with vLLM's OpenAI API models. + """ + + pass + + @PublicAPI(stability="alpha") class ErrorResponse(_ErrorResponse): """The returned response in case of an error.""" From 1c793b3675b4f57a5417fde5b7116f8699989592 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 5 Oct 2025 02:33:24 +0530 Subject: [PATCH 02/33] naming fixes Signed-off-by: DPatel_7 --- python/ray/llm/_internal/serve/core/ingress/ingress.py | 2 +- python/ray/llm/_internal/serve/core/server/llm_server.py | 3 +-- python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index f6ae23c43a2d..dbeaa769454c 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -595,7 +595,7 @@ async def transcriptions(self, body: TranscriptionRequest) -> Response: """Create transcription for the provided audio input. Returns: - A response object with transcriptins. + A response object with transcriptions. 
""" return await self._process_llm_request(body, call_method="transcriptions") diff --git a/python/ray/llm/_internal/serve/core/server/llm_server.py b/python/ray/llm/_internal/serve/core/server/llm_server.py index c3390828b947..dec41f2dbed7 100644 --- a/python/ray/llm/_internal/serve/core/server/llm_server.py +++ b/python/ray/llm/_internal/serve/core/server/llm_server.py @@ -284,7 +284,7 @@ async def _run_request( "ChatCompletionRequest", "CompletionRequest", "EmbeddingRequest", - "TranscriptionRequest" + "TranscriptionRequest", "ScoreRequest", ], *, @@ -386,7 +386,6 @@ async def transcriptions( Returns: An AsyncGenerator over the TranscriptionResponse object. """ - # NOTE: Embeddings does not need batching. return await self._run_request( request, engine_method="transcriptions", diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index 2acac244345f..1c69e7b72ba6 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -454,7 +454,7 @@ async def embeddings( else: yield EmbeddingResponse(**embedding_response.model_dump()) - async def transcription( + async def transcriptions( self, request: TranscriptionRequest ) -> AsyncGenerator[Union[str, TranscriptionResponse, ErrorResponse], None]: self._validate_openai_serving_transcription() From 0d4039c7e160c75f722074ee83c7b69549139699 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Wed, 8 Oct 2025 02:02:24 +0530 Subject: [PATCH 03/33] ci tests for transcriptions api and docs for transcription Signed-off-by: DPatel_7 --- python/ray/llm/tests/serve/conftest.py | 26 ++++ .../cpu/deployments/llm/test_llm_engine.py | 42 +++++++ .../cpu/deployments/llm/test_llm_server.py | 55 +++++++++ .../llm/tests/serve/mocks/mock_vllm_engine.py | 113 ++++++++++++++++++ .../llm/tests/serve/utils/testing_utils.py | 78 ++++++++++++ 5 files changed, 314 insertions(+) diff --git a/python/ray/llm/tests/serve/conftest.py b/python/ray/llm/tests/serve/conftest.py index 6598fe1dff1d..ed58b4cc809e 100644 --- a/python/ray/llm/tests/serve/conftest.py +++ b/python/ray/llm/tests/serve/conftest.py @@ -16,6 +16,7 @@ CompletionRequest, EmbeddingCompletionRequest, ScoreRequest, + TranscriptionRequest, ) from ray.llm._internal.serve.engines.vllm.vllm_models import ( VLLMEngineConfig, @@ -113,6 +114,31 @@ def mock_embedding_request(dimensions): return request +@pytest.fixture +def mock_transcription_request(stream, temperature, language): + """Fixture for creating transcription requests for mock testing.""" + # Create a mock audio file for testing + from io import BytesIO + from fastapi import UploadFile + + # Create a simple mock audio file (WAV format) + mock_audio_data = b"RIFF\x00\x00\x00\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00\x44\xac\x00\x00\x88X\x01\x00\x02\x00\x10\x00data\x00\x00\x00\x00" + mock_file = UploadFile( + file=BytesIO(mock_audio_data), + filename="test_audio.wav", + content_type="audio/wav", + ) + + return TranscriptionRequest( + file=mock_file, + model=MOCK_MODEL_ID, + language=language, + temperature=temperature, + stream=stream, + prompt="", + ) + + @pytest.fixture def mock_score_request(): """Fixture for creating score requests for mock testing.""" diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py index 4b259756aae6..0755105d3886 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py 
+++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py @@ -83,6 +83,48 @@ async def test_embedding_mock_engine( async for response in engine.embeddings(request): LLMResponseValidator.validate_embedding_response(response, dimensions) + @pytest.mark.parametrize("stream", [False, True]) + @pytest.mark.parametrize("temperature", [0.0]) + @pytest.mark.parametrize("language", ["en", "hi"]) + @pytest.mark.asyncio + async def test_transcription_mock_engine( + self, + mock_llm_config, + mock_transcription_request, + stream: bool, + temperature: float, + language: Optional[str], + ): + """Test transcription API with different language and temperature, streaming and non-streaming.""" + + engine = MockVLLMEngine(mock_llm_config) + await engine.start() + + request = mock_transcription_request + response_generator = engine.transcription(request) + + print( + f"\n\n_____ TRANSCRIPTION ({'STREAMING' if stream else 'NON-STREAMING'}) language={language} temperature={temperature} _____\n\n" + ) + + if stream: + # Collect streaming chunks + chunks = [] + async for chunk in response_generator: + assert isinstance(chunk, str) + chunks.append(chunk) + + # Validate streaming response + LLMResponseValidator.validate_transcription_response( + chunks, temperature, language + ) + else: + # Validate non-streaming response + async for response in response_generator: + LLMResponseValidator.validate_transcription_response( + response, temperature, language + ) + @pytest.mark.asyncio async def test_score_mock_engine(self, mock_llm_config, mock_score_request): """Test score API for text similarity.""" diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py index 26814d6260f9..07425024b522 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py @@ -155,6 +155,61 @@ async def test_embedding_llm_server( # Validate embedding response LLMResponseValidator.validate_embedding_response(chunks[0], dimensions) + + @pytest.mark.parametrize("stream", [False, True]) + @pytest.mark.parametrize("temperature", [0.0]) + @pytest.mark.parametrize("language", ["en", "hi"]) + @pytest.mark.asyncio + async def test_transcription_llm_server( + self, + serve_handle, + mock_llm_config, + mock_transcription_request, + stream: bool, + temperature: float, + language: Optional[str], + ): + """Test transcription API from LLMServer perspective.""" + + # Create transcription request + request = mock_transcription_request + + print( + f"\n\n_____ TRANSCRIPTION SERVER ({'STREAMING' if stream else 'NON-STREAMING'}) language={language} temperature={temperature} _____\n\n" + ) + + # Get the response + batched_chunks = serve_handle.transcriptions.remote(request) + + if stream: + # Collect streaming responses + chunks = [] + async for batch in batched_chunks: + if isinstance(batch, list): + chunks.extend(batch) + else: + chunks.append(batch) + + # Check that we got responses + assert len(chunks) > 0 + + # Validate streaming response + LLMResponseValidator.validate_transcription_response( + chunks, temperature, language + ) + else: + # Collect non-streaming response + chunks = [] + async for batch in batched_chunks: + chunks.append(batch) + + # Check that we got one response + assert len(chunks) == 1 + + # Validate non-streaming response + LLMResponseValidator.validate_transcription_response( + chunks[0], temperature, language + ) @pytest.mark.asyncio async def 
test_score_llm_server( diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index 4300b4859b91..a926c429c577 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -16,9 +16,12 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, + TranscriptionRequest, + TranscriptionResponse, ErrorResponse, ScoreRequest, ScoreResponse, + TranscriptionStreamResponse, ) from ray.llm._internal.serve.core.engine.protocol import LLMEngine from ray.llm._internal.serve.utils.lora_serve_utils import LoraModelLoader @@ -137,6 +140,33 @@ async def embeddings( ) yield response + async def transcription( + self, request: TranscriptionRequest + ) -> AsyncGenerator[Union[str, TranscriptionResponse, ErrorResponse], None]: + """Mock transcription generation.""" + if not self.started: + raise RuntimeError("Engine not started") + + # Extract audio file info + audio_file = request.file + language = getattr(request, "language", "en") + temperature = getattr(request, "temperature", 0.0) + stream = getattr(request, "stream", False) + + # Generate mock transcription response + mock_transcription_text = ( + f"Mock transcription in {language} language with temperature {temperature}" + ) + + # Generate transcription response + async for response in self._generate_transcription_response( + request=request, + transcription_text=mock_transcription_text, + language=language, + temperature=temperature, + ): + yield response + async def score( self, request: ScoreRequest ) -> AsyncGenerator[Union[str, ScoreResponse, ErrorResponse], None]: @@ -314,6 +344,89 @@ async def _generate_completion_response( yield response + async def _generate_transcription_response( + self, + request: TranscriptionRequest, + transcription_text: str, + language: str, + temperature: float, + ) -> AsyncGenerator[Union[str, TranscriptionResponse], None]: + """Generate mock transcription response.""" + + request_id = request.request_id or f"transcribe-{random.randint(1000, 9999)}" + lora_prefix = ( + "" + if request.model not in self._current_lora_model + else f"[lora_model] {request.model}: " + ) + + if request.stream: + # Streaming response - return SSE formatted strings + created_time = int(asyncio.get_event_loop().time()) + model_name = getattr(request, "model", "mock-model") + + # Split transcription into words for streaming + words = transcription_text.split() + + for i, word in enumerate(words): + # Create streaming chunk + choice = { + "delta": { + "content": word + (" " if i < len(words) - 1 else ""), + }, + } + + chunk_data = { + "delta": None, + "type": None, + "logprobs": None, + "id": request_id, + "object": "transcription.chunk", + "created": created_time, + "model": model_name, + "choices": [choice], + } + + # Format as SSE + yield f"data: {json.dumps(chunk_data)}\n\n" + await asyncio.sleep(0.01) # Simulate processing time + + # Send final chunk with finish_reason + final_choice = { + "delta": { + "content": "", + "finish_reason": "stop", + "stop_reason": None, + }, + } + + final_chunk_data = { + "delta": None, + "type": None, + "logprobs": None, + "id": request_id, + "object": "transcription.chunk", + "created": created_time, + "model": model_name, + "choices": [final_choice], + } + + yield f"data: {json.dumps(final_chunk_data)}\n\n" + + # Send final [DONE] message + yield "data: [DONE]\n\n" + else: + # Non-streaming response - return response object + response = TranscriptionResponse( + 
text=transcription_text, + logprobs=None, + usage={ + "seconds": 5.0, + "type": "duration", + }, + ) + yield response + class FakeLoraModelLoader(LoraModelLoader): """Fake LoRA model loader for testing that bypasses S3 entirely.""" diff --git a/python/ray/llm/tests/serve/utils/testing_utils.py b/python/ray/llm/tests/serve/utils/testing_utils.py index c63c92921b6c..b1fd72bce525 100644 --- a/python/ray/llm/tests/serve/utils/testing_utils.py +++ b/python/ray/llm/tests/serve/utils/testing_utils.py @@ -12,6 +12,7 @@ CompletionResponse, EmbeddingResponse, ScoreResponse, + TranscriptionResponse, ) @@ -108,3 +109,80 @@ def validate_score_response(response: ScoreResponse): assert score_data.object == "score" assert isinstance(score_data.score, float) assert score_data.index == i # Index should match position in list + + @staticmethod + def validate_transcription_response( + response: Union[TranscriptionResponse, List[str]], + temperature: float, + language: Optional[str] = None, + ): + """Validate transcription responses for both streaming and non-streaming.""" + if isinstance(response, list): + # Streaming response - validate chunks + LLMResponseValidator.validate_transcription_streaming_chunks( + response, temperature, language + ) + else: + # Non-streaming response + assert isinstance(response, TranscriptionResponse) + assert hasattr(response, "text") + assert isinstance(response.text, str) + assert len(response.text) > 0 + + # Check that the response contains expected language and temperature info + expected_text = f"Mock transcription in {language} language with temperature {temperature}" + assert response.text == expected_text + + # Validate usage information + if hasattr(response, "usage"): + assert hasattr(response.usage, "seconds") + assert hasattr(response.usage, "type") + assert response.usage.seconds > 0 + assert response.usage.type == "duration" + + @staticmethod + def validate_transcription_streaming_chunks( + chunks: List[str], temperature: float, language: Optional[str] = None + ): + """Validate streaming transcription response chunks.""" + # Should have at least one chunk (transcription text) + final chunk + [DONE] + assert len(chunks) >= 3 + + # Validate each chunk except the last [DONE] chunk + transcription_chunks = [] + for chunk in chunks[:-1]: # Exclude the final [DONE] chunk + pattern = r"data: (.*)\n\n" + match = re.match(pattern, chunk) + assert match is not None + chunk_data = json.loads(match.group(1)) + + # Validate chunk structure + assert "id" in chunk_data + assert "object" in chunk_data + assert chunk_data["object"] == "transcription.chunk" + assert "delta" in chunk_data + assert chunk_data["delta"] is None + assert "type" in chunk_data + assert chunk_data["type"] is None + assert "logprobs" in chunk_data + assert chunk_data["logprobs"] is None + assert "choices" in chunk_data + assert len(chunk_data["choices"]) == 1 + + choice = chunk_data["choices"][0] + assert "delta" in choice + assert "content" in choice["delta"] + + # Collect text for final validation + if choice["delta"]["content"]: + transcription_chunks.append(choice["delta"]["content"]) + + # Validate final transcription text + full_transcription = "".join(transcription_chunks) + expected_text = ( + f"Mock transcription in {language} language with temperature {temperature}" + ) + assert full_transcription.strip() == expected_text.strip() + + # Validate final [DONE] chunk + assert chunks[-1] == "data: [DONE]\n\n" From 863de39c28892e8a64a19bd85cd6736cc31cb2f6 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: 
Wed, 8 Oct 2025 19:21:02 +0530 Subject: [PATCH 04/33] type error fix Signed-off-by: DPatel_7 --- python/ray/llm/tests/serve/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ray/llm/tests/serve/conftest.py b/python/ray/llm/tests/serve/conftest.py index ed58b4cc809e..f540de167cd0 100644 --- a/python/ray/llm/tests/serve/conftest.py +++ b/python/ray/llm/tests/serve/conftest.py @@ -126,7 +126,6 @@ def mock_transcription_request(stream, temperature, language): mock_file = UploadFile( file=BytesIO(mock_audio_data), filename="test_audio.wav", - content_type="audio/wav", ) return TranscriptionRequest( From fd611a5d321bf6806d1118e6eaf7246aa493bf40 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Wed, 8 Oct 2025 19:30:04 +0530 Subject: [PATCH 05/33] formatting updated and added engine transcription function def Signed-off-by: DPatel_7 --- .../serve/core/configs/openai_api_models.py | 10 +++--- .../_internal/serve/core/engine/protocol.py | 31 +++++++++++++++++++ .../_internal/serve/core/ingress/ingress.py | 22 +++++++------ .../_internal/serve/core/server/llm_server.py | 5 ++- .../serve/engines/vllm/vllm_engine.py | 6 +++- .../cpu/deployments/llm/test_llm_server.py | 2 +- 6 files changed, 60 insertions(+), 16 deletions(-) diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index 073f5753c9de..1c432f9a44bf 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -125,12 +125,13 @@ class ScoreResponse(vLLMScoreResponse): AsyncGenerator[Union[EmbeddingResponse, ErrorResponse], None], ] -LLMScoreResponse = Union[ - AsyncGenerator[Union[ScoreResponse, ErrorResponse], None], -] +LLMScoreResponse = Union[AsyncGenerator[Union[ScoreResponse, ErrorResponse], None],] LLMChatResponse = Union[ - AsyncGenerator[Union[str, ChatCompletionStreamResponse, ChatCompletionResponse, ErrorResponse], None], + AsyncGenerator[ + Union[str, ChatCompletionStreamResponse, ChatCompletionResponse, ErrorResponse], + None, + ], ] LLMCompletionsResponse = Union[ @@ -145,6 +146,7 @@ class ScoreResponse(vLLMScoreResponse): ], ] + # TODO: remove this class class OpenAIHTTPException(Exception): def __init__( diff --git a/python/ray/llm/_internal/serve/core/engine/protocol.py b/python/ray/llm/_internal/serve/core/engine/protocol.py index 56bcc5acf827..468157734a2c 100644 --- a/python/ray/llm/_internal/serve/core/engine/protocol.py +++ b/python/ray/llm/_internal/serve/core/engine/protocol.py @@ -14,6 +14,8 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, + TranscriptionRequest, + TranscriptionResponse, ErrorResponse, ) @@ -118,6 +120,35 @@ async def embeddings( """ pass + @abc.abstractmethod + async def transcriptions( + self, request: "TranscriptionRequest" + ) -> AsyncGenerator[Union["TranscriptionResponse", "ErrorResponse"], None]: + """Run a Transcription with the engine. + + Similar to chat and completion, this method is an async generator, + so it yields chunks of response and when it is done, it returns None. + We have the following convention: + + * In case of streaming, yield a string representing data: + \n\n for each chunk. This should be already openAI compatible, + so the higher level can just yield it to the client. + * In case of non-streaming, yield a single object of type TranscriptionResponse. + * In case of error, yield a single object of type ErrorResponse. + + Args: + request: The transcription request. 
+ + Yields: + Union[str, TranscriptionResponse, ErrorResponse]: A string + representing a chunk of the response, a TranscriptionResponse object, + or an ErrorResponse object. + + Returns: + None when the generator is done. + """ + pass + async def check_health(self) -> None: """Check the health of the engine. diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index dbeaa769454c..54b6e05590ba 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -112,9 +112,7 @@ def _sanitize_chat_completion_request( StreamResponseType = Union[ - ChatCompletionStreamResponse, - CompletionStreamResponse, - TranscriptionStreamResponse + ChatCompletionStreamResponse, CompletionStreamResponse, TranscriptionStreamResponse ] BatchedStreamResponseType = List[StreamResponseType] @@ -232,7 +230,7 @@ def make_fastapi_ingress( def _apply_openai_json_format( - response: Union[StreamResponseType, BatchedStreamResponseType] + response: Union[StreamResponseType, BatchedStreamResponseType], ) -> str: """Converts the stream response to OpenAI format. @@ -261,7 +259,7 @@ def _apply_openai_json_format( async def _peek_at_generator( - gen: AsyncGenerator[T, None] + gen: AsyncGenerator[T, None], ) -> Tuple[T, AsyncGenerator[T, None]]: # Peek at the first element first_item = await gen.__anext__() @@ -408,7 +406,11 @@ async def _get_response( self, *, body: Union[ - CompletionRequest, ChatCompletionRequest, EmbeddingRequest, TranscriptionRequest, ScoreRequest + CompletionRequest, + ChatCompletionRequest, + EmbeddingRequest, + TranscriptionRequest, + ScoreRequest, ], call_method: str, ) -> AsyncGenerator[ @@ -503,9 +505,11 @@ async def model_data(self, model: str) -> ModelCard: return model_data async def _process_llm_request( - self, body: Union[CompletionRequest, ChatCompletionRequest, TranscriptionRequest], call_method: str + self, + body: Union[CompletionRequest, ChatCompletionRequest, TranscriptionRequest], + call_method: str, ) -> Response: - + if call_method == "chat": NoneStreamingResponseType = ChatCompletionResponse elif call_method == "completions": @@ -589,7 +593,7 @@ async def embeddings(self, body: EmbeddingRequest) -> Response: if isinstance(result, EmbeddingResponse): return JSONResponse(content=result.model_dump()) - + @fastapi_router_app.post("/v1/audio/transcriptions") async def transcriptions(self, body: TranscriptionRequest) -> Response: """Create transcription for the provided audio input. 
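The engine protocol above specifies that streaming transcriptions are yielded as raw `data: ...\n\n` strings, so the new route can be consumed as a plain SSE stream. A rough consumption sketch follows; the URL, model id, audio file, the form-encoded stream flag, and the exact chunk shape (mirroring the `transcription.chunk` objects produced by the mock engine earlier in this series) are all assumptions for illustration, not part of the patch.

import json
import requests

# Illustrative only: URL, model id, audio file, and stream flag encoding are assumptions.
with open("audio.wav", "rb") as audio_file:
    response = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": ("audio.wav", audio_file, "audio/wav")},
        data={"model": "whisper-large", "stream": "true"},
        stream=True,
        timeout=60,
    )
response.raise_for_status()

for line in response.iter_lines(decode_unicode=True):
    if not line or not line.startswith("data: "):
        continue
    payload = line[len("data: "):]
    if payload == "[DONE]":
        break
    chunk = json.loads(payload)
    # Assumed chunk shape: choices[].delta.content, as in the mock engine's output.
    for choice in chunk.get("choices", []):
        print(choice.get("delta", {}).get("content", ""), end="", flush=True)
print()
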
diff --git a/python/ray/llm/_internal/serve/core/server/llm_server.py b/python/ray/llm/_internal/serve/core/server/llm_server.py index dec41f2dbed7..6328e3623b9a 100644 --- a/python/ray/llm/_internal/serve/core/server/llm_server.py +++ b/python/ray/llm/_internal/serve/core/server/llm_server.py @@ -253,7 +253,10 @@ def _get_batch_interval_ms(self, stream: bool = True) -> int: async def _maybe_add_request_id_to_request( self, request: Union[ - "ChatCompletionRequest", "CompletionRequest", "EmbeddingRequest", "TranscriptionRequest" + "ChatCompletionRequest", + "CompletionRequest", + "EmbeddingRequest", + "TranscriptionRequest", ], ): """Add the request id to the request.""" diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index 1c69e7b72ba6..6d16252955ba 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -361,7 +361,11 @@ async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): def _create_raw_request( self, request: Union[ - CompletionRequest, ChatCompletionRequest, EmbeddingRequest, TranscriptionRequest, ScoreRequest + CompletionRequest, + ChatCompletionRequest, + EmbeddingRequest, + TranscriptionRequest, + ScoreRequest, ], path: str, ) -> Request: diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py index 07425024b522..de74530d3e35 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py @@ -155,7 +155,7 @@ async def test_embedding_llm_server( # Validate embedding response LLMResponseValidator.validate_embedding_response(chunks[0], dimensions) - + @pytest.mark.parametrize("stream", [False, True]) @pytest.mark.parametrize("temperature", [0.0]) @pytest.mark.parametrize("language", ["en", "hi"]) From c55fdc9fb83549fae125d4f3d39ff335f3cccb02 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Wed, 8 Oct 2025 19:37:03 +0530 Subject: [PATCH 06/33] naming updates Signed-off-by: DPatel_7 --- .../ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py | 2 +- python/ray/llm/tests/serve/mocks/mock_vllm_engine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py index 0755105d3886..5025b9d1d37b 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py @@ -101,7 +101,7 @@ async def test_transcription_mock_engine( await engine.start() request = mock_transcription_request - response_generator = engine.transcription(request) + response_generator = engine.transcriptions(request) print( f"\n\n_____ TRANSCRIPTION ({'STREAMING' if stream else 'NON-STREAMING'}) language={language} temperature={temperature} _____\n\n" diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index a926c429c577..8bc4a65e1114 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -140,7 +140,7 @@ async def embeddings( ) yield response - async def transcription( + async def transcriptions( self, request: TranscriptionRequest ) -> AsyncGenerator[Union[str, TranscriptionResponse, ErrorResponse], 
None]: """Mock transcription generation.""" From 7b62802a5330cab6571c65e0cdc71f97f04d3b2e Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Wed, 8 Oct 2025 20:13:08 +0530 Subject: [PATCH 07/33] lora prefix updates and code formatting Signed-off-by: DPatel_7 --- .../serve/core/configs/openai_api_models.py | 4 +++- .../llm/tests/serve/mocks/mock_vllm_engine.py | 24 ++++++++----------- .../llm/tests/serve/utils/testing_utils.py | 12 ++++++++-- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index 1c432f9a44bf..150db113d180 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -125,7 +125,9 @@ class ScoreResponse(vLLMScoreResponse): AsyncGenerator[Union[EmbeddingResponse, ErrorResponse], None], ] -LLMScoreResponse = Union[AsyncGenerator[Union[ScoreResponse, ErrorResponse], None],] +LLMScoreResponse = Union[ + AsyncGenerator[Union[ScoreResponse, ErrorResponse], None], +] LLMChatResponse = Union[ AsyncGenerator[ diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index 8bc4a65e1114..ab277691983e 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -148,22 +148,12 @@ async def transcriptions( raise RuntimeError("Engine not started") # Extract audio file info - audio_file = request.file language = getattr(request, "language", "en") temperature = getattr(request, "temperature", 0.0) - stream = getattr(request, "stream", False) - - # Generate mock transcription response - mock_transcription_text = ( - f"Mock transcription in {language} language with temperature {temperature}" - ) # Generate transcription response async for response in self._generate_transcription_response( - request=request, - transcription_text=mock_transcription_text, - language=language, - temperature=temperature, + request=request, language=language, temperature=temperature ): yield response @@ -347,7 +337,6 @@ async def _generate_completion_response( async def _generate_transcription_response( self, request: TranscriptionRequest, - transcription_text: str, language: str, temperature: float, ) -> AsyncGenerator[Union[str, TranscriptionResponse], None]: @@ -360,13 +349,20 @@ async def _generate_transcription_response( else f"[lora_model] {request.model}: " ) + # Generate mock transcription text with LoRA prefix + mock_transcription_text = ( + f"Mock transcription in {language} language with temperature {temperature}" + ) + if lora_prefix: + mock_transcription_text = f"{lora_prefix}{mock_transcription_text}" + if request.stream: # Streaming response - return SSE formatted strings created_time = int(asyncio.get_event_loop().time()) model_name = getattr(request, "model", "mock-model") # Split transcription into words for streaming - words = transcription_text.split() + words = mock_transcription_text.split() for i, word in enumerate(words): # Create streaming chunk @@ -418,7 +414,7 @@ async def _generate_transcription_response( else: # Non-streaming response - return response object response = TranscriptionResponse( - text=transcription_text, + text=mock_transcription_text, logprobs=None, usage={ "seconds": 5.0, diff --git a/python/ray/llm/tests/serve/utils/testing_utils.py b/python/ray/llm/tests/serve/utils/testing_utils.py index 
b1fd72bce525..0a8b4a95ad56 100644 --- a/python/ray/llm/tests/serve/utils/testing_utils.py +++ b/python/ray/llm/tests/serve/utils/testing_utils.py @@ -115,12 +115,13 @@ def validate_transcription_response( response: Union[TranscriptionResponse, List[str]], temperature: float, language: Optional[str] = None, + lora_model_id: str = "", ): """Validate transcription responses for both streaming and non-streaming.""" if isinstance(response, list): # Streaming response - validate chunks LLMResponseValidator.validate_transcription_streaming_chunks( - response, temperature, language + response, temperature, language, lora_model_id ) else: # Non-streaming response @@ -131,6 +132,8 @@ def validate_transcription_response( # Check that the response contains expected language and temperature info expected_text = f"Mock transcription in {language} language with temperature {temperature}" + if lora_model_id: + expected_text = f"[lora_model] {lora_model_id}: {expected_text}" assert response.text == expected_text # Validate usage information @@ -142,7 +145,10 @@ def validate_transcription_response( @staticmethod def validate_transcription_streaming_chunks( - chunks: List[str], temperature: float, language: Optional[str] = None + chunks: List[str], + temperature: float, + language: Optional[str] = None, + lora_model_id: str = "", ): """Validate streaming transcription response chunks.""" # Should have at least one chunk (transcription text) + final chunk + [DONE] @@ -182,6 +188,8 @@ def validate_transcription_streaming_chunks( expected_text = ( f"Mock transcription in {language} language with temperature {temperature}" ) + if lora_model_id: + expected_text = f"[lora_model] {lora_model_id}: {expected_text}" assert full_transcription.strip() == expected_text.strip() # Validate final [DONE] chunk From 77d162ade5594fcd7562b9a255fe3590eddea78c Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Wed, 8 Oct 2025 21:57:44 +0530 Subject: [PATCH 08/33] request_id added in transcription request Signed-off-by: DPatel_7 --- .../_internal/serve/core/configs/openai_api_models.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index 150db113d180..376f6e8b436d 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -102,6 +102,15 @@ class EmbeddingResponse(vLLMEmbeddingResponse): class TranscriptionRequest(vLLMTranscriptionRequest): model_config = ConfigDict(arbitrary_types_allowed=True) + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." 
+ ), + ) + class TranscriptionResponse(vLLMTranscriptionResponse): model_config = ConfigDict(arbitrary_types_allowed=True) From 8294a338b9f2a41d1f3e0ed3b661b2775cc13252 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Thu, 9 Oct 2025 22:25:01 +0530 Subject: [PATCH 09/33] modified docs for ci tests and added release test Signed-off-by: DPatel_7 --- .../transcription_config_example.yaml | 29 +++++ .../transcription/transcription_example.py | 101 ++++++++++++++++++ .../transcription_yaml_config_example.py | 51 +++++++++ .../serve/core/configs/openai_api_models.py | 2 +- .../_internal/serve/core/ingress/ingress.py | 14 ++- .../serve/test_llm_serve_integration.py | 28 +++++ 6 files changed, 219 insertions(+), 6 deletions(-) create mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml create mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_example.py create mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml b/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml new file mode 100644 index 000000000000..eeac1ea33001 --- /dev/null +++ b/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml @@ -0,0 +1,29 @@ +# config.yaml +applications: +- args: + llm_configs: + - model_loading_config: + model_id: whisper-large + model_source: openai/whisper-large-v3 + accelerator_type: A10G + deployment_config: + autoscaling_config: + min_replicas: 1 + max_replicas: 2 + log_engine_metrics: true + - model_loading_config: + model_id: voxtral-mini + model_source: mistralai/Voxtral-Mini-3B-2507 + accelerator_type: A10G + engine_kwargs: + tokenizer_mode: mistral + config_format: mistral + load_format: mistral + deployment_config: + autoscaling_config: + min_replicas: 1 + max_replicas: 2 + log_engine_metrics: true + import_path: ray.serve.llm:build_openai_app + name: llm_app + route_prefix: "/" diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py new file mode 100644 index 000000000000..6ff55a6e2272 --- /dev/null +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -0,0 +1,101 @@ +""" +This file serves as a documentation example and CI test. + +Structure: +1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. +2. Docs example (between __transcription_example_start/end__): Embedded in Sphinx docs via literalinclude. +3. 
Test validation (deployment status polling + cleanup) +""" + +import time +from ray import serve +from ray.serve.schema import ApplicationStatus +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME +from ray.serve import llm + +_original_serve_run = serve.run +_original_build_openai_app = llm.build_openai_app + + +def _non_blocking_serve_run(app, **kwargs): + """Forces blocking=False for testing""" + kwargs["blocking"] = False + return _original_serve_run(app, **kwargs) + + +def _testing_build_openai_app(llm_serving_args): + """Removes accelerator requirements for testing""" + for config in llm_serving_args["llm_configs"]: + config.accelerator_type = None + + return _original_build_openai_app(llm_serving_args) + + +serve.run = _non_blocking_serve_run +llm.build_openai_app = _testing_build_openai_app + +# __transcription_example_start__ +from ray import serve +from ray.serve.llm import LLMConfig, build_openai_app + +whisper_llm_config = LLMConfig( + model_loading_config={ + "model_id": "whisper-large", + "model_source": "openai/whisper-large-v3", + }, + deployment_config={ + "autoscaling_config": { + "min_replicas": 1, + "max_replicas": 2, + } + }, + # Pass the desired accelerator type (e.g. A10G, L4, etc.) + accelerator_type="A10G", + log_engine_metrics=True, +) + +voxtral_llm_config = LLMConfig( + model_loading_config={ + "model_id": "voxtral-mini", + "model_source": "mistralai/Voxtral-Mini-3B-2507", + }, + deployment_config={ + "autoscaling_config": { + "min_replicas": 1, + "max_replicas": 2, + } + }, + accelerator_type="A10G", + # You can customize the engine arguments (e.g. vLLM engine kwargs) + engine_kwargs={ + "tokenizer_mode": "mistral", + "config_format": "mistral", + "load_format": "mistral", + }, + log_engine_metrics=True, +) + +app = build_openai_app({"llm_configs": [whisper_llm_config, voxtral_llm_config]}) +serve.run(app, blocking=True) +# __transcription_example_end__ + +status = ApplicationStatus.NOT_STARTED +timeout_seconds = 300 +start_time = time.time() + +while ( + status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds +): + status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status + + if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: + raise AssertionError(f"Deployment failed with status: {status}") + + time.sleep(1) + +if status != ApplicationStatus.RUNNING: + raise AssertionError( + f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" + ) + +serve.shutdown() diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py new file mode 100644 index 000000000000..e37def988888 --- /dev/null +++ b/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py @@ -0,0 +1,51 @@ +""" +This file serves as a documentation example and CI test for YAML config deployment. + +Structure: +1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. +2. Load YAML config and convert to Python using build_openai_app +3. 
Test validation (deployment status polling + cleanup) +""" + +import time +import os +import yaml +from ray import serve +from ray.serve.schema import ApplicationStatus +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME +from ray.serve import llm + + +config_path = os.path.join( + os.path.dirname(__file__), "transcription_config_example.yaml" +) +with open(config_path, "r") as f: + config_dict = yaml.safe_load(f) + +llm_configs = config_dict["applications"][0]["args"]["llm_configs"] +for config in llm_configs: + config.pop("accelerator_type", None) + +app = llm.build_openai_app({"llm_configs": llm_configs}) +serve.run(app, blocking=False) + +status = ApplicationStatus.NOT_STARTED +timeout_seconds = 300 +start_time = time.time() + +while ( + status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds +): + status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status + + if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: + raise AssertionError(f"Deployment failed with status: {status}") + + time.sleep(1) + +if status != ApplicationStatus.RUNNING: + raise AssertionError( + f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" + ) + +serve.shutdown() diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index 376f6e8b436d..b7c75b8d8064 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -110,7 +110,7 @@ class TranscriptionRequest(vLLMTranscriptionRequest): "through out the inference process and return in response." ), ) - + class TranscriptionResponse(vLLMTranscriptionResponse): model_config = ConfigDict(arbitrary_types_allowed=True) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index 54b6e05590ba..7fc62a881dd7 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -1,6 +1,7 @@ import asyncio import json import sys +from enum import Enum from contextlib import asynccontextmanager from typing import ( Any, @@ -86,6 +87,10 @@ DEFAULT_INGRESS_OPTIONS = { "max_ongoing_requests": DEFAULT_MAX_ONGOING_REQUESTS, } +class CallMethod(Enum): + CHAT = "chat" + COMPLETIONS = "completions" + TRANSCRIPTIONS = "transcriptions" def _sanitize_chat_completion_request( @@ -507,14 +512,14 @@ async def model_data(self, model: str) -> ModelCard: async def _process_llm_request( self, body: Union[CompletionRequest, ChatCompletionRequest, TranscriptionRequest], - call_method: str, + call_method: CallMethod, ) -> Response: - if call_method == "chat": + if call_method == CallMethod.CHAT: NoneStreamingResponseType = ChatCompletionResponse - elif call_method == "completions": + elif call_method == CallMethod.COMPLETIONS: NoneStreamingResponseType = CompletionResponse - elif call_method == "transcriptions": + elif call_method == CallMethod.TRANSCRIPTIONS: NoneStreamingResponseType = TranscriptionResponse async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): @@ -594,7 +599,6 @@ async def embeddings(self, body: EmbeddingRequest) -> Response: if isinstance(result, EmbeddingResponse): return JSONResponse(content=result.model_dump()) - @fastapi_router_app.post("/v1/audio/transcriptions") async def transcriptions(self, body: TranscriptionRequest) -> Response: 
"""Create transcription for the provided audio input. diff --git a/release/llm_tests/serve/test_llm_serve_integration.py b/release/llm_tests/serve/test_llm_serve_integration.py index 8d1b423ba4b9..894b31d65930 100644 --- a/release/llm_tests/serve/test_llm_serve_integration.py +++ b/release/llm_tests/serve/test_llm_serve_integration.py @@ -156,6 +156,34 @@ def test_deepseek_model(model_name): time.sleep(1) +@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"]) +def test_transcription_model(model_name): + """ + Test that the transcription models can be loaded successfully. + """ + llm_config = LLMConfig( + model_loading_config=dict( + model_id=model_name, + model_source=model_name, + ), + deployment_config=dict( + autoscaling_config=dict(min_replicas=1, max_replicas=4), + ), + engine_kwargs=dict( + trust_remote_code=True, + gpu_memory_utilization=0.9, + enable_prefix_caching=True, + max_model_len=2048, + tokenizer_mode="mistral", + config_format="mistral", + load_format="mistral", + ), + ) + app = build_openai_app({"llm_configs": [llm_config]}) + serve.run(app, blocking=False) + wait_for_condition(is_default_app_running, timeout=180) + + @pytest.mark.asyncio(scope="function") @pytest.fixture def remote_model_app(request): From c5134d545a415db512fe801b8b32ebdd2f0ff0d8 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Thu, 9 Oct 2025 23:07:47 +0530 Subject: [PATCH 10/33] enum fix Signed-off-by: DPatel_7 --- python/ray/llm/_internal/serve/core/ingress/ingress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index 7fc62a881dd7..1621ea9a2b1f 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -524,7 +524,7 @@ async def _process_llm_request( async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): - gen = self._get_response(body=body, call_method=call_method) + gen = self._get_response(body=body, call_method=call_method.value) # In streaming with batching enabled, this first response can be a list of chunks. initial_response, gen = await _peek_at_generator(gen) From 2cd0ac9c25d90d3ac7049ccc4104a12028601561 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Fri, 10 Oct 2025 01:54:49 +0530 Subject: [PATCH 11/33] enum fix Signed-off-by: DPatel_7 --- python/ray/llm/_internal/serve/core/ingress/ingress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index 1621ea9a2b1f..7fc62a881dd7 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -524,7 +524,7 @@ async def _process_llm_request( async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): - gen = self._get_response(body=body, call_method=call_method.value) + gen = self._get_response(body=body, call_method=call_method) # In streaming with batching enabled, this first response can be a list of chunks. 
initial_response, gen = await _peek_at_generator(gen) From b248c9015f502a4e2d307150e72453b77e6b77ac Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sat, 11 Oct 2025 01:45:05 +0530 Subject: [PATCH 12/33] router updates Signed-off-by: DPatel_7 --- .../_internal/serve/core/ingress/ingress.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index 7fc62a881dd7..a61a8b86a674 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -87,12 +87,21 @@ DEFAULT_INGRESS_OPTIONS = { "max_ongoing_requests": DEFAULT_MAX_ONGOING_REQUESTS, } + + class CallMethod(Enum): CHAT = "chat" COMPLETIONS = "completions" TRANSCRIPTIONS = "transcriptions" +NON_STREAMING_RESPONSE_TYPES = ( + ChatCompletionResponse, + CompletionResponse, + TranscriptionResponse, +) + + def _sanitize_chat_completion_request( request: ChatCompletionRequest, ) -> ChatCompletionRequest: @@ -130,6 +139,7 @@ def _sanitize_chat_completion_request( "completions": lambda app: app.post("/v1/completions"), "chat": lambda app: app.post("/v1/chat/completions"), "embeddings": lambda app: app.post("/v1/embeddings"), + "transcriptions": lambda app: app.post("/v1/audio/transcriptions"), "score": lambda app: app.post("/v1/score"), } @@ -515,13 +525,6 @@ async def _process_llm_request( call_method: CallMethod, ) -> Response: - if call_method == CallMethod.CHAT: - NoneStreamingResponseType = ChatCompletionResponse - elif call_method == CallMethod.COMPLETIONS: - NoneStreamingResponseType = CompletionResponse - elif call_method == CallMethod.TRANSCRIPTIONS: - NoneStreamingResponseType = TranscriptionResponse - async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): gen = self._get_response(body=body, call_method=call_method) @@ -541,7 +544,7 @@ async def _process_llm_request( type=first_chunk.error.type, ) - if isinstance(first_chunk, NoneStreamingResponseType): + if isinstance(first_chunk, NON_STREAMING_RESPONSE_TYPES): # Not streaming, first chunk should be a single response return JSONResponse(content=first_chunk.model_dump()) From 92d4fdbcb136588fe21077428553c41255daf3cd Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sat, 11 Oct 2025 02:12:27 +0530 Subject: [PATCH 13/33] router fix Signed-off-by: DPatel_7 --- python/ray/llm/_internal/serve/core/ingress/ingress.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index a61a8b86a674..e1545a7e57f2 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -605,6 +605,9 @@ async def embeddings(self, body: EmbeddingRequest) -> Response: async def transcriptions(self, body: TranscriptionRequest) -> Response: """Create transcription for the provided audio input. + Args: + body: The TranscriptionRequest object. + Returns: A response object with transcriptions. 
""" From fff6dbac8c90ad7754d61f4e4a34a96c17e058ca Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sat, 11 Oct 2025 23:56:47 +0530 Subject: [PATCH 14/33] pre commit hooks run and bazel build Signed-off-by: DPatel_7 --- doc/BUILD.bazel | 2 +- .../_internal/serve/core/configs/openai_api_models.py | 6 +++--- python/ray/llm/_internal/serve/core/engine/protocol.py | 2 +- python/ray/llm/_internal/serve/core/ingress/ingress.py | 10 +++++----- .../ray/llm/_internal/serve/core/server/llm_server.py | 4 ++-- .../llm/_internal/serve/engines/vllm/vllm_engine.py | 6 +++--- python/ray/llm/tests/serve/conftest.py | 1 + python/ray/llm/tests/serve/mocks/mock_vllm_engine.py | 5 ++--- python/ray/serve/llm/openai_api_models.py | 2 +- 9 files changed, 19 insertions(+), 19 deletions(-) diff --git a/doc/BUILD.bazel b/doc/BUILD.bazel index 26df9d2197af..830d0e95d357 100644 --- a/doc/BUILD.bazel +++ b/doc/BUILD.bazel @@ -348,7 +348,7 @@ py_test_run_all_subdirectory( include = ["source/llm/doc_code/serve/**/*.py"], exclude = [], extra_srcs = [], - data = ["source/llm/doc_code/serve/qwen/llm_config_example.yaml"], + data = ["source/llm/doc_code/serve/qwen/llm_config_example.yaml", "source/llm/doc_code/serve/transcription/transcription_config_example.yaml"], tags = [ "exclusive", "gpu", diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index b7c75b8d8064..ec12e9988890 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -21,13 +21,13 @@ EmbeddingChatRequest as vLLMEmbeddingChatRequest, EmbeddingCompletionRequest as vLLMEmbeddingCompletionRequest, EmbeddingResponse as vLLMEmbeddingResponse, - TranscriptionRequest as vLLMTranscriptionRequest, - TranscriptionResponse as vLLMTranscriptionResponse, - TranscriptionStreamResponse as vLLMTranscriptionStreamResponse, ErrorInfo as vLLMErrorInfo, ErrorResponse as vLLMErrorResponse, ScoreRequest as vLLMScoreRequest, ScoreResponse as vLLMScoreResponse, + TranscriptionRequest as vLLMTranscriptionRequest, + TranscriptionResponse as vLLMTranscriptionResponse, + TranscriptionStreamResponse as vLLMTranscriptionStreamResponse, ) from vllm.utils import random_uuid diff --git a/python/ray/llm/_internal/serve/core/engine/protocol.py b/python/ray/llm/_internal/serve/core/engine/protocol.py index 468157734a2c..9270865302c3 100644 --- a/python/ray/llm/_internal/serve/core/engine/protocol.py +++ b/python/ray/llm/_internal/serve/core/engine/protocol.py @@ -14,9 +14,9 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, + ErrorResponse, TranscriptionRequest, TranscriptionResponse, - ErrorResponse, ) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index e1545a7e57f2..ebdbbd06e3a6 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -1,8 +1,8 @@ import asyncio import json import sys -from enum import Enum from contextlib import asynccontextmanager +from enum import Enum from typing import ( Any, AsyncGenerator, @@ -41,20 +41,20 @@ CompletionStreamResponse, EmbeddingRequest, EmbeddingResponse, - TranscriptionRequest, - TranscriptionResponse, - TranscriptionStreamResponse, ErrorResponse, LLMChatResponse, LLMCompletionsResponse, LLMEmbeddingsResponse, - LLMTranscriptionResponse, LLMScoreResponse, + LLMTranscriptionResponse, ModelCard, ModelList, 
OpenAIHTTPException, ScoreRequest, ScoreResponse, + TranscriptionRequest, + TranscriptionResponse, + TranscriptionStreamResponse, to_model_metadata, ) from ray.llm._internal.serve.core.ingress.middleware import ( diff --git a/python/ray/llm/_internal/serve/core/server/llm_server.py b/python/ray/llm/_internal/serve/core/server/llm_server.py index 6328e3623b9a..7f44ea501655 100644 --- a/python/ray/llm/_internal/serve/core/server/llm_server.py +++ b/python/ray/llm/_internal/serve/core/server/llm_server.py @@ -49,11 +49,11 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, - TranscriptionRequest, - TranscriptionResponse, ErrorResponse, ScoreRequest, ScoreResponse, + TranscriptionRequest, + TranscriptionResponse, ) logger = get_logger(__name__) diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index 6d16252955ba..6e065f0c8598 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -22,12 +22,12 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, - TranscriptionRequest, - TranscriptionResponse, ErrorInfo, ErrorResponse, ScoreRequest, ScoreResponse, + TranscriptionRequest, + TranscriptionResponse, ) from ray.llm._internal.serve.core.engine.protocol import LLMEngine from ray.llm._internal.serve.engines.vllm.vllm_models import ( @@ -46,9 +46,9 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding - from vllm.entrypoints.openai.serving_transcription import OpenAIServingTranscription from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_score import ServingScores + from vllm.entrypoints.openai.serving_transcription import OpenAIServingTranscription vllm = try_import("vllm") logger = get_logger(__name__) diff --git a/python/ray/llm/tests/serve/conftest.py b/python/ray/llm/tests/serve/conftest.py index f540de167cd0..5c7357a1823e 100644 --- a/python/ray/llm/tests/serve/conftest.py +++ b/python/ray/llm/tests/serve/conftest.py @@ -119,6 +119,7 @@ def mock_transcription_request(stream, temperature, language): """Fixture for creating transcription requests for mock testing.""" # Create a mock audio file for testing from io import BytesIO + from fastapi import UploadFile # Create a simple mock audio file (WAV format) diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index ab277691983e..c23e56b5e088 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -16,12 +16,11 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, - TranscriptionRequest, - TranscriptionResponse, ErrorResponse, ScoreRequest, ScoreResponse, - TranscriptionStreamResponse, + TranscriptionRequest, + TranscriptionResponse, ) from ray.llm._internal.serve.core.engine.protocol import LLMEngine from ray.llm._internal.serve.utils.lora_serve_utils import LoraModelLoader diff --git a/python/ray/serve/llm/openai_api_models.py b/python/ray/serve/llm/openai_api_models.py index 48fe7d3bee2a..18603ac3deb0 100644 --- a/python/ray/serve/llm/openai_api_models.py +++ b/python/ray/serve/llm/openai_api_models.py @@ -7,10 +7,10 @@ CompletionStreamResponse as _CompletionStreamResponse, EmbeddingRequest as 
_EmbeddingRequest, EmbeddingResponse as _EmbeddingResponse, + ErrorResponse as _ErrorResponse, TranscriptionRequest as _TranscriptionRequest, TranscriptionResponse as _TranscriptionResponse, TranscriptionStreamResponse as _TranscriptionStreamResponse, - ErrorResponse as _ErrorResponse, ) from ray.util.annotations import PublicAPI From 7485e369a9c853aae27846500c174945b38b9a4a Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 12 Oct 2025 00:21:23 +0530 Subject: [PATCH 15/33] enum fixes Signed-off-by: DPatel_7 --- .../ray/llm/_internal/serve/core/ingress/ingress.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index ebdbbd06e3a6..5d7973b833ed 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -522,7 +522,7 @@ async def model_data(self, model: str) -> ModelCard: async def _process_llm_request( self, body: Union[CompletionRequest, ChatCompletionRequest, TranscriptionRequest], - call_method: CallMethod, + call_method: str, ) -> Response: async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): @@ -565,7 +565,9 @@ async def completions(self, body: CompletionRequest) -> Response: Returns: A response object with completions. """ - return await self._process_llm_request(body, call_method="completions") + return await self._process_llm_request( + body, call_method=CallMethod.COMPLETIONS.value + ) async def chat(self, body: ChatCompletionRequest) -> Response: """Given a prompt, the model will return one or more predicted completions, @@ -578,7 +580,7 @@ async def chat(self, body: ChatCompletionRequest) -> Response: A response object with completions. """ - return await self._process_llm_request(body, call_method="chat") + return await self._process_llm_request(body, call_method=CallMethod.CHAT.value) async def embeddings(self, body: EmbeddingRequest) -> Response: """Create embeddings for the provided input. @@ -612,7 +614,9 @@ async def transcriptions(self, body: TranscriptionRequest) -> Response: A response object with transcriptions. """ - return await self._process_llm_request(body, call_method="transcriptions") + return await self._process_llm_request( + body, call_method=CallMethod.TRANSCRIPTIONS.value + ) async def score(self, body: ScoreRequest) -> Response: """Create scores for the provided text pairs. 
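A note on the router dispatch the patches above converge on: endpoints name their call method through the `CallMethod` enum, and `_process_llm_request` no longer branches per endpoint to pick a response type; it checks the first yielded chunk against `NON_STREAMING_RESPONSE_TYPES`. The following is a minimal, self-contained sketch of that pattern only. The response classes here are simplified stand-ins, not the actual Ray Serve or vLLM models.

```python
# Minimal sketch of the dispatch pattern above. The response classes are
# simplified stand-ins for the OpenAI-style models, not the real ones.
from enum import Enum
from typing import Any, Tuple


class CallMethod(Enum):
    CHAT = "chat"
    COMPLETIONS = "completions"
    TRANSCRIPTIONS = "transcriptions"


class ChatCompletionResponse:
    pass


class CompletionResponse:
    pass


class TranscriptionResponse:
    pass


NON_STREAMING_RESPONSE_TYPES: Tuple[type, ...] = (
    ChatCompletionResponse,
    CompletionResponse,
    TranscriptionResponse,
)


def render_mode(first_chunk: Any) -> str:
    """Pick a rendering strategy from the first chunk of a response generator.

    Non-streaming requests yield exactly one response object, so an isinstance
    check against the tuple is enough; anything else is treated as a stream.
    """
    if isinstance(first_chunk, NON_STREAMING_RESPONSE_TYPES):
        return "json"
    return "stream"


if __name__ == "__main__":
    assert render_mode(TranscriptionResponse()) == "json"
    assert render_mode("data: {...}") == "stream"
    print(CallMethod.TRANSCRIPTIONS.value)  # -> "transcriptions"
```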
From bea6209b5603c7d4e0704a74e924072fceff7497 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 12 Oct 2025 16:46:02 +0530 Subject: [PATCH 16/33] inconsistency fixes Signed-off-by: DPatel_7 --- .../llm/_internal/serve/core/configs/openai_api_models.py | 5 +++-- python/ray/llm/_internal/serve/core/server/llm_server.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index ec12e9988890..9fc708ce0bc6 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -147,13 +147,14 @@ class ScoreResponse(vLLMScoreResponse): LLMCompletionsResponse = Union[ AsyncGenerator[ - Union[CompletionStreamResponse, CompletionResponse, ErrorResponse], None + Union[str, CompletionStreamResponse, CompletionResponse, ErrorResponse], None ], ] LLMTranscriptionResponse = Union[ AsyncGenerator[ - Union[TranscriptionStreamResponse, TranscriptionResponse, ErrorResponse], None + Union[str, TranscriptionStreamResponse, TranscriptionResponse, ErrorResponse], + None, ], ] diff --git a/python/ray/llm/_internal/serve/core/server/llm_server.py b/python/ray/llm/_internal/serve/core/server/llm_server.py index 7f44ea501655..0da17151cc12 100644 --- a/python/ray/llm/_internal/serve/core/server/llm_server.py +++ b/python/ray/llm/_internal/serve/core/server/llm_server.py @@ -378,7 +378,9 @@ async def embeddings( async def transcriptions( self, request: "TranscriptionRequest" - ) -> AsyncGenerator[Union[List["ErrorResponse"], "TranscriptionResponse"], None]: + ) -> AsyncGenerator[ + Union[List[Union[str, "ErrorResponse"]], "TranscriptionResponse"], None + ]: """Runs an transcriptions request to the engine and returns the response. Returns an AsyncGenerator over the TranscriptionResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, embeddings and transcriptions. From 7d805283d8ca4a84b33932d9cba4f6441651f84a Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 12 Oct 2025 16:55:31 +0530 Subject: [PATCH 17/33] updates Signed-off-by: DPatel_7 --- python/ray/llm/_internal/serve/core/engine/protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/llm/_internal/serve/core/engine/protocol.py b/python/ray/llm/_internal/serve/core/engine/protocol.py index 9270865302c3..c36b8073d0da 100644 --- a/python/ray/llm/_internal/serve/core/engine/protocol.py +++ b/python/ray/llm/_internal/serve/core/engine/protocol.py @@ -123,7 +123,7 @@ async def embeddings( @abc.abstractmethod async def transcriptions( self, request: "TranscriptionRequest" - ) -> AsyncGenerator[Union["TranscriptionResponse", "ErrorResponse"], None]: + ) -> AsyncGenerator[Union[str, "TranscriptionResponse", "ErrorResponse"], None]: """Run a Transcription with the engine. 
Similar to chat and completion, this method is an async generator, From fa48092442ed6d581277b1e91d968c7cbd56e686 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Thu, 16 Oct 2025 23:28:30 +0530 Subject: [PATCH 18/33] query server doc test added and router updates Signed-off-by: DPatel_7 --- .../transcription/transcription_example.py | 18 ++++++++++++++++++ .../_internal/serve/core/ingress/ingress.py | 11 ++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py index 6ff55a6e2272..b733e2288038 100644 --- a/doc/source/llm/doc_code/serve/transcription/transcription_example.py +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -8,6 +8,8 @@ """ import time +import openai +import requests from ray import serve from ray.serve.schema import ApplicationStatus from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME @@ -98,4 +100,20 @@ def _testing_build_openai_app(llm_serving_args): f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" ) +response = requests.get("https://voiceage.com/wbsamples/in_stereo/Sports.wav") +with open("audio.wav", "wb") as f: + f.write(response.content) + +client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key") +file = open("/audio.wav", "rb") +try: + response = client.audio.transcriptions.create( + model="whisper-large", + file=file, + temperature=0.0, + language="en", + ) +except Exception as e: + raise AssertionError(f"Error while querying models: {e}. Check the logs for more details.") + serve.shutdown() diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index 5d7973b833ed..e84c41c9d227 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -4,6 +4,7 @@ from contextlib import asynccontextmanager from enum import Enum from typing import ( + Annotated, Any, AsyncGenerator, Awaitable, @@ -17,7 +18,7 @@ Union, ) -from fastapi import FastAPI, HTTPException, status +from fastapi import FastAPI, Form, HTTPException, status from fastapi.middleware.cors import CORSMiddleware from starlette.responses import JSONResponse, Response, StreamingResponse @@ -139,7 +140,9 @@ def _sanitize_chat_completion_request( "completions": lambda app: app.post("/v1/completions"), "chat": lambda app: app.post("/v1/chat/completions"), "embeddings": lambda app: app.post("/v1/embeddings"), - "transcriptions": lambda app: app.post("/v1/audio/transcriptions"), + "transcriptions": lambda app: app.post( + "/v1/audio/transcriptions", + ), "score": lambda app: app.post("/v1/score"), } @@ -604,7 +607,9 @@ async def embeddings(self, body: EmbeddingRequest) -> Response: if isinstance(result, EmbeddingResponse): return JSONResponse(content=result.model_dump()) - async def transcriptions(self, body: TranscriptionRequest) -> Response: + async def transcriptions( + self, body: Annotated[TranscriptionRequest, Form()] + ) -> Response: """Create transcription for the provided audio input. 
Args: From cf20ea5c805658e1e826f53b953f60d77a1be1b4 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sat, 18 Oct 2025 15:57:52 +0530 Subject: [PATCH 19/33] fix Signed-off-by: DPatel_7 --- .../doc_code/serve/transcription/transcription_example.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py index b733e2288038..181d83b35d47 100644 --- a/doc/source/llm/doc_code/serve/transcription/transcription_example.py +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -105,7 +105,8 @@ def _testing_build_openai_app(llm_serving_args): f.write(response.content) client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key") -file = open("/audio.wav", "rb") +file = open("audio.wav", "rb") + try: response = client.audio.transcriptions.create( model="whisper-large", @@ -114,6 +115,8 @@ def _testing_build_openai_app(llm_serving_args): language="en", ) except Exception as e: - raise AssertionError(f"Error while querying models: {e}. Check the logs for more details.") + raise AssertionError( + f"Error while querying models: {e}. Check the logs for more details." + ) serve.shutdown() From 2910796dc78a3640deef5804f691479d7a219e4a Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sat, 18 Oct 2025 17:20:31 +0530 Subject: [PATCH 20/33] create_transcription and release test fixes Signed-off-by: DPatel_7 --- python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py | 4 ++++ release/llm_tests/serve/test_llm_serve_integration.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index 6e065f0c8598..2f0adea640ac 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -469,7 +469,11 @@ async def transcriptions( # so that the create_transcription API can assign the request_id properly. 
raw_request = self._create_raw_request(request, "/audio/transcriptions") + # Extract audio data from the request file + audio_data = await request.file.read() + transcription_response = await self._oai_serving_transcription.create_transcription( # type: ignore[attr-defined] + audio_data, request, raw_request=raw_request, ) diff --git a/release/llm_tests/serve/test_llm_serve_integration.py b/release/llm_tests/serve/test_llm_serve_integration.py index 894b31d65930..03e01dc1766e 100644 --- a/release/llm_tests/serve/test_llm_serve_integration.py +++ b/release/llm_tests/serve/test_llm_serve_integration.py @@ -182,6 +182,8 @@ def test_transcription_model(model_name): app = build_openai_app({"llm_configs": [llm_config]}) serve.run(app, blocking=False) wait_for_condition(is_default_app_running, timeout=180) + serve.shutdown() + time.sleep(1) @pytest.mark.asyncio(scope="function") From 6dc2d41dd20ad4ea2ad12ab14ff5cce3fbfdadec Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sat, 18 Oct 2025 18:30:03 +0530 Subject: [PATCH 21/33] requirements updates Signed-off-by: DPatel_7 --- python/requirements/llm/llm-requirements.txt | 5 ++++- python/setup.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/python/requirements/llm/llm-requirements.txt b/python/requirements/llm/llm-requirements.txt index fe3543757e4f..a9b40e9b7fe6 100644 --- a/python/requirements/llm/llm-requirements.txt +++ b/python/requirements/llm/llm-requirements.txt @@ -2,7 +2,7 @@ # constraining to a maximum version (i.e. <=) to temporarily work around a bug. # Those pins for the sake of workarounds should not be advertised as constraints # on future releases in setup.py. -vllm>=0.11.0 +vllm[audio]>=0.11.0 nixl>=0.6.1 # For json mode jsonref>=1.1.0 @@ -14,3 +14,6 @@ typer meson pybind11 hf_transfer +# Audio processing dependencies for transcription support +librosa +soundfile diff --git a/python/setup.py b/python/setup.py index 8799f262f1fb..2c124ee01494 100644 --- a/python/setup.py +++ b/python/setup.py @@ -374,7 +374,7 @@ def get_packages(self): setup_spec.extras["llm"] = list( set( [ - "vllm>=0.11.0", + "vllm[audio]>=0.11.0", "nixl>=0.6.1", "jsonref>=1.1.0", "jsonschema", @@ -383,6 +383,9 @@ def get_packages(self): "async-timeout; python_version < '3.11'", "typer", "hf_transfer", + # Audio processing dependencies for transcription support + "librosa", + "soundfile", ] + setup_spec.extras["data"] + setup_spec.extras["serve"] From 4d97377d42ea9301a3fcc84c23603b16f7953af3 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 19 Oct 2025 14:44:51 +0530 Subject: [PATCH 22/33] lock updates Signed-off-by: DPatel_7 --- python/deplocks/llm/rayllm_py311_cpu.lock | 88 +++++++++++++++++++ python/deplocks/llm/rayllm_py311_cu128.lock | 88 +++++++++++++++++++ .../deplocks/llm/rayllm_test_py311_cpu.lock | 82 ++++++++++++++++- .../deplocks/llm/rayllm_test_py311_cu128.lock | 82 ++++++++++++++++- 4 files changed, 334 insertions(+), 6 deletions(-) diff --git a/python/deplocks/llm/rayllm_py311_cpu.lock b/python/deplocks/llm/rayllm_py311_cpu.lock index 9461ae88b62b..32249f6d1995 100644 --- a/python/deplocks/llm/rayllm_py311_cpu.lock +++ b/python/deplocks/llm/rayllm_py311_cpu.lock @@ -149,6 +149,12 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa billiard==4.2.1 \ 
--hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb @@ -572,6 +578,12 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # ray +decorator==5.1.1 \ + --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ + --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa depyf==0.19.0 \ --hash=sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5 \ --hash=sha256:afed0916b32d141cc90fa6220df01885eda442ca43b297d5050eeb90b4a5cb44 @@ -1229,6 +1241,13 @@ jiter==0.8.2 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # openai +joblib==1.5.2 \ + --hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa + # scikit-learn jsonref==1.1.0 \ --hash=sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552 \ --hash=sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9 @@ -1267,7 +1286,15 @@ lazy-loader==0.4 \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa # scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # -r python/requirements/llm/llm-requirements.txt + # vllm llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ --hash=sha256:5e6f6cec9c6648164062f0347262b3ec7c39f54d1be5c5347d6446bc7fdba115 \ @@ -1544,6 +1571,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -1746,6 +1774,7 @@ numba==0.61.2 \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ @@ -1791,12 +1820,14 @@ numpy==1.26.4 \ # gguf # gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -1944,6 +1975,7 @@ packaging==23.0 \ # kombu # lazy-loader # lm-format-enforcer + # pooch # ray # scikit-image # tensorboardx @@ -2067,7 +2099,14 @@ platformdirs==3.11.0 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # pooch # virtualenv +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ 
--hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 @@ -2919,6 +2958,7 @@ requests==2.32.3 \ # google-api-core # huggingface-hub # mistral-common + # pooch # ray # tiktoken # transformers @@ -3089,6 +3129,41 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + --hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + --hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + --hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + --hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + --hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + --hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -3118,7 +3193,9 @@ scipy==1.11.4 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm sentencepiece==0.2.0 \ --hash=sha256:0461324897735512a32d222e3d886e24ad6a499761952b6bda2a9ee6e4313ea5 \ @@ -3317,7 +3394,10 @@ soundfile==0.13.1 \ 
--hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # -r python/requirements/llm/llm-requirements.txt + # librosa # mistral-common + # vllm soxr==0.5.0.post1 \ --hash=sha256:39e0f791ba178d69cd676485dbee37e75a34f20daa478d90341ecb7f6d9d690f \ --hash=sha256:4704ba6b13a3f1e41d12acf192878384c1c31f71ce606829c64abdf64a8d7d32 \ @@ -3342,6 +3422,7 @@ soxr==0.5.0.post1 \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa # mistral-common starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ @@ -3363,6 +3444,12 @@ tensorboardx==2.6.2.2 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt +threadpoolctl==3.6.0 \ + --hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -3518,6 +3605,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api diff --git a/python/deplocks/llm/rayllm_py311_cu128.lock b/python/deplocks/llm/rayllm_py311_cu128.lock index 8445dd9c5354..8caae4a2ded5 100644 --- a/python/deplocks/llm/rayllm_py311_cu128.lock +++ b/python/deplocks/llm/rayllm_py311_cu128.lock @@ -149,6 +149,12 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa billiard==4.2.1 \ --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb @@ -572,6 +578,12 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # ray +decorator==5.1.1 \ + --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ + --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa depyf==0.19.0 \ --hash=sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5 \ --hash=sha256:afed0916b32d141cc90fa6220df01885eda442ca43b297d5050eeb90b4a5cb44 @@ -1230,6 +1242,13 @@ jiter==0.10.0 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # openai +joblib==1.5.2 \ + --hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa + # scikit-learn jsonref==1.1.0 \ --hash=sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552 \ --hash=sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9 @@ -1268,7 +1287,15 @@ lazy-loader==0.4 \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa # 
scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # -r python/requirements/llm/llm-requirements.txt + # vllm llguidance==0.7.29 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:17fd439957d6ca5f459d0dec755a2d040c2dc946ed7e3c332b469ef6861292f8 \ --hash=sha256:1d30a76b30b646ac7f9025d262665f62bdbf2d43698115eeb1119c6ee062a36f \ @@ -1509,6 +1536,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -1710,6 +1738,7 @@ numba==0.61.2 \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ @@ -1755,12 +1784,14 @@ numpy==1.26.4 \ # gguf # gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -1984,6 +2015,7 @@ packaging==23.0 \ # kombu # lazy-loader # lm-format-enforcer + # pooch # ray # scikit-image # tensorboardx @@ -2107,7 +2139,14 @@ platformdirs==3.11.0 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # pooch # virtualenv +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 @@ -2959,6 +2998,7 @@ requests==2.32.3 \ # google-api-core # huggingface-hub # mistral-common + # pooch # ray # tiktoken # transformers @@ -3129,6 +3169,41 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + --hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + --hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + --hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + 
--hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + --hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + --hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -3158,7 +3233,9 @@ scipy==1.11.4 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm sentencepiece==0.2.0 \ --hash=sha256:0461324897735512a32d222e3d886e24ad6a499761952b6bda2a9ee6e4313ea5 \ @@ -3357,7 +3434,10 @@ soundfile==0.13.1 \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # -r python/requirements/llm/llm-requirements.txt + # librosa # mistral-common + # vllm soxr==0.5.0.post1 \ --hash=sha256:39e0f791ba178d69cd676485dbee37e75a34f20daa478d90341ecb7f6d9d690f \ --hash=sha256:4704ba6b13a3f1e41d12acf192878384c1c31f71ce606829c64abdf64a8d7d32 \ @@ -3382,6 +3462,7 @@ soxr==0.5.0.post1 \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa # mistral-common starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ @@ -3403,6 +3484,12 @@ tensorboardx==2.6.2.2 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt +threadpoolctl==3.6.0 \ + --hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -3547,6 +3634,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api diff --git 
a/python/deplocks/llm/rayllm_test_py311_cpu.lock b/python/deplocks/llm/rayllm_test_py311_cpu.lock index 06eec0f1fbf6..edd52e6eca9a 100644 --- a/python/deplocks/llm/rayllm_test_py311_cpu.lock +++ b/python/deplocks/llm/rayllm_test_py311_cpu.lock @@ -219,6 +219,10 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via librosa azure-common==1.1.28 \ --hash=sha256:4ac0cd3214e36b6a1b6a442686722a5d8cc449603aa833f3f0f40bda836704a3 \ --hash=sha256:5c12d3dcf4ec20599ca6b0d3e09e86e146353d443e7fcc050c9a19c1f9df20ad @@ -766,6 +770,7 @@ decorator==5.1.1 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipython + # librosa defusedxml==0.7.1 \ --hash=sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69 \ --hash=sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61 @@ -1678,6 +1683,12 @@ jmespath==1.0.1 \ # -c python/deplocks/llm/ray_test_py311_cpu.lock # boto3 # botocore +joblib==1.5.2 \ + --hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # librosa + # scikit-learn json5==0.9.14 \ --hash=sha256:740c7f1b9e584a468dbb2939d8d458db3427f2c93ae2139d05f47e453eae964f \ --hash=sha256:9ed66c3a6ca3510a976a9ef9b8c0787de24802724ab1860bc0153c7fdd589b02 @@ -1821,7 +1832,14 @@ lazy-loader==0.4 \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/ray_test_py311_cpu.lock + # librosa # scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via + # -r python/requirements/llm/llm-requirements.txt + # vllm llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ --hash=sha256:5e6f6cec9c6648164062f0347262b3ec7c39f54d1be5c5347d6446bc7fdba115 \ @@ -2223,6 +2241,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -2472,7 +2491,9 @@ numba==0.61.2 \ --hash=sha256:cf9f9fc00d6eca0c23fc840817ce9f439b9f03c8f03d6246c0e7f0cb15b7162a \ --hash=sha256:ea0247617edcb5dd61f6106a56255baab031acc4257bddaeddb3a1003b4ca3fd \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 - # via vllm + # via + # librosa + # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ @@ -2517,12 +2538,14 @@ numpy==1.26.4 \ # gguf # gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -2680,6 +2703,7 @@ packaging==23.0 \ # lazy-loader # lm-format-enforcer # nbconvert + # pooch # pytest # ray # scikit-image @@ -2835,6 +2859,7 @@ platformdirs==3.11.0 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-core + # pooch # virtualenv pluggy==1.3.0 \ --hash=sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12 \ @@ -2842,6 
+2867,10 @@ pluggy==1.3.0 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # pytest +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via librosa portalocker==2.8.2 \ --hash=sha256:2b035aa7828e46c58e9b31390ee1f169b98e1066ab10b9a6a861fe7e25ee4f33 \ --hash=sha256:cfb86acc09b9aa7c3b43594e19be1345b9d16af3feb08bf92f23d4dce513a28e @@ -3804,6 +3833,7 @@ requests==2.32.3 \ # jupyterlab-server # mistral-common # msal + # pooch # ray # smart-open # sphinx @@ -3996,6 +4026,39 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + --hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + --hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + --hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + --hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + --hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + --hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -4025,7 +4088,9 @@ scipy==1.11.4 \ # via 
# -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm send2trash==1.8.3 \ --hash=sha256:0c31227e0bd08961c7665474a3d1ef7193929fedda4233843689baa056be46c9 \ @@ -4246,7 +4311,11 @@ soundfile==0.13.1 \ --hash=sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445 \ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 - # via mistral-common + # via + # -r python/requirements/llm/llm-requirements.txt + # librosa + # mistral-common + # vllm soupsieve==2.5 \ --hash=sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690 \ --hash=sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7 @@ -4275,7 +4344,9 @@ soxr==0.5.0.post1 \ --hash=sha256:fa0a382fb8d8e2afed2c1642723b2d2d1b9a6728ff89f77f3524034c8885b8c9 \ --hash=sha256:fcc049b0a151a65aa75b92f0ac64bb2dba785d16b78c31c2b94e68c141751d6d \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 - # via mistral-common + # via + # librosa + # mistral-common sphinx==6.2.1 \ --hash=sha256:6d56a34697bb749ffa0152feafc4b19836c755d90a7c59b72bc7dfd371b9cc6b \ --hash=sha256:97787ff1fa3256a3eef9eda523a63dbf299f7b47e053cfcf684a1c2a8380c912 @@ -4354,6 +4425,10 @@ terminado==0.18.1 \ # jupyter-server # nbclassic # notebook +threadpoolctl==3.6.0 \ + --hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -4585,6 +4660,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api diff --git a/python/deplocks/llm/rayllm_test_py311_cu128.lock b/python/deplocks/llm/rayllm_test_py311_cu128.lock index 34a7a94ed149..4cda7fdfdced 100644 --- a/python/deplocks/llm/rayllm_test_py311_cu128.lock +++ b/python/deplocks/llm/rayllm_test_py311_cu128.lock @@ -219,6 +219,10 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via librosa azure-common==1.1.28 \ --hash=sha256:4ac0cd3214e36b6a1b6a442686722a5d8cc449603aa833f3f0f40bda836704a3 \ --hash=sha256:5c12d3dcf4ec20599ca6b0d3e09e86e146353d443e7fcc050c9a19c1f9df20ad @@ -765,6 +769,7 @@ decorator==5.1.1 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipython + # librosa defusedxml==0.7.1 \ --hash=sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69 \ --hash=sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61 @@ -1678,6 +1683,12 @@ jmespath==1.0.1 \ # -c python/deplocks/llm/ray_test_py311_cu128.lock # boto3 # botocore +joblib==1.5.2 \ + --hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # librosa + # scikit-learn json5==0.9.14 \ --hash=sha256:740c7f1b9e584a468dbb2939d8d458db3427f2c93ae2139d05f47e453eae964f \ --hash=sha256:9ed66c3a6ca3510a976a9ef9b8c0787de24802724ab1860bc0153c7fdd589b02 @@ -1821,7 +1832,14 @@ lazy-loader==0.4 \ 
--hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/ray_test_py311_cu128.lock + # librosa # scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via + # -r python/requirements/llm/llm-requirements.txt + # vllm llguidance==0.7.29 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:17fd439957d6ca5f459d0dec755a2d040c2dc946ed7e3c332b469ef6861292f8 \ --hash=sha256:1d30a76b30b646ac7f9025d262665f62bdbf2d43698115eeb1119c6ee062a36f \ @@ -2187,6 +2205,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -2435,7 +2454,9 @@ numba==0.61.2 \ --hash=sha256:cf9f9fc00d6eca0c23fc840817ce9f439b9f03c8f03d6246c0e7f0cb15b7162a \ --hash=sha256:ea0247617edcb5dd61f6106a56255baab031acc4257bddaeddb3a1003b4ca3fd \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 - # via vllm + # via + # librosa + # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ @@ -2480,12 +2501,14 @@ numpy==1.26.4 \ # gguf # gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -2694,6 +2717,7 @@ packaging==23.0 \ # lazy-loader # lm-format-enforcer # nbconvert + # pooch # pytest # ray # scikit-image @@ -2849,6 +2873,7 @@ platformdirs==3.11.0 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-core + # pooch # virtualenv pluggy==1.3.0 \ --hash=sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12 \ @@ -2856,6 +2881,10 @@ pluggy==1.3.0 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # pytest +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via librosa portalocker==2.8.2 \ --hash=sha256:2b035aa7828e46c58e9b31390ee1f169b98e1066ab10b9a6a861fe7e25ee4f33 \ --hash=sha256:cfb86acc09b9aa7c3b43594e19be1345b9d16af3feb08bf92f23d4dce513a28e @@ -3818,6 +3847,7 @@ requests==2.32.3 \ # jupyterlab-server # mistral-common # msal + # pooch # ray # smart-open # sphinx @@ -4010,6 +4040,39 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + --hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + --hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + 
--hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + --hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + --hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + --hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -4039,7 +4102,9 @@ scipy==1.11.4 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm send2trash==1.8.3 \ --hash=sha256:0c31227e0bd08961c7665474a3d1ef7193929fedda4233843689baa056be46c9 \ @@ -4260,7 +4325,11 @@ soundfile==0.13.1 \ --hash=sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445 \ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 - # via mistral-common + # via + # -r python/requirements/llm/llm-requirements.txt + # librosa + # mistral-common + # vllm soupsieve==2.5 \ --hash=sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690 \ --hash=sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7 @@ -4289,7 +4358,9 @@ soxr==0.5.0.post1 \ --hash=sha256:fa0a382fb8d8e2afed2c1642723b2d2d1b9a6728ff89f77f3524034c8885b8c9 \ --hash=sha256:fcc049b0a151a65aa75b92f0ac64bb2dba785d16b78c31c2b94e68c141751d6d \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 - # via mistral-common + # via + # librosa + # mistral-common sphinx==6.2.1 \ --hash=sha256:6d56a34697bb749ffa0152feafc4b19836c755d90a7c59b72bc7dfd371b9cc6b \ --hash=sha256:97787ff1fa3256a3eef9eda523a63dbf299f7b47e053cfcf684a1c2a8380c912 @@ -4368,6 +4439,10 @@ terminado==0.18.1 \ # jupyter-server # nbclassic # notebook +threadpoolctl==3.6.0 \ + 
--hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -4589,6 +4664,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api From 5f8edde4f1c18d24e50f4fa247b6bcf84064915d Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 19 Oct 2025 21:15:30 +0530 Subject: [PATCH 23/33] doc updates Signed-off-by: DPatel_7 --- .../llm/user-guides/vllm-compatibility.md | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md index 846fc79720c3..8a7219ed762d 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -80,6 +80,69 @@ curl -X POST http://localhost:8000/v1/embeddings \ :::: + +## Transcriptions + +You can generate audio transcriptions for Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. +Models supporting this use case are listed at +`vLLM transcription models `_. + + +### Deploy an transcription model + +::::{tab-set} + +:::{tab-item} Python +:sync: python + +```{literalinclude} ../../llm/doc_code/serve/transcription/transcription_example.py +language: python +:start-after: __transcription_example_start__ +:end-before: __transcription_example_end__ +``` +::: + +:::{tab-item} Python Client +:sync: client + +```python +from openai import OpenAI + +# Initialize client +client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key") + +# Open audio file +file = open("/path/to/audio.wav", "rb") + +# Make a request to the desired lora checkpoint +response = client.audio.transcriptions.create( + model="whisper-large", + file=file, + temperature=0.0, + language="en", +) + +print(response.text) +``` +::: + +:::{tab-item} cURL +:sync: curl + +```bash +curl http://localhost:8000/v1/audio/transcriptions \ + -X POST \ + -H "Authorization: Bearer fake-key" \ + -F "file=@/path/to/audio.wav" \ + -F "model=whisper-large" \ + -F "temperature=0.0" \ + -F "language=en" +``` +::: + +:::: + + ## Structured output You can request structured JSON output similar to OpenAI's API using JSON mode or JSON schema validation with Pydantic models. 
From b2f92d9fb8fde02a2e1bf078689483c6389eea94 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 19 Oct 2025 21:55:39 +0530 Subject: [PATCH 24/33] doc fix Signed-off-by: DPatel_7 --- doc/source/serve/llm/user-guides/vllm-compatibility.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md index 8a7219ed762d..1a323f8533f0 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -95,7 +95,7 @@ Models supporting this use case are listed at :::{tab-item} Python :sync: python -```{literalinclude} ../../llm/doc_code/serve/transcription/transcription_example.py +```{literalinclude} ../../../llm/doc_code/serve/transcription/transcription_example.py language: python :start-after: __transcription_example_start__ :end-before: __transcription_example_end__ From d1087539d1911890ffe3aa9d0da3d35074998785 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 19 Oct 2025 22:22:00 +0530 Subject: [PATCH 25/33] docs fix Signed-off-by: DPatel_7 --- doc/source/serve/llm/user-guides/vllm-compatibility.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md index 1a323f8533f0..4cc5dcfe4f4b 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -83,9 +83,7 @@ curl -X POST http://localhost:8000/v1/embeddings \ ## Transcriptions -You can generate audio transcriptions for Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. -Models supporting this use case are listed at -`vLLM transcription models `_. +You can generate audio transcriptions for Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. Models supporting this use case are listed in the [vLLM transcription models documentation](https://docs.vllm.ai/en/stable/models/supported_models.html#transcription). 
### Deploy an transcription model @@ -242,7 +240,6 @@ response = client.chat.completions.create( response_format={ "type": "json_schema", "json_schema": Color.model_json_schema() - }, messages=[ { From 53b500d5f041ecf81174ab2154015b3548a1da6a Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 19 Oct 2025 22:57:03 +0530 Subject: [PATCH 26/33] docs fix Signed-off-by: DPatel_7 --- doc/source/serve/llm/user-guides/vllm-compatibility.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md index 4cc5dcfe4f4b..8a7d953a87dc 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -94,7 +94,7 @@ You can generate audio transcriptions for Speech-to-Text (STT) models trained sp :sync: python ```{literalinclude} ../../../llm/doc_code/serve/transcription/transcription_example.py -language: python +:language: python :start-after: __transcription_example_start__ :end-before: __transcription_example_end__ ``` From 29b7c34b0c16d55cb8116334cf3272db4e7f0103 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Mon, 20 Oct 2025 13:57:00 +0530 Subject: [PATCH 27/33] Code review updates and fixes Signed-off-by: DPatel_7 --- .../transcription_config_example.yaml | 29 ----------- .../transcription/transcription_example.py | 26 +++++----- .../transcription_yaml_config_example.py | 51 ------------------- .../llm/user-guides/vllm-compatibility.md | 23 ++++----- .../_internal/serve/core/ingress/ingress.py | 2 +- .../serve/engines/vllm/vllm_engine.py | 6 +-- python/requirements/llm/llm-requirements.txt | 3 -- python/setup.py | 5 +- 8 files changed, 30 insertions(+), 115 deletions(-) delete mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml delete mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml b/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml deleted file mode 100644 index eeac1ea33001..000000000000 --- a/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# config.yaml -applications: -- args: - llm_configs: - - model_loading_config: - model_id: whisper-large - model_source: openai/whisper-large-v3 - accelerator_type: A10G - deployment_config: - autoscaling_config: - min_replicas: 1 - max_replicas: 2 - log_engine_metrics: true - - model_loading_config: - model_id: voxtral-mini - model_source: mistralai/Voxtral-Mini-3B-2507 - accelerator_type: A10G - engine_kwargs: - tokenizer_mode: mistral - config_format: mistral - load_format: mistral - deployment_config: - autoscaling_config: - min_replicas: 1 - max_replicas: 2 - log_engine_metrics: true - import_path: ray.serve.llm:build_openai_app - name: llm_app - route_prefix: "/" diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py index 181d83b35d47..d82241c2ff90 100644 --- a/doc/source/llm/doc_code/serve/transcription/transcription_example.py +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -105,18 +105,18 @@ def _testing_build_openai_app(llm_serving_args): f.write(response.content) client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key") -file = open("audio.wav", "rb") - -try: - 
response = client.audio.transcriptions.create( - model="whisper-large", - file=file, - temperature=0.0, - language="en", - ) -except Exception as e: - raise AssertionError( - f"Error while querying models: {e}. Check the logs for more details." - ) + +with open("audio.wav", "rb") as f: + try: + response = client.audio.transcriptions.create( + model="whisper-large", + file=f, + temperature=0.0, + language="en", + ) + except Exception as e: + raise AssertionError( + f"Error while querying models: {e}. Check the logs for more details." + ) serve.shutdown() diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py deleted file mode 100644 index e37def988888..000000000000 --- a/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -This file serves as a documentation example and CI test for YAML config deployment. - -Structure: -1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. -2. Load YAML config and convert to Python using build_openai_app -3. Test validation (deployment status polling + cleanup) -""" - -import time -import os -import yaml -from ray import serve -from ray.serve.schema import ApplicationStatus -from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME -from ray.serve import llm - - -config_path = os.path.join( - os.path.dirname(__file__), "transcription_config_example.yaml" -) -with open(config_path, "r") as f: - config_dict = yaml.safe_load(f) - -llm_configs = config_dict["applications"][0]["args"]["llm_configs"] -for config in llm_configs: - config.pop("accelerator_type", None) - -app = llm.build_openai_app({"llm_configs": llm_configs}) -serve.run(app, blocking=False) - -status = ApplicationStatus.NOT_STARTED -timeout_seconds = 300 -start_time = time.time() - -while ( - status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds -): - status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status - - if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: - raise AssertionError(f"Deployment failed with status: {status}") - - time.sleep(1) - -if status != ApplicationStatus.RUNNING: - raise AssertionError( - f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" - ) - -serve.shutdown() diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md index 8a7d953a87dc..c4f4dd6e28d3 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -86,7 +86,7 @@ curl -X POST http://localhost:8000/v1/embeddings \ You can generate audio transcriptions for Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. Models supporting this use case are listed in the [vLLM transcription models documentation](https://docs.vllm.ai/en/stable/models/supported_models.html#transcription). 
-### Deploy an transcription model +### Deploy a transcription model ::::{tab-set} @@ -110,17 +110,16 @@ from openai import OpenAI client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key") # Open audio file -file = open("/path/to/audio.wav", "rb") - -# Make a request to the desired lora checkpoint -response = client.audio.transcriptions.create( - model="whisper-large", - file=file, - temperature=0.0, - language="en", -) - -print(response.text) +with open("/path/to/audio.wav", "rb") as f: + # Make a request to the transcription model + response = client.audio.transcriptions.create( + model="whisper-large", + file=f, + temperature=0.0, + language="en", + ) + + print(response.text) ``` ::: diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index e84c41c9d227..7e13f0d8cd99 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -89,7 +89,7 @@ "max_ongoing_requests": DEFAULT_MAX_ONGOING_REQUESTS, } - +# These methods correspond to functions defined in the LLMEngine class in python/ray/llm/_internal/serve/deployments/llm/llm_engine.py class CallMethod(Enum): CHAT = "chat" COMPLETIONS = "completions" diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index 2f0adea640ac..13be7465f885 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -397,7 +397,7 @@ async def chat( async for response in chat_response: if not isinstance(response, str): raise ValueError( - f"Expected create_chat_completion to return a stream of strings, got and item with type {type(response)}" + f"Expected create_chat_completion to return a stream of strings, got an item with type {type(response)}" ) yield response else: @@ -426,7 +426,7 @@ async def completions( async for response in completion_response: if not isinstance(response, str): raise ValueError( - f"Expected create_completion to return a stream of strings, got and item with type {type(response)}" + f"Expected create_completion to return a stream of strings, got an item with type {type(response)}" ) yield response else: @@ -482,7 +482,7 @@ async def transcriptions( async for response in transcription_response: if not isinstance(response, str): raise ValueError( - f"Expected create_transcription to return a stream of strings, got and item with type {type(response)}" + f"Expected create_transcription to return a stream of strings, got an item with type {type(response)}" ) yield response else: diff --git a/python/requirements/llm/llm-requirements.txt b/python/requirements/llm/llm-requirements.txt index a9b40e9b7fe6..d32e70d23f89 100644 --- a/python/requirements/llm/llm-requirements.txt +++ b/python/requirements/llm/llm-requirements.txt @@ -14,6 +14,3 @@ typer meson pybind11 hf_transfer -# Audio processing dependencies for transcription support -librosa -soundfile diff --git a/python/setup.py b/python/setup.py index 2c124ee01494..869d5dfabecf 100644 --- a/python/setup.py +++ b/python/setup.py @@ -382,10 +382,9 @@ def get_packages(self): # async-timeout is a backport of asyncio.timeout for python < 3.11 "async-timeout; python_version < '3.11'", "typer", + "meson", + "pybind11", "hf_transfer", - # Audio processing dependencies for transcription support - "librosa", - "soundfile", ] + setup_spec.extras["data"] + setup_spec.extras["serve"] From 
6d10b03a5fd436f01a26b42e03228b0189d21155 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Mon, 20 Oct 2025 13:59:35 +0530 Subject: [PATCH 28/33] lock updates Signed-off-by: DPatel_7 --- python/deplocks/llm/rayllm_py311_cpu.lock | 2 -- python/deplocks/llm/rayllm_py311_cu128.lock | 2 -- python/deplocks/llm/rayllm_test_py311_cpu.lock | 5 +---- python/deplocks/llm/rayllm_test_py311_cu128.lock | 5 +---- 4 files changed, 2 insertions(+), 12 deletions(-) diff --git a/python/deplocks/llm/rayllm_py311_cpu.lock b/python/deplocks/llm/rayllm_py311_cpu.lock index 32249f6d1995..9ad44ad117b2 100644 --- a/python/deplocks/llm/rayllm_py311_cpu.lock +++ b/python/deplocks/llm/rayllm_py311_cpu.lock @@ -1293,7 +1293,6 @@ librosa==0.11.0 \ --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock - # -r python/requirements/llm/llm-requirements.txt # vllm llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ @@ -3394,7 +3393,6 @@ soundfile==0.13.1 \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock - # -r python/requirements/llm/llm-requirements.txt # librosa # mistral-common # vllm diff --git a/python/deplocks/llm/rayllm_py311_cu128.lock b/python/deplocks/llm/rayllm_py311_cu128.lock index 8caae4a2ded5..a7f14ffe377e 100644 --- a/python/deplocks/llm/rayllm_py311_cu128.lock +++ b/python/deplocks/llm/rayllm_py311_cu128.lock @@ -1294,7 +1294,6 @@ librosa==0.11.0 \ --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock - # -r python/requirements/llm/llm-requirements.txt # vllm llguidance==0.7.29 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:17fd439957d6ca5f459d0dec755a2d040c2dc946ed7e3c332b469ef6861292f8 \ @@ -3434,7 +3433,6 @@ soundfile==0.13.1 \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock - # -r python/requirements/llm/llm-requirements.txt # librosa # mistral-common # vllm diff --git a/python/deplocks/llm/rayllm_test_py311_cpu.lock b/python/deplocks/llm/rayllm_test_py311_cpu.lock index edd52e6eca9a..d5f4c289ab74 100644 --- a/python/deplocks/llm/rayllm_test_py311_cpu.lock +++ b/python/deplocks/llm/rayllm_test_py311_cpu.lock @@ -1837,9 +1837,7 @@ lazy-loader==0.4 \ librosa==0.11.0 \ --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 - # via - # -r python/requirements/llm/llm-requirements.txt - # vllm + # via vllm llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ --hash=sha256:5e6f6cec9c6648164062f0347262b3ec7c39f54d1be5c5347d6446bc7fdba115 \ @@ -4312,7 +4310,6 @@ soundfile==0.13.1 \ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via - # -r python/requirements/llm/llm-requirements.txt # librosa # mistral-common # vllm diff --git a/python/deplocks/llm/rayllm_test_py311_cu128.lock 
b/python/deplocks/llm/rayllm_test_py311_cu128.lock index 4cda7fdfdced..ab9931f03ebf 100644 --- a/python/deplocks/llm/rayllm_test_py311_cu128.lock +++ b/python/deplocks/llm/rayllm_test_py311_cu128.lock @@ -1837,9 +1837,7 @@ lazy-loader==0.4 \ librosa==0.11.0 \ --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 - # via - # -r python/requirements/llm/llm-requirements.txt - # vllm + # via vllm llguidance==0.7.29 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:17fd439957d6ca5f459d0dec755a2d040c2dc946ed7e3c332b469ef6861292f8 \ --hash=sha256:1d30a76b30b646ac7f9025d262665f62bdbf2d43698115eeb1119c6ee062a36f \ @@ -4326,7 +4324,6 @@ soundfile==0.13.1 \ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via - # -r python/requirements/llm/llm-requirements.txt # librosa # mistral-common # vllm From 6df59eba72ccc40c203a1b56d47fc69f7e7f6cf1 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Mon, 20 Oct 2025 23:47:59 +0530 Subject: [PATCH 29/33] yaml tests for bazel Signed-off-by: DPatel_7 --- .../transcription_config_example.yaml | 29 +++++++++++ .../transcription_yaml_config_example.py | 51 +++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml create mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml b/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml new file mode 100644 index 000000000000..eeac1ea33001 --- /dev/null +++ b/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml @@ -0,0 +1,29 @@ +# config.yaml +applications: +- args: + llm_configs: + - model_loading_config: + model_id: whisper-large + model_source: openai/whisper-large-v3 + accelerator_type: A10G + deployment_config: + autoscaling_config: + min_replicas: 1 + max_replicas: 2 + log_engine_metrics: true + - model_loading_config: + model_id: voxtral-mini + model_source: mistralai/Voxtral-Mini-3B-2507 + accelerator_type: A10G + engine_kwargs: + tokenizer_mode: mistral + config_format: mistral + load_format: mistral + deployment_config: + autoscaling_config: + min_replicas: 1 + max_replicas: 2 + log_engine_metrics: true + import_path: ray.serve.llm:build_openai_app + name: llm_app + route_prefix: "/" diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py new file mode 100644 index 000000000000..e37def988888 --- /dev/null +++ b/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py @@ -0,0 +1,51 @@ +""" +This file serves as a documentation example and CI test for YAML config deployment. + +Structure: +1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. +2. Load YAML config and convert to Python using build_openai_app +3. 
Test validation (deployment status polling + cleanup) +""" + +import time +import os +import yaml +from ray import serve +from ray.serve.schema import ApplicationStatus +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME +from ray.serve import llm + + +config_path = os.path.join( + os.path.dirname(__file__), "transcription_config_example.yaml" +) +with open(config_path, "r") as f: + config_dict = yaml.safe_load(f) + +llm_configs = config_dict["applications"][0]["args"]["llm_configs"] +for config in llm_configs: + config.pop("accelerator_type", None) + +app = llm.build_openai_app({"llm_configs": llm_configs}) +serve.run(app, blocking=False) + +status = ApplicationStatus.NOT_STARTED +timeout_seconds = 300 +start_time = time.time() + +while ( + status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds +): + status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status + + if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: + raise AssertionError(f"Deployment failed with status: {status}") + + time.sleep(1) + +if status != ApplicationStatus.RUNNING: + raise AssertionError( + f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" + ) + +serve.shutdown() From 288ff91e5d030b1e2de900a9aba1152bcb9a68a7 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Tue, 21 Oct 2025 22:54:03 +0530 Subject: [PATCH 30/33] removed .yaml doc code example and tests Signed-off-by: DPatel_7 --- doc/BUILD.bazel | 2 +- .../transcription_config_example.yaml | 29 ----------- .../transcription_yaml_config_example.py | 51 ------------------- 3 files changed, 1 insertion(+), 81 deletions(-) delete mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml delete mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py diff --git a/doc/BUILD.bazel b/doc/BUILD.bazel index 830d0e95d357..26df9d2197af 100644 --- a/doc/BUILD.bazel +++ b/doc/BUILD.bazel @@ -348,7 +348,7 @@ py_test_run_all_subdirectory( include = ["source/llm/doc_code/serve/**/*.py"], exclude = [], extra_srcs = [], - data = ["source/llm/doc_code/serve/qwen/llm_config_example.yaml", "source/llm/doc_code/serve/transcription/transcription_config_example.yaml"], + data = ["source/llm/doc_code/serve/qwen/llm_config_example.yaml"], tags = [ "exclusive", "gpu", diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml b/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml deleted file mode 100644 index eeac1ea33001..000000000000 --- a/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# config.yaml -applications: -- args: - llm_configs: - - model_loading_config: - model_id: whisper-large - model_source: openai/whisper-large-v3 - accelerator_type: A10G - deployment_config: - autoscaling_config: - min_replicas: 1 - max_replicas: 2 - log_engine_metrics: true - - model_loading_config: - model_id: voxtral-mini - model_source: mistralai/Voxtral-Mini-3B-2507 - accelerator_type: A10G - engine_kwargs: - tokenizer_mode: mistral - config_format: mistral - load_format: mistral - deployment_config: - autoscaling_config: - min_replicas: 1 - max_replicas: 2 - log_engine_metrics: true - import_path: ray.serve.llm:build_openai_app - name: llm_app - route_prefix: "/" diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py 
b/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py deleted file mode 100644 index e37def988888..000000000000 --- a/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -This file serves as a documentation example and CI test for YAML config deployment. - -Structure: -1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. -2. Load YAML config and convert to Python using build_openai_app -3. Test validation (deployment status polling + cleanup) -""" - -import time -import os -import yaml -from ray import serve -from ray.serve.schema import ApplicationStatus -from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME -from ray.serve import llm - - -config_path = os.path.join( - os.path.dirname(__file__), "transcription_config_example.yaml" -) -with open(config_path, "r") as f: - config_dict = yaml.safe_load(f) - -llm_configs = config_dict["applications"][0]["args"]["llm_configs"] -for config in llm_configs: - config.pop("accelerator_type", None) - -app = llm.build_openai_app({"llm_configs": llm_configs}) -serve.run(app, blocking=False) - -status = ApplicationStatus.NOT_STARTED -timeout_seconds = 300 -start_time = time.time() - -while ( - status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds -): - status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status - - if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: - raise AssertionError(f"Deployment failed with status: {status}") - - time.sleep(1) - -if status != ApplicationStatus.RUNNING: - raise AssertionError( - f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" - ) - -serve.shutdown() From 4095f754ed7222014d9c487e2b3d0a11bbd079ef Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Fri, 24 Oct 2025 00:21:38 +0530 Subject: [PATCH 31/33] review updates Signed-off-by: DPatel_7 --- .../transcription/transcription_example.py | 28 ++++--------------- .../llm/user-guides/vllm-compatibility.md | 2 +- .../_internal/serve/core/ingress/ingress.py | 2 ++ python/ray/llm/tests/serve/conftest.py | 2 +- 4 files changed, 10 insertions(+), 24 deletions(-) diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py index d82241c2ff90..832251422303 100644 --- a/doc/source/llm/doc_code/serve/transcription/transcription_example.py +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -40,31 +40,15 @@ def _testing_build_openai_app(llm_serving_args): from ray import serve from ray.serve.llm import LLMConfig, build_openai_app -whisper_llm_config = LLMConfig( +transcription_config = LLMConfig( model_loading_config={ - "model_id": "whisper-large", - "model_source": "openai/whisper-large-v3", + "model_id": "voxtral-small", + "model_source": "mistralai/Voxtral-Small-24B-2507", }, deployment_config={ "autoscaling_config": { "min_replicas": 1, - "max_replicas": 2, - } - }, - # Pass the desired accelerator type (e.g. A10G, L4, etc.) 
- accelerator_type="A10G", - log_engine_metrics=True, -) - -voxtral_llm_config = LLMConfig( - model_loading_config={ - "model_id": "voxtral-mini", - "model_source": "mistralai/Voxtral-Mini-3B-2507", - }, - deployment_config={ - "autoscaling_config": { - "min_replicas": 1, - "max_replicas": 2, + "max_replicas": 4, } }, accelerator_type="A10G", @@ -77,7 +61,7 @@ def _testing_build_openai_app(llm_serving_args): log_engine_metrics=True, ) -app = build_openai_app({"llm_configs": [whisper_llm_config, voxtral_llm_config]}) +app = build_openai_app({"llm_configs": [transcription_config]}) serve.run(app, blocking=True) # __transcription_example_end__ @@ -109,7 +93,7 @@ def _testing_build_openai_app(llm_serving_args): with open("audio.wav", "rb") as f: try: response = client.audio.transcriptions.create( - model="whisper-large", + model="voxtral-small", file=f, temperature=0.0, language="en", diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md index c4f4dd6e28d3..1c6518a8cf98 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -83,7 +83,7 @@ curl -X POST http://localhost:8000/v1/embeddings \ ## Transcriptions -You can generate audio transcriptions for Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. Models supporting this use case are listed in the [vLLM transcription models documentation](https://docs.vllm.ai/en/stable/models/supported_models.html#transcription). +You can generate audio transcriptions for Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. Models supporting this use case are listed in the [vLLM transcription models documentation](https://docs.vllm.ai/en/stable/models/supported_models.html). ### Deploy a transcription model diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index 7e13f0d8cd99..29a9e17ada4d 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -607,6 +607,8 @@ async def embeddings(self, body: EmbeddingRequest) -> Response: if isinstance(result, EmbeddingResponse): return JSONResponse(content=result.model_dump()) + # Annotated[..., Form()] is wrapper that is used to handle multiple form data, which is how audio is sent in transcription requests. + # vLLM implementation for handling transcription requests: https://github.com/vllm-project/vllm/blob/0825197bee8dea547f2ab25f48afd8aea0cd2578/vllm/entrypoints/openai/api_server.py#L839. 
async def transcriptions( self, body: Annotated[TranscriptionRequest, Form()] ) -> Response: diff --git a/python/ray/llm/tests/serve/conftest.py b/python/ray/llm/tests/serve/conftest.py index 5c7357a1823e..071e572a06f4 100644 --- a/python/ray/llm/tests/serve/conftest.py +++ b/python/ray/llm/tests/serve/conftest.py @@ -123,7 +123,7 @@ def mock_transcription_request(stream, temperature, language): from fastapi import UploadFile # Create a simple mock audio file (WAV format) - mock_audio_data = b"RIFF\x00\x00\x00\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00\x44\xac\x00\x00\x88X\x01\x00\x02\x00\x10\x00data\x00\x00\x00\x00" + mock_audio_data = b"RIFF\x00\x00\x00\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00\x44\xac\x00\x00\x88X\x01\x00\x02\x00\x10\x00data\x00\x00\x00\x00" # random byte string to test the transcription API mock_file = UploadFile( file=BytesIO(mock_audio_data), filename="test_audio.wav", From 57e323ae196946c05b80cd8f5eae9dac6443f7ac Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Fri, 24 Oct 2025 12:22:38 +0530 Subject: [PATCH 32/33] test fix Signed-off-by: DPatel_7 --- .../doc_code/serve/transcription/transcription_example.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py index 832251422303..1a980dcaac9b 100644 --- a/doc/source/llm/doc_code/serve/transcription/transcription_example.py +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -42,8 +42,8 @@ def _testing_build_openai_app(llm_serving_args): transcription_config = LLMConfig( model_loading_config={ - "model_id": "voxtral-small", - "model_source": "mistralai/Voxtral-Small-24B-2507", + "model_id": "voxtral-mini", + "model_source": "mistralai/Voxtral-Mini-3B-2507", }, deployment_config={ "autoscaling_config": { @@ -93,7 +93,7 @@ def _testing_build_openai_app(llm_serving_args): with open("audio.wav", "rb") as f: try: response = client.audio.transcriptions.create( - model="voxtral-small", + model="voxtral-mini", file=f, temperature=0.0, language="en", From 05cf83edc28c2d8ca899a4e8b116a90eed44235c Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Fri, 24 Oct 2025 14:01:47 +0530 Subject: [PATCH 33/33] doc updates Signed-off-by: DPatel_7 --- .../doc_code/serve/transcription/transcription_example.py | 4 ++-- doc/source/serve/llm/user-guides/vllm-compatibility.md | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py index 1a980dcaac9b..aed2e567146e 100644 --- a/doc/source/llm/doc_code/serve/transcription/transcription_example.py +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -40,7 +40,7 @@ def _testing_build_openai_app(llm_serving_args): from ray import serve from ray.serve.llm import LLMConfig, build_openai_app -transcription_config = LLMConfig( +llm_config = LLMConfig( model_loading_config={ "model_id": "voxtral-mini", "model_source": "mistralai/Voxtral-Mini-3B-2507", @@ -61,7 +61,7 @@ def _testing_build_openai_app(llm_serving_args): log_engine_metrics=True, ) -app = build_openai_app({"llm_configs": [transcription_config]}) +app = build_openai_app({"llm_configs": [llm_config]}) serve.run(app, blocking=True) # __transcription_example_end__ diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md 
index 1c6518a8cf98..4ec9a44b6ad4 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -83,15 +83,15 @@ curl -X POST http://localhost:8000/v1/embeddings \ ## Transcriptions -You can generate audio transcriptions for Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. Models supporting this use case are listed in the [vLLM transcription models documentation](https://docs.vllm.ai/en/stable/models/supported_models.html). +You can generate audio transcriptions using Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. Models supporting this use case are listed in the [vLLM transcription models documentation](https://docs.vllm.ai/en/stable/models/supported_models.html). ### Deploy a transcription model ::::{tab-set} -:::{tab-item} Python -:sync: python +:::{tab-item} Server +:sync: server ```{literalinclude} ../../../llm/doc_code/serve/transcription/transcription_example.py :language: python