From 8c48511aecded92b6954b9181e4eeee458c191ae Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 5 Oct 2025 02:14:41 +0530 Subject: [PATCH 01/33] initial commit for transcriptions api integration Signed-off-by: DPatel_7 --- .../serve/core/configs/openai_api_models.py | 23 +++++++++- .../_internal/serve/core/ingress/ingress.py | 35 +++++++++++---- .../_internal/serve/core/server/llm_server.py | 31 +++++++++++-- .../serve/engines/vllm/vllm_engine.py | 43 ++++++++++++++++++- python/ray/serve/llm/openai_api_models.py | 33 ++++++++++++++ 5 files changed, 152 insertions(+), 13 deletions(-) diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index ed2adf153d2c..073f5753c9de 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -21,6 +21,9 @@ EmbeddingChatRequest as vLLMEmbeddingChatRequest, EmbeddingCompletionRequest as vLLMEmbeddingCompletionRequest, EmbeddingResponse as vLLMEmbeddingResponse, + TranscriptionRequest as vLLMTranscriptionRequest, + TranscriptionResponse as vLLMTranscriptionResponse, + TranscriptionStreamResponse as vLLMTranscriptionStreamResponse, ErrorInfo as vLLMErrorInfo, ErrorResponse as vLLMErrorResponse, ScoreRequest as vLLMScoreRequest, @@ -96,6 +99,18 @@ class EmbeddingResponse(vLLMEmbeddingResponse): model_config = ConfigDict(arbitrary_types_allowed=True) +class TranscriptionRequest(vLLMTranscriptionRequest): + model_config = ConfigDict(arbitrary_types_allowed=True) + + +class TranscriptionResponse(vLLMTranscriptionResponse): + model_config = ConfigDict(arbitrary_types_allowed=True) + + +class TranscriptionStreamResponse(vLLMTranscriptionStreamResponse): + model_config = ConfigDict(arbitrary_types_allowed=True) + + class ScoreRequest(vLLMScoreRequest): model_config = ConfigDict(arbitrary_types_allowed=True) @@ -115,7 +130,7 @@ class ScoreResponse(vLLMScoreResponse): ] LLMChatResponse = Union[ - AsyncGenerator[Union[str, ChatCompletionResponse, ErrorResponse], None], + AsyncGenerator[Union[str, ChatCompletionStreamResponse, ChatCompletionResponse, ErrorResponse], None], ] LLMCompletionsResponse = Union[ @@ -124,6 +139,12 @@ class ScoreResponse(vLLMScoreResponse): ], ] +LLMTranscriptionResponse = Union[ + AsyncGenerator[ + Union[TranscriptionStreamResponse, TranscriptionResponse, ErrorResponse], None + ], +] + # TODO: remove this class class OpenAIHTTPException(Exception): def __init__( diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index cb61e1ab7a22..f6ae23c43a2d 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -40,10 +40,14 @@ CompletionStreamResponse, EmbeddingRequest, EmbeddingResponse, + TranscriptionRequest, + TranscriptionResponse, + TranscriptionStreamResponse, ErrorResponse, LLMChatResponse, LLMCompletionsResponse, LLMEmbeddingsResponse, + LLMTranscriptionResponse, LLMScoreResponse, ModelCard, ModelList, @@ -110,6 +114,7 @@ def _sanitize_chat_completion_request( StreamResponseType = Union[ ChatCompletionStreamResponse, CompletionStreamResponse, + TranscriptionStreamResponse ] BatchedStreamResponseType = List[StreamResponseType] @@ -403,7 +408,7 @@ async def _get_response( self, *, body: Union[ - CompletionRequest, ChatCompletionRequest, EmbeddingRequest, ScoreRequest + CompletionRequest, ChatCompletionRequest, 
EmbeddingRequest, TranscriptionRequest, ScoreRequest ], call_method: str, ) -> AsyncGenerator[ @@ -411,6 +416,7 @@ async def _get_response( LLMChatResponse, LLMCompletionsResponse, LLMEmbeddingsResponse, + LLMTranscriptionResponse, LLMScoreResponse, ], None, @@ -497,12 +503,15 @@ async def model_data(self, model: str) -> ModelCard: return model_data async def _process_llm_request( - self, body: Union[CompletionRequest, ChatCompletionRequest], is_chat: bool + self, body: Union[CompletionRequest, ChatCompletionRequest, TranscriptionRequest], call_method: str ) -> Response: - NoneStreamingResponseType = ( - ChatCompletionResponse if is_chat else CompletionResponse - ) - call_method = "chat" if is_chat else "completions" + + if call_method == "chat": + NoneStreamingResponseType = ChatCompletionResponse + elif call_method == "completions": + NoneStreamingResponseType = CompletionResponse + elif call_method == "transcriptions": + NoneStreamingResponseType = TranscriptionResponse async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): @@ -544,7 +553,7 @@ async def completions(self, body: CompletionRequest) -> Response: Returns: A response object with completions. """ - return await self._process_llm_request(body, is_chat=False) + return await self._process_llm_request(body, call_method="completions") async def chat(self, body: ChatCompletionRequest) -> Response: """Given a prompt, the model will return one or more predicted completions, @@ -557,7 +566,7 @@ async def chat(self, body: ChatCompletionRequest) -> Response: A response object with completions. """ - return await self._process_llm_request(body, is_chat=True) + return await self._process_llm_request(body, call_method="chat") async def embeddings(self, body: EmbeddingRequest) -> Response: """Create embeddings for the provided input. @@ -580,6 +589,16 @@ async def embeddings(self, body: EmbeddingRequest) -> Response: if isinstance(result, EmbeddingResponse): return JSONResponse(content=result.model_dump()) + + @fastapi_router_app.post("/v1/audio/transcriptions") + async def transcriptions(self, body: TranscriptionRequest) -> Response: + """Create transcription for the provided audio input. + + Returns: + A response object with transcriptins. + """ + + return await self._process_llm_request(body, call_method="transcriptions") async def score(self, body: ScoreRequest) -> Response: """Create scores for the provided text pairs. 
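As a point of reference while reviewing the new `/v1/audio/transcriptions` route added above, here is a minimal client-side sketch of calling it once a transcription-capable model is deployed behind this router. It is illustrative only and not part of the patch: the base URL, API key, model id (`whisper-large`), and local `audio.wav` file are assumptions, and it presumes the route accepts the standard OpenAI-compatible transcription request.

from openai import OpenAI

# Illustrative only: base_url, api_key, model id, and audio file are assumptions.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

with open("audio.wav", "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        model="whisper-large",  # model_id configured in the Serve LLM app
        file=audio_file,
        language="en",
        temperature=0.0,
    )

print(transcription.text)
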
diff --git a/python/ray/llm/_internal/serve/core/server/llm_server.py b/python/ray/llm/_internal/serve/core/server/llm_server.py index 263d934f0020..c3390828b947 100644 --- a/python/ray/llm/_internal/serve/core/server/llm_server.py +++ b/python/ray/llm/_internal/serve/core/server/llm_server.py @@ -49,6 +49,8 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, + TranscriptionRequest, + TranscriptionResponse, ErrorResponse, ScoreRequest, ScoreResponse, @@ -251,7 +253,7 @@ def _get_batch_interval_ms(self, stream: bool = True) -> int: async def _maybe_add_request_id_to_request( self, request: Union[ - "ChatCompletionRequest", "CompletionRequest", "EmbeddingRequest" + "ChatCompletionRequest", "CompletionRequest", "EmbeddingRequest", "TranscriptionRequest" ], ): """Add the request id to the request.""" @@ -282,6 +284,7 @@ async def _run_request( "ChatCompletionRequest", "CompletionRequest", "EmbeddingRequest", + "TranscriptionRequest" "ScoreRequest", ], *, @@ -355,7 +358,7 @@ async def embeddings( ) -> AsyncGenerator[Union[List["ErrorResponse"], "EmbeddingResponse"], None]: """Runs an embeddings request to the engine and returns the response. - Returns an AsyncGenerator over the EmbeddingResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, and embeddings. + Returns an AsyncGenerator over the EmbeddingResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, embeddings and transcriptions. Args: request: An EmbeddingRequest object. @@ -365,7 +368,29 @@ async def embeddings( """ # NOTE: Embeddings does not need batching. return await self._run_request( - request, engine_method="embeddings", batch_output_stream=False + request, + engine_method="embeddings", + batch_output_stream=False, + ) + + async def transcriptions( + self, request: "TranscriptionRequest" + ) -> AsyncGenerator[Union[List["ErrorResponse"], "TranscriptionResponse"], None]: + """Runs an transcriptions request to the engine and returns the response. + + Returns an AsyncGenerator over the TranscriptionResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, embeddings and transcriptions. + + Args: + request: An TranscriptionRequest object. + + Returns: + An AsyncGenerator over the TranscriptionResponse object. + """ + # NOTE: Embeddings does not need batching. 
+ return await self._run_request( + request, + engine_method="transcriptions", + batch_output_stream=True, ) async def score( diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index 6c422f38a48b..2acac244345f 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -22,6 +22,8 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, + TranscriptionRequest, + TranscriptionResponse, ErrorInfo, ErrorResponse, ScoreRequest, @@ -44,6 +46,7 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding + from vllm.entrypoints.openai.serving_transcription import OpenAIServingTranscription from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_score import ServingScores @@ -147,6 +150,7 @@ def __init__( self._oai_serving_chat: Optional["OpenAIServingChat"] = None self._oai_serving_completion: Optional["OpenAIServingCompletion"] = None self._oai_serving_embedding: Optional["OpenAIServingEmbedding"] = None + self._oai_serving_transcription: Optional["OpenAIServingTranscription"] = None self._oai_serving_scores: Optional["ServingScores"] = None async def start(self) -> None: @@ -208,6 +212,7 @@ async def start(self) -> None: self._oai_serving_chat = state.openai_serving_chat self._oai_serving_completion = state.openai_serving_completion self._oai_serving_embedding = state.openai_serving_embedding + self._oai_serving_transcription = state.openai_serving_transcription self._oai_serving_scores = state.openai_serving_scores self._validate_openai_serving_models() @@ -241,6 +246,11 @@ def _validate_openai_serving_embedding(self): self._oai_serving_embedding, "create_embedding" ), "oai_serving_embedding must have a create_embedding attribute" + def _validate_openai_serving_transcription(self): + assert hasattr( + self._oai_serving_transcription, "create_transcription" + ), "oai_serving_transcription must have a create_transcription attribute" + def _validate_openai_serving_scores(self): assert hasattr( self._oai_serving_scores, "create_score" @@ -351,7 +361,7 @@ async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): def _create_raw_request( self, request: Union[ - CompletionRequest, ChatCompletionRequest, EmbeddingRequest, ScoreRequest + CompletionRequest, ChatCompletionRequest, EmbeddingRequest, TranscriptionRequest, ScoreRequest ], path: str, ) -> Request: @@ -444,6 +454,37 @@ async def embeddings( else: yield EmbeddingResponse(**embedding_response.model_dump()) + async def transcription( + self, request: TranscriptionRequest + ) -> AsyncGenerator[Union[str, TranscriptionResponse, ErrorResponse], None]: + self._validate_openai_serving_transcription() + + # TODO (Kourosh): Remove when we upstream request_id attribute to vLLM. + # PR: https://github.com/vllm-project/vllm/pull/21009 + # Create a fake starlette.Request object with the x-request-id header + # so that the create_transcription API can assign the request_id properly. 
+ raw_request = self._create_raw_request(request, "/audio/transcriptions") + + transcription_response = await self._oai_serving_transcription.create_transcription( # type: ignore[attr-defined] + request, + raw_request=raw_request, + ) + + if isinstance(transcription_response, AsyncGenerator): + async for response in transcription_response: + if not isinstance(response, str): + raise ValueError( + f"Expected create_transcription to return a stream of strings, got and item with type {type(response)}" + ) + yield response + else: + if isinstance(transcription_response, VLLMErrorResponse): + yield ErrorResponse( + error=ErrorInfo(**transcription_response.error.model_dump()) + ) + else: + yield TranscriptionResponse(**transcription_response.model_dump()) + async def score( self, request: ScoreRequest ) -> AsyncGenerator[Union[ScoreResponse, ErrorResponse], None]: diff --git a/python/ray/serve/llm/openai_api_models.py b/python/ray/serve/llm/openai_api_models.py index 4b04d54dbfdd..48fe7d3bee2a 100644 --- a/python/ray/serve/llm/openai_api_models.py +++ b/python/ray/serve/llm/openai_api_models.py @@ -7,6 +7,9 @@ CompletionStreamResponse as _CompletionStreamResponse, EmbeddingRequest as _EmbeddingRequest, EmbeddingResponse as _EmbeddingResponse, + TranscriptionRequest as _TranscriptionRequest, + TranscriptionResponse as _TranscriptionResponse, + TranscriptionStreamResponse as _TranscriptionStreamResponse, ErrorResponse as _ErrorResponse, ) from ray.util.annotations import PublicAPI @@ -85,6 +88,36 @@ class EmbeddingResponse(_EmbeddingResponse): pass +@PublicAPI(stability="alpha") +class TranscriptionRequest(_TranscriptionRequest): + """TranscriptionRequest is the request body for the transcription API. + + This model is compatible with vLLM's OpenAI API models. + """ + + pass + + +@PublicAPI(stability="alpha") +class TranscriptionResponse(_TranscriptionResponse): + """TranscriptionResponse is the response body for the transcription API. + + This model is compatible with vLLM's OpenAI API models. + """ + + pass + + +@PublicAPI(stability="alpha") +class TranscriptionStreamResponse(_TranscriptionStreamResponse): + """TranscriptionStreamResponse is the response body for the transcription API. + + This model is compatible with vLLM's OpenAI API models. + """ + + pass + + @PublicAPI(stability="alpha") class ErrorResponse(_ErrorResponse): """The returned response in case of an error.""" From 1c793b3675b4f57a5417fde5b7116f8699989592 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 5 Oct 2025 02:33:24 +0530 Subject: [PATCH 02/33] naming fixes Signed-off-by: DPatel_7 --- python/ray/llm/_internal/serve/core/ingress/ingress.py | 2 +- python/ray/llm/_internal/serve/core/server/llm_server.py | 3 +-- python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index f6ae23c43a2d..dbeaa769454c 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -595,7 +595,7 @@ async def transcriptions(self, body: TranscriptionRequest) -> Response: """Create transcription for the provided audio input. Returns: - A response object with transcriptins. + A response object with transcriptions. 
""" return await self._process_llm_request(body, call_method="transcriptions") diff --git a/python/ray/llm/_internal/serve/core/server/llm_server.py b/python/ray/llm/_internal/serve/core/server/llm_server.py index c3390828b947..dec41f2dbed7 100644 --- a/python/ray/llm/_internal/serve/core/server/llm_server.py +++ b/python/ray/llm/_internal/serve/core/server/llm_server.py @@ -284,7 +284,7 @@ async def _run_request( "ChatCompletionRequest", "CompletionRequest", "EmbeddingRequest", - "TranscriptionRequest" + "TranscriptionRequest", "ScoreRequest", ], *, @@ -386,7 +386,6 @@ async def transcriptions( Returns: An AsyncGenerator over the TranscriptionResponse object. """ - # NOTE: Embeddings does not need batching. return await self._run_request( request, engine_method="transcriptions", diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index 2acac244345f..1c69e7b72ba6 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -454,7 +454,7 @@ async def embeddings( else: yield EmbeddingResponse(**embedding_response.model_dump()) - async def transcription( + async def transcriptions( self, request: TranscriptionRequest ) -> AsyncGenerator[Union[str, TranscriptionResponse, ErrorResponse], None]: self._validate_openai_serving_transcription() From 0d4039c7e160c75f722074ee83c7b69549139699 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Wed, 8 Oct 2025 02:02:24 +0530 Subject: [PATCH 03/33] ci tests for transcriptions api and docs for transcription Signed-off-by: DPatel_7 --- python/ray/llm/tests/serve/conftest.py | 26 ++++ .../cpu/deployments/llm/test_llm_engine.py | 42 +++++++ .../cpu/deployments/llm/test_llm_server.py | 55 +++++++++ .../llm/tests/serve/mocks/mock_vllm_engine.py | 113 ++++++++++++++++++ .../llm/tests/serve/utils/testing_utils.py | 78 ++++++++++++ 5 files changed, 314 insertions(+) diff --git a/python/ray/llm/tests/serve/conftest.py b/python/ray/llm/tests/serve/conftest.py index 6598fe1dff1d..ed58b4cc809e 100644 --- a/python/ray/llm/tests/serve/conftest.py +++ b/python/ray/llm/tests/serve/conftest.py @@ -16,6 +16,7 @@ CompletionRequest, EmbeddingCompletionRequest, ScoreRequest, + TranscriptionRequest, ) from ray.llm._internal.serve.engines.vllm.vllm_models import ( VLLMEngineConfig, @@ -113,6 +114,31 @@ def mock_embedding_request(dimensions): return request +@pytest.fixture +def mock_transcription_request(stream, temperature, language): + """Fixture for creating transcription requests for mock testing.""" + # Create a mock audio file for testing + from io import BytesIO + from fastapi import UploadFile + + # Create a simple mock audio file (WAV format) + mock_audio_data = b"RIFF\x00\x00\x00\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00\x44\xac\x00\x00\x88X\x01\x00\x02\x00\x10\x00data\x00\x00\x00\x00" + mock_file = UploadFile( + file=BytesIO(mock_audio_data), + filename="test_audio.wav", + content_type="audio/wav", + ) + + return TranscriptionRequest( + file=mock_file, + model=MOCK_MODEL_ID, + language=language, + temperature=temperature, + stream=stream, + prompt="", + ) + + @pytest.fixture def mock_score_request(): """Fixture for creating score requests for mock testing.""" diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py index 4b259756aae6..0755105d3886 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py 
+++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py @@ -83,6 +83,48 @@ async def test_embedding_mock_engine( async for response in engine.embeddings(request): LLMResponseValidator.validate_embedding_response(response, dimensions) + @pytest.mark.parametrize("stream", [False, True]) + @pytest.mark.parametrize("temperature", [0.0]) + @pytest.mark.parametrize("language", ["en", "hi"]) + @pytest.mark.asyncio + async def test_transcription_mock_engine( + self, + mock_llm_config, + mock_transcription_request, + stream: bool, + temperature: float, + language: Optional[str], + ): + """Test transcription API with different language and temperature, streaming and non-streaming.""" + + engine = MockVLLMEngine(mock_llm_config) + await engine.start() + + request = mock_transcription_request + response_generator = engine.transcription(request) + + print( + f"\n\n_____ TRANSCRIPTION ({'STREAMING' if stream else 'NON-STREAMING'}) language={language} temperature={temperature} _____\n\n" + ) + + if stream: + # Collect streaming chunks + chunks = [] + async for chunk in response_generator: + assert isinstance(chunk, str) + chunks.append(chunk) + + # Validate streaming response + LLMResponseValidator.validate_transcription_response( + chunks, temperature, language + ) + else: + # Validate non-streaming response + async for response in response_generator: + LLMResponseValidator.validate_transcription_response( + response, temperature, language + ) + @pytest.mark.asyncio async def test_score_mock_engine(self, mock_llm_config, mock_score_request): """Test score API for text similarity.""" diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py index 26814d6260f9..07425024b522 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py @@ -155,6 +155,61 @@ async def test_embedding_llm_server( # Validate embedding response LLMResponseValidator.validate_embedding_response(chunks[0], dimensions) + + @pytest.mark.parametrize("stream", [False, True]) + @pytest.mark.parametrize("temperature", [0.0]) + @pytest.mark.parametrize("language", ["en", "hi"]) + @pytest.mark.asyncio + async def test_transcription_llm_server( + self, + serve_handle, + mock_llm_config, + mock_transcription_request, + stream: bool, + temperature: float, + language: Optional[str], + ): + """Test transcription API from LLMServer perspective.""" + + # Create transcription request + request = mock_transcription_request + + print( + f"\n\n_____ TRANSCRIPTION SERVER ({'STREAMING' if stream else 'NON-STREAMING'}) language={language} temperature={temperature} _____\n\n" + ) + + # Get the response + batched_chunks = serve_handle.transcriptions.remote(request) + + if stream: + # Collect streaming responses + chunks = [] + async for batch in batched_chunks: + if isinstance(batch, list): + chunks.extend(batch) + else: + chunks.append(batch) + + # Check that we got responses + assert len(chunks) > 0 + + # Validate streaming response + LLMResponseValidator.validate_transcription_response( + chunks, temperature, language + ) + else: + # Collect non-streaming response + chunks = [] + async for batch in batched_chunks: + chunks.append(batch) + + # Check that we got one response + assert len(chunks) == 1 + + # Validate non-streaming response + LLMResponseValidator.validate_transcription_response( + chunks[0], temperature, language + ) @pytest.mark.asyncio async def 
test_score_llm_server( diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index 4300b4859b91..a926c429c577 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -16,9 +16,12 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, + TranscriptionRequest, + TranscriptionResponse, ErrorResponse, ScoreRequest, ScoreResponse, + TranscriptionStreamResponse, ) from ray.llm._internal.serve.core.engine.protocol import LLMEngine from ray.llm._internal.serve.utils.lora_serve_utils import LoraModelLoader @@ -137,6 +140,33 @@ async def embeddings( ) yield response + async def transcription( + self, request: TranscriptionRequest + ) -> AsyncGenerator[Union[str, TranscriptionResponse, ErrorResponse], None]: + """Mock transcription generation.""" + if not self.started: + raise RuntimeError("Engine not started") + + # Extract audio file info + audio_file = request.file + language = getattr(request, "language", "en") + temperature = getattr(request, "temperature", 0.0) + stream = getattr(request, "stream", False) + + # Generate mock transcription response + mock_transcription_text = ( + f"Mock transcription in {language} language with temperature {temperature}" + ) + + # Generate transcription response + async for response in self._generate_transcription_response( + request=request, + transcription_text=mock_transcription_text, + language=language, + temperature=temperature, + ): + yield response + async def score( self, request: ScoreRequest ) -> AsyncGenerator[Union[str, ScoreResponse, ErrorResponse], None]: @@ -314,6 +344,89 @@ async def _generate_completion_response( yield response + async def _generate_transcription_response( + self, + request: TranscriptionRequest, + transcription_text: str, + language: str, + temperature: float, + ) -> AsyncGenerator[Union[str, TranscriptionResponse], None]: + """Generate mock transcription response.""" + + request_id = request.request_id or f"transcribe-{random.randint(1000, 9999)}" + lora_prefix = ( + "" + if request.model not in self._current_lora_model + else f"[lora_model] {request.model}: " + ) + + if request.stream: + # Streaming response - return SSE formatted strings + created_time = int(asyncio.get_event_loop().time()) + model_name = getattr(request, "model", "mock-model") + + # Split transcription into words for streaming + words = transcription_text.split() + + for i, word in enumerate(words): + # Create streaming chunk + choice = { + "delta": { + "content": word + (" " if i < len(words) - 1 else ""), + }, + } + + chunk_data = { + "delta": None, + "type": None, + "logprobs": None, + "id": request_id, + "object": "transcription.chunk", + "created": created_time, + "model": model_name, + "choices": [choice], + } + + # Format as SSE + yield f"data: {json.dumps(chunk_data)}\n\n" + await asyncio.sleep(0.01) # Simulate processing time + + # Send final chunk with finish_reason + final_choice = { + "delta": { + "content": "", + "finish_reason": "stop", + "stop_reason": None, + }, + } + + final_chunk_data = { + "delta": None, + "type": None, + "logprobs": None, + "id": request_id, + "object": "transcription.chunk", + "created": created_time, + "model": model_name, + "choices": [final_choice], + } + + yield f"data: {json.dumps(final_chunk_data)}\n\n" + + # Send final [DONE] message + yield "data: [DONE]\n\n" + else: + # Non-streaming response - return response object + response = TranscriptionResponse( + 
text=transcription_text, + logprobs=None, + usage={ + "seconds": 5.0, + "type": "duration", + }, + ) + yield response + class FakeLoraModelLoader(LoraModelLoader): """Fake LoRA model loader for testing that bypasses S3 entirely.""" diff --git a/python/ray/llm/tests/serve/utils/testing_utils.py b/python/ray/llm/tests/serve/utils/testing_utils.py index c63c92921b6c..b1fd72bce525 100644 --- a/python/ray/llm/tests/serve/utils/testing_utils.py +++ b/python/ray/llm/tests/serve/utils/testing_utils.py @@ -12,6 +12,7 @@ CompletionResponse, EmbeddingResponse, ScoreResponse, + TranscriptionResponse, ) @@ -108,3 +109,80 @@ def validate_score_response(response: ScoreResponse): assert score_data.object == "score" assert isinstance(score_data.score, float) assert score_data.index == i # Index should match position in list + + @staticmethod + def validate_transcription_response( + response: Union[TranscriptionResponse, List[str]], + temperature: float, + language: Optional[str] = None, + ): + """Validate transcription responses for both streaming and non-streaming.""" + if isinstance(response, list): + # Streaming response - validate chunks + LLMResponseValidator.validate_transcription_streaming_chunks( + response, temperature, language + ) + else: + # Non-streaming response + assert isinstance(response, TranscriptionResponse) + assert hasattr(response, "text") + assert isinstance(response.text, str) + assert len(response.text) > 0 + + # Check that the response contains expected language and temperature info + expected_text = f"Mock transcription in {language} language with temperature {temperature}" + assert response.text == expected_text + + # Validate usage information + if hasattr(response, "usage"): + assert hasattr(response.usage, "seconds") + assert hasattr(response.usage, "type") + assert response.usage.seconds > 0 + assert response.usage.type == "duration" + + @staticmethod + def validate_transcription_streaming_chunks( + chunks: List[str], temperature: float, language: Optional[str] = None + ): + """Validate streaming transcription response chunks.""" + # Should have at least one chunk (transcription text) + final chunk + [DONE] + assert len(chunks) >= 3 + + # Validate each chunk except the last [DONE] chunk + transcription_chunks = [] + for chunk in chunks[:-1]: # Exclude the final [DONE] chunk + pattern = r"data: (.*)\n\n" + match = re.match(pattern, chunk) + assert match is not None + chunk_data = json.loads(match.group(1)) + + # Validate chunk structure + assert "id" in chunk_data + assert "object" in chunk_data + assert chunk_data["object"] == "transcription.chunk" + assert "delta" in chunk_data + assert chunk_data["delta"] is None + assert "type" in chunk_data + assert chunk_data["type"] is None + assert "logprobs" in chunk_data + assert chunk_data["logprobs"] is None + assert "choices" in chunk_data + assert len(chunk_data["choices"]) == 1 + + choice = chunk_data["choices"][0] + assert "delta" in choice + assert "content" in choice["delta"] + + # Collect text for final validation + if choice["delta"]["content"]: + transcription_chunks.append(choice["delta"]["content"]) + + # Validate final transcription text + full_transcription = "".join(transcription_chunks) + expected_text = ( + f"Mock transcription in {language} language with temperature {temperature}" + ) + assert full_transcription.strip() == expected_text.strip() + + # Validate final [DONE] chunk + assert chunks[-1] == "data: [DONE]\n\n" From 863de39c28892e8a64a19bd85cd6736cc31cb2f6 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: 
Wed, 8 Oct 2025 19:21:02 +0530 Subject: [PATCH 04/33] type error fix Signed-off-by: DPatel_7 --- python/ray/llm/tests/serve/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ray/llm/tests/serve/conftest.py b/python/ray/llm/tests/serve/conftest.py index ed58b4cc809e..f540de167cd0 100644 --- a/python/ray/llm/tests/serve/conftest.py +++ b/python/ray/llm/tests/serve/conftest.py @@ -126,7 +126,6 @@ def mock_transcription_request(stream, temperature, language): mock_file = UploadFile( file=BytesIO(mock_audio_data), filename="test_audio.wav", - content_type="audio/wav", ) return TranscriptionRequest( From fd611a5d321bf6806d1118e6eaf7246aa493bf40 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Wed, 8 Oct 2025 19:30:04 +0530 Subject: [PATCH 05/33] formatting updated and added engine transcription function def Signed-off-by: DPatel_7 --- .../serve/core/configs/openai_api_models.py | 10 +++--- .../_internal/serve/core/engine/protocol.py | 31 +++++++++++++++++++ .../_internal/serve/core/ingress/ingress.py | 22 +++++++------ .../_internal/serve/core/server/llm_server.py | 5 ++- .../serve/engines/vllm/vllm_engine.py | 6 +++- .../cpu/deployments/llm/test_llm_server.py | 2 +- 6 files changed, 60 insertions(+), 16 deletions(-) diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index 073f5753c9de..1c432f9a44bf 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -125,12 +125,13 @@ class ScoreResponse(vLLMScoreResponse): AsyncGenerator[Union[EmbeddingResponse, ErrorResponse], None], ] -LLMScoreResponse = Union[ - AsyncGenerator[Union[ScoreResponse, ErrorResponse], None], -] +LLMScoreResponse = Union[AsyncGenerator[Union[ScoreResponse, ErrorResponse], None],] LLMChatResponse = Union[ - AsyncGenerator[Union[str, ChatCompletionStreamResponse, ChatCompletionResponse, ErrorResponse], None], + AsyncGenerator[ + Union[str, ChatCompletionStreamResponse, ChatCompletionResponse, ErrorResponse], + None, + ], ] LLMCompletionsResponse = Union[ @@ -145,6 +146,7 @@ class ScoreResponse(vLLMScoreResponse): ], ] + # TODO: remove this class class OpenAIHTTPException(Exception): def __init__( diff --git a/python/ray/llm/_internal/serve/core/engine/protocol.py b/python/ray/llm/_internal/serve/core/engine/protocol.py index 56bcc5acf827..468157734a2c 100644 --- a/python/ray/llm/_internal/serve/core/engine/protocol.py +++ b/python/ray/llm/_internal/serve/core/engine/protocol.py @@ -14,6 +14,8 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, + TranscriptionRequest, + TranscriptionResponse, ErrorResponse, ) @@ -118,6 +120,35 @@ async def embeddings( """ pass + @abc.abstractmethod + async def transcriptions( + self, request: "TranscriptionRequest" + ) -> AsyncGenerator[Union["TranscriptionResponse", "ErrorResponse"], None]: + """Run a Transcription with the engine. + + Similar to chat and completion, this method is an async generator, + so it yields chunks of response and when it is done, it returns None. + We have the following convention: + + * In case of streaming, yield a string representing data: + \n\n for each chunk. This should be already openAI compatible, + so the higher level can just yield it to the client. + * In case of non-streaming, yield a single object of type TranscriptionResponse. + * In case of error, yield a single object of type ErrorResponse. + + Args: + request: The transcription request. 
+ + Yields: + Union[str, TranscriptionResponse, ErrorResponse]: A string + representing a chunk of the response, a TranscriptionResponse object, + or an ErrorResponse object. + + Returns: + None when the generator is done. + """ + pass + async def check_health(self) -> None: """Check the health of the engine. diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index dbeaa769454c..54b6e05590ba 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -112,9 +112,7 @@ def _sanitize_chat_completion_request( StreamResponseType = Union[ - ChatCompletionStreamResponse, - CompletionStreamResponse, - TranscriptionStreamResponse + ChatCompletionStreamResponse, CompletionStreamResponse, TranscriptionStreamResponse ] BatchedStreamResponseType = List[StreamResponseType] @@ -232,7 +230,7 @@ def make_fastapi_ingress( def _apply_openai_json_format( - response: Union[StreamResponseType, BatchedStreamResponseType] + response: Union[StreamResponseType, BatchedStreamResponseType], ) -> str: """Converts the stream response to OpenAI format. @@ -261,7 +259,7 @@ def _apply_openai_json_format( async def _peek_at_generator( - gen: AsyncGenerator[T, None] + gen: AsyncGenerator[T, None], ) -> Tuple[T, AsyncGenerator[T, None]]: # Peek at the first element first_item = await gen.__anext__() @@ -408,7 +406,11 @@ async def _get_response( self, *, body: Union[ - CompletionRequest, ChatCompletionRequest, EmbeddingRequest, TranscriptionRequest, ScoreRequest + CompletionRequest, + ChatCompletionRequest, + EmbeddingRequest, + TranscriptionRequest, + ScoreRequest, ], call_method: str, ) -> AsyncGenerator[ @@ -503,9 +505,11 @@ async def model_data(self, model: str) -> ModelCard: return model_data async def _process_llm_request( - self, body: Union[CompletionRequest, ChatCompletionRequest, TranscriptionRequest], call_method: str + self, + body: Union[CompletionRequest, ChatCompletionRequest, TranscriptionRequest], + call_method: str, ) -> Response: - + if call_method == "chat": NoneStreamingResponseType = ChatCompletionResponse elif call_method == "completions": @@ -589,7 +593,7 @@ async def embeddings(self, body: EmbeddingRequest) -> Response: if isinstance(result, EmbeddingResponse): return JSONResponse(content=result.model_dump()) - + @fastapi_router_app.post("/v1/audio/transcriptions") async def transcriptions(self, body: TranscriptionRequest) -> Response: """Create transcription for the provided audio input. 
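The engine protocol above specifies that streaming transcriptions are yielded as raw `data: ...\n\n` strings, so the new route can be consumed as a plain SSE stream. A rough consumption sketch follows; the URL, model id, audio file, the form-encoded stream flag, and the exact chunk shape (mirroring the `transcription.chunk` objects produced by the mock engine earlier in this series) are all assumptions for illustration, not part of the patch.

import json
import requests

# Illustrative only: URL, model id, audio file, and stream flag encoding are assumptions.
with open("audio.wav", "rb") as audio_file:
    response = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": ("audio.wav", audio_file, "audio/wav")},
        data={"model": "whisper-large", "stream": "true"},
        stream=True,
        timeout=60,
    )
response.raise_for_status()

for line in response.iter_lines(decode_unicode=True):
    if not line or not line.startswith("data: "):
        continue
    payload = line[len("data: "):]
    if payload == "[DONE]":
        break
    chunk = json.loads(payload)
    # Assumed chunk shape: choices[].delta.content, as in the mock engine's output.
    for choice in chunk.get("choices", []):
        print(choice.get("delta", {}).get("content", ""), end="", flush=True)
print()
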
diff --git a/python/ray/llm/_internal/serve/core/server/llm_server.py b/python/ray/llm/_internal/serve/core/server/llm_server.py index dec41f2dbed7..6328e3623b9a 100644 --- a/python/ray/llm/_internal/serve/core/server/llm_server.py +++ b/python/ray/llm/_internal/serve/core/server/llm_server.py @@ -253,7 +253,10 @@ def _get_batch_interval_ms(self, stream: bool = True) -> int: async def _maybe_add_request_id_to_request( self, request: Union[ - "ChatCompletionRequest", "CompletionRequest", "EmbeddingRequest", "TranscriptionRequest" + "ChatCompletionRequest", + "CompletionRequest", + "EmbeddingRequest", + "TranscriptionRequest", ], ): """Add the request id to the request.""" diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index 1c69e7b72ba6..6d16252955ba 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -361,7 +361,11 @@ async def resolve_lora(self, disk_lora_model: DiskMultiplexConfig): def _create_raw_request( self, request: Union[ - CompletionRequest, ChatCompletionRequest, EmbeddingRequest, TranscriptionRequest, ScoreRequest + CompletionRequest, + ChatCompletionRequest, + EmbeddingRequest, + TranscriptionRequest, + ScoreRequest, ], path: str, ) -> Request: diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py index 07425024b522..de74530d3e35 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py @@ -155,7 +155,7 @@ async def test_embedding_llm_server( # Validate embedding response LLMResponseValidator.validate_embedding_response(chunks[0], dimensions) - + @pytest.mark.parametrize("stream", [False, True]) @pytest.mark.parametrize("temperature", [0.0]) @pytest.mark.parametrize("language", ["en", "hi"]) From c55fdc9fb83549fae125d4f3d39ff335f3cccb02 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Wed, 8 Oct 2025 19:37:03 +0530 Subject: [PATCH 06/33] naming updates Signed-off-by: DPatel_7 --- .../ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py | 2 +- python/ray/llm/tests/serve/mocks/mock_vllm_engine.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py index 0755105d3886..5025b9d1d37b 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py +++ b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine.py @@ -101,7 +101,7 @@ async def test_transcription_mock_engine( await engine.start() request = mock_transcription_request - response_generator = engine.transcription(request) + response_generator = engine.transcriptions(request) print( f"\n\n_____ TRANSCRIPTION ({'STREAMING' if stream else 'NON-STREAMING'}) language={language} temperature={temperature} _____\n\n" diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index a926c429c577..8bc4a65e1114 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -140,7 +140,7 @@ async def embeddings( ) yield response - async def transcription( + async def transcriptions( self, request: TranscriptionRequest ) -> AsyncGenerator[Union[str, TranscriptionResponse, ErrorResponse], 
None]: """Mock transcription generation.""" From 7b62802a5330cab6571c65e0cdc71f97f04d3b2e Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Wed, 8 Oct 2025 20:13:08 +0530 Subject: [PATCH 07/33] lora prefix updates and code formatting Signed-off-by: DPatel_7 --- .../serve/core/configs/openai_api_models.py | 4 +++- .../llm/tests/serve/mocks/mock_vllm_engine.py | 24 ++++++++----------- .../llm/tests/serve/utils/testing_utils.py | 12 ++++++++-- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index 1c432f9a44bf..150db113d180 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -125,7 +125,9 @@ class ScoreResponse(vLLMScoreResponse): AsyncGenerator[Union[EmbeddingResponse, ErrorResponse], None], ] -LLMScoreResponse = Union[AsyncGenerator[Union[ScoreResponse, ErrorResponse], None],] +LLMScoreResponse = Union[ + AsyncGenerator[Union[ScoreResponse, ErrorResponse], None], +] LLMChatResponse = Union[ AsyncGenerator[ diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index 8bc4a65e1114..ab277691983e 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -148,22 +148,12 @@ async def transcriptions( raise RuntimeError("Engine not started") # Extract audio file info - audio_file = request.file language = getattr(request, "language", "en") temperature = getattr(request, "temperature", 0.0) - stream = getattr(request, "stream", False) - - # Generate mock transcription response - mock_transcription_text = ( - f"Mock transcription in {language} language with temperature {temperature}" - ) # Generate transcription response async for response in self._generate_transcription_response( - request=request, - transcription_text=mock_transcription_text, - language=language, - temperature=temperature, + request=request, language=language, temperature=temperature ): yield response @@ -347,7 +337,6 @@ async def _generate_completion_response( async def _generate_transcription_response( self, request: TranscriptionRequest, - transcription_text: str, language: str, temperature: float, ) -> AsyncGenerator[Union[str, TranscriptionResponse], None]: @@ -360,13 +349,20 @@ async def _generate_transcription_response( else f"[lora_model] {request.model}: " ) + # Generate mock transcription text with LoRA prefix + mock_transcription_text = ( + f"Mock transcription in {language} language with temperature {temperature}" + ) + if lora_prefix: + mock_transcription_text = f"{lora_prefix}{mock_transcription_text}" + if request.stream: # Streaming response - return SSE formatted strings created_time = int(asyncio.get_event_loop().time()) model_name = getattr(request, "model", "mock-model") # Split transcription into words for streaming - words = transcription_text.split() + words = mock_transcription_text.split() for i, word in enumerate(words): # Create streaming chunk @@ -418,7 +414,7 @@ async def _generate_transcription_response( else: # Non-streaming response - return response object response = TranscriptionResponse( - text=transcription_text, + text=mock_transcription_text, logprobs=None, usage={ "seconds": 5.0, diff --git a/python/ray/llm/tests/serve/utils/testing_utils.py b/python/ray/llm/tests/serve/utils/testing_utils.py index 
b1fd72bce525..0a8b4a95ad56 100644 --- a/python/ray/llm/tests/serve/utils/testing_utils.py +++ b/python/ray/llm/tests/serve/utils/testing_utils.py @@ -115,12 +115,13 @@ def validate_transcription_response( response: Union[TranscriptionResponse, List[str]], temperature: float, language: Optional[str] = None, + lora_model_id: str = "", ): """Validate transcription responses for both streaming and non-streaming.""" if isinstance(response, list): # Streaming response - validate chunks LLMResponseValidator.validate_transcription_streaming_chunks( - response, temperature, language + response, temperature, language, lora_model_id ) else: # Non-streaming response @@ -131,6 +132,8 @@ def validate_transcription_response( # Check that the response contains expected language and temperature info expected_text = f"Mock transcription in {language} language with temperature {temperature}" + if lora_model_id: + expected_text = f"[lora_model] {lora_model_id}: {expected_text}" assert response.text == expected_text # Validate usage information @@ -142,7 +145,10 @@ def validate_transcription_response( @staticmethod def validate_transcription_streaming_chunks( - chunks: List[str], temperature: float, language: Optional[str] = None + chunks: List[str], + temperature: float, + language: Optional[str] = None, + lora_model_id: str = "", ): """Validate streaming transcription response chunks.""" # Should have at least one chunk (transcription text) + final chunk + [DONE] @@ -182,6 +188,8 @@ def validate_transcription_streaming_chunks( expected_text = ( f"Mock transcription in {language} language with temperature {temperature}" ) + if lora_model_id: + expected_text = f"[lora_model] {lora_model_id}: {expected_text}" assert full_transcription.strip() == expected_text.strip() # Validate final [DONE] chunk From 77d162ade5594fcd7562b9a255fe3590eddea78c Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Wed, 8 Oct 2025 21:57:44 +0530 Subject: [PATCH 08/33] request_id added in transcription request Signed-off-by: DPatel_7 --- .../_internal/serve/core/configs/openai_api_models.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index 150db113d180..376f6e8b436d 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -102,6 +102,15 @@ class EmbeddingResponse(vLLMEmbeddingResponse): class TranscriptionRequest(vLLMTranscriptionRequest): model_config = ConfigDict(arbitrary_types_allowed=True) + request_id: str = Field( + default_factory=lambda: f"{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." 
+ ), + ) + class TranscriptionResponse(vLLMTranscriptionResponse): model_config = ConfigDict(arbitrary_types_allowed=True) From 8294a338b9f2a41d1f3e0ed3b661b2775cc13252 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Thu, 9 Oct 2025 22:25:01 +0530 Subject: [PATCH 09/33] modified docs for ci tests and added release test Signed-off-by: DPatel_7 --- .../transcription_config_example.yaml | 29 +++++ .../transcription/transcription_example.py | 101 ++++++++++++++++++ .../transcription_yaml_config_example.py | 51 +++++++++ .../serve/core/configs/openai_api_models.py | 2 +- .../_internal/serve/core/ingress/ingress.py | 14 ++- .../serve/test_llm_serve_integration.py | 28 +++++ 6 files changed, 219 insertions(+), 6 deletions(-) create mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml create mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_example.py create mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml b/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml new file mode 100644 index 000000000000..eeac1ea33001 --- /dev/null +++ b/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml @@ -0,0 +1,29 @@ +# config.yaml +applications: +- args: + llm_configs: + - model_loading_config: + model_id: whisper-large + model_source: openai/whisper-large-v3 + accelerator_type: A10G + deployment_config: + autoscaling_config: + min_replicas: 1 + max_replicas: 2 + log_engine_metrics: true + - model_loading_config: + model_id: voxtral-mini + model_source: mistralai/Voxtral-Mini-3B-2507 + accelerator_type: A10G + engine_kwargs: + tokenizer_mode: mistral + config_format: mistral + load_format: mistral + deployment_config: + autoscaling_config: + min_replicas: 1 + max_replicas: 2 + log_engine_metrics: true + import_path: ray.serve.llm:build_openai_app + name: llm_app + route_prefix: "/" diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py new file mode 100644 index 000000000000..6ff55a6e2272 --- /dev/null +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -0,0 +1,101 @@ +""" +This file serves as a documentation example and CI test. + +Structure: +1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. +2. Docs example (between __transcription_example_start/end__): Embedded in Sphinx docs via literalinclude. +3. 
Test validation (deployment status polling + cleanup) +""" + +import time +from ray import serve +from ray.serve.schema import ApplicationStatus +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME +from ray.serve import llm + +_original_serve_run = serve.run +_original_build_openai_app = llm.build_openai_app + + +def _non_blocking_serve_run(app, **kwargs): + """Forces blocking=False for testing""" + kwargs["blocking"] = False + return _original_serve_run(app, **kwargs) + + +def _testing_build_openai_app(llm_serving_args): + """Removes accelerator requirements for testing""" + for config in llm_serving_args["llm_configs"]: + config.accelerator_type = None + + return _original_build_openai_app(llm_serving_args) + + +serve.run = _non_blocking_serve_run +llm.build_openai_app = _testing_build_openai_app + +# __transcription_example_start__ +from ray import serve +from ray.serve.llm import LLMConfig, build_openai_app + +whisper_llm_config = LLMConfig( + model_loading_config={ + "model_id": "whisper-large", + "model_source": "openai/whisper-large-v3", + }, + deployment_config={ + "autoscaling_config": { + "min_replicas": 1, + "max_replicas": 2, + } + }, + # Pass the desired accelerator type (e.g. A10G, L4, etc.) + accelerator_type="A10G", + log_engine_metrics=True, +) + +voxtral_llm_config = LLMConfig( + model_loading_config={ + "model_id": "voxtral-mini", + "model_source": "mistralai/Voxtral-Mini-3B-2507", + }, + deployment_config={ + "autoscaling_config": { + "min_replicas": 1, + "max_replicas": 2, + } + }, + accelerator_type="A10G", + # You can customize the engine arguments (e.g. vLLM engine kwargs) + engine_kwargs={ + "tokenizer_mode": "mistral", + "config_format": "mistral", + "load_format": "mistral", + }, + log_engine_metrics=True, +) + +app = build_openai_app({"llm_configs": [whisper_llm_config, voxtral_llm_config]}) +serve.run(app, blocking=True) +# __transcription_example_end__ + +status = ApplicationStatus.NOT_STARTED +timeout_seconds = 300 +start_time = time.time() + +while ( + status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds +): + status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status + + if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: + raise AssertionError(f"Deployment failed with status: {status}") + + time.sleep(1) + +if status != ApplicationStatus.RUNNING: + raise AssertionError( + f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" + ) + +serve.shutdown() diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py new file mode 100644 index 000000000000..e37def988888 --- /dev/null +++ b/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py @@ -0,0 +1,51 @@ +""" +This file serves as a documentation example and CI test for YAML config deployment. + +Structure: +1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. +2. Load YAML config and convert to Python using build_openai_app +3. 
Test validation (deployment status polling + cleanup) +""" + +import time +import os +import yaml +from ray import serve +from ray.serve.schema import ApplicationStatus +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME +from ray.serve import llm + + +config_path = os.path.join( + os.path.dirname(__file__), "transcription_config_example.yaml" +) +with open(config_path, "r") as f: + config_dict = yaml.safe_load(f) + +llm_configs = config_dict["applications"][0]["args"]["llm_configs"] +for config in llm_configs: + config.pop("accelerator_type", None) + +app = llm.build_openai_app({"llm_configs": llm_configs}) +serve.run(app, blocking=False) + +status = ApplicationStatus.NOT_STARTED +timeout_seconds = 300 +start_time = time.time() + +while ( + status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds +): + status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status + + if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: + raise AssertionError(f"Deployment failed with status: {status}") + + time.sleep(1) + +if status != ApplicationStatus.RUNNING: + raise AssertionError( + f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" + ) + +serve.shutdown() diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index 376f6e8b436d..b7c75b8d8064 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -110,7 +110,7 @@ class TranscriptionRequest(vLLMTranscriptionRequest): "through out the inference process and return in response." ), ) - + class TranscriptionResponse(vLLMTranscriptionResponse): model_config = ConfigDict(arbitrary_types_allowed=True) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index 54b6e05590ba..7fc62a881dd7 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -1,6 +1,7 @@ import asyncio import json import sys +from enum import Enum from contextlib import asynccontextmanager from typing import ( Any, @@ -86,6 +87,10 @@ DEFAULT_INGRESS_OPTIONS = { "max_ongoing_requests": DEFAULT_MAX_ONGOING_REQUESTS, } +class CallMethod(Enum): + CHAT = "chat" + COMPLETIONS = "completions" + TRANSCRIPTIONS = "transcriptions" def _sanitize_chat_completion_request( @@ -507,14 +512,14 @@ async def model_data(self, model: str) -> ModelCard: async def _process_llm_request( self, body: Union[CompletionRequest, ChatCompletionRequest, TranscriptionRequest], - call_method: str, + call_method: CallMethod, ) -> Response: - if call_method == "chat": + if call_method == CallMethod.CHAT: NoneStreamingResponseType = ChatCompletionResponse - elif call_method == "completions": + elif call_method == CallMethod.COMPLETIONS: NoneStreamingResponseType = CompletionResponse - elif call_method == "transcriptions": + elif call_method == CallMethod.TRANSCRIPTIONS: NoneStreamingResponseType = TranscriptionResponse async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): @@ -594,7 +599,6 @@ async def embeddings(self, body: EmbeddingRequest) -> Response: if isinstance(result, EmbeddingResponse): return JSONResponse(content=result.model_dump()) - @fastapi_router_app.post("/v1/audio/transcriptions") async def transcriptions(self, body: TranscriptionRequest) -> Response: 
"""Create transcription for the provided audio input. diff --git a/release/llm_tests/serve/test_llm_serve_integration.py b/release/llm_tests/serve/test_llm_serve_integration.py index 8d1b423ba4b9..894b31d65930 100644 --- a/release/llm_tests/serve/test_llm_serve_integration.py +++ b/release/llm_tests/serve/test_llm_serve_integration.py @@ -156,6 +156,34 @@ def test_deepseek_model(model_name): time.sleep(1) +@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"]) +def test_transcription_model(model_name): + """ + Test that the transcription models can be loaded successfully. + """ + llm_config = LLMConfig( + model_loading_config=dict( + model_id=model_name, + model_source=model_name, + ), + deployment_config=dict( + autoscaling_config=dict(min_replicas=1, max_replicas=4), + ), + engine_kwargs=dict( + trust_remote_code=True, + gpu_memory_utilization=0.9, + enable_prefix_caching=True, + max_model_len=2048, + tokenizer_mode="mistral", + config_format="mistral", + load_format="mistral", + ), + ) + app = build_openai_app({"llm_configs": [llm_config]}) + serve.run(app, blocking=False) + wait_for_condition(is_default_app_running, timeout=180) + + @pytest.mark.asyncio(scope="function") @pytest.fixture def remote_model_app(request): From c5134d545a415db512fe801b8b32ebdd2f0ff0d8 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Thu, 9 Oct 2025 23:07:47 +0530 Subject: [PATCH 10/33] enum fix Signed-off-by: DPatel_7 --- python/ray/llm/_internal/serve/core/ingress/ingress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index 7fc62a881dd7..1621ea9a2b1f 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -524,7 +524,7 @@ async def _process_llm_request( async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): - gen = self._get_response(body=body, call_method=call_method) + gen = self._get_response(body=body, call_method=call_method.value) # In streaming with batching enabled, this first response can be a list of chunks. initial_response, gen = await _peek_at_generator(gen) From 2cd0ac9c25d90d3ac7049ccc4104a12028601561 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Fri, 10 Oct 2025 01:54:49 +0530 Subject: [PATCH 11/33] enum fix Signed-off-by: DPatel_7 --- python/ray/llm/_internal/serve/core/ingress/ingress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index 1621ea9a2b1f..7fc62a881dd7 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -524,7 +524,7 @@ async def _process_llm_request( async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): - gen = self._get_response(body=body, call_method=call_method.value) + gen = self._get_response(body=body, call_method=call_method) # In streaming with batching enabled, this first response can be a list of chunks. 
initial_response, gen = await _peek_at_generator(gen) From b248c9015f502a4e2d307150e72453b77e6b77ac Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sat, 11 Oct 2025 01:45:05 +0530 Subject: [PATCH 12/33] router updates Signed-off-by: DPatel_7 --- .../_internal/serve/core/ingress/ingress.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index 7fc62a881dd7..a61a8b86a674 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -87,12 +87,21 @@ DEFAULT_INGRESS_OPTIONS = { "max_ongoing_requests": DEFAULT_MAX_ONGOING_REQUESTS, } + + class CallMethod(Enum): CHAT = "chat" COMPLETIONS = "completions" TRANSCRIPTIONS = "transcriptions" +NON_STREAMING_RESPONSE_TYPES = ( + ChatCompletionResponse, + CompletionResponse, + TranscriptionResponse, +) + + def _sanitize_chat_completion_request( request: ChatCompletionRequest, ) -> ChatCompletionRequest: @@ -130,6 +139,7 @@ def _sanitize_chat_completion_request( "completions": lambda app: app.post("/v1/completions"), "chat": lambda app: app.post("/v1/chat/completions"), "embeddings": lambda app: app.post("/v1/embeddings"), + "transcriptions": lambda app: app.post("/v1/audio/transcriptions"), "score": lambda app: app.post("/v1/score"), } @@ -515,13 +525,6 @@ async def _process_llm_request( call_method: CallMethod, ) -> Response: - if call_method == CallMethod.CHAT: - NoneStreamingResponseType = ChatCompletionResponse - elif call_method == CallMethod.COMPLETIONS: - NoneStreamingResponseType = CompletionResponse - elif call_method == CallMethod.TRANSCRIPTIONS: - NoneStreamingResponseType = TranscriptionResponse - async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): gen = self._get_response(body=body, call_method=call_method) @@ -541,7 +544,7 @@ async def _process_llm_request( type=first_chunk.error.type, ) - if isinstance(first_chunk, NoneStreamingResponseType): + if isinstance(first_chunk, NON_STREAMING_RESPONSE_TYPES): # Not streaming, first chunk should be a single response return JSONResponse(content=first_chunk.model_dump()) From 92d4fdbcb136588fe21077428553c41255daf3cd Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sat, 11 Oct 2025 02:12:27 +0530 Subject: [PATCH 13/33] router fix Signed-off-by: DPatel_7 --- python/ray/llm/_internal/serve/core/ingress/ingress.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index a61a8b86a674..e1545a7e57f2 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -605,6 +605,9 @@ async def embeddings(self, body: EmbeddingRequest) -> Response: async def transcriptions(self, body: TranscriptionRequest) -> Response: """Create transcription for the provided audio input. + Args: + body: The TranscriptionRequest object. + Returns: A response object with transcriptions. 
""" From fff6dbac8c90ad7754d61f4e4a34a96c17e058ca Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sat, 11 Oct 2025 23:56:47 +0530 Subject: [PATCH 14/33] pre commit hooks run and bazel build Signed-off-by: DPatel_7 --- doc/BUILD.bazel | 2 +- .../_internal/serve/core/configs/openai_api_models.py | 6 +++--- python/ray/llm/_internal/serve/core/engine/protocol.py | 2 +- python/ray/llm/_internal/serve/core/ingress/ingress.py | 10 +++++----- .../ray/llm/_internal/serve/core/server/llm_server.py | 4 ++-- .../llm/_internal/serve/engines/vllm/vllm_engine.py | 6 +++--- python/ray/llm/tests/serve/conftest.py | 1 + python/ray/llm/tests/serve/mocks/mock_vllm_engine.py | 5 ++--- python/ray/serve/llm/openai_api_models.py | 2 +- 9 files changed, 19 insertions(+), 19 deletions(-) diff --git a/doc/BUILD.bazel b/doc/BUILD.bazel index 26df9d2197af..830d0e95d357 100644 --- a/doc/BUILD.bazel +++ b/doc/BUILD.bazel @@ -348,7 +348,7 @@ py_test_run_all_subdirectory( include = ["source/llm/doc_code/serve/**/*.py"], exclude = [], extra_srcs = [], - data = ["source/llm/doc_code/serve/qwen/llm_config_example.yaml"], + data = ["source/llm/doc_code/serve/qwen/llm_config_example.yaml", "source/llm/doc_code/serve/transcription/transcription_config_example.yaml"], tags = [ "exclusive", "gpu", diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index b7c75b8d8064..ec12e9988890 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -21,13 +21,13 @@ EmbeddingChatRequest as vLLMEmbeddingChatRequest, EmbeddingCompletionRequest as vLLMEmbeddingCompletionRequest, EmbeddingResponse as vLLMEmbeddingResponse, - TranscriptionRequest as vLLMTranscriptionRequest, - TranscriptionResponse as vLLMTranscriptionResponse, - TranscriptionStreamResponse as vLLMTranscriptionStreamResponse, ErrorInfo as vLLMErrorInfo, ErrorResponse as vLLMErrorResponse, ScoreRequest as vLLMScoreRequest, ScoreResponse as vLLMScoreResponse, + TranscriptionRequest as vLLMTranscriptionRequest, + TranscriptionResponse as vLLMTranscriptionResponse, + TranscriptionStreamResponse as vLLMTranscriptionStreamResponse, ) from vllm.utils import random_uuid diff --git a/python/ray/llm/_internal/serve/core/engine/protocol.py b/python/ray/llm/_internal/serve/core/engine/protocol.py index 468157734a2c..9270865302c3 100644 --- a/python/ray/llm/_internal/serve/core/engine/protocol.py +++ b/python/ray/llm/_internal/serve/core/engine/protocol.py @@ -14,9 +14,9 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, + ErrorResponse, TranscriptionRequest, TranscriptionResponse, - ErrorResponse, ) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index e1545a7e57f2..ebdbbd06e3a6 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -1,8 +1,8 @@ import asyncio import json import sys -from enum import Enum from contextlib import asynccontextmanager +from enum import Enum from typing import ( Any, AsyncGenerator, @@ -41,20 +41,20 @@ CompletionStreamResponse, EmbeddingRequest, EmbeddingResponse, - TranscriptionRequest, - TranscriptionResponse, - TranscriptionStreamResponse, ErrorResponse, LLMChatResponse, LLMCompletionsResponse, LLMEmbeddingsResponse, - LLMTranscriptionResponse, LLMScoreResponse, + LLMTranscriptionResponse, ModelCard, ModelList, 
OpenAIHTTPException, ScoreRequest, ScoreResponse, + TranscriptionRequest, + TranscriptionResponse, + TranscriptionStreamResponse, to_model_metadata, ) from ray.llm._internal.serve.core.ingress.middleware import ( diff --git a/python/ray/llm/_internal/serve/core/server/llm_server.py b/python/ray/llm/_internal/serve/core/server/llm_server.py index 6328e3623b9a..7f44ea501655 100644 --- a/python/ray/llm/_internal/serve/core/server/llm_server.py +++ b/python/ray/llm/_internal/serve/core/server/llm_server.py @@ -49,11 +49,11 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, - TranscriptionRequest, - TranscriptionResponse, ErrorResponse, ScoreRequest, ScoreResponse, + TranscriptionRequest, + TranscriptionResponse, ) logger = get_logger(__name__) diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index 6d16252955ba..6e065f0c8598 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -22,12 +22,12 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, - TranscriptionRequest, - TranscriptionResponse, ErrorInfo, ErrorResponse, ScoreRequest, ScoreResponse, + TranscriptionRequest, + TranscriptionResponse, ) from ray.llm._internal.serve.core.engine.protocol import LLMEngine from ray.llm._internal.serve.engines.vllm.vllm_models import ( @@ -46,9 +46,9 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding - from vllm.entrypoints.openai.serving_transcription import OpenAIServingTranscription from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_score import ServingScores + from vllm.entrypoints.openai.serving_transcription import OpenAIServingTranscription vllm = try_import("vllm") logger = get_logger(__name__) diff --git a/python/ray/llm/tests/serve/conftest.py b/python/ray/llm/tests/serve/conftest.py index f540de167cd0..5c7357a1823e 100644 --- a/python/ray/llm/tests/serve/conftest.py +++ b/python/ray/llm/tests/serve/conftest.py @@ -119,6 +119,7 @@ def mock_transcription_request(stream, temperature, language): """Fixture for creating transcription requests for mock testing.""" # Create a mock audio file for testing from io import BytesIO + from fastapi import UploadFile # Create a simple mock audio file (WAV format) diff --git a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py index ab277691983e..c23e56b5e088 100644 --- a/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py +++ b/python/ray/llm/tests/serve/mocks/mock_vllm_engine.py @@ -16,12 +16,11 @@ CompletionResponse, EmbeddingRequest, EmbeddingResponse, - TranscriptionRequest, - TranscriptionResponse, ErrorResponse, ScoreRequest, ScoreResponse, - TranscriptionStreamResponse, + TranscriptionRequest, + TranscriptionResponse, ) from ray.llm._internal.serve.core.engine.protocol import LLMEngine from ray.llm._internal.serve.utils.lora_serve_utils import LoraModelLoader diff --git a/python/ray/serve/llm/openai_api_models.py b/python/ray/serve/llm/openai_api_models.py index 48fe7d3bee2a..18603ac3deb0 100644 --- a/python/ray/serve/llm/openai_api_models.py +++ b/python/ray/serve/llm/openai_api_models.py @@ -7,10 +7,10 @@ CompletionStreamResponse as _CompletionStreamResponse, EmbeddingRequest as 
_EmbeddingRequest, EmbeddingResponse as _EmbeddingResponse, + ErrorResponse as _ErrorResponse, TranscriptionRequest as _TranscriptionRequest, TranscriptionResponse as _TranscriptionResponse, TranscriptionStreamResponse as _TranscriptionStreamResponse, - ErrorResponse as _ErrorResponse, ) from ray.util.annotations import PublicAPI From 7485e369a9c853aae27846500c174945b38b9a4a Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 12 Oct 2025 00:21:23 +0530 Subject: [PATCH 15/33] enum fixes Signed-off-by: DPatel_7 --- .../ray/llm/_internal/serve/core/ingress/ingress.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index ebdbbd06e3a6..5d7973b833ed 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -522,7 +522,7 @@ async def model_data(self, model: str) -> ModelCard: async def _process_llm_request( self, body: Union[CompletionRequest, ChatCompletionRequest, TranscriptionRequest], - call_method: CallMethod, + call_method: str, ) -> Response: async with router_request_timeout(DEFAULT_LLM_ROUTER_HTTP_TIMEOUT): @@ -565,7 +565,9 @@ async def completions(self, body: CompletionRequest) -> Response: Returns: A response object with completions. """ - return await self._process_llm_request(body, call_method="completions") + return await self._process_llm_request( + body, call_method=CallMethod.COMPLETIONS.value + ) async def chat(self, body: ChatCompletionRequest) -> Response: """Given a prompt, the model will return one or more predicted completions, @@ -578,7 +580,7 @@ async def chat(self, body: ChatCompletionRequest) -> Response: A response object with completions. """ - return await self._process_llm_request(body, call_method="chat") + return await self._process_llm_request(body, call_method=CallMethod.CHAT.value) async def embeddings(self, body: EmbeddingRequest) -> Response: """Create embeddings for the provided input. @@ -612,7 +614,9 @@ async def transcriptions(self, body: TranscriptionRequest) -> Response: A response object with transcriptions. """ - return await self._process_llm_request(body, call_method="transcriptions") + return await self._process_llm_request( + body, call_method=CallMethod.TRANSCRIPTIONS.value + ) async def score(self, body: ScoreRequest) -> Response: """Create scores for the provided text pairs. 
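A note on the router dispatch the patches above converge on: endpoints name their call method through the `CallMethod` enum, and `_process_llm_request` no longer branches per endpoint to pick a response type; it checks the first yielded chunk against `NON_STREAMING_RESPONSE_TYPES`. The following is a minimal, self-contained sketch of that pattern only. The response classes here are simplified stand-ins, not the actual Ray Serve or vLLM models.

```python
# Minimal sketch of the dispatch pattern above. The response classes are
# simplified stand-ins for the OpenAI-style models, not the real ones.
from enum import Enum
from typing import Any, Tuple


class CallMethod(Enum):
    CHAT = "chat"
    COMPLETIONS = "completions"
    TRANSCRIPTIONS = "transcriptions"


class ChatCompletionResponse:
    pass


class CompletionResponse:
    pass


class TranscriptionResponse:
    pass


NON_STREAMING_RESPONSE_TYPES: Tuple[type, ...] = (
    ChatCompletionResponse,
    CompletionResponse,
    TranscriptionResponse,
)


def render_mode(first_chunk: Any) -> str:
    """Pick a rendering strategy from the first chunk of a response generator.

    Non-streaming requests yield exactly one response object, so an isinstance
    check against the tuple is enough; anything else is treated as a stream.
    """
    if isinstance(first_chunk, NON_STREAMING_RESPONSE_TYPES):
        return "json"
    return "stream"


if __name__ == "__main__":
    assert render_mode(TranscriptionResponse()) == "json"
    assert render_mode("data: {...}") == "stream"
    print(CallMethod.TRANSCRIPTIONS.value)  # -> "transcriptions"
```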
From bea6209b5603c7d4e0704a74e924072fceff7497 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 12 Oct 2025 16:46:02 +0530 Subject: [PATCH 16/33] inconsistency fixes Signed-off-by: DPatel_7 --- .../llm/_internal/serve/core/configs/openai_api_models.py | 5 +++-- python/ray/llm/_internal/serve/core/server/llm_server.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py index ec12e9988890..9fc708ce0bc6 100644 --- a/python/ray/llm/_internal/serve/core/configs/openai_api_models.py +++ b/python/ray/llm/_internal/serve/core/configs/openai_api_models.py @@ -147,13 +147,14 @@ class ScoreResponse(vLLMScoreResponse): LLMCompletionsResponse = Union[ AsyncGenerator[ - Union[CompletionStreamResponse, CompletionResponse, ErrorResponse], None + Union[str, CompletionStreamResponse, CompletionResponse, ErrorResponse], None ], ] LLMTranscriptionResponse = Union[ AsyncGenerator[ - Union[TranscriptionStreamResponse, TranscriptionResponse, ErrorResponse], None + Union[str, TranscriptionStreamResponse, TranscriptionResponse, ErrorResponse], + None, ], ] diff --git a/python/ray/llm/_internal/serve/core/server/llm_server.py b/python/ray/llm/_internal/serve/core/server/llm_server.py index 7f44ea501655..0da17151cc12 100644 --- a/python/ray/llm/_internal/serve/core/server/llm_server.py +++ b/python/ray/llm/_internal/serve/core/server/llm_server.py @@ -378,7 +378,9 @@ async def embeddings( async def transcriptions( self, request: "TranscriptionRequest" - ) -> AsyncGenerator[Union[List["ErrorResponse"], "TranscriptionResponse"], None]: + ) -> AsyncGenerator[ + Union[List[Union[str, "ErrorResponse"]], "TranscriptionResponse"], None + ]: """Runs an transcriptions request to the engine and returns the response. Returns an AsyncGenerator over the TranscriptionResponse object. This is so that the caller can have a consistent interface across all the methods of chat, completions, embeddings and transcriptions. From 7d805283d8ca4a84b33932d9cba4f6441651f84a Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 12 Oct 2025 16:55:31 +0530 Subject: [PATCH 17/33] updates Signed-off-by: DPatel_7 --- python/ray/llm/_internal/serve/core/engine/protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/llm/_internal/serve/core/engine/protocol.py b/python/ray/llm/_internal/serve/core/engine/protocol.py index 9270865302c3..c36b8073d0da 100644 --- a/python/ray/llm/_internal/serve/core/engine/protocol.py +++ b/python/ray/llm/_internal/serve/core/engine/protocol.py @@ -123,7 +123,7 @@ async def embeddings( @abc.abstractmethod async def transcriptions( self, request: "TranscriptionRequest" - ) -> AsyncGenerator[Union["TranscriptionResponse", "ErrorResponse"], None]: + ) -> AsyncGenerator[Union[str, "TranscriptionResponse", "ErrorResponse"], None]: """Run a Transcription with the engine. 
Similar to chat and completion, this method is an async generator, From fa48092442ed6d581277b1e91d968c7cbd56e686 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Thu, 16 Oct 2025 23:28:30 +0530 Subject: [PATCH 18/33] query server doc test added and router updates Signed-off-by: DPatel_7 --- .../transcription/transcription_example.py | 18 ++++++++++++++++++ .../_internal/serve/core/ingress/ingress.py | 11 ++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py index 6ff55a6e2272..b733e2288038 100644 --- a/doc/source/llm/doc_code/serve/transcription/transcription_example.py +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -8,6 +8,8 @@ """ import time +import openai +import requests from ray import serve from ray.serve.schema import ApplicationStatus from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME @@ -98,4 +100,20 @@ def _testing_build_openai_app(llm_serving_args): f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" ) +response = requests.get("https://voiceage.com/wbsamples/in_stereo/Sports.wav") +with open("audio.wav", "wb") as f: + f.write(response.content) + +client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key") +file = open("/audio.wav", "rb") +try: + response = client.audio.transcriptions.create( + model="whisper-large", + file=file, + temperature=0.0, + language="en", + ) +except Exception as e: + raise AssertionError(f"Error while querying models: {e}. Check the logs for more details.") + serve.shutdown() diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index 5d7973b833ed..e84c41c9d227 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -4,6 +4,7 @@ from contextlib import asynccontextmanager from enum import Enum from typing import ( + Annotated, Any, AsyncGenerator, Awaitable, @@ -17,7 +18,7 @@ Union, ) -from fastapi import FastAPI, HTTPException, status +from fastapi import FastAPI, Form, HTTPException, status from fastapi.middleware.cors import CORSMiddleware from starlette.responses import JSONResponse, Response, StreamingResponse @@ -139,7 +140,9 @@ def _sanitize_chat_completion_request( "completions": lambda app: app.post("/v1/completions"), "chat": lambda app: app.post("/v1/chat/completions"), "embeddings": lambda app: app.post("/v1/embeddings"), - "transcriptions": lambda app: app.post("/v1/audio/transcriptions"), + "transcriptions": lambda app: app.post( + "/v1/audio/transcriptions", + ), "score": lambda app: app.post("/v1/score"), } @@ -604,7 +607,9 @@ async def embeddings(self, body: EmbeddingRequest) -> Response: if isinstance(result, EmbeddingResponse): return JSONResponse(content=result.model_dump()) - async def transcriptions(self, body: TranscriptionRequest) -> Response: + async def transcriptions( + self, body: Annotated[TranscriptionRequest, Form()] + ) -> Response: """Create transcription for the provided audio input. 
Args: From cf20ea5c805658e1e826f53b953f60d77a1be1b4 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sat, 18 Oct 2025 15:57:52 +0530 Subject: [PATCH 19/33] fix Signed-off-by: DPatel_7 --- .../doc_code/serve/transcription/transcription_example.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py index b733e2288038..181d83b35d47 100644 --- a/doc/source/llm/doc_code/serve/transcription/transcription_example.py +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -105,7 +105,8 @@ def _testing_build_openai_app(llm_serving_args): f.write(response.content) client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key") -file = open("/audio.wav", "rb") +file = open("audio.wav", "rb") + try: response = client.audio.transcriptions.create( model="whisper-large", @@ -114,6 +115,8 @@ def _testing_build_openai_app(llm_serving_args): language="en", ) except Exception as e: - raise AssertionError(f"Error while querying models: {e}. Check the logs for more details.") + raise AssertionError( + f"Error while querying models: {e}. Check the logs for more details." + ) serve.shutdown() From 2910796dc78a3640deef5804f691479d7a219e4a Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sat, 18 Oct 2025 17:20:31 +0530 Subject: [PATCH 20/33] create_transcription and release test fixes Signed-off-by: DPatel_7 --- python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py | 4 ++++ release/llm_tests/serve/test_llm_serve_integration.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index 6e065f0c8598..2f0adea640ac 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -469,7 +469,11 @@ async def transcriptions( # so that the create_transcription API can assign the request_id properly. 
raw_request = self._create_raw_request(request, "/audio/transcriptions") + # Extract audio data from the request file + audio_data = await request.file.read() + transcription_response = await self._oai_serving_transcription.create_transcription( # type: ignore[attr-defined] + audio_data, request, raw_request=raw_request, ) diff --git a/release/llm_tests/serve/test_llm_serve_integration.py b/release/llm_tests/serve/test_llm_serve_integration.py index 894b31d65930..03e01dc1766e 100644 --- a/release/llm_tests/serve/test_llm_serve_integration.py +++ b/release/llm_tests/serve/test_llm_serve_integration.py @@ -182,6 +182,8 @@ def test_transcription_model(model_name): app = build_openai_app({"llm_configs": [llm_config]}) serve.run(app, blocking=False) wait_for_condition(is_default_app_running, timeout=180) + serve.shutdown() + time.sleep(1) @pytest.mark.asyncio(scope="function") From 6dc2d41dd20ad4ea2ad12ab14ff5cce3fbfdadec Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sat, 18 Oct 2025 18:30:03 +0530 Subject: [PATCH 21/33] requirements updates Signed-off-by: DPatel_7 --- python/requirements/llm/llm-requirements.txt | 5 ++++- python/setup.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/python/requirements/llm/llm-requirements.txt b/python/requirements/llm/llm-requirements.txt index fe3543757e4f..a9b40e9b7fe6 100644 --- a/python/requirements/llm/llm-requirements.txt +++ b/python/requirements/llm/llm-requirements.txt @@ -2,7 +2,7 @@ # constraining to a maximum version (i.e. <=) to temporarily work around a bug. # Those pins for the sake of workarounds should not be advertised as constraints # on future releases in setup.py. -vllm>=0.11.0 +vllm[audio]>=0.11.0 nixl>=0.6.1 # For json mode jsonref>=1.1.0 @@ -14,3 +14,6 @@ typer meson pybind11 hf_transfer +# Audio processing dependencies for transcription support +librosa +soundfile diff --git a/python/setup.py b/python/setup.py index 8799f262f1fb..2c124ee01494 100644 --- a/python/setup.py +++ b/python/setup.py @@ -374,7 +374,7 @@ def get_packages(self): setup_spec.extras["llm"] = list( set( [ - "vllm>=0.11.0", + "vllm[audio]>=0.11.0", "nixl>=0.6.1", "jsonref>=1.1.0", "jsonschema", @@ -383,6 +383,9 @@ def get_packages(self): "async-timeout; python_version < '3.11'", "typer", "hf_transfer", + # Audio processing dependencies for transcription support + "librosa", + "soundfile", ] + setup_spec.extras["data"] + setup_spec.extras["serve"] From 4d97377d42ea9301a3fcc84c23603b16f7953af3 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 19 Oct 2025 14:44:51 +0530 Subject: [PATCH 22/33] lock updates Signed-off-by: DPatel_7 --- python/deplocks/llm/rayllm_py311_cpu.lock | 88 +++++++++++++++++++ python/deplocks/llm/rayllm_py311_cu128.lock | 88 +++++++++++++++++++ .../deplocks/llm/rayllm_test_py311_cpu.lock | 82 ++++++++++++++++- .../deplocks/llm/rayllm_test_py311_cu128.lock | 82 ++++++++++++++++- 4 files changed, 334 insertions(+), 6 deletions(-) diff --git a/python/deplocks/llm/rayllm_py311_cpu.lock b/python/deplocks/llm/rayllm_py311_cpu.lock index 9461ae88b62b..32249f6d1995 100644 --- a/python/deplocks/llm/rayllm_py311_cpu.lock +++ b/python/deplocks/llm/rayllm_py311_cpu.lock @@ -149,6 +149,12 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa billiard==4.2.1 \ 
--hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb @@ -572,6 +578,12 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt # ray +decorator==5.1.1 \ + --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ + --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa depyf==0.19.0 \ --hash=sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5 \ --hash=sha256:afed0916b32d141cc90fa6220df01885eda442ca43b297d5050eeb90b4a5cb44 @@ -1229,6 +1241,13 @@ jiter==0.8.2 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # openai +joblib==1.5.2 \ + --hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa + # scikit-learn jsonref==1.1.0 \ --hash=sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552 \ --hash=sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9 @@ -1267,7 +1286,15 @@ lazy-loader==0.4 \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa # scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # -r python/requirements/llm/llm-requirements.txt + # vllm llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ --hash=sha256:5e6f6cec9c6648164062f0347262b3ec7c39f54d1be5c5347d6446bc7fdba115 \ @@ -1544,6 +1571,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -1746,6 +1774,7 @@ numba==0.61.2 \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ @@ -1791,12 +1820,14 @@ numpy==1.26.4 \ # gguf # gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -1944,6 +1975,7 @@ packaging==23.0 \ # kombu # lazy-loader # lm-format-enforcer + # pooch # ray # scikit-image # tensorboardx @@ -2067,7 +2099,14 @@ platformdirs==3.11.0 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # pooch # virtualenv +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ 
--hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 @@ -2919,6 +2958,7 @@ requests==2.32.3 \ # google-api-core # huggingface-hub # mistral-common + # pooch # ray # tiktoken # transformers @@ -3089,6 +3129,41 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + --hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + --hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + --hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + --hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + --hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + --hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -3118,7 +3193,9 @@ scipy==1.11.4 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm sentencepiece==0.2.0 \ --hash=sha256:0461324897735512a32d222e3d886e24ad6a499761952b6bda2a9ee6e4313ea5 \ @@ -3317,7 +3394,10 @@ soundfile==0.13.1 \ 
--hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # -r python/requirements/llm/llm-requirements.txt + # librosa # mistral-common + # vllm soxr==0.5.0.post1 \ --hash=sha256:39e0f791ba178d69cd676485dbee37e75a34f20daa478d90341ecb7f6d9d690f \ --hash=sha256:4704ba6b13a3f1e41d12acf192878384c1c31f71ce606829c64abdf64a8d7d32 \ @@ -3342,6 +3422,7 @@ soxr==0.5.0.post1 \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # librosa # mistral-common starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ @@ -3363,6 +3444,12 @@ tensorboardx==2.6.2.2 \ # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock # -r python/requirements.txt +threadpoolctl==3.6.0 \ + --hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via + # -c python/deplocks/llm/rayllm_test_py311_cpu.lock + # scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -3518,6 +3605,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api diff --git a/python/deplocks/llm/rayllm_py311_cu128.lock b/python/deplocks/llm/rayllm_py311_cu128.lock index 8445dd9c5354..8caae4a2ded5 100644 --- a/python/deplocks/llm/rayllm_py311_cu128.lock +++ b/python/deplocks/llm/rayllm_py311_cu128.lock @@ -149,6 +149,12 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa billiard==4.2.1 \ --hash=sha256:12b641b0c539073fc8d3f5b8b7be998956665c4233c7c1fcd66a7e677c4fb36f \ --hash=sha256:40b59a4ac8806ba2c2369ea98d876bc6108b051c227baffd928c644d15d8f3cb @@ -572,6 +578,12 @@ cupy-cuda12x==13.1.0 ; sys_platform != 'darwin' \ # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt # ray +decorator==5.1.1 \ + --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ + --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa depyf==0.19.0 \ --hash=sha256:040b35fc0997d49df024b7d094f2a7836f91e9ed02f49982dd37e70aa3285ad5 \ --hash=sha256:afed0916b32d141cc90fa6220df01885eda442ca43b297d5050eeb90b4a5cb44 @@ -1230,6 +1242,13 @@ jiter==0.10.0 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # openai +joblib==1.5.2 \ + --hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa + # scikit-learn jsonref==1.1.0 \ --hash=sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552 \ --hash=sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9 @@ -1268,7 +1287,15 @@ lazy-loader==0.4 \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa # 
scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # -r python/requirements/llm/llm-requirements.txt + # vllm llguidance==0.7.29 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:17fd439957d6ca5f459d0dec755a2d040c2dc946ed7e3c332b469ef6861292f8 \ --hash=sha256:1d30a76b30b646ac7f9025d262665f62bdbf2d43698115eeb1119c6ee062a36f \ @@ -1509,6 +1536,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -1710,6 +1738,7 @@ numba==0.61.2 \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ @@ -1755,12 +1784,14 @@ numpy==1.26.4 \ # gguf # gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -1984,6 +2015,7 @@ packaging==23.0 \ # kombu # lazy-loader # lm-format-enforcer + # pooch # ray # scikit-image # tensorboardx @@ -2107,7 +2139,14 @@ platformdirs==3.11.0 \ --hash=sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # pooch # virtualenv +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa prometheus-client==0.19.0 \ --hash=sha256:4585b0d1223148c27a225b10dbec5ae9bc4c81a99a3fa80774fa6209935324e1 \ --hash=sha256:c88b1e6ecf6b41cd8fb5731c7ae919bf66df6ec6fafa555cd6c0e16ca169ae92 @@ -2959,6 +2998,7 @@ requests==2.32.3 \ # google-api-core # huggingface-hub # mistral-common + # pooch # ray # tiktoken # transformers @@ -3129,6 +3169,41 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + --hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + --hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + --hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + 
--hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + --hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + --hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -3158,7 +3233,9 @@ scipy==1.11.4 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm sentencepiece==0.2.0 \ --hash=sha256:0461324897735512a32d222e3d886e24ad6a499761952b6bda2a9ee6e4313ea5 \ @@ -3357,7 +3434,10 @@ soundfile==0.13.1 \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # -r python/requirements/llm/llm-requirements.txt + # librosa # mistral-common + # vllm soxr==0.5.0.post1 \ --hash=sha256:39e0f791ba178d69cd676485dbee37e75a34f20daa478d90341ecb7f6d9d690f \ --hash=sha256:4704ba6b13a3f1e41d12acf192878384c1c31f71ce606829c64abdf64a8d7d32 \ @@ -3382,6 +3462,7 @@ soxr==0.5.0.post1 \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # librosa # mistral-common starlette==0.46.2 \ --hash=sha256:595633ce89f8ffa71a015caed34a5b2dc1c0cdb3f0f1fbd1e69339cf2abeec35 \ @@ -3403,6 +3484,12 @@ tensorboardx==2.6.2.2 \ # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock # -r python/requirements.txt +threadpoolctl==3.6.0 \ + --hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via + # -c python/deplocks/llm/rayllm_test_py311_cu128.lock + # scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -3547,6 +3634,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api diff --git 
a/python/deplocks/llm/rayllm_test_py311_cpu.lock b/python/deplocks/llm/rayllm_test_py311_cpu.lock index 06eec0f1fbf6..edd52e6eca9a 100644 --- a/python/deplocks/llm/rayllm_test_py311_cpu.lock +++ b/python/deplocks/llm/rayllm_test_py311_cpu.lock @@ -219,6 +219,10 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via librosa azure-common==1.1.28 \ --hash=sha256:4ac0cd3214e36b6a1b6a442686722a5d8cc449603aa833f3f0f40bda836704a3 \ --hash=sha256:5c12d3dcf4ec20599ca6b0d3e09e86e146353d443e7fcc050c9a19c1f9df20ad @@ -766,6 +770,7 @@ decorator==5.1.1 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # ipython + # librosa defusedxml==0.7.1 \ --hash=sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69 \ --hash=sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61 @@ -1678,6 +1683,12 @@ jmespath==1.0.1 \ # -c python/deplocks/llm/ray_test_py311_cpu.lock # boto3 # botocore +joblib==1.5.2 \ + --hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # librosa + # scikit-learn json5==0.9.14 \ --hash=sha256:740c7f1b9e584a468dbb2939d8d458db3427f2c93ae2139d05f47e453eae964f \ --hash=sha256:9ed66c3a6ca3510a976a9ef9b8c0787de24802724ab1860bc0153c7fdd589b02 @@ -1821,7 +1832,14 @@ lazy-loader==0.4 \ --hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/ray_test_py311_cpu.lock + # librosa # scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via + # -r python/requirements/llm/llm-requirements.txt + # vllm llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ --hash=sha256:5e6f6cec9c6648164062f0347262b3ec7c39f54d1be5c5347d6446bc7fdba115 \ @@ -2223,6 +2241,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -2472,7 +2491,9 @@ numba==0.61.2 \ --hash=sha256:cf9f9fc00d6eca0c23fc840817ce9f439b9f03c8f03d6246c0e7f0cb15b7162a \ --hash=sha256:ea0247617edcb5dd61f6106a56255baab031acc4257bddaeddb3a1003b4ca3fd \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 - # via vllm + # via + # librosa + # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ @@ -2517,12 +2538,14 @@ numpy==1.26.4 \ # gguf # gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -2680,6 +2703,7 @@ packaging==23.0 \ # lazy-loader # lm-format-enforcer # nbconvert + # pooch # pytest # ray # scikit-image @@ -2835,6 +2859,7 @@ platformdirs==3.11.0 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # jupyter-core + # pooch # virtualenv pluggy==1.3.0 \ --hash=sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12 \ @@ -2842,6 
+2867,10 @@ pluggy==1.3.0 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # pytest +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via librosa portalocker==2.8.2 \ --hash=sha256:2b035aa7828e46c58e9b31390ee1f169b98e1066ab10b9a6a861fe7e25ee4f33 \ --hash=sha256:cfb86acc09b9aa7c3b43594e19be1345b9d16af3feb08bf92f23d4dce513a28e @@ -3804,6 +3833,7 @@ requests==2.32.3 \ # jupyterlab-server # mistral-common # msal + # pooch # ray # smart-open # sphinx @@ -3996,6 +4026,39 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + --hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + --hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + --hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + --hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + --hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + --hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -4025,7 +4088,9 @@ scipy==1.11.4 \ # via 
# -c python/deplocks/llm/ray_test_py311_cpu.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm send2trash==1.8.3 \ --hash=sha256:0c31227e0bd08961c7665474a3d1ef7193929fedda4233843689baa056be46c9 \ @@ -4246,7 +4311,11 @@ soundfile==0.13.1 \ --hash=sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445 \ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 - # via mistral-common + # via + # -r python/requirements/llm/llm-requirements.txt + # librosa + # mistral-common + # vllm soupsieve==2.5 \ --hash=sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690 \ --hash=sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7 @@ -4275,7 +4344,9 @@ soxr==0.5.0.post1 \ --hash=sha256:fa0a382fb8d8e2afed2c1642723b2d2d1b9a6728ff89f77f3524034c8885b8c9 \ --hash=sha256:fcc049b0a151a65aa75b92f0ac64bb2dba785d16b78c31c2b94e68c141751d6d \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 - # via mistral-common + # via + # librosa + # mistral-common sphinx==6.2.1 \ --hash=sha256:6d56a34697bb749ffa0152feafc4b19836c755d90a7c59b72bc7dfd371b9cc6b \ --hash=sha256:97787ff1fa3256a3eef9eda523a63dbf299f7b47e053cfcf684a1c2a8380c912 @@ -4354,6 +4425,10 @@ terminado==0.18.1 \ # jupyter-server # nbclassic # notebook +threadpoolctl==3.6.0 \ + --hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -4585,6 +4660,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api diff --git a/python/deplocks/llm/rayllm_test_py311_cu128.lock b/python/deplocks/llm/rayllm_test_py311_cu128.lock index 34a7a94ed149..4cda7fdfdced 100644 --- a/python/deplocks/llm/rayllm_test_py311_cu128.lock +++ b/python/deplocks/llm/rayllm_test_py311_cu128.lock @@ -219,6 +219,10 @@ attrs==25.1.0 \ # aiohttp # jsonschema # referencing +audioread==3.0.1 \ + --hash=sha256:4cdce70b8adc0da0a3c9e0d85fb10b3ace30fbdf8d1670fd443929b61d117c33 \ + --hash=sha256:ac5460a5498c48bdf2e8e767402583a4dcd13f4414d286f42ce4379e8b35066d + # via librosa azure-common==1.1.28 \ --hash=sha256:4ac0cd3214e36b6a1b6a442686722a5d8cc449603aa833f3f0f40bda836704a3 \ --hash=sha256:5c12d3dcf4ec20599ca6b0d3e09e86e146353d443e7fcc050c9a19c1f9df20ad @@ -765,6 +769,7 @@ decorator==5.1.1 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # ipython + # librosa defusedxml==0.7.1 \ --hash=sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69 \ --hash=sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61 @@ -1678,6 +1683,12 @@ jmespath==1.0.1 \ # -c python/deplocks/llm/ray_test_py311_cu128.lock # boto3 # botocore +joblib==1.5.2 \ + --hash=sha256:3faa5c39054b2f03ca547da9b2f52fde67c06240c31853f306aea97f13647b55 \ + --hash=sha256:4e1f0bdbb987e6d843c70cf43714cb276623def372df3c22fe5266b2670bc241 + # via + # librosa + # scikit-learn json5==0.9.14 \ --hash=sha256:740c7f1b9e584a468dbb2939d8d458db3427f2c93ae2139d05f47e453eae964f \ --hash=sha256:9ed66c3a6ca3510a976a9ef9b8c0787de24802724ab1860bc0153c7fdd589b02 @@ -1821,7 +1832,14 @@ lazy-loader==0.4 \ 
--hash=sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1 # via # -c python/deplocks/llm/ray_test_py311_cu128.lock + # librosa # scikit-image +librosa==0.11.0 \ + --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ + --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 + # via + # -r python/requirements/llm/llm-requirements.txt + # vllm llguidance==0.7.29 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:17fd439957d6ca5f459d0dec755a2d040c2dc946ed7e3c332b469ef6861292f8 \ --hash=sha256:1d30a76b30b646ac7f9025d262665f62bdbf2d43698115eeb1119c6ee062a36f \ @@ -2187,6 +2205,7 @@ msgpack==1.0.7 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt + # librosa # ray msgspec==0.19.0 \ --hash=sha256:00e87ecfa9795ee5214861eab8326b0e75475c2e68a384002aa135ea2a27d909 \ @@ -2435,7 +2454,9 @@ numba==0.61.2 \ --hash=sha256:cf9f9fc00d6eca0c23fc840817ce9f439b9f03c8f03d6246c0e7f0cb15b7162a \ --hash=sha256:ea0247617edcb5dd61f6106a56255baab031acc4257bddaeddb3a1003b4ca3fd \ --hash=sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2 - # via vllm + # via + # librosa + # vllm numpy==1.26.4 \ --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ @@ -2480,12 +2501,14 @@ numpy==1.26.4 \ # gguf # gymnasium # imageio + # librosa # mistral-common # nixl # numba # opencv-python-headless # pandas # scikit-image + # scikit-learn # scipy # soundfile # soxr @@ -2694,6 +2717,7 @@ packaging==23.0 \ # lazy-loader # lm-format-enforcer # nbconvert + # pooch # pytest # ray # scikit-image @@ -2849,6 +2873,7 @@ platformdirs==3.11.0 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # jupyter-core + # pooch # virtualenv pluggy==1.3.0 \ --hash=sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12 \ @@ -2856,6 +2881,10 @@ pluggy==1.3.0 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # pytest +pooch==1.8.2 \ + --hash=sha256:3529a57096f7198778a5ceefd5ac3ef0e4d06a6ddaf9fc2d609b806f25302c47 \ + --hash=sha256:76561f0de68a01da4df6af38e9955c4c9d1a5c90da73f7e40276a5728ec83d10 + # via librosa portalocker==2.8.2 \ --hash=sha256:2b035aa7828e46c58e9b31390ee1f169b98e1066ab10b9a6a861fe7e25ee4f33 \ --hash=sha256:cfb86acc09b9aa7c3b43594e19be1345b9d16af3feb08bf92f23d4dce513a28e @@ -3818,6 +3847,7 @@ requests==2.32.3 \ # jupyterlab-server # mistral-common # msal + # pooch # ray # smart-open # sphinx @@ -4010,6 +4040,39 @@ scikit-image==0.24.0 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt +scikit-learn==1.7.2 \ + --hash=sha256:0486c8f827c2e7b64837c731c8feff72c0bd2b998067a8a9cbc10643c31f0fe1 \ + --hash=sha256:0b7dacaa05e5d76759fb071558a8b5130f4845166d88654a0f9bdf3eb57851b7 \ + --hash=sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c \ + --hash=sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda \ + --hash=sha256:2a41e2a0ef45063e654152ec9d8bcfc39f7afce35b08902bfe290c2498a67a6a \ + --hash=sha256:36749fb62b3d961b1ce4fedf08fa57a1986cd409eff2d783bca5d4b9b5fce51c \ + --hash=sha256:4a847fea807e278f821a0406ca01e387f97653e284ecbd9750e3ee7c90347f18 \ + --hash=sha256:502c18e39849c0ea1a5d681af1dbcf15f6cce601aebb657aabbfe84133c1907f \ + --hash=sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973 \ + 
--hash=sha256:6088aa475f0785e01bcf8529f55280a3d7d298679f50c0bb70a2364a82d0b290 \ + --hash=sha256:63a9afd6f7b229aad94618c01c252ce9e6fa97918c5ca19c9a17a087d819440c \ + --hash=sha256:6b33579c10a3081d076ab403df4a4190da4f4432d443521674637677dc91e61f \ + --hash=sha256:7a4c328a71785382fe3fe676a9ecf2c86189249beff90bf85e22bdb7efaf9ae0 \ + --hash=sha256:7a58814265dfc52b3295b1900cfb5701589d30a8bb026c7540f1e9d3499d5ec8 \ + --hash=sha256:89877e19a80c7b11a2891a27c21c4894fb18e2c2e077815bcade10d34287b20d \ + --hash=sha256:8d91a97fa2b706943822398ab943cde71858a50245e31bc71dba62aab1d60a96 \ + --hash=sha256:8da8bf89d4d79aaec192d2bda62f9b56ae4e5b4ef93b6a56b5de4977e375c1f1 \ + --hash=sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106 \ + --hash=sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61 \ + --hash=sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c \ + --hash=sha256:9acb6c5e867447b4e1390930e3944a005e2cb115922e693c08a323421a6966e8 \ + --hash=sha256:9b7ed8d58725030568523e937c43e56bc01cadb478fc43c042a9aca1dacb3ba1 \ + --hash=sha256:abebbd61ad9e1deed54cca45caea8ad5f79e1b93173dece40bb8e0c658dbe6fe \ + --hash=sha256:acbc0f5fd2edd3432a22c69bed78e837c70cf896cd7993d71d51ba6708507476 \ + --hash=sha256:b4d6e9deed1a47aca9fe2f267ab8e8fe82ee20b4526b2c0cd9e135cea10feb44 \ + --hash=sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8 \ + --hash=sha256:c7509693451651cd7361d30ce4e86a1347493554f172b1c72a39300fa2aea79e \ + --hash=sha256:ca250e6836d10e6f402436d6463d6c0e4d8e0234cfb6a9a47835bd392b852ce5 \ + --hash=sha256:e5bf3d930aee75a65478df91ac1225ff89cd28e9ac7bd1196853a9229b6adb0b \ + --hash=sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615 \ + --hash=sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33 + # via librosa scipy==1.11.4 \ --hash=sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c \ --hash=sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6 \ @@ -4039,7 +4102,9 @@ scipy==1.11.4 \ # via # -c python/deplocks/llm/ray_test_py311_cu128.lock # -r python/requirements.txt + # librosa # scikit-image + # scikit-learn # vllm send2trash==1.8.3 \ --hash=sha256:0c31227e0bd08961c7665474a3d1ef7193929fedda4233843689baa056be46c9 \ @@ -4260,7 +4325,11 @@ soundfile==0.13.1 \ --hash=sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445 \ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 - # via mistral-common + # via + # -r python/requirements/llm/llm-requirements.txt + # librosa + # mistral-common + # vllm soupsieve==2.5 \ --hash=sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690 \ --hash=sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7 @@ -4289,7 +4358,9 @@ soxr==0.5.0.post1 \ --hash=sha256:fa0a382fb8d8e2afed2c1642723b2d2d1b9a6728ff89f77f3524034c8885b8c9 \ --hash=sha256:fcc049b0a151a65aa75b92f0ac64bb2dba785d16b78c31c2b94e68c141751d6d \ --hash=sha256:fef509466c9c25f65eae0ce1e4b9ac9705d22c6038c914160ddaf459589c6e31 - # via mistral-common + # via + # librosa + # mistral-common sphinx==6.2.1 \ --hash=sha256:6d56a34697bb749ffa0152feafc4b19836c755d90a7c59b72bc7dfd371b9cc6b \ --hash=sha256:97787ff1fa3256a3eef9eda523a63dbf299f7b47e053cfcf684a1c2a8380c912 @@ -4368,6 +4439,10 @@ terminado==0.18.1 \ # jupyter-server # nbclassic # notebook +threadpoolctl==3.6.0 \ + 
--hash=sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb \ + --hash=sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e + # via scikit-learn tifffile==2024.7.21 \ --hash=sha256:7f335b5d6ca49401fe0f1d87deb206f5dae47297e47b1ed52a676d05d6d26798 \ --hash=sha256:818b577d49350421fb511f389f937984f9feaa2cd8177fa00823001920bf3483 @@ -4589,6 +4664,7 @@ typing-extensions==4.12.2 \ # fastapi # gymnasium # huggingface-hub + # librosa # mistral-common # openai # opentelemetry-api From 5f8edde4f1c18d24e50f4fa247b6bcf84064915d Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 19 Oct 2025 21:15:30 +0530 Subject: [PATCH 23/33] doc updates Signed-off-by: DPatel_7 --- .../llm/user-guides/vllm-compatibility.md | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md index 846fc79720c3..8a7219ed762d 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -80,6 +80,69 @@ curl -X POST http://localhost:8000/v1/embeddings \ :::: + +## Transcriptions + +You can generate audio transcriptions for Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. +Models supporting this use case are listed at +`vLLM transcription models `_. + + +### Deploy an transcription model + +::::{tab-set} + +:::{tab-item} Python +:sync: python + +```{literalinclude} ../../llm/doc_code/serve/transcription/transcription_example.py +language: python +:start-after: __transcription_example_start__ +:end-before: __transcription_example_end__ +``` +::: + +:::{tab-item} Python Client +:sync: client + +```python +from openai import OpenAI + +# Initialize client +client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key") + +# Open audio file +file = open("/path/to/audio.wav", "rb") + +# Make a request to the desired lora checkpoint +response = client.audio.transcriptions.create( + model="whisper-large", + file=file, + temperature=0.0, + language="en", +) + +print(response.text) +``` +::: + +:::{tab-item} cURL +:sync: curl + +```bash +curl http://localhost:8000/v1/audio/transcriptions \ + -X POST \ + -H "Authorization: Bearer fake-key" \ + -F "file=@/path/to/audio.wav" \ + -F "model=whisper-large" \ + -F "temperature=0.0" \ + -F "language=en" +``` +::: + +:::: + + ## Structured output You can request structured JSON output similar to OpenAI's API using JSON mode or JSON schema validation with Pydantic models. 
From b2f92d9fb8fde02a2e1bf078689483c6389eea94 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 19 Oct 2025 21:55:39 +0530 Subject: [PATCH 24/33] doc fix Signed-off-by: DPatel_7 --- doc/source/serve/llm/user-guides/vllm-compatibility.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md index 8a7219ed762d..1a323f8533f0 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -95,7 +95,7 @@ Models supporting this use case are listed at :::{tab-item} Python :sync: python -```{literalinclude} ../../llm/doc_code/serve/transcription/transcription_example.py +```{literalinclude} ../../../llm/doc_code/serve/transcription/transcription_example.py language: python :start-after: __transcription_example_start__ :end-before: __transcription_example_end__ From d1087539d1911890ffe3aa9d0da3d35074998785 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 19 Oct 2025 22:22:00 +0530 Subject: [PATCH 25/33] docs fix Signed-off-by: DPatel_7 --- doc/source/serve/llm/user-guides/vllm-compatibility.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md index 1a323f8533f0..4cc5dcfe4f4b 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -83,9 +83,7 @@ curl -X POST http://localhost:8000/v1/embeddings \ ## Transcriptions -You can generate audio transcriptions for Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. -Models supporting this use case are listed at -`vLLM transcription models `_. +You can generate audio transcriptions for Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. Models supporting this use case are listed in the [vLLM transcription models documentation](https://docs.vllm.ai/en/stable/models/supported_models.html#transcription). 
### Deploy an transcription model @@ -242,7 +240,6 @@ response = client.chat.completions.create( response_format={ "type": "json_schema", "json_schema": Color.model_json_schema() - }, messages=[ { From 53b500d5f041ecf81174ab2154015b3548a1da6a Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Sun, 19 Oct 2025 22:57:03 +0530 Subject: [PATCH 26/33] docs fix Signed-off-by: DPatel_7 --- doc/source/serve/llm/user-guides/vllm-compatibility.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md index 4cc5dcfe4f4b..8a7d953a87dc 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -94,7 +94,7 @@ You can generate audio transcriptions for Speech-to-Text (STT) models trained sp :sync: python ```{literalinclude} ../../../llm/doc_code/serve/transcription/transcription_example.py -language: python +:language: python :start-after: __transcription_example_start__ :end-before: __transcription_example_end__ ``` From 29b7c34b0c16d55cb8116334cf3272db4e7f0103 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Mon, 20 Oct 2025 13:57:00 +0530 Subject: [PATCH 27/33] Code review updates and fixes Signed-off-by: DPatel_7 --- .../transcription_config_example.yaml | 29 ----------- .../transcription/transcription_example.py | 26 +++++----- .../transcription_yaml_config_example.py | 51 ------------------- .../llm/user-guides/vllm-compatibility.md | 23 ++++----- .../_internal/serve/core/ingress/ingress.py | 2 +- .../serve/engines/vllm/vllm_engine.py | 6 +-- python/requirements/llm/llm-requirements.txt | 3 -- python/setup.py | 5 +- 8 files changed, 30 insertions(+), 115 deletions(-) delete mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml delete mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml b/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml deleted file mode 100644 index eeac1ea33001..000000000000 --- a/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# config.yaml -applications: -- args: - llm_configs: - - model_loading_config: - model_id: whisper-large - model_source: openai/whisper-large-v3 - accelerator_type: A10G - deployment_config: - autoscaling_config: - min_replicas: 1 - max_replicas: 2 - log_engine_metrics: true - - model_loading_config: - model_id: voxtral-mini - model_source: mistralai/Voxtral-Mini-3B-2507 - accelerator_type: A10G - engine_kwargs: - tokenizer_mode: mistral - config_format: mistral - load_format: mistral - deployment_config: - autoscaling_config: - min_replicas: 1 - max_replicas: 2 - log_engine_metrics: true - import_path: ray.serve.llm:build_openai_app - name: llm_app - route_prefix: "/" diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py index 181d83b35d47..d82241c2ff90 100644 --- a/doc/source/llm/doc_code/serve/transcription/transcription_example.py +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -105,18 +105,18 @@ def _testing_build_openai_app(llm_serving_args): f.write(response.content) client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key") -file = open("audio.wav", "rb") - -try: - 
response = client.audio.transcriptions.create( - model="whisper-large", - file=file, - temperature=0.0, - language="en", - ) -except Exception as e: - raise AssertionError( - f"Error while querying models: {e}. Check the logs for more details." - ) + +with open("audio.wav", "rb") as f: + try: + response = client.audio.transcriptions.create( + model="whisper-large", + file=f, + temperature=0.0, + language="en", + ) + except Exception as e: + raise AssertionError( + f"Error while querying models: {e}. Check the logs for more details." + ) serve.shutdown() diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py deleted file mode 100644 index e37def988888..000000000000 --- a/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -This file serves as a documentation example and CI test for YAML config deployment. - -Structure: -1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. -2. Load YAML config and convert to Python using build_openai_app -3. Test validation (deployment status polling + cleanup) -""" - -import time -import os -import yaml -from ray import serve -from ray.serve.schema import ApplicationStatus -from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME -from ray.serve import llm - - -config_path = os.path.join( - os.path.dirname(__file__), "transcription_config_example.yaml" -) -with open(config_path, "r") as f: - config_dict = yaml.safe_load(f) - -llm_configs = config_dict["applications"][0]["args"]["llm_configs"] -for config in llm_configs: - config.pop("accelerator_type", None) - -app = llm.build_openai_app({"llm_configs": llm_configs}) -serve.run(app, blocking=False) - -status = ApplicationStatus.NOT_STARTED -timeout_seconds = 300 -start_time = time.time() - -while ( - status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds -): - status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status - - if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: - raise AssertionError(f"Deployment failed with status: {status}") - - time.sleep(1) - -if status != ApplicationStatus.RUNNING: - raise AssertionError( - f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" - ) - -serve.shutdown() diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md index 8a7d953a87dc..c4f4dd6e28d3 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -86,7 +86,7 @@ curl -X POST http://localhost:8000/v1/embeddings \ You can generate audio transcriptions for Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. Models supporting this use case are listed in the [vLLM transcription models documentation](https://docs.vllm.ai/en/stable/models/supported_models.html#transcription). 
-### Deploy an transcription model +### Deploy a transcription model ::::{tab-set} @@ -110,17 +110,16 @@ from openai import OpenAI client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key") # Open audio file -file = open("/path/to/audio.wav", "rb") - -# Make a request to the desired lora checkpoint -response = client.audio.transcriptions.create( - model="whisper-large", - file=file, - temperature=0.0, - language="en", -) - -print(response.text) +with open("/path/to/audio.wav", "rb") as f: + # Make a request to the transcription model + response = client.audio.transcriptions.create( + model="whisper-large", + file=f, + temperature=0.0, + language="en", + ) + + print(response.text) ``` ::: diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index e84c41c9d227..7e13f0d8cd99 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -89,7 +89,7 @@ "max_ongoing_requests": DEFAULT_MAX_ONGOING_REQUESTS, } - +# These methods correspond to functions defined in the LLMEngine class in python/ray/llm/_internal/serve/deployments/llm/llm_engine.py class CallMethod(Enum): CHAT = "chat" COMPLETIONS = "completions" diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py index 2f0adea640ac..13be7465f885 100644 --- a/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py @@ -397,7 +397,7 @@ async def chat( async for response in chat_response: if not isinstance(response, str): raise ValueError( - f"Expected create_chat_completion to return a stream of strings, got and item with type {type(response)}" + f"Expected create_chat_completion to return a stream of strings, got an item with type {type(response)}" ) yield response else: @@ -426,7 +426,7 @@ async def completions( async for response in completion_response: if not isinstance(response, str): raise ValueError( - f"Expected create_completion to return a stream of strings, got and item with type {type(response)}" + f"Expected create_completion to return a stream of strings, got an item with type {type(response)}" ) yield response else: @@ -482,7 +482,7 @@ async def transcriptions( async for response in transcription_response: if not isinstance(response, str): raise ValueError( - f"Expected create_transcription to return a stream of strings, got and item with type {type(response)}" + f"Expected create_transcription to return a stream of strings, got an item with type {type(response)}" ) yield response else: diff --git a/python/requirements/llm/llm-requirements.txt b/python/requirements/llm/llm-requirements.txt index a9b40e9b7fe6..d32e70d23f89 100644 --- a/python/requirements/llm/llm-requirements.txt +++ b/python/requirements/llm/llm-requirements.txt @@ -14,6 +14,3 @@ typer meson pybind11 hf_transfer -# Audio processing dependencies for transcription support -librosa -soundfile diff --git a/python/setup.py b/python/setup.py index 2c124ee01494..869d5dfabecf 100644 --- a/python/setup.py +++ b/python/setup.py @@ -382,10 +382,9 @@ def get_packages(self): # async-timeout is a backport of asyncio.timeout for python < 3.11 "async-timeout; python_version < '3.11'", "typer", + "meson", + "pybind11", "hf_transfer", - # Audio processing dependencies for transcription support - "librosa", - "soundfile", ] + setup_spec.extras["data"] + setup_spec.extras["serve"] From 
6d10b03a5fd436f01a26b42e03228b0189d21155 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Mon, 20 Oct 2025 13:59:35 +0530 Subject: [PATCH 28/33] lock updates Signed-off-by: DPatel_7 --- python/deplocks/llm/rayllm_py311_cpu.lock | 2 -- python/deplocks/llm/rayllm_py311_cu128.lock | 2 -- python/deplocks/llm/rayllm_test_py311_cpu.lock | 5 +---- python/deplocks/llm/rayllm_test_py311_cu128.lock | 5 +---- 4 files changed, 2 insertions(+), 12 deletions(-) diff --git a/python/deplocks/llm/rayllm_py311_cpu.lock b/python/deplocks/llm/rayllm_py311_cpu.lock index 32249f6d1995..9ad44ad117b2 100644 --- a/python/deplocks/llm/rayllm_py311_cpu.lock +++ b/python/deplocks/llm/rayllm_py311_cpu.lock @@ -1293,7 +1293,6 @@ librosa==0.11.0 \ --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock - # -r python/requirements/llm/llm-requirements.txt # vllm llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ @@ -3394,7 +3393,6 @@ soundfile==0.13.1 \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via # -c python/deplocks/llm/rayllm_test_py311_cpu.lock - # -r python/requirements/llm/llm-requirements.txt # librosa # mistral-common # vllm diff --git a/python/deplocks/llm/rayllm_py311_cu128.lock b/python/deplocks/llm/rayllm_py311_cu128.lock index 8caae4a2ded5..a7f14ffe377e 100644 --- a/python/deplocks/llm/rayllm_py311_cu128.lock +++ b/python/deplocks/llm/rayllm_py311_cu128.lock @@ -1294,7 +1294,6 @@ librosa==0.11.0 \ --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock - # -r python/requirements/llm/llm-requirements.txt # vllm llguidance==0.7.29 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:17fd439957d6ca5f459d0dec755a2d040c2dc946ed7e3c332b469ef6861292f8 \ @@ -3434,7 +3433,6 @@ soundfile==0.13.1 \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via # -c python/deplocks/llm/rayllm_test_py311_cu128.lock - # -r python/requirements/llm/llm-requirements.txt # librosa # mistral-common # vllm diff --git a/python/deplocks/llm/rayllm_test_py311_cpu.lock b/python/deplocks/llm/rayllm_test_py311_cpu.lock index edd52e6eca9a..d5f4c289ab74 100644 --- a/python/deplocks/llm/rayllm_test_py311_cpu.lock +++ b/python/deplocks/llm/rayllm_test_py311_cpu.lock @@ -1837,9 +1837,7 @@ lazy-loader==0.4 \ librosa==0.11.0 \ --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 - # via - # -r python/requirements/llm/llm-requirements.txt - # vllm + # via vllm llguidance==0.7.26 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:1895ff449c8ec0a5f1d3b142d723fc9b26a85b021b72d7f1173f8b7507f528c0 \ --hash=sha256:5e6f6cec9c6648164062f0347262b3ec7c39f54d1be5c5347d6446bc7fdba115 \ @@ -4312,7 +4310,6 @@ soundfile==0.13.1 \ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via - # -r python/requirements/llm/llm-requirements.txt # librosa # mistral-common # vllm diff --git a/python/deplocks/llm/rayllm_test_py311_cu128.lock 
b/python/deplocks/llm/rayllm_test_py311_cu128.lock index 4cda7fdfdced..ab9931f03ebf 100644 --- a/python/deplocks/llm/rayllm_test_py311_cu128.lock +++ b/python/deplocks/llm/rayllm_test_py311_cu128.lock @@ -1837,9 +1837,7 @@ lazy-loader==0.4 \ librosa==0.11.0 \ --hash=sha256:0b6415c4fd68bff4c29288abe67c6d80b587e0e1e2cfb0aad23e4559504a7fa1 \ --hash=sha256:f5ed951ca189b375bbe2e33b2abd7e040ceeee302b9bbaeeffdfddb8d0ace908 - # via - # -r python/requirements/llm/llm-requirements.txt - # vllm + # via vllm llguidance==0.7.29 ; platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64' \ --hash=sha256:17fd439957d6ca5f459d0dec755a2d040c2dc946ed7e3c332b469ef6861292f8 \ --hash=sha256:1d30a76b30b646ac7f9025d262665f62bdbf2d43698115eeb1119c6ee062a36f \ @@ -4326,7 +4324,6 @@ soundfile==0.13.1 \ --hash=sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b \ --hash=sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5 # via - # -r python/requirements/llm/llm-requirements.txt # librosa # mistral-common # vllm From 6df59eba72ccc40c203a1b56d47fc69f7e7f6cf1 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Mon, 20 Oct 2025 23:47:59 +0530 Subject: [PATCH 29/33] yaml tests for bazel Signed-off-by: DPatel_7 --- .../transcription_config_example.yaml | 29 +++++++++++ .../transcription_yaml_config_example.py | 51 +++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml create mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml b/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml new file mode 100644 index 000000000000..eeac1ea33001 --- /dev/null +++ b/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml @@ -0,0 +1,29 @@ +# config.yaml +applications: +- args: + llm_configs: + - model_loading_config: + model_id: whisper-large + model_source: openai/whisper-large-v3 + accelerator_type: A10G + deployment_config: + autoscaling_config: + min_replicas: 1 + max_replicas: 2 + log_engine_metrics: true + - model_loading_config: + model_id: voxtral-mini + model_source: mistralai/Voxtral-Mini-3B-2507 + accelerator_type: A10G + engine_kwargs: + tokenizer_mode: mistral + config_format: mistral + load_format: mistral + deployment_config: + autoscaling_config: + min_replicas: 1 + max_replicas: 2 + log_engine_metrics: true + import_path: ray.serve.llm:build_openai_app + name: llm_app + route_prefix: "/" diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py new file mode 100644 index 000000000000..e37def988888 --- /dev/null +++ b/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py @@ -0,0 +1,51 @@ +""" +This file serves as a documentation example and CI test for YAML config deployment. + +Structure: +1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. +2. Load YAML config and convert to Python using build_openai_app +3. 
Test validation (deployment status polling + cleanup) +""" + +import time +import os +import yaml +from ray import serve +from ray.serve.schema import ApplicationStatus +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME +from ray.serve import llm + + +config_path = os.path.join( + os.path.dirname(__file__), "transcription_config_example.yaml" +) +with open(config_path, "r") as f: + config_dict = yaml.safe_load(f) + +llm_configs = config_dict["applications"][0]["args"]["llm_configs"] +for config in llm_configs: + config.pop("accelerator_type", None) + +app = llm.build_openai_app({"llm_configs": llm_configs}) +serve.run(app, blocking=False) + +status = ApplicationStatus.NOT_STARTED +timeout_seconds = 300 +start_time = time.time() + +while ( + status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds +): + status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status + + if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: + raise AssertionError(f"Deployment failed with status: {status}") + + time.sleep(1) + +if status != ApplicationStatus.RUNNING: + raise AssertionError( + f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" + ) + +serve.shutdown() From 288ff91e5d030b1e2de900a9aba1152bcb9a68a7 Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Tue, 21 Oct 2025 22:54:03 +0530 Subject: [PATCH 30/33] removed .yaml doc code example and tests Signed-off-by: DPatel_7 --- doc/BUILD.bazel | 2 +- .../transcription_config_example.yaml | 29 ----------- .../transcription_yaml_config_example.py | 51 ------------------- 3 files changed, 1 insertion(+), 81 deletions(-) delete mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml delete mode 100644 doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py diff --git a/doc/BUILD.bazel b/doc/BUILD.bazel index 830d0e95d357..26df9d2197af 100644 --- a/doc/BUILD.bazel +++ b/doc/BUILD.bazel @@ -348,7 +348,7 @@ py_test_run_all_subdirectory( include = ["source/llm/doc_code/serve/**/*.py"], exclude = [], extra_srcs = [], - data = ["source/llm/doc_code/serve/qwen/llm_config_example.yaml", "source/llm/doc_code/serve/transcription/transcription_config_example.yaml"], + data = ["source/llm/doc_code/serve/qwen/llm_config_example.yaml"], tags = [ "exclusive", "gpu", diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml b/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml deleted file mode 100644 index eeac1ea33001..000000000000 --- a/doc/source/llm/doc_code/serve/transcription/transcription_config_example.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# config.yaml -applications: -- args: - llm_configs: - - model_loading_config: - model_id: whisper-large - model_source: openai/whisper-large-v3 - accelerator_type: A10G - deployment_config: - autoscaling_config: - min_replicas: 1 - max_replicas: 2 - log_engine_metrics: true - - model_loading_config: - model_id: voxtral-mini - model_source: mistralai/Voxtral-Mini-3B-2507 - accelerator_type: A10G - engine_kwargs: - tokenizer_mode: mistral - config_format: mistral - load_format: mistral - deployment_config: - autoscaling_config: - min_replicas: 1 - max_replicas: 2 - log_engine_metrics: true - import_path: ray.serve.llm:build_openai_app - name: llm_app - route_prefix: "/" diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py 
b/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py deleted file mode 100644 index e37def988888..000000000000 --- a/doc/source/llm/doc_code/serve/transcription/transcription_yaml_config_example.py +++ /dev/null @@ -1,51 +0,0 @@ -""" -This file serves as a documentation example and CI test for YAML config deployment. - -Structure: -1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. -2. Load YAML config and convert to Python using build_openai_app -3. Test validation (deployment status polling + cleanup) -""" - -import time -import os -import yaml -from ray import serve -from ray.serve.schema import ApplicationStatus -from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME -from ray.serve import llm - - -config_path = os.path.join( - os.path.dirname(__file__), "transcription_config_example.yaml" -) -with open(config_path, "r") as f: - config_dict = yaml.safe_load(f) - -llm_configs = config_dict["applications"][0]["args"]["llm_configs"] -for config in llm_configs: - config.pop("accelerator_type", None) - -app = llm.build_openai_app({"llm_configs": llm_configs}) -serve.run(app, blocking=False) - -status = ApplicationStatus.NOT_STARTED -timeout_seconds = 300 -start_time = time.time() - -while ( - status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds -): - status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status - - if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: - raise AssertionError(f"Deployment failed with status: {status}") - - time.sleep(1) - -if status != ApplicationStatus.RUNNING: - raise AssertionError( - f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" - ) - -serve.shutdown() From 4095f754ed7222014d9c487e2b3d0a11bbd079ef Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Fri, 24 Oct 2025 00:21:38 +0530 Subject: [PATCH 31/33] review updates Signed-off-by: DPatel_7 --- .../transcription/transcription_example.py | 28 ++++--------------- .../llm/user-guides/vllm-compatibility.md | 2 +- .../_internal/serve/core/ingress/ingress.py | 2 ++ python/ray/llm/tests/serve/conftest.py | 2 +- 4 files changed, 10 insertions(+), 24 deletions(-) diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py index d82241c2ff90..832251422303 100644 --- a/doc/source/llm/doc_code/serve/transcription/transcription_example.py +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -40,31 +40,15 @@ def _testing_build_openai_app(llm_serving_args): from ray import serve from ray.serve.llm import LLMConfig, build_openai_app -whisper_llm_config = LLMConfig( +transcription_config = LLMConfig( model_loading_config={ - "model_id": "whisper-large", - "model_source": "openai/whisper-large-v3", + "model_id": "voxtral-small", + "model_source": "mistralai/Voxtral-Small-24B-2507", }, deployment_config={ "autoscaling_config": { "min_replicas": 1, - "max_replicas": 2, - } - }, - # Pass the desired accelerator type (e.g. A10G, L4, etc.) 
- accelerator_type="A10G", - log_engine_metrics=True, -) - -voxtral_llm_config = LLMConfig( - model_loading_config={ - "model_id": "voxtral-mini", - "model_source": "mistralai/Voxtral-Mini-3B-2507", - }, - deployment_config={ - "autoscaling_config": { - "min_replicas": 1, - "max_replicas": 2, + "max_replicas": 4, } }, accelerator_type="A10G", @@ -77,7 +61,7 @@ def _testing_build_openai_app(llm_serving_args): log_engine_metrics=True, ) -app = build_openai_app({"llm_configs": [whisper_llm_config, voxtral_llm_config]}) +app = build_openai_app({"llm_configs": [transcription_config]}) serve.run(app, blocking=True) # __transcription_example_end__ @@ -109,7 +93,7 @@ def _testing_build_openai_app(llm_serving_args): with open("audio.wav", "rb") as f: try: response = client.audio.transcriptions.create( - model="whisper-large", + model="voxtral-small", file=f, temperature=0.0, language="en", diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md index c4f4dd6e28d3..1c6518a8cf98 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -83,7 +83,7 @@ curl -X POST http://localhost:8000/v1/embeddings \ ## Transcriptions -You can generate audio transcriptions for Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. Models supporting this use case are listed in the [vLLM transcription models documentation](https://docs.vllm.ai/en/stable/models/supported_models.html#transcription). +You can generate audio transcriptions for Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. Models supporting this use case are listed in the [vLLM transcription models documentation](https://docs.vllm.ai/en/stable/models/supported_models.html). ### Deploy a transcription model diff --git a/python/ray/llm/_internal/serve/core/ingress/ingress.py b/python/ray/llm/_internal/serve/core/ingress/ingress.py index 7e13f0d8cd99..29a9e17ada4d 100644 --- a/python/ray/llm/_internal/serve/core/ingress/ingress.py +++ b/python/ray/llm/_internal/serve/core/ingress/ingress.py @@ -607,6 +607,8 @@ async def embeddings(self, body: EmbeddingRequest) -> Response: if isinstance(result, EmbeddingResponse): return JSONResponse(content=result.model_dump()) + # Annotated[..., Form()] is wrapper that is used to handle multiple form data, which is how audio is sent in transcription requests. + # vLLM implementation for handling transcription requests: https://github.com/vllm-project/vllm/blob/0825197bee8dea547f2ab25f48afd8aea0cd2578/vllm/entrypoints/openai/api_server.py#L839. 
async def transcriptions( self, body: Annotated[TranscriptionRequest, Form()] ) -> Response: diff --git a/python/ray/llm/tests/serve/conftest.py b/python/ray/llm/tests/serve/conftest.py index 5c7357a1823e..071e572a06f4 100644 --- a/python/ray/llm/tests/serve/conftest.py +++ b/python/ray/llm/tests/serve/conftest.py @@ -123,7 +123,7 @@ def mock_transcription_request(stream, temperature, language): from fastapi import UploadFile # Create a simple mock audio file (WAV format) - mock_audio_data = b"RIFF\x00\x00\x00\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00\x44\xac\x00\x00\x88X\x01\x00\x02\x00\x10\x00data\x00\x00\x00\x00" + mock_audio_data = b"RIFF\x00\x00\x00\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00\x44\xac\x00\x00\x88X\x01\x00\x02\x00\x10\x00data\x00\x00\x00\x00" # random byte string to test the transcription API mock_file = UploadFile( file=BytesIO(mock_audio_data), filename="test_audio.wav", From 57e323ae196946c05b80cd8f5eae9dac6443f7ac Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Fri, 24 Oct 2025 12:22:38 +0530 Subject: [PATCH 32/33] test fix Signed-off-by: DPatel_7 --- .../doc_code/serve/transcription/transcription_example.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py index 832251422303..1a980dcaac9b 100644 --- a/doc/source/llm/doc_code/serve/transcription/transcription_example.py +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -42,8 +42,8 @@ def _testing_build_openai_app(llm_serving_args): transcription_config = LLMConfig( model_loading_config={ - "model_id": "voxtral-small", - "model_source": "mistralai/Voxtral-Small-24B-2507", + "model_id": "voxtral-mini", + "model_source": "mistralai/Voxtral-Mini-3B-2507", }, deployment_config={ "autoscaling_config": { @@ -93,7 +93,7 @@ def _testing_build_openai_app(llm_serving_args): with open("audio.wav", "rb") as f: try: response = client.audio.transcriptions.create( - model="voxtral-small", + model="voxtral-mini", file=f, temperature=0.0, language="en", From 05cf83edc28c2d8ca899a4e8b116a90eed44235c Mon Sep 17 00:00:00 2001 From: DPatel_7 Date: Fri, 24 Oct 2025 14:01:47 +0530 Subject: [PATCH 33/33] doc updates Signed-off-by: DPatel_7 --- .../doc_code/serve/transcription/transcription_example.py | 4 ++-- doc/source/serve/llm/user-guides/vllm-compatibility.md | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/llm/doc_code/serve/transcription/transcription_example.py b/doc/source/llm/doc_code/serve/transcription/transcription_example.py index 1a980dcaac9b..aed2e567146e 100644 --- a/doc/source/llm/doc_code/serve/transcription/transcription_example.py +++ b/doc/source/llm/doc_code/serve/transcription/transcription_example.py @@ -40,7 +40,7 @@ def _testing_build_openai_app(llm_serving_args): from ray import serve from ray.serve.llm import LLMConfig, build_openai_app -transcription_config = LLMConfig( +llm_config = LLMConfig( model_loading_config={ "model_id": "voxtral-mini", "model_source": "mistralai/Voxtral-Mini-3B-2507", @@ -61,7 +61,7 @@ def _testing_build_openai_app(llm_serving_args): log_engine_metrics=True, ) -app = build_openai_app({"llm_configs": [transcription_config]}) +app = build_openai_app({"llm_configs": [llm_config]}) serve.run(app, blocking=True) # __transcription_example_end__ diff --git a/doc/source/serve/llm/user-guides/vllm-compatibility.md b/doc/source/serve/llm/user-guides/vllm-compatibility.md 
index 1c6518a8cf98..4ec9a44b6ad4 100644 --- a/doc/source/serve/llm/user-guides/vllm-compatibility.md +++ b/doc/source/serve/llm/user-guides/vllm-compatibility.md @@ -83,15 +83,15 @@ curl -X POST http://localhost:8000/v1/embeddings \ ## Transcriptions -You can generate audio transcriptions for Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. Models supporting this use case are listed in the [vLLM transcription models documentation](https://docs.vllm.ai/en/stable/models/supported_models.html). +You can generate audio transcriptions using Speech-to-Text (STT) models trained specifically for Automatic Speech Recognition (ASR) tasks. Models supporting this use case are listed in the [vLLM transcription models documentation](https://docs.vllm.ai/en/stable/models/supported_models.html). ### Deploy a transcription model ::::{tab-set} -:::{tab-item} Python -:sync: python +:::{tab-item} Server +:sync: server ```{literalinclude} ../../../llm/doc_code/serve/transcription/transcription_example.py :language: python