vllm-project · DarkLight1337 · May 12, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -31,8 +31,8 @@
 /vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb
 /vllm/entrypoints/mcp @heheda12345
 /vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb
-/vllm/entrypoints/openai/realtime @njhill
-/vllm/entrypoints/openai/speech_to_text @NickLucche
+/vllm/entrypoints/speech_to_text/realtime @njhill
+/vllm/entrypoints/speech_to_text @NickLucche
 /vllm/entrypoints/pooling @noooop
 /vllm/entrypoints/sagemaker @DarkLight1337
 /vllm/entrypoints/serve @njhill

@@ -7,8 +7,8 @@
 
 import pytest
 
-from vllm.entrypoints.openai.speech_to_text.protocol import TranscriptionResponse
-from vllm.entrypoints.openai.speech_to_text.speech_to_text import OpenAISpeechToText
+from vllm.entrypoints.speech_to_text.base.serving import OpenAISpeechToText
+from vllm.entrypoints.speech_to_text.transcription.protocol import TranscriptionResponse
 
 
 async def _never_finishes():

@@ -24,12 +24,14 @@
     RequestResponseMetadata,
 )
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
-from vllm.entrypoints.openai.speech_to_text.protocol import TranscriptionRequest
-from vllm.entrypoints.openai.speech_to_text.serving import OpenAIServingTranscription
-from vllm.entrypoints.openai.speech_to_text.speech_to_text import (
+from vllm.entrypoints.speech_to_text.base.serving import (
     OpenAISpeechToText,
     asr_inter_chunk_separator,
 )
+from vllm.entrypoints.speech_to_text.transcription.protocol import TranscriptionRequest
+from vllm.entrypoints.speech_to_text.transcription.serving import (
+    OpenAIServingTranscription,
+)
 from vllm.model_executor.models.interfaces import SupportsTranscription
 from vllm.outputs import CompletionOutput, RequestOutput
 

@@ -233,19 +233,12 @@ def build_app(
 
         attach_render_router(app)
 
-    if "transcription" in supported_tasks:
-        from vllm.entrypoints.openai.speech_to_text.api_router import (
-            attach_router as register_speech_to_text_api_router,
+    if "transcription" in supported_tasks or "realtime" in supported_tasks:
+        from vllm.entrypoints.speech_to_text.factories import (
+            register_speech_to_text_api_routers,
         )
 
-        register_speech_to_text_api_router(app)
-
-    if "realtime" in supported_tasks:
-        from vllm.entrypoints.openai.realtime.api_router import (
-            attach_router as register_realtime_api_router,
-        )
-
-        register_realtime_api_router(app)
+        register_speech_to_text_api_routers(app, supported_tasks)
 
     if any(task in POOLING_TASKS for task in supported_tasks):
         from vllm.entrypoints.pooling.factories import register_pooling_api_routers
@@ -284,11 +277,11 @@ def build_app(
 
     if "realtime" in supported_tasks:
         # Add WebSocket metrics middleware
-        from vllm.entrypoints.openai.realtime.metrics import (
-            WebSocketMetricsMiddleware,
+        from vllm.entrypoints.speech_to_text.factories import (
+            add_websocket_metrics_middleware,
         )
 
-        app.add_middleware(WebSocketMetricsMiddleware)
+        add_websocket_metrics_middleware(app)
 
     if envs.VLLM_DEBUG_LOG_API_SERVER_RESPONSE:
         logger.warning(
@@ -421,20 +414,13 @@ async def init_app_state(
 
         await init_generative_scoring_state(engine_client, state, args, request_logger)
 
-    if "transcription" in supported_tasks:
-        from vllm.entrypoints.openai.speech_to_text.api_router import (
-            init_transcription_state,
-        )
+    if "transcription" in supported_tasks or "realtime" in supported_tasks:
+        from vllm.entrypoints.speech_to_text.factories import init_speech_to_text_state
 
-        init_transcription_state(
+        init_speech_to_text_state(
             engine_client, state, args, request_logger, supported_tasks
         )
 
-    if "realtime" in supported_tasks:
-        from vllm.entrypoints.openai.realtime.api_router import init_realtime_state
-
-        init_realtime_state(engine_client, state, args, request_logger, supported_tasks)
-
     if any(task in POOLING_TASKS for task in supported_tasks):
         from vllm.entrypoints.pooling.factories import init_pooling_state
 

@@ -39,18 +39,18 @@
 )
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
-from vllm.entrypoints.openai.speech_to_text.protocol import (
-    TranscriptionRequest,
-    TranscriptionResponse,
-    TranslationRequest,
-)
 from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
 from vllm.entrypoints.serve.tokenize.protocol import (
     DetokenizeRequest,
     TokenizeChatRequest,
     TokenizeCompletionRequest,
     TokenizeResponse,
 )
+from vllm.entrypoints.speech_to_text.transcription.protocol import (
+    TranscriptionRequest,
+    TranscriptionResponse,
+)
+from vllm.entrypoints.speech_to_text.translation.protocol import TranslationRequest
 from vllm.entrypoints.utils import create_error_response
 from vllm.inputs import EngineInput, PromptType
 from vllm.logger import init_logger

@@ -41,14 +41,6 @@
     ErrorResponse,
     OpenAIBaseModel,
 )
-from vllm.entrypoints.openai.speech_to_text.protocol import (
-    TranscriptionRequest,
-    TranscriptionResponse,
-    TranscriptionResponseVerbose,
-    TranslationRequest,
-    TranslationResponse,
-    TranslationResponseVerbose,
-)
 from vllm.entrypoints.pooling.embed.protocol import (
     EmbeddingRequest,
     EmbeddingResponse,
@@ -59,6 +51,16 @@
     ScoreRequest,
     ScoreResponse,
 )
+from vllm.entrypoints.speech_to_text.transcription.protocol import (
+    TranscriptionRequest,
+    TranscriptionResponse,
+    TranscriptionResponseVerbose,
+)
+from vllm.entrypoints.speech_to_text.translation.protocol import (
+    TranslationRequest,
+    TranslationResponse,
+    TranslationResponseVerbose,
+)
 from vllm.entrypoints.utils import create_error_response
 from vllm.exceptions import VLLMValidationError
 from vllm.logger import init_logger

@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from typing import Literal, TypeAlias
+
+import torch
+
+## Protocols for Audio
+AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"]
+_LONG_INFO = torch.iinfo(torch.long)
@@ -24,18 +24,6 @@
 )
 from vllm.entrypoints.openai.engine.serving import OpenAIServing, SpeechToTextRequest
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
-from vllm.entrypoints.openai.speech_to_text.protocol import (
-    TranscriptionResponse,
-    TranscriptionResponseStreamChoice,
-    TranscriptionResponseVerbose,
-    TranscriptionSegment,
-    TranscriptionStreamResponse,
-    TranslationResponse,
-    TranslationResponseStreamChoice,
-    TranslationResponseVerbose,
-    TranslationSegment,
-    TranslationStreamResponse,
-)
 from vllm.entrypoints.utils import get_max_tokens
 from vllm.exceptions import VLLMValidationError
 from vllm.inputs import EncoderDecoderInput, EngineInput
@@ -51,6 +39,21 @@
 from vllm.tokenizers import get_tokenizer
 from vllm.utils.async_utils import merge_async_iterators
 
+from ..transcription.protocol import (
+    TranscriptionResponse,
+    TranscriptionResponseStreamChoice,
+    TranscriptionResponseVerbose,
+    TranscriptionSegment,
+    TranscriptionStreamResponse,
+)
+from ..translation.protocol import (
+    TranslationResponse,
+    TranslationResponseStreamChoice,
+    TranslationResponseVerbose,
+    TranslationSegment,
+    TranslationStreamResponse,
+)
+
 SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
 SpeechToTextResponseVerbose: TypeAlias = (
     TranscriptionResponseVerbose | TranslationResponseVerbose