From 2760b16a1b01a3402d7400143f49cc6b3902823d Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <yuqi.wang@daocloud.io>
Date: Tue, 12 May 2026 11:22:55 +0800
Subject: [PATCH 1/3] init

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
---
 .github/CODEOWNERS                            |   4 +-
 .../test_speech_to_text_cancellation.py       |   4 +-
 .../test_transcription_inter_chunk_spacing.py |   8 +-
 vllm/entrypoints/openai/api_server.py         |  34 +-
 vllm/entrypoints/openai/engine/serving.py     |  10 +-
 vllm/entrypoints/openai/run_batch.py          |  16 +-
 .../openai/speech_to_text/api_router.py       | 148 ---------
 .../realtime => speech_to_text}/__init__.py   |   0
 .../speech_to_text/base/__init__.py           |   0
 .../speech_to_text/base/protocol.py           |  11 +
 .../base/serving.py}                          |  27 +-
 vllm/entrypoints/speech_to_text/factories.py  |  84 +++++
 .../realtime}/__init__.py                     |   0
 .../realtime/api_router.py                    |  41 +--
 .../realtime/connection.py                    |   9 +-
 .../realtime/metrics.py                       |   0
 .../realtime/protocol.py                      |   0
 .../realtime/serving.py                       |   0
 .../speech_to_text/transcription/__init__.py  |   0
 .../transcription/api_router.py               |  60 ++++
 .../transcription}/protocol.py                | 281 +---------------
 .../transcription}/serving.py                 |  87 +----
 .../speech_to_text/translation/__init__.py    |   0
 .../speech_to_text/translation/api_router.py  |  60 ++++
 .../speech_to_text/translation/protocol.py    | 308 ++++++++++++++++++
 .../speech_to_text/translation/serving.py     |  99 ++++++
 26 files changed, 684 insertions(+), 607 deletions(-)
 delete mode 100644 vllm/entrypoints/openai/speech_to_text/api_router.py
 rename vllm/entrypoints/{openai/realtime => speech_to_text}/__init__.py (100%)
 create mode 100644 vllm/entrypoints/speech_to_text/base/__init__.py
 create mode 100644 vllm/entrypoints/speech_to_text/base/protocol.py
 rename vllm/entrypoints/{openai/speech_to_text/speech_to_text.py => speech_to_text/base/serving.py} (99%)
 create mode 100644 vllm/entrypoints/speech_to_text/factories.py
 rename vllm/entrypoints/{openai/speech_to_text => speech_to_text/realtime}/__init__.py (100%)
 rename vllm/entrypoints/{openai => speech_to_text}/realtime/api_router.py (51%)
 rename vllm/entrypoints/{openai => speech_to_text}/realtime/connection.py (98%)
 rename vllm/entrypoints/{openai => speech_to_text}/realtime/metrics.py (100%)
 rename vllm/entrypoints/{openai => speech_to_text}/realtime/protocol.py (100%)
 rename vllm/entrypoints/{openai => speech_to_text}/realtime/serving.py (100%)
 create mode 100644 vllm/entrypoints/speech_to_text/transcription/__init__.py
 create mode 100644 vllm/entrypoints/speech_to_text/transcription/api_router.py
 rename vllm/entrypoints/{openai/speech_to_text => speech_to_text/transcription}/protocol.py (59%)
 rename vllm/entrypoints/{openai/speech_to_text => speech_to_text/transcription}/serving.py (53%)
 create mode 100644 vllm/entrypoints/speech_to_text/translation/__init__.py
 create mode 100644 vllm/entrypoints/speech_to_text/translation/api_router.py
 create mode 100644 vllm/entrypoints/speech_to_text/translation/protocol.py
 create mode 100644 vllm/entrypoints/speech_to_text/translation/serving.py

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 44cf10076ee7..66041fcee588 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -31,8 +31,8 @@
 /vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb
 /vllm/entrypoints/mcp @heheda12345
 /vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb
-/vllm/entrypoints/openai/realtime @njhill
-/vllm/entrypoints/openai/speech_to_text @NickLucche
+/vllm/entrypoints/speech_to_text/realtime @njhill
+/vllm/entrypoints/speech_to_text @NickLucche
 /vllm/entrypoints/pooling @noooop
 /vllm/entrypoints/sagemaker @DarkLight1337
 /vllm/entrypoints/serve @njhill
diff --git a/tests/entrypoints/speech_to_text/test_speech_to_text_cancellation.py b/tests/entrypoints/speech_to_text/test_speech_to_text_cancellation.py
index 2c8c1229e840..08553c641103 100644
--- a/tests/entrypoints/speech_to_text/test_speech_to_text_cancellation.py
+++ b/tests/entrypoints/speech_to_text/test_speech_to_text_cancellation.py
@@ -7,8 +7,8 @@
 
 import pytest
 
-from vllm.entrypoints.openai.speech_to_text.protocol import TranscriptionResponse
-from vllm.entrypoints.openai.speech_to_text.speech_to_text import OpenAISpeechToText
+from vllm.entrypoints.speech_to_text.base.serving import OpenAISpeechToText
+from vllm.entrypoints.speech_to_text.transcription.protocol import TranscriptionResponse
 
 
 async def _never_finishes():
diff --git a/tests/entrypoints/speech_to_text/transcription/test_transcription_inter_chunk_spacing.py b/tests/entrypoints/speech_to_text/transcription/test_transcription_inter_chunk_spacing.py
index 1e80d47f82f9..c4da9a80f7ab 100644
--- a/tests/entrypoints/speech_to_text/transcription/test_transcription_inter_chunk_spacing.py
+++ b/tests/entrypoints/speech_to_text/transcription/test_transcription_inter_chunk_spacing.py
@@ -24,12 +24,14 @@
     RequestResponseMetadata,
 )
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
-from vllm.entrypoints.openai.speech_to_text.protocol import TranscriptionRequest
-from vllm.entrypoints.openai.speech_to_text.serving import OpenAIServingTranscription
-from vllm.entrypoints.openai.speech_to_text.speech_to_text import (
+from vllm.entrypoints.speech_to_text.base.serving import (
     OpenAISpeechToText,
     asr_inter_chunk_separator,
 )
+from vllm.entrypoints.speech_to_text.transcription.protocol import TranscriptionRequest
+from vllm.entrypoints.speech_to_text.transcription.serving import (
+    OpenAIServingTranscription,
+)
 from vllm.model_executor.models.interfaces import SupportsTranscription
 from vllm.outputs import CompletionOutput, RequestOutput
 
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index da2ec10284c5..8330706538b3 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -233,19 +233,12 @@ def build_app(
 
         attach_render_router(app)
 
-    if "transcription" in supported_tasks:
-        from vllm.entrypoints.openai.speech_to_text.api_router import (
-            attach_router as register_speech_to_text_api_router,
+    if "transcription" in supported_tasks or "realtime" in supported_tasks:
+        from vllm.entrypoints.speech_to_text.factories import (
+            register_speech_to_text_api_routers,
         )
 
-        register_speech_to_text_api_router(app)
-
-    if "realtime" in supported_tasks:
-        from vllm.entrypoints.openai.realtime.api_router import (
-            attach_router as register_realtime_api_router,
-        )
-
-        register_realtime_api_router(app)
+        register_speech_to_text_api_routers(app, supported_tasks)
 
     if any(task in POOLING_TASKS for task in supported_tasks):
         from vllm.entrypoints.pooling.factories import register_pooling_api_routers
@@ -284,11 +277,11 @@ def build_app(
 
     if "realtime" in supported_tasks:
         # Add WebSocket metrics middleware
-        from vllm.entrypoints.openai.realtime.metrics import (
-            WebSocketMetricsMiddleware,
+        from vllm.entrypoints.speech_to_text.factories import (
+            add_websocket_metrics_middleware,
         )
 
-        app.add_middleware(WebSocketMetricsMiddleware)
+        add_websocket_metrics_middleware(app)
 
     if envs.VLLM_DEBUG_LOG_API_SERVER_RESPONSE:
         logger.warning(
@@ -421,20 +414,13 @@ async def init_app_state(
 
         await init_generative_scoring_state(engine_client, state, args, request_logger)
 
-    if "transcription" in supported_tasks:
-        from vllm.entrypoints.openai.speech_to_text.api_router import (
-            init_transcription_state,
-        )
+    if "transcription" in supported_tasks or "realtime" in supported_tasks:
+        from vllm.entrypoints.speech_to_text.factories import init_speech_to_text_state
 
-        init_transcription_state(
+        init_speech_to_text_state(
             engine_client, state, args, request_logger, supported_tasks
         )
 
-    if "realtime" in supported_tasks:
-        from vllm.entrypoints.openai.realtime.api_router import init_realtime_state
-
-        init_realtime_state(engine_client, state, args, request_logger, supported_tasks)
-
     if any(task in POOLING_TASKS for task in supported_tasks):
         from vllm.entrypoints.pooling.factories import init_pooling_state
 
diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py
index 2a51cc0bfac0..70dbe289b2a1 100644
--- a/vllm/entrypoints/openai/engine/serving.py
+++ b/vllm/entrypoints/openai/engine/serving.py
@@ -39,11 +39,6 @@
 )
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
-from vllm.entrypoints.openai.speech_to_text.protocol import (
-    TranscriptionRequest,
-    TranscriptionResponse,
-    TranslationRequest,
-)
 from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
 from vllm.entrypoints.serve.tokenize.protocol import (
     DetokenizeRequest,
@@ -51,6 +46,11 @@
     TokenizeCompletionRequest,
     TokenizeResponse,
 )
+from vllm.entrypoints.speech_to_text import (
+    TranscriptionRequest,
+    TranscriptionResponse,
+    TranslationRequest,
+)
 from vllm.entrypoints.utils import create_error_response
 from vllm.inputs import EngineInput, PromptType
 from vllm.logger import init_logger
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index ba53a6df651b..7e52c0e9d73d 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -41,14 +41,6 @@
     ErrorResponse,
     OpenAIBaseModel,
 )
-from vllm.entrypoints.openai.speech_to_text.protocol import (
-    TranscriptionRequest,
-    TranscriptionResponse,
-    TranscriptionResponseVerbose,
-    TranslationRequest,
-    TranslationResponse,
-    TranslationResponseVerbose,
-)
 from vllm.entrypoints.pooling.embed.protocol import (
     EmbeddingRequest,
     EmbeddingResponse,
@@ -59,6 +51,14 @@
     ScoreRequest,
     ScoreResponse,
 )
+from vllm.entrypoints.speech_to_text import (
+    TranscriptionRequest,
+    TranscriptionResponse,
+    TranscriptionResponseVerbose,
+    TranslationRequest,
+    TranslationResponse,
+    TranslationResponseVerbose,
+)
 from vllm.entrypoints.utils import create_error_response
 from vllm.exceptions import VLLMValidationError
 from vllm.logger import init_logger
diff --git a/vllm/entrypoints/openai/speech_to_text/api_router.py b/vllm/entrypoints/openai/speech_to_text/api_router.py
deleted file mode 100644
index b940a97e4dff..000000000000
--- a/vllm/entrypoints/openai/speech_to_text/api_router.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-
-from http import HTTPStatus
-from typing import TYPE_CHECKING, Annotated
-
-from fastapi import APIRouter, FastAPI, Form, Request
-from fastapi.responses import JSONResponse, StreamingResponse
-
-from vllm.entrypoints.openai.engine.protocol import ErrorResponse
-from vllm.entrypoints.openai.speech_to_text.protocol import (
-    TranscriptionRequest,
-    TranscriptionResponseVariant,
-    TranslationRequest,
-    TranslationResponseVariant,
-)
-from vllm.entrypoints.openai.speech_to_text.serving import (
-    OpenAIServingTranscription,
-    OpenAIServingTranslation,
-)
-from vllm.entrypoints.utils import (
-    load_aware_call,
-    with_cancellation,
-)
-from vllm.logger import init_logger
-
-if TYPE_CHECKING:
-    from argparse import Namespace
-
-    from starlette.datastructures import State
-
-    from vllm.engine.protocol import EngineClient
-    from vllm.entrypoints.logger import RequestLogger
-    from vllm.tasks import SupportedTask
-else:
-    RequestLogger = object
-
-logger = init_logger(__name__)
-
-router = APIRouter()
-
-
-def transcription(request: Request) -> OpenAIServingTranscription:
-    return request.app.state.openai_serving_transcription
-
-
-def translation(request: Request) -> OpenAIServingTranslation:
-    return request.app.state.openai_serving_translation
-
-
-@router.post(
-    "/v1/audio/transcriptions",
-    responses={
-        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
-        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
-        HTTPStatus.UNPROCESSABLE_ENTITY.value: {"model": ErrorResponse},
-        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
-    },
-)
-@with_cancellation
-@load_aware_call
-async def create_transcriptions(
-    raw_request: Request, request: Annotated[TranscriptionRequest, Form()]
-):
-    handler = transcription(raw_request)
-    if handler is None:
-        raise NotImplementedError("The model does not support Transcriptions API")
-
-    audio_data = await request.file.read()
-
-    generator = await handler.create_transcription(audio_data, request, raw_request)
-
-    if isinstance(generator, ErrorResponse):
-        return JSONResponse(
-            content=generator.model_dump(), status_code=generator.error.code
-        )
-
-    elif isinstance(generator, TranscriptionResponseVariant):
-        return JSONResponse(content=generator.model_dump())
-
-    return StreamingResponse(content=generator, media_type="text/event-stream")
-
-
-@router.post(
-    "/v1/audio/translations",
-    responses={
-        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
-        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
-        HTTPStatus.UNPROCESSABLE_ENTITY.value: {"model": ErrorResponse},
-        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
-    },
-)
-@with_cancellation
-@load_aware_call
-async def create_translations(
-    request: Annotated[TranslationRequest, Form()], raw_request: Request
-):
-    handler = translation(raw_request)
-    if handler is None:
-        raise NotImplementedError("The model does not support Translations API")
-
-    audio_data = await request.file.read()
-
-    generator = await handler.create_translation(audio_data, request, raw_request)
-
-    if isinstance(generator, ErrorResponse):
-        return JSONResponse(
-            content=generator.model_dump(), status_code=generator.error.code
-        )
-
-    elif isinstance(generator, TranslationResponseVariant):
-        return JSONResponse(content=generator.model_dump())
-
-    return StreamingResponse(content=generator, media_type="text/event-stream")
-
-
-def attach_router(app: FastAPI):
-    app.include_router(router)
-
-
-def init_transcription_state(
-    engine_client: "EngineClient",
-    state: "State",
-    args: "Namespace",
-    request_logger: RequestLogger | None,
-    supported_tasks: tuple["SupportedTask", ...],
-):
-    state.openai_serving_transcription = (
-        OpenAIServingTranscription(
-            engine_client,
-            state.openai_serving_models,
-            request_logger=request_logger,
-            enable_force_include_usage=args.enable_force_include_usage,
-        )
-        if "transcription" in supported_tasks
-        else None
-    )
-    state.openai_serving_translation = (
-        OpenAIServingTranslation(
-            engine_client,
-            state.openai_serving_models,
-            request_logger=request_logger,
-            enable_force_include_usage=args.enable_force_include_usage,
-        )
-        if "transcription" in supported_tasks
-        else None
-    )
diff --git a/vllm/entrypoints/openai/realtime/__init__.py b/vllm/entrypoints/speech_to_text/__init__.py
similarity index 100%
rename from vllm/entrypoints/openai/realtime/__init__.py
rename to vllm/entrypoints/speech_to_text/__init__.py
diff --git a/vllm/entrypoints/speech_to_text/base/__init__.py b/vllm/entrypoints/speech_to_text/base/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/vllm/entrypoints/speech_to_text/base/protocol.py b/vllm/entrypoints/speech_to_text/base/protocol.py
new file mode 100644
index 000000000000..e8cb61a41472
--- /dev/null
+++ b/vllm/entrypoints/speech_to_text/base/protocol.py
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from typing import Literal, TypeAlias
+
+import torch
+
+## Protocols for Audio
+AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"]
+_LONG_INFO = torch.iinfo(torch.long)
diff --git a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py b/vllm/entrypoints/speech_to_text/base/serving.py
similarity index 99%
rename from vllm/entrypoints/openai/speech_to_text/speech_to_text.py
rename to vllm/entrypoints/speech_to_text/base/serving.py
index 8c3aed4531d4..a0f02a2c7830 100644
--- a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
+++ b/vllm/entrypoints/speech_to_text/base/serving.py
@@ -24,18 +24,6 @@
 )
 from vllm.entrypoints.openai.engine.serving import OpenAIServing, SpeechToTextRequest
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
-from vllm.entrypoints.openai.speech_to_text.protocol import (
-    TranscriptionResponse,
-    TranscriptionResponseStreamChoice,
-    TranscriptionResponseVerbose,
-    TranscriptionSegment,
-    TranscriptionStreamResponse,
-    TranslationResponse,
-    TranslationResponseStreamChoice,
-    TranslationResponseVerbose,
-    TranslationSegment,
-    TranslationStreamResponse,
-)
 from vllm.entrypoints.utils import get_max_tokens
 from vllm.exceptions import VLLMValidationError
 from vllm.inputs import EncoderDecoderInput, EngineInput
@@ -51,6 +39,21 @@
 from vllm.tokenizers import get_tokenizer
 from vllm.utils.async_utils import merge_async_iterators
 
+from ..transcription.protocol import (
+    TranscriptionResponse,
+    TranscriptionResponseStreamChoice,
+    TranscriptionResponseVerbose,
+    TranscriptionSegment,
+    TranscriptionStreamResponse,
+)
+from ..translation.protocol import (
+    TranslationResponse,
+    TranslationResponseStreamChoice,
+    TranslationResponseVerbose,
+    TranslationSegment,
+    TranslationStreamResponse,
+)
+
 SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
 SpeechToTextResponseVerbose: TypeAlias = (
     TranscriptionResponseVerbose | TranslationResponseVerbose
diff --git a/vllm/entrypoints/speech_to_text/factories.py b/vllm/entrypoints/speech_to_text/factories.py
new file mode 100644
index 000000000000..f89c66092fc7
--- /dev/null
+++ b/vllm/entrypoints/speech_to_text/factories.py
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from typing import TYPE_CHECKING
+
+from fastapi import FastAPI
+
+if TYPE_CHECKING:
+    from argparse import Namespace
+
+    from starlette.datastructures import State
+
+    from vllm.engine.protocol import EngineClient
+    from vllm.entrypoints.logger import RequestLogger
+    from vllm.tasks import SupportedTask
+else:
+    RequestLogger = object
+
+
+def register_speech_to_text_api_routers(
+    app: FastAPI,
+    supported_tasks: tuple["SupportedTask", ...],
+):
+    if "realtime" in supported_tasks:
+        from .realtime.api_router import router as realtime_router
+
+        app.include_router(realtime_router)
+
+    if "transcription" in supported_tasks:
+        from .transcription.api_router import router as transcription_router
+
+        app.include_router(transcription_router)
+
+        from .translation.api_router import router as translation_router
+
+        app.include_router(translation_router)
+
+
+def add_websocket_metrics_middleware(app: FastAPI):
+    from vllm.entrypoints.speech_to_text.realtime.metrics import (
+        WebSocketMetricsMiddleware,
+    )
+
+    app.add_middleware(WebSocketMetricsMiddleware)
+
+
+def init_speech_to_text_state(
+    engine_client: "EngineClient",
+    state: "State",
+    args: "Namespace",
+    request_logger: RequestLogger | None,
+    supported_tasks: tuple["SupportedTask", ...],
+):
+    if "transcription" in supported_tasks:
+        from vllm.entrypoints.speech_to_text.transcription.serving import (
+            OpenAIServingTranscription,
+        )
+        from vllm.entrypoints.speech_to_text.translation.serving import (
+            OpenAIServingTranslation,
+        )
+
+        state.openai_serving_transcription = OpenAIServingTranscription(
+            engine_client,
+            state.openai_serving_models,
+            request_logger=request_logger,
+            enable_force_include_usage=args.enable_force_include_usage,
+        )
+
+        state.openai_serving_translation = OpenAIServingTranslation(
+            engine_client,
+            state.openai_serving_models,
+            request_logger=request_logger,
+            enable_force_include_usage=args.enable_force_include_usage,
+        )
+
+    if "realtime" in supported_tasks:
+        from .realtime.serving import OpenAIServingRealtime
+
+        state.openai_serving_realtime = OpenAIServingRealtime(
+            engine_client,
+            state.openai_serving_models,
+            request_logger=request_logger,
+        )
diff --git a/vllm/entrypoints/openai/speech_to_text/__init__.py b/vllm/entrypoints/speech_to_text/realtime/__init__.py
similarity index 100%
rename from vllm/entrypoints/openai/speech_to_text/__init__.py
rename to vllm/entrypoints/speech_to_text/realtime/__init__.py
diff --git a/vllm/entrypoints/openai/realtime/api_router.py b/vllm/entrypoints/speech_to_text/realtime/api_router.py
similarity index 51%
rename from vllm/entrypoints/openai/realtime/api_router.py
rename to vllm/entrypoints/speech_to_text/realtime/api_router.py
index c48191d14cd4..2529b28633eb 100644
--- a/vllm/entrypoints/openai/realtime/api_router.py
+++ b/vllm/entrypoints/speech_to_text/realtime/api_router.py
@@ -1,26 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING
 
-from fastapi import APIRouter, FastAPI, WebSocket
+from fastapi import APIRouter, WebSocket
 
-from vllm.entrypoints.openai.realtime.connection import RealtimeConnection
-from vllm.entrypoints.openai.realtime.serving import OpenAIServingRealtime
 from vllm.logger import init_logger
 
-logger = init_logger(__name__)
-
-if TYPE_CHECKING:
-    from argparse import Namespace
+from .connection import RealtimeConnection
 
-    from starlette.datastructures import State
+logger = init_logger(__name__)
 
-    from vllm.engine.protocol import EngineClient
-    from vllm.entrypoints.logger import RequestLogger
-    from vllm.tasks import SupportedTask
-else:
-    RequestLogger = object
 
 router = APIRouter()
 
@@ -48,27 +37,3 @@ async def realtime_endpoint(websocket: WebSocket):
 
     connection = RealtimeConnection(websocket, serving)
     await connection.handle_connection()
-
-
-def attach_router(app: FastAPI):
-    """Attach the realtime router to the FastAPI app."""
-    app.include_router(router)
-    logger.info("Realtime API router attached")
-
-
-def init_realtime_state(
-    engine_client: "EngineClient",
-    state: "State",
-    args: "Namespace",
-    request_logger: RequestLogger | None,
-    supported_tasks: tuple["SupportedTask", ...],
-):
-    state.openai_serving_realtime = (
-        OpenAIServingRealtime(
-            engine_client,
-            state.openai_serving_models,
-            request_logger=request_logger,
-        )
-        if "realtime" in supported_tasks
-        else None
-    )
diff --git a/vllm/entrypoints/openai/realtime/connection.py b/vllm/entrypoints/speech_to_text/realtime/connection.py
similarity index 98%
rename from vllm/entrypoints/openai/realtime/connection.py
rename to vllm/entrypoints/speech_to_text/realtime/connection.py
index 58af329054e1..c7d1af92990e 100644
--- a/vllm/entrypoints/openai/realtime/connection.py
+++ b/vllm/entrypoints/speech_to_text/realtime/connection.py
@@ -14,7 +14,10 @@
 
 from vllm import envs
 from vllm.entrypoints.openai.engine.protocol import ErrorResponse, UsageInfo
-from vllm.entrypoints.openai.realtime.protocol import (
+from vllm.exceptions import VLLMValidationError
+from vllm.logger import init_logger
+
+from .protocol import (
     ErrorEvent,
     InputAudioBufferAppend,
     InputAudioBufferCommit,
@@ -22,9 +25,7 @@
     TranscriptionDelta,
     TranscriptionDone,
 )
-from vllm.entrypoints.openai.realtime.serving import OpenAIServingRealtime
-from vllm.exceptions import VLLMValidationError
-from vllm.logger import init_logger
+from .serving import OpenAIServingRealtime
 
 logger = init_logger(__name__)
 
diff --git a/vllm/entrypoints/openai/realtime/metrics.py b/vllm/entrypoints/speech_to_text/realtime/metrics.py
similarity index 100%
rename from vllm/entrypoints/openai/realtime/metrics.py
rename to vllm/entrypoints/speech_to_text/realtime/metrics.py
diff --git a/vllm/entrypoints/openai/realtime/protocol.py b/vllm/entrypoints/speech_to_text/realtime/protocol.py
similarity index 100%
rename from vllm/entrypoints/openai/realtime/protocol.py
rename to vllm/entrypoints/speech_to_text/realtime/protocol.py
diff --git a/vllm/entrypoints/openai/realtime/serving.py b/vllm/entrypoints/speech_to_text/realtime/serving.py
similarity index 100%
rename from vllm/entrypoints/openai/realtime/serving.py
rename to vllm/entrypoints/speech_to_text/realtime/serving.py
diff --git a/vllm/entrypoints/speech_to_text/transcription/__init__.py b/vllm/entrypoints/speech_to_text/transcription/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/vllm/entrypoints/speech_to_text/transcription/api_router.py b/vllm/entrypoints/speech_to_text/transcription/api_router.py
new file mode 100644
index 000000000000..c4de6810ca66
--- /dev/null
+++ b/vllm/entrypoints/speech_to_text/transcription/api_router.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from http import HTTPStatus
+from typing import Annotated
+
+from fastapi import APIRouter, Form, Request
+from fastapi.responses import JSONResponse, StreamingResponse
+
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
+from vllm.entrypoints.utils import (
+    load_aware_call,
+    with_cancellation,
+)
+from vllm.logger import init_logger
+
+from .protocol import TranscriptionRequest, TranscriptionResponseVariant
+from .serving import OpenAIServingTranscription
+
+logger = init_logger(__name__)
+
+router = APIRouter()
+
+
+def transcription(request: Request) -> OpenAIServingTranscription:
+    return request.app.state.openai_serving_transcription
+
+
+@router.post(
+    "/v1/audio/transcriptions",
+    responses={
+        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.UNPROCESSABLE_ENTITY.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+@load_aware_call
+async def create_transcriptions(
+    raw_request: Request, request: Annotated[TranscriptionRequest, Form()]
+):
+    handler = transcription(raw_request)
+    if handler is None:
+        raise NotImplementedError("The model does not support Transcriptions API")
+
+    audio_data = await request.file.read()
+
+    generator = await handler.create_transcription(audio_data, request, raw_request)
+
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
+
+    elif isinstance(generator, TranscriptionResponseVariant):
+        return JSONResponse(content=generator.model_dump())
+
+    return StreamingResponse(content=generator, media_type="text/event-stream")
diff --git a/vllm/entrypoints/openai/speech_to_text/protocol.py b/vllm/entrypoints/speech_to_text/transcription/protocol.py
similarity index 59%
rename from vllm/entrypoints/openai/speech_to_text/protocol.py
rename to vllm/entrypoints/speech_to_text/transcription/protocol.py
index af1aaf086555..abf1a11a0eea 100644
--- a/vllm/entrypoints/openai/speech_to_text/protocol.py
+++ b/vllm/entrypoints/speech_to_text/transcription/protocol.py
@@ -6,7 +6,6 @@
 from http import HTTPStatus
 from typing import TYPE_CHECKING, Literal, TypeAlias
 
-import torch
 from fastapi import HTTPException, UploadFile
 from pydantic import (
     Field,
@@ -28,13 +27,14 @@
 )
 from vllm.utils import random_uuid
 
+from ..base.protocol import _LONG_INFO, AudioResponseFormat
+
 if TYPE_CHECKING:
     import numpy as np
 
     from vllm.config import ModelConfig, SpeechToTextConfig
 
 logger = init_logger(__name__)
-_LONG_INFO = torch.iinfo(torch.long)
 
 
 class TranscriptionResponseStreamChoice(OpenAIBaseModel):
@@ -52,10 +52,6 @@ class TranscriptionStreamResponse(OpenAIBaseModel):
     usage: UsageInfo | None = Field(default=None)
 
 
-## Protocols for Audio
-AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"]
-
-
 class TranscriptionRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/audio/createTranscription
@@ -407,276 +403,3 @@ class TranscriptionResponseVerbose(OpenAIBaseModel):
 TranscriptionResponseVariant: TypeAlias = (
     TranscriptionResponse | TranscriptionResponseVerbose
 )
-
-
-class TranslationResponseStreamChoice(OpenAIBaseModel):
-    delta: DeltaMessage
-    finish_reason: str | None = None
-    stop_reason: int | str | None = None
-
-
-class TranslationStreamResponse(OpenAIBaseModel):
-    id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}")
-    object: Literal["translation.chunk"] = "translation.chunk"
-    created: int = Field(default_factory=lambda: int(time.time()))
-    model: str
-    choices: list[TranslationResponseStreamChoice]
-    usage: UsageInfo | None = Field(default=None)
-
-
-class TranslationRequest(OpenAIBaseModel):
-    # Ordered by official OpenAI API documentation
-    # https://platform.openai.com/docs/api-reference/audio/createTranslation
-
-    file: UploadFile
-    """
-    The audio file object (not file name) to translate, in one of these
-    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
-    """
-
-    model: str | None = None
-    """ID of the model to use.
-    """
-
-    prompt: str = Field(default="")
-    """An optional text to guide the model's style or continue a previous audio
-    segment.
-
-    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
-    should match the audio language.
-    """
-
-    response_format: AudioResponseFormat = Field(default="json")
-    """
-    The format of the output, in one of these options: `json`, `text`, `srt`,
-    `verbose_json`, or `vtt`.
-    """
-
-    # TODO support additional sampling parameters
-    # --8<-- [start:translation-sampling-params]
-    use_beam_search: bool = False
-    """Whether or not beam search should be used."""
-
-    n: int = 1
-    """The number of beams to be used in beam search."""
-
-    length_penalty: float = 1.0
-    """Length penalty to be used for beam search."""
-
-    include_stop_str_in_output: bool = False
-    """Whether to include the stop strings in output text."""
-
-    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
-    """The seed to use for sampling."""
-
-    temperature: float = Field(default=0.0)
-    """The sampling temperature, between 0 and 1.
-
-    Higher values like 0.8 will make the output more random, while lower values
-    like 0.2 will make it more focused / deterministic. If set to 0, the model
-    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
-    to automatically increase the temperature until certain thresholds are hit.
-    """
-    # --8<-- [end:translation-sampling-params]
-
-    # --8<-- [start:translation-extra-params]
-    language: str | None = None
-    """The language of the input audio we translate from.
-
-    Supplying the input language in
-    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
-    will improve accuracy.
-    """
-
-    hotwords: str | None = None
-    """
-    hotwords refers to a list of important words or phrases that the model
-    should pay extra attention to during transcription.
-    """
-
-    to_language: str | None = None
-    """The language of the input audio we translate to.
-
-    Please note that this is not supported by all models, refer to the specific
-    model documentation for more details.
-    For instance, Whisper only supports `to_language=en`.
-    """
-
-    stream: bool | None = False
-    """Custom field not present in the original OpenAI definition. When set,
-    it will enable output to be streamed in a similar fashion as the Chat
-    Completion endpoint.
-    """
-    # Flattened stream option to simplify form data.
-    stream_include_usage: bool | None = False
-    stream_continuous_usage_stats: bool | None = False
-
-    max_completion_tokens: int | None = None
-    """The maximum number of tokens to generate."""
-    # --8<-- [end:translation-extra-params]
-
-    # Default sampling parameters for translation requests.
-    _DEFAULT_SAMPLING_PARAMS: dict = {
-        "temperature": 0,
-    }
-
-    def build_stt_params(
-        self,
-        audio: "np.ndarray",
-        stt_config: "SpeechToTextConfig",
-        model_config: "ModelConfig",
-        task_type: str,
-    ) -> SpeechToTextParams:
-        return SpeechToTextParams(
-            audio=audio,
-            stt_config=stt_config,
-            model_config=model_config,
-            language=self.language,
-            task_type=task_type,
-            request_prompt=self.prompt,
-            to_language=self.to_language,
-            hotwords=self.hotwords,
-        )
-
-    def to_beam_search_params(
-        self,
-        default_max_tokens: int,
-        default_sampling_params: dict | None = None,
-    ) -> BeamSearchParams:
-        if default_sampling_params is None:
-            default_sampling_params = {}
-
-        max_tokens = default_max_tokens
-        n = self.n if self.n is not None else 1
-
-        # NOTE: Temp 0 is a different fallback than completions
-        if (temperature := self.temperature) is None:
-            temperature = default_sampling_params.get("temperature", 0)
-
-        return BeamSearchParams(
-            beam_width=n,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            length_penalty=self.length_penalty,
-            include_stop_str_in_output=self.include_stop_str_in_output,
-        )
-
-    def to_sampling_params(
-        self, default_max_tokens: int, default_sampling_params: dict | None = None
-    ) -> SamplingParams:
-        max_tokens = default_max_tokens
-
-        if default_sampling_params is None:
-            default_sampling_params = {}
-        # Default parameters
-        if (temperature := self.temperature) is None:
-            temperature = default_sampling_params.get(
-                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
-            )
-
-        return SamplingParams.from_optional(
-            temperature=temperature,
-            max_tokens=max_tokens,
-            seed=self.seed,
-            output_kind=RequestOutputKind.DELTA
-            if self.stream
-            else RequestOutputKind.FINAL_ONLY,
-            skip_clone=True,  # Created fresh per request, safe to skip clone
-        )
-
-    @model_validator(mode="before")
-    @classmethod
-    def validate_stream_options(cls, data):
-        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
-        stream = data.get("stream", False)
-        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
-            # Find which specific stream option was set
-            invalid_param = next(
-                (so for so in stream_opts if data.get(so, False)),
-                "stream_include_usage",
-            )
-            raise VLLMValidationError(
-                "Stream options can only be defined when `stream=True`.",
-                parameter=invalid_param,
-            )
-
-        return data
-
-
-# Translation response objects
-class TranslationResponse(OpenAIBaseModel):
-    text: str
-    """The translated text."""
-
-
-class TranslationWord(OpenAIBaseModel):
-    end: float
-    """End time of the word in seconds."""
-
-    start: float
-    """Start time of the word in seconds."""
-
-    word: str
-    """The text content of the word."""
-
-
-class TranslationSegment(OpenAIBaseModel):
-    id: int
-    """Unique identifier of the segment."""
-
-    avg_logprob: float
-    """Average logprob of the segment.
-
-    If the value is lower than -1, consider the logprobs failed.
-    """
-
-    compression_ratio: float
-    """Compression ratio of the segment.
-
-    If the value is greater than 2.4, consider the compression failed.
-    """
-
-    end: float
-    """End time of the segment in seconds."""
-
-    no_speech_prob: float | None = None
-    """Probability of no speech in the segment.
-
-    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
-    this segment silent.
-    """
-
-    seek: int
-    """Seek offset of the segment."""
-
-    start: float
-    """Start time of the segment in seconds."""
-
-    temperature: float
-    """Temperature parameter used for generating the segment."""
-
-    text: str
-    """Text content of the segment."""
-
-    tokens: list[int]
-    """Array of token IDs for the text content."""
-
-
-class TranslationResponseVerbose(OpenAIBaseModel):
-    duration: str
-    """The duration of the input audio."""
-
-    language: str
-    """The language of the input audio."""
-
-    text: str
-    """The translated text."""
-
-    segments: list[TranslationSegment] | None = None
-    """Segments of the translated text and their corresponding details."""
-
-    words: list[TranslationWord] | None = None
-    """Extracted words and their corresponding timestamps."""
-
-
-TranslationResponseVariant: TypeAlias = TranslationResponse | TranslationResponseVerbose
diff --git a/vllm/entrypoints/openai/speech_to_text/serving.py b/vllm/entrypoints/speech_to_text/transcription/serving.py
similarity index 53%
rename from vllm/entrypoints/openai/speech_to_text/serving.py
rename to vllm/entrypoints/speech_to_text/transcription/serving.py
index bacd6d794deb..123c4c234ecb 100644
--- a/vllm/entrypoints/openai/speech_to_text/serving.py
+++ b/vllm/entrypoints/speech_to_text/transcription/serving.py
@@ -11,21 +11,17 @@
     RequestResponseMetadata,
 )
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
-from vllm.entrypoints.openai.speech_to_text.protocol import (
+from vllm.logger import init_logger
+from vllm.outputs import RequestOutput
+
+from ..base.serving import OpenAISpeechToText
+from .protocol import (
     TranscriptionRequest,
     TranscriptionResponse,
     TranscriptionResponseStreamChoice,
     TranscriptionResponseVerbose,
     TranscriptionStreamResponse,
-    TranslationRequest,
-    TranslationResponse,
-    TranslationResponseStreamChoice,
-    TranslationResponseVerbose,
-    TranslationStreamResponse,
 )
-from vllm.entrypoints.openai.speech_to_text.speech_to_text import OpenAISpeechToText
-from vllm.logger import init_logger
-from vllm.outputs import RequestOutput
 
 logger = init_logger(__name__)
 
@@ -101,76 +97,3 @@ async def transcription_stream_generator(
         )
         async for chunk in generator:
             yield chunk
-
-
-class OpenAIServingTranslation(OpenAISpeechToText):
-    """Handles translation requests."""
-
-    def __init__(
-        self,
-        engine_client: EngineClient,
-        models: OpenAIServingModels,
-        *,
-        request_logger: RequestLogger | None,
-        return_tokens_as_token_ids: bool = False,
-        enable_force_include_usage: bool = False,
-    ):
-        super().__init__(
-            engine_client=engine_client,
-            models=models,
-            request_logger=request_logger,
-            return_tokens_as_token_ids=return_tokens_as_token_ids,
-            task_type="translate",
-            enable_force_include_usage=enable_force_include_usage,
-        )
-
-    async def create_translation(
-        self,
-        audio_data: bytes,
-        request: TranslationRequest,
-        raw_request: Request | None = None,
-    ) -> (
-        TranslationResponse
-        | TranslationResponseVerbose
-        | AsyncGenerator[str, None]
-        | ErrorResponse
-    ):
-        """Translation API similar to OpenAI's API.
-
-        See https://platform.openai.com/docs/api-reference/audio/createTranslation
-        for the API specification. This API mimics the OpenAI translation API.
-        """
-        return await self._create_speech_to_text(
-            audio_data=audio_data,
-            request=request,
-            raw_request=raw_request,
-            response_class=(
-                TranslationResponseVerbose
-                if request.response_format == "verbose_json"
-                else TranslationResponse
-            ),
-            stream_generator_method=self.translation_stream_generator,
-        )
-
-    async def translation_stream_generator(
-        self,
-        request: TranslationRequest,
-        result_generator: list[AsyncGenerator[RequestOutput, None]],
-        request_id: str,
-        request_metadata: RequestResponseMetadata,
-        audio_duration_s: float,
-        separator: str,
-    ) -> AsyncGenerator[str, None]:
-        generator = self._speech_to_text_stream_generator(
-            request=request,
-            list_result_generator=result_generator,
-            request_id=request_id,
-            request_metadata=request_metadata,
-            audio_duration_s=audio_duration_s,
-            chunk_object_type="translation.chunk",
-            response_stream_choice_class=TranslationResponseStreamChoice,
-            stream_response_class=TranslationStreamResponse,
-            separator=separator,
-        )
-        async for chunk in generator:
-            yield chunk
diff --git a/vllm/entrypoints/speech_to_text/translation/__init__.py b/vllm/entrypoints/speech_to_text/translation/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/vllm/entrypoints/speech_to_text/translation/api_router.py b/vllm/entrypoints/speech_to_text/translation/api_router.py
new file mode 100644
index 000000000000..a68b098834bf
--- /dev/null
+++ b/vllm/entrypoints/speech_to_text/translation/api_router.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from http import HTTPStatus
+from typing import Annotated
+
+from fastapi import APIRouter, Form, Request
+from fastapi.responses import JSONResponse, StreamingResponse
+
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
+from vllm.entrypoints.utils import (
+    load_aware_call,
+    with_cancellation,
+)
+from vllm.logger import init_logger
+
+from .protocol import TranslationRequest, TranslationResponseVariant
+from .serving import OpenAIServingTranslation
+
+logger = init_logger(__name__)
+
+router = APIRouter()
+
+
+def translation(request: Request) -> OpenAIServingTranslation:
+    return request.app.state.openai_serving_translation
+
+
+@router.post(
+    "/v1/audio/translations",
+    responses={
+        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.UNPROCESSABLE_ENTITY.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+@load_aware_call
+async def create_translations(
+    request: Annotated[TranslationRequest, Form()], raw_request: Request
+):
+    handler = translation(raw_request)
+    if handler is None:
+        raise NotImplementedError("The model does not support Translations API")
+
+    audio_data = await request.file.read()
+
+    generator = await handler.create_translation(audio_data, request, raw_request)
+
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
+
+    elif isinstance(generator, TranslationResponseVariant):
+        return JSONResponse(content=generator.model_dump())
+
+    return StreamingResponse(content=generator, media_type="text/event-stream")
diff --git a/vllm/entrypoints/speech_to_text/translation/protocol.py b/vllm/entrypoints/speech_to_text/translation/protocol.py
new file mode 100644
index 000000000000..6e457682c2f6
--- /dev/null
+++ b/vllm/entrypoints/speech_to_text/translation/protocol.py
@@ -0,0 +1,308 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import time
+from typing import TYPE_CHECKING, Literal, TypeAlias
+
+from fastapi import UploadFile
+from pydantic import (
+    Field,
+    model_validator,
+)
+
+from vllm.config.speech_to_text import SpeechToTextParams
+from vllm.entrypoints.openai.engine.protocol import (
+    DeltaMessage,
+    OpenAIBaseModel,
+    UsageInfo,
+)
+from vllm.exceptions import VLLMValidationError
+from vllm.logger import init_logger
+from vllm.sampling_params import (
+    BeamSearchParams,
+    RequestOutputKind,
+    SamplingParams,
+)
+from vllm.utils import random_uuid
+
+from ..base.protocol import _LONG_INFO, AudioResponseFormat
+
+if TYPE_CHECKING:
+    import numpy as np
+
+    from vllm.config import ModelConfig, SpeechToTextConfig
+
+logger = init_logger(__name__)
+
+
+class TranslationResponseStreamChoice(OpenAIBaseModel):
+    delta: DeltaMessage
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
+
+
+class TranslationStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"trsl-{random_uuid()}")
+    object: Literal["translation.chunk"] = "translation.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[TranslationResponseStreamChoice]
+    usage: UsageInfo | None = Field(default=None)
+
+
+class TranslationRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/audio/createTranslation
+
+    file: UploadFile
+    """
+    The audio file object (not file name) to translate, in one of these
+    formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+    """
+
+    model: str | None = None
+    """ID of the model to use.
+    """
+
+    prompt: str = Field(default="")
+    """An optional text to guide the model's style or continue a previous audio
+    segment.
+
+    The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+    should match the audio language.
+    """
+
+    response_format: AudioResponseFormat = Field(default="json")
+    """
+    The format of the output, in one of these options: `json`, `text`, `srt`,
+    `verbose_json`, or `vtt`.
+    """
+
+    # TODO support additional sampling parameters
+    # --8<-- [start:translation-sampling-params]
+    use_beam_search: bool = False
+    """Whether or not beam search should be used."""
+
+    n: int = 1
+    """The number of beams to be used in beam search."""
+
+    length_penalty: float = 1.0
+    """Length penalty to be used for beam search."""
+
+    include_stop_str_in_output: bool = False
+    """Whether to include the stop strings in output text."""
+
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    """The seed to use for sampling."""
+
+    temperature: float = Field(default=0.0)
+    """The sampling temperature, between 0 and 1.
+
+    Higher values like 0.8 will make the output more random, while lower values
+    like 0.2 will make it more focused / deterministic. If set to 0, the model
+    will use [log probability](https://en.wikipedia.org/wiki/Log_probability)
+    to automatically increase the temperature until certain thresholds are hit.
+    """
+    # --8<-- [end:translation-sampling-params]
+
+    # --8<-- [start:translation-extra-params]
+    language: str | None = None
+    """The language of the input audio we translate from.
+
+    Supplying the input language in
+    [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format
+    will improve accuracy.
+    """
+
+    hotwords: str | None = None
+    """
+    hotwords refers to a list of important words or phrases that the model
+    should pay extra attention to during transcription.
+    """
+
+    to_language: str | None = None
+    """The language of the input audio we translate to.
+
+    Please note that this is not supported by all models, refer to the specific
+    model documentation for more details.
+    For instance, Whisper only supports `to_language=en`.
+    """
+
+    stream: bool | None = False
+    """Custom field not present in the original OpenAI definition. When set,
+    it will enable output to be streamed in a similar fashion as the Chat
+    Completion endpoint.
+    """
+    # Flattened stream option to simplify form data.
+    stream_include_usage: bool | None = False
+    stream_continuous_usage_stats: bool | None = False
+
+    max_completion_tokens: int | None = None
+    """The maximum number of tokens to generate."""
+    # --8<-- [end:translation-extra-params]
+
+    # Default sampling parameters for translation requests.
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "temperature": 0,
+    }
+
+    def build_stt_params(
+        self,
+        audio: "np.ndarray",
+        stt_config: "SpeechToTextConfig",
+        model_config: "ModelConfig",
+        task_type: str,
+    ) -> SpeechToTextParams:
+        return SpeechToTextParams(
+            audio=audio,
+            stt_config=stt_config,
+            model_config=model_config,
+            language=self.language,
+            task_type=task_type,
+            request_prompt=self.prompt,
+            to_language=self.to_language,
+            hotwords=self.hotwords,
+        )
+
+    def to_beam_search_params(
+        self,
+        default_max_tokens: int,
+        default_sampling_params: dict | None = None,
+    ) -> BeamSearchParams:
+        if default_sampling_params is None:
+            default_sampling_params = {}
+
+        max_tokens = default_max_tokens
+        n = self.n if self.n is not None else 1
+
+        # NOTE: Temp 0 is a different fallback than completions
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get("temperature", 0)
+
+        return BeamSearchParams(
+            beam_width=n,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            length_penalty=self.length_penalty,
+            include_stop_str_in_output=self.include_stop_str_in_output,
+        )
+
+    def to_sampling_params(
+        self, default_max_tokens: int, default_sampling_params: dict | None = None
+    ) -> SamplingParams:
+        max_tokens = default_max_tokens
+
+        if default_sampling_params is None:
+            default_sampling_params = {}
+        # Default parameters
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+
+        return SamplingParams.from_optional(
+            temperature=temperature,
+            max_tokens=max_tokens,
+            seed=self.seed,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream
+            else RequestOutputKind.FINAL_ONLY,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
+        )
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_stream_options(cls, data):
+        stream_opts = ["stream_include_usage", "stream_continuous_usage_stats"]
+        stream = data.get("stream", False)
+        if any(bool(data.get(so, False)) for so in stream_opts) and not stream:
+            # Find which specific stream option was set
+            invalid_param = next(
+                (so for so in stream_opts if data.get(so, False)),
+                "stream_include_usage",
+            )
+            raise VLLMValidationError(
+                "Stream options can only be defined when `stream=True`.",
+                parameter=invalid_param,
+            )
+
+        return data
+
+
+# Translation response objects
+class TranslationResponse(OpenAIBaseModel):
+    text: str
+    """The translated text."""
+
+
+class TranslationWord(OpenAIBaseModel):
+    end: float
+    """End time of the word in seconds."""
+
+    start: float
+    """Start time of the word in seconds."""
+
+    word: str
+    """The text content of the word."""
+
+
+class TranslationSegment(OpenAIBaseModel):
+    id: int
+    """Unique identifier of the segment."""
+
+    avg_logprob: float
+    """Average logprob of the segment.
+
+    If the value is lower than -1, consider the logprobs failed.
+    """
+
+    compression_ratio: float
+    """Compression ratio of the segment.
+
+    If the value is greater than 2.4, consider the compression failed.
+    """
+
+    end: float
+    """End time of the segment in seconds."""
+
+    no_speech_prob: float | None = None
+    """Probability of no speech in the segment.
+
+    If the value is higher than 1.0 and the `avg_logprob` is below -1, consider
+    this segment silent.
+    """
+
+    seek: int
+    """Seek offset of the segment."""
+
+    start: float
+    """Start time of the segment in seconds."""
+
+    temperature: float
+    """Temperature parameter used for generating the segment."""
+
+    text: str
+    """Text content of the segment."""
+
+    tokens: list[int]
+    """Array of token IDs for the text content."""
+
+
+class TranslationResponseVerbose(OpenAIBaseModel):
+    duration: str
+    """The duration of the input audio."""
+
+    language: str
+    """The language of the input audio."""
+
+    text: str
+    """The translated text."""
+
+    segments: list[TranslationSegment] | None = None
+    """Segments of the translated text and their corresponding details."""
+
+    words: list[TranslationWord] | None = None
+    """Extracted words and their corresponding timestamps."""
+
+
+TranslationResponseVariant: TypeAlias = TranslationResponse | TranslationResponseVerbose
diff --git a/vllm/entrypoints/speech_to_text/translation/serving.py b/vllm/entrypoints/speech_to_text/translation/serving.py
new file mode 100644
index 000000000000..257f8f74396e
--- /dev/null
+++ b/vllm/entrypoints/speech_to_text/translation/serving.py
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import AsyncGenerator
+
+from fastapi import Request
+
+from vllm.engine.protocol import EngineClient
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.engine.protocol import (
+    ErrorResponse,
+    RequestResponseMetadata,
+)
+from vllm.entrypoints.openai.models.serving import OpenAIServingModels
+from vllm.logger import init_logger
+from vllm.outputs import RequestOutput
+
+from ..base.serving import OpenAISpeechToText
+from .protocol import (
+    TranslationRequest,
+    TranslationResponse,
+    TranslationResponseStreamChoice,
+    TranslationResponseVerbose,
+    TranslationStreamResponse,
+)
+
+logger = init_logger(__name__)
+
+
+class OpenAIServingTranslation(OpenAISpeechToText):
+    """Handles translation requests."""
+
+    def __init__(
+        self,
+        engine_client: EngineClient,
+        models: OpenAIServingModels,
+        *,
+        request_logger: RequestLogger | None,
+        return_tokens_as_token_ids: bool = False,
+        enable_force_include_usage: bool = False,
+    ):
+        super().__init__(
+            engine_client=engine_client,
+            models=models,
+            request_logger=request_logger,
+            return_tokens_as_token_ids=return_tokens_as_token_ids,
+            task_type="translate",
+            enable_force_include_usage=enable_force_include_usage,
+        )
+
+    async def create_translation(
+        self,
+        audio_data: bytes,
+        request: TranslationRequest,
+        raw_request: Request | None = None,
+    ) -> (
+        TranslationResponse
+        | TranslationResponseVerbose
+        | AsyncGenerator[str, None]
+        | ErrorResponse
+    ):
+        """Translation API similar to OpenAI's API.
+
+        See https://platform.openai.com/docs/api-reference/audio/createTranslation
+        for the API specification. This API mimics the OpenAI translation API.
+        """
+        return await self._create_speech_to_text(
+            audio_data=audio_data,
+            request=request,
+            raw_request=raw_request,
+            response_class=(
+                TranslationResponseVerbose
+                if request.response_format == "verbose_json"
+                else TranslationResponse
+            ),
+            stream_generator_method=self.translation_stream_generator,
+        )
+
+    async def translation_stream_generator(
+        self,
+        request: TranslationRequest,
+        result_generator: list[AsyncGenerator[RequestOutput, None]],
+        request_id: str,
+        request_metadata: RequestResponseMetadata,
+        audio_duration_s: float,
+        separator: str,
+    ) -> AsyncGenerator[str, None]:
+        generator = self._speech_to_text_stream_generator(
+            request=request,
+            list_result_generator=result_generator,
+            request_id=request_id,
+            request_metadata=request_metadata,
+            audio_duration_s=audio_duration_s,
+            chunk_object_type="translation.chunk",
+            response_stream_choice_class=TranslationResponseStreamChoice,
+            stream_response_class=TranslationStreamResponse,
+            separator=separator,
+        )
+        async for chunk in generator:
+            yield chunk

From db045c2cb1e5bc613b4e6920aa785f5ca381b8e4 Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <noooop@126.com>
Date: Tue, 12 May 2026 11:36:35 +0800
Subject: [PATCH 2/3] Update vllm/entrypoints/speech_to_text/factories.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: wang.yuqi <noooop@126.com>
---
 vllm/entrypoints/speech_to_text/factories.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/speech_to_text/factories.py b/vllm/entrypoints/speech_to_text/factories.py
index f89c66092fc7..0a3223fd9be0 100644
--- a/vllm/entrypoints/speech_to_text/factories.py
+++ b/vllm/entrypoints/speech_to_text/factories.py
@@ -53,10 +53,10 @@ def init_speech_to_text_state(
     supported_tasks: tuple["SupportedTask", ...],
 ):
     if "transcription" in supported_tasks:
-        from vllm.entrypoints.speech_to_text.transcription.serving import (
+        from .transcription.serving import (
             OpenAIServingTranscription,
         )
-        from vllm.entrypoints.speech_to_text.translation.serving import (
+        from .translation.serving import (
             OpenAIServingTranslation,
         )
 

From b3c1a498c7f67fc578d78e5b81cd7543e905a34b Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <yuqi.wang@daocloud.io>
Date: Tue, 12 May 2026 11:47:44 +0800
Subject: [PATCH 3/3] refine

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
---
 vllm/entrypoints/openai/engine/serving.py    |  4 ++--
 vllm/entrypoints/openai/run_batch.py         |  4 +++-
 vllm/entrypoints/speech_to_text/factories.py | 13 ++++---------
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py
index 70dbe289b2a1..6152f915cc3a 100644
--- a/vllm/entrypoints/openai/engine/serving.py
+++ b/vllm/entrypoints/openai/engine/serving.py
@@ -46,11 +46,11 @@
     TokenizeCompletionRequest,
     TokenizeResponse,
 )
-from vllm.entrypoints.speech_to_text import (
+from vllm.entrypoints.speech_to_text.transcription.protocol import (
     TranscriptionRequest,
     TranscriptionResponse,
-    TranslationRequest,
 )
+from vllm.entrypoints.speech_to_text.translation.protocol import TranslationRequest
 from vllm.entrypoints.utils import create_error_response
 from vllm.inputs import EngineInput, PromptType
 from vllm.logger import init_logger
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 7e52c0e9d73d..327254e3acc1 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -51,10 +51,12 @@
     ScoreRequest,
     ScoreResponse,
 )
-from vllm.entrypoints.speech_to_text import (
+from vllm.entrypoints.speech_to_text.transcription.protocol import (
     TranscriptionRequest,
     TranscriptionResponse,
     TranscriptionResponseVerbose,
+)
+from vllm.entrypoints.speech_to_text.translation.protocol import (
     TranslationRequest,
     TranslationResponse,
     TranslationResponseVerbose,
diff --git a/vllm/entrypoints/speech_to_text/factories.py b/vllm/entrypoints/speech_to_text/factories.py
index 0a3223fd9be0..3625f6d2a8d5 100644
--- a/vllm/entrypoints/speech_to_text/factories.py
+++ b/vllm/entrypoints/speech_to_text/factories.py
@@ -38,9 +38,7 @@ def register_speech_to_text_api_routers(
 
 
 def add_websocket_metrics_middleware(app: FastAPI):
-    from vllm.entrypoints.speech_to_text.realtime.metrics import (
-        WebSocketMetricsMiddleware,
-    )
+    from .realtime.metrics import WebSocketMetricsMiddleware
 
     app.add_middleware(WebSocketMetricsMiddleware)
 
@@ -53,12 +51,7 @@ def init_speech_to_text_state(
     supported_tasks: tuple["SupportedTask", ...],
 ):
     if "transcription" in supported_tasks:
-        from .transcription.serving import (
-            OpenAIServingTranscription,
-        )
-        from .translation.serving import (
-            OpenAIServingTranslation,
-        )
+        from .transcription.serving import OpenAIServingTranscription
 
         state.openai_serving_transcription = OpenAIServingTranscription(
             engine_client,
@@ -67,6 +60,8 @@ def init_speech_to_text_state(
             enable_force_include_usage=args.enable_force_include_usage,
         )
 
+        from .translation.serving import OpenAIServingTranslation
+
         state.openai_serving_translation = OpenAIServingTranslation(
             engine_client,
             state.openai_serving_models,