Merged
30 changes: 30 additions & 0 deletions docs/serving/openai_compatible_server.md
@@ -57,6 +57,8 @@ We currently support the following OpenAI APIs:
- Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`).
- [Transcriptions API][transcriptions-api] (`/v1/audio/transcriptions`)
- Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`).
- [Translation API][translations-api] (`/v1/audio/translations`)
- Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`).

In addition, we have the following custom APIs:

@@ -374,6 +376,34 @@ The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
```

[](){ #translations-api }

### Translations API

Our Translation API is compatible with [OpenAI's Translations API](https://platform.openai.com/docs/api-reference/audio/createTranslation);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
Whisper models can translate audio from any of the 55 supported non-English languages into English.
Note that the popular `openai/whisper-large-v3-turbo` model does not support translation.

!!! note
To use the Translation API, please install with extra audio dependencies using `pip install vllm[audio]`.

Code example: <gh-file:examples/online_serving/openai_translation_client.py>

#### Extra Parameters

The following [sampling parameters][sampling-params] are supported:

```python
--8<-- "vllm/entrypoints/openai/protocol.py:translation-sampling-params"
```

The following extra parameters are supported:

```python
--8<-- "vllm/entrypoints/openai/protocol.py:translation-extra-params"
```
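
For illustration, the extra parameters above can be forwarded through the official OpenAI client's `extra_body` argument, which passes along fields the upstream API does not define. A minimal sketch; the helper name is hypothetical and the parameter values (`language`, `seed`, `repetition_penalty`) are examples taken from the example client linked above:

```python
from typing import Any


def build_translation_kwargs(audio_file: Any) -> dict:
    """Hypothetical helper: assemble the arguments for
    client.audio.translations.create, forwarding vLLM-specific extras
    through extra_body."""
    return dict(
        file=audio_file,
        model="openai/whisper-large-v3",
        response_format="json",
        temperature=0.0,
        # vLLM extensions, not part of the upstream OpenAI API.
        extra_body=dict(
            language="it",
            seed=4419,
            repetition_penalty=1.3,
        ),
    )
```

The resulting dict can then be splatted into `client.audio.translations.create(**kwargs)`.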

[](){ #tokenizer-api }

41 changes: 20 additions & 21 deletions examples/online_serving/openai_transcription_client.py
@@ -26,23 +26,12 @@

from vllm.assets.audio import AudioAsset

mary_had_lamb = AudioAsset("mary_had_lamb").get_local_path()
winning_call = AudioAsset("winning_call").get_local_path()

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)


def sync_openai():
def sync_openai(audio_path: str, client: OpenAI):
"""
Perform synchronous transcription using OpenAI-compatible API.
"""
with open(str(mary_had_lamb), "rb") as f:
with open(audio_path, "rb") as f:
transcription = client.audio.transcriptions.create(
file=f,
model="openai/whisper-large-v3",
Expand All @@ -58,8 +47,7 @@ def sync_openai():
print("transcription result:", transcription.text)


# OpenAI Transcription API client does not support streaming.
async def stream_openai_response():
async def stream_openai_response(audio_path: str, base_url: str, api_key: str):
"""
Perform streaming transcription using vLLM's raw HTTP streaming API.
"""
Expand All @@ -68,11 +56,12 @@ async def stream_openai_response():
"stream": True,
"model": "openai/whisper-large-v3",
}
url = openai_api_base + "/audio/transcriptions"
headers = {"Authorization": f"Bearer {openai_api_key}"}
url = base_url + "/audio/transcriptions"
headers = {"Authorization": f"Bearer {api_key}"}
print("transcription result:", end=" ")
# OpenAI Transcription API client does not support streaming.
async with httpx.AsyncClient() as client:
with open(str(winning_call), "rb") as f:
with open(audio_path, "rb") as f:
async with client.stream(
"POST", url, files={"file": f}, data=data, headers=headers
) as response:
Expand All @@ -93,10 +82,20 @@ async def stream_openai_response():


def main():
sync_openai()

mary_had_lamb = str(AudioAsset("mary_had_lamb").get_local_path())
winning_call = str(AudioAsset("winning_call").get_local_path())

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)

sync_openai(mary_had_lamb, client)
# Run the asynchronous function
asyncio.run(stream_openai_response())
asyncio.run(stream_openai_response(winning_call, openai_api_base, openai_api_key))


if __name__ == "__main__":
75 changes: 75 additions & 0 deletions examples/online_serving/openai_translation_client.py
@@ -0,0 +1,75 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import json

import httpx
from openai import OpenAI

from vllm.assets.audio import AudioAsset


def sync_openai(audio_path: str, client: OpenAI):
with open(audio_path, "rb") as f:
translation = client.audio.translations.create(
file=f,
model="openai/whisper-large-v3",
response_format="json",
temperature=0.0,
# Additional params not provided by OpenAI API.
extra_body=dict(
language="it",
seed=4419,
repetition_penalty=1.3,
),
)
print("translation result:", translation.text)


async def stream_openai_response(audio_path: str, base_url: str, api_key: str):
data = {
"language": "it",
"stream": True,
"model": "openai/whisper-large-v3",
}
url = base_url + "/audio/translations"
headers = {"Authorization": f"Bearer {api_key}"}
print("translation result:", end=" ")
# OpenAI translation API client does not support streaming.
async with httpx.AsyncClient() as client:
with open(audio_path, "rb") as f:
async with client.stream(
"POST", url, files={"file": f}, data=data, headers=headers
) as response:
async for line in response.aiter_lines():
# Each line is a JSON object prefixed with 'data: '
if line:
if line.startswith("data: "):
line = line[len("data: ") :]
# Last chunk, stream ends
if line.strip() == "[DONE]":
break
# Parse the JSON response
chunk = json.loads(line)
# Extract and print the content
content = chunk["choices"][0].get("delta", {}).get("content")
print(content, end="")
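
The chunk-parsing loop above can be factored into a small pure helper, shown here as a sketch for illustration only (the helper is not part of the PR):

```python
import json
from typing import Optional


def parse_sse_line(line: str) -> Optional[str]:
    """Parse one server-sent-events line from the streaming endpoint.

    Returns the extracted delta content, the sentinel "[DONE]" at the
    end of the stream, or None for empty keep-alive lines.
    """
    if not line:
        return None
    # Each payload line is a JSON object prefixed with 'data: '.
    if line.startswith("data: "):
        line = line[len("data: "):]
    if line.strip() == "[DONE]":
        return "[DONE]"
    chunk = json.loads(line)
    return chunk["choices"][0].get("delta", {}).get("content")
```

Keeping the parsing pure makes the streaming loop itself trivial to unit-test without a running server.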


def main():
foscolo = str(AudioAsset("azacinto_foscolo").get_local_path())

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
sync_openai(foscolo, client)
# Run the asynchronous function
asyncio.run(stream_openai_response(foscolo, openai_api_base, openai_api_key))


if __name__ == "__main__":
main()
2 changes: 2 additions & 0 deletions tests/entrypoints/openai/test_transcription_validation.py
@@ -82,6 +82,8 @@ async def test_long_audio_request(mary_had_lamb):

mary_had_lamb.seek(0)
audio, sr = librosa.load(mary_had_lamb)
# Add small silence after each audio for repeatability in the split process
audio = np.pad(audio, (0, 1600))
repeated_audio = np.tile(audio, 10)
# Repeated audio to buffer
buffer = io.BytesIO()
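
The padding step added above can be checked in isolation; a minimal sketch using dummy data and an assumed 16 kHz sample rate (not the actual test fixture):

```python
import numpy as np

sr = 16000
audio = np.ones(sr, dtype=np.float32)  # 1 s of dummy non-silent audio
padded = np.pad(audio, (0, 1600))      # append ~0.1 s of trailing silence
repeated = np.tile(padded, 10)         # 10 back-to-back repetitions

# Each repetition now ends in silence, so server-side chunk splitting
# lands on a stable boundary rather than mid-word.
print(padded.shape[0], repeated.shape[0])  # → 17600 176000
```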
172 changes: 172 additions & 0 deletions tests/entrypoints/openai/test_translation_validation.py
@@ -0,0 +1,172 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import io
# imports for guided decoding tests
import json
from unittest.mock import patch

import librosa
import numpy as np
import pytest
import soundfile as sf
from openai._base_client import AsyncAPIClient

from vllm.assets.audio import AudioAsset

from ...utils import RemoteOpenAIServer


@pytest.fixture
def foscolo():
# Test translation it->en
path = AudioAsset('azacinto_foscolo').get_local_path()
with open(str(path), "rb") as f:
yield f


# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
@pytest.mark.asyncio
async def test_basic_audio(foscolo):
model_name = "openai/whisper-small"
server_args = ["--enforce-eager"]
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
translation = await client.audio.translations.create(
model=model_name,
file=foscolo,
response_format="text",
# TODO remove once language detection is implemented
extra_body=dict(language="it"),
temperature=0.0)
out = json.loads(translation)['text'].strip()
assert "Nor will I ever touch the sacred" in out


@pytest.mark.asyncio
async def test_audio_prompt(foscolo):
model_name = "openai/whisper-small"
server_args = ["--enforce-eager"]
# Condition whisper on starting text
prompt = "Nor have I ever"
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
transcription = await client.audio.translations.create(
model=model_name,
file=foscolo,
prompt=prompt,
extra_body=dict(language="it"),
response_format="text",
temperature=0.0)
out = json.loads(transcription)['text']
assert "Nor will I ever touch the sacred" not in out
assert prompt not in out


@pytest.mark.asyncio
async def test_non_asr_model(foscolo):
# text to text model
model_name = "JackFram/llama-68m"
server_args = ["--enforce-eager"]
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
res = await client.audio.translations.create(model=model_name,
file=foscolo,
temperature=0.0)
assert res.code == 400 and not res.text
assert res.message == "The model does not support Translations API"


@pytest.mark.asyncio
async def test_streaming_response(foscolo):
model_name = "openai/whisper-small"
server_args = ["--enforce-eager"]
translation = ""
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
res_no_stream = await client.audio.translations.create(
model=model_name,
file=foscolo,
response_format="json",
extra_body=dict(language="it"),
temperature=0.0)
# Unfortunately this only works when the openai client is patched
# to use streaming mode, not exposed in the translation api.
original_post = AsyncAPIClient.post

async def post_with_stream(*args, **kwargs):
kwargs['stream'] = True
return await original_post(*args, **kwargs)

with patch.object(AsyncAPIClient, "post", new=post_with_stream):
client = remote_server.get_async_client()
res = await client.audio.translations.create(model=model_name,
file=foscolo,
temperature=0.0,
extra_body=dict(
stream=True,
language="it"))
# Reconstruct from chunks and validate
async for chunk in res:
# just a chunk
text = chunk.choices[0]['delta']['content']
translation += text

assert translation == res_no_stream.text


@pytest.mark.asyncio
async def test_stream_options(foscolo):
model_name = "openai/whisper-small"
server_args = ["--enforce-eager"]
with RemoteOpenAIServer(model_name, server_args) as remote_server:
original_post = AsyncAPIClient.post

async def post_with_stream(*args, **kwargs):
kwargs['stream'] = True
return await original_post(*args, **kwargs)

with patch.object(AsyncAPIClient, "post", new=post_with_stream):
client = remote_server.get_async_client()
res = await client.audio.translations.create(
model=model_name,
file=foscolo,
temperature=0.0,
extra_body=dict(language="it",
stream=True,
stream_include_usage=True,
stream_continuous_usage_stats=True))
final = False
continuous = True
async for chunk in res:
if not len(chunk.choices):
# final usage sent
final = True
else:
continuous = continuous and hasattr(chunk, 'usage')
assert final and continuous


@pytest.mark.asyncio
async def test_long_audio_request(foscolo):
model_name = "openai/whisper-small"
server_args = ["--enforce-eager"]

foscolo.seek(0)
audio, sr = librosa.load(foscolo)
repeated_audio = np.tile(audio, 2)
# Repeated audio to buffer
buffer = io.BytesIO()
sf.write(buffer, repeated_audio, sr, format='WAV')
buffer.seek(0)
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
translation = await client.audio.translations.create(
model=model_name,
file=buffer,
extra_body=dict(language="it"),
response_format="text",
temperature=0.0)
out = json.loads(translation)['text'].strip().lower()
# TODO investigate higher model uncertainty for longer translations.
assert out.count("nor will i ever") == 2
Comment on lines +171 to +172

Collaborator
@NickLucche @DarkLight1337 @ywang96

We're testing vLLM with PyTorch 2.8, and this assertion gets triggered: the model generates different text under PyTorch 2.8 than under PyTorch 2.7. I see a TODO about higher model uncertainty; is this behavior expected?

PyTorch 2.7:

nor will i ever touch the sacred places where my body is made of jacquero, my treasure, which mirrors you in the shadow of the greek sea, from which the virgines are born to come, and faithfully to that island he confuses them with his first smile, he waves his naked and your foreheads, the incline towards him that the water sings of fatal fate, and the different exile, for which, beautiful of fame and disdain, nor will i ever touch i will touch the sacred places where my body makes the water drop, my zacinto, which mirrors you in the wave of the greek sea, from which the virgin water comes, and faithfully to that island it flutters with its first smile. the waves are not a tack, your clean clouds and your fronts, the incline towards him that the water sings of fatal, and the different exile, for which, beautiful of fame and of adventure,

PyTorch 2.8:
nor do i ever touch the sacred places where my body is made of jacquero, my treasure, which mirrors you in the shadow of the greek sea, from which the virgines come into water, and faithfully to that island he confuses them with his first smile, he waves his naked and naked limbs, the incline towards him that the water sings of fatal fate, and the different exile, for which, beautiful of fame and disdain, nor do i ever touch i will touch the sacred places where my body makes the water drop, my zacinto, which mirrors you in the wave of the greek sea, from which the virgin water comes, and faithfully to that island it flutters with its first smile. the waves are not a tack, your clean clouds and your fronts, the incline towards him that the water sings of fatal, and the different exile, for which, beautiful of fame and of adventure

Collaborator Author
Hey, I think we can safely change the test here.
My conclusion was that a) translation wasn't a primary task (the latest whisper `-turbo` model doesn't support it), so the model isn't as resilient, and b) this particular sample may just be hard, with the scores for the second token ending up quite similar.

Either way, I will put up a PR for this, as I've also just witnessed that behavior on Blackwell.
