Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions python/sglang/srt/entrypoints/http_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
Query,
Request,
UploadFile,
WebSocket,
)
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
Expand Down Expand Up @@ -1580,6 +1581,12 @@ async def openai_v1_audio_transcriptions(
)


@app.websocket("/v1/audio/transcriptions/stream")
async def openai_v1_audio_transcriptions_ws(ws: WebSocket):
"""WebSocket endpoint for real-time streaming audio transcription."""
await ws.app.state.openai_serving_transcription.handle_websocket(ws)


@app.get("/v1/models", response_class=ORJSONResponse)
async def available_models():
"""Show available models. OpenAI-compatible endpoint."""
Expand Down
49 changes: 16 additions & 33 deletions python/sglang/srt/entrypoints/openai/serving_transcription.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import uuid
from typing import TYPE_CHECKING, AsyncGenerator, List, Optional, Union

from fastapi import Request
from fastapi import Request, WebSocket
from fastapi.responses import ORJSONResponse, Response, StreamingResponse

from sglang.srt.entrypoints.openai.protocol import (
Expand All @@ -45,6 +45,7 @@
from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
from sglang.srt.entrypoints.openai.streaming_asr import (
StreamingASRState,
process_asr_chunk,
split_audio_chunks,
)
from sglang.srt.entrypoints.openai.transcription_adapters import resolve_adapter
Expand Down Expand Up @@ -272,7 +273,6 @@ async def _generate_chunked_asr_stream(
- Token-level streaming within chunks (stream=True)
- Encoder window caching across chunks
- Cross-chunk KV cache reuse
- WebSocket endpoint for real-time audio input
"""
created_time = int(time.time())
request_id = f"{self._request_id_prefix()}{uuid.uuid4().hex}"
Expand All @@ -288,43 +288,18 @@ async def _generate_chunked_asr_stream(
logger.info("[streaming_asr] client disconnected, stopping")
break
is_last = i == len(chunks) - 1
prompt = self._adapter.prompt_template + state.get_prefix_text()

chunk_request = GenerateReqInput(
text=prompt,
delta = await process_asr_chunk(
tokenizer_manager=self.tokenizer_manager,
adapter=self._adapter,
state=state,
audio_data=chunk_audio,
sampling_params=adapted_request.sampling_params,
stream=False,
modalities=["audio"],
is_last=is_last,
raw_request=raw_request,
routing_key=self.extract_routing_key(raw_request),
)

try:
ret = None
async for ret in self.tokenizer_manager.generate_request(
chunk_request, raw_request
):
break
except asyncio.CancelledError:
raise
except ValueError as e:
logger.warning(
"[streaming_asr] chunk %d failed with ValueError: %s", i, e
)
continue

if ret is None:
logger.warning("[streaming_asr] empty response for chunk %d", i)
continue

text = self._adapter.postprocess_text(ret.get("text", ""))

if is_last:
state.full_transcript = text
delta = state.finalize()
else:
delta = state.update(text)

if delta:
for word in delta.split(" "):
if not word:
Expand Down Expand Up @@ -366,3 +341,11 @@ async def _generate_chunked_asr_stream(
yield f"data: {error}\n\n"

yield "data: [DONE]\n\n"

async def handle_websocket(self, websocket: WebSocket) -> None:
"""Handle a Realtime transcription session over WebSocket."""
from sglang.srt.entrypoints.openai.serving_transcription_websocket import (
handle_realtime_transcription,
)

await handle_realtime_transcription(websocket, self)
Loading
Loading