@@ -10,6 +10,7 @@
 
 import pydantic
 import websockets
+from openai.types.realtime import realtime_audio_config as _rt_audio_config
 from openai.types.realtime.conversation_item import (
     ConversationItem,
     ConversationItem as OpenAIConversationItem,
@@ -29,11 +30,6 @@
 from openai.types.realtime.input_audio_buffer_commit_event import (
     InputAudioBufferCommitEvent as OpenAIInputAudioBufferCommitEvent,
 )
-from openai.types.realtime.realtime_audio_config import (
-    RealtimeAudioConfig as OpenAIRealtimeAudioConfig,
-    RealtimeAudioConfigInput as OpenAIRealtimeAudioInput,
-    RealtimeAudioConfigOutput as OpenAIRealtimeAudioOutput,
-)
 from openai.types.realtime.realtime_client_event import (
     RealtimeClientEvent as OpenAIRealtimeClientEvent,
 )
@@ -62,6 +58,9 @@
 from openai.types.realtime.realtime_tracing_config import (
     TracingConfiguration as OpenAITracingConfiguration,
 )
+from openai.types.realtime.realtime_transcription_session_create_request import (
+    RealtimeTranscriptionSessionCreateRequest as OpenAIRealtimeTranscriptionSessionCreateRequest,
+)
 from openai.types.realtime.response_audio_delta_event import ResponseAudioDeltaEvent
 from openai.types.realtime.response_cancel_event import (
     ResponseCancelEvent as OpenAIResponseCancelEvent,
@@ -535,7 +534,8 @@ async def _handle_ws_event(self, event: dict[str, Any]):
             if status not in ("in_progress", "completed", "incomplete"):
                 is_done = event.get("type") == "response.output_item.done"
                 status = "completed" if is_done else "in_progress"
-            type_adapter = TypeAdapter(RealtimeMessageItem)
+            # Explicitly type the adapter for mypy
+            type_adapter: TypeAdapter[RealtimeMessageItem] = TypeAdapter(RealtimeMessageItem)
             message_item: RealtimeMessageItem = type_adapter.validate_python(
                 {
                     "item_id": item.get("id", ""),
@@ -559,21 +559,21 @@ async def _handle_ws_event(self, event: dict[str, Any]):
         except Exception as e:
             event_type = event.get("type", "unknown") if isinstance(event, dict) else "unknown"
             logger.error(f"Failed to validate server event: {event}", exc_info=True)
-            event = RealtimeModelExceptionEvent(
+            exception_event = RealtimeModelExceptionEvent(
                 exception=e,
                 context=f"Failed to validate server event: {event_type}",
             )
-            await self._emit_event(event)
+            await self._emit_event(exception_event)
             return
 
         if parsed.type == "response.output_audio.delta":
             await self._handle_audio_delta(parsed)
         elif parsed.type == "response.output_audio.done":
-            event = RealtimeModelAudioDoneEvent(
+            audio_done_event = RealtimeModelAudioDoneEvent(
                 item_id=parsed.item_id,
                 content_index=parsed.content_index,
             )
-            await self._emit_event(event)
+            await self._emit_event(audio_done_event)
         elif parsed.type == "input_audio_buffer.speech_started":
             # On VAD speech start, immediately stop local playback so the user can
             # barge‑in without overlapping assistant audio.
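
The renames above (`event` to `exception_event` and `audio_done_event`) avoid re-binding a name that is already typed as the incoming `dict[str, Any]`; under strict mypy settings, assigning an event-model instance to the same variable is an incompatible re-assignment. A contrived illustration, with hypothetical names that are not from the SDK:

    from typing import Any

    class ExceptionEvent:
        def __init__(self, context: str) -> None:
            self.context = context

    def handle(event: dict[str, Any]) -> None:
        if "type" not in event:
            # Re-using `event` here would re-bind a dict-typed variable to an
            # ExceptionEvent, which mypy flags as an incompatible assignment;
            # a distinct name keeps both types precise.
            exception_event = ExceptionEvent(context="missing event type")
            print(exception_event.context)
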
@@ -673,17 +673,39 @@ async def _handle_ws_event(self, event: dict[str, Any]):
                 )
             )
 
-    def _update_created_session(self, session: OpenAISessionCreateRequest) -> None:
-        self._created_session = session
-        if (
-            session.audio is not None
-            and session.audio.output is not None
-            and session.audio.output.format is not None
-        ):
-            audio_format = session.audio.output.format
-            self._audio_state_tracker.set_audio_format(audio_format)
-            if self._playback_tracker:
-                self._playback_tracker.set_audio_format(audio_format)
+    def _update_created_session(
+        self,
+        session: OpenAISessionCreateRequest | OpenAIRealtimeTranscriptionSessionCreateRequest,
+    ) -> None:
+        # Only store/playback-format information for realtime sessions (not transcription-only)
+        if isinstance(session, OpenAISessionCreateRequest):
+            self._created_session = session
+            if (
+                session.audio is not None
+                and session.audio.output is not None
+                and session.audio.output.format is not None
+            ):
+                # Convert OpenAI audio format objects to our internal string format
+                from openai.types.realtime.realtime_audio_formats import (
+                    AudioPCM,
+                    AudioPCMA,
+                    AudioPCMU,
+                )
+
+                fmt = session.audio.output.format
+                if isinstance(fmt, AudioPCM):
+                    normalized = "pcm16"
+                elif isinstance(fmt, AudioPCMU):
+                    normalized = "g711_ulaw"
+                elif isinstance(fmt, AudioPCMA):
+                    normalized = "g711_alaw"
+                else:
+                    # Fallback for unknown/str-like values
+                    normalized = cast("str", getattr(fmt, "type", str(fmt)))
+
+                self._audio_state_tracker.set_audio_format(normalized)
+                if self._playback_tracker:
+                    self._playback_tracker.set_audio_format(normalized)
 
     async def _update_session_config(self, model_settings: RealtimeSessionModelSettings) -> None:
         session_config = self._get_session_config(model_settings)
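
For reference, the normalization introduced above can be read as a standalone helper. `normalize_audio_format` is an illustrative name, not part of the patch, but the imports and the mapping mirror the hunk exactly:

    from typing import Any, cast

    from openai.types.realtime.realtime_audio_formats import (
        AudioPCM,
        AudioPCMA,
        AudioPCMU,
    )

    def normalize_audio_format(fmt: Any) -> str:
        # Map the SDK's typed audio-format objects to the internal string
        # identifiers used by the audio state and playback trackers.
        if isinstance(fmt, AudioPCM):
            return "pcm16"
        if isinstance(fmt, AudioPCMU):
            return "g711_ulaw"
        if isinstance(fmt, AudioPCMA):
            return "g711_alaw"
        # Unknown or string-like values fall back to their `type` attribute.
        return cast(str, getattr(fmt, "type", str(fmt)))
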
@@ -718,6 +740,11 @@ def _get_session_config(
             DEFAULT_MODEL_SETTINGS.get("output_audio_format"),
         )
 
+        # Avoid direct imports of non-exported names by referencing via module
+        OpenAIRealtimeAudioConfig = _rt_audio_config.RealtimeAudioConfig
+        OpenAIRealtimeAudioInput = _rt_audio_config.RealtimeAudioConfigInput  # type: ignore[attr-defined]
+        OpenAIRealtimeAudioOutput = _rt_audio_config.RealtimeAudioConfigOutput  # type: ignore[attr-defined]
+
         input_audio_config = None
         if any(
             value is not None
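
Resolving `RealtimeAudioConfigInput` and `RealtimeAudioConfigOutput` through the `_rt_audio_config` module alias (imported at the top of the file) defers the name lookup to attribute access. The patch comment attributes this to the names not being exported: the classes exist at runtime, but a direct `from openai.types.realtime.realtime_audio_config import ...` of a non-exported name fails type checking, whereas attribute access needs only the narrowly scoped `# type: ignore[attr-defined]` shown above. A condensed sketch of the trade-off, assuming that export behavior:

    # Rejected by mypy when the name is not re-exported:
    # from openai.types.realtime.realtime_audio_config import RealtimeAudioConfigInput

    from openai.types.realtime import realtime_audio_config as _rt_audio_config

    # Attribute access resolves at runtime; the ignore comment confines the
    # suppression to this single line.
    AudioInput = _rt_audio_config.RealtimeAudioConfigInput  # type: ignore[attr-defined]
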
@@ -816,7 +843,7 @@ def conversation_item_to_realtime_message_item(
         ),
     ):
         raise ValueError("Unsupported conversation item type for message conversion.")
-    content: list[dict] = []
+    content: list[dict[str, Any]] = []
     for each in item.content:
         c = each.model_dump()
         if each.type == "output_text":