@@ -10,6 +10,7 @@
 
 import pydantic
 import websockets
+from openai.types.realtime import realtime_audio_config as _rt_audio_config
 from openai.types.realtime.conversation_item import (
     ConversationItem,
     ConversationItem as OpenAIConversationItem,
@@ -29,11 +30,6 @@
 from openai.types.realtime.input_audio_buffer_commit_event import (
     InputAudioBufferCommitEvent as OpenAIInputAudioBufferCommitEvent,
 )
-from openai.types.realtime.realtime_audio_config import (
-    RealtimeAudioConfig as OpenAIRealtimeAudioConfig,
-    RealtimeAudioConfigInput as OpenAIRealtimeAudioInput,
-    RealtimeAudioConfigOutput as OpenAIRealtimeAudioOutput,
-)
 from openai.types.realtime.realtime_client_event import (
     RealtimeClientEvent as OpenAIRealtimeClientEvent,
 )
@@ -62,6 +58,9 @@
 from openai.types.realtime.realtime_tracing_config import (
     TracingConfiguration as OpenAITracingConfiguration,
 )
+from openai.types.realtime.realtime_transcription_session_create_request import (
+    RealtimeTranscriptionSessionCreateRequest as OpenAIRealtimeTranscriptionSessionCreateRequest,
+)
 from openai.types.realtime.response_audio_delta_event import ResponseAudioDeltaEvent
 from openai.types.realtime.response_cancel_event import (
     ResponseCancelEvent as OpenAIResponseCancelEvent,
@@ -535,7 +534,8 @@ async def _handle_ws_event(self, event: dict[str, Any]):
             if status not in ("in_progress", "completed", "incomplete"):
                 is_done = event.get("type") == "response.output_item.done"
                 status = "completed" if is_done else "in_progress"
-            type_adapter = TypeAdapter(RealtimeMessageItem)
+            # Explicitly type the adapter for mypy
+            type_adapter: TypeAdapter[RealtimeMessageItem] = TypeAdapter(RealtimeMessageItem)
             message_item: RealtimeMessageItem = type_adapter.validate_python(
                 {
                     "item_id": item.get("id", ""),
@@ -559,21 +559,21 @@ async def _handle_ws_event(self, event: dict[str, Any]):
         except Exception as e:
             event_type = event.get("type", "unknown") if isinstance(event, dict) else "unknown"
             logger.error(f"Failed to validate server event: {event}", exc_info=True)
-            event = RealtimeModelExceptionEvent(
+            exception_event = RealtimeModelExceptionEvent(
                 exception=e,
                 context=f"Failed to validate server event: {event_type}",
             )
-            await self._emit_event(event)
+            await self._emit_event(exception_event)
             return
 
         if parsed.type == "response.output_audio.delta":
             await self._handle_audio_delta(parsed)
         elif parsed.type == "response.output_audio.done":
-            event = RealtimeModelAudioDoneEvent(
+            audio_done_event = RealtimeModelAudioDoneEvent(
                 item_id=parsed.item_id,
                 content_index=parsed.content_index,
             )
-            await self._emit_event(event)
+            await self._emit_event(audio_done_event)
         elif parsed.type == "input_audio_buffer.speech_started":
             # On VAD speech start, immediately stop local playback so the user can
             # barge‑in without overlapping assistant audio.
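
The renames above (`event` to `exception_event` and `audio_done_event`) avoid re-binding a name that is already typed as the incoming `dict[str, Any]`; under strict mypy settings, assigning an event-model instance to the same variable is an incompatible re-assignment. A contrived illustration, with hypothetical names that are not from the SDK:

    from typing import Any

    class ExceptionEvent:
        def __init__(self, context: str) -> None:
            self.context = context

    def handle(event: dict[str, Any]) -> None:
        if "type" not in event:
            # Re-using `event` here would re-bind a dict-typed variable to an
            # ExceptionEvent, which mypy flags as an incompatible assignment;
            # a distinct name keeps both types precise.
            exception_event = ExceptionEvent(context="missing event type")
            print(exception_event.context)
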
@@ -673,17 +673,39 @@ async def _handle_ws_event(self, event: dict[str, Any]):
                 )
             )
 
-    def _update_created_session(self, session: OpenAISessionCreateRequest) -> None:
-        self._created_session = session
-        if (
-            session.audio is not None
-            and session.audio.output is not None
-            and session.audio.output.format is not None
-        ):
-            audio_format = session.audio.output.format
-            self._audio_state_tracker.set_audio_format(audio_format)
-            if self._playback_tracker:
-                self._playback_tracker.set_audio_format(audio_format)
+    def _update_created_session(
+        self,
+        session: OpenAISessionCreateRequest | OpenAIRealtimeTranscriptionSessionCreateRequest,
+    ) -> None:
+        # Only store/playback-format information for realtime sessions (not transcription-only)
+        if isinstance(session, OpenAISessionCreateRequest):
+            self._created_session = session
+            if (
+                session.audio is not None
+                and session.audio.output is not None
+                and session.audio.output.format is not None
+            ):
+                # Convert OpenAI audio format objects to our internal string format
+                from openai.types.realtime.realtime_audio_formats import (
+                    AudioPCM,
+                    AudioPCMA,
+                    AudioPCMU,
+                )
+
+                fmt = session.audio.output.format
+                if isinstance(fmt, AudioPCM):
+                    normalized = "pcm16"
+                elif isinstance(fmt, AudioPCMU):
+                    normalized = "g711_ulaw"
+                elif isinstance(fmt, AudioPCMA):
+                    normalized = "g711_alaw"
+                else:
+                    # Fallback for unknown/str-like values
+                    normalized = cast("str", getattr(fmt, "type", str(fmt)))
+
+                self._audio_state_tracker.set_audio_format(normalized)
+                if self._playback_tracker:
+                    self._playback_tracker.set_audio_format(normalized)
 
     async def _update_session_config(self, model_settings: RealtimeSessionModelSettings) -> None:
         session_config = self._get_session_config(model_settings)
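
For reference, the normalization introduced above can be read as a standalone helper. `normalize_audio_format` is an illustrative name, not part of the patch, but the imports and the mapping mirror the hunk exactly:

    from typing import Any, cast

    from openai.types.realtime.realtime_audio_formats import (
        AudioPCM,
        AudioPCMA,
        AudioPCMU,
    )

    def normalize_audio_format(fmt: Any) -> str:
        # Map the SDK's typed audio-format objects to the internal string
        # identifiers used by the audio state and playback trackers.
        if isinstance(fmt, AudioPCM):
            return "pcm16"
        if isinstance(fmt, AudioPCMU):
            return "g711_ulaw"
        if isinstance(fmt, AudioPCMA):
            return "g711_alaw"
        # Unknown or string-like values fall back to their `type` attribute.
        return cast(str, getattr(fmt, "type", str(fmt)))
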
@@ -718,6 +740,11 @@ def _get_session_config(
             DEFAULT_MODEL_SETTINGS.get("output_audio_format"),
         )
 
+        # Avoid direct imports of non-exported names by referencing via module
+        OpenAIRealtimeAudioConfig = _rt_audio_config.RealtimeAudioConfig
+        OpenAIRealtimeAudioInput = _rt_audio_config.RealtimeAudioConfigInput  # type: ignore[attr-defined]
+        OpenAIRealtimeAudioOutput = _rt_audio_config.RealtimeAudioConfigOutput  # type: ignore[attr-defined]
+
         input_audio_config = None
         if any(
             value is not None
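
Resolving `RealtimeAudioConfigInput` and `RealtimeAudioConfigOutput` through the `_rt_audio_config` module alias (imported at the top of the file) defers the name lookup to attribute access. The patch comment attributes this to the names not being exported: the classes exist at runtime, but a direct `from openai.types.realtime.realtime_audio_config import ...` of a non-exported name fails type checking, whereas attribute access needs only the narrowly scoped `# type: ignore[attr-defined]` shown above. A condensed sketch of the trade-off, assuming that export behavior:

    # Rejected by mypy when the name is not re-exported:
    # from openai.types.realtime.realtime_audio_config import RealtimeAudioConfigInput

    from openai.types.realtime import realtime_audio_config as _rt_audio_config

    # Attribute access resolves at runtime; the ignore comment confines the
    # suppression to this single line.
    AudioInput = _rt_audio_config.RealtimeAudioConfigInput  # type: ignore[attr-defined]
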
@@ -816,7 +843,7 @@ def conversation_item_to_realtime_message_item(
         ),
     ):
         raise ValueError("Unsupported conversation item type for message conversion.")
-    content: list[dict] = []
+    content: list[dict[str, Any]] = []
     for each in item.content:
         c = each.model_dump()
         if each.type == "output_text":