Skip to content

Commit d84ab35

Browse files
Erick Friis and frances720
authored and committed
openai: audio modality, remove sockets from unit tests (langchain-ai#27436)
1 parent 97dc578 commit d84ab35

File tree

10 files changed

+417
-279
lines changed

10 files changed

+417
-279
lines changed

libs/partners/openai/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
__pycache__
2+
tiktoken_cache

libs/partners/openai/Makefile

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,19 @@ TEST_FILE ?= tests/unit_tests/
88

99
integration_test integration_tests: TEST_FILE=tests/integration_tests/
1010

11-
test tests integration_test integration_tests:
11+
# unit tests are run with the --disable-socket flag to prevent network calls
12+
# use tiktoken cache to enable token counting without socket (internet) access
13+
test tests:
14+
mkdir -p tiktoken_cache
15+
@if [ ! -f tiktoken_cache/9b5ad71b2ce5302211f9c61530b329a4922fc6a4 ]; then \
16+
curl -o tiktoken_cache/9b5ad71b2ce5302211f9c61530b329a4922fc6a4 https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken; \
17+
fi
18+
@if [ ! -f tiktoken_cache/fb374d419588a4632f3f557e76b4b70aebbca790 ]; then \
19+
curl -o tiktoken_cache/fb374d419588a4632f3f557e76b4b70aebbca790 https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken; \
20+
fi
21+
TIKTOKEN_CACHE_DIR=tiktoken_cache poetry run pytest --disable-socket --allow-unix-socket $(TEST_FILE)
22+
23+
integration_test integration_tests:
1224
poetry run pytest $(TEST_FILE)
1325

1426
test_watch:

libs/partners/openai/langchain_openai/chat_models/base.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,8 @@ def _convert_dict_to_message(_dict: Mapping[str, Any]) -> BaseMessage:
129129
invalid_tool_calls.append(
130130
make_invalid_tool_call(raw_tool_call, str(e))
131131
)
132+
if audio := _dict.get("audio"):
133+
additional_kwargs["audio"] = audio
132134
return AIMessage(
133135
content=content,
134136
additional_kwargs=additional_kwargs,
@@ -219,6 +221,17 @@ def _convert_message_to_dict(message: BaseMessage) -> dict:
219221
# If tool calls present, content null value should be None not empty string.
220222
if "function_call" in message_dict or "tool_calls" in message_dict:
221223
message_dict["content"] = message_dict["content"] or None
224+
225+
if "audio" in message.additional_kwargs:
226+
# openai doesn't support passing the data back - only the id
227+
# https://platform.openai.com/docs/guides/audio/multi-turn-conversations
228+
raw_audio = message.additional_kwargs["audio"]
229+
audio = (
230+
{"id": message.additional_kwargs["audio"]["id"]}
231+
if "id" in raw_audio
232+
else raw_audio
233+
)
234+
message_dict["audio"] = audio
222235
elif isinstance(message, SystemMessage):
223236
message_dict["role"] = "system"
224237
elif isinstance(message, FunctionMessage):

libs/partners/openai/poetry.lock

Lines changed: 311 additions & 275 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

libs/partners/openai/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ ignore_missing_imports = true
2424
[tool.poetry.dependencies]
2525
python = ">=3.9,<4.0"
2626
langchain-core = "^0.3.9"
27-
openai = "^1.40.0"
27+
openai = "^1.52.0"
2828
tiktoken = ">=0.7,<1"
2929

3030
[tool.ruff.lint]
@@ -72,6 +72,7 @@ syrupy = "^4.0.2"
7272
pytest-watcher = "^0.3.4"
7373
pytest-asyncio = "^0.21.1"
7474
pytest-cov = "^4.1.0"
75+
pytest-socket = "^0.6.0"
7576
[[tool.poetry.group.test.dependencies.numpy]]
7677
version = "^1"
7778
python = "<3.12"
Binary file not shown.

libs/partners/openai/tests/integration_tests/chat_models/test_base.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import base64
44
import json
5+
from pathlib import Path
56
from typing import Any, AsyncIterator, List, Literal, Optional, cast
67

78
import httpx
@@ -949,3 +950,71 @@ async def test_json_mode_async() -> None:
949950
assert isinstance(full, AIMessageChunk)
950951
assert isinstance(full.content, str)
951952
assert json.loads(full.content) == {"a": 1}
953+
954+
955+
def test_audio_output_modality() -> None:
956+
llm = ChatOpenAI(
957+
model="gpt-4o-audio-preview",
958+
temperature=0,
959+
model_kwargs={
960+
"modalities": ["text", "audio"],
961+
"audio": {"voice": "alloy", "format": "wav"},
962+
},
963+
)
964+
965+
history: List[BaseMessage] = [
966+
HumanMessage("Make me a short audio clip of you yelling")
967+
]
968+
969+
output = llm.invoke(history)
970+
971+
assert isinstance(output, AIMessage)
972+
assert "audio" in output.additional_kwargs
973+
974+
history.append(output)
975+
history.append(HumanMessage("Make me a short audio clip of you whispering"))
976+
977+
output = llm.invoke(history)
978+
979+
assert isinstance(output, AIMessage)
980+
assert "audio" in output.additional_kwargs
981+
982+
983+
def test_audio_input_modality() -> None:
984+
llm = ChatOpenAI(
985+
model="gpt-4o-audio-preview",
986+
temperature=0,
987+
model_kwargs={
988+
"modalities": ["text", "audio"],
989+
"audio": {"voice": "alloy", "format": "wav"},
990+
},
991+
)
992+
filepath = Path(__file__).parent / "audio_input.wav"
993+
994+
audio_data = filepath.read_bytes()
995+
b64_audio_data = base64.b64encode(audio_data).decode("utf-8")
996+
997+
history: list[BaseMessage] = [
998+
HumanMessage(
999+
[
1000+
{"type": "text", "text": "What is happening in this audio clip"},
1001+
{
1002+
"type": "input_audio",
1003+
"input_audio": {"data": b64_audio_data, "format": "wav"},
1004+
},
1005+
]
1006+
)
1007+
]
1008+
1009+
output = llm.invoke(history)
1010+
1011+
assert isinstance(output, AIMessage)
1012+
assert "audio" in output.additional_kwargs
1013+
1014+
history.append(output)
1015+
history.append(HumanMessage("Why?"))
1016+
1017+
output = llm.invoke(history)
1018+
1019+
assert isinstance(output, AIMessage)
1020+
assert "audio" in output.additional_kwargs

libs/partners/openai/tests/unit_tests/chat_models/test_base.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,12 @@ def test__convert_dict_to_message_tool_call() -> None:
162162
name="GenerateUsername",
163163
args="oops",
164164
id="call_wm0JY6CdwOMZ4eTxHWUThDNz",
165-
error="Function GenerateUsername arguments:\n\noops\n\nare not valid JSON. Received JSONDecodeError Expecting value: line 1 column 1 (char 0)", # noqa: E501
165+
error=(
166+
"Function GenerateUsername arguments:\n\noops\n\nare not "
167+
"valid JSON. Received JSONDecodeError Expecting value: line 1 "
168+
"column 1 (char 0)\nFor troubleshooting, visit: https://python"
169+
".langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE"
170+
),
166171
type="invalid_tool_call",
167172
)
168173
],

libs/partners/openai/tests/unit_tests/llms/test_base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def mock_completion() -> dict:
5858
}
5959

6060

61-
@pytest.mark.parametrize("model", ["gpt-3.5-turbo-instruct", "text-davinci-003"])
61+
@pytest.mark.parametrize("model", ["gpt-3.5-turbo-instruct"])
6262
def test_get_token_ids(model: str) -> None:
6363
OpenAI(model=model).get_token_ids("foo")
6464
return

libs/partners/openai/tests/unit_tests/test_token_counts.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
_CHAT_MODELS = ["gpt-4", "gpt-4-32k", "gpt-3.5-turbo"]
1717

1818

19+
@pytest.mark.xfail(reason="Old models require different tiktoken cached file")
1920
@pytest.mark.parametrize("model", _MODELS)
2021
def test_openai_get_num_tokens(model: str) -> None:
2122
"""Test get_tokens."""

0 commit comments

Comments
 (0)