@@ -66,6 +66,7 @@ async def test_run_with_tools(monkeypatch: pytest.MonkeyPatch) -> None:
 usage=RequestUsage(prompt_tokens=10, completion_tokens=5),
 thought="Calling pass function",
 cached=False,
+raw_response={"id": "mock-id", "provider": "replay"},
 ),
 "pass",
 "TERMINATE",
@@ -144,18 +145,21 @@ async def test_run_with_tools_and_reflection() -> None:
 content=[FunctionCall(id="1", arguments=json.dumps({"input": "task"}), name="_pass_function")],
 usage=RequestUsage(prompt_tokens=10, completion_tokens=5),
 cached=False,
+raw_response={"id": "mock-id", "provider": "replay"},
 ),
 CreateResult(
 finish_reason="stop",
 content="Hello",
 usage=RequestUsage(prompt_tokens=10, completion_tokens=5),
 cached=False,
+raw_response={"id": "mock-id", "provider": "replay"},
 ),
 CreateResult(
 finish_reason="stop",
 content="TERMINATE",
 usage=RequestUsage(prompt_tokens=10, completion_tokens=5),
 cached=False,
+raw_response={"id": "mock-id", "provider": "replay"},
 ),
 ],
 model_info={
@@ -246,6 +250,7 @@ async def test_run_with_parallel_tools() -> None:
 usage=RequestUsage(prompt_tokens=10, completion_tokens=5),
 thought="Calling pass and echo functions",
 cached=False,
+raw_response={"id": "mock-id", "provider": "replay"},
 ),
 "pass",
 "TERMINATE",
@@ -331,6 +336,7 @@ async def test_run_with_parallel_tools_with_empty_call_ids() -> None:
 ],
 usage=RequestUsage(prompt_tokens=10, completion_tokens=5),
 cached=False,
+raw_response={"id": "mock-id", "provider": "replay"},
 ),
 "pass",
 "TERMINATE",
@@ -672,6 +678,7 @@ async def test_handoffs() -> None:
 ],
 usage=RequestUsage(prompt_tokens=42, completion_tokens=43),
 cached=False,
+raw_response={"id": "mock-id", "provider": "replay"},
 thought="Calling handoff function",
 )
 ],
@@ -1064,6 +1071,7 @@ async def test_list_chat_messages(monkeypatch: pytest.MonkeyPatch) -> None:
 content="Response to message 1",
 usage=RequestUsage(prompt_tokens=10, completion_tokens=5),
 cached=False,
+raw_response={"id": "mock-id", "provider": "replay"},
 )
 ]
 )
@@ -1269,6 +1277,7 @@ async def test_model_client_stream_with_tool_calls() -> None:
 finish_reason="function_calls",
 usage=RequestUsage(prompt_tokens=10, completion_tokens=5),
 cached=False,
+raw_response={"id": "mock-id", "provider": "replay"},
 ),
 "Example response 2 to task",
 ]
@@ -142,11 +142,6 @@ async def test_self_debugging_loop() -> None:
 numbers = [10, 20, 30, 40, 50]
 mean = sum(numbers) / len(numbers
 print("The mean is:", mean)
 """.strip()
-incorrect_code_result = """
-mean = sum(numbers) / len(numbers
-    ^
-SyntaxError: '(' was never closed
-""".strip()
 correct_code_block = """
 numbers = [10, 20, 30, 40, 50]
@@ -218,8 +213,8 @@ async def test_self_debugging_loop() -> None:
 elif isinstance(message, CodeExecutionEvent) and message_id == 1:
 # Step 2: First code execution
 assert (
-incorrect_code_result in message.to_text().strip()
-), f"Expected {incorrect_code_result} in execution result, got: {message.to_text().strip()}"
+"SyntaxError: '(' was never closed" in message.to_text()
+), f"Expected SyntaxError in execution result, got: {message.to_text().strip()}"
 incorrect_code_execution_event = message

 elif isinstance(message, CodeGenerationEvent) and message_id == 2:
3 changes: 3 additions & 0 deletions python/packages/autogen-agentchat/tests/test_group_chat.py
@@ -450,6 +450,7 @@ async def test_round_robin_group_chat_with_tools(runtime: AgentRuntime | None) -
 content=[FunctionCall(id="1", name="pass", arguments=json.dumps({"input": "pass"}))],
 usage=RequestUsage(prompt_tokens=0, completion_tokens=0),
 cached=False,
+raw_response={"id": "mock-id", "provider": "replay"},
 ),
 "Hello",
 "TERMINATE",
@@ -1267,6 +1268,7 @@ async def test_swarm_handoff_using_tool_calls(runtime: AgentRuntime | None) -> N
 content=[FunctionCall(id="1", name="handoff_to_agent2", arguments=json.dumps({}))],
 usage=RequestUsage(prompt_tokens=0, completion_tokens=0),
 cached=False,
+raw_response={"id": "mock-id", "provider": "replay"},
 ),
 "Hello",
 "TERMINATE",
@@ -1367,6 +1369,7 @@ async def test_swarm_with_parallel_tool_calls(runtime: AgentRuntime | None) -> N
 ],
 usage=RequestUsage(prompt_tokens=0, completion_tokens=0),
 cached=False,
+raw_response={"id": "mock-id", "provider": "replay"},
 ),
 "Hello",
 "TERMINATE",
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Union

 from pydantic import BaseModel, Field
 from typing_extensions import Annotated
@@ -125,3 +125,6 @@ class CreateResult(BaseModel):
 thought: Optional[str] = None
 """The reasoning text for the completion if available. Used for reasoning models
 and additional text content besides function calls."""
+
+raw_response: Optional[Dict[str, Any]] = None
+"""Raw response from the model API, useful for custom field access."""
2 changes: 2 additions & 0 deletions python/packages/autogen-core/tests/test_tool_agent.py
@@ -113,13 +113,15 @@ async def create(
 usage=RequestUsage(prompt_tokens=0, completion_tokens=0),
 cached=False,
 logprobs=None,
+raw_response={"id": "mock-id", "provider": "replay"},
 )
 return CreateResult(
 content="Done",
 finish_reason="stop",
 usage=RequestUsage(prompt_tokens=0, completion_tokens=0),
 cached=False,
 logprobs=None,
+raw_response={"id": "mock-id", "provider": "replay"},
 )

 def create_stream(
@@ -141,6 +141,7 @@ async def create(
 finish_reason=data.get("finish_reason", "stop"),
 usage=data.get("usage", RequestUsage(prompt_tokens=0, completion_tokens=0)),
 cached=True,
+raw_response=data.get("raw_response", {"id": "mock-id", "provider": "replay"}),
 )
 return result
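
Since this hunk restores results from a cache, it also shows how the new field round-trips. A minimal sketch of the same pattern; the helper name and dict keys are hypothetical, mirroring the data.get calls above.

from typing import Any, Dict
from autogen_core.models import CreateResult, RequestUsage

def result_from_cache(data: Dict[str, Any]) -> CreateResult:
    # Hypothetical helper: rebuild a CreateResult from a cached dict,
    # leaving raw_response as None when the entry predates the field.
    return CreateResult(
        content=data.get("content", ""),
        finish_reason=data.get("finish_reason", "stop"),
        usage=data.get("usage", RequestUsage(prompt_tokens=0, completion_tokens=0)),
        cached=True,
        raw_response=data.get("raw_response"),
    )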

@@ -646,6 +646,7 @@ async def create(
 usage=usage,
 cached=False,
 thought=thought,
+raw_response=result,
 )

 # Update usage statistics
@@ -863,13 +864,20 @@ async def create_stream(
 # Just text content
 content = "".join(text_content)

+future: asyncio.Task[Message] = asyncio.ensure_future(
+self._client.messages.create(**request_args)  # type: ignore
+)
+
+message_result: Message = cast(Message, await future)
+
 # Create the final result
 result = CreateResult(
 finish_reason=normalize_stop_reason(stop_reason),
 content=content,
 usage=usage,
 cached=False,
 thought=thought,
+raw_response=message_result,
 )

 # Emit the end event.
@@ -440,6 +440,7 @@ async def create(
 usage=usage,
 cached=False,
 thought=thought,
+raw_response=result,
 )

 self.add_usage(usage)
@@ -357,7 +357,11 @@ async def create(
 if not response_tool_calls and not response_text:
 logger.debug("DEBUG: No response text found. Returning empty response.")
 return CreateResult(
-content="", usage=RequestUsage(prompt_tokens=0, completion_tokens=0), finish_reason="stop", cached=False
+content="",
+usage=RequestUsage(prompt_tokens=0, completion_tokens=0),
+finish_reason="stop",
+cached=False,
+raw_response=response,
 )

 # Create a CreateResult object
@@ -373,6 +377,7 @@ async def create(
 usage=cast(RequestUsage, response["usage"]),
 finish_reason=normalize_stop_reason(finish_reason),  # type: ignore
 cached=False,
+raw_response=response,
 )

 # If we are running in the context of a handler we can get the agent_id
@@ -691,6 +691,7 @@ async def create(
 usage=usage,
 cached=False,
 logprobs=None,
+raw_response=result,
 thought=thought,
 )

@@ -827,6 +828,7 @@ async def create_stream(
 usage=usage,
 cached=False,
 logprobs=None,
+raw_response=None,
 thought=thought,
 )

@@ -722,6 +722,7 @@ async def create(
 cached=False,
 logprobs=logprobs,
 thought=thought,
+raw_response=result,
 )

 self._total_usage = _add_usage(self._total_usage, usage)
@@ -956,6 +957,28 @@ async def create_stream(
 if isinstance(content, str) and self._model_info["family"] == ModelFamily.R1 and thought is None:
 thought, content = parse_r1_content(content)

+create_params = self._process_create_args(
+messages,
+tools,
+json_output,
+extra_create_args,
+)
+
+if create_params.response_format is not None:
+result = await self._client.beta.chat.completions.parse(
+messages=create_params.messages,
+tools=(create_params.tools if len(create_params.tools) > 0 else NOT_GIVEN),
+response_format=create_params.response_format,
+**create_params.create_args,
+)
+else:
+result = await self._client.chat.completions.create(
+messages=create_params.messages,
+stream=False,
+tools=(create_params.tools if len(create_params.tools) > 0 else NOT_GIVEN),
+**create_params.create_args,
+)
+
 # Create the result.
 result = CreateResult(
 finish_reason=normalize_stop_reason(stop_reason),
@@ -964,6 +987,7 @@
 cached=False,
 logprobs=logprobs,
 thought=thought,
+raw_response=result,
 )

 # Log the end of the stream.
@@ -176,7 +176,11 @@ async def create(
 _, output_token_count = self._tokenize(response)
 self._cur_usage = RequestUsage(prompt_tokens=prompt_token_count, completion_tokens=output_token_count)
 response = CreateResult(
-finish_reason="stop", content=response, usage=self._cur_usage, cached=self._cached_bool_value
+finish_reason="stop",
+content=response,
+usage=self._cur_usage,
+cached=self._cached_bool_value,
+raw_response=response,
 )
 else:
 self._cur_usage = RequestUsage(
@@ -221,7 +225,11 @@ async def create_stream(
 else:
 yield token
 yield CreateResult(
-finish_reason="stop", content=response, usage=self._cur_usage, cached=self._cached_bool_value
+finish_reason="stop",
+content=response,
+usage=self._cur_usage,
+cached=self._cached_bool_value,
+raw_response=response,
 )
 self._update_total_usage()
 else:
@@ -521,6 +521,7 @@ async def create(
 usage=RequestUsage(prompt_tokens=prompt_tokens, completion_tokens=completion_tokens),
 cached=False,
 thought=thought,
+raw_response=result,
 )

 @staticmethod
@@ -676,6 +677,7 @@ async def create_stream(
 finish_reason="function_calls",
 usage=RequestUsage(prompt_tokens=prompt_tokens, completion_tokens=completion_tokens),
 cached=False,
+raw_response=None,
 )
 return

@@ -698,6 +700,7 @@ async def create_stream(
 usage=RequestUsage(prompt_tokens=prompt_tokens, completion_tokens=completion_tokens),
 cached=False,
 thought=thought,
+raw_response=None,
 )

 # Emit the end event.
@@ -211,7 +211,7 @@ async def mock_get_streaming_chat_message_contents(
 created=1736674044,
 model="gpt-4o-mini-2024-07-18",
 object="chat.completion.chunk",
-service_tier="scale",
+service_tier="default",
 system_fingerprint="fingerprint",
 usage=CompletionUsage(prompt_tokens=20, completion_tokens=9, total_tokens=29),
 ),
@@ -232,7 +232,7 @@ async def mock_get_streaming_chat_message_contents(
 created=1736674044,
 model="gpt-4o-mini-2024-07-18",
 object="chat.completion.chunk",
-service_tier="scale",
+service_tier="default",
 system_fingerprint="fingerprint",
 usage=CompletionUsage(prompt_tokens=20, completion_tokens=9, total_tokens=29),
 ),
@@ -253,7 +253,7 @@ async def mock_get_streaming_chat_message_contents(
 created=1736674044,
 model="gpt-4o-mini-2024-07-18",
 object="chat.completion.chunk",
-service_tier="scale",
+service_tier="default",
 system_fingerprint="fingerprint",
 usage=CompletionUsage(prompt_tokens=20, completion_tokens=9, total_tokens=29),
 ),
@@ -280,7 +280,7 @@ async def mock_get_streaming_chat_message_contents(
 created=1736674044,
 model="gpt-4o-mini-2024-07-18",
 object="chat.completion.chunk",
-service_tier="scale",
+service_tier="default",
 system_fingerprint="fingerprint",
 usage=CompletionUsage(prompt_tokens=20, completion_tokens=9, total_tokens=29),
 ),
@@ -503,7 +503,7 @@ async def mock_get_streaming_chat_message_contents(
 created=1736674044,
 model="r1",
 object="chat.completion.chunk",
-service_tier="scale",
+service_tier="default",
 system_fingerprint="fingerprint",
 usage=CompletionUsage(prompt_tokens=20, completion_tokens=9, total_tokens=29),
 ),
@@ -10,13 +10,21 @@
 import pytest
 from autogen_agentchat.messages import BaseChatMessage, TextMessage, ToolCallRequestEvent
 from autogen_core import CancellationToken
+from autogen_core.models import UserMessage
 from autogen_core.tools._base import BaseTool, Tool
 from autogen_ext.agents.openai import OpenAIAssistantAgent
 from azure.identity import DefaultAzureCredential, get_bearer_token_provider
 from openai import AsyncAzureOpenAI, AsyncOpenAI
 from pydantic import BaseModel


+def fake_to_model_message(self):
+    return UserMessage(content=self.content, source=self.source)
+
+
+TextMessage.to_model_message = fake_to_model_message
+
+
 class QuestionType(str, Enum):
 MULTIPLE_CHOICE = "MULTIPLE_CHOICE"
 FREE_RESPONSE = "FREE_RESPONSE"