From c052063d90da8c3e1c9489e618655bb55257598a Mon Sep 17 00:00:00 2001 From: Alexander Alderman Webb Date: Wed, 10 Dec 2025 15:18:39 +0100 Subject: [PATCH 1/6] feat(langgraph): Usage attributes on invocation spans --- sentry_sdk/integrations/langgraph.py | 51 ++- .../integrations/langgraph/test_langgraph.py | 334 ++++++++++++++++++ 2 files changed, 381 insertions(+), 4 deletions(-) diff --git a/sentry_sdk/integrations/langgraph.py b/sentry_sdk/integrations/langgraph.py index 5bb0e0fd08..7ef4688028 100644 --- a/sentry_sdk/integrations/langgraph.py +++ b/sentry_sdk/integrations/langgraph.py @@ -62,7 +62,13 @@ def _normalize_langgraph_message(message): parsed = {"role": getattr(message, "type", None), "content": message.content} - for attr in ["name", "tool_calls", "function_call", "tool_call_id"]: + for attr in [ + "name", + "tool_calls", + "function_call", + "tool_call_id", + "response_metadata", + ]: if hasattr(message, attr): value = getattr(message, attr) if value is not None: @@ -311,14 +317,51 @@ def _extract_tool_calls(messages): return tool_calls if tool_calls else None +def _set_usage_data(span, messages): + # type: (Any, Any) -> None + input_tokens = 0 + output_tokens = 0 + total_tokens = 0 + + for message in messages: + response_metadata = message.get("response_metadata") + if response_metadata is None: + continue + + token_usage = response_metadata.get("token_usage") + if not token_usage: + continue + + input_tokens += int(token_usage.get("prompt_tokens", 0)) + output_tokens += int(token_usage.get("completion_tokens", 0)) + total_tokens += int(token_usage.get("total_tokens", 0)) + + if input_tokens is not None: + span.set_data(SPANDATA.GEN_AI_USAGE_INPUT_TOKENS, input_tokens) + + if output_tokens is not None: + span.set_data(SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens) + + if total_tokens is not None: + span.set_data( + SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS, + total_tokens, + ) + + def _set_response_attributes(span, input_messages, result, integration): # type: (Any, Optional[List[Any]], Any, LanggraphIntegration) -> None - if not (should_send_default_pii() and integration.include_prompts): - return - parsed_response_messages = _parse_langgraph_messages(result) new_messages = _get_new_messages(input_messages, parsed_response_messages) + if new_messages is None: + return + + _set_usage_data(span, new_messages) + + if not (should_send_default_pii() and integration.include_prompts): + return + llm_response_text = _extract_llm_response_text(new_messages) if llm_response_text: set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, llm_response_text) diff --git a/tests/integrations/langgraph/test_langgraph.py b/tests/integrations/langgraph/test_langgraph.py index df574dd2c3..0951eace66 100644 --- a/tests/integrations/langgraph/test_langgraph.py +++ b/tests/integrations/langgraph/test_langgraph.py @@ -96,6 +96,7 @@ def __init__( function_call=None, role=None, type=None, + response_metadata=None, ): self.content = content self.name = name @@ -108,6 +109,7 @@ def __init__( self.type = name else: self.type = type + self.response_metadata = response_metadata class MockPregelInstance: @@ -509,6 +511,338 @@ def original_invoke(self, *args, **kwargs): assert SPANDATA.GEN_AI_AGENT_NAME not in invoke_span.get("data", {}) +def test_pregel_invoke_span_includes_usage_data(sentry_init, capture_events): + """ + Test that invoke_agent spans include aggregated usage data from context_wrapper. + This verifies the new functionality added to track token usage in invoke_agent spans. 
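+ Token counts are summed from the response_metadata["token_usage"] of each new message.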
+ """ + sentry_init( + integrations=[LanggraphIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + test_state = { + "messages": [ + MockMessage("Hello, can you help me?", name="user"), + MockMessage("Of course! How can I assist you?", name="assistant"), + ] + } + + pregel = MockPregelInstance("test_graph") + + expected_assistant_response = "I'll help you with that task!" + expected_tool_calls = [ + { + "id": "call_test_123", + "type": "function", + "function": {"name": "search_tool", "arguments": '{"query": "help"}'}, + } + ] + + def original_invoke(self, *args, **kwargs): + input_messages = args[0].get("messages", []) + new_messages = input_messages + [ + MockMessage( + content=expected_assistant_response, + name="assistant", + tool_calls=expected_tool_calls, + response_metadata={ + "token_usage": { + "total_tokens": 30, + "prompt_tokens": 10, + "completion_tokens": 20, + }, + "model_name": "gpt-4.1-2025-04-14", + }, + ) + ] + return {"messages": new_messages} + + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) + + assert result is not None + + tx = events[0] + assert tx["type"] == "transaction" + + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span has usage data + assert invoke_agent_span["description"] == "invoke_agent test_graph" + assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"] + assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"] + assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"] + + # The usage should match the mock_usage values (aggregated across all calls) + assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 10 + assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 30 + + +def test_pregel_ainvoke_span_includes_usage_data(sentry_init, capture_events): + """ + Test that invoke_agent spans include aggregated usage data from context_wrapper. + This verifies the new functionality added to track token usage in invoke_agent spans. + """ + sentry_init( + integrations=[LanggraphIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + test_state = { + "messages": [ + MockMessage("Hello, can you help me?", name="user"), + MockMessage("Of course! How can I assist you?", name="assistant"), + ] + } + + pregel = MockPregelInstance("test_graph") + + expected_assistant_response = "I'll help you with that task!" 
+ expected_tool_calls = [ + { + "id": "call_test_123", + "type": "function", + "function": {"name": "search_tool", "arguments": '{"query": "help"}'}, + } + ] + + async def original_ainvoke(self, *args, **kwargs): + input_messages = args[0].get("messages", []) + new_messages = input_messages + [ + MockMessage( + content=expected_assistant_response, + name="assistant", + tool_calls=expected_tool_calls, + response_metadata={ + "token_usage": { + "total_tokens": 30, + "prompt_tokens": 10, + "completion_tokens": 20, + }, + "model_name": "gpt-4.1-2025-04-14", + }, + ) + ] + return {"messages": new_messages} + + async def run_test(): + with start_transaction(): + wrapped_ainvoke = _wrap_pregel_ainvoke(original_ainvoke) + result = await wrapped_ainvoke(pregel, test_state) + return result + + result = asyncio.run(run_test()) + assert result is not None + + tx = events[0] + assert tx["type"] == "transaction" + + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span has usage data + assert invoke_agent_span["description"] == "invoke_agent test_graph" + assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"] + assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"] + assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"] + + # The usage should match the mock_usage values (aggregated across all calls) + assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 10 + assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 30 + + +def test_pregel_invoke_multiple_llm_calls_aggregate_usage(sentry_init, capture_events): + """ + Test that invoke_agent spans show aggregated usage across multiple LLM calls + (e.g., when tools are used and multiple API calls are made). + """ + sentry_init( + integrations=[LanggraphIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + test_state = { + "messages": [ + MockMessage("Hello, can you help me?", name="user"), + MockMessage("Of course! How can I assist you?", name="assistant"), + ] + } + + pregel = MockPregelInstance("test_graph") + + expected_assistant_response = "I'll help you with that task!" 
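+ # The mocked agent returns two assistant messages, each with its own
+ # token_usage, so the span should report the summed totals (30/20/50).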
+ expected_tool_calls = [ + { + "id": "call_test_123", + "type": "function", + "function": {"name": "search_tool", "arguments": '{"query": "help"}'}, + } + ] + + def original_invoke(self, *args, **kwargs): + input_messages = args[0].get("messages", []) + new_messages = input_messages + [ + MockMessage( + content=expected_assistant_response, + name="assistant", + tool_calls=expected_tool_calls, + response_metadata={ + "token_usage": { + "total_tokens": 15, + "prompt_tokens": 10, + "completion_tokens": 5, + }, + }, + ), + MockMessage( + content=expected_assistant_response, + name="assistant", + tool_calls=expected_tool_calls, + response_metadata={ + "token_usage": { + "total_tokens": 35, + "prompt_tokens": 20, + "completion_tokens": 15, + }, + }, + ), + ] + return {"messages": new_messages} + + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) + + assert result is not None + + tx = events[0] + assert tx["type"] == "transaction" + + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span has usage data + assert invoke_agent_span["description"] == "invoke_agent test_graph" + assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"] + assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"] + assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"] + + # The usage should match the mock_usage values (aggregated across all calls) + assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 30 + assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50 + + +def test_pregel_ainvoke_multiple_llm_calls_aggregate_usage(sentry_init, capture_events): + """ + Test that invoke_agent spans show aggregated usage across multiple LLM calls + (e.g., when tools are used and multiple API calls are made). + """ + sentry_init( + integrations=[LanggraphIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + test_state = { + "messages": [ + MockMessage("Hello, can you help me?", name="user"), + MockMessage("Of course! How can I assist you?", name="assistant"), + ] + } + + pregel = MockPregelInstance("test_graph") + + expected_assistant_response = "I'll help you with that task!" 
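+ # Async variant: two mocked LLM calls whose usage sums to 30/20/50 as above.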
+ expected_tool_calls = [ + { + "id": "call_test_123", + "type": "function", + "function": {"name": "search_tool", "arguments": '{"query": "help"}'}, + } + ] + + async def original_ainvoke(self, *args, **kwargs): + input_messages = args[0].get("messages", []) + new_messages = input_messages + [ + MockMessage( + content=expected_assistant_response, + name="assistant", + tool_calls=expected_tool_calls, + response_metadata={ + "token_usage": { + "total_tokens": 15, + "prompt_tokens": 10, + "completion_tokens": 5, + }, + }, + ), + MockMessage( + content=expected_assistant_response, + name="assistant", + tool_calls=expected_tool_calls, + response_metadata={ + "token_usage": { + "total_tokens": 35, + "prompt_tokens": 20, + "completion_tokens": 15, + }, + }, + ), + ] + return {"messages": new_messages} + + async def run_test(): + with start_transaction(): + wrapped_ainvoke = _wrap_pregel_ainvoke(original_ainvoke) + result = await wrapped_ainvoke(pregel, test_state) + return result + + result = asyncio.run(run_test()) + assert result is not None + + tx = events[0] + assert tx["type"] == "transaction" + + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span has usage data + assert invoke_agent_span["description"] == "invoke_agent test_graph" + assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"] + assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"] + assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"] + + # The usage should match the mock_usage values (aggregated across all calls) + assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 30 + assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 + assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50 + + def test_complex_message_parsing(): """Test message parsing with complex message structures.""" messages = [ From fc5621310b49e7f7bf31cff7d225fce23f942377 Mon Sep 17 00:00:00 2001 From: Alexander Alderman Webb Date: Wed, 10 Dec 2025 15:26:20 +0100 Subject: [PATCH 2/6] test cleanup --- .../integrations/langgraph/test_langgraph.py | 20 ++++--------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/tests/integrations/langgraph/test_langgraph.py b/tests/integrations/langgraph/test_langgraph.py index 0951eace66..1f6c27cd62 100644 --- a/tests/integrations/langgraph/test_langgraph.py +++ b/tests/integrations/langgraph/test_langgraph.py @@ -738,16 +738,10 @@ def original_invoke(self, *args, **kwargs): span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 - invoke_agent_span = invoke_spans[0] - # Verify invoke_agent span has usage data - assert invoke_agent_span["description"] == "invoke_agent test_graph" - assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"] - assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"] - assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"] - - # The usage should match the mock_usage values (aggregated across all calls) + # Verify invoke_agent span has aggregated usage from both API calls + # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 30 assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50 @@ -828,16 +822,10 @@ async def run_test(): span 
for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT ] assert len(invoke_spans) == 1 - invoke_agent_span = invoke_spans[0] - # Verify invoke_agent span has usage data - assert invoke_agent_span["description"] == "invoke_agent test_graph" - assert "gen_ai.usage.input_tokens" in invoke_agent_span["data"] - assert "gen_ai.usage.output_tokens" in invoke_agent_span["data"] - assert "gen_ai.usage.total_tokens" in invoke_agent_span["data"] - - # The usage should match the mock_usage values (aggregated across all calls) + # Verify invoke_agent span has aggregated usage from both API calls + # Total: 10 + 20 = 30 input tokens, 5 + 15 = 20 output tokens, 15 + 35 = 50 total assert invoke_agent_span["data"]["gen_ai.usage.input_tokens"] == 30 assert invoke_agent_span["data"]["gen_ai.usage.output_tokens"] == 20 assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50 From 1d7fba32a3c19a406d7c520684575a8e9c5063ff Mon Sep 17 00:00:00 2001 From: Alexander Alderman Webb Date: Wed, 10 Dec 2025 15:40:27 +0100 Subject: [PATCH 3/6] feat(langgraph): Response model attribute on invocation spans --- sentry_sdk/integrations/langgraph.py | 14 + .../integrations/langgraph/test_langgraph.py | 310 ++++++++++++++++++ 2 files changed, 324 insertions(+) diff --git a/sentry_sdk/integrations/langgraph.py b/sentry_sdk/integrations/langgraph.py index 7ef4688028..1d235d7db5 100644 --- a/sentry_sdk/integrations/langgraph.py +++ b/sentry_sdk/integrations/langgraph.py @@ -349,6 +349,19 @@ def _set_usage_data(span, messages): ) +def _set_response_model_name(span, messages): + last_message = messages[-1] + response_metadata = last_message.get("response_metadata") + if response_metadata is None: + return + + model_name = response_metadata.get("model_name") + if model_name is None: + return + + set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_MODEL, model_name) + + def _set_response_attributes(span, input_messages, result, integration): # type: (Any, Optional[List[Any]], Any, LanggraphIntegration) -> None parsed_response_messages = _parse_langgraph_messages(result) @@ -358,6 +371,7 @@ def _set_response_attributes(span, input_messages, result, integration): return _set_usage_data(span, new_messages) + _set_response_model_name(span, new_messages) if not (should_send_default_pii() and integration.include_prompts): return diff --git a/tests/integrations/langgraph/test_langgraph.py b/tests/integrations/langgraph/test_langgraph.py index 1f6c27cd62..99ab216957 100644 --- a/tests/integrations/langgraph/test_langgraph.py +++ b/tests/integrations/langgraph/test_langgraph.py @@ -831,6 +831,316 @@ async def run_test(): assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50 +def test_pregel_invoke_span_includes_response_model(sentry_init, capture_events): + """ + Test that invoke_agent spans include the response model. + When an agent makes multiple LLM calls, it should report the last model used. + """ + sentry_init( + integrations=[LanggraphIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + test_state = { + "messages": [ + MockMessage("Hello, can you help me?", name="user"), + MockMessage("Of course! How can I assist you?", name="assistant"), + ] + } + + pregel = MockPregelInstance("test_graph") + + expected_assistant_response = "I'll help you with that task!" 
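+ # response_metadata carries model_name, which the integration should surface
+ # as gen_ai.response.model on the invoke_agent span.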
+ expected_tool_calls = [ + { + "id": "call_test_123", + "type": "function", + "function": {"name": "search_tool", "arguments": '{"query": "help"}'}, + } + ] + + def original_invoke(self, *args, **kwargs): + input_messages = args[0].get("messages", []) + new_messages = input_messages + [ + MockMessage( + content=expected_assistant_response, + name="assistant", + tool_calls=expected_tool_calls, + response_metadata={ + "token_usage": { + "total_tokens": 30, + "prompt_tokens": 10, + "completion_tokens": 20, + }, + "model_name": "gpt-4.1-2025-04-14", + }, + ) + ] + return {"messages": new_messages} + + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) + + assert result is not None + + tx = events[0] + assert tx["type"] == "transaction" + + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span has response model + assert invoke_agent_span["description"] == "invoke_agent test_graph" + assert "gen_ai.response.model" in invoke_agent_span["data"] + assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + + +def test_pregel_ainvoke_span_includes_response_model(sentry_init, capture_events): + """ + Test that invoke_agent spans include the response model. + When an agent makes multiple LLM calls, it should report the last model used. + """ + sentry_init( + integrations=[LanggraphIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + test_state = { + "messages": [ + MockMessage("Hello, can you help me?", name="user"), + MockMessage("Of course! How can I assist you?", name="assistant"), + ] + } + + pregel = MockPregelInstance("test_graph") + + expected_assistant_response = "I'll help you with that task!" + expected_tool_calls = [ + { + "id": "call_test_123", + "type": "function", + "function": {"name": "search_tool", "arguments": '{"query": "help"}'}, + } + ] + + async def original_ainvoke(self, *args, **kwargs): + input_messages = args[0].get("messages", []) + new_messages = input_messages + [ + MockMessage( + content=expected_assistant_response, + name="assistant", + tool_calls=expected_tool_calls, + response_metadata={ + "token_usage": { + "total_tokens": 30, + "prompt_tokens": 10, + "completion_tokens": 20, + }, + "model_name": "gpt-4.1-2025-04-14", + }, + ) + ] + return {"messages": new_messages} + + async def run_test(): + with start_transaction(): + wrapped_ainvoke = _wrap_pregel_ainvoke(original_ainvoke) + result = await wrapped_ainvoke(pregel, test_state) + return result + + result = asyncio.run(run_test()) + assert result is not None + + tx = events[0] + assert tx["type"] == "transaction" + + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span has response model + assert invoke_agent_span["description"] == "invoke_agent test_graph" + assert "gen_ai.response.model" in invoke_agent_span["data"] + assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + + +def test_pregel_invoke_span_uses_last_response_model(sentry_init, capture_events): + """ + Test that when an agent makes multiple LLM calls (e.g., with tools), + the invoke_agent span reports the last response model used. 
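+ The model name is read from the last new message's response_metadata.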
+ """ + sentry_init( + integrations=[LanggraphIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + test_state = { + "messages": [ + MockMessage("Hello, can you help me?", name="user"), + MockMessage("Of course! How can I assist you?", name="assistant"), + ] + } + + pregel = MockPregelInstance("test_graph") + + expected_assistant_response = "I'll help you with that task!" + expected_tool_calls = [ + { + "id": "call_test_123", + "type": "function", + "function": {"name": "search_tool", "arguments": '{"query": "help"}'}, + } + ] + + def original_invoke(self, *args, **kwargs): + input_messages = args[0].get("messages", []) + new_messages = input_messages + [ + MockMessage( + content=expected_assistant_response, + name="assistant", + tool_calls=expected_tool_calls, + response_metadata={ + "token_usage": { + "total_tokens": 15, + "prompt_tokens": 10, + "completion_tokens": 5, + }, + "model_name": "gpt-4-0613", + }, + ), + MockMessage( + content=expected_assistant_response, + name="assistant", + tool_calls=expected_tool_calls, + response_metadata={ + "token_usage": { + "total_tokens": 35, + "prompt_tokens": 20, + "completion_tokens": 15, + }, + "model_name": "gpt-4.1-2025-04-14", + }, + ), + ] + return {"messages": new_messages} + + with start_transaction(): + wrapped_invoke = _wrap_pregel_invoke(original_invoke) + result = wrapped_invoke(pregel, test_state) + + assert result is not None + + tx = events[0] + assert tx["type"] == "transaction" + + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span uses the LAST response model + assert "gen_ai.response.model" in invoke_agent_span["data"] + assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + + +def test_pregel_ainvoke_span_uses_last_response_model(sentry_init, capture_events): + """ + Test that when an agent makes multiple LLM calls (e.g., with tools), + the invoke_agent span reports the last response model used. + """ + sentry_init( + integrations=[LanggraphIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + test_state = { + "messages": [ + MockMessage("Hello, can you help me?", name="user"), + MockMessage("Of course! How can I assist you?", name="assistant"), + ] + } + + pregel = MockPregelInstance("test_graph") + + expected_assistant_response = "I'll help you with that task!" 
+ expected_tool_calls = [ + { + "id": "call_test_123", + "type": "function", + "function": {"name": "search_tool", "arguments": '{"query": "help"}'}, + } + ] + + async def original_ainvoke(self, *args, **kwargs): + input_messages = args[0].get("messages", []) + new_messages = input_messages + [ + MockMessage( + content=expected_assistant_response, + name="assistant", + tool_calls=expected_tool_calls, + response_metadata={ + "token_usage": { + "total_tokens": 15, + "prompt_tokens": 10, + "completion_tokens": 5, + }, + "model_name": "gpt-4-0613", + }, + ), + MockMessage( + content=expected_assistant_response, + name="assistant", + tool_calls=expected_tool_calls, + response_metadata={ + "token_usage": { + "total_tokens": 35, + "prompt_tokens": 20, + "completion_tokens": 15, + }, + "model_name": "gpt-4.1-2025-04-14", + }, + ), + ] + return {"messages": new_messages} + + async def run_test(): + with start_transaction(): + wrapped_ainvoke = _wrap_pregel_ainvoke(original_ainvoke) + result = await wrapped_ainvoke(pregel, test_state) + return result + + result = asyncio.run(run_test()) + assert result is not None + + tx = events[0] + assert tx["type"] == "transaction" + + invoke_spans = [ + span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT + ] + assert len(invoke_spans) == 1 + + invoke_agent_span = invoke_spans[0] + + # Verify invoke_agent span uses the LAST response model + assert "gen_ai.response.model" in invoke_agent_span["data"] + assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14" + + def test_complex_message_parsing(): """Test message parsing with complex message structures.""" messages = [ From 4f3fab3193c3bb9833f4ec7cc40e6c22bdda5ac3 Mon Sep 17 00:00:00 2001 From: Alexander Alderman Webb Date: Wed, 10 Dec 2025 15:59:26 +0100 Subject: [PATCH 4/6] . 
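
Set the gen_ai.usage.* attributes only when a positive count was aggregated
(the counters are initialized to zero, so the previous `is not None` checks
always passed), and narrow the type comment for the span parameter.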
--- sentry_sdk/integrations/langgraph.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sentry_sdk/integrations/langgraph.py b/sentry_sdk/integrations/langgraph.py index 7ef4688028..5464b2daef 100644 --- a/sentry_sdk/integrations/langgraph.py +++ b/sentry_sdk/integrations/langgraph.py @@ -318,7 +318,7 @@ def _extract_tool_calls(messages): def _set_usage_data(span, messages): - # type: (Any, Any) -> None + # type: (sentry_sdk.tracing.Span, Any) -> None input_tokens = 0 output_tokens = 0 total_tokens = 0 @@ -336,13 +336,13 @@ def _set_usage_data(span, messages): output_tokens += int(token_usage.get("completion_tokens", 0)) total_tokens += int(token_usage.get("total_tokens", 0)) - if input_tokens is not None: + if input_tokens > 0: span.set_data(SPANDATA.GEN_AI_USAGE_INPUT_TOKENS, input_tokens) - if output_tokens is not None: + if output_tokens > 0: span.set_data(SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens) - if total_tokens is not None: + if total_tokens > 0: span.set_data( SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS, total_tokens, From 5eb9a9a271401f70047cdddcb49d3ea36c7242cd Mon Sep 17 00:00:00 2001 From: Alexander Alderman Webb Date: Wed, 10 Dec 2025 16:06:15 +0100 Subject: [PATCH 5/6] add type --- sentry_sdk/integrations/langgraph.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sentry_sdk/integrations/langgraph.py b/sentry_sdk/integrations/langgraph.py index 1589d0b15d..2832a47250 100644 --- a/sentry_sdk/integrations/langgraph.py +++ b/sentry_sdk/integrations/langgraph.py @@ -350,6 +350,7 @@ def _set_usage_data(span, messages): def _set_response_model_name(span, messages): + # type: (sentry_sdk.tracing.Span, Any) -> None last_message = messages[-1] response_metadata = last_message.get("response_metadata") if response_metadata is None: From 01486c6c240d6bbd11afce92181211e60170907c Mon Sep 17 00:00:00 2001 From: Alexander Alderman Webb Date: Thu, 11 Dec 2025 15:04:57 +0100 Subject: [PATCH 6/6] check for empty messages --- sentry_sdk/integrations/langgraph.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sentry_sdk/integrations/langgraph.py b/sentry_sdk/integrations/langgraph.py index 2832a47250..aa955a1a88 100644 --- a/sentry_sdk/integrations/langgraph.py +++ b/sentry_sdk/integrations/langgraph.py @@ -351,6 +351,9 @@ def _set_usage_data(span, messages): def _set_response_model_name(span, messages): # type: (sentry_sdk.tracing.Span, Any) -> None + if len(messages) == 0: + return + last_message = messages[-1] response_metadata = last_message.get("response_metadata") if response_metadata is None: