Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
ee55a3a
feat(openai-agents): use upstream gen_ai constants directly, rename L…
max-deygin-traceloop Mar 18, 2026
e1fb150
fix: remove duplicate GEN_AI_OPERATION_NAME dict keys and migrate too…
max-deygin-traceloop Mar 18, 2026
cd88432
fix(openai-agents): migrate to GEN_AI_INPUT/OUTPUT_MESSAGES, fix brok…
max-deygin-traceloop Mar 18, 2026
1133ac6
fix(openai-agents): update recipe hierarchy test to use JSON array me…
max-deygin-traceloop Mar 18, 2026
a763e85
fix(openai-agents): migrate realtime flat format to JSON arrays, remo…
max-deygin-traceloop Mar 18, 2026
2f6394c
fix(openai-agents): restore Speech/Transcription/SpeechGroup handlers…
max-deygin-traceloop Mar 18, 2026
741bdce
Improving coverage
max-deygin-traceloop Mar 31, 2026
2021b3e
fixed finish_reason assertion
max-deygin-traceloop Mar 31, 2026
826a2a7
Minor fixes
max-deygin-traceloop Mar 31, 2026
50bfc23
added gen_ai.tool.call response/request
max-deygin-traceloop Mar 31, 2026
6ea5fc0
adjust test
max-deygin-traceloop Mar 31, 2026
4f95dfe
test additions
max-deygin-traceloop Apr 12, 2026
a98c409
Merge branch 'main' into max/tlp-1928-openai-agents-insturmentation
max-deygin-traceloop Apr 12, 2026
f2e7241
Merge remote-tracking branch 'origin/main' into max/tlp-1928-openai-a…
max-deygin-traceloop Apr 15, 2026
1a0bc38
fix review comments
max-deygin-traceloop Apr 16, 2026
ca942bc
Merge remote-tracking branch 'origin/main' into max/tlp-1928-openai-a…
max-deygin-traceloop Apr 19, 2026
aa8707c
semconv final adjustments
max-deygin-traceloop Apr 19, 2026
b6b02aa
Review comments #2
max-deygin-traceloop Apr 19, 2026
8190972
missing tests
max-deygin-traceloop Apr 20, 2026
6b78202
bump
max-deygin-traceloop Apr 20, 2026
4bcca80
review fixes
max-deygin-traceloop Apr 27, 2026
cb10561
lint
max-deygin-traceloop Apr 27, 2026
1beb398
Merge branch 'main' into max/tlp-1928-openai-agents-insturmentation
max-deygin-traceloop Apr 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
so we need to patch the RealtimeSession class directly to add OpenTelemetry tracing.
"""

import json
import logging
import time
from typing import Dict, Any, Optional, List, Tuple
Expand Down Expand Up @@ -119,7 +120,7 @@ def start_workflow_span(self, agent_name: str):
kind=SpanKind.CLIENT,
attributes={
SpanAttributes.TRACELOOP_SPAN_KIND: TraceloopSpanKindValues.WORKFLOW.value,
GenAIAttributes.GEN_AI_SYSTEM: "openai_agents",
GenAIAttributes.GEN_AI_PROVIDER_NAME: "openai",
SpanAttributes.TRACELOOP_WORKFLOW_NAME: "Realtime Session",
},
)
Expand Down Expand Up @@ -175,7 +176,7 @@ def start_agent_span(self, agent_name: str):
attributes={
SpanAttributes.TRACELOOP_SPAN_KIND: TraceloopSpanKindValues.AGENT.value,
GenAIAttributes.GEN_AI_AGENT_NAME: agent_name,
GenAIAttributes.GEN_AI_SYSTEM: "openai_agents",
GenAIAttributes.GEN_AI_PROVIDER_NAME: "openai",
},
)
self.agent_spans[agent_name] = span
Expand All @@ -202,7 +203,7 @@ def start_tool_span(self, tool_name: str, agent_name: Optional[str] = None):
SpanAttributes.TRACELOOP_SPAN_KIND: TraceloopSpanKindValues.TOOL.value,
GenAIAttributes.GEN_AI_TOOL_NAME: tool_name,
GenAIAttributes.GEN_AI_TOOL_TYPE: "function",
GenAIAttributes.GEN_AI_SYSTEM: "openai_agents",
GenAIAttributes.GEN_AI_PROVIDER_NAME: "openai",
},
)
self.tool_spans[tool_name] = span
Expand Down Expand Up @@ -239,7 +240,7 @@ def create_handoff_span(self, from_agent: str, to_agent: str):
context=parent_context,
attributes={
SpanAttributes.TRACELOOP_SPAN_KIND: "handoff",
GenAIAttributes.GEN_AI_SYSTEM: "openai_agents",
GenAIAttributes.GEN_AI_PROVIDER_NAME: "openai",
GEN_AI_HANDOFF_FROM_AGENT: from_agent,
GEN_AI_HANDOFF_TO_AGENT: to_agent,
},
Expand All @@ -258,8 +259,8 @@ def start_audio_span(self, item_id: str, content_index: int):
kind=SpanKind.CLIENT,
context=parent_context,
attributes={
SpanAttributes.LLM_REQUEST_TYPE: "realtime",
GenAIAttributes.GEN_AI_SYSTEM: "openai",
GenAIAttributes.GEN_AI_OPERATION_NAME: "realtime",
GenAIAttributes.GEN_AI_PROVIDER_NAME: "openai",
},
)
Comment thread
max-deygin-traceloop marked this conversation as resolved.
self.audio_spans[span_key] = span
Expand Down Expand Up @@ -351,9 +352,8 @@ def create_llm_span(self, completion_content: str):
context=parent_context,
start_time=start_time,
attributes={
SpanAttributes.LLM_REQUEST_TYPE: "realtime",
SpanAttributes.LLM_SYSTEM: "openai",
GenAIAttributes.GEN_AI_SYSTEM: "openai",
GenAIAttributes.GEN_AI_OPERATION_NAME: "realtime",
GenAIAttributes.GEN_AI_PROVIDER_NAME: "openai",
GenAIAttributes.GEN_AI_REQUEST_MODEL: model_name_str,
},
)
Expand All @@ -373,21 +373,23 @@ def create_llm_span(self, completion_content: str):

if should_send_prompts():
if prompt_content:
input_msg = {
"role": prompt_role or "user",
"parts": [{"type": "text", "content": prompt_content}],
}
span.set_attribute(
f"{GenAIAttributes.GEN_AI_PROMPT}.0.role", prompt_role or "user"
)
span.set_attribute(
f"{GenAIAttributes.GEN_AI_PROMPT}.0.content", prompt_content
GenAIAttributes.GEN_AI_INPUT_MESSAGES,
json.dumps([input_msg]),
)

out_msg = {
"role": "assistant",
"parts": [{"type": "text", "content": completion_content}],
"finish_reason": None,
}
span.set_attribute(
f"{GenAIAttributes.GEN_AI_COMPLETION}.0.role", "assistant"
)
span.set_attribute(
f"{GenAIAttributes.GEN_AI_COMPLETION}.0.content", completion_content
)
span.set_attribute(
f"{GenAIAttributes.GEN_AI_COMPLETION}.0.finish_reason", "stop"
GenAIAttributes.GEN_AI_OUTPUT_MESSAGES,
json.dumps([out_msg]),
)

span.set_status(Status(StatusCode.OK))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import pytest
from unittest.mock import MagicMock
from opentelemetry.instrumentation.openai_agents import (
Expand Down Expand Up @@ -49,11 +50,11 @@ def test_dict_content_serialization(exporter):

spans = exporter.get_finished_spans()

# Look for any spans with prompt/content attributes
# Look for any spans with message content attributes
for span in spans:
for attr_name, attr_value in span.attributes.items():
prompt_content_check = ("prompt" in attr_name and "content" in attr_name) or (
"gen_ai.prompt" in attr_name and "content" in attr_name
prompt_content_check = (
attr_name in ("gen_ai.input.messages", "gen_ai.output.messages")
)
if prompt_content_check:
# All content attributes should be strings, not dicts
Expand Down Expand Up @@ -90,43 +91,47 @@ def test_agent_spans(exporter, test_agent):
assert agent_span.kind == agent_span.kind.CLIENT
assert agent_span.attributes[SpanAttributes.TRACELOOP_SPAN_KIND] == TraceloopSpanKindValues.AGENT.value
assert agent_span.attributes[GenAIAttributes.GEN_AI_AGENT_NAME] == "testAgent"
assert agent_span.attributes[GenAIAttributes.GEN_AI_SYSTEM] == "openai_agents"
assert agent_span.attributes[GenAIAttributes.GEN_AI_PROVIDER_NAME] == "openai"
assert agent_span.status.status_code == StatusCode.OK

# Agent span should NOT contain LLM parameters
assert SpanAttributes.LLM_REQUEST_TEMPERATURE not in agent_span.attributes
assert SpanAttributes.LLM_REQUEST_MAX_TOKENS not in agent_span.attributes
assert SpanAttributes.LLM_REQUEST_TOP_P not in agent_span.attributes
assert "openai.agent.model.frequency_penalty" not in agent_span.attributes
assert GenAIAttributes.GEN_AI_REQUEST_TEMPERATURE not in agent_span.attributes
assert GenAIAttributes.GEN_AI_REQUEST_MAX_TOKENS not in agent_span.attributes
assert GenAIAttributes.GEN_AI_REQUEST_TOP_P not in agent_span.attributes
assert GenAIAttributes.GEN_AI_REQUEST_FREQUENCY_PENALTY not in agent_span.attributes

# Find the response span (openai.response) - this should contain prompts/completions/usage
response_spans = [s for s in spans if s.name == "openai.response"]
assert len(response_spans) >= 1, f"Expected at least 1 openai.response span, got {len(response_spans)}"
response_span = response_spans[0]

# Test response span attributes (should contain prompts/completions/usage)

# Test proper semantic conventions
assert response_span.attributes[SpanAttributes.LLM_REQUEST_TYPE] == "response"
assert response_span.attributes["gen_ai.operation.name"] == "response"
assert response_span.attributes["gen_ai.system"] == "openai"
assert response_span.attributes[GenAIAttributes.GEN_AI_OPERATION_NAME] == "chat"
assert response_span.attributes[GenAIAttributes.GEN_AI_PROVIDER_NAME] == "openai"

# Test prompts using OpenAI semantic conventions
assert response_span.attributes[f"{GenAIAttributes.GEN_AI_PROMPT}.0.role"] == "user"
assert response_span.attributes[f"{GenAIAttributes.GEN_AI_PROMPT}.0.content"] == "What is AI?"
# Test input messages (JSON array with parts-based schema)
input_messages = json.loads(response_span.attributes[GenAIAttributes.GEN_AI_INPUT_MESSAGES])
assert input_messages[0]["role"] == "user"
assert "parts" in input_messages[0], "Input messages must use parts-based schema"
assert input_messages[0]["parts"][0]["type"] == "text"
assert input_messages[0]["parts"][0]["content"] == "What is AI?"

# Test usage tokens
assert response_span.attributes[GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS] is not None
assert response_span.attributes[GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS] is not None
assert response_span.attributes[SpanAttributes.LLM_USAGE_TOTAL_TOKENS] is not None
assert response_span.attributes[SpanAttributes.GEN_AI_USAGE_TOTAL_TOKENS] is not None
assert response_span.attributes[GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS] > 0
assert response_span.attributes[GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS] > 0
assert response_span.attributes[SpanAttributes.LLM_USAGE_TOTAL_TOKENS] > 0
assert response_span.attributes[SpanAttributes.GEN_AI_USAGE_TOTAL_TOKENS] > 0

# Test completions using OpenAI semantic conventions
assert response_span.attributes[f"{GenAIAttributes.GEN_AI_COMPLETION}.0.content"] is not None
assert len(response_span.attributes[f"{GenAIAttributes.GEN_AI_COMPLETION}.0.content"]) > 0
assert response_span.attributes[f"{GenAIAttributes.GEN_AI_COMPLETION}.0.role"] is not None
# Test output messages (JSON array with parts-based schema)
output_messages = json.loads(response_span.attributes[GenAIAttributes.GEN_AI_OUTPUT_MESSAGES])
assert "parts" in output_messages[0], "Output messages must use parts-based schema"
assert output_messages[0]["parts"][0]["type"] == "text"
assert output_messages[0]["parts"][0]["content"] is not None
assert len(output_messages[0]["parts"][0]["content"]) > 0
assert output_messages[0]["role"] is not None
assert "finish_reason" in output_messages[0], "Output messages must have finish_reason"

# Test model settings are in the response span
assert response_span.attributes["gen_ai.request.temperature"] == 0.3
Expand Down Expand Up @@ -444,60 +449,53 @@ async def get_city_info(city_name: str) -> str:
second_response_span = response_spans[1]

# The tool call and result appear in the SECOND response span as part of conversation history
# Find the assistant message with tool call
# Parse the input messages JSON array (parts-based schema)
input_messages = json.loads(
second_response_span.attributes[GenAIAttributes.GEN_AI_INPUT_MESSAGES]
)

tool_call_found = False
tool_result_found = False

for i in range(20): # Check conversation history
role_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.role"
if role_key not in second_response_span.attributes:
continue

role = second_response_span.attributes[role_key]
for msg in input_messages:
role = msg.get("role")
parts = msg.get("parts", [])

if role == "assistant" and not tool_call_found:
# Check if this assistant message has tool_calls
tool_call_name_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.tool_calls.0.name"
if tool_call_name_key in second_response_span.attributes:
tool_call_found = True
# Verify tool call attributes
assert second_response_span.attributes[tool_call_name_key] == "get_city_info", (
f"Expected tool name 'get_city_info', got '{second_response_span.attributes[tool_call_name_key]}'"
)
# Verify tool call ID exists
tool_call_id_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.tool_calls.0.id"
assert tool_call_id_key in second_response_span.attributes, (
f"Tool call ID not found at {tool_call_id_key}"
)
tool_call_id = second_response_span.attributes[tool_call_id_key]
assert len(tool_call_id) > 0, "Tool call ID should not be empty"

# Verify arguments exist and contain city name
tool_call_args_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.tool_calls.0.arguments"
assert tool_call_args_key in second_response_span.attributes, (
f"Tool call arguments not found at {tool_call_args_key}"
)
arguments = second_response_span.attributes[tool_call_args_key]
assert "London" in arguments or "london" in arguments.lower(), (
f"Expected 'London' in arguments, got: {arguments}"
)
# Look for tool_call parts
for part in parts:
if part.get("type") == "tool_call":
tool_call_found = True
assert part["name"] == "get_city_info", (
f"Expected tool name 'get_city_info', got '{part['name']}'"
)
tool_call_id = part.get("id", "")
assert len(tool_call_id) > 0, "Tool call ID should not be empty"
arguments = part.get("arguments", "")
if isinstance(arguments, dict):
arguments = json.dumps(arguments)
assert "London" in arguments or "london" in arguments.lower(), (
f"Expected 'London' in arguments, got: {arguments}"
)
break

elif role == "tool" and not tool_result_found:
tool_result_found = True
# Verify tool result attributes
content_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.content"
tool_call_id_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.tool_call_id"

assert content_key in second_response_span.attributes, f"Tool result content not found at {content_key}"
content = second_response_span.attributes[content_key]
assert len(content) > 0, "Tool result content should not be empty"
assert "London" in content or "9000000" in content or "United Kingdom" in content, (
f"Expected tool result to contain city info, got: {content}"
)

assert tool_call_id_key in second_response_span.attributes, f"Tool call ID not found at {tool_call_id_key}"
tool_call_id = second_response_span.attributes[tool_call_id_key]
assert len(tool_call_id) > 0, "Tool call ID should not be empty"

assert tool_call_found, "No assistant message with tool_calls found in second response span"
assert tool_result_found, "No tool message found in second response span"
# Look for tool_call_response parts
for part in parts:
if part.get("type") == "tool_call_response":
tool_result_found = True
response_text = part.get("response", "")
assert len(response_text) > 0, "Tool result response should not be empty"
assert (
"London" in response_text
or "9000000" in response_text
or "United Kingdom" in response_text
), (
f"Expected tool result to contain city info, got: {response_text}"
)
tool_call_id = part.get("id", "")
assert len(tool_call_id) > 0, "Tool call ID should not be empty"
break

assert tool_call_found, "No assistant message with tool_call parts found in second response span"
assert tool_result_found, "No tool message with tool_call_response parts found in second response span"
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.trace import StatusCode
from opentelemetry.semconv_ai import SpanAttributes
from opentelemetry.semconv._incubating.attributes import (
gen_ai_attributes as GenAIAttributes,
)
Expand Down Expand Up @@ -117,8 +116,7 @@ def test_speech_span_start_creates_otel_span(self, tracer_provider_and_exporter)
assert "openai.realtime.speech" in span_names

speech_span = next(s for s in spans if s.name == "openai.realtime.speech")
assert speech_span.attributes[SpanAttributes.LLM_REQUEST_TYPE] == "realtime"
assert speech_span.attributes["gen_ai.system"] == "openai"
assert speech_span.attributes["gen_ai.provider.name"] == "openai"
assert speech_span.attributes["gen_ai.operation.name"] == "speech"
assert speech_span.status.status_code == StatusCode.OK

Expand Down Expand Up @@ -213,8 +211,7 @@ def test_transcription_span_start_creates_otel_span(self, tracer_provider_and_ex
assert "openai.realtime.transcription" in span_names

transcription_span = next(s for s in spans if s.name == "openai.realtime.transcription")
assert transcription_span.attributes[SpanAttributes.LLM_REQUEST_TYPE] == "realtime"
assert transcription_span.attributes["gen_ai.system"] == "openai"
assert transcription_span.attributes["gen_ai.provider.name"] == "openai"
assert transcription_span.attributes["gen_ai.operation.name"] == "transcription"

def test_transcription_span_captures_model_and_format(self, tracer_provider_and_exporter):
Expand Down Expand Up @@ -306,8 +303,7 @@ def test_speech_group_span_creates_otel_span(self, tracer_provider_and_exporter)
assert "openai.realtime.speech_group" in span_names

speech_group_span = next(s for s in spans if s.name == "openai.realtime.speech_group")
assert speech_group_span.attributes[SpanAttributes.LLM_REQUEST_TYPE] == "realtime"
assert speech_group_span.attributes["gen_ai.system"] == "openai"
assert speech_group_span.attributes["gen_ai.provider.name"] == "openai"
assert speech_group_span.attributes["gen_ai.operation.name"] == "speech_group"
assert speech_group_span.status.status_code == StatusCode.OK

Expand Down
Loading
Loading