diff --git a/instrumentation-genai/opentelemetry-instrumentation-google-genai/CHANGELOG.md b/instrumentation-genai/opentelemetry-instrumentation-google-genai/CHANGELOG.md
index bdc05dc98b..3b62e516bd 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-google-genai/CHANGELOG.md
+++ b/instrumentation-genai/opentelemetry-instrumentation-google-genai/CHANGELOG.md
@@ -6,8 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## Unreleased
-
-Add `gen_ai.usage.cache_read.input_tokens` attribute to capture cached tokens on spans/events when the experimental sem conv flag is set. ([#4313](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/4313))
+- Add `gen_ai.usage.cache_read.input_tokens` attribute to capture cached tokens on spans/events when the experimental sem conv flag is set. ([#4313](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/4313))
+- Include thinking tokens in `gen_ai.usage.output_tokens` ([#4206](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/4206)).
+- Add `gen_ai.usage.reasoning.output_tokens` to span and event attributes ([#4276](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/4276)).
+
 ## Version 0.7b0 (2026-02-20)
 
 - Fix bug in how tokens are counted when using the streaming `generateContent` method. ([#4152](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/4152)).
diff --git a/instrumentation-genai/opentelemetry-instrumentation-google-genai/src/opentelemetry/instrumentation/google_genai/generate_content.py b/instrumentation-genai/opentelemetry-instrumentation-google-genai/src/opentelemetry/instrumentation/google_genai/generate_content.py
index 874bcb8144..46513b9b3b 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-google-genai/src/opentelemetry/instrumentation/google_genai/generate_content.py
+++ b/instrumentation-genai/opentelemetry-instrumentation-google-genai/src/opentelemetry/instrumentation/google_genai/generate_content.py
@@ -101,6 +101,10 @@ GEN_AI_TOOL_DEFINITIONS = getattr(
     gen_ai_attributes, "GEN_AI_TOOL_DEFINITIONS", "gen_ai.tool.definitions"
 )
+# TODO: Replace with the new semconv once it's defined.
+GEN_AI_USAGE_REASONING_OUTPUT_TOKENS = (
+    "gen_ai.usage.reasoning.output_tokens"
+)
 
 # Constant used to make the absence of content more understandable.
 _CONTENT_ELIDED = ""
 
@@ -526,6 +530,7 @@ def __init__(
         self._input_tokens = 0
         self._cached_tokens = 0
         self._output_tokens = 0
+        self._thoughts_tokens = 0
         sem_conv_opt_in_mode = _OpenTelemetrySemanticConventionStability._get_opentelemetry_stability_opt_in_mode(
             _OpenTelemetryStabilitySignalType.GEN_AI
         )
@@ -627,18 +632,33 @@ def _maybe_update_token_counts(self, response: GenerateContentResponse):
         input_tokens = _get_response_property(
             response, "usage_metadata.prompt_token_count"
         )
-        output_tokens = _get_response_property(
+        candidates_tokens = _get_response_property(
             response, "usage_metadata.candidates_token_count"
         )
+
+        thoughts_tokens = _get_response_property(
+            response, "usage_metadata.thoughts_token_count"
+        )
+        output_tokens: int = 0
+        if candidates_tokens and isinstance(candidates_tokens, int):
+            output_tokens += candidates_tokens
+        if thoughts_tokens and isinstance(thoughts_tokens, int):
+            self._thoughts_tokens = thoughts_tokens
+            output_tokens += self._thoughts_tokens
+
         cached_tokens = _get_response_property(
             response, "usage_metadata.cached_content_token_count"
        )
         if cached_tokens and isinstance(cached_tokens, int):
             self._cached_tokens = cached_tokens
+
         if input_tokens and isinstance(input_tokens, int):
             self._input_tokens = input_tokens
-        if output_tokens and isinstance(output_tokens, int):
-            self._output_tokens = output_tokens
+
+        # Avoid resetting the running total on responses that carry no
+        # usage metadata; streaming chunks report cumulative counts.
+        if output_tokens:
+            self._output_tokens = output_tokens
 
     def _maybe_update_error_type(self, response: GenerateContentResponse):
         if response.candidates:
@@ -778,6 +798,12 @@ def _maybe_log_completion_details(
             event.attributes[
                 gen_ai_attributes.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS
             ] = self._cached_tokens
+        span.set_attribute(
+            GEN_AI_USAGE_REASONING_OUTPUT_TOKENS, self._thoughts_tokens
+        )
+        event.attributes[GEN_AI_USAGE_REASONING_OUTPUT_TOKENS] = (
+            self._thoughts_tokens
+        )
         tool_definitions = tool_definitions or []
         self.completion_hook.on_completion(
             inputs=input_messages,
diff --git a/instrumentation-genai/opentelemetry-instrumentation-google-genai/tests/generate_content/nonstreaming_base.py b/instrumentation-genai/opentelemetry-instrumentation-google-genai/tests/generate_content/nonstreaming_base.py
index 00596d3ce4..56ff358478 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-google-genai/tests/generate_content/nonstreaming_base.py
+++ b/instrumentation-genai/opentelemetry-instrumentation-google-genai/tests/generate_content/nonstreaming_base.py
@@ -261,17 +261,25 @@ def test_generated_span_has_vertex_ai_system_when_configured(self):
 
     def test_generated_span_counts_tokens(self):
         self.configure_valid_response(
-            input_tokens=123, output_tokens=456, cached_tokens=50
+            input_tokens=123,
+            candidates_tokens=456,
+            thoughts_tokens=789,
+            cached_tokens=50,
         )
         self.generate_content(model="gemini-2.0-flash", contents="Some input")
         self.otel.assert_has_span_named("generate_content gemini-2.0-flash")
         span = self.otel.get_span_named("generate_content gemini-2.0-flash")
         self.assertEqual(span.attributes["gen_ai.usage.input_tokens"], 123)
-        self.assertEqual(span.attributes["gen_ai.usage.output_tokens"], 456)
+        self.assertEqual(
+            span.attributes["gen_ai.usage.output_tokens"], 456 + 789
+        )
         # New sem conv should not appear when flag is not experimental mode..
         self.assertNotIn(
             "gen_ai.usage.cache_read.input_tokens", span.attributes
         )
+        self.assertNotIn(
+            "gen_ai.usage.reasoning.output_tokens", span.attributes
+        )
 
     @patch.dict(
         "os.environ",
@@ -452,7 +460,7 @@ def test_new_semconv_record_completion_as_log(self):
             self.setUp()
             with patched_environ, patched_otel_mapping:
                 self.configure_valid_response(
-                    text=output, cached_tokens=50
+                    text=output, cached_tokens=50, thoughts_tokens=10
                 )
                 self.generate_content(
                     model="gemini-2.0-flash",
@@ -475,6 +483,12 @@ def test_new_semconv_record_completion_as_log(self):
                     ],
                     50,
                 )
+                self.assertEqual(
+                    event.attributes[
+                        "gen_ai.usage.reasoning.output_tokens"
+                    ],
+                    10,
+                )
                 assert (
                     event.attributes[
                         "gcp.gen_ai.operation.config.response_schema"
@@ -780,7 +794,9 @@ def test_new_semconv_record_completion_in_span(self):
             self.setUp()
             with patched_environ, patched_otel_mapping:
                 self.configure_valid_response(
-                    text="Some response content", cached_tokens=50
+                    text="Some response content",
+                    cached_tokens=50,
+                    thoughts_tokens=10,
                )
                 self.generate_content(
                     model="gemini-2.0-flash",
@@ -800,6 +816,12 @@ def test_new_semconv_record_completion_in_span(self):
                     ],
                     50,
                 )
+                self.assertEqual(
+                    span.attributes[
+                        "gen_ai.usage.reasoning.output_tokens"
+                    ],
+                    10,
+                )
                 if mode in [
                     ContentCapturingMode.SPAN_ONLY,
                     ContentCapturingMode.SPAN_AND_EVENT,
diff --git a/instrumentation-genai/opentelemetry-instrumentation-google-genai/tests/generate_content/streaming_base.py b/instrumentation-genai/opentelemetry-instrumentation-google-genai/tests/generate_content/streaming_base.py
index 9d702033bb..bc7c02b575 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-google-genai/tests/generate_content/streaming_base.py
+++ b/instrumentation-genai/opentelemetry-instrumentation-google-genai/tests/generate_content/streaming_base.py
@@ -91,16 +91,25 @@ def test_handles_multiple_ressponses(self):
 
     def test_includes_token_counts_in_span_not_aggregated_from_responses(self):
         # Tokens should not be aggregated in streaming. Cumulative counts are returned on each response.
-        self.configure_valid_response(input_tokens=3, output_tokens=5)
-        self.configure_valid_response(input_tokens=3, output_tokens=5)
-        self.configure_valid_response(input_tokens=3, output_tokens=5)
+        self.configure_valid_response(
+            input_tokens=3, candidates_tokens=5, thoughts_tokens=2
+        )
+        self.configure_valid_response(
+            input_tokens=3, candidates_tokens=5, thoughts_tokens=2
+        )
+        self.configure_valid_response(
+            input_tokens=3, candidates_tokens=5, thoughts_tokens=2
+        )
 
         self.generate_content(model="gemini-2.0-flash", contents="Some input")
         self.otel.assert_has_span_named("generate_content gemini-2.0-flash")
         span = self.otel.get_span_named("generate_content gemini-2.0-flash")
 
         self.assertEqual(span.attributes["gen_ai.usage.input_tokens"], 3)
-        self.assertEqual(span.attributes["gen_ai.usage.output_tokens"], 5)
+        self.assertEqual(span.attributes["gen_ai.usage.output_tokens"], 7)
+        self.assertNotIn(
+            "gen_ai.usage.reasoning.output_tokens", span.attributes
+        )
 
     def test_new_semconv_log_has_extra_genai_attributes(self):
         patched_environ = patch.dict(
@@ -117,7 +126,9 @@ def test_new_semconv_log_has_extra_genai_attributes(self):
             },
         )
         with patched_environ, patched_otel_mapping:
-            self.configure_valid_response(text="Yep, it works!")
+            self.configure_valid_response(
+                text="Yep, it works!", thoughts_tokens=10
+            )
             tok = context_api.attach(
                 context_api.set_value(
                     GENERATE_CONTENT_EXTRA_ATTRIBUTES_CONTEXT_KEY,
@@ -135,6 +146,10 @@ def test_new_semconv_log_has_extra_genai_attributes(self):
             event = self.otel.get_event_named(
                 "gen_ai.client.inference.operation.details"
             )
+            self.assertEqual(
+                event.attributes["gen_ai.usage.reasoning.output_tokens"],
+                10,
+            )
             assert (
                 event.attributes["extra_attribute_key"]
                 == "extra_attribute_value"
diff --git a/instrumentation-genai/opentelemetry-instrumentation-google-genai/tests/generate_content/util.py b/instrumentation-genai/opentelemetry-instrumentation-google-genai/tests/generate_content/util.py
index 3c3f81c646..c9ee78ca0f 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-google-genai/tests/generate_content/util.py
+++ b/instrumentation-genai/opentelemetry-instrumentation-google-genai/tests/generate_content/util.py
@@ -25,7 +25,8 @@ def create_response(
     candidates: Optional[list[genai_types.Candidate]] = None,
     text: Optional[str] = None,
     input_tokens: Optional[int] = None,
-    output_tokens: Optional[int] = None,
+    candidates_tokens: Optional[int] = None,
+    thoughts_tokens: Optional[int] = None,
     cached_tokens: Optional[int] = None,
     model_version: Optional[str] = None,
     usage_metadata: Optional[
@@ -52,8 +53,10 @@
         usage_metadata = genai_types.GenerateContentResponseUsageMetadata()
     if input_tokens is not None:
         usage_metadata.prompt_token_count = input_tokens
-    if output_tokens is not None:
-        usage_metadata.candidates_token_count = output_tokens
+    if candidates_tokens is not None:
+        usage_metadata.candidates_token_count = candidates_tokens
+    if thoughts_tokens is not None:
+        usage_metadata.thoughts_token_count = thoughts_tokens
    if cached_tokens is not None:
        usage_metadata.cached_content_token_count = cached_tokens
    return genai_types.GenerateContentResponse(
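
Reviewer note (not part of the patch): the sketch below illustrates the accounting that the patched `_maybe_update_token_counts` performs, namely summing `candidates_token_count` and `thoughts_token_count` into `gen_ai.usage.output_tokens` while treating absent counters as zero. The `UsageMetadata` stand-in and the `combined_output_tokens` helper are hypothetical names introduced only for this example; they do not exist in the instrumentation.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class UsageMetadata:
    # Hypothetical stand-in for genai_types.GenerateContentResponseUsageMetadata;
    # any counter may be missing (None) on a given response.
    candidates_token_count: Optional[int] = None
    thoughts_token_count: Optional[int] = None


def combined_output_tokens(usage: UsageMetadata) -> int:
    # Mirrors the patched logic: fold thinking tokens into the output
    # total, counting absent or zero fields as contributing nothing.
    total = 0
    if usage.candidates_token_count:
        total += usage.candidates_token_count
    if usage.thoughts_token_count:
        total += usage.thoughts_token_count
    return total


# Matches the `456 + 789` assertion in nonstreaming_base.py.
assert combined_output_tokens(
    UsageMetadata(candidates_token_count=456, thoughts_token_count=789)
) == 1245

# A response without thinking tokens degrades to the candidates count alone.
assert combined_output_tokens(UsageMetadata(candidates_token_count=5)) == 5
```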