diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d79a899ba..148f3397ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Fixed + +- `opentelemetry-util-http`, `opentelemetry-instrumentation-requests`, `opentelemetry-instrumentation-wsgi`, `opentelemetry-instrumentation-asgi`: normalize byte-valued user-agent headers before detecting synthetic sources so attributes are recorded reliably. + ([#4001](https://github.com/open-telemetry/opentelemetry-python-contrib/issues/4001)) + ## Version 1.39.0/0.60b0 (2025-12-03) ### Added diff --git a/instrumentation/opentelemetry-instrumentation-asgi/src/opentelemetry/instrumentation/asgi/__init__.py b/instrumentation/opentelemetry-instrumentation-asgi/src/opentelemetry/instrumentation/asgi/__init__.py index fb809e6836..6891051a9e 100644 --- a/instrumentation/opentelemetry-instrumentation-asgi/src/opentelemetry/instrumentation/asgi/__init__.py +++ b/instrumentation/opentelemetry-instrumentation-asgi/src/opentelemetry/instrumentation/asgi/__init__.py @@ -283,6 +283,7 @@ def client_response_hook(span: Span, scope: Scope, message: dict[str, Any]): get_custom_headers, normalise_request_header_name, normalise_response_header_name, + normalize_user_agent, parse_excluded_urls, redact_url, sanitize_method, @@ -401,11 +402,15 @@ def collect_request_attributes( ) http_user_agent = asgi_getter.get(scope, "user-agent") if http_user_agent: - user_agent_value = http_user_agent[0] - _set_http_user_agent(result, user_agent_value, sem_conv_opt_in_mode) + user_agent_raw = http_user_agent[0] + user_agent_value = normalize_user_agent(user_agent_raw) + if user_agent_value: + _set_http_user_agent( + result, user_agent_value, sem_conv_opt_in_mode + ) # Check for synthetic user agent type - synthetic_type = detect_synthetic_user_agent(user_agent_value) + synthetic_type = detect_synthetic_user_agent(user_agent_raw) if synthetic_type: result[USER_AGENT_SYNTHETIC_TYPE] = synthetic_type diff --git a/instrumentation/opentelemetry-instrumentation-requests/src/opentelemetry/instrumentation/requests/__init__.py b/instrumentation/opentelemetry-instrumentation-requests/src/opentelemetry/instrumentation/requests/__init__.py index d834c1bb6c..f7fbf8bb48 100644 --- a/instrumentation/opentelemetry-instrumentation-requests/src/opentelemetry/instrumentation/requests/__init__.py +++ b/instrumentation/opentelemetry-instrumentation-requests/src/opentelemetry/instrumentation/requests/__init__.py @@ -153,6 +153,7 @@ def response_hook(span, request_obj, response): ExcludeList, detect_synthetic_user_agent, get_excluded_urls, + normalize_user_agent, parse_excluded_urls, redact_url, sanitize_method, @@ -252,8 +253,9 @@ def get_or_create_headers(): # Check for synthetic user agent type headers = get_or_create_headers() - user_agent = headers.get("User-Agent") - synthetic_type = detect_synthetic_user_agent(user_agent) + user_agent_value = headers.get("User-Agent") + synthetic_type = detect_synthetic_user_agent(user_agent_value) + user_agent = normalize_user_agent(user_agent_value) if synthetic_type: span_attributes[USER_AGENT_SYNTHETIC_TYPE] = synthetic_type if user_agent: diff --git a/instrumentation/opentelemetry-instrumentation-requests/tests/test_user_agent_synthetic.py b/instrumentation/opentelemetry-instrumentation-requests/tests/test_user_agent_synthetic.py index 4adcc2146b..a23dfd8a70 100644 --- a/instrumentation/opentelemetry-instrumentation-requests/tests/test_user_agent_synthetic.py +++ b/instrumentation/opentelemetry-instrumentation-requests/tests/test_user_agent_synthetic.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from unittest import mock + import httpretty import requests from opentelemetry.instrumentation.requests import RequestsInstrumentor from opentelemetry.semconv._incubating.attributes.user_agent_attributes import ( + USER_AGENT_ORIGINAL, USER_AGENT_SYNTHETIC_TYPE, UserAgentSyntheticTypeValues, ) @@ -165,3 +168,34 @@ def test_user_agent_priority_alwayson_over_bot(self): span.attributes.get(USER_AGENT_SYNTHETIC_TYPE), UserAgentSyntheticTypeValues.TEST.value, ) + + def test_user_agent_bytes_like_header(self): + """Test that bytes-like user agent headers are handled.""" + + original_prepare_headers = ( + requests.models.PreparedRequest.prepare_headers + ) + + def prepare_headers_bytes(self, headers): + original_prepare_headers(self, headers) + if "User-Agent" in self.headers: + value = self.headers["User-Agent"] + if isinstance(value, str): + self.headers["User-Agent"] = value.encode("utf-8") + + headers = {"User-Agent": "AlwaysOn-Monitor/1.0"} + with mock.patch( + "requests.models.PreparedRequest.prepare_headers", + new=prepare_headers_bytes, + ): + requests.get(self.URL, headers=headers, timeout=5) + + span = self.assert_span() + self.assertEqual( + span.attributes.get(USER_AGENT_SYNTHETIC_TYPE), + UserAgentSyntheticTypeValues.TEST.value, + ) + self.assertEqual( + span.attributes.get(USER_AGENT_ORIGINAL), + "AlwaysOn-Monitor/1.0", + ) diff --git a/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py b/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py index 1107287b68..da2fde4d65 100644 --- a/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py +++ b/instrumentation/opentelemetry-instrumentation-wsgi/src/opentelemetry/instrumentation/wsgi/__init__.py @@ -278,6 +278,7 @@ def response_hook(span: Span, environ: WSGIEnvironment, status: str, response_he get_custom_headers, normalise_request_header_name, normalise_response_header_name, + normalize_user_agent, redact_url, sanitize_method, ) @@ -391,14 +392,7 @@ def collect_request_attributes( result, remote_host, sem_conv_opt_in_mode ) - user_agent = environ.get("HTTP_USER_AGENT") - if user_agent is not None and len(user_agent) > 0: - _set_http_user_agent(result, user_agent, sem_conv_opt_in_mode) - - # Check for synthetic user agent type - synthetic_type = detect_synthetic_user_agent(user_agent) - if synthetic_type: - result[USER_AGENT_SYNTHETIC_TYPE] = synthetic_type + _apply_user_agent_attributes(result, environ, sem_conv_opt_in_mode) flavor = environ.get("SERVER_PROTOCOL", "") if flavor.upper().startswith(_HTTP_VERSION_PREFIX): @@ -409,6 +403,25 @@ def collect_request_attributes( return result +def _apply_user_agent_attributes( + result: dict[str, str | None], + environ: WSGIEnvironment, + sem_conv_opt_in_mode: _StabilityMode, +): + user_agent_raw = environ.get("HTTP_USER_AGENT") + if not user_agent_raw: + return + + user_agent = normalize_user_agent(user_agent_raw) + if not user_agent: + return + + _set_http_user_agent(result, user_agent, sem_conv_opt_in_mode) + synthetic_type = detect_synthetic_user_agent(user_agent_raw) + if synthetic_type: + result[USER_AGENT_SYNTHETIC_TYPE] = synthetic_type + + def collect_custom_request_headers_attributes(environ: WSGIEnvironment): """Returns custom HTTP request headers which are configured by the user from the PEP3333-conforming WSGI environ to be used as span creation attributes as described diff --git a/instrumentation/opentelemetry-instrumentation-wsgi/tests/test_wsgi_middleware.py b/instrumentation/opentelemetry-instrumentation-wsgi/tests/test_wsgi_middleware.py index bb6c3aca2f..6230d27826 100644 --- a/instrumentation/opentelemetry-instrumentation-wsgi/tests/test_wsgi_middleware.py +++ b/instrumentation/opentelemetry-instrumentation-wsgi/tests/test_wsgi_middleware.py @@ -795,6 +795,13 @@ def test_http_user_agent_attribute(self): expected_new.items(), ) + def test_http_user_agent_bytes_like_attribute(self): + self.environ["HTTP_USER_AGENT"] = b"AlwaysOn-Monitor/1.0" + attributes = otel_wsgi.collect_request_attributes(self.environ) + + self.assertEqual(attributes[HTTP_USER_AGENT], "AlwaysOn-Monitor/1.0") + self.assertEqual(attributes[USER_AGENT_SYNTHETIC_TYPE], "test") + def test_http_user_agent_synthetic_bot_detection(self): """Test that bot user agents are detected as synthetic with type 'bot'""" test_cases = [ diff --git a/util/opentelemetry-util-http/src/opentelemetry/util/http/__init__.py b/util/opentelemetry-util-http/src/opentelemetry/util/http/__init__.py index e23e03dede..35d3187c7a 100644 --- a/util/opentelemetry-util-http/src/opentelemetry/util/http/__init__.py +++ b/util/opentelemetry-util-http/src/opentelemetry/util/http/__init__.py @@ -19,7 +19,7 @@ from re import IGNORECASE as RE_IGNORECASE from re import compile as re_compile from re import search -from typing import Callable, Iterable, Optional, overload +from typing import Callable, Iterable, overload from urllib.parse import parse_qs, urlencode, urlparse, urlunparse from opentelemetry.semconv._incubating.attributes.http_attributes import ( @@ -307,7 +307,26 @@ def redact_url(url: str) -> str: return url -def detect_synthetic_user_agent(user_agent: Optional[str]) -> Optional[str]: +def normalize_user_agent( + user_agent: str | bytes | bytearray | memoryview | None, +) -> str | None: + """Convert user-agent header values into a usable string.""" + # Different servers/frameworks surface headers as str, bytes, bytearray or memoryview; + # keep decoding logic centralized so instrumentation modules just call this helper. + if user_agent is None: + return None + if isinstance(user_agent, str): + return user_agent + if isinstance(user_agent, (bytes, bytearray)): + return user_agent.decode("latin-1") + if isinstance(user_agent, memoryview): + return user_agent.tobytes().decode("latin-1") + return str(user_agent) + + +def detect_synthetic_user_agent( + user_agent: str | bytes | bytearray | memoryview | None, +) -> str | None: """ Detect synthetic user agent type based on user agent string contents. @@ -321,6 +340,7 @@ def detect_synthetic_user_agent(user_agent: Optional[str]) -> Optional[str]: Note: Test patterns take priority over bot patterns. """ + user_agent = normalize_user_agent(user_agent) if not user_agent: return None diff --git a/util/opentelemetry-util-http/tests/test_detect_synthetic_user_agent.py b/util/opentelemetry-util-http/tests/test_detect_synthetic_user_agent.py index 2d9d3e9913..0440c80460 100644 --- a/util/opentelemetry-util-http/tests/test_detect_synthetic_user_agent.py +++ b/util/opentelemetry-util-http/tests/test_detect_synthetic_user_agent.py @@ -86,3 +86,20 @@ def test_priority_test_over_bot(self): result = detect_synthetic_user_agent(user_agent) # alwayson should be checked first and return 'test' self.assertEqual(result, UserAgentSyntheticTypeValues.TEST.value) + + def test_bytes_like_user_agent(self): + """Test that bytes-like user agents are decoded and detected.""" + + test_cases = [ + (b"alwayson-monitor/1.0", UserAgentSyntheticTypeValues.TEST.value), + ( + bytearray(b"googlebot/2.1"), + UserAgentSyntheticTypeValues.BOT.value, + ), + (memoryview(b"MyApp/1.0"), None), + ] + + for user_agent, expected in test_cases: + with self.subTest(user_agent=user_agent): + result = detect_synthetic_user_agent(user_agent) + self.assertEqual(result, expected)