Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

### Fixed

- `opentelemetry-util-http`, `opentelemetry-instrumentation-requests`, `opentelemetry-instrumentation-wsgi`, `opentelemetry-instrumentation-asgi`: normalize byte-valued user-agent headers before detecting synthetic sources so attributes are recorded reliably.
([#4001](https://github.com/open-telemetry/opentelemetry-python-contrib/issues/4001))

## Version 1.39.0/0.60b0 (2025-12-03)

### Added
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ def client_response_hook(span: Span, scope: Scope, message: dict[str, Any]):
get_custom_headers,
normalise_request_header_name,
normalise_response_header_name,
normalize_user_agent,
parse_excluded_urls,
redact_url,
sanitize_method,
Expand Down Expand Up @@ -401,11 +402,15 @@ def collect_request_attributes(
)
http_user_agent = asgi_getter.get(scope, "user-agent")
if http_user_agent:
user_agent_value = http_user_agent[0]
_set_http_user_agent(result, user_agent_value, sem_conv_opt_in_mode)
user_agent_raw = http_user_agent[0]
user_agent_value = normalize_user_agent(user_agent_raw)
if user_agent_value:
_set_http_user_agent(
result, user_agent_value, sem_conv_opt_in_mode
)

# Check for synthetic user agent type
synthetic_type = detect_synthetic_user_agent(user_agent_value)
synthetic_type = detect_synthetic_user_agent(user_agent_raw)
if synthetic_type:
result[USER_AGENT_SYNTHETIC_TYPE] = synthetic_type

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ def response_hook(span, request_obj, response):
ExcludeList,
detect_synthetic_user_agent,
get_excluded_urls,
normalize_user_agent,
parse_excluded_urls,
redact_url,
sanitize_method,
Expand Down Expand Up @@ -252,8 +253,9 @@ def get_or_create_headers():

# Check for synthetic user agent type
headers = get_or_create_headers()
user_agent = headers.get("User-Agent")
synthetic_type = detect_synthetic_user_agent(user_agent)
user_agent_value = headers.get("User-Agent")
synthetic_type = detect_synthetic_user_agent(user_agent_value)
user_agent = normalize_user_agent(user_agent_value)
if synthetic_type:
span_attributes[USER_AGENT_SYNTHETIC_TYPE] = synthetic_type
if user_agent:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from unittest import mock

import httpretty
import requests

from opentelemetry.instrumentation.requests import RequestsInstrumentor
from opentelemetry.semconv._incubating.attributes.user_agent_attributes import (
USER_AGENT_ORIGINAL,
USER_AGENT_SYNTHETIC_TYPE,
UserAgentSyntheticTypeValues,
)
Expand Down Expand Up @@ -165,3 +168,34 @@ def test_user_agent_priority_alwayson_over_bot(self):
span.attributes.get(USER_AGENT_SYNTHETIC_TYPE),
UserAgentSyntheticTypeValues.TEST.value,
)

def test_user_agent_bytes_like_header(self):
    """Check span attributes when the User-Agent header is bytes.

    ``PreparedRequest.prepare_headers`` is patched so the prepared
    request stores the User-Agent as UTF-8 encoded bytes; the span must
    still carry the synthetic type and the original value as text.
    """
    prepared_request_cls = requests.models.PreparedRequest
    unpatched_prepare_headers = prepared_request_cls.prepare_headers

    def prepare_headers_bytes(self, headers):
        # Run the real header preparation first, then swap a str
        # User-Agent for its encoded bytes form.
        unpatched_prepare_headers(self, headers)
        current = self.headers.get("User-Agent")
        if isinstance(current, str):
            self.headers["User-Agent"] = current.encode("utf-8")

    bytes_header_patch = mock.patch(
        "requests.models.PreparedRequest.prepare_headers",
        new=prepare_headers_bytes,
    )
    with bytes_header_patch:
        requests.get(
            self.URL,
            headers={"User-Agent": "AlwaysOn-Monitor/1.0"},
            timeout=5,
        )

    recorded = self.assert_span().attributes
    self.assertEqual(
        recorded.get(USER_AGENT_SYNTHETIC_TYPE),
        UserAgentSyntheticTypeValues.TEST.value,
    )
    self.assertEqual(
        recorded.get(USER_AGENT_ORIGINAL),
        "AlwaysOn-Monitor/1.0",
    )
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ def response_hook(span: Span, environ: WSGIEnvironment, status: str, response_he
get_custom_headers,
normalise_request_header_name,
normalise_response_header_name,
normalize_user_agent,
redact_url,
sanitize_method,
)
Expand Down Expand Up @@ -391,14 +392,7 @@ def collect_request_attributes(
result, remote_host, sem_conv_opt_in_mode
)

user_agent = environ.get("HTTP_USER_AGENT")
if user_agent is not None and len(user_agent) > 0:
_set_http_user_agent(result, user_agent, sem_conv_opt_in_mode)

# Check for synthetic user agent type
synthetic_type = detect_synthetic_user_agent(user_agent)
if synthetic_type:
result[USER_AGENT_SYNTHETIC_TYPE] = synthetic_type
_apply_user_agent_attributes(result, environ, sem_conv_opt_in_mode)

flavor = environ.get("SERVER_PROTOCOL", "")
if flavor.upper().startswith(_HTTP_VERSION_PREFIX):
Expand All @@ -409,6 +403,25 @@ def collect_request_attributes(
return result


def _apply_user_agent_attributes(
    result: dict[str, str | None],
    environ: WSGIEnvironment,
    sem_conv_opt_in_mode: _StabilityMode,
):
    """Copy user-agent information from ``environ`` into ``result``.

    Reads ``HTTP_USER_AGENT`` from ``environ``. When the value is present
    and its normalized form is non-empty, the normalized value is recorded
    through ``_set_http_user_agent`` and, if ``detect_synthetic_user_agent``
    reports a type, that type is stored under ``USER_AGENT_SYNTHETIC_TYPE``.
    Otherwise ``result`` is left untouched.
    """
    raw_value = environ.get("HTTP_USER_AGENT")
    # Only normalize when a header value is actually present.
    decoded = normalize_user_agent(raw_value) if raw_value else None
    if decoded:
        _set_http_user_agent(result, decoded, sem_conv_opt_in_mode)
        # The detector receives the raw header value, not the decoded one.
        detected_type = detect_synthetic_user_agent(raw_value)
        if detected_type:
            result[USER_AGENT_SYNTHETIC_TYPE] = detected_type


def collect_custom_request_headers_attributes(environ: WSGIEnvironment):
"""Returns custom HTTP request headers which are configured by the user
from the PEP3333-conforming WSGI environ to be used as span creation attributes as described
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -795,6 +795,13 @@ def test_http_user_agent_attribute(self):
expected_new.items(),
)

def test_http_user_agent_bytes_like_attribute(self):
    # A bytes-valued HTTP_USER_AGENT must come back as text in the
    # user-agent attribute and be reported with synthetic type "test".
    raw_user_agent = b"AlwaysOn-Monitor/1.0"
    self.environ["HTTP_USER_AGENT"] = raw_user_agent

    collected = otel_wsgi.collect_request_attributes(self.environ)

    self.assertEqual(collected[HTTP_USER_AGENT], "AlwaysOn-Monitor/1.0")
    self.assertEqual(collected[USER_AGENT_SYNTHETIC_TYPE], "test")

def test_http_user_agent_synthetic_bot_detection(self):
"""Test that bot user agents are detected as synthetic with type 'bot'"""
test_cases = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from re import IGNORECASE as RE_IGNORECASE
from re import compile as re_compile
from re import search
from typing import Callable, Iterable, Optional, overload
from typing import Callable, Iterable, overload
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

from opentelemetry.semconv._incubating.attributes.http_attributes import (
Expand Down Expand Up @@ -307,7 +307,26 @@ def redact_url(url: str) -> str:
return url


def detect_synthetic_user_agent(user_agent: Optional[str]) -> Optional[str]:
def normalize_user_agent(
    user_agent: str | bytes | bytearray | memoryview | None,
) -> str | None:
    """Return a user-agent header value as text.

    ``None`` and ``str`` inputs are returned unchanged. ``bytes``,
    ``bytearray`` and ``memoryview`` inputs are decoded as latin-1. Any
    other object is converted with ``str()``.
    """
    # Header values may be surfaced as str or as one of several bytes-like
    # types; the decoding lives here so instrumentation modules can share
    # a single helper.
    if user_agent is None or isinstance(user_agent, str):
        return user_agent
    if isinstance(user_agent, memoryview):
        # memoryview has no decode(); copy its contents into bytes first.
        user_agent = user_agent.tobytes()
    if isinstance(user_agent, (bytes, bytearray)):
        return user_agent.decode("latin-1")
    return str(user_agent)


def detect_synthetic_user_agent(
user_agent: str | bytes | bytearray | memoryview | None,
) -> str | None:
"""
Detect synthetic user agent type based on user agent string contents.

Expand All @@ -321,6 +340,7 @@ def detect_synthetic_user_agent(user_agent: Optional[str]) -> Optional[str]:

Note: Test patterns take priority over bot patterns.
"""
user_agent = normalize_user_agent(user_agent)
if not user_agent:
return None

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,20 @@ def test_priority_test_over_bot(self):
result = detect_synthetic_user_agent(user_agent)
# alwayson should be checked first and return 'test'
self.assertEqual(result, UserAgentSyntheticTypeValues.TEST.value)

def test_bytes_like_user_agent(self):
    """Check detection for bytes, bytearray and memoryview user agents."""
    test_type = UserAgentSyntheticTypeValues.TEST.value
    bot_type = UserAgentSyntheticTypeValues.BOT.value
    expectations = (
        (b"alwayson-monitor/1.0", test_type),
        (bytearray(b"googlebot/2.1"), bot_type),
        (memoryview(b"MyApp/1.0"), None),
    )

    for user_agent, expected in expectations:
        with self.subTest(user_agent=user_agent):
            self.assertEqual(
                detect_synthetic_user_agent(user_agent), expected
            )