Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Draft: /messages investigation scratch pad1 #13440

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
522c29b
Instrument /messages for understandable traces
MadLittleMods Jul 23, 2022
b6a18d2
Trace in Complement
MadLittleMods Aug 3, 2022
2504bc6
Merge branch 'madlittlemods/instrument-messages-tracing' into madlitt…
MadLittleMods Aug 3, 2022
9cd6320
Fix imports after OTEL changes
MadLittleMods Aug 3, 2022
c3f3e59
Merge branch 'madlittlemods/11850-migrate-to-opentelemetry' into madl…
MadLittleMods Aug 3, 2022
9f69182
Move Twisted git install where it was before
MadLittleMods Aug 3, 2022
2f75287
Fix @tag_args being one-off (ahead)
MadLittleMods Aug 3, 2022
fdce1c2
Allow @trace and @tag_args to be used together
MadLittleMods Aug 4, 2022
a7eabb7
Trace more
MadLittleMods Aug 4, 2022
13855c5
More tracing for federated side
MadLittleMods Aug 6, 2022
552b7f1
More tracing for federation
MadLittleMods Aug 6, 2022
c51883e
Add length to the list of events
MadLittleMods Aug 6, 2022
ee465f9
Fix some lints (mistakes) and better trace when fetching events
MadLittleMods Aug 6, 2022
aa5e925
Only set attribute if going forward
MadLittleMods Aug 9, 2022
2a467fd
Merge branch 'madlittlemods/11850-migrate-to-opentelemetry' into madl…
MadLittleMods Aug 9, 2022
597c3f2
Trace some results
MadLittleMods Aug 9, 2022
f4ec9d1
Instrument FederationStateIdsServlet
MadLittleMods Aug 10, 2022
898ba0e
More tracing
MadLittleMods Aug 11, 2022
53b8453
Refactor from feedback
MadLittleMods Aug 15, 2022
0f2bfa4
Merge branch 'madlittlemods/11850-migrate-to-opentelemetry' into madl…
MadLittleMods Aug 18, 2022
db04b16
Some cleanup
MadLittleMods Aug 18, 2022
4168ba5
Remove debug logs
MadLittleMods Aug 19, 2022
05e5113
Merge branch 'madlittlemods/11850-migrate-to-opentelemetry' into madl…
MadLittleMods Sep 20, 2022
d8899e4
Merge branch 'madlittlemods/11850-migrate-to-opentelemetry' into madl…
MadLittleMods Sep 26, 2022
04de9ea
Merge branch 'madlittlemods/11850-migrate-to-opentelemetry' into madl…
MadLittleMods Nov 18, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions docker/conf/homeserver.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,23 @@ trusted_key_servers:

password_config:
MadLittleMods marked this conversation as resolved.
Show resolved Hide resolved
enabled: true


# foo
tracing:
enabled: true
sample_rate: 1
homeserver_whitelist:
- ".*"
jaeger_exporter_config:
agent_host_name: host.docker.internal
agent_port: 6831
# Split UDP packets (UDP_PACKET_MAX_LENGTH is set to 65k in OpenTelemetry)
udp_split_oversized_batches: true
# If you define a collector, it will communicate directly to the collector,
# bypassing the agent
#
# It does not seem like the agent can keep up with the massive UDP load
# (1065 spans in one trace) so lets just use the HTTP collector endpoint
# instead which seems to work.
Comment on lines +203 to +205
Copy link
Contributor Author

@MadLittleMods MadLittleMods Aug 3, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder why this is the case? I was seeing this same behavior with the Jaeger opentracing stuff. Is the UDP connection being over saturated? Can the Jaeger agent in Docker not keep up? We see some spans come over but never the main servlet overarching one that is probably the last to be exported.

But using the HTTP Jaeger collector endpoint seems to work fine for getting the whole trace.

collector_endpoint: "http://host.docker.internal:14268/api/traces?format=jaeger.thrift"
29 changes: 20 additions & 9 deletions synapse/handlers/federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,13 @@
from synapse.federation.federation_client import InvalidResponseError
from synapse.http.servlet import assert_params_in_dict
from synapse.logging.context import nested_logging_context
from synapse.logging.tracing import SynapseTags, set_attribute, tag_args, trace
from synapse.logging.tracing import (
SynapseTags,
set_attribute,
start_active_span,
tag_args,
trace,
)
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.module_api import NOT_SPAM
from synapse.replication.http.federation import (
Expand Down Expand Up @@ -349,10 +355,14 @@ async def _maybe_backfill_inner(
# attempting to paginate before backfill reached the visible history.

extremities_to_request: List[str] = []
for bp in sorted_backfill_points:
for i, bp in enumerate(sorted_backfill_points):
if len(extremities_to_request) >= 5:
break

set_attribute(
SynapseTags.RESULT_PREFIX + "backfill_point" + str(i), str(bp)
)

# For regular backwards extremities, we don't have the extremity events
# themselves, so we need to actually check the events that reference them -
# their "successor" events.
Expand Down Expand Up @@ -409,14 +419,15 @@ async def _maybe_backfill_inner(
str(len(extremities_to_request)),
)

# Now we need to decide which hosts to hit first.
# First we try hosts that are already in the room.
# TODO: HEURISTIC ALERT.
likely_domains = (
await self._storage_controllers.state.get_current_hosts_in_room_ordered(
room_id
with start_active_span("getting likely_domains"):
# Now we need to decide which hosts to hit first.
# First we try hosts that are already in the room.
# TODO: HEURISTIC ALERT.
likely_domains = (
await self._storage_controllers.state.get_current_hosts_in_room_ordered(
room_id
)
)
)

async def try_backfill(domains: Collection[str]) -> bool:
# TODO: Should we try multiple of these at a time?
Expand Down
5 changes: 5 additions & 0 deletions synapse/handlers/federation_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -1249,6 +1249,11 @@ async def _get_state_ids_after_missing_prev_event(
str(len(failed_to_fetch)),
)

set_attribute(
SynapseTags.RESULT_PREFIX + f"failed_to_fetch ({len(failed_to_fetch)})",
str(failed_to_fetch),
)

if remote_event.is_state() and remote_event.rejected_reason is None:
state_map[
(remote_event.type, remote_event.state_key)
Expand Down
10 changes: 9 additions & 1 deletion synapse/handlers/relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from synapse.api.constants import EventTypes, RelationTypes
from synapse.api.errors import SynapseError
from synapse.events import EventBase, relation_from_event
from synapse.logging.tracing import trace
from synapse.logging.tracing import SynapseTags, set_attribute, trace
from synapse.storage.databases.main.relations import ThreadsNextBatch, _RelatedEvent
from synapse.streams.config import PaginationConfig
from synapse.types import JsonDict, Requester, StreamToken, UserID
Expand Down Expand Up @@ -172,6 +172,7 @@ async def get_relations(

return return_value

@trace
async def get_relations_for_event(
self,
event_id: str,
Expand Down Expand Up @@ -259,6 +260,7 @@ async def redact_events_related_to(
e.msg,
)

@trace
async def get_annotations_for_event(
self,
event_id: str,
Expand Down Expand Up @@ -304,6 +306,7 @@ async def get_annotations_for_event(

return filtered_results

@trace
async def _get_threads_for_events(
self,
events_by_id: Dict[str, EventBase],
Expand Down Expand Up @@ -465,6 +468,11 @@ async def get_bundled_aggregations(
# The event should get bundled aggregations.
events_by_id[event.event_id] = event

set_attribute(
SynapseTags.FUNC_ARG_PREFIX + f"event_ids ({len(events_by_id)})",
str(events_by_id.keys()),
)

# event ID -> bundled aggregation in non-serialized form.
results: Dict[str, BundledAggregations] = {}

Expand Down
4 changes: 2 additions & 2 deletions synapse/logging/tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,8 @@ class SynapseTags:

# Used to tag function arguments
#
# Tag a named arg. The name of the argument should be appended to this prefix.
# Tag a named arg. The name of the argument should be appended to this
# prefix.
FUNC_ARG_PREFIX = "ARG."
# Tag extra variadic number of positional arguments (`def foo(first, second, *extras)`)
FUNC_ARGS = "args"
Expand Down Expand Up @@ -928,7 +929,6 @@ def trace(func: Callable[P, R]) -> Callable[P, R]:

See the module's doc string for usage examples.
"""

return trace_with_opname(func.__name__)(func)


Expand Down
5 changes: 5 additions & 0 deletions synapse/storage/databases/main/relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from synapse.api.constants import MAIN_TIMELINE, RelationTypes
from synapse.api.errors import SynapseError
from synapse.events import EventBase
from synapse.logging.tracing import trace
from synapse.storage._base import SQLBaseStore
from synapse.storage.database import (
DatabasePool,
Expand Down Expand Up @@ -503,6 +504,10 @@ def _get_aggregation_groups_for_users_txn(
def get_applicable_edit(self, event_id: str) -> Optional[EventBase]:
raise NotImplementedError()

# TODO: What's the proper way to fix this so we can stack @trace on top of
# @cachedList
#
# @trace
@cachedList(cached_method_name="get_applicable_edit", list_name="event_ids")
async def get_applicable_edits(
self, event_ids: Collection[str]
Expand Down