From 94fb46e3b68b7ead95eac2fd19c615f622487c11 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 16 Aug 2022 01:32:57 -0500 Subject: [PATCH] Time how long it takes us to do backfill processing The after timing can be influenced by slow `/state_ids` requests when proessing the pulled events. Part of https://github.com/matrix-org/synapse/issues/13356 --- synapse/handlers/federation.py | 26 +++++++++++++++ synapse/handlers/federation_event.py | 48 ++++++++++++++++++++-------- 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index 5042236742da..db90e626d225 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -32,6 +32,7 @@ ) import attr +from prometheus_client import Histogram from signedjson.key import decode_verify_key_bytes from signedjson.sign import verify_signed_json from unpaddedbase64 import decode_base64 @@ -80,6 +81,24 @@ logger = logging.getLogger(__name__) +backfill_processing_before_timer = Histogram( + "synapse_federation_backfill_processing_before_time_seconds", + "sec", + [], + buckets=( + 1.0, + 5.0, + 10.0, + 20.0, + 30.0, + 40.0, + 60.0, + 80.0, + "+Inf", + ), +) + + def get_domains_from_state(state: StateMap[EventBase]) -> List[Tuple[str, int]]: """Get joined domains from state @@ -138,6 +157,7 @@ class FederationHandler: def __init__(self, hs: "HomeServer"): self.hs = hs + self.clock = hs.get_clock() self.store = hs.get_datastores().main self._storage_controllers = hs.get_storage_controllers() self._state_storage_controller = self._storage_controllers.state @@ -203,6 +223,7 @@ async def maybe_backfill( async def _maybe_backfill_inner( self, room_id: str, current_depth: int, limit: int ) -> bool: + processing_start_time = self.clock.time_msec() backwards_extremities = [ _BackfillPoint(event_id, depth, _BackfillPointType.BACKWARDS_EXTREMITY) for event_id, depth in await self.store.get_oldest_event_ids_with_depth_in_room( @@ -425,6 +446,11 @@ async def try_backfill(domains: List[str]) -> bool: return False + processing_end_time = self.clock.time_msec() + backfill_processing_before_timer.observe( + (processing_start_time - processing_end_time) / 1000 + ) + success = await try_backfill(likely_domains) if success: return True diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index 8968b705d43a..986f2b61e57d 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -29,7 +29,7 @@ Tuple, ) -from prometheus_client import Counter +from prometheus_client import Counter, Histogram from synapse import event_auth from synapse.api.constants import ( @@ -92,6 +92,25 @@ "Events received over federation that we marked as soft_failed", ) +backfill_processing_after_timer = Histogram( + "synapse_federation_backfill_processing_after_time_seconds", + "sec", + [], + buckets=( + 1.0, + 5.0, + 10.0, + 20.0, + 30.0, + 40.0, + 60.0, + 80.0, + 120.0, + 180.0, + "+Inf", + ), +) + class FederationEventHandler: """Handles events that originated from federation. @@ -597,20 +616,21 @@ async def backfill( if not events: return - # if there are any events in the wrong room, the remote server is buggy and - # should not be trusted. - for ev in events: - if ev.room_id != room_id: - raise InvalidResponseError( - f"Remote server {dest} returned event {ev.event_id} which is in " - f"room {ev.room_id}, when we were backfilling in {room_id}" - ) + with backfill_processing_after_timer.time(): + # if there are any events in the wrong room, the remote server is buggy and + # should not be trusted. + for ev in events: + if ev.room_id != room_id: + raise InvalidResponseError( + f"Remote server {dest} returned event {ev.event_id} which is in " + f"room {ev.room_id}, when we were backfilling in {room_id}" + ) - await self._process_pulled_events( - dest, - events, - backfilled=True, - ) + await self._process_pulled_events( + dest, + events, + backfilled=True, + ) @trace async def _get_missing_events_for_pdu(