From ca3db044a3b5a207ff8d65ad7b761427ab215ccc Mon Sep 17 00:00:00 2001 From: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> Date: Tue, 26 Jul 2022 12:47:31 +0100 Subject: [PATCH] Fix infinite loop in partial-state resync (#13353) Make sure that we only pull out events from the db once they have no prev-events with partial state. --- changelog.d/13353.bugfix | 1 + synapse/handlers/federation_event.py | 14 ++++++------- .../storage/databases/main/events_worker.py | 20 ++++++++++++++++++- 3 files changed, 27 insertions(+), 8 deletions(-) create mode 100644 changelog.d/13353.bugfix diff --git a/changelog.d/13353.bugfix b/changelog.d/13353.bugfix new file mode 100644 index 000000000000..8e18bfae1f9d --- /dev/null +++ b/changelog.d/13353.bugfix @@ -0,0 +1 @@ +Fix a bug in the experimental faster-room-joins support which could cause it to get stuck in an infinite loop. diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index fc1254d2ad77..2ba2b1527eac 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -569,15 +569,15 @@ async def update_state_for_partial_state_event( if context is None or context.partial_state: # this can happen if some or all of the event's prev_events still have - # partial state - ie, an event has an earlier stream_ordering than one - # or more of its prev_events, so we de-partial-state it before its - # prev_events. + # partial state. We were careful to only pick events from the db without + # partial-state prev events, so that implies that a prev event has + # been persisted (with partial state) since we did the query. # - # TODO(faster_joins): we probably need to be more intelligent, and - # exclude partial-state prev_events from consideration - # https://github.com/matrix-org/synapse/issues/13001 + # So, let's just ignore `event` for now; when we re-run the db query + # we should instead get its partial-state prev event, which we will + # de-partial-state, and then come back to event. logger.warning( - "%s still has partial state: can't de-partial-state it yet", + "%s still has prev_events with partial state: can't de-partial-state it yet", event.event_id, ) return diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py index 5914a35420e9..29c99c635735 100644 --- a/synapse/storage/databases/main/events_worker.py +++ b/synapse/storage/databases/main/events_worker.py @@ -2110,11 +2110,29 @@ async def get_partial_state_events_batch(self, room_id: str) -> List[str]: def _get_partial_state_events_batch_txn( txn: LoggingTransaction, room_id: str ) -> List[str]: + # we want to work through the events from oldest to newest, so + # we only want events whose prev_events do *not* have partial state - hence + # the 'NOT EXISTS' clause in the below. + # + # This is necessary because ordering by stream ordering isn't quite enough + # to ensure that we work from oldest to newest event (in particular, + # if an event is initially persisted as an outlier and later de-outliered, + # it can end up with a lower stream_ordering than its prev_events). + # + # Typically this means we'll only return one event per batch, but that's + # hard to do much about. + # + # See also: https://github.com/matrix-org/synapse/issues/13001 txn.execute( """ SELECT event_id FROM partial_state_events AS pse JOIN events USING (event_id) - WHERE pse.room_id = ? + WHERE pse.room_id = ? AND + NOT EXISTS( + SELECT 1 FROM event_edges AS ee + JOIN partial_state_events AS prev_pse ON (prev_pse.event_id=ee.prev_event_id) + WHERE ee.event_id=pse.event_id + ) ORDER BY events.stream_ordering LIMIT 100 """,