Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Track ongoing event fetches correctly (again) #11376

Merged
merged 15 commits into from
Nov 26, 2021
1 change: 1 addition & 0 deletions changelog.d/11376.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a long-standing bug where all requests that read events from the database could get stuck as a result of losing the database connection, for real this time. Also fix a race condition introduced in the previous insufficient fix in 1.47.0.
92 changes: 63 additions & 29 deletions synapse/storage/databases/main/events_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,7 @@ async def _get_events_from_cache_or_db(
# already due to `_get_events_from_db`).
fetching_deferred: ObservableDeferred[
Dict[str, _EventCacheEntry]
] = ObservableDeferred(defer.Deferred())
] = ObservableDeferred(defer.Deferred(), consumeErrors=True)
richvdh marked this conversation as resolved.
Show resolved Hide resolved
for event_id in missing_events_ids:
self._current_event_fetches[event_id] = fetching_deferred

Expand Down Expand Up @@ -736,35 +736,71 @@ async def get_stripped_room_state_from_event_context(
for e in state_to_include.values()
]

def _do_fetch(self, conn: Connection) -> None:
async def _do_fetch(self) -> None:
"""Services requests for events from the `_event_fetch_list` queue."""
try:
await self.db_pool.runWithConnection(self._do_fetch_txn)
except BaseException as e:
failed_event_list = []
with self._event_fetch_lock:
if self._event_fetch_ongoing == 1 and self._event_fetch_list:
# We are the last remaining fetcher and we have just failed.
# Fail any outstanding fetches, since they won't get processed
# otherwise.
failed_event_list = self._event_fetch_list
self._event_fetch_list = []

self._event_fetch_ongoing -= 1
event_fetch_ongoing_gauge.set(self._event_fetch_ongoing)

for _, deferred in failed_event_list:
if not deferred.called:
with PreserveLoggingContext():
deferred.errback(e)
squahtx marked this conversation as resolved.
Show resolved Hide resolved
else:
should_restart = False
with self._event_fetch_lock:
if self._event_fetch_ongoing == 1 and self._event_fetch_list:
# An event fetch has been queued, but we're the last remaining
# fetcher and have already decided to terminate.
# Start a new fetcher.
should_restart = True

# `_event_fetch_ongoing` ought to be decremented for ourself and
# incremented for the new fetcher. These cancel out, so we leave it
# alone.
else:
self._event_fetch_ongoing -= 1
event_fetch_ongoing_gauge.set(self._event_fetch_ongoing)
richvdh marked this conversation as resolved.
Show resolved Hide resolved

if should_restart:
run_as_background_process("fetch_events", self._do_fetch)

def _do_fetch_txn(self, conn: Connection) -> None:
"""Takes a database connection and waits for requests for events from
the _event_fetch_list queue.
"""
try:
i = 0
while True:
with self._event_fetch_lock:
event_list = self._event_fetch_list
self._event_fetch_list = []
i = 0
while True:
with self._event_fetch_lock:
event_list = self._event_fetch_list
self._event_fetch_list = []

if not event_list:
richvdh marked this conversation as resolved.
Show resolved Hide resolved
single_threaded = self.database_engine.single_threaded
if (
not self.USE_DEDICATED_DB_THREADS_FOR_EVENT_FETCHING
richvdh marked this conversation as resolved.
Show resolved Hide resolved
or single_threaded
or i > EVENT_QUEUE_ITERATIONS
):
return
else:
self._event_fetch_lock.wait(EVENT_QUEUE_TIMEOUT_S)
i += 1
continue
richvdh marked this conversation as resolved.
Show resolved Hide resolved
i = 0

if not event_list:
single_threaded = self.database_engine.single_threaded
if (
not self.USE_DEDICATED_DB_THREADS_FOR_EVENT_FETCHING
or single_threaded
or i > EVENT_QUEUE_ITERATIONS
):
break
else:
self._event_fetch_lock.wait(EVENT_QUEUE_TIMEOUT_S)
i += 1
continue
i = 0

self._fetch_event_list(conn, event_list)
finally:
self._event_fetch_ongoing -= 1
event_fetch_ongoing_gauge.set(self._event_fetch_ongoing)
self._fetch_event_list(conn, event_list)

def _fetch_event_list(
self, conn: Connection, event_list: List[Tuple[List[str], defer.Deferred]]
Expand Down Expand Up @@ -994,9 +1030,7 @@ async def _enqueue_events(self, events: Iterable[str]) -> Dict[str, _EventRow]:
should_start = False

if should_start:
run_as_background_process(
"fetch_events", self.db_pool.runWithConnection, self._do_fetch
)
run_as_background_process("fetch_events", self._do_fetch)

logger.debug("Loading %d events: %s", len(events), events)
with PreserveLoggingContext():
Expand Down
120 changes: 119 additions & 1 deletion tests/storage/databases/main/test_events_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from contextlib import contextmanager
from typing import Generator

from twisted.enterprise.adbapi import ConnectionPool
from twisted.internet.defer import ensureDeferred
from twisted.test.proto_helpers import MemoryReactor

from synapse.api.room_versions import EventFormatVersions, RoomVersions
from synapse.logging.context import LoggingContext
from synapse.rest import admin
from synapse.rest.client import login, room
from synapse.storage.databases.main.events_worker import EventsWorkerStore
from synapse.server import HomeServer
from synapse.storage.databases.main.events_worker import (
EVENT_QUEUE_THREADS,
EventsWorkerStore,
)
from synapse.storage.types import Connection
from synapse.util import Clock
from synapse.util.async_helpers import yieldable_gather_results

from tests import unittest
Expand Down Expand Up @@ -144,3 +157,108 @@ def test_dedupe(self):

# We should have fetched the event from the DB
self.assertEqual(ctx.get_resource_usage().evt_db_fetch_count, 1)


class DatabaseOutageTestCase(unittest.HomeserverTestCase):
"""Test event fetching during a database outage."""

def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer):
self.store: EventsWorkerStore = hs.get_datastore()

self.room_id = f"!room:{hs.hostname}"
self.event_ids = [f"event{i}" for i in range(20)]

self._populate_events()

def _populate_events(self) -> None:
"""Ensure that there are test events in the database."""
self.get_success(
self.store.db_pool.simple_upsert(
"rooms",
{"room_id": self.room_id},
{"room_version": RoomVersions.V4.identifier},
)
)

self.event_ids = [f"event{i}" for i in range(20)]
for idx, event_id in enumerate(self.event_ids):
self.get_success(
self.store.db_pool.simple_upsert(
"events",
{"event_id": event_id},
{
"event_id": event_id,
"room_id": self.room_id,
"topological_ordering": idx,
"stream_ordering": idx,
"type": "test",
"processed": True,
"outlier": False,
},
)
)
self.get_success(
self.store.db_pool.simple_upsert(
"event_json",
{"event_id": event_id},
{
"room_id": self.room_id,
"json": json.dumps({"type": "test", "room_id": self.room_id}),
"internal_metadata": "{}",
"format_version": EventFormatVersions.V3,
},
)
)
richvdh marked this conversation as resolved.
Show resolved Hide resolved

@contextmanager
def _outage(self) -> Generator[None, None, None]:
"""Simulate a database outage."""
richvdh marked this conversation as resolved.
Show resolved Hide resolved
connection_pool = self.store.db_pool._db_pool
connection_pool.close()
connection_pool.start()
richvdh marked this conversation as resolved.
Show resolved Hide resolved
original_connection_factory = connection_pool.connectionFactory

def connection_factory(_pool: ConnectionPool) -> Connection:
raise Exception("Could not connect to the database.")

connection_pool.connectionFactory = connection_factory # type: ignore[assignment]
try:
yield
finally:
connection_pool.connectionFactory = original_connection_factory

# If the in-memory SQLite database is being used, all the events are gone.
# Restore the test data.
self._populate_events()

def test_failure(self) -> None:
"""Test that event fetches do not get stuck during a database outage."""
with self._outage():
failure = self.get_failure(
self.store.get_event(self.event_ids[0]), Exception
)
self.assertEqual(str(failure.value), "Could not connect to the database.")

def test_recovery(self) -> None:
"""Test that event fetchers recover after a database outage."""
with self._outage():
# Kick off a bunch of event fetches but do not pump the reactor
event_deferreds = []
for event_id in self.event_ids:
event_deferreds.append(ensureDeferred(self.store.get_event(event_id)))

# We should have maxed out on event fetcher threads
self.assertEqual(self.store._event_fetch_ongoing, EVENT_QUEUE_THREADS)

# All the event fetchers will fail
self.pump()
self.assertEqual(self.store._event_fetch_ongoing, 0)

for event_deferred in event_deferreds:
failure = self.get_failure(event_deferred, Exception)
self.assertEqual(
str(failure.value), "Could not connect to the database."
)

# This next event fetch should succeed
self.get_success(self.store.get_event(self.event_ids[0]))