Fix race in replication #7226
Changes from 4 commits

    @@ -0,0 +1 @@
    +Move catchup of replication streams logic to worker.

    @@ -92,16 +92,28 @@ async def on_RDATA(self, cmd: RdataCommand):
                 logger.exception("Failed to parse RDATA: %r %r", stream_name, cmd.row)
                 raise

    -        if cmd.token is None or stream_name not in self._streams_connected:
    -            # I.e. either this is part of a batch of updates for this stream (in
    -            # which case batch until we get an update for the stream with a non
    -            # None token) or we're currently connecting so we queue up rows.
    -            self._pending_batches.setdefault(stream_name, []).append(row)
    -        else:
    -            # Check if this is the last of a batch of updates
    -            rows = self._pending_batches.pop(stream_name, [])
    -            rows.append(row)
    -            await self.on_rdata(stream_name, cmd.token, rows)
    +        # We linearize here for two reasons:
    +        #   1. so we don't try and concurrently handle multiple rows for the
    +        #      same stream, and
    +        #   2. so we don't race with getting a POSITION command and fetching
    +        #      missing RDATA.
    +        with await self._position_linearizer.queue(cmd.stream_name):
    +            if stream_name not in self._streams_connected:
    +                logger.warning(
    +                    "Discarding RDATA for unconnected stream %s", stream_name
    +                )
    +                return
    +
    +            if cmd.token is None:
    +                # I.e. either this is part of a batch of updates for this stream (in
    +                # which case batch until we get an update for the stream with a non
    +                # None token) or we're currently connecting so we queue up rows.
    +                self._pending_batches.setdefault(stream_name, []).append(row)
    +            else:
    +                # Check if this is the last of a batch of updates
    +                rows = self._pending_batches.pop(stream_name, [])
    +                rows.append(row)
    +                await self.on_rdata(stream_name, cmd.token, rows)

         async def on_rdata(self, stream_name: str, token: int, rows: list):
             """Called to handle a batch of replication data with a given stream token.
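The key change in this hunk is that `on_RDATA` now takes the same per-stream linearizer that `on_POSITION` uses, so rows and catch-up for one stream can never be processed concurrently. The `Linearizer` implementation itself is not part of this diff; the snippet below is only a rough asyncio illustration of that serialization guarantee (the class and function names are invented here, and Synapse's real Twisted-based linearizer is the one used as `with await ...queue(name):` above).

```python
import asyncio
from collections import defaultdict


class PerStreamLinearizer:
    """Toy stand-in for a linearizer: one asyncio.Lock per stream name."""

    def __init__(self) -> None:
        self._locks: defaultdict[str, asyncio.Lock] = defaultdict(asyncio.Lock)

    def lock(self, stream_name: str) -> asyncio.Lock:
        # Callers serialize on the lock for their stream via `async with`.
        return self._locks[stream_name]


linearizer = PerStreamLinearizer()


async def handle_rdata(stream_name: str, row: object) -> None:
    async with linearizer.lock(stream_name):
        # While this block runs, no POSITION catch-up for the same stream
        # can start, so a row is never handled mid catch-up.
        ...


async def handle_position(stream_name: str, token: int) -> None:
    async with linearizer.lock(stream_name):
        # Catch up on missing rows; queued RDATA for this stream waits here.
        ...
```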

    @@ -124,12 +136,13 @@ async def on_POSITION(self, cmd: PositionCommand):
             # We protect catching up with a linearizer in case the replication
             # connection reconnects under us.
             with await self._position_linearizer.queue(cmd.stream_name):
    -            # We're about to go and catch up with the stream, so mark as connecting
    -            # to stop RDATA being handled at the same time by removing stream from
    -            # list of connected streams. We also clear any batched up RDATA from
    -            # before we got the POSITION.
    +            # We're about to go and catch up with the stream, so remove from set
    +            # of connected streams.
                 self._streams_connected.discard(cmd.stream_name)
    -            self._pending_batches.clear()
    +
    +            # We clear the pending batches for the stream as the fetching
    +            # updates below will fetch all rows in the batch.
    +            self._pending_batches.pop(cmd.stream_name, [])

                 # Find where we previously streamed up to.
                 current_token = self._replication_data_handler.get_streams_to_replicate().get(

Comment on lines +145 to 147:

- is this actually doing anything useful? once we've caught up for the first time,
- Thinking about this a bit, the only difference is if an exception is raised. I don't think we want to handle RDATA for that stream if we fail to handle the position, but dropping everything doesn't sound like the right thing either
- mmmm this sounds like a thing we need to improve, but ok let's punt it for now.

Comment on the new "We clear the pending batches" comment:

- "the fetching updates" ?
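As a toy illustration of the behavioural difference in this hunk (the stream names and rows below are made up): the old `clear()` dropped queued rows for every stream, while the new `pop(cmd.stream_name, [])` only drops the batch for the stream named in the POSITION command.

```python
pending_batches = {"events": ["row1", "row2"], "typing": ["row3"]}

old_style = dict(pending_batches)
old_style.clear()            # old behaviour: everything is discarded
assert old_style == {}

new_style = dict(pending_batches)
new_style.pop("events", [])  # new behaviour: only the named stream is dropped
assert new_style == {"typing": ["row3"]}
```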

    @@ -158,13 +171,6 @@ async def on_POSITION(self, cmd: PositionCommand):
                 # We've now caught up to position sent to us, notify handler.
                 await self._replication_data_handler.on_position(cmd.stream_name, cmd.token)

    -            # Handle any RDATA that came in while we were catching up.
    -            rows = self._pending_batches.pop(cmd.stream_name, [])
    -            if rows:
    -                await self._replication_data_handler.on_rdata(
    -                    cmd.stream_name, rows[-1].token, rows
    -                )
    -
                 self._streams_connected.add(cmd.stream_name)

         async def on_SYNC(self, cmd: SyncCommand):
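The code that actually fetches the rows between `current_token` and the token in the POSITION command sits between these two hunks and is not shown here. The sketch below only captures that overall shape, with invented names (`catch_up_stream`, `fetch_updates`, `handler`) rather than Synapse's real API: replay the missing rows, then report the new position.

```python
from typing import Awaitable, Callable, List, Tuple

RowBatch = Tuple[int, list]  # (token, rows), oldest first


async def catch_up_stream(
    stream_name: str,
    current_token: int,
    new_token: int,
    fetch_updates: Callable[[int, int], Awaitable[List[RowBatch]]],
    handler,
) -> None:
    # Replay everything the worker missed while the stream was not connected.
    for token, rows in await fetch_updates(current_token, new_token):
        await handler.on_rdata(stream_name, token, rows)

    # Only after the backlog is replayed do we report the advertised position
    # (and, in the real code, add the stream back to the connected set).
    await handler.on_position(stream_name, new_token)
```

With RDATA for a not-yet-connected stream now discarded, and everything else serialized behind the linearizer, nothing is left batched up to replay once catch-up finishes, which is what the block removed above used to do.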
General review comments:

- I'm a bit worried that we could get quite far behind (ie, have a long list of things waiting for the position linearizer) if the catchup is a bit slow and we get a few POSITION lines intermixed with lots of RDATA lines, all of which will end up getting processed in series.
- Mmm, though I'm not sure the solution to that is allowing RDATA to be processed in parallel. Perhaps we just want to add metrics for the queue size?
- yeah maybe it's not worth worrying about for now. especially if we can mitigate it as per #7226 (comment).
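One way to act on the "add metrics for the queue size" suggestion above would be to wrap the linearizer and export how many callers are waiting per stream. This is only a sketch of that idea using `prometheus_client`; the gauge name and the wrapper class are invented here, not something this PR adds.

```python
from prometheus_client import Gauge

linearizer_waiters = Gauge(
    "replication_position_linearizer_waiters",
    "Commands currently queued behind the per-stream position linearizer",
    ["stream_name"],
)


class InstrumentedLinearizer:
    """Counts how many callers are waiting on the wrapped linearizer."""

    def __init__(self, inner):
        self._inner = inner

    async def queue(self, stream_name: str):
        gauge = linearizer_waiters.labels(stream_name)
        gauge.inc()
        try:
            # Pass through whatever the wrapped queue() resolves to (a context
            # manager in the handler code above).
            return await self._inner.queue(stream_name)
        finally:
            # Decrement as soon as our turn in the queue arrives.
            gauge.dec()
```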