Tighten sequence numbers recovery #22212
Changes from 2 commits
```diff
@@ -175,8 +175,18 @@ public InternalEngine(EngineConfig engineConfig) throws EngineException {
                 throw new IllegalArgumentException(openMode.toString());
             }
             logger.trace("recovered [{}]", seqNoStats);
-            indexWriter = writer;
             seqNoService = sequenceNumberService(shardId, engineConfig.getIndexSettings(), seqNoStats);
+            // norelease
+            /*
+             * We have no guarantees that all operations above the local checkpoint are in the Lucene commit or the translog. This means
+             * that there might be operations greater than the local checkpoint that will not be replayed. Here we force the local
+             * checkpoint to the maximum sequence number in the commit (at the potential expense of correctness).
+             */
+            while (seqNoService.getLocalCheckpoint() < seqNoService.getMaxSeqNo()) {
+                final long next = seqNoService.getLocalCheckpoint() + 1;
+                seqNoService.markSeqNoAsCompleted(next);
+            }
+            indexWriter = writer;
             translog = openTranslog(engineConfig, writer, seqNoService::getGlobalCheckpoint);
             assert translog.getGeneration() != null;
         } catch (IOException | TranslogCorruptedException e) {
```
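As an aside, the loop in the norelease block is easier to reason about in isolation. Below is a minimal sketch of the same idea, forcing a local checkpoint up to the maximum sequence number in a commit; `SimpleCheckpointTracker` and its fields are hypothetical stand-ins, not the actual `SequenceNumbersService` API.

```java
// Hypothetical illustration of forcing a local checkpoint up to the max sequence
// number seen in a commit; not the actual SequenceNumbersService implementation.
final class SimpleCheckpointTracker {
    private long localCheckpoint;
    private final long maxSeqNo;

    SimpleCheckpointTracker(long localCheckpoint, long maxSeqNo) {
        this.localCheckpoint = localCheckpoint;
        this.maxSeqNo = maxSeqNo;
    }

    long getLocalCheckpoint() { return localCheckpoint; }
    long getMaxSeqNo() { return maxSeqNo; }

    // In this toy model every seq no at or below the checkpoint is already completed,
    // so completing checkpoint + 1 simply advances the checkpoint by one.
    void markSeqNoAsCompleted(long seqNo) {
        if (seqNo == localCheckpoint + 1) {
            localCheckpoint = seqNo;
        }
    }

    public static void main(String[] args) {
        final SimpleCheckpointTracker tracker = new SimpleCheckpointTracker(3, 7);
        // Same loop shape as the constructor change above: fill the gap between
        // the local checkpoint and the max sequence number.
        while (tracker.getLocalCheckpoint() < tracker.getMaxSeqNo()) {
            final long next = tracker.getLocalCheckpoint() + 1;
            tracker.markSeqNoAsCompleted(next);
        }
        System.out.println("local checkpoint = " + tracker.getLocalCheckpoint()); // prints 7
    }
}
```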
```diff
@@ -638,16 +648,23 @@ private IndexResult innerIndex(Index index) throws IOException {
                 }
             }
             final long expectedVersion = index.version();
-            if (checkVersionConflict(index, currentVersion, expectedVersion, deleted)) {
-                // skip index operation because of version conflict on recovery
-                indexResult = new IndexResult(expectedVersion, SequenceNumbersService.UNASSIGNED_SEQ_NO, false);
-            } else {
-                final long seqNo;
-                if (index.origin() == Operation.Origin.PRIMARY) {
-                    seqNo = seqNoService.generateSeqNo();
-                } else {
-                    seqNo = index.seqNo();
-                }
+            final boolean conflict = checkVersionConflict(index, currentVersion, expectedVersion, deleted);
+
+            final long seqNo;
+            if (index.origin() == Operation.Origin.PRIMARY) {
+                if (!conflict) {
+                    seqNo = seqNoService.generateSeqNo();
+                } else {
+                    seqNo = SequenceNumbersService.UNASSIGNED_SEQ_NO;
+                }
+            } else {
+                seqNo = index.seqNo();
+            }
+
+            if (conflict) {
+                // skip index operation because of version conflict on recovery
+                indexResult = new IndexResult(expectedVersion, seqNo, false);
+            } else {
                 updatedVersion = index.versionType().updateVersion(currentVersion, expectedVersion);
                 index.parsedDoc().version().setLongValue(updatedVersion);
```
```diff
@@ -764,16 +781,24 @@ private DeleteResult innerDelete(Delete delete) throws IOException {
             }

             final long expectedVersion = delete.version();
-            if (checkVersionConflict(delete, currentVersion, expectedVersion, deleted)) {
-                // skip executing delete because of version conflict on recovery
-                deleteResult = new DeleteResult(expectedVersion, SequenceNumbersService.UNASSIGNED_SEQ_NO, true);
-            } else {
-                final long seqNo;
-                if (delete.origin() == Operation.Origin.PRIMARY) {
-                    seqNo = seqNoService.generateSeqNo();
-                } else {
-                    seqNo = delete.seqNo();
-                }
+            final boolean conflict = checkVersionConflict(delete, currentVersion, expectedVersion, deleted);
+
+            final long seqNo;
+            if (delete.origin() == Operation.Origin.PRIMARY) {
+                if (!conflict) {
+                    seqNo = seqNoService.generateSeqNo();
+                } else {
+                    seqNo = SequenceNumbersService.UNASSIGNED_SEQ_NO;
+                }
+            } else {
+                seqNo = delete.seqNo();
+            }
+
+            if (conflict) {
+                // skip executing delete because of version conflict on recovery
+                deleteResult = new DeleteResult(expectedVersion, seqNo, true);
+            } else {
                 updatedVersion = delete.versionType().updateVersion(currentVersion, expectedVersion);
                 found = deleteIfFound(delete.uid(), currentVersion, deleted, versionValue);
                 deleteResult = new DeleteResult(updatedVersion, seqNo, found);
```
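Taken together, the two hunks above change how a sequence number is chosen for an index or delete operation. Below is a compact, hypothetical restatement of that rule; the class and method names are illustrative rather than the engine's actual structure, and only the UNASSIGNED_SEQ_NO sentinel value mirrors `SequenceNumbersService.UNASSIGNED_SEQ_NO`.

```java
// Hypothetical restatement of the seq-no assignment rule introduced above:
// on the primary a conflicting operation gets no sequence number, while a
// non-primary origin always keeps the sequence number assigned by the primary.
final class SeqNoAssignment {
    static final long UNASSIGNED_SEQ_NO = -2L; // mirrors SequenceNumbersService.UNASSIGNED_SEQ_NO

    enum Origin { PRIMARY, REPLICA, PEER_RECOVERY, LOCAL_TRANSLOG_RECOVERY }

    interface SeqNoGenerator { long generateSeqNo(); }

    static long assignSeqNo(Origin origin, boolean conflict, long existingSeqNo, SeqNoGenerator generator) {
        if (origin == Origin.PRIMARY) {
            return conflict ? UNASSIGNED_SEQ_NO : generator.generateSeqNo();
        } else {
            // non-primary origins replay the sequence number carried by the operation
            return existingSeqNo;
        }
    }

    public static void main(String[] args) {
        final SeqNoGenerator generator = () -> 100L;
        System.out.println(assignSeqNo(Origin.PRIMARY, false, UNASSIGNED_SEQ_NO, generator)); // 100
        System.out.println(assignSeqNo(Origin.PRIMARY, true, UNASSIGNED_SEQ_NO, generator));  // -2
        System.out.println(assignSeqNo(Origin.REPLICA, true, 57L, generator));                // 57
    }
}
```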
```diff
@@ -149,12 +149,14 @@ public synchronized long getCheckpoint() {
      * updates the global checkpoint on a replica shard (after it has been updated by the primary).
      */
     synchronized void updateCheckpointOnReplica(long globalCheckpoint) {
+        /*
+         * The global checkpoint here is local knowledge which is updated under the mandate of the primary. It can happen that the primary
+         * information is lagging compared to a replica (e.g., if a replica is promoted to primary but has stale info relative to other
+         * replica shards). In these cases, the local knowledge of the global checkpoint could be higher than the sync from the lagging primary.
+         */
         if (this.globalCheckpoint <= globalCheckpoint) {
             this.globalCheckpoint = globalCheckpoint;
             logger.trace("global checkpoint updated from primary to [{}]", globalCheckpoint);
-        } else {
-            throw new IllegalArgumentException("global checkpoint from primary should never decrease. current [" +
-                this.globalCheckpoint + "], got [" + globalCheckpoint + "]");
         }
     }
```

Review comments on this change:

Contributor: can you add a comment about when the current global checkpoint can be higher? here is what I wrote in #10708

Member (Author): I pushed 1c71393.
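A small sketch of the guarded update's behavior in the scenario the new comment describes: a stale, lower global checkpoint from a lagging (e.g., freshly promoted) primary is ignored rather than rejected. The class below is a hypothetical stand-in, not the actual global checkpoint service.

```java
// Hypothetical stand-in for the replica-side global checkpoint bookkeeping.
final class ReplicaCheckpoint {
    private long globalCheckpoint = 42; // local knowledge, possibly ahead of a lagging primary

    // Mirrors the guarded update in the hunk above: only move forward, and
    // silently ignore a value that is behind our local knowledge.
    synchronized void updateCheckpointOnReplica(long fromPrimary) {
        if (this.globalCheckpoint <= fromPrimary) {
            this.globalCheckpoint = fromPrimary;
        }
    }

    public static void main(String[] args) {
        final ReplicaCheckpoint replica = new ReplicaCheckpoint();
        replica.updateCheckpointOnReplica(40); // lagging, newly promoted primary: ignored
        replica.updateCheckpointOnReplica(45); // primary catches up: accepted
        System.out.println("global checkpoint = " + replica.globalCheckpoint); // prints 45
    }
}
```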
```diff
@@ -391,8 +391,8 @@ public void finalizeRecovery() {
         StopWatch stopWatch = new StopWatch().start();
         logger.trace("[{}][{}] finalizing recovery to {}", indexName, shardId, request.targetNode());
         cancellableThreads.execute(() -> {
-            recoveryTarget.finalizeRecovery();
             shard.markAllocationIdAsInSync(recoveryTarget.getTargetAllocationId());
+            recoveryTarget.finalizeRecovery(shard.getGlobalCheckpoint());
         });

         if (request.isPrimaryRelocation()) {
```

Review comments on this change:

Contributor: paranoia - can we flip this around and mark the target allocation as "in sync" before we give it the global checkpoint? it at least reads better as "we know you are in sync and therefore every global checkpoint advances will take you into account"

Member (Author): I pushed 1c71393.
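The reviewer's reasoning can be illustrated with a toy model in which the primary computes its global checkpoint only from allocations marked as in sync: marking the target in sync first means the checkpoint handed to it already accounts for the target's progress. All names below are hypothetical and greatly simplified relative to the real shard-tracking code.

```java
import java.util.HashMap;
import java.util.Map;

// Toy primary-side bookkeeping: the global checkpoint is the minimum local
// checkpoint over the allocations currently marked as in sync.
final class ToyPrimary {
    private final Map<String, Long> inSyncLocalCheckpoints = new HashMap<>();

    void markAllocationIdAsInSync(String allocationId, long localCheckpoint) {
        inSyncLocalCheckpoints.put(allocationId, localCheckpoint);
    }

    long getGlobalCheckpoint() {
        // minimum over in-sync copies; a copy not yet marked in sync is invisible here
        return inSyncLocalCheckpoints.values().stream().mapToLong(Long::longValue).min().orElse(-1L);
    }

    public static void main(String[] args) {
        final ToyPrimary primary = new ToyPrimary();
        primary.markAllocationIdAsInSync("primary-allocation", 10L);

        // The recovery target has only caught up to seq no 7. Mark it in sync first,
        // then hand it the global checkpoint, as in the flipped order above.
        primary.markAllocationIdAsInSync("target-allocation", 7L);
        final long handedOver = primary.getGlobalCheckpoint(); // 7: already accounts for the target
        System.out.println("global checkpoint handed to target: " + handedOver);
    }
}
```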
Additional review comments:

Review comment: maybe assert at the end of this method that the seqNo is set on the replica request?

Author: I pushed 1c71393.
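For illustration, the suggested assertion might look roughly like the following; the class and helper below are hypothetical, and only the UNASSIGNED_SEQ_NO sentinel mirrors `SequenceNumbersService.UNASSIGNED_SEQ_NO`.

```java
// Hypothetical shape of the suggested check: before an operation is sent to
// replicas, its sequence number must have been assigned on the primary.
final class ReplicaRequestAssertions {
    static final long UNASSIGNED_SEQ_NO = -2L; // mirrors SequenceNumbersService.UNASSIGNED_SEQ_NO

    static boolean seqNoIsSet(long seqNo) {
        return seqNo != UNASSIGNED_SEQ_NO;
    }

    public static void main(String[] args) {
        final long seqNoOnReplicaRequest = 12L; // would come from the primary's indexing result
        // Run with -ea to enable assertions; this is the kind of invariant the comment asks for.
        assert seqNoIsSet(seqNoOnReplicaRequest) : "seq no must be set on the replica request";
        System.out.println("seq no is set: " + seqNoOnReplicaRequest);
    }
}
```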