
Commit 743327e

Reset replica engine to global checkpoint on promotion (#33473)
When a replica starts following a newly promoted primary, it may hold operations that do not exist on the new primary. We need to discard those operations to align the replica with the new primary. This is done by first resetting the engine to the safe commit, then replaying the local translog up to the global checkpoint. Relates #32867
1 parent 27e07ec commit 743327e
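
For orientation, the heart of the change is the new-primary handling in IndexShard#acquireReplicaOperationPermit shown in the diff below. A condensed sketch of that path follows; onNewPrimaryTerm is a hypothetical stand-in for the callback that acquireReplicaOperationPermit passes to bumpPrimaryTerm, while the other names are taken from the diff:

    // Condensed sketch; onNewPrimaryTerm is hypothetical, the calls inside mirror the diff below.
    void onNewPrimaryTerm(long opPrimaryTerm, long newGlobalCheckpoint) throws IOException {
        updateGlobalCheckpointOnReplica(newGlobalCheckpoint, "primary term transition");
        final long currentGlobalCheckpoint = getGlobalCheckpoint();
        final long maxSeqNo = seqNoStats().getMaxSeqNo();
        if (currentGlobalCheckpoint < maxSeqNo) {
            // this shard holds operations above the global checkpoint that the new primary
            // may not have: rebuild the engine from the safe commit and replay the local
            // translog only up to the global checkpoint, discarding everything above it
            resetEngineToGlobalCheckpoint();
        } else {
            // nothing to discard; just start a fresh translog generation for the new term
            getEngine().rollTranslogGeneration();
        }
    }

In the commit, resetEngineToGlobalCheckpoint() closes the current engine, trims unsafe commits back to the safe commit, opens a new engine, and replays the local translog with the new LOCAL_RESET origin, bounded by the global checkpoint.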

File tree: 16 files changed, +274 -121 lines changed


server/src/main/java/org/elasticsearch/index/engine/Engine.java

Lines changed: 6 additions & 7 deletions
@@ -678,12 +678,6 @@ public final CommitStats commitStats() {
      */
     public abstract void waitForOpsToComplete(long seqNo) throws InterruptedException;

-    /**
-     * Reset the local checkpoint in the tracker to the given local checkpoint
-     * @param localCheckpoint the new checkpoint to be set
-     */
-    public abstract void resetLocalCheckpoint(long localCheckpoint);
-
     /**
      * @return a {@link SeqNoStats} object, using local state and the supplied global checkpoint
      */
@@ -1165,11 +1159,16 @@ public enum Origin {
         PRIMARY,
         REPLICA,
         PEER_RECOVERY,
-        LOCAL_TRANSLOG_RECOVERY;
+        LOCAL_TRANSLOG_RECOVERY,
+        LOCAL_RESET;

         public boolean isRecovery() {
             return this == PEER_RECOVERY || this == LOCAL_TRANSLOG_RECOVERY;
         }
+
+        boolean isFromTranslog() {
+            return this == LOCAL_TRANSLOG_RECOVERY || this == LOCAL_RESET;
+        }
     }

     public Origin origin() {

server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java

Lines changed: 4 additions & 8 deletions
@@ -729,6 +729,7 @@ private boolean canOptimizeAddDocument(Index index) {
                     : "version: " + index.version() + " type: " + index.versionType();
                 return true;
             case LOCAL_TRANSLOG_RECOVERY:
+            case LOCAL_RESET:
                 assert index.isRetry();
                 return true; // allow to optimize in order to update the max safe time stamp
             default:
@@ -827,7 +828,7 @@ public IndexResult index(Index index) throws IOException {
                 indexResult = new IndexResult(
                     plan.versionForIndexing, getPrimaryTerm(), plan.seqNoForIndexing, plan.currentNotFoundOrDeleted);
             }
-            if (index.origin() != Operation.Origin.LOCAL_TRANSLOG_RECOVERY) {
+            if (index.origin().isFromTranslog() == false) {
                 final Translog.Location location;
                 if (indexResult.getResultType() == Result.Type.SUCCESS) {
                     location = translog.add(new Translog.Index(index, indexResult));
@@ -1167,7 +1168,7 @@ public DeleteResult delete(Delete delete) throws IOException {
                 deleteResult = new DeleteResult(
                     plan.versionOfDeletion, getPrimaryTerm(), plan.seqNoOfDeletion, plan.currentlyDeleted == false);
             }
-            if (delete.origin() != Operation.Origin.LOCAL_TRANSLOG_RECOVERY) {
+            if (delete.origin().isFromTranslog() == false) {
                 final Translog.Location location;
                 if (deleteResult.getResultType() == Result.Type.SUCCESS) {
                     location = translog.add(new Translog.Delete(delete, deleteResult));
@@ -1405,7 +1406,7 @@ private NoOpResult innerNoOp(final NoOp noOp) throws IOException {
            }
        }
        final NoOpResult noOpResult = failure != null ? new NoOpResult(getPrimaryTerm(), noOp.seqNo(), failure) : new NoOpResult(getPrimaryTerm(), noOp.seqNo());
-       if (noOp.origin() != Operation.Origin.LOCAL_TRANSLOG_RECOVERY) {
+       if (noOp.origin().isFromTranslog() == false) {
            final Translog.Location location = translog.add(new Translog.NoOp(noOp.seqNo(), noOp.primaryTerm(), noOp.reason()));
            noOpResult.setTranslogLocation(location);
        }
@@ -2324,11 +2325,6 @@ public void waitForOpsToComplete(long seqNo) throws InterruptedException {
        localCheckpointTracker.waitForOpsToComplete(seqNo);
    }

-    @Override
-    public void resetLocalCheckpoint(long localCheckpoint) {
-        localCheckpointTracker.resetCheckpoint(localCheckpoint);
-    }
-
    @Override
    public SeqNoStats getSeqNoStats(long globalCheckpoint) {
        return localCheckpointTracker.getStats(globalCheckpoint);

server/src/main/java/org/elasticsearch/index/engine/ReadOnlyEngine.java

Lines changed: 0 additions & 4 deletions
@@ -257,10 +257,6 @@ public long getLocalCheckpoint() {
     public void waitForOpsToComplete(long seqNo) {
     }

-    @Override
-    public void resetLocalCheckpoint(long newCheckpoint) {
-    }
-
     @Override
     public SeqNoStats getSeqNoStats(long globalCheckpoint) {
         return new SeqNoStats(seqNoStats.getMaxSeqNo(), seqNoStats.getLocalCheckpoint(), globalCheckpoint);

server/src/main/java/org/elasticsearch/index/seqno/LocalCheckpointTracker.java

Lines changed: 1 addition & 0 deletions
@@ -109,6 +109,7 @@ public synchronized void markSeqNoAsCompleted(final long seqNo) {
      * @param checkpoint the local checkpoint to reset this tracker to
      */
     public synchronized void resetCheckpoint(final long checkpoint) {
+        // TODO: remove this method as after we restore the local history on promotion.
         assert checkpoint != SequenceNumbers.UNASSIGNED_SEQ_NO;
         assert checkpoint <= this.checkpoint;
         processedSeqNo.clear();

server/src/main/java/org/elasticsearch/index/shard/IndexShard.java

Lines changed: 56 additions & 28 deletions
@@ -163,7 +163,6 @@
 import java.util.stream.StreamSupport;

 import static org.elasticsearch.index.mapper.SourceToParse.source;
-import static org.elasticsearch.index.seqno.SequenceNumbers.NO_OPS_PERFORMED;
 import static org.elasticsearch.index.seqno.SequenceNumbers.UNASSIGNED_SEQ_NO;

 public class IndexShard extends AbstractIndexShardComponent implements IndicesClusterStateService.Shard {
@@ -1273,16 +1272,18 @@ public Engine.Result applyTranslogOperation(Translog.Operation operation, Engine
         return result;
     }

-    // package-private for testing
-    int runTranslogRecovery(Engine engine, Translog.Snapshot snapshot) throws IOException {
-        recoveryState.getTranslog().totalOperations(snapshot.totalOperations());
-        recoveryState.getTranslog().totalOperationsOnStart(snapshot.totalOperations());
+    /**
+     * Replays translog operations from the provided translog {@code snapshot} to the current engine using the given {@code origin}.
+     * The callback {@code onOperationRecovered} is notified after each translog operation is replayed successfully.
+     */
+    int runTranslogRecovery(Engine engine, Translog.Snapshot snapshot, Engine.Operation.Origin origin,
+                            Runnable onOperationRecovered) throws IOException {
         int opsRecovered = 0;
         Translog.Operation operation;
         while ((operation = snapshot.next()) != null) {
             try {
                 logger.trace("[translog] recover op {}", operation);
-                Engine.Result result = applyTranslogOperation(operation, Engine.Operation.Origin.LOCAL_TRANSLOG_RECOVERY);
+                Engine.Result result = applyTranslogOperation(operation, origin);
                 switch (result.getResultType()) {
                     case FAILURE:
                         throw result.getFailure();
@@ -1295,7 +1296,7 @@ int runTranslogRecovery(Engine engine, Translog.Snapshot snapshot) throws IOExce
                 }

                 opsRecovered++;
-                recoveryState.getTranslog().incrementRecoveredOperations();
+                onOperationRecovered.run();
             } catch (Exception e) {
                 if (ExceptionsHelper.status(e) == RestStatus.BAD_REQUEST) {
                     // mainly for MapperParsingException and Failure to detect xcontent
@@ -1313,8 +1314,15 @@ int runTranslogRecovery(Engine engine, Translog.Snapshot snapshot) throws IOExce
      * Operations from the translog will be replayed to bring lucene up to date.
      **/
     public void openEngineAndRecoverFromTranslog() throws IOException {
+        final RecoveryState.Translog translogRecoveryStats = recoveryState.getTranslog();
+        final Engine.TranslogRecoveryRunner translogRecoveryRunner = (engine, snapshot) -> {
+            translogRecoveryStats.totalOperations(snapshot.totalOperations());
+            translogRecoveryStats.totalOperationsOnStart(snapshot.totalOperations());
+            return runTranslogRecovery(engine, snapshot, Engine.Operation.Origin.LOCAL_TRANSLOG_RECOVERY,
+                translogRecoveryStats::incrementRecoveredOperations);
+        };
         innerOpenEngineAndTranslog();
-        getEngine().recoverFromTranslog(this::runTranslogRecovery, Long.MAX_VALUE);
+        getEngine().recoverFromTranslog(translogRecoveryRunner, Long.MAX_VALUE);
     }

     /**
@@ -1352,11 +1360,7 @@ private void innerOpenEngineAndTranslog() throws IOException {
         final String translogUUID = store.readLastCommittedSegmentsInfo().getUserData().get(Translog.TRANSLOG_UUID_KEY);
         final long globalCheckpoint = Translog.readGlobalCheckpoint(translogConfig.getTranslogPath(), translogUUID);
         replicationTracker.updateGlobalCheckpointOnReplica(globalCheckpoint, "read from translog checkpoint");
-
-        assertMaxUnsafeAutoIdInCommit();
-
-        final long minRetainedTranslogGen = Translog.readMinTranslogGeneration(translogConfig.getTranslogPath(), translogUUID);
-        store.trimUnsafeCommits(globalCheckpoint, minRetainedTranslogGen, config.getIndexSettings().getIndexVersionCreated());
+        trimUnsafeCommits();

         createNewEngine(config);
         verifyNotClosed();
@@ -1367,6 +1371,15 @@ private void innerOpenEngineAndTranslog() throws IOException {
         assert recoveryState.getStage() == RecoveryState.Stage.TRANSLOG : "TRANSLOG stage expected but was: " + recoveryState.getStage();
     }

+    private void trimUnsafeCommits() throws IOException {
+        assert currentEngineReference.get() == null : "engine is running";
+        final String translogUUID = store.readLastCommittedSegmentsInfo().getUserData().get(Translog.TRANSLOG_UUID_KEY);
+        final long globalCheckpoint = Translog.readGlobalCheckpoint(translogConfig.getTranslogPath(), translogUUID);
+        final long minRetainedTranslogGen = Translog.readMinTranslogGeneration(translogConfig.getTranslogPath(), translogUUID);
+        assertMaxUnsafeAutoIdInCommit();
+        store.trimUnsafeCommits(globalCheckpoint, minRetainedTranslogGen, indexSettings.getIndexVersionCreated());
+    }
+
     private boolean assertSequenceNumbersInCommit() throws IOException {
         final Map<String, String> userData = SegmentInfos.readLatestCommit(store.directory()).getUserData();
         assert userData.containsKey(SequenceNumbers.LOCAL_CHECKPOINT_KEY) : "commit point doesn't contains a local checkpoint";
@@ -1463,7 +1476,7 @@ private void ensureWriteAllowed(Engine.Operation.Origin origin) throws IllegalIn
         if (origin == Engine.Operation.Origin.PRIMARY) {
             assert assertPrimaryMode();
         } else {
-            assert origin == Engine.Operation.Origin.REPLICA;
+            assert origin == Engine.Operation.Origin.REPLICA || origin == Engine.Operation.Origin.LOCAL_RESET;
             assert assertReplicationTarget();
         }
         if (writeAllowedStates.contains(state) == false) {
@@ -2166,9 +2179,7 @@ public void onFailedEngine(String reason, @Nullable Exception failure) {

     private Engine createNewEngine(EngineConfig config) {
         synchronized (mutex) {
-            if (state == IndexShardState.CLOSED) {
-                throw new AlreadyClosedException(shardId + " can't create engine - shard is closed");
-            }
+            verifyNotClosed();
             assert this.currentEngineReference.get() == null;
             Engine engine = newEngine(config);
             onNewEngine(engine); // call this before we pass the memory barrier otherwise actions that happen
@@ -2314,19 +2325,14 @@ public void acquireReplicaOperationPermit(final long opPrimaryTerm, final long g
                 bumpPrimaryTerm(opPrimaryTerm, () -> {
                     updateGlobalCheckpointOnReplica(globalCheckpoint, "primary term transition");
                     final long currentGlobalCheckpoint = getGlobalCheckpoint();
-                    final long localCheckpoint;
-                    if (currentGlobalCheckpoint == UNASSIGNED_SEQ_NO) {
-                        localCheckpoint = NO_OPS_PERFORMED;
+                    final long maxSeqNo = seqNoStats().getMaxSeqNo();
+                    logger.info("detected new primary with primary term [{}], global checkpoint [{}], max_seq_no [{}]",
+                        opPrimaryTerm, currentGlobalCheckpoint, maxSeqNo);
+                    if (currentGlobalCheckpoint < maxSeqNo) {
+                        resetEngineToGlobalCheckpoint();
                     } else {
-                        localCheckpoint = currentGlobalCheckpoint;
+                        getEngine().rollTranslogGeneration();
                     }
-                    logger.trace(
-                        "detected new primary with primary term [{}], resetting local checkpoint from [{}] to [{}]",
-                        opPrimaryTerm,
-                        getLocalCheckpoint(),
-                        localCheckpoint);
-                    getEngine().resetLocalCheckpoint(localCheckpoint);
-                    getEngine().rollTranslogGeneration();
                 });
             }
         }
@@ -2687,4 +2693,26 @@ public ParsedDocument newNoopTombstoneDoc(String reason) {
             }
         };
     }
+
+    /**
+     * Rollback the current engine to the safe commit, then replay local translog up to the global checkpoint.
+     */
+    void resetEngineToGlobalCheckpoint() throws IOException {
+        assert getActiveOperationsCount() == 0 : "Ongoing writes [" + getActiveOperations() + "]";
+        sync(); // persist the global checkpoint to disk
+        final long globalCheckpoint = getGlobalCheckpoint();
+        final Engine newEngine;
+        synchronized (mutex) {
+            verifyNotClosed();
+            IOUtils.close(currentEngineReference.getAndSet(null));
+            trimUnsafeCommits();
+            newEngine = createNewEngine(newEngineConfig());
+            active.set(true);
+        }
+        final Engine.TranslogRecoveryRunner translogRunner = (engine, snapshot) -> runTranslogRecovery(
+            engine, snapshot, Engine.Operation.Origin.LOCAL_RESET, () -> {
+                // TODO: add a dedicate recovery stats for the reset translog
+            });
+        newEngine.recoverFromTranslog(translogRunner, globalCheckpoint);
+    }
 }

server/src/test/java/org/elasticsearch/discovery/AbstractDisruptionTestCase.java

Lines changed: 1 addition & 0 deletions
@@ -111,6 +111,7 @@ protected void beforeIndexDeletion() throws Exception {
         super.beforeIndexDeletion();
         internalCluster().assertConsistentHistoryBetweenTranslogAndLuceneIndex();
         assertSeqNos();
+        assertSameDocIdsOnShards();
     }
 }

server/src/test/java/org/elasticsearch/index/engine/InternalEngineTests.java

Lines changed: 1 addition & 1 deletion
@@ -4087,7 +4087,7 @@ public void markSeqNoAsCompleted(long seqNo) {
             final long currentLocalCheckpoint = actualEngine.getLocalCheckpoint();
             final long resetLocalCheckpoint =
                 randomIntBetween(Math.toIntExact(SequenceNumbers.NO_OPS_PERFORMED), Math.toIntExact(currentLocalCheckpoint));
-            actualEngine.resetLocalCheckpoint(resetLocalCheckpoint);
+            actualEngine.getLocalCheckpointTracker().resetCheckpoint(resetLocalCheckpoint);
             completedSeqNos.clear();
             actualEngine.restoreLocalCheckpointFromTranslog();
             final Set<Long> intersection = new HashSet<>(expectedCompletedSeqNos);

server/src/test/java/org/elasticsearch/index/engine/ReadOnlyEngineTests.java

Lines changed: 2 additions & 2 deletions
@@ -27,7 +27,7 @@
 import org.elasticsearch.index.store.Store;

 import java.io.IOException;
-import java.util.Set;
+import java.util.List;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.function.Function;

@@ -43,7 +43,7 @@ public void testReadOnlyEngine() throws Exception {
         EngineConfig config = config(defaultSettings, store, createTempDir(), newMergePolicy(), null, null, globalCheckpoint::get);
         int numDocs = scaledRandomIntBetween(10, 1000);
         final SeqNoStats lastSeqNoStats;
-        final Set<String> lastDocIds;
+        final List<DocIdSeqNoAndTerm> lastDocIds;
         try (InternalEngine engine = createEngine(config)) {
             Engine.Get get = null;
             for (int i = 0; i < numDocs; i++) {

server/src/test/java/org/elasticsearch/index/replication/IndexLevelReplicationTests.java

Lines changed: 2 additions & 6 deletions
@@ -519,18 +519,14 @@ public void testSeqNoCollision() throws Exception {
             shards.promoteReplicaToPrimary(replica2).get();
             logger.info("--> Recover replica3 from replica2");
             recoverReplica(replica3, replica2, true);
-            try (Translog.Snapshot snapshot = getTranslog(replica3).newSnapshot()) {
+            try (Translog.Snapshot snapshot = replica3.getHistoryOperations("test", 0)) {
                 assertThat(snapshot.totalOperations(), equalTo(initDocs + 1));
                 final List<Translog.Operation> expectedOps = new ArrayList<>(initOperations);
                 expectedOps.add(op2);
                 assertThat(snapshot, containsOperationsInAnyOrder(expectedOps));
                 assertThat("Peer-recovery should not send overridden operations", snapshot.skippedOperations(), equalTo(0));
             }
-            // TODO: We should assert the content of shards in the ReplicationGroup.
-            // Without rollback replicas(current implementation), we don't have the same content across shards:
-            //  - replica1 has {doc1}
-            //  - replica2 has {doc1, doc2}
-            //  - replica3 can have either {doc2} only if operation-based recovery or {doc1, doc2} if file-based recovery
+            shards.assertAllEqual(initDocs + 1);
         }
     }

0 commit comments
