Skip to content

Commit 087f182

Browse files
authored
Translog file recovery should not rely on Lucene commits (#25005)
When we open a translog, we rely on the `translog.ckp` file to tell us what the maximum generation file should be, and on the information stored in the last Lucene commit to know the first file we need to recover. This requires coordination and is currently subject to a race condition: if a node dies after a Lucene commit is made but before we remove the translog generations that the commit no longer needs, then the next time we open the translog we will ignore those files and never delete them (I have added tests for this). This PR changes the approach so that the translog stores both of those numbers in `translog.ckp`. This makes the translog more self-contained and easier to control. This change also decouples the translog recovery logic from the specific commit we're opening. This prepares the ground to fully utilize the deletion policy introduced in #24950, store more translog data that's needed for Lucene, keep multiple Lucene commits around, and be free to recover from any of them.
1 parent ce24331 commit 087f182

File tree

10 files changed

+356
-99
lines changed

10 files changed

+356
-99
lines changed

core/src/main/java/org/elasticsearch/index/engine/CombinedDeletionPolicy.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,4 +83,8 @@ private void setLastCommittedTranslogGeneration(List<? extends IndexCommit> comm
8383
public SnapshotDeletionPolicy getIndexDeletionPolicy() {
8484
return indexDeletionPolicy;
8585
}
86+
87+
public TranslogDeletionPolicy getTranslogDeletionPolicy() {
88+
return translogDeletionPolicy;
89+
}
8690
}

core/src/main/java/org/elasticsearch/index/engine/InternalEngine.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,8 @@ private void recoverFromTranslogInternal() throws IOException {
305305
Translog.TranslogGeneration translogGeneration = translog.getGeneration();
306306
final int opsRecovered;
307307
try {
308-
Translog.Snapshot snapshot = translog.newSnapshot();
308+
final long translogGen = Long.parseLong(lastCommittedSegmentInfos.getUserData().get(Translog.TRANSLOG_GENERATION_KEY));
309+
Translog.Snapshot snapshot = translog.newSnapshot(translogGen);
309310
opsRecovered = config().getTranslogRecoveryRunner().run(this, snapshot);
310311
} catch (Exception e) {
311312
throw new EngineException(shardId, "failed to recover from translog", e);
@@ -321,6 +322,8 @@ private void recoverFromTranslogInternal() throws IOException {
321322
} else if (translog.isCurrent(translogGeneration) == false) {
322323
commitIndexWriter(indexWriter, translog, lastCommittedSegmentInfos.getUserData().get(Engine.SYNC_COMMIT_ID));
323324
}
325+
// clean up what's not needed
326+
translog.trimUnreferencedReaders();
324327
}
325328

326329
private Translog openTranslog(EngineConfig engineConfig, IndexWriter writer, TranslogDeletionPolicy translogDeletionPolicy, LongSupplier globalCheckpointSupplier) throws IOException {
@@ -1772,7 +1775,7 @@ protected void doRun() throws Exception {
17721775
* @param syncId the sync flush ID ({@code null} if not committing a synced flush)
17731776
* @throws IOException if an I/O exception occurs committing the specified writer
17741777
*/
1775-
private void commitIndexWriter(final IndexWriter writer, final Translog translog, @Nullable final String syncId) throws IOException {
1778+
protected void commitIndexWriter(final IndexWriter writer, final Translog translog, @Nullable final String syncId) throws IOException {
17761779
ensureCanFlush();
17771780
try {
17781781
final long localCheckpoint = seqNoService().getLocalCheckpoint();

core/src/main/java/org/elasticsearch/index/translog/Checkpoint.java

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ final class Checkpoint {
4444
final long minSeqNo;
4545
final long maxSeqNo;
4646
final long globalCheckpoint;
47+
final long minTranslogGeneration;
4748

4849
private static final int INITIAL_VERSION = 1; // start with 1, just to recognize there was some magic serialization logic before
4950
private static final int CURRENT_VERSION = 2; // introduction of global checkpoints
@@ -58,6 +59,7 @@ final class Checkpoint {
5859
+ Long.BYTES // minimum sequence number, introduced in 6.0.0
5960
+ Long.BYTES // maximum sequence number, introduced in 6.0.0
6061
+ Long.BYTES // global checkpoint, introduced in 6.0.0
62+
+ Long.BYTES // minimum translog generation in the translog - introduced in 6.0.0
6163
+ CodecUtil.footerLength();
6264

6365
// size of 5.0.0 checkpoint
@@ -76,15 +78,19 @@ final class Checkpoint {
7678
* @param minSeqNo the current minimum sequence number of all operations in the translog
7779
* @param maxSeqNo the current maximum sequence number of all operations in the translog
7880
* @param globalCheckpoint the last-known global checkpoint
81+
* @param minTranslogGeneration the minimum generation referenced by the translog at this moment.
7982
*/
80-
Checkpoint(long offset, int numOps, long generation, long minSeqNo, long maxSeqNo, long globalCheckpoint) {
81-
assert minSeqNo <= maxSeqNo;
83+
Checkpoint(long offset, int numOps, long generation, long minSeqNo, long maxSeqNo, long globalCheckpoint, long minTranslogGeneration) {
84+
assert minSeqNo <= maxSeqNo : "minSeqNo [" + minSeqNo + "] is higher than maxSeqNo [" + maxSeqNo + "]";
85+
assert minTranslogGeneration <= generation :
86+
"minTranslogGen [" + minTranslogGeneration + "] is higher than generation [" + generation + "]";
8287
this.offset = offset;
8388
this.numOps = numOps;
8489
this.generation = generation;
8590
this.minSeqNo = minSeqNo;
8691
this.maxSeqNo = maxSeqNo;
8792
this.globalCheckpoint = globalCheckpoint;
93+
this.minTranslogGeneration = minTranslogGeneration;
8894
}
8995

9096
private void write(DataOutput out) throws IOException {
@@ -94,24 +100,27 @@ private void write(DataOutput out) throws IOException {
94100
out.writeLong(minSeqNo);
95101
out.writeLong(maxSeqNo);
96102
out.writeLong(globalCheckpoint);
103+
out.writeLong(minTranslogGeneration);
97104
}
98105

99-
static Checkpoint emptyTranslogCheckpoint(final long offset, final long generation, final long globalCheckpoint) {
106+
static Checkpoint emptyTranslogCheckpoint(final long offset, final long generation, final long globalCheckpoint,
107+
long minTranslogGeneration) {
100108
final long minSeqNo = SequenceNumbersService.NO_OPS_PERFORMED;
101109
final long maxSeqNo = SequenceNumbersService.NO_OPS_PERFORMED;
102-
return new Checkpoint(offset, 0, generation, minSeqNo, maxSeqNo, globalCheckpoint);
110+
return new Checkpoint(offset, 0, generation, minSeqNo, maxSeqNo, globalCheckpoint, minTranslogGeneration);
103111
}
104112

105113
static Checkpoint readCheckpointV6_0_0(final DataInput in) throws IOException {
106-
return new Checkpoint(in.readLong(), in.readInt(), in.readLong(), in.readLong(), in.readLong(), in.readLong());
114+
return new Checkpoint(in.readLong(), in.readInt(), in.readLong(), in.readLong(), in.readLong(), in.readLong(), in.readLong());
107115
}
108116

109117
// reads a checksummed checkpoint introduced in ES 5.0.0
110118
static Checkpoint readCheckpointV5_0_0(final DataInput in) throws IOException {
111119
final long minSeqNo = SequenceNumbersService.NO_OPS_PERFORMED;
112120
final long maxSeqNo = SequenceNumbersService.NO_OPS_PERFORMED;
113121
final long globalCheckpoint = SequenceNumbersService.UNASSIGNED_SEQ_NO;
114-
return new Checkpoint(in.readLong(), in.readInt(), in.readLong(), minSeqNo, maxSeqNo, globalCheckpoint);
122+
final long minTranslogGeneration = -1L;
123+
return new Checkpoint(in.readLong(), in.readInt(), in.readLong(), minSeqNo, maxSeqNo, globalCheckpoint, minTranslogGeneration);
115124
}
116125

117126
@Override
@@ -123,6 +132,7 @@ public String toString() {
123132
", minSeqNo=" + minSeqNo +
124133
", maxSeqNo=" + maxSeqNo +
125134
", globalCheckpoint=" + globalCheckpoint +
135+
", minTranslogGeneration=" + minTranslogGeneration +
126136
'}';
127137
}
128138

0 commit comments

Comments
 (0)