-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Rollback a primary before recovering from translog #27804
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 15 commits
03a3d58
0aaee95
6aa3200
7c11d37
d94b06e
f6b5c58
db24f52
63c3cd4
fba0784
0a447e9
4c4a1c7
f412572
4892029
40d5d24
cebbe6d
ac01498
1b1b984
9db0d41
b6b5226
3b265e0
d5558d0
2bd3354
b27ac02
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,8 @@ | |
| import org.elasticsearch.index.translog.TranslogDeletionPolicy; | ||
|
|
||
| import java.io.IOException; | ||
| import java.nio.file.Path; | ||
| import java.util.ArrayList; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.function.LongSupplier; | ||
|
|
@@ -37,7 +39,7 @@ | |
| * In particular, this policy will delete index commits whose max sequence number is at most | ||
| * the current global checkpoint except the index commit which has the highest max sequence number among those. | ||
| */ | ||
| final class CombinedDeletionPolicy extends IndexDeletionPolicy { | ||
| public final class CombinedDeletionPolicy extends IndexDeletionPolicy { | ||
| private final TranslogDeletionPolicy translogDeletionPolicy; | ||
| private final EngineConfig.OpenMode openMode; | ||
| private final LongSupplier globalCheckpointSupplier; | ||
|
|
@@ -71,7 +73,7 @@ public void onInit(List<? extends IndexCommit> commits) throws IOException { | |
|
|
||
| @Override | ||
| public void onCommit(List<? extends IndexCommit> commits) throws IOException { | ||
| final int keptPosition = indexOfKeptCommits(commits); | ||
| final int keptPosition = indexOfKeptCommits(commits, globalCheckpointSupplier.getAsLong()); | ||
| for (int i = 0; i < keptPosition; i++) { | ||
| commits.get(i).delete(); | ||
| } | ||
|
|
@@ -90,12 +92,38 @@ private void updateTranslogDeletionPolicy(final IndexCommit minRequiredCommit, f | |
| translogDeletionPolicy.setMinTranslogGenerationForRecovery(minRequiredGen); | ||
| } | ||
|
|
||
| /** | ||
| * Selects a starting commit point from a list of existing commits based on the persisted global checkpoint from translog | ||
| * and the retained translog generations. All the required translog files of a starting commit point must exist, | ||
| * and its max seqno should be at most the global checkpoint from the translog checkpoint. | ||
| * | ||
| * @param commits a list of existing commit points | ||
| * @param globalCheckpoint the persisted global checkpoint from the translog, see {@link Translog#readGlobalCheckpoint(Path)} | ||
| * @param minRetainedTranslogGen the minimum translog generation is retained, see {@link Translog#readMinReferencedTranslogGen(Path)} | ||
| */ | ||
| public static IndexCommit startingCommitPoint(List<IndexCommit> commits, long globalCheckpoint, long minRetainedTranslogGen) | ||
| throws IOException { | ||
| if (commits.isEmpty()) { | ||
| throw new IllegalArgumentException("Commit list must not empty"); | ||
| } | ||
| // Snapshotted commits may not have all its required translog. | ||
| final List<IndexCommit> recoverableCommits = new ArrayList<>(); | ||
| for (IndexCommit commit : commits) { | ||
| if (minRetainedTranslogGen <= Long.parseLong(commit.getUserData().get(Translog.TRANSLOG_GENERATION_KEY))) { | ||
|
||
| recoverableCommits.add(commit); | ||
| } | ||
| } | ||
| assert recoverableCommits.isEmpty() == false : "Unable to select a proper starting commit point; " + | ||
| "commits [" + commits + "], minRetainedTranslogGen [" + minRetainedTranslogGen + "]"; | ||
| final int keptPosition = indexOfKeptCommits(recoverableCommits, globalCheckpoint); | ||
| return recoverableCommits.get(keptPosition); | ||
| } | ||
|
|
||
| /** | ||
| * Find the highest index position of a safe index commit whose max sequence number is not greater than the global checkpoint. | ||
| * Index commits with different translog UUID will be filtered out as they don't belong to this engine. | ||
| */ | ||
| private int indexOfKeptCommits(List<? extends IndexCommit> commits) throws IOException { | ||
| final long currentGlobalCheckpoint = globalCheckpointSupplier.getAsLong(); | ||
| private static int indexOfKeptCommits(List<? extends IndexCommit> commits, long globalCheckpoint) throws IOException { | ||
| final String expectedTranslogUUID = commits.get(commits.size() - 1).getUserData().get(Translog.TRANSLOG_UUID_KEY); | ||
|
|
||
| // Commits are sorted by age (the 0th one is the oldest commit). | ||
|
|
@@ -110,7 +138,7 @@ private int indexOfKeptCommits(List<? extends IndexCommit> commits) throws IOExc | |
| return i; | ||
| } | ||
| final long maxSeqNoFromCommit = Long.parseLong(commitUserData.get(SequenceNumbers.MAX_SEQ_NO)); | ||
| if (maxSeqNoFromCommit <= currentGlobalCheckpoint) { | ||
| if (maxSeqNoFromCommit <= globalCheckpoint) { | ||
| return i; | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ | |
|
|
||
| import org.apache.logging.log4j.Logger; | ||
| import org.apache.lucene.index.DirectoryReader; | ||
| import org.apache.lucene.index.IndexCommit; | ||
| import org.apache.lucene.index.IndexFormatTooOldException; | ||
| import org.apache.lucene.index.IndexReader; | ||
| import org.apache.lucene.index.IndexWriter; | ||
|
|
@@ -79,6 +80,7 @@ | |
| import org.elasticsearch.threadpool.ThreadPool; | ||
|
|
||
| import java.io.IOException; | ||
| import java.nio.file.Path; | ||
| import java.util.Arrays; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
|
|
@@ -139,6 +141,7 @@ public class InternalEngine extends Engine { | |
| private final AtomicLong maxUnsafeAutoIdTimestamp = new AtomicLong(-1); | ||
| private final CounterMetric numVersionLookups = new CounterMetric(); | ||
| private final CounterMetric numIndexVersionsLookups = new CounterMetric(); | ||
| private final IndexCommit startingCommit; | ||
| /** | ||
| * How many bytes we are currently moving to disk, via either IndexWriter.flush or refresh. IndexingMemoryController polls this | ||
| * across all shards to decide if throttling is necessary because moving bytes to disk is falling behind vs incoming documents | ||
|
|
@@ -179,6 +182,9 @@ public InternalEngine(EngineConfig engineConfig) { | |
| mergeScheduler = scheduler = new EngineMergeScheduler(engineConfig.getShardId(), engineConfig.getIndexSettings()); | ||
| throttle = new IndexThrottle(); | ||
| try { | ||
| this.startingCommit = getStartingCommitPoint(); | ||
|
||
| assert startingCommit == null || openMode == EngineConfig.OpenMode.OPEN_INDEX_AND_TRANSLOG : | ||
|
||
| "OPEN_INDEX_AND_TRANSLOG must have starting commit; mode [" + openMode + "]; startingCommit [" + startingCommit + "]"; | ||
| this.localCheckpointTracker = createLocalCheckpointTracker(localCheckpointTrackerSupplier); | ||
| translog = openTranslog(engineConfig, translogDeletionPolicy, engineConfig.getGlobalCheckpointSupplier()); | ||
| assert translog.getGeneration() != null; | ||
|
|
@@ -237,23 +243,24 @@ public InternalEngine(EngineConfig engineConfig) { | |
|
|
||
| private LocalCheckpointTracker createLocalCheckpointTracker( | ||
| BiFunction<Long, Long, LocalCheckpointTracker> localCheckpointTrackerSupplier) throws IOException { | ||
| final long maxSeqNo; | ||
| final long localCheckpoint; | ||
| switch (openMode) { | ||
| case CREATE_INDEX_AND_TRANSLOG: | ||
| maxSeqNo = SequenceNumbers.NO_OPS_PERFORMED; | ||
| localCheckpoint = SequenceNumbers.NO_OPS_PERFORMED; | ||
| break; | ||
| case OPEN_INDEX_AND_TRANSLOG: | ||
| return localCheckpointTrackerSupplier.apply(SequenceNumbers.NO_OPS_PERFORMED, SequenceNumbers.NO_OPS_PERFORMED); | ||
| case OPEN_INDEX_CREATE_TRANSLOG: | ||
| final Tuple<Long, Long> seqNoStats = store.loadSeqNoInfo(); | ||
| maxSeqNo = seqNoStats.v1(); | ||
| localCheckpoint = seqNoStats.v2(); | ||
| final Tuple<Long, Long> seqNoInfo = store.loadSeqNoInfo(null); | ||
| logger.trace("recovered maximum sequence number [{}] and local checkpoint [{}]", seqNoInfo.v1(), seqNoInfo.v2()); | ||
| return localCheckpointTrackerSupplier.apply(seqNoInfo.v1(), seqNoInfo.v2()); | ||
| case OPEN_INDEX_AND_TRANSLOG: | ||
| // When recovering from a previous commit point, we use the local checkpoint from that commit, | ||
| // but the max_seqno from the last commit. This allows use to throw away stale operations. | ||
|
||
| assert startingCommit != null; | ||
| final long localCheckpoint = store.loadSeqNoInfo(startingCommit).v2(); | ||
| final long maxSeqNo = store.loadSeqNoInfo(null).v1(); | ||
| logger.trace("recovered maximum sequence number [{}] and local checkpoint [{}]", maxSeqNo, localCheckpoint); | ||
| break; | ||
| default: throw new IllegalArgumentException("unknown type: " + openMode); | ||
| return localCheckpointTrackerSupplier.apply(maxSeqNo, localCheckpoint); | ||
| default: | ||
| throw new IllegalArgumentException("unknown type: " + openMode); | ||
| } | ||
| return localCheckpointTrackerSupplier.apply(maxSeqNo, localCheckpoint); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i like the final approach better this gives a uniform return value (easier to understand) and it makes sure these things are set. |
||
| } | ||
|
|
||
| /** | ||
|
|
@@ -381,6 +388,9 @@ public InternalEngine recoverFromTranslog() throws IOException { | |
| if (openMode != EngineConfig.OpenMode.OPEN_INDEX_AND_TRANSLOG) { | ||
| throw new IllegalStateException("Can't recover from translog with open mode: " + openMode); | ||
| } | ||
| if (startingCommit == null) { | ||
| throw new IllegalStateException("Can't recover from translog without starting commit "); | ||
| } | ||
| if (pendingTranslogRecovery.get() == false) { | ||
| throw new IllegalStateException("Engine has already been recovered"); | ||
| } | ||
|
|
@@ -401,10 +411,21 @@ public InternalEngine recoverFromTranslog() throws IOException { | |
| return this; | ||
| } | ||
|
|
||
| private IndexCommit getStartingCommitPoint() throws IOException { | ||
| if (openMode == EngineConfig.OpenMode.OPEN_INDEX_AND_TRANSLOG) { | ||
| final Path translogPath = engineConfig.getTranslogConfig().getTranslogPath(); | ||
| final List<IndexCommit> existingCommits = DirectoryReader.listCommits(store.directory()); | ||
| return CombinedDeletionPolicy.startingCommitPoint(existingCommits, | ||
| Translog.readGlobalCheckpoint(translogPath), | ||
| Translog.readMinReferencedTranslogGen(translogPath)); | ||
| } | ||
| return null; | ||
| } | ||
|
|
||
| private void recoverFromTranslogInternal() throws IOException { | ||
| Translog.TranslogGeneration translogGeneration = translog.getGeneration(); | ||
| final int opsRecovered; | ||
| final long translogGen = Long.parseLong(lastCommittedSegmentInfos.getUserData().get(Translog.TRANSLOG_GENERATION_KEY)); | ||
| final long translogGen = Long.parseLong(startingCommit.getUserData().get(Translog.TRANSLOG_GENERATION_KEY)); | ||
|
||
| try (Translog.Snapshot snapshot = translog.newSnapshotFromGen(translogGen)) { | ||
| opsRecovered = config().getTranslogRecoveryRunner().run(this, snapshot); | ||
| } catch (Exception e) { | ||
|
|
@@ -1785,6 +1806,8 @@ private long loadCurrentVersionFromIndex(Term uid) throws IOException { | |
| private IndexWriter createWriter(boolean create) throws IOException { | ||
| try { | ||
| final IndexWriterConfig iwc = getIndexWriterConfig(create); | ||
| assert startingCommit == null || create == false : "Starting commit makes sense only when create=false"; | ||
| iwc.setIndexCommit(startingCommit); | ||
|
||
| return createWriter(store.directory(), iwc); | ||
| } catch (LockObtainFailedException ex) { | ||
| logger.warn("could not lock IndexWriter", ex); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
wondering - should we call this findSafeCommit? I suspect that this will be use full for later too.