-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Propagate max_auto_id_timestamp in peer recovery #33693
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
60aa124
27cf199
d08cdd0
e0c48da
be3c3bd
360adb2
8975a1e
377267a
e6a929a
b30ff8e
07e1621
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1014,8 +1014,7 @@ private boolean mayHaveBeenIndexedBefore(Index index) { | |
| final boolean mayHaveBeenIndexBefore; | ||
| if (index.isRetry()) { | ||
| mayHaveBeenIndexBefore = true; | ||
| maxUnsafeAutoIdTimestamp.updateAndGet(curr -> Math.max(index.getAutoGeneratedIdTimestamp(), curr)); | ||
| assert maxUnsafeAutoIdTimestamp.get() >= index.getAutoGeneratedIdTimestamp(); | ||
| updateMaxAutoIdTimestamp(index.getAutoGeneratedIdTimestamp()); | ||
| } else { | ||
| // in this case we force | ||
| mayHaveBeenIndexBefore = maxUnsafeAutoIdTimestamp.get() >= index.getAutoGeneratedIdTimestamp(); | ||
|
|
@@ -2531,4 +2530,16 @@ void updateRefreshedCheckpoint(long checkpoint) { | |
| assert refreshedCheckpoint.get() >= checkpoint : refreshedCheckpoint.get() + " < " + checkpoint; | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public long getMaxAutoIdTimestamp() { | ||
|
||
| return maxUnsafeAutoIdTimestamp.get(); | ||
| } | ||
|
|
||
| @Override | ||
| public void updateMaxAutoIdTimestamp(long newTimestamp) { | ||
|
||
| assert newTimestamp >= -1 : "invalid timestamp [" + newTimestamp + "]"; | ||
| maxUnsafeAutoIdTimestamp.updateAndGet(curr -> Math.max(curr, newTimestamp)); | ||
| assert newTimestamp <= maxUnsafeAutoIdTimestamp.get(); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -201,6 +201,8 @@ public RecoveryResponse recoverToTarget() throws IOException { | |||
| runUnderPrimaryPermit(() -> shard.initiateTracking(request.targetAllocationId()), | ||||
| shardId + " initiating tracking of " + request.targetAllocationId(), shard, cancellableThreads, logger); | ||||
|
|
||||
| // DISCUSS: Is it possible for an operation to be delivered via recovery first, then delivered again via replication? | ||||
|
||||
| if (appendOnlyRequest && mayHaveBeenIndexedBefore(index) == false && index.seqNo() > maxSeqNoOfNonAppendOnlyOperations.get()) { |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Instead of using a new one for every batch that is to be sent, I would prefer to capture this after we call cancellableThreads.execute(() -> shard.waitForOpsToComplete(endingSeqNo)); in RecoverySourceHandler, and then only pass that same value. You could also add a comment then and there saying why we do it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We have to do this after the snapshot was captured. That said, I'm +1 on explicitly capturing it once at the right moment and using the same value.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,6 +24,7 @@ | |
| import org.apache.lucene.index.Term; | ||
| import org.apache.lucene.search.TermQuery; | ||
| import org.apache.lucene.search.TopDocs; | ||
| import org.elasticsearch.Version; | ||
| import org.elasticsearch.action.DocWriteResponse; | ||
| import org.elasticsearch.action.bulk.BulkItemResponse; | ||
| import org.elasticsearch.action.bulk.BulkShardRequest; | ||
|
|
@@ -141,10 +142,81 @@ public void cleanFiles(int totalTranslogOps, Store.MetadataSnapshot sourceMetaDa | |
| } | ||
| } | ||
|
|
||
| public void testRetryAppendOnlyWhileRecovering() throws Exception { | ||
|
||
| try (ReplicationGroup shards = createGroup(0)) { | ||
| shards.startAll(); | ||
| final IndexRequest originalRequest = new IndexRequest(index.getName(), "type").source("{}", XContentType.JSON); | ||
| originalRequest.process(Version.CURRENT, null, index.getName()); | ||
| final IndexRequest retryRequest = copyIndexRequest(originalRequest); | ||
| retryRequest.onRetry(); | ||
| shards.index(retryRequest); | ||
| IndexShard replica = shards.addReplica(); | ||
| shards.recoverReplica(replica); | ||
| shards.assertAllEqual(1); | ||
| shards.index(originalRequest); | ||
| shards.assertAllEqual(1); | ||
| assertThat(replica.getMaxAutoIdTimestamp(), equalTo(originalRequest.getAutoGeneratedTimestamp())); | ||
| assertThat(replica.getMaxAutoIdTimestamp(), equalTo(shards.getPrimary().getMaxAutoIdTimestamp())); | ||
| } | ||
| } | ||
|
|
||
| public void testAppendOnlyRecoveryThenReplication() throws Exception { | ||
| CountDownLatch indexedOnPrimary = new CountDownLatch(1); | ||
| CountDownLatch recoveryDone = new CountDownLatch(1); | ||
| try (ReplicationGroup shards = new ReplicationGroup(buildIndexMetaData(1)) { | ||
| @Override | ||
| protected EngineFactory getEngineFactory(ShardRouting routing) { | ||
| return config -> new InternalEngine(config) { | ||
| @Override | ||
| public IndexResult index(Index op) throws IOException { | ||
| IndexResult result = super.index(op); | ||
| if (op.origin() == Operation.Origin.PRIMARY) { | ||
| indexedOnPrimary.countDown(); | ||
| // prevent the indexing on the primary from returning (it was added to Lucene and translog already) | ||
| // to make sure that this operation is replicated to the replica via recovery, then via replication. | ||
| try { | ||
| recoveryDone.await(); | ||
| } catch (InterruptedException e) { | ||
| throw new AssertionError(e); | ||
| } | ||
| } | ||
| return result; | ||
| } | ||
| }; | ||
| } | ||
| }) { | ||
| shards.startAll(); | ||
| Thread thread = new Thread(() -> { | ||
| IndexRequest indexRequest = new IndexRequest(index.getName(), "type").source("{}", XContentType.JSON); | ||
| try { | ||
| shards.index(indexRequest); | ||
| } catch (Exception e) { | ||
| throw new AssertionError(e); | ||
| } | ||
| }); | ||
| thread.start(); | ||
| IndexShard replica = shards.addReplica(); | ||
| Future<Void> fut = shards.asyncRecoverReplica(replica, | ||
| (shard, node) -> new RecoveryTarget(shard, node, recoveryListener, v -> {}){ | ||
| @Override | ||
| public void prepareForTranslogOperations(boolean fileBasedRecovery, int totalTranslogOps) throws IOException { | ||
| try { | ||
| indexedOnPrimary.await(); | ||
| } catch (InterruptedException e) { | ||
| throw new AssertionError(e); | ||
| } | ||
| super.prepareForTranslogOperations(fileBasedRecovery, totalTranslogOps); | ||
| } | ||
| }); | ||
| fut.get(); | ||
| recoveryDone.countDown(); | ||
| thread.join(); | ||
| shards.assertAllEqual(1); | ||
| } | ||
| } | ||
|
|
||
| public void testInheritMaxValidAutoIDTimestampOnRecovery() throws Exception { | ||
| //TODO: Enable this test with soft-deletes once we have timestamps | ||
| Settings settings = Settings.builder().put(IndexSettings.INDEX_SOFT_DELETES_SETTING.getKey(), false).build(); | ||
| try (ReplicationGroup shards = createGroup(0, settings)) { | ||
| try (ReplicationGroup shards = createGroup(0)) { | ||
| shards.startAll(); | ||
| final IndexRequest indexRequest = new IndexRequest(index.getName(), "type").source("{}", XContentType.JSON); | ||
| indexRequest.onRetry(); // force an update of the timestamp | ||
|
|
@@ -161,6 +233,7 @@ public void testInheritMaxValidAutoIDTimestampOnRecovery() throws Exception { | |
| assertNotEquals(IndexRequest.UNSET_AUTO_GENERATED_TIMESTAMP, primarySegmentStats.getMaxUnsafeAutoIdTimestamp()); | ||
| assertEquals(primarySegmentStats.getMaxUnsafeAutoIdTimestamp(), segmentsStats.getMaxUnsafeAutoIdTimestamp()); | ||
| assertNotEquals(Long.MAX_VALUE, segmentsStats.getMaxUnsafeAutoIdTimestamp()); | ||
| assertThat(replica.getMaxAutoIdTimestamp(), equalTo(shards.getPrimary().getMaxAutoIdTimestamp())); | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we want to speak about updating the unsafe marker here?