-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Let primary own its replication group #25692
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
ywelsch
merged 7 commits into
elastic:master
from
ywelsch:enhance/gcptracker-coordinator
Jul 14, 2017
Merged
Changes from 1 commit
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
5296871
GCP coordinates replication and recovery
ywelsch a8c3ab8
Take relocating shards into account for BWC
ywelsch 4f86056
address review comments
ywelsch 132302b
Merge branch 'master' into enhance/gcptracker-coordinator
ywelsch 297f7bf
Merge remote-tracking branch 'elastic/master' into enhance/gcptracker-coordinator
ywelsch 3cee847
checkstyle
ywelsch de7113f
assertion message
ywelsch File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -129,21 +129,16 @@ public StartRecoveryRequest getRequest() { | |
| * performs the recovery from the local engine to the target | ||
| */ | ||
| public RecoveryResponse recoverToTarget() throws IOException { | ||
| cancellableThreads.execute(() -> runUnderOperationPermit(() -> { | ||
| runUnderPrimaryPermit(() -> { | ||
| final IndexShardRoutingTable routingTable = shard.getReplicationGroup().getRoutingTable(); | ||
| ShardRouting targetShardRouting = routingTable.getByAllocationId(request.targetAllocationId()); | ||
| if (targetShardRouting == null) { | ||
| logger.debug("delaying recovery of {} as it is not listed as assigned to target node {}", request.shardId(), | ||
| request.targetNode()); | ||
| throw new DelayRecoveryException("source node does not have the shard listed in its state as allocated on the node"); | ||
| } | ||
| if (targetShardRouting.initializing() == false) { | ||
| logger.debug("delaying recovery of {} as it is not listed as initializing on the source node {}. " + | ||
| "known shards state is [{}]", request.shardId(), request.sourceNode(), targetShardRouting.state()); | ||
| throw new DelayRecoveryException("source node has the state of the target shard to be [" + | ||
| targetShardRouting.state() + "], expecting to be [initializing]"); | ||
| } | ||
| })); | ||
| assert targetShardRouting.initializing() : "expected recovery target to be initializing but was " + targetShardRouting; | ||
| }); | ||
|
|
||
| try (Translog.View translogView = shard.acquireTranslogView()) { | ||
|
|
||
|
|
@@ -179,7 +174,7 @@ public RecoveryResponse recoverToTarget() throws IOException { | |
| } | ||
| } | ||
|
|
||
| cancellableThreads.execute(() -> runUnderOperationPermit(() -> shard.initiateTracking(request.targetAllocationId()))); | ||
| runUnderPrimaryPermit(() -> shard.initiateTracking(request.targetAllocationId())); | ||
|
|
||
| try { | ||
| prepareTargetForTranslog(translogView.estimateTotalOperations(startingSeqNo)); | ||
|
|
@@ -200,15 +195,19 @@ public RecoveryResponse recoverToTarget() throws IOException { | |
| return response; | ||
| } | ||
|
|
||
| private void runUnderOperationPermit(CancellableThreads.Interruptable runnable) throws InterruptedException { | ||
| final PlainActionFuture<Releasable> onAcquired = new PlainActionFuture<>(); | ||
| shard.acquirePrimaryOperationPermit(onAcquired, ThreadPool.Names.SAME); | ||
| try (Releasable ignored = onAcquired.actionGet()) { | ||
| if (shard.state() == IndexShardState.RELOCATED) { | ||
| throw new IndexShardRelocatedException(shard.shardId()); | ||
| private void runUnderPrimaryPermit(CancellableThreads.Interruptable runnable) { | ||
| cancellableThreads.execute(() -> { | ||
| final PlainActionFuture<Releasable> onAcquired = new PlainActionFuture<>(); | ||
| shard.acquirePrimaryOperationPermit(onAcquired, ThreadPool.Names.SAME); | ||
| try (Releasable ignored = onAcquired.actionGet()) { | ||
| // check that the IndexShard still has the primary authority. This needs to be checked under operation permit to prevent | ||
| // races, as IndexShard will change its state to RELOCATED only when it holds all operation permits, see IndexShard.relocated() | ||
| if (shard.state() == IndexShardState.RELOCATED) { | ||
| throw new IndexShardRelocatedException(shard.shardId()); | ||
| } | ||
| runnable.run(); | ||
| } | ||
| runnable.run(); | ||
| } | ||
| }); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -461,19 +460,18 @@ public void finalizeRecovery(final long targetLocalCheckpoint) { | |
| cancellableThreads.checkForCancel(); | ||
| StopWatch stopWatch = new StopWatch().start(); | ||
| logger.trace("finalizing recovery"); | ||
| cancellableThreads.execute(() -> { | ||
| /* | ||
| * Before marking the shard as in-sync we acquire an operation permit. We do this so that there is a barrier between marking a | ||
| * shard as in-sync and relocating a shard. If we acquire the permit then no relocation handoff can complete before we are done | ||
| * marking the shard as in-sync. If the relocation handoff holds all the permits then after the handoff completes and we acquire | ||
| * the permit then the state of the shard will be relocated and this recovery will fail. | ||
| */ | ||
| runUnderOperationPermit(() -> shard.markAllocationIdAsInSync(request.targetAllocationId(), targetLocalCheckpoint)); | ||
| recoveryTarget.finalizeRecovery(shard.getGlobalCheckpoint()); | ||
| }); | ||
| /* | ||
| * Before marking the shard as in-sync we acquire an operation permit. We do this so that there is a barrier between marking a | ||
| * shard as in-sync and relocating a shard. If we acquire the permit then no relocation handoff can complete before we are done | ||
| * marking the shard as in-sync. If the relocation handoff holds all the permits then after the handoff completes and we acquire | ||
| * the permit then the state of the shard will be relocated and this recovery will fail. | ||
| */ | ||
| runUnderPrimaryPermit(() -> shard.markAllocationIdAsInSync(request.targetAllocationId(), targetLocalCheckpoint)); | ||
| cancellableThreads.execute(() -> recoveryTarget.finalizeRecovery(shard.getGlobalCheckpoint())); | ||
|
|
||
| if (request.isPrimaryRelocation()) { | ||
| logger.trace("performing relocation hand-off"); | ||
| // this acquires all IndexShard operation permits and will thus delay new recoveries until it is done | ||
| cancellableThreads.execute(() -> shard.relocated("to " + request.targetNode(), recoveryTarget::handoffPrimaryContext)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we add a comment here saying that this will acquire all permits and will thus delay new recoveries until it's done? |
||
| /* | ||
| * if the recovery process fails after setting the shard state to RELOCATED, both relocation source and | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
message?