Skip to content

Commit 8f0b357

Browse files
authored
Let primary own its replication group (#25692)
Currently replication and recovery are both coordinated through the latest cluster state available on the ClusterService as well as through the GlobalCheckpointTracker (to have consistent local/global checkpoint information), making it difficult to understand the relation between recovery and replication, and requiring some tricky checks in the recovery code to coordinate between the two. This commit makes the primary the single owner of its replication group, which simplifies the replication model and allows to clean up corner cases we have in our recovery code. It also reduces the dependencies in the code, so that neither RecoverySourceXXX nor ReplicationOperation need access to the latest state on ClusterService anymore. Finally, it gives us the property that in-sync shard copies won't receive global checkpoint updates which are above their local checkpoint (relates #25485).
1 parent f809a12 commit 8f0b357

29 files changed

+686
-630
lines changed

core/src/main/java/org/elasticsearch/action/support/replication/ReplicationOperation.java

Lines changed: 36 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,13 @@
2727
import org.elasticsearch.action.UnavailableShardsException;
2828
import org.elasticsearch.action.support.ActiveShardCount;
2929
import org.elasticsearch.action.support.TransportActions;
30-
import org.elasticsearch.cluster.ClusterState;
31-
import org.elasticsearch.cluster.metadata.IndexMetaData;
3230
import org.elasticsearch.cluster.routing.AllocationId;
33-
import org.elasticsearch.cluster.routing.IndexRoutingTable;
3431
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
3532
import org.elasticsearch.cluster.routing.ShardRouting;
3633
import org.elasticsearch.common.Nullable;
3734
import org.elasticsearch.common.io.stream.StreamInput;
3835
import org.elasticsearch.common.util.set.Sets;
36+
import org.elasticsearch.index.shard.ReplicationGroup;
3937
import org.elasticsearch.index.shard.ShardId;
4038
import org.elasticsearch.rest.RestStatus;
4139

@@ -49,8 +47,8 @@
4947
import java.util.concurrent.atomic.AtomicBoolean;
5048
import java.util.concurrent.atomic.AtomicInteger;
5149
import java.util.function.Consumer;
52-
import java.util.function.Supplier;
5350
import java.util.stream.Collectors;
51+
import java.util.stream.Stream;
5452

5553
public class ReplicationOperation<
5654
Request extends ReplicationRequest<Request>,
@@ -59,7 +57,6 @@ public class ReplicationOperation<
5957
> {
6058
private final Logger logger;
6159
private final Request request;
62-
private final Supplier<ClusterState> clusterStateSupplier;
6360
private final String opType;
6461
private final AtomicInteger totalShards = new AtomicInteger();
6562
/**
@@ -86,13 +83,12 @@ public class ReplicationOperation<
8683
public ReplicationOperation(Request request, Primary<Request, ReplicaRequest, PrimaryResultT> primary,
8784
ActionListener<PrimaryResultT> listener,
8885
Replicas<ReplicaRequest> replicas,
89-
Supplier<ClusterState> clusterStateSupplier, Logger logger, String opType) {
86+
Logger logger, String opType) {
9087
this.replicasProxy = replicas;
9188
this.primary = primary;
9289
this.resultListener = listener;
9390
this.logger = logger;
9491
this.request = request;
95-
this.clusterStateSupplier = clusterStateSupplier;
9692
this.opType = opType;
9793
}
9894

@@ -117,51 +113,45 @@ public void execute() throws Exception {
117113
logger.trace("[{}] op [{}] completed on primary for request [{}]", primaryId, opType, request);
118114
}
119115

120-
// we have to get a new state after successfully indexing into the primary in order to honour recovery semantics.
116+
// we have to get the replication group after successfully indexing into the primary in order to honour recovery semantics.
121117
// we have to make sure that every operation indexed into the primary after recovery start will also be replicated
122-
// to the recovery target. If we use an old cluster state, we may miss a relocation that has started since then.
123-
ClusterState clusterState = clusterStateSupplier.get();
124-
final List<ShardRouting> shards = getShards(primaryId, clusterState);
125-
Set<String> inSyncAllocationIds = getInSyncAllocationIds(primaryId, clusterState);
126-
127-
markUnavailableShardsAsStale(replicaRequest, inSyncAllocationIds, shards);
128-
129-
performOnReplicas(replicaRequest, primary.globalCheckpoint(), shards);
118+
// to the recovery target. If we used an old replication group, we may miss a recovery that has started since then.
119+
// we also have to make sure to get the global checkpoint before the replication group, to ensure that the global checkpoint
120+
// is valid for this replication group. If we would sample in the reverse, the global checkpoint might be based on a subset
121+
// of the sampled replication group, and advanced further than what the given replication group would allow it to.
122+
// This would entail that some shards could learn about a global checkpoint that would be higher than its local checkpoint.
123+
final long globalCheckpoint = primary.globalCheckpoint();
124+
final ReplicationGroup replicationGroup = primary.getReplicationGroup();
125+
markUnavailableShardsAsStale(replicaRequest, replicationGroup.getInSyncAllocationIds(), replicationGroup.getRoutingTable());
126+
performOnReplicas(replicaRequest, globalCheckpoint, replicationGroup.getRoutingTable());
130127
}
131128

132129
successfulShards.incrementAndGet(); // mark primary as successful
133130
decPendingAndFinishIfNeeded();
134131
}
135132

136-
private void markUnavailableShardsAsStale(ReplicaRequest replicaRequest, Set<String> inSyncAllocationIds, List<ShardRouting> shards) {
137-
if (inSyncAllocationIds.isEmpty() == false && shards.isEmpty() == false) {
138-
Set<String> availableAllocationIds = shards.stream()
139-
.map(ShardRouting::allocationId)
140-
.filter(Objects::nonNull)
141-
.map(AllocationId::getId)
142-
.collect(Collectors.toSet());
143-
144-
// if inSyncAllocationIds contains allocation ids of shards that don't exist in RoutingTable, mark copies as stale
145-
for (String allocationId : Sets.difference(inSyncAllocationIds, availableAllocationIds)) {
146-
// mark copy as stale
147-
pendingActions.incrementAndGet();
148-
replicasProxy.markShardCopyAsStaleIfNeeded(replicaRequest.shardId(), allocationId, replicaRequest.primaryTerm(),
149-
ReplicationOperation.this::decPendingAndFinishIfNeeded,
150-
ReplicationOperation.this::onPrimaryDemoted,
151-
throwable -> decPendingAndFinishIfNeeded()
152-
);
153-
}
133+
private void markUnavailableShardsAsStale(ReplicaRequest replicaRequest, Set<String> inSyncAllocationIds,
134+
IndexShardRoutingTable indexShardRoutingTable) {
135+
// if inSyncAllocationIds contains allocation ids of shards that don't exist in RoutingTable, mark copies as stale
136+
for (String allocationId : Sets.difference(inSyncAllocationIds, indexShardRoutingTable.getAllAllocationIds())) {
137+
// mark copy as stale
138+
pendingActions.incrementAndGet();
139+
replicasProxy.markShardCopyAsStaleIfNeeded(replicaRequest.shardId(), allocationId, replicaRequest.primaryTerm(),
140+
ReplicationOperation.this::decPendingAndFinishIfNeeded,
141+
ReplicationOperation.this::onPrimaryDemoted,
142+
throwable -> decPendingAndFinishIfNeeded()
143+
);
154144
}
155145
}
156146

157-
private void performOnReplicas(final ReplicaRequest replicaRequest, final long globalCheckpoint, final List<ShardRouting> shards) {
147+
private void performOnReplicas(final ReplicaRequest replicaRequest, final long globalCheckpoint,
148+
final IndexShardRoutingTable indexShardRoutingTable) {
158149
final String localNodeId = primary.routingEntry().currentNodeId();
159150
// If the index gets deleted after primary operation, we skip replication
160-
for (final ShardRouting shard : shards) {
151+
for (final ShardRouting shard : indexShardRoutingTable) {
161152
if (shard.unassigned()) {
162-
if (shard.primary() == false) {
163-
totalShards.incrementAndGet();
164-
}
153+
assert shard.primary() == false : "primary shard should not be unassigned in a replication group: " + shard;
154+
totalShards.incrementAndGet();
165155
continue;
166156
}
167157

@@ -238,23 +228,11 @@ private void onPrimaryDemoted(Exception demotionFailure) {
238228
*/
239229
protected String checkActiveShardCount() {
240230
final ShardId shardId = primary.routingEntry().shardId();
241-
final String indexName = shardId.getIndexName();
242-
final ClusterState state = clusterStateSupplier.get();
243-
assert state != null : "replication operation must have access to the cluster state";
244231
final ActiveShardCount waitForActiveShards = request.waitForActiveShards();
245232
if (waitForActiveShards == ActiveShardCount.NONE) {
246233
return null; // not waiting for any shards
247234
}
248-
IndexRoutingTable indexRoutingTable = state.getRoutingTable().index(indexName);
249-
if (indexRoutingTable == null) {
250-
logger.trace("[{}] index not found in the routing table", shardId);
251-
return "Index " + indexName + " not found in the routing table";
252-
}
253-
IndexShardRoutingTable shardRoutingTable = indexRoutingTable.shard(shardId.getId());
254-
if (shardRoutingTable == null) {
255-
logger.trace("[{}] shard not found in the routing table", shardId);
256-
return "Shard " + shardId + " not found in the routing table";
257-
}
235+
final IndexShardRoutingTable shardRoutingTable = primary.getReplicationGroup().getRoutingTable();
258236
if (waitForActiveShards.enoughShardsActive(shardRoutingTable)) {
259237
return null;
260238
} else {
@@ -268,21 +246,6 @@ protected String checkActiveShardCount() {
268246
}
269247
}
270248

271-
protected Set<String> getInSyncAllocationIds(ShardId shardId, ClusterState clusterState) {
272-
IndexMetaData indexMetaData = clusterState.metaData().index(shardId.getIndex());
273-
if (indexMetaData != null) {
274-
return indexMetaData.inSyncAllocationIds(shardId.id());
275-
}
276-
return Collections.emptySet();
277-
}
278-
279-
protected List<ShardRouting> getShards(ShardId shardId, ClusterState state) {
280-
// can be null if the index is deleted / closed on us..
281-
final IndexShardRoutingTable shardRoutingTable = state.getRoutingTable().shardRoutingTableOrNull(shardId);
282-
List<ShardRouting> shards = shardRoutingTable == null ? Collections.emptyList() : shardRoutingTable.shards();
283-
return shards;
284-
}
285-
286249
private void decPendingAndFinishIfNeeded() {
287250
assert pendingActions.get() > 0 : "pending action count goes below 0 for request [" + request + "]";
288251
if (pendingActions.decrementAndGet() == 0) {
@@ -371,6 +334,12 @@ public interface Primary<
371334
*/
372335
long globalCheckpoint();
373336

337+
/**
338+
* Returns the current replication group on the primary shard
339+
*
340+
* @return the replication group
341+
*/
342+
ReplicationGroup getReplicationGroup();
374343
}
375344

376345
/**

core/src/main/java/org/elasticsearch/action/support/replication/TransportReplicationAction.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
import org.elasticsearch.index.seqno.SequenceNumbersService;
5757
import org.elasticsearch.index.shard.IndexShard;
5858
import org.elasticsearch.index.shard.IndexShardState;
59+
import org.elasticsearch.index.shard.ReplicationGroup;
5960
import org.elasticsearch.index.shard.ShardId;
6061
import org.elasticsearch.index.shard.ShardNotFoundException;
6162
import org.elasticsearch.indices.IndexClosedException;
@@ -383,7 +384,7 @@ protected ReplicationOperation<Request, ReplicaRequest, PrimaryResult<ReplicaReq
383384
Request request, ActionListener<PrimaryResult<ReplicaRequest, Response>> listener,
384385
PrimaryShardReference primaryShardReference) {
385386
return new ReplicationOperation<>(request, primaryShardReference, listener,
386-
replicasProxy, clusterService::state, logger, actionName);
387+
replicasProxy, logger, actionName);
387388
}
388389
}
389390

@@ -629,7 +630,7 @@ public void onFailure(Exception e) {
629630
}
630631
}
631632

632-
private IndexShard getIndexShard(ShardId shardId) {
633+
protected IndexShard getIndexShard(ShardId shardId) {
633634
IndexService indexService = indicesService.indexServiceSafe(shardId.getIndex());
634635
return indexService.getShard(shardId.id());
635636
}
@@ -1006,6 +1007,11 @@ public long globalCheckpoint() {
10061007
return indexShard.getGlobalCheckpoint();
10071008
}
10081009

1010+
@Override
1011+
public ReplicationGroup getReplicationGroup() {
1012+
return indexShard.getReplicationGroup();
1013+
}
1014+
10091015
}
10101016

10111017

core/src/main/java/org/elasticsearch/cluster/routing/IndexShardRoutingTable.java

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import org.elasticsearch.cluster.node.DiscoveryNode;
2323
import org.elasticsearch.cluster.node.DiscoveryNodes;
24+
import org.elasticsearch.common.Nullable;
2425
import org.elasticsearch.common.Randomness;
2526
import org.elasticsearch.common.collect.MapBuilder;
2627
import org.elasticsearch.common.io.stream.StreamInput;
@@ -61,6 +62,7 @@ public class IndexShardRoutingTable implements Iterable<ShardRouting> {
6162
final List<ShardRouting> shards;
6263
final List<ShardRouting> activeShards;
6364
final List<ShardRouting> assignedShards;
65+
final Set<String> allAllocationIds;
6466
static final List<ShardRouting> NO_SHARDS = Collections.emptyList();
6567
final boolean allShardsStarted;
6668

@@ -84,6 +86,7 @@ public class IndexShardRoutingTable implements Iterable<ShardRouting> {
8486
List<ShardRouting> activeShards = new ArrayList<>();
8587
List<ShardRouting> assignedShards = new ArrayList<>();
8688
List<ShardRouting> allInitializingShards = new ArrayList<>();
89+
Set<String> allAllocationIds = new HashSet<>();
8790
boolean allShardsStarted = true;
8891
for (ShardRouting shard : shards) {
8992
if (shard.primary()) {
@@ -100,9 +103,11 @@ public class IndexShardRoutingTable implements Iterable<ShardRouting> {
100103
if (shard.relocating()) {
101104
// create the target initializing shard routing on the node the shard is relocating to
102105
allInitializingShards.add(shard.getTargetRelocatingShard());
106+
allAllocationIds.add(shard.getTargetRelocatingShard().allocationId().getId());
103107
}
104108
if (shard.assignedToNode()) {
105109
assignedShards.add(shard);
110+
allAllocationIds.add(shard.allocationId().getId());
106111
}
107112
if (shard.state() != ShardRoutingState.STARTED) {
108113
allShardsStarted = false;
@@ -119,6 +124,7 @@ public class IndexShardRoutingTable implements Iterable<ShardRouting> {
119124
this.activeShards = Collections.unmodifiableList(activeShards);
120125
this.assignedShards = Collections.unmodifiableList(assignedShards);
121126
this.allInitializingShards = Collections.unmodifiableList(allInitializingShards);
127+
this.allAllocationIds = Collections.unmodifiableSet(allAllocationIds);
122128
}
123129

124130
/**
@@ -435,6 +441,25 @@ public boolean allShardsStarted() {
435441
return allShardsStarted;
436442
}
437443

444+
@Nullable
445+
public ShardRouting getByAllocationId(String allocationId) {
446+
for (ShardRouting shardRouting : assignedShards()) {
447+
if (shardRouting.allocationId().getId().equals(allocationId)) {
448+
return shardRouting;
449+
}
450+
if (shardRouting.relocating()) {
451+
if (shardRouting.getTargetRelocatingShard().allocationId().getId().equals(allocationId)) {
452+
return shardRouting.getTargetRelocatingShard();
453+
}
454+
}
455+
}
456+
return null;
457+
}
458+
459+
public Set<String> getAllAllocationIds() {
460+
return allAllocationIds;
461+
}
462+
438463
static class AttributesKey {
439464

440465
final String[] attributes;
@@ -634,7 +659,7 @@ public static IndexShardRoutingTable readFromThin(StreamInput in, Index index) t
634659
}
635660

636661
public static void writeTo(IndexShardRoutingTable indexShard, StreamOutput out) throws IOException {
637-
out.writeString(indexShard.shardId().getIndex().getName());
662+
indexShard.shardId().getIndex().writeTo(out);
638663
writeToThin(indexShard, out);
639664
}
640665

@@ -648,4 +673,19 @@ public static void writeToThin(IndexShardRoutingTable indexShard, StreamOutput o
648673
}
649674

650675
}
676+
677+
@Override
678+
public String toString() {
679+
StringBuilder sb = new StringBuilder();
680+
sb.append("IndexShardRoutingTable(").append(shardId()).append("){");
681+
final int numShards = shards.size();
682+
for (int i = 0; i < numShards; i++) {
683+
sb.append(shards.get(i).shortSummary());
684+
if (i < numShards - 1) {
685+
sb.append(", ");
686+
}
687+
}
688+
sb.append("}");
689+
return sb.toString();
690+
}
651691
}

core/src/main/java/org/elasticsearch/cluster/routing/RoutingTable.java

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -156,17 +156,7 @@ public ShardRouting getByAllocationId(ShardId shardId, String allocationId) {
156156
if (shardRoutingTable == null) {
157157
return null;
158158
}
159-
for (ShardRouting shardRouting : shardRoutingTable.assignedShards()) {
160-
if (shardRouting.allocationId().getId().equals(allocationId)) {
161-
return shardRouting;
162-
}
163-
if (shardRouting.relocating()) {
164-
if (shardRouting.getTargetRelocatingShard().allocationId().getId().equals(allocationId)) {
165-
return shardRouting.getTargetRelocatingShard();
166-
}
167-
}
168-
}
169-
return null;
159+
return shardRoutingTable.getByAllocationId(allocationId);
170160
}
171161

172162

core/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ public final class ShardRouting implements Writeable, ToXContent {
8282
assert !(state == ShardRoutingState.UNASSIGNED && unassignedInfo == null) : "unassigned shard must be created with meta";
8383
assert (state == ShardRoutingState.UNASSIGNED || state == ShardRoutingState.INITIALIZING) == (recoverySource != null) : "recovery source only available on unassigned or initializing shard but was " + state;
8484
assert recoverySource == null || recoverySource == PeerRecoverySource.INSTANCE || primary : "replica shards always recover from primary";
85+
assert (currentNodeId == null) == (state == ShardRoutingState.UNASSIGNED) : "unassigned shard must not be assigned to a node " + this;
8586
}
8687

8788
@Nullable

0 commit comments

Comments
 (0)