elastic
diff --git a/‎core/src/main/java/org/elasticsearch/action/support/replication/ReplicationOperation.java‎
Lines changed: 36 additions & 67 deletions b/‎core/src/main/java/org/elasticsearch/action/support/replication/ReplicationOperation.java‎
Lines changed: 36 additions & 67 deletions
diff --git a/‎core/src/main/java/org/elasticsearch/action/support/replication/TransportReplicationAction.java‎
Lines changed: 8 additions & 2 deletions b/‎core/src/main/java/org/elasticsearch/action/support/replication/TransportReplicationAction.java‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎core/src/main/java/org/elasticsearch/cluster/routing/IndexShardRoutingTable.java‎
Lines changed: 41 additions & 1 deletion b/‎core/src/main/java/org/elasticsearch/cluster/routing/IndexShardRoutingTable.java‎
Lines changed: 41 additions & 1 deletion
diff --git a/‎core/src/main/java/org/elasticsearch/cluster/routing/RoutingTable.java‎
Lines changed: 1 addition & 11 deletions b/‎core/src/main/java/org/elasticsearch/cluster/routing/RoutingTable.java‎
Lines changed: 1 addition & 11 deletions
diff --git a/‎core/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java‎
Lines changed: 1 addition & 0 deletions b/‎core/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java‎
Lines changed: 1 addition & 0 deletions
@@ -27,15 +27,13 @@
 import org.elasticsearch.action.UnavailableShardsException;
 import org.elasticsearch.action.support.ActiveShardCount;
 import org.elasticsearch.action.support.TransportActions;
-import org.elasticsearch.cluster.ClusterState;
-import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.cluster.routing.AllocationId;
-import org.elasticsearch.cluster.routing.IndexRoutingTable;
 import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
 import org.elasticsearch.cluster.routing.ShardRouting;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.util.set.Sets;
+import org.elasticsearch.index.shard.ReplicationGroup;
 import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.rest.RestStatus;
 
@@ -49,8 +47,8 @@
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.function.Consumer;
-import java.util.function.Supplier;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 public class ReplicationOperation<
             Request extends ReplicationRequest<Request>,
@@ -59,7 +57,6 @@ public class ReplicationOperation<
         > {
     private final Logger logger;
     private final Request request;
-    private final Supplier<ClusterState> clusterStateSupplier;
     private final String opType;
     private final AtomicInteger totalShards = new AtomicInteger();
     /**
@@ -86,13 +83,12 @@ public class ReplicationOperation<
     public ReplicationOperation(Request request, Primary<Request, ReplicaRequest, PrimaryResultT> primary,
                                 ActionListener<PrimaryResultT> listener,
                                 Replicas<ReplicaRequest> replicas,
-                                Supplier<ClusterState> clusterStateSupplier, Logger logger, String opType) {
+                                Logger logger, String opType) {
         this.replicasProxy = replicas;
         this.primary = primary;
         this.resultListener = listener;
         this.logger = logger;
         this.request = request;
-        this.clusterStateSupplier = clusterStateSupplier;
         this.opType = opType;
     }
 
@@ -117,51 +113,45 @@ public void execute() throws Exception {
                 logger.trace("[{}] op [{}] completed on primary for request [{}]", primaryId, opType, request);
             }
 
-            // we have to get a new state after successfully indexing into the primary in order to honour recovery semantics.
+            // we have to get the replication group after successfully indexing into the primary in order to honour recovery semantics.
             // we have to make sure that every operation indexed into the primary after recovery start will also be replicated
-            // to the recovery target. If we use an old cluster state, we may miss a relocation that has started since then.
-            ClusterState clusterState = clusterStateSupplier.get();
-            final List<ShardRouting> shards = getShards(primaryId, clusterState);
-            Set<String> inSyncAllocationIds = getInSyncAllocationIds(primaryId, clusterState);
-
-            markUnavailableShardsAsStale(replicaRequest, inSyncAllocationIds, shards);
-
-            performOnReplicas(replicaRequest, primary.globalCheckpoint(), shards);
+            // to the recovery target. If we used an old replication group, we may miss a recovery that has started since then.
+            // we also have to make sure to get the global checkpoint before the replication group, to ensure that the global checkpoint
+            // is valid for this replication group. If we sampled in the reverse order, the global checkpoint might be based on a subset
+            // of the sampled replication group, and be advanced further than the given replication group would allow.
+            // This would mean that some shards could learn about a global checkpoint that is higher than their local checkpoint.
+            final long globalCheckpoint = primary.globalCheckpoint();
+            final ReplicationGroup replicationGroup = primary.getReplicationGroup();
+            markUnavailableShardsAsStale(replicaRequest, replicationGroup.getInSyncAllocationIds(), replicationGroup.getRoutingTable());
+            performOnReplicas(replicaRequest, globalCheckpoint, replicationGroup.getRoutingTable());
         }
 
         successfulShards.incrementAndGet();  // mark primary as successful
         decPendingAndFinishIfNeeded();
     }
 
-    private void markUnavailableShardsAsStale(ReplicaRequest replicaRequest, Set<String> inSyncAllocationIds, List<ShardRouting> shards) {
-        if (inSyncAllocationIds.isEmpty() == false && shards.isEmpty() == false) {
-            Set<String> availableAllocationIds = shards.stream()
-                .map(ShardRouting::allocationId)
-                .filter(Objects::nonNull)
-                .map(AllocationId::getId)
-                .collect(Collectors.toSet());
-
-            // if inSyncAllocationIds contains allocation ids of shards that don't exist in RoutingTable, mark copies as stale
-            for (String allocationId : Sets.difference(inSyncAllocationIds, availableAllocationIds)) {
-                // mark copy as stale
-                pendingActions.incrementAndGet();
-                replicasProxy.markShardCopyAsStaleIfNeeded(replicaRequest.shardId(), allocationId, replicaRequest.primaryTerm(),
-                    ReplicationOperation.this::decPendingAndFinishIfNeeded,
-                    ReplicationOperation.this::onPrimaryDemoted,
-                    throwable -> decPendingAndFinishIfNeeded()
-                );
-            }
+    private void markUnavailableShardsAsStale(ReplicaRequest replicaRequest, Set<String> inSyncAllocationIds,
+                                              IndexShardRoutingTable indexShardRoutingTable) {
+        // if inSyncAllocationIds contains allocation ids of shards that don't exist in RoutingTable, mark copies as stale
+        for (String allocationId : Sets.difference(inSyncAllocationIds, indexShardRoutingTable.getAllAllocationIds())) {
+            // mark copy as stale
+            pendingActions.incrementAndGet();
+            replicasProxy.markShardCopyAsStaleIfNeeded(replicaRequest.shardId(), allocationId, replicaRequest.primaryTerm(),
+                ReplicationOperation.this::decPendingAndFinishIfNeeded,
+                ReplicationOperation.this::onPrimaryDemoted,
+                throwable -> decPendingAndFinishIfNeeded()
+            );
         }
     }
 
-    private void performOnReplicas(final ReplicaRequest replicaRequest, final long globalCheckpoint, final List<ShardRouting> shards) {
+    private void performOnReplicas(final ReplicaRequest replicaRequest, final long globalCheckpoint,
+                                   final IndexShardRoutingTable indexShardRoutingTable) {
         final String localNodeId = primary.routingEntry().currentNodeId();
         // If the index gets deleted after primary operation, we skip replication
-        for (final ShardRouting shard : shards) {
+        for (final ShardRouting shard : indexShardRoutingTable) {
             if (shard.unassigned()) {
-                if (shard.primary() == false) {
-                    totalShards.incrementAndGet();
-                }
+                assert shard.primary() == false : "primary shard should not be unassigned in a replication group: " + shard;
+                totalShards.incrementAndGet();
                 continue;
             }
 
@@ -238,23 +228,11 @@ private void onPrimaryDemoted(Exception demotionFailure) {
      */
     protected String checkActiveShardCount() {
         final ShardId shardId = primary.routingEntry().shardId();
-        final String indexName = shardId.getIndexName();
-        final ClusterState state = clusterStateSupplier.get();
-        assert state != null : "replication operation must have access to the cluster state";
         final ActiveShardCount waitForActiveShards = request.waitForActiveShards();
         if (waitForActiveShards == ActiveShardCount.NONE) {
             return null;  // not waiting for any shards
         }
-        IndexRoutingTable indexRoutingTable = state.getRoutingTable().index(indexName);
-        if (indexRoutingTable == null) {
-            logger.trace("[{}] index not found in the routing table", shardId);
-            return "Index " + indexName + " not found in the routing table";
-        }
-        IndexShardRoutingTable shardRoutingTable = indexRoutingTable.shard(shardId.getId());
-        if (shardRoutingTable == null) {
-            logger.trace("[{}] shard not found in the routing table", shardId);
-            return "Shard " + shardId + " not found in the routing table";
-        }
+        final IndexShardRoutingTable shardRoutingTable = primary.getReplicationGroup().getRoutingTable();
         if (waitForActiveShards.enoughShardsActive(shardRoutingTable)) {
             return null;
         } else {
@@ -268,21 +246,6 @@ protected String checkActiveShardCount() {
         }
     }
 
-    protected Set<String> getInSyncAllocationIds(ShardId shardId, ClusterState clusterState) {
-        IndexMetaData indexMetaData = clusterState.metaData().index(shardId.getIndex());
-        if (indexMetaData != null) {
-            return indexMetaData.inSyncAllocationIds(shardId.id());
-        }
-        return Collections.emptySet();
-    }
-
-    protected List<ShardRouting> getShards(ShardId shardId, ClusterState state) {
-        // can be null if the index is deleted / closed on us..
-        final IndexShardRoutingTable shardRoutingTable = state.getRoutingTable().shardRoutingTableOrNull(shardId);
-        List<ShardRouting> shards = shardRoutingTable == null ? Collections.emptyList() : shardRoutingTable.shards();
-        return shards;
-    }
-
     private void decPendingAndFinishIfNeeded() {
         assert pendingActions.get() > 0 : "pending action count goes below 0 for request [" + request + "]";
         if (pendingActions.decrementAndGet() == 0) {
@@ -371,6 +334,12 @@ public interface Primary<
          */
         long globalCheckpoint();
 
+        /**
+         * Returns the current replication group on the primary shard
+         *
+         * @return the replication group
+         */
+        ReplicationGroup getReplicationGroup();
     }
 
     /**
 
@@ -56,6 +56,7 @@
 import org.elasticsearch.index.seqno.SequenceNumbersService;
 import org.elasticsearch.index.shard.IndexShard;
 import org.elasticsearch.index.shard.IndexShardState;
+import org.elasticsearch.index.shard.ReplicationGroup;
 import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.index.shard.ShardNotFoundException;
 import org.elasticsearch.indices.IndexClosedException;
@@ -383,7 +384,7 @@ protected ReplicationOperation<Request, ReplicaRequest, PrimaryResult<ReplicaReq
             Request request, ActionListener<PrimaryResult<ReplicaRequest, Response>> listener,
             PrimaryShardReference primaryShardReference) {
             return new ReplicationOperation<>(request, primaryShardReference, listener,
-                    replicasProxy, clusterService::state, logger, actionName);
+                    replicasProxy, logger, actionName);
         }
     }
 
@@ -629,7 +630,7 @@ public void onFailure(Exception e) {
         }
     }
 
-    private IndexShard getIndexShard(ShardId shardId) {
+    protected IndexShard getIndexShard(ShardId shardId) {
         IndexService indexService = indicesService.indexServiceSafe(shardId.getIndex());
         return indexService.getShard(shardId.id());
     }
@@ -1006,6 +1007,11 @@ public long globalCheckpoint() {
             return indexShard.getGlobalCheckpoint();
         }
 
+        @Override
+        public ReplicationGroup getReplicationGroup() {
+            return indexShard.getReplicationGroup();
+        }
+
     }
 
 
 
@@ -21,6 +21,7 @@
 
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
+import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.Randomness;
 import org.elasticsearch.common.collect.MapBuilder;
 import org.elasticsearch.common.io.stream.StreamInput;
@@ -61,6 +62,7 @@ public class IndexShardRoutingTable implements Iterable<ShardRouting> {
     final List<ShardRouting> shards;
     final List<ShardRouting> activeShards;
     final List<ShardRouting> assignedShards;
+    final Set<String> allAllocationIds;
     static final List<ShardRouting> NO_SHARDS = Collections.emptyList();
     final boolean allShardsStarted;
 
@@ -84,6 +86,7 @@ public class IndexShardRoutingTable implements Iterable<ShardRouting> {
         List<ShardRouting> activeShards = new ArrayList<>();
         List<ShardRouting> assignedShards = new ArrayList<>();
         List<ShardRouting> allInitializingShards = new ArrayList<>();
+        Set<String> allAllocationIds = new HashSet<>();
         boolean allShardsStarted = true;
         for (ShardRouting shard : shards) {
             if (shard.primary()) {
@@ -100,9 +103,11 @@ public class IndexShardRoutingTable implements Iterable<ShardRouting> {
             if (shard.relocating()) {
                 // create the target initializing shard routing on the node the shard is relocating to
                 allInitializingShards.add(shard.getTargetRelocatingShard());
+                allAllocationIds.add(shard.getTargetRelocatingShard().allocationId().getId());
             }
             if (shard.assignedToNode()) {
                 assignedShards.add(shard);
+                allAllocationIds.add(shard.allocationId().getId());
             }
             if (shard.state() != ShardRoutingState.STARTED) {
                 allShardsStarted = false;
@@ -119,6 +124,7 @@ public class IndexShardRoutingTable implements Iterable<ShardRouting> {
         this.activeShards = Collections.unmodifiableList(activeShards);
         this.assignedShards = Collections.unmodifiableList(assignedShards);
         this.allInitializingShards = Collections.unmodifiableList(allInitializingShards);
+        this.allAllocationIds = Collections.unmodifiableSet(allAllocationIds);
     }
 
     /**
@@ -435,6 +441,25 @@ public boolean allShardsStarted() {
         return allShardsStarted;
     }
 
+    @Nullable
+    public ShardRouting getByAllocationId(String allocationId) {
+        for (ShardRouting shardRouting : assignedShards()) {
+            if (shardRouting.allocationId().getId().equals(allocationId)) {
+                return shardRouting;
+            }
+            if (shardRouting.relocating()) {
+                if (shardRouting.getTargetRelocatingShard().allocationId().getId().equals(allocationId)) {
+                    return shardRouting.getTargetRelocatingShard();
+                }
+            }
+        }
+        return null;
+    }
+
+    public Set<String> getAllAllocationIds() {
+        return allAllocationIds;
+    }
+
     static class AttributesKey {
 
         final String[] attributes;
@@ -634,7 +659,7 @@ public static IndexShardRoutingTable readFromThin(StreamInput in, Index index) t
         }
 
         public static void writeTo(IndexShardRoutingTable indexShard, StreamOutput out) throws IOException {
-            out.writeString(indexShard.shardId().getIndex().getName());
+            indexShard.shardId().getIndex().writeTo(out);
             writeToThin(indexShard, out);
         }
 
@@ -648,4 +673,19 @@ public static void writeToThin(IndexShardRoutingTable indexShard, StreamOutput o
         }
 
     }
+
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("IndexShardRoutingTable(").append(shardId()).append("){");
+        final int numShards = shards.size();
+        for (int i = 0; i < numShards; i++) {
+            sb.append(shards.get(i).shortSummary());
+            if (i < numShards - 1) {
+                sb.append(", ");
+            }
+        }
+        sb.append("}");
+        return sb.toString();
+    }
 }
@@ -156,17 +156,7 @@ public ShardRouting getByAllocationId(ShardId shardId, String allocationId) {
         if (shardRoutingTable == null) {
             return null;
         }
-        for (ShardRouting shardRouting : shardRoutingTable.assignedShards()) {
-            if (shardRouting.allocationId().getId().equals(allocationId)) {
-                return shardRouting;
-            }
-            if (shardRouting.relocating()) {
-                if (shardRouting.getTargetRelocatingShard().allocationId().getId().equals(allocationId)) {
-                    return shardRouting.getTargetRelocatingShard();
-                }
-            }
-        }
-        return null;
+        return shardRoutingTable.getByAllocationId(allocationId);
     }
 
 
 
@@ -82,6 +82,7 @@ public final class ShardRouting implements Writeable, ToXContent {
         assert !(state == ShardRoutingState.UNASSIGNED && unassignedInfo == null) : "unassigned shard must be created with meta";
         assert (state == ShardRoutingState.UNASSIGNED || state == ShardRoutingState.INITIALIZING) == (recoverySource != null) : "recovery source only available on unassigned or initializing shard but was " + state;
         assert recoverySource == null || recoverySource == PeerRecoverySource.INSTANCE || primary : "replica shards always recover from primary";
+        assert (currentNodeId == null) == (state == ShardRoutingState.UNASSIGNED)  : "unassigned shard must not be assigned to a node " + this;
     }
 
     @Nullable
Original file line number	Diff line number	Diff line change
`@@ -56,6 +56,7 @@`
`56`	`56`	`import org.elasticsearch.index.seqno.SequenceNumbersService;`
`57`	`57`	`import org.elasticsearch.index.shard.IndexShard;`
`58`	`58`	`import org.elasticsearch.index.shard.IndexShardState;`
	`59`	`+import org.elasticsearch.index.shard.ReplicationGroup;`
`59`	`60`	`import org.elasticsearch.index.shard.ShardId;`
`60`	`61`	`import org.elasticsearch.index.shard.ShardNotFoundException;`
`61`	`62`	`import org.elasticsearch.indices.IndexClosedException;`
`@@ -383,7 +384,7 @@ protected ReplicationOperation<Request, ReplicaRequest, PrimaryResult<ReplicaReq`
`383`	`384`	`Request request, ActionListener<PrimaryResult<ReplicaRequest, Response>> listener,`
`384`	`385`	`PrimaryShardReference primaryShardReference) {`
`385`	`386`	`return new ReplicationOperation<>(request, primaryShardReference, listener,`
`386`		`- replicasProxy, clusterService::state, logger, actionName);`
	`387`	`+ replicasProxy, logger, actionName);`
`387`	`388`	`}`
`388`	`389`	`}`
`389`	`390`
`@@ -629,7 +630,7 @@ public void onFailure(Exception e) {`
`629`	`630`	`}`
`630`	`631`	`}`
`631`	`632`
`632`		`- private IndexShard getIndexShard(ShardId shardId) {`
	`633`	`+ protected IndexShard getIndexShard(ShardId shardId) {`
`633`	`634`	`IndexService indexService = indicesService.indexServiceSafe(shardId.getIndex());`
`634`	`635`	`return indexService.getShard(shardId.id());`
`635`	`636`	`}`
`@@ -1006,6 +1007,11 @@ public long globalCheckpoint() {`
`1006`	`1007`	`return indexShard.getGlobalCheckpoint();`
`1007`	`1008`	`}`
`1008`	`1009`
	`1010`	`+ @Override`
	`1011`	`+ public ReplicationGroup getReplicationGroup() {`
	`1012`	`+ return indexShard.getReplicationGroup();`
	`1013`	`+ }`
	`1014`	`+`
`1009`	`1015`	`}`
`1010`	`1016`
`1011`	`1017`
Original file line number	Diff line number	Diff line change
`@@ -82,6 +82,7 @@ public final class ShardRouting implements Writeable, ToXContent {`
`82`	`82`	`assert !(state == ShardRoutingState.UNASSIGNED && unassignedInfo == null) : "unassigned shard must be created with meta";`
`83`	`83`	`assert (state == ShardRoutingState.UNASSIGNED \|\| state == ShardRoutingState.INITIALIZING) == (recoverySource != null) : "recovery source only available on unassigned or initializing shard but was " + state;`
`84`	`84`	`assert recoverySource == null \|\| recoverySource == PeerRecoverySource.INSTANCE \|\| primary : "replica shards always recover from primary";`
	`85`	`+ assert (currentNodeId == null) == (state == ShardRoutingState.UNASSIGNED) : "unassigned shard must not be assigned to a node " + this;`
`85`	`86`	`}`
`86`	`87`
`87`	`88`	`@Nullable`