DaveCTurner
diff --git a/server/src/main/java/org/elasticsearch/cluster/action/shard/ShardStateAction.java b/server/src/main/java/org/elasticsearch/cluster/action/shard/ShardStateAction.java
Lines changed: 48 additions & 5 deletions
diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java
Lines changed: 0 additions & 1 deletion
diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java
Lines changed: 2 additions & 0 deletions
diff --git a/server/src/test/java/org/elasticsearch/action/admin/indices/shrink/TransportResizeActionTests.java b/server/src/test/java/org/elasticsearch/action/admin/indices/shrink/TransportResizeActionTests.java
Lines changed: 5 additions & 9 deletions
diff --git a/server/src/test/java/org/elasticsearch/action/support/replication/TransportReplicationActionTests.java b/server/src/test/java/org/elasticsearch/action/support/replication/TransportReplicationActionTests.java
Lines changed: 1 addition & 1 deletion
diff --git a/server/src/test/java/org/elasticsearch/cluster/action/shard/ShardFailedClusterStateTaskExecutorTests.java b/server/src/test/java/org/elasticsearch/cluster/action/shard/ShardFailedClusterStateTaskExecutorTests.java
Lines changed: 8 additions & 12 deletions
@@ -47,6 +47,7 @@
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.settings.Setting;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.node.NodeClosedException;
@@ -71,6 +72,7 @@
 import java.util.Objects;
 import java.util.Set;
 import java.util.function.Predicate;
+import java.util.function.Supplier;
 
 public class ShardStateAction {
 
@@ -79,10 +81,23 @@ public class ShardStateAction {
     public static final String SHARD_STARTED_ACTION_NAME = "internal:cluster/shard/started";
     public static final String SHARD_FAILED_ACTION_NAME = "internal:cluster/shard/failure";
 
+    /**
+     * Adjusts the priority of the followup reroute task. NORMAL is right for reasonable clusters, but in a badly configured cluster it may
+     * be necessary to raise this higher to recover the older behaviour of rerouting after processing every shard-started task. Deliberately
+     * undocumented, since this is a last-resort escape hatch for experts rather than something we want to expose to anyone, and deprecated
+     * since we will remove it once we have confirmed from experience that this priority is appropriate in all cases.
+     */
+    public static final Setting<Priority> FOLLOW_UP_REROUTE_PRIORITY_SETTING
+        = new Setting<>("cluster.routing.allocation.shard_state.reroute.priority", Priority.NORMAL.toString(),
+        name -> Priority.valueOf(name.toUpperCase(Locale.ROOT)), // case-insensitive: the value is upper-cased with Locale.ROOT before the lookup
+        Setting.Property.NodeScope, Setting.Property.Dynamic, Setting.Property.Deprecated);
+
     private final TransportService transportService;
     private final ClusterService clusterService;
     private final ThreadPool threadPool;
 
+    private volatile Priority followUpRerouteTaskPriority;
+
     // a list of shards that failed during replication
     // we keep track of these shards in order to avoid sending duplicate failed shard requests for a single failing shard.
     private final TransportRequestDeduplicator<FailedShardEntry> remoteFailedShardsDeduplicator = new TransportRequestDeduplicator<>();
@@ -94,11 +109,18 @@ public ShardStateAction(ClusterService clusterService, TransportService transpor
         this.clusterService = clusterService;
         this.threadPool = threadPool;
 
+        followUpRerouteTaskPriority = FOLLOW_UP_REROUTE_PRIORITY_SETTING.get(clusterService.getSettings());
+        clusterService.getClusterSettings().addSettingsUpdateConsumer(FOLLOW_UP_REROUTE_PRIORITY_SETTING,
+            this::setFollowUpRerouteTaskPriority);
+
         transportService.registerRequestHandler(SHARD_STARTED_ACTION_NAME, ThreadPool.Names.SAME, StartedShardEntry::new,
-            new ShardStartedTransportHandler(clusterService, new ShardStartedClusterStateTaskExecutor(allocationService, logger), logger));
+            new ShardStartedTransportHandler(clusterService,
+                new ShardStartedClusterStateTaskExecutor(allocationService, rerouteService, () -> followUpRerouteTaskPriority, logger),
+                logger));
         transportService.registerRequestHandler(SHARD_FAILED_ACTION_NAME, ThreadPool.Names.SAME, FailedShardEntry::new,
             new ShardFailedTransportHandler(clusterService,
-                new ShardFailedClusterStateTaskExecutor(allocationService, rerouteService, logger), logger));
+                new ShardFailedClusterStateTaskExecutor(allocationService, rerouteService, () -> followUpRerouteTaskPriority, logger),
+                logger));
     }
 
     private void sendShardAction(final String actionName, final ClusterState currentState,
@@ -215,6 +237,10 @@ public void onTimeout(TimeValue timeout) {
         }, changePredicate);
     }
 
+    private void setFollowUpRerouteTaskPriority(Priority followUpRerouteTaskPriority) { // update consumer registered for FOLLOW_UP_REROUTE_PRIORITY_SETTING
+        this.followUpRerouteTaskPriority = followUpRerouteTaskPriority; // volatile field; both executors read it lazily through their priority suppliers
+    }
+
     private static class ShardFailedTransportHandler implements TransportRequestHandler<FailedShardEntry> {
         private final ClusterService clusterService;
         private final ShardFailedClusterStateTaskExecutor shardFailedClusterStateTaskExecutor;
@@ -282,11 +308,14 @@ public static class ShardFailedClusterStateTaskExecutor implements ClusterStateT
         private final AllocationService allocationService;
         private final RerouteService rerouteService;
         private final Logger logger;
+        private final Supplier<Priority> prioritySupplier;
 
-        public ShardFailedClusterStateTaskExecutor(AllocationService allocationService, RerouteService rerouteService, Logger logger) {
+        public ShardFailedClusterStateTaskExecutor(AllocationService allocationService, RerouteService rerouteService,
+                                                   Supplier<Priority> prioritySupplier, Logger logger) {
             this.allocationService = allocationService;
             this.rerouteService = rerouteService;
             this.logger = logger;
+            this.prioritySupplier = prioritySupplier; // queried each time a follow-up reroute is scheduled, in place of the former fixed Priority.HIGH
         }
 
         @Override
@@ -380,7 +409,7 @@ public void clusterStatePublished(ClusterChangedEvent clusterChangedEvent) {
                 // assign it again, even if that means putting it back on the node on which it previously failed:
                 final String reason = String.format(Locale.ROOT, "[%d] unassigned shards after failing shards", numberOfUnassignedShards);
                 logger.trace("{}, scheduling a reroute", reason);
-                rerouteService.reroute(reason, Priority.HIGH, ActionListener.wrap(
+                rerouteService.reroute(reason, prioritySupplier.get(), ActionListener.wrap(
                     r -> logger.trace("{}, reroute completed", reason),
                     e -> logger.debug(new ParameterizedMessage("{}, reroute failed", reason), e)));
             }
@@ -511,10 +540,15 @@ public static class ShardStartedClusterStateTaskExecutor
             implements ClusterStateTaskExecutor<StartedShardEntry>, ClusterStateTaskListener {
         private final AllocationService allocationService;
         private final Logger logger;
+        private final RerouteService rerouteService;
+        private final Supplier<Priority> prioritySupplier;
 
-        public ShardStartedClusterStateTaskExecutor(AllocationService allocationService, Logger logger) {
+        public ShardStartedClusterStateTaskExecutor(AllocationService allocationService, RerouteService rerouteService,
+                                                    Supplier<Priority> prioritySupplier, Logger logger) {
             this.allocationService = allocationService;
             this.logger = logger;
+            this.rerouteService = rerouteService; // used by clusterStatePublished to schedule the follow-up reroute
+            this.prioritySupplier = prioritySupplier; // queried at scheduling time, so a changed priority applies to later reroutes
         }
 
         @Override
@@ -589,6 +623,15 @@ public void onFailure(String source, Exception e) {
                 logger.error(() -> new ParameterizedMessage("unexpected failure during [{}]", source), e);
             }
         }
+
+        @Override
+        public void clusterStatePublished(ClusterChangedEvent clusterChangedEvent) { // schedules a follow-up reroute for the shard-started tasks
+            if (clusterChangedEvent.previousState() != clusterChangedEvent.state()) { // reference comparison: skipped when the event carries the same state object
+                rerouteService.reroute("reroute after starting shards", prioritySupplier.get(), ActionListener.wrap( // priority is read at call time
+                    r -> logger.trace("reroute after starting shards succeeded"),
+                    e -> logger.debug("reroute after starting shards failed", e))); // a failed reroute is only logged, not rethrown
+            }
+        }
     }
 
     public static class StartedShardEntry extends TransportRequest {
 
@@ -109,7 +109,6 @@ public ClusterState applyStartedShards(ClusterState clusterState, List<ShardRout
         Collections.sort(startedShards, Comparator.comparing(ShardRouting::primary));
         applyStartedShards(allocation, startedShards);
         gatewayAllocator.applyStartedShards(allocation, startedShards);
-        reroute(allocation);
         String startedShardsAsString = firstListElementsToCommaDelimitedString(startedShards, s -> s.shardId().toString());
         return buildResultAndLogHealthChange(clusterState, allocation, "shards started [" + startedShardsAsString + "] ...");
     }
 
@@ -31,6 +31,7 @@
 import org.elasticsearch.cluster.InternalClusterInfoService;
 import org.elasticsearch.cluster.NodeConnectionsService;
 import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
+import org.elasticsearch.cluster.action.shard.ShardStateAction;
 import org.elasticsearch.cluster.coordination.ClusterBootstrapService;
 import org.elasticsearch.cluster.coordination.ClusterFormationFailureHelper;
 import org.elasticsearch.cluster.coordination.Coordinator;
@@ -213,6 +214,7 @@ public void apply(Settings value, Settings current, Settings previous) {
             DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS_SETTING,
             DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING,
             SameShardAllocationDecider.CLUSTER_ROUTING_ALLOCATION_SAME_HOST_SETTING,
+            ShardStateAction.FOLLOW_UP_REROUTE_PRIORITY_SETTING,
             InternalClusterInfoService.INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL_SETTING,
             InternalClusterInfoService.INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING,
             DestructiveOperations.REQUIRES_NAME_SETTING,
 
@@ -25,6 +25,7 @@
 import org.elasticsearch.action.support.ActiveShardCount;
 import org.elasticsearch.cluster.ClusterName;
 import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.ESAllocationTestCase;
 import org.elasticsearch.cluster.EmptyClusterInfoService;
 import org.elasticsearch.cluster.block.ClusterBlocks;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
@@ -33,7 +34,6 @@
 import org.elasticsearch.cluster.node.DiscoveryNodeRole;
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.routing.RoutingTable;
-import org.elasticsearch.cluster.routing.ShardRoutingState;
 import org.elasticsearch.cluster.routing.allocation.AllocationService;
 import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator;
 import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders;
@@ -114,8 +114,7 @@ public void testErrorCondition() {
         RoutingTable routingTable = service.reroute(clusterState, "reroute").routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
         // now we start the shard
-        routingTable = service.applyStartedShards(clusterState,
-            routingTable.index("source").shardsWithState(ShardRoutingState.INITIALIZING)).routingTable();
+        routingTable = ESAllocationTestCase.startInitializingShardsAndReroute(service, clusterState, "source").routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
 
         TransportResizeAction.prepareCreateIndexRequest(new ResizeRequest("target", "source"), clusterState,
@@ -133,8 +132,7 @@ public void testPassNumRoutingShards() {
         RoutingTable routingTable = service.reroute(clusterState, "reroute").routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
         // now we start the shard
-        routingTable = service.applyStartedShards(clusterState,
-            routingTable.index("source").shardsWithState(ShardRoutingState.INITIALIZING)).routingTable();
+        routingTable = ESAllocationTestCase.startInitializingShardsAndReroute(service, clusterState, "source").routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
 
         ResizeRequest resizeRequest = new ResizeRequest("target", "source");
@@ -163,8 +161,7 @@ public void testPassNumRoutingShardsAndFail() {
         RoutingTable routingTable = service.reroute(clusterState, "reroute").routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
         // now we start the shard
-        routingTable = service.applyStartedShards(clusterState,
-            routingTable.index("source").shardsWithState(ShardRoutingState.INITIALIZING)).routingTable();
+        routingTable = ESAllocationTestCase.startInitializingShardsAndReroute(service, clusterState, "source").routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
 
         ResizeRequest resizeRequest = new ResizeRequest("target", "source");
@@ -198,8 +195,7 @@ public void testShrinkIndexSettings() {
         RoutingTable routingTable = service.reroute(clusterState, "reroute").routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
         // now we start the shard
-        routingTable = service.applyStartedShards(clusterState,
-            routingTable.index(indexName).shardsWithState(ShardRoutingState.INITIALIZING)).routingTable();
+        routingTable = ESAllocationTestCase.startInitializingShardsAndReroute(service, clusterState, indexName).routingTable();
         clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
         int numSourceShards = clusterState.metaData().index(indexName).getNumberOfShards();
         DocsStats stats = new DocsStats(between(0, (IndexWriter.MAX_DOCS) / numSourceShards), between(1, 1000), between(1, 10000));
 
@@ -466,7 +466,7 @@ public void testNoRerouteOnStaleClusterState() {
         ShardRouting relocationTarget = clusterService.state().getRoutingTable().shardRoutingTable(shardId)
             .shardsWithState(ShardRoutingState.INITIALIZING).get(0);
         AllocationService allocationService = ESAllocationTestCase.createAllocationService();
-        ClusterState updatedState = allocationService.applyStartedShards(state, Collections.singletonList(relocationTarget));
+        ClusterState updatedState = ESAllocationTestCase.startShardsAndReroute(allocationService, state, relocationTarget);
 
         setState(clusterService, updatedState);
         logger.debug("--> relocation complete state:\n{}", clusterService.state());
 
@@ -32,7 +32,6 @@
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.routing.GroupShardsIterator;
 import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
-import org.elasticsearch.cluster.routing.RoutingNodes;
 import org.elasticsearch.cluster.routing.RoutingTable;
 import org.elasticsearch.cluster.routing.ShardIterator;
 import org.elasticsearch.cluster.routing.ShardRouting;
@@ -42,6 +41,7 @@
 import org.elasticsearch.cluster.routing.allocation.FailedShard;
 import org.elasticsearch.cluster.routing.allocation.StaleShard;
 import org.elasticsearch.cluster.routing.allocation.decider.ClusterRebalanceAllocationDecider;
+import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.collect.Tuple;
 import org.elasticsearch.common.settings.Settings;
@@ -50,14 +50,12 @@
 import org.elasticsearch.index.shard.ShardId;
 
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
-import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
 import static org.hamcrest.CoreMatchers.equalTo;
 import static org.hamcrest.CoreMatchers.instanceOf;
 import static org.hamcrest.Matchers.contains;
@@ -89,7 +87,7 @@ public void setUp() throws Exception {
             .build();
         clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
             .metaData(metaData).routingTable(routingTable).build();
-        executor = new ShardStateAction.ShardFailedClusterStateTaskExecutor(allocationService, null, logger);
+        executor = new ShardStateAction.ShardFailedClusterStateTaskExecutor(allocationService, null, () -> Priority.NORMAL, logger);
     }
 
     public void testEmptyTaskListProducesSameClusterState() throws Exception {
@@ -121,7 +119,7 @@ public void testTriviallySuccessfulTasksBatchedWithFailingTasks() throws Excepti
         List<FailedShardEntry> failingTasks = createExistingShards(currentState, reason);
         List<FailedShardEntry> nonExistentTasks = createNonExistentShards(currentState, reason);
         ShardStateAction.ShardFailedClusterStateTaskExecutor failingExecutor =
-            new ShardStateAction.ShardFailedClusterStateTaskExecutor(allocationService, null, logger) {
+            new ShardStateAction.ShardFailedClusterStateTaskExecutor(allocationService, null, () -> Priority.NORMAL, logger) {
                 @Override
                 ClusterState applyFailedShards(ClusterState currentState, List<FailedShard> failedShards, List<StaleShard> staleShards) {
                     throw new RuntimeException("simulated applyFailedShards failure");
@@ -165,22 +163,22 @@ public void testIllegalShardFailureRequests() throws Exception {
     public void testMarkAsStaleWhenFailingShard() throws Exception {
         final MockAllocationService allocation = createAllocationService();
         ClusterState clusterState = createClusterStateWithStartedShards("test markAsStale");
-        clusterState = allocation.applyStartedShards(clusterState, clusterState.getRoutingNodes().shardsWithState(INITIALIZING));
+        clusterState = startInitializingShardsAndReroute(allocation, clusterState);
         IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index(INDEX).shard(0);
         long primaryTerm = clusterState.metaData().index(INDEX).primaryTerm(0);
         final Set<String> oldInSync = clusterState.metaData().index(INDEX).inSyncAllocationIds(0);
         {
             ShardStateAction.FailedShardEntry failShardOnly = new ShardStateAction.FailedShardEntry(shardRoutingTable.shardId(),
                 randomFrom(oldInSync), primaryTerm, "dummy", null, false);
-            ClusterState appliedState = executor.execute(clusterState, Arrays.asList(failShardOnly)).resultingState;
+            ClusterState appliedState = executor.execute(clusterState, Collections.singletonList(failShardOnly)).resultingState;
             Set<String> newInSync = appliedState.metaData().index(INDEX).inSyncAllocationIds(0);
             assertThat(newInSync, equalTo(oldInSync));
         }
         {
             final String failedAllocationId = randomFrom(oldInSync);
             ShardStateAction.FailedShardEntry failAndMarkAsStale = new ShardStateAction.FailedShardEntry(shardRoutingTable.shardId(),
                 failedAllocationId, primaryTerm, "dummy", null, true);
-            ClusterState appliedState = executor.execute(clusterState, Arrays.asList(failAndMarkAsStale)).resultingState;
+            ClusterState appliedState = executor.execute(clusterState, Collections.singletonList(failAndMarkAsStale)).resultingState;
             Set<String> newInSync = appliedState.metaData().index(INDEX).inSyncAllocationIds(0);
             assertThat(Sets.difference(oldInSync, newInSync), contains(failedAllocationId));
         }
@@ -192,11 +190,9 @@ private ClusterState createClusterStateWithStartedShards(String reason) {
         IntStream.rangeClosed(1, numberOfNodes).mapToObj(node -> newNode("node" + node)).forEach(nodes::add);
         ClusterState stateAfterAddingNode =
             ClusterState.builder(clusterState).nodes(nodes).build();
-        RoutingTable afterReroute =
-            allocationService.reroute(stateAfterAddingNode, reason).routingTable();
+        RoutingTable afterReroute = allocationService.reroute(stateAfterAddingNode, reason).routingTable();
         ClusterState stateAfterReroute = ClusterState.builder(stateAfterAddingNode).routingTable(afterReroute).build();
-        RoutingNodes routingNodes = stateAfterReroute.getRoutingNodes();
-        return allocationService.applyStartedShards(stateAfterReroute, routingNodes.shardsWithState(ShardRoutingState.INITIALIZING));
+        return ESAllocationTestCase.startInitializingShardsAndReroute(allocationService, stateAfterReroute);
     }
 
     private List<ShardStateAction.FailedShardEntry> createExistingShards(ClusterState currentState, String reason) {
Original file line number	Diff line number	Diff line change
`@@ -109,7 +109,6 @@ public ClusterState applyStartedShards(ClusterState clusterState, List<ShardRout`
`109`	`109`	`Collections.sort(startedShards, Comparator.comparing(ShardRouting::primary));`
`110`	`110`	`applyStartedShards(allocation, startedShards);`
`111`	`111`	`gatewayAllocator.applyStartedShards(allocation, startedShards);`
`112`		`- reroute(allocation);`
`113`	`112`	`String startedShardsAsString = firstListElementsToCommaDelimitedString(startedShards, s -> s.shardId().toString());`
`114`	`113`	`return buildResultAndLogHealthChange(clusterState, allocation, "shards started [" + startedShardsAsString + "] ...");`
`115`	`114`	`}`