Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
4a0ee8a
Return NOT_PREFERRED decisions in allocation explain
DiannaHohensee Nov 8, 2025
7165744
improve assert message
DiannaHohensee Nov 8, 2025
00a44f4
Merge branch 'main' into 2025/10/23/ES-12833
DiannaHohensee Nov 20, 2025
debef72
fixes per Nick's review
DiannaHohensee Nov 20, 2025
08fad2f
comment improvement
DiannaHohensee Nov 20, 2025
89f83fa
fix tests: Decision.Type enum order; assertion did not account for no…
DiannaHohensee Nov 24, 2025
92d9abe
Merge branch 'main' into 2025/10/23/ES-12833
DiannaHohensee Nov 24, 2025
9968684
fix Decision.Type.allowed() usage
DiannaHohensee Nov 25, 2025
b28ead1
Merge branch 'main' into 2025/10/23/ES-12833
DiannaHohensee Nov 25, 2025
8c6ed36
spotless
DiannaHohensee Nov 25, 2025
e796461
wip -- not final
DiannaHohensee Nov 25, 2025
b9800f0
Merge branch 'main' into 2025/10/23/ES-12833
DiannaHohensee Dec 1, 2025
e4b124e
backwards compatibility for AllocationDecision enum
DiannaHohensee Dec 1, 2025
a515642
Merge branch 'main' into 2025/10/23/ES-12833
DiannaHohensee Dec 1, 2025
f3b0ab6
tidy
DiannaHohensee Dec 1, 2025
f9dae18
Merge branch 'main' into 2025/10/23/ES-12833
DiannaHohensee Dec 4, 2025
8aea860
Add missing transport definition
mark-vieira Dec 4, 2025
daf17ed
undo NOT_PREFERRED leaking through to allocation explain when it shou…
DiannaHohensee Dec 5, 2025
ab3f15c
Merge branch 'main' into 2025/10/23/ES-12833
DiannaHohensee Dec 5, 2025
873b51b
fix yet another test.
DiannaHohensee Dec 11, 2025
5137bd8
fix transport version
DiannaHohensee Dec 11, 2025
2d5af74
Merge branch 'main' into 2025/10/23/ES-12833
DiannaHohensee Dec 11, 2025
763eac1
fix AllocationDecidersTests
DiannaHohensee Dec 11, 2025
6b1a346
Merge branch 'main' into 2025/10/23/ES-12833
DiannaHohensee Dec 11, 2025
4fa0670
fix versioning
DiannaHohensee Dec 11, 2025
9da79ff
fix transport version after merge main
DiannaHohensee Dec 12, 2025
9f0fee3
add assertion to MoveDecision#move; clear targetNode on THROTTLE in #…
DiannaHohensee Dec 15, 2025
f2d429f
Merge branch 'main' into 2025/10/23/ES-12833
DiannaHohensee Dec 15, 2025
0705c5e
fix transport version after merge
DiannaHohensee Dec 15, 2025
0353019
fix assertion...
DiannaHohensee Dec 15, 2025
bbf524d
actually fixed test?
DiannaHohensee Dec 15, 2025
91c6b4c
Merge branch 'main' into 2025/10/23/ES-12833
DiannaHohensee Dec 15, 2025
4773a4f
transport version fix
DiannaHohensee Dec 15, 2025
aac1157
fix msg and remove isSimulating non-existent scenario
DiannaHohensee Dec 16, 2025
1ff02aa
Merge branch 'main' into 2025/10/23/ES-12833
DiannaHohensee Dec 16, 2025
29f813e
fix transport version
DiannaHohensee Dec 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@
package org.elasticsearch.cluster.routing.allocation.decider;

import org.apache.logging.log4j.Level;
import org.elasticsearch.action.admin.cluster.allocation.ClusterAllocationExplainRequest;
import org.elasticsearch.action.admin.cluster.allocation.DesiredBalanceRequest;
import org.elasticsearch.action.admin.cluster.allocation.DesiredBalanceResponse;
import org.elasticsearch.action.admin.cluster.allocation.TransportClusterAllocationExplainAction;
import org.elasticsearch.action.admin.cluster.allocation.TransportGetDesiredBalanceAction;
import org.elasticsearch.action.admin.cluster.node.usage.NodeUsageStatsForThreadPoolsAction;
import org.elasticsearch.action.admin.cluster.node.usage.TransportNodeUsageStatsForThreadPoolsAction;
Expand All @@ -27,11 +29,14 @@
import org.elasticsearch.cluster.routing.RoutingNodes;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.UnassignedInfo;
import org.elasticsearch.cluster.routing.allocation.AllocationDecision;
import org.elasticsearch.cluster.routing.allocation.Explanations;
import org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintSettings;
import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator;
import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceMetrics;
import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceShardsAllocator;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.CollectionUtils;
import org.elasticsearch.core.TimeValue;
Expand Down Expand Up @@ -264,13 +269,55 @@ public void testShardsAreAssignedToNotPreferredWhenAlternativeIsNo() {
}
}

@TestLogging(
    reason = "track when reconciliation has completed",
    value = "org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceShardsAllocator:DEBUG"
)
public void testAllocationExplainMoveShardNotPreferred() {
    TestHarness harness = setUpThreeTestNodesAndAllIndexShardsOnFirstNode();

    // Running the {@link #runCanRemainNotPreferredIsIgnoredWhenAllOtherNodesReturnNotPreferred} logic should set up a cluster where
    // shards are all allocated to a {@link AllocationDeciders#canRemain} {@link Decision#NOT_PREFERRED} node, while the other nodes
    // return {@link AllocationDeciders#canAllocate} {@link Decision#NOT_PREFERRED} responses. This should exercise NOT_PREFERRED in
    // the allocation/explain paths for remaining on a node AND assignment to other nodes.
    runCanRemainNotPreferredIsIgnoredWhenAllOtherNodesReturnNotPreferred(harness);
    int numDataNodes = internalCluster().numDataNodes();
    assertThat(
        "test requires at least two nodes, one node for canRemain explanation, one for canAllocate explanation",
        numDataNodes,
        greaterThanOrEqualTo(2)
    );

    ClusterAllocationExplainRequest allocationExplainRequest = new ClusterAllocationExplainRequest(TEST_REQUEST_TIMEOUT).setIndex(
        harness.indexName
    ).setShard(0).setPrimary(true);
    var allocationExplainResponse = safeGet(client().execute(TransportClusterAllocationExplainAction.TYPE, allocationExplainRequest));
    logger.info("---> Allocation explain response: " + Strings.toString(allocationExplainResponse.getExplanation(), true, true));

    var decision = allocationExplainResponse.getExplanation().getShardAllocationDecision().getMoveDecision();
    assertThat("Rebalancing should be disabled", decision.canRebalanceCluster(), equalTo(false));
    assertThat(decision.getCanRemainDecision().type(), equalTo(Decision.NOT_PREFERRED.type()));
    assertNull(decision.getTargetNode());
    assertThat(decision.getAllocationDecision(), equalTo(AllocationDecision.NOT_PREFERRED));

    // Reuse the MoveDecision fetched above rather than re-traversing the explain response.
    var canAllocateDecisions = decision.getNodeDecisions();
    assertThat(canAllocateDecisions.size(), equalTo(/* number of nodes to which the shard can be relocated = */ numDataNodes - 1));
    canAllocateDecisions.forEach(nodeDecision -> assertThat(nodeDecision.getNodeDecision(), equalTo(AllocationDecision.NOT_PREFERRED)));
}

@TestLogging(
    reason = "track when reconciliation has completed",
    value = "org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceShardsAllocator:DEBUG"
)
public void testCanRemainNotPreferredIsIgnoredWhenAllOtherNodesReturnNotPreferred() {
    // Delegates to the shared scenario runner so the same cluster set-up can be reused by other tests.
    final TestHarness testHarness = setUpThreeTestNodesAndAllIndexShardsOnFirstNode();
    runCanRemainNotPreferredIsIgnoredWhenAllOtherNodesReturnNotPreferred(testHarness);
}

private void runCanRemainNotPreferredIsIgnoredWhenAllOtherNodesReturnNotPreferred(TestHarness harness) {
/**
* Override the {@link TransportNodeUsageStatsForThreadPoolsAction} action on the data nodes to supply artificial thread pool write
* load stats. The stats will show all the nodes above the high utilization threshold, so they do not accept new shards, while the
Expand Down Expand Up @@ -455,6 +502,80 @@ public void testCanRemainRelocatesOneShardWhenAHotSpotOccurs() {
}
}

public void testAllocationExplainRebalancingNotPreferred() {
    var harness = setUpThreeTestNodesAndAllIndexShardsOnFirstNode();

    // Report write thread pool usage above the utilization threshold on every data node (with no queuing to force a
    // relocation), so canAllocate refuses new shards everywhere while rebalancing tries to spread the shards that are
    // currently piled onto a single node.
    for (var discoveryNode : List.of(harness.firstDiscoveryNode, harness.secondDiscoveryNode, harness.thirdDiscoveryNode)) {
        final NodeUsageStatsForThreadPools aboveUtilizationThresholdStats = createNodeUsageStatsForThreadPools(
            discoveryNode,
            harness.randomNumberOfWritePoolThreads,
            randomIntBetween(harness.randomUtilizationThresholdPercent, 100) / 100f,
            0
        );
        setUpMockTransportNodeUsageStatsResponse(discoveryNode, aboveUtilizationThresholdStats);
    }

    // Override the {@link TransportIndicesStatsAction} action on the data nodes to supply artificial shard write load stats. The stats
    // will show that all shards have non-empty write load stats (so that the WriteLoadDecider will evaluate assigning them to a node).
    final ClusterState initialClusterState = internalCluster().getCurrentMasterNodeInstance(ClusterService.class).state();
    final IndexMetadata indexMetadata = initialClusterState.getMetadata().getProject().index(harness.indexName);
    setUpMockTransportIndicesStatsResponse(
        harness.firstDiscoveryNode,
        indexMetadata.getNumberOfShards(),
        createShardStatsResponseForIndex(indexMetadata, harness.maxShardWriteLoad, harness.firstDataNodeId)
    );
    setUpMockTransportIndicesStatsResponse(harness.secondDiscoveryNode, 0, List.of());
    setUpMockTransportIndicesStatsResponse(harness.thirdDiscoveryNode, 0, List.of());

    logger.info("---> Refreshing the cluster info to pull in the dummy thread pool stats from the data nodes");
    refreshClusterInfo();

    // Re-enable rebalancing and drop the exclusion setting that pinned all the shards onto a single node.
    updateClusterSettings(
        Settings.builder()
            .put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), EnableAllocationDecider.Rebalance.ALL)
            .putNull("cluster.routing.allocation.exclude._name")
    );

    logger.info("---> Rebalancing is now allowed");

    var explainRequest = new ClusterAllocationExplainRequest(TEST_REQUEST_TIMEOUT).setIndex(harness.indexName)
        .setShard(0)
        .setPrimary(true);
    var explainResponse = safeGet(client().execute(TransportClusterAllocationExplainAction.TYPE, explainRequest));
    logger.info("---> Allocation explain response: " + Strings.toString(explainResponse.getExplanation(), true, true));

    // The shard may stay where it is (canRemain YES), rebalancing is on, but every target answers NOT_PREFERRED.
    var moveDecision = explainResponse.getExplanation().getShardAllocationDecision().getMoveDecision();
    assertThat("Rebalancing should be enabled", moveDecision.canRebalanceCluster(), equalTo(true));
    assertThat(moveDecision.getCanRemainDecision().type(), equalTo(Decision.YES.type()));
    assertNull(moveDecision.getTargetNode());
    assertThat(moveDecision.getAllocationDecision(), equalTo(AllocationDecision.NOT_PREFERRED));
    assertThat(moveDecision.getExplanation(), equalTo(Explanations.Rebalance.NOT_PREFERRED));

    var nodeDecisions = moveDecision.getNodeDecisions();
    assertThat(nodeDecisions.size(), equalTo(2));
    nodeDecisions.forEach(nodeDecision -> assertThat(nodeDecision.getNodeDecision(), equalTo(AllocationDecision.NOT_PREFERRED)));
}

/**
* Determine which shard was moved and check that it's the "best" according to
* {@link org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator.Balancer.PrioritiseByShardWriteLoadComparator}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ public String getExplanation() {
TimeValue.timeValueMillis(remainingDelayInMillis)
);
case NO -> reuseStore ? Explanations.Allocation.EXISTING_STORES_FORBIDDEN : Explanations.Allocation.ALL_NODES_FORBIDDEN;
case WORSE_BALANCE, NO_ATTEMPT -> {
case WORSE_BALANCE, NO_ATTEMPT, NOT_PREFERRED -> {
assert false : getAllocationDecision();
yield getAllocationDecision().toString();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

package org.elasticsearch.cluster.routing.allocation;

import org.elasticsearch.TransportVersion;
import org.elasticsearch.cluster.routing.UnassignedInfo.AllocationStatus;
import org.elasticsearch.cluster.routing.allocation.decider.Decision;
import org.elasticsearch.common.io.stream.StreamInput;
Expand All @@ -31,59 +32,101 @@ public enum AllocationDecision implements Writeable {
* The allocation attempt was throttled for the shard.
*/
THROTTLED((byte) 1),
/**
* It is _not_ preferred to allocate a shard to this node, preference should be given to a YES node.
* This can happen when the shard allocation to a node is allowed, but the node resource usage is
* already high. Preference can be overridden if a shard's current allocation is no longer allowed
* and no other node responded YES to the shard relocation.
*/
NOT_PREFERRED((byte) 2),
/**
* The shard cannot be allocated, which can happen for any number of reasons,
* including the allocation deciders gave a NO decision for allocating.
*/
NO((byte) 2),
NO((byte) 3),
/**
* The shard could not be rebalanced to another node despite rebalancing
* being allowed, because moving the shard to the other node would not form
* a better cluster balance.
*/
WORSE_BALANCE((byte) 3),
WORSE_BALANCE((byte) 4),
/**
* Waiting on getting shard data from all nodes before making a decision
* about where to allocate the shard.
*/
AWAITING_INFO((byte) 4),
AWAITING_INFO((byte) 5),
/**
* The allocation decision has been delayed waiting for a replica with a shard copy
* that left the cluster to rejoin.
*/
ALLOCATION_DELAYED((byte) 5),
ALLOCATION_DELAYED((byte) 6),
/**
* The shard was denied allocation because there were no valid shard copies
* found for it amongst the nodes in the cluster.
*/
NO_VALID_SHARD_COPY((byte) 6),
NO_VALID_SHARD_COPY((byte) 7),
/**
* No attempt was made to allocate the shard
*/
NO_ATTEMPT((byte) 7);
NO_ATTEMPT((byte) 8);

private final byte id;
final byte id;

private static final TransportVersion ADD_NOT_PREFERRED_ALLOCATION_DECISION = TransportVersion.fromName(
"add_not_preferred_allocation_decision"
);

// Stable wire id for this decision; serialized/deserialized with version-dependent shifting in writeTo/readFrom.
AllocationDecision(byte id) {
    this.id = id;
}

@Override
public void writeTo(StreamOutput out) throws IOException {
    if (out.getTransportVersion().supports(ADD_NOT_PREFERRED_ALLOCATION_DECISION) == false) {
        // Old wire format, before NOT_PREFERRED existed.
        if (id == NOT_PREFERRED.id) {
            // NOT_PREFERRED was originally hidden / unimplemented and converted to YES. So for older versions continue to use YES.
            // NOTE: this branch must be exclusive of the shift below — without the else, a NOT_PREFERRED value would
            // write TWO bytes (YES followed by the shifted id), corrupting the stream for the receiving node.
            out.writeByte(YES.id);
        } else if (id > THROTTLED.id) {
            // NOT_PREFERRED was placed after THROTTLED in the enum list, so any subsequent values were pushed +1. Shift the enum
            // value back for older versions.
            out.writeByte((byte) (id - 1));
        } else {
            assert id == YES.id || id == THROTTLED.id;
            out.writeByte(id);
        }
    } else {
        out.writeByte(id);
    }
}

/**
 * Reads an {@link AllocationDecision} from the stream, mapping ids according to the sender's transport version.
 */
public static AllocationDecision readFrom(StreamInput in) throws IOException {
    final byte value = in.readByte();
    if (in.getTransportVersion().supports(ADD_NOT_PREFERRED_ALLOCATION_DECISION)) {
        // Current wire format: ids include NOT_PREFERRED.
        return switch (value) {
            case 0 -> YES;
            case 1 -> THROTTLED;
            case 2 -> NOT_PREFERRED;
            case 3 -> NO;
            case 4 -> WORSE_BALANCE;
            case 5 -> AWAITING_INFO;
            case 6 -> ALLOCATION_DELAYED;
            case 7 -> NO_VALID_SHARD_COPY;
            case 8 -> NO_ATTEMPT;
            default -> throw new IllegalArgumentException("Unknown value [" + value + "]");
        };
    }
    // Legacy wire format, from before NOT_PREFERRED was added to the enum.
    return switch (value) {
        case 0 -> YES;
        case 1 -> THROTTLED;
        case 2 -> NO;
        case 3 -> WORSE_BALANCE;
        case 4 -> AWAITING_INFO;
        case 5 -> ALLOCATION_DELAYED;
        case 6 -> NO_VALID_SHARD_COPY;
        case 7 -> NO_ATTEMPT;
        default -> throw new IllegalArgumentException("Unknown value [" + value + "]");
    };
}
Expand Down Expand Up @@ -111,8 +154,8 @@ public static AllocationDecision fromAllocationStatus(AllocationStatus allocatio
*/
public static AllocationDecision fromDecisionType(Decision.Type type) {
return switch (type) {
// TODO: should not_preferred have own variant? ES-12729
case YES, NOT_PREFERRED -> YES;
case YES -> YES;
case NOT_PREFERRED -> NOT_PREFERRED;
case THROTTLE -> THROTTLED;
case NO -> NO;
};
Expand Down
Loading