-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Tighten on when THROTTLE decision can be returned #136794
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 13 commits
9c56ded
e039dc1
4024c40
3a8664d
4af235f
7d19eac
548ea9d
c7f50d8
1f04641
4025f16
d36a3d9
69227ae
f7eba77
b905995
c62be8e
3750f3d
44c9c66
6cdfdc1
1726061
228c35d
d0883de
2c44ae1
abfe345
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -705,12 +705,15 @@ private boolean balanceByWeights(NodeSorter sorter) { | |
| highIdx = relevantNodes - 1; | ||
|
|
||
| if (routingNodes.getRelocatingShardCount() > 0) { | ||
| // ES-12955: Check routingNodes.getRelocatingShardCount() > 0 in case the first relocation is a THROTTLE. | ||
| // This should rarely happen since in most cases, we don't throttle unless there is an existing relocation. | ||
| // But it can happen in production for frozen indices when the cache is still being prepared. It can also | ||
| // happen in tests because we have deciders like RandomAllocationDecider that can randomly return THROTTLE | ||
| // when there is no existing relocation. | ||
| shardBalanced = true; | ||
| } else { | ||
| // A THROTTLE decision can happen when not simulating | ||
| assert allocation.isSimulating() == false | ||
| : "unexpected THROTTLE decision (simulation=" | ||
| + allocation.isSimulating() | ||
| + ") when balancing index [" | ||
| + index | ||
| + "]"; | ||
|
||
| } | ||
| if (completeEarlyOnShardAssignmentChange && shardBalanced) { | ||
| return true; | ||
|
|
@@ -835,6 +838,18 @@ public boolean moveShards() { | |
| } else if (moveDecision.isDecisionTaken() && moveDecision.canRemain() == false) { | ||
| logger.trace("[{}][{}] can't move", shardRouting.index(), shardRouting.id()); | ||
| } | ||
|
|
||
| // A THROTTLE allocation decision can happen when not simulating | ||
| assert moveDecision.getAllocationDecision() != AllocationDecision.THROTTLED || allocation.isSimulating() == false | ||
|
||
| : "unexpected allocation decision [" | ||
| + moveDecision.getAllocationDecision() | ||
| + "] (simulation=" | ||
| + allocation.isSimulating() | ||
| + ") with " | ||
| + (shardMoved ? "" : "no ") | ||
| + "prior shard movements when moving shard [" | ||
| + shardRouting | ||
| + "]"; | ||
| } | ||
|
|
||
| // If we get here, attempt to move one of the best not-preferred shards that we identified earlier | ||
|
|
@@ -1268,9 +1283,15 @@ private boolean allocateUnassigned() { | |
| assert allocationDecision.getAllocationStatus() == AllocationStatus.DECIDERS_THROTTLED; | ||
| final long shardSize = getExpectedShardSize(shard, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE, allocation); | ||
| minNode.addShard(projectIndex(shard), shard.initialize(minNode.getNodeId(), null, shardSize)); | ||
| // If we see a throttle decision in simulation, there must be other shards that got assigned before it. | ||
| // If we see a THROTTLE decision, it's either: | ||
| // 1. Not simulating | ||
| // 2. Or, there is a shard assigned before this one | ||
| assert allocation.isSimulating() == false || shardAssignmentChanged | ||
| : "shard " + shard + " was throttled but no other shards were assigned"; | ||
| : "unexpected THROTTLE decision (simulation=" | ||
| + allocation.isSimulating() | ||
| + ") with no prior assignment when allocating unassigned shard [" | ||
| + shard | ||
| + "]"; | ||
|
||
| } else { | ||
| if (logger.isTraceEnabled()) { | ||
| logger.trace("No Node found to assign shard [{}]", shard); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -486,6 +486,13 @@ public DesiredBalance compute( | |
| || info.lastAllocationStatus() == UnassignedInfo.AllocationStatus.DECIDERS_THROTTLED) : "Unexpected stats in: " + info; | ||
|
|
||
| if (hasChanges == false && info.lastAllocationStatus() == UnassignedInfo.AllocationStatus.DECIDERS_THROTTLED) { | ||
| // Unassigned ignored shards must be based on the provided set of ignoredShards | ||
| assert ignoredShards.contains(discardAllocationStatus(shard)) | ||
| || ignoredShards.stream().filter(ShardRouting::primary).anyMatch(primary -> primary.shardId().equals(shard.shardId())) | ||
| : "ignored shard " | ||
| + shard | ||
| + " unexpectedly has THROTTLE status and no counterpart in the provided ignoredShards set " | ||
| + ignoredShards; | ||
| // Simulation could not progress due to missing information in any of the deciders. | ||
| // Currently, this could happen if `HasFrozenCacheAllocationDecider` is still fetching the data. | ||
| // Progress would be made after the followup reroute call. | ||
|
Comment on lines
488
to
498
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The comment about |
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,6 +28,12 @@ public class HasFrozenCacheAllocationDecider extends AllocationDecider { | |
| "value of [" + SHARED_CACHE_SIZE_SETTING.getKey() + "] on this node is not known yet" | ||
| ); | ||
|
|
||
| private static final Decision NO_STILL_FETCHING = Decision.single( | ||
| Decision.Type.NO, | ||
| NAME, | ||
| "Shard movement is not allowed in simulation when value of [" + SHARED_CACHE_SIZE_SETTING.getKey() + "] on this node is not known" | ||
| ); | ||
|
|
||
| private static final Decision HAS_FROZEN_CACHE = Decision.single( | ||
| Decision.Type.YES, | ||
| NAME, | ||
|
|
@@ -48,6 +54,12 @@ public class HasFrozenCacheAllocationDecider extends AllocationDecider { | |
| "there was an error fetching the searchable snapshot shared cache state from this node" | ||
| ); | ||
|
|
||
| private static final Decision UNKNOWN_NODE = Decision.single( | ||
| Decision.Type.NO, | ||
| NAME, | ||
| "this node is unknown to the searchable snapshot shared cache state" | ||
| ); | ||
|
|
||
| private final FrozenCacheInfoService frozenCacheService; | ||
|
|
||
| public HasFrozenCacheAllocationDecider(FrozenCacheInfoService frozenCacheService) { | ||
|
|
@@ -56,25 +68,25 @@ public HasFrozenCacheAllocationDecider(FrozenCacheInfoService frozenCacheService | |
|
|
||
| @Override | ||
| public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { | ||
| return canAllocateToNode(allocation.metadata().indexMetadata(shardRouting.index()), node.node()); | ||
| return canAllocateToNode(allocation.metadata().indexMetadata(shardRouting.index()), node.node(), allocation); | ||
| } | ||
|
|
||
| @Override | ||
| public Decision canRemain(IndexMetadata indexMetadata, ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { | ||
| return canAllocateToNode(indexMetadata, node.node()); | ||
| return canAllocateToNode(indexMetadata, node.node(), allocation); | ||
| } | ||
|
|
||
| @Override | ||
| public Decision canAllocate(IndexMetadata indexMetadata, RoutingNode node, RoutingAllocation allocation) { | ||
| return canAllocateToNode(indexMetadata, node.node()); | ||
| return canAllocateToNode(indexMetadata, node.node(), allocation); | ||
| } | ||
|
|
||
| @Override | ||
| public Decision shouldAutoExpandToNode(IndexMetadata indexMetadata, DiscoveryNode node, RoutingAllocation allocation) { | ||
| return canAllocateToNode(indexMetadata, node); | ||
| return canAllocateToNode(indexMetadata, node, allocation); | ||
| } | ||
|
|
||
| private Decision canAllocateToNode(IndexMetadata indexMetadata, DiscoveryNode discoveryNode) { | ||
| private Decision canAllocateToNode(IndexMetadata indexMetadata, DiscoveryNode discoveryNode, RoutingAllocation allocation) { | ||
| if (indexMetadata.isPartialSearchableSnapshot() == false) { | ||
| return Decision.ALWAYS; | ||
| } | ||
|
|
@@ -83,7 +95,8 @@ private Decision canAllocateToNode(IndexMetadata indexMetadata, DiscoveryNode di | |
| case HAS_CACHE -> HAS_FROZEN_CACHE; | ||
| case NO_CACHE -> NO_FROZEN_CACHE; | ||
| case FAILED -> UNKNOWN_FROZEN_CACHE; | ||
| default -> STILL_FETCHING; | ||
| case FETCHING -> allocation.isSimulating() ? NO_STILL_FETCHING : STILL_FETCHING; | ||
| case UNKNOWN -> UNKNOWN_NODE; | ||
|
||
| }; | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.