From 0e9151cd99c04ed50b41952865ff1fce6dea484b Mon Sep 17 00:00:00 2001 From: Rishab Nahata Date: Wed, 14 Aug 2024 11:23:45 +0530 Subject: [PATCH] Make balanced shards allocator timebound Signed-off-by: Rishab Nahata --- .../allocator/BalancedShardsAllocator.java | 46 +++++++++++++++++-- .../allocator/LocalShardsBalancer.java | 29 +++++++++++- .../common/settings/ClusterSettings.java | 1 + 3 files changed, 72 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 212583d1fb14f..d200c0506aabe 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -54,6 +54,7 @@ import org.opensearch.common.settings.Setting; import org.opensearch.common.settings.Setting.Property; import org.opensearch.common.settings.Settings; +import org.opensearch.common.unit.TimeValue; import java.util.HashMap; import java.util.HashSet; @@ -87,6 +88,7 @@ public class BalancedShardsAllocator implements ShardsAllocator { private static final Logger logger = LogManager.getLogger(BalancedShardsAllocator.class); + public static final TimeValue MIN_ALLOCATOR_TIMEOUT = TimeValue.timeValueSeconds(20); public static final Setting INDEX_BALANCE_FACTOR_SETTING = Setting.floatSetting( "cluster.routing.allocation.balance.index", @@ -169,6 +171,23 @@ public class BalancedShardsAllocator implements ShardsAllocator { Property.NodeScope ); + public static final Setting ALLOCATOR_TIMEOUT_SETTING = Setting.timeSetting( + "cluster.routing.allocation.balanced_shards_allocator.allocator_timeout", + TimeValue.timeValueSeconds(20), + TimeValue.MINUS_ONE, + timeValue -> { + if (timeValue.compareTo(MIN_ALLOCATOR_TIMEOUT) < 0 && timeValue.compareTo(TimeValue.MINUS_ONE) != 0) { + throw new IllegalArgumentException( + "Setting [" + + "cluster.routing.allocation.balanced_shards_allocator.allocator_timeout" + + "] should be more than 20s or -1ms to disable timeout" + ); + } + }, + Setting.Property.NodeScope, + Setting.Property.Dynamic + ); + private volatile boolean movePrimaryFirst; private volatile ShardMovementStrategy shardMovementStrategy; @@ -181,6 +200,8 @@ public class BalancedShardsAllocator implements ShardsAllocator { private volatile float threshold; private volatile boolean ignoreThrottleInRestore; + private volatile TimeValue allocatorTimeout; + private long startTime; public BalancedShardsAllocator(Settings settings) { this(settings, new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS)); @@ -197,6 +218,7 @@ public BalancedShardsAllocator(Settings settings, ClusterSettings clusterSetting setPreferPrimaryShardBalance(PREFER_PRIMARY_SHARD_BALANCE.get(settings)); setPreferPrimaryShardRebalance(PREFER_PRIMARY_SHARD_REBALANCE.get(settings)); setShardMovementStrategy(SHARD_MOVEMENT_STRATEGY_SETTING.get(settings)); + setAllocatorTimeout(ALLOCATOR_TIMEOUT_SETTING.get(settings)); clusterSettings.addSettingsUpdateConsumer(PREFER_PRIMARY_SHARD_BALANCE, this::setPreferPrimaryShardBalance); clusterSettings.addSettingsUpdateConsumer(SHARD_MOVE_PRIMARY_FIRST_SETTING, this::setMovePrimaryFirst); clusterSettings.addSettingsUpdateConsumer(SHARD_MOVEMENT_STRATEGY_SETTING, this::setShardMovementStrategy); @@ -206,6 +228,7 @@ public BalancedShardsAllocator(Settings settings, ClusterSettings clusterSetting clusterSettings.addSettingsUpdateConsumer(PREFER_PRIMARY_SHARD_REBALANCE, this::setPreferPrimaryShardRebalance); clusterSettings.addSettingsUpdateConsumer(THRESHOLD_SETTING, this::setThreshold); clusterSettings.addSettingsUpdateConsumer(IGNORE_THROTTLE_FOR_REMOTE_RESTORE, this::setIgnoreThrottleInRestore); + clusterSettings.addSettingsUpdateConsumer(ALLOCATOR_TIMEOUT_SETTING, this::setAllocatorTimeout); } /** @@ -284,6 +307,20 @@ private void setThreshold(float threshold) { this.threshold = threshold; } + private void setAllocatorTimeout(TimeValue allocatorTimeout) { + this.allocatorTimeout = allocatorTimeout; + } + + private boolean allocatorTimedOut(long currentTime) { + if (allocatorTimeout.equals(TimeValue.MINUS_ONE)) { + if (logger.isTraceEnabled()) { + logger.trace("Allocator timeout is disabled. Will not short circuit allocator tasks"); + } + return false; + } + return currentTime - this.startTime > allocatorTimeout.nanos(); + } + @Override public void allocate(RoutingAllocation allocation) { if (allocation.routingNodes().size() == 0) { @@ -298,8 +335,10 @@ public void allocate(RoutingAllocation allocation) { threshold, preferPrimaryShardBalance, preferPrimaryShardRebalance, - ignoreThrottleInRestore + ignoreThrottleInRestore, + this::allocatorTimedOut ); + this.startTime = System.nanoTime(); localShardsBalancer.allocateUnassigned(); localShardsBalancer.moveShards(); localShardsBalancer.balance(); @@ -321,7 +360,8 @@ public ShardAllocationDecision decideShardAllocation(final ShardRouting shard, f threshold, preferPrimaryShardBalance, preferPrimaryShardRebalance, - ignoreThrottleInRestore + ignoreThrottleInRestore, + this::allocatorTimedOut ); AllocateUnassignedDecision allocateUnassignedDecision = AllocateUnassignedDecision.NOT_TAKEN; MoveDecision moveDecision = MoveDecision.NOT_TAKEN; @@ -585,7 +625,7 @@ public Balancer( float threshold, boolean preferPrimaryBalance ) { - super(logger, allocation, shardMovementStrategy, weight, threshold, preferPrimaryBalance, false, false); + super(logger, allocation, shardMovementStrategy, weight, threshold, preferPrimaryBalance, false, false, null); } } diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/LocalShardsBalancer.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/LocalShardsBalancer.java index 7e4ae58548c55..6d945a31f19d2 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/LocalShardsBalancer.java +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/LocalShardsBalancer.java @@ -41,6 +41,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.Function; import java.util.stream.Stream; import java.util.stream.StreamSupport; @@ -71,8 +72,10 @@ public class LocalShardsBalancer extends ShardsBalancer { private final float avgPrimaryShardsPerNode; private final BalancedShardsAllocator.NodeSorter sorter; private final Set inEligibleTargetNode; + private final Function timedOutFunc; private int totalShardCount = 0; + public LocalShardsBalancer( Logger logger, RoutingAllocation allocation, @@ -81,7 +84,8 @@ public LocalShardsBalancer( float threshold, boolean preferPrimaryBalance, boolean preferPrimaryRebalance, - boolean ignoreThrottleInRestore + boolean ignoreThrottleInRestore, + Function timedOutFunc ) { this.logger = logger; this.allocation = allocation; @@ -99,6 +103,7 @@ public LocalShardsBalancer( this.preferPrimaryRebalance = preferPrimaryRebalance; this.shardMovementStrategy = shardMovementStrategy; this.ignoreThrottleInRestore = ignoreThrottleInRestore; + this.timedOutFunc = timedOutFunc; } /** @@ -572,6 +577,15 @@ void moveShards() { return; } + // Terminate if the time allocated to the balanced shards allocator has elapsed + if (timedOutFunc.apply(System.nanoTime())) { + logger.info( + "Cannot move any shard in the cluster as time allocated to balanced shards allocator has elapsed" + + ". Skipping shard iteration" + ); + return; + } + ShardRouting shardRouting = it.next(); if (RoutingPool.REMOTE_CAPABLE.equals(RoutingPool.getShardPool(shardRouting, allocation))) { @@ -799,8 +813,21 @@ void allocateUnassigned() { int secondaryLength = 0; int primaryLength = primary.length; ArrayUtil.timSort(primary, comparator); + if (logger.isTraceEnabled()) { + logger.trace("Staring allocation of [{}] unassigned shards", primaryLength); + } do { for (int i = 0; i < primaryLength; i++) { + if (timedOutFunc.apply(System.nanoTime())) { + // TODO - maybe check if we can allow wait for active shards thingy bypass this condition + logger.info("Ignoring [{}] unassigned shards for allocation as time allocated to " + + "balanced shards allocator has elapsed", (primaryLength - i)); + while (i < primaryLength - 1) { + unassigned.ignoreShard(primary[i], UnassignedInfo.AllocationStatus.NO_ATTEMPT, allocation.changes()); + i++; + } + return; + } ShardRouting shard = primary[i]; final AllocateUnassignedDecision allocationDecision = decideAllocateUnassigned(shard); final String assignedNodeId = allocationDecision.getTargetNode() != null diff --git a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java index a73e5d44b7e02..f2d51f87cfa2a 100644 --- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java @@ -269,6 +269,7 @@ public void apply(Settings value, Settings current, Settings previous) { BalancedShardsAllocator.SHARD_MOVEMENT_STRATEGY_SETTING, BalancedShardsAllocator.THRESHOLD_SETTING, BalancedShardsAllocator.IGNORE_THROTTLE_FOR_REMOTE_RESTORE, + BalancedShardsAllocator.ALLOCATOR_TIMEOUT_SETTING, BreakerSettings.CIRCUIT_BREAKER_LIMIT_SETTING, BreakerSettings.CIRCUIT_BREAKER_OVERHEAD_SETTING, BreakerSettings.CIRCUIT_BREAKER_TYPE,