diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 5f08b96a382a..ad05c6ef3ef7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -1877,6 +1877,8 @@ public List executeRegionPlansWithThrottling(List plans) } } } + LOG.info("Balancer is going into sleep until next period in {}ms", getConfiguration() + .getInt(HConstants.HBASE_BALANCER_PERIOD, HConstants.DEFAULT_HBASE_BALANCER_PERIOD)); return successRegionPlans; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java index ba214b73575c..fc5939e7dd09 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/balancer/StochasticLoadBalancer.java @@ -133,7 +133,8 @@ public class StochasticLoadBalancer extends BaseLoadBalancer { private List candidateGenerators; private List costFunctions; // FindBugs: Wants this protected; IS2_INCONSISTENT_SYNC - + // To save currently configed sum of multiplier. Defaulted at 1 for cases that carry high cost + private float sumMultiplier = 1.0f; // to save and report costs to JMX private double curOverallCost = 0d; private double[] tempFunctionCosts; @@ -229,7 +230,6 @@ protected void loadConf(Configuration conf) { regionReplicaHostCostFunction = new RegionReplicaHostCostFunction(conf); regionReplicaRackCostFunction = new RegionReplicaRackCostFunction(conf); - costFunctions = new ArrayList<>(); addCostFunction(new RegionCountSkewCostFunction(conf)); addCostFunction(new PrimaryRegionCountSkewCostFunction(conf)); @@ -310,63 +310,66 @@ private boolean areSomeRegionReplicasColocated(BalancerClusterState c) { boolean needsBalance(TableName tableName, BalancerClusterState cluster) { ClusterLoadState cs = new ClusterLoadState(cluster.clusterState); if (cs.getNumServers() < MIN_SERVER_BALANCE) { - if (LOG.isDebugEnabled()) { - LOG.debug("Not running balancer because only " + cs.getNumServers() - + " active regionserver(s)"); - } - if (this.isBalancerRejectionRecording) { - sendRejectionReasonToRingBuffer("The number of RegionServers " + - cs.getNumServers() + " < MIN_SERVER_BALANCE(" + MIN_SERVER_BALANCE + ")", null); - } + LOG.info("Not running balancer because only " + cs.getNumServers() + + " active regionserver(s)"); + sendRejectionReasonToRingBuffer( + "The number of RegionServers " + cs.getNumServers() + " < MIN_SERVER_BALANCE(" + + MIN_SERVER_BALANCE + ")", null); return false; } if (areSomeRegionReplicasColocated(cluster)) { + LOG.info("Running balancer because at least one server hosts replicas of the same region."); return true; } if (idleRegionServerExist(cluster)){ + LOG.info("Running balancer because cluster has idle server(s)."); return true; } + sumMultiplier = 0.0f; double total = 0.0; - float sumMultiplier = 0.0f; for (CostFunction c : costFunctions) { float multiplier = c.getMultiplier(); - if (multiplier <= 0) { - LOG.trace("{} not needed because multiplier is <= 0", c.getClass().getSimpleName()); - continue; - } + double cost = c.cost(); if (!c.isNeeded()) { LOG.trace("{} not needed", c.getClass().getSimpleName()); continue; } + total += cost * multiplier; sumMultiplier += multiplier; - total += c.cost() * multiplier; - } - - boolean balanced = total <= 0 || sumMultiplier <= 0 || - (sumMultiplier > 0 && (total / sumMultiplier) < minCostNeedBalance); - if(balanced && isBalancerRejectionRecording){ - String reason = ""; - if (total <= 0) { - reason = "(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern) = " + total + " <= 0"; - } else if (sumMultiplier <= 0) { - reason = "sumMultiplier = " + sumMultiplier + " <= 0"; - } else if ((total / sumMultiplier) < minCostNeedBalance) { - reason = - "[(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern)]/sumMultiplier = " + (total - / sumMultiplier) + " <= minCostNeedBalance(" + minCostNeedBalance + ")"; - } - sendRejectionReasonToRingBuffer(reason, costFunctions); - } - if (LOG.isDebugEnabled()) { - LOG.debug("{} {}; total cost={}, sum multiplier={}; cost/multiplier to need a balance is {}", - balanced ? "Skipping load balancing because balanced" : "We need to load balance", - isByTable ? String.format("table (%s)", tableName) : "cluster", - total, sumMultiplier, minCostNeedBalance); - if (LOG.isTraceEnabled()) { - LOG.trace("Balance decision detailed function costs={}", functionCost()); + } + if (sumMultiplier <= 0) { + LOG.error("At least one cost function needs a multiplier > 0. For example, set " + + "hbase.master.balancer.stochastic.regionCountCost to a positive value or default"); + return false; + } + + boolean balanced = (total / sumMultiplier < minCostNeedBalance); + if (balanced) { + if (isBalancerRejectionRecording) { + String reason = ""; + if (total <= 0) { + reason = "(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern) = " + + total + " <= 0"; + } else if (sumMultiplier <= 0) { + reason = "sumMultiplier = " + sumMultiplier + " <= 0"; + } else if ((total / sumMultiplier) < minCostNeedBalance) { + reason = + "[(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern)]/sumMultiplier = " + + (total / sumMultiplier) + " <= minCostNeedBalance(" + minCostNeedBalance + ")"; + } + sendRejectionReasonToRingBuffer(reason, costFunctions); } + LOG.info("{} - skipping load balancing because weighted average imbalance={} <= " + + "threshold({}). If you want more aggressive balancing, either lower " + + "hbase.master.balancer.stochastic.minCostNeedBalance from {} or increase the relative " + + "multiplier(s) of the specific cost function(s). functionCost={}", + isByTable ? "Table specific ("+tableName+")" : "Cluster wide", total / sumMultiplier, + minCostNeedBalance, minCostNeedBalance, functionCost()); + } else { + LOG.info("{} - Calculating plan. may take up to {}ms to complete.", + isByTable ? "Table specific ("+tableName+")" : "Cluster wide", maxRunningTime); } return !balanced; } @@ -452,8 +455,9 @@ protected List balanceTable(TableName tableName, Map balanceTable(TableName tableName, Map currentCost) { plans = createRegionPlans(cluster); - LOG.info("Finished computing new load balance plan. Computation took {}" + - " to try {} different iterations. Found a solution that moves " + - "{} regions; Going from a computed cost of {}" + - " to a new cost of {}", java.time.Duration.ofMillis(endTime - startTime), - step, plans.size(), initCost, currentCost); + LOG.info("Finished computing new moving plan. Computation took {} ms" + + " to try {} different iterations. Found a solution that moves " + + "{} regions; Going from a computed imbalance of {}" + + " to a new imbalance of {}. ", + endTime - startTime, step, plans.size(), + initCost / sumMultiplier, currentCost / sumMultiplier); + sendRegionPlansToRingBuffer(plans, currentCost, initCost, initFunctionTotalCosts, step); return plans; } - LOG.info("Could not find a better load balance plan. Tried {} different configurations in " + - "{}, and did not find anything with a computed cost less than {}", step, - java.time.Duration.ofMillis(endTime - startTime), initCost); + LOG.info("Could not find a better moving plan. Tried {} different configurations in " + + "{} ms, and did not find anything with an imbalance score less than {}", step, + endTime - startTime, initCost / sumMultiplier); return null; } @@ -520,8 +526,7 @@ private void sendRejectionReasonToRingBuffer(String reason, List c .setReason(reason); if (costFunctions != null) { for (CostFunction c : costFunctions) { - float multiplier = c.getMultiplier(); - if (multiplier <= 0 || !c.isNeeded()) { + if (!c.isNeeded()) { continue; } builder.addCostFuncInfo(c.getClass().getName(), c.cost(), c.getMultiplier()); @@ -580,7 +585,8 @@ private void updateStochasticCosts(TableName tableName, double overall, double[] } private void addCostFunction(CostFunction costFunction) { - if (costFunction.getMultiplier() > 0) { + float multiplier = costFunction.getMultiplier(); + if (multiplier > 0) { costFunctions.add(costFunction); } } @@ -591,9 +597,13 @@ private String functionCost() { builder.append(c.getClass().getSimpleName()); builder.append(" : ("); if (c.isNeeded()) { - builder.append(c.getMultiplier()); + builder.append("multiplier=" + c.getMultiplier()); builder.append(", "); - builder.append(c.cost()); + double cost = c.cost(); + builder.append("imbalance=" + cost); + if (cost < minCostNeedBalance) { + builder.append(", balanced"); + } } else { builder.append("not needed"); } @@ -605,7 +615,7 @@ private String functionCost() { private String totalCostsPerFunc() { StringBuilder builder = new StringBuilder(); for (CostFunction c : costFunctions) { - if (c.getMultiplier() <= 0 || !c.isNeeded()) { + if (!c.isNeeded()) { continue; } double cost = c.getMultiplier() * c.cost(); @@ -689,7 +699,7 @@ void initCosts(BalancerClusterState cluster) { allowedOnPath = ".*(/src/test/.*|StochasticLoadBalancer).java") void updateCostsWithAction(BalancerClusterState cluster, BalanceAction action) { for (CostFunction c : costFunctions) { - if (c.getMultiplier() > 0 && c.isNeeded()) { + if (c.isNeeded()) { c.postAction(action); } } @@ -728,7 +738,7 @@ String[] getCostFunctionNames() { CostFunction c = costFunctions.get(i); this.tempFunctionCosts[i] = 0.0; - if (c.getMultiplier() <= 0 || !c.isNeeded()) { + if (!c.isNeeded()) { continue; } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/BalancerTestBase.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/BalancerTestBase.java index 669f1cad17e0..435a59132c23 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/BalancerTestBase.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/BalancerTestBase.java @@ -75,7 +75,6 @@ public static void beforeAllTests() throws Exception { conf.setFloat("hbase.master.balancer.stochastic.maxMovePercent", 0.75f); conf.setFloat("hbase.regions.slop", 0.0f); conf.setFloat("hbase.master.balancer.stochastic.localityCost", 0); - conf.setBoolean("hbase.master.balancer.stochastic.runMaxSteps", true); loadBalancer = new StochasticLoadBalancer(); MasterServices services = mock(MasterServices.class); when(services.getConfiguration()).thenReturn(conf); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancer.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancer.java index 99491109a910..b0beac78a7da 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancer.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancer.java @@ -169,6 +169,7 @@ public void testNeedBalance() { for (boolean isByTable : perTableBalancerConfigs) { conf.setBoolean(HConstants.HBASE_MASTER_LOADBALANCE_BYTABLE, isByTable); loadBalancer.onConfigurationChange(conf); + for (int[] mockCluster : clusterStateMocks) { Map> servers = mockClusterServers(mockCluster); Map>> LoadOfAllTable = diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancerBalanceCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancerBalanceCluster.java index eb657d23ee8d..5018f59300d7 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancerBalanceCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancerBalanceCluster.java @@ -53,12 +53,13 @@ public class TestStochasticLoadBalancerBalanceCluster extends BalancerTestBase { public void testBalanceCluster() throws Exception { conf.setLong("hbase.master.balancer.stochastic.maxRunningTime", 3 * 60 * 1000); // 3 min conf.setFloat("hbase.master.balancer.stochastic.maxMovePercent", 1.0f); + conf.setLong(StochasticLoadBalancer.MAX_STEPS_KEY, 20000000L); loadBalancer.onConfigurationChange(conf); + for (int[] mockCluster : clusterStateMocks) { Map> servers = mockClusterServers(mockCluster); List list = convertToList(servers); LOG.info("Mock Cluster : " + printMock(list) + " " + printStats(list)); - Map>> LoadOfAllTable = (Map) mockClusterServersWithTables(servers); List plans = loadBalancer.balanceCluster(LoadOfAllTable); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancerLargeCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancerLargeCluster.java index e31cf132bce3..a218bbff018c 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancerLargeCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/balancer/TestStochasticLoadBalancerLargeCluster.java @@ -38,8 +38,8 @@ public void testLargeCluster() { int numRegionsPerServer = 80; // all servers except one int numTables = 100; int replication = 1; - conf.setLong("hbase.master.balancer.stochastic.maxRunningTime", 6 * 60 * 1000); - conf.setFloat("hbase.master.balancer.stochastic.maxMovePercent", 1.0f); + conf.setLong("hbase.master.balancer.stochastic.maxRunningTime", 3 * 60 * 1000); + conf.setBoolean("hbase.master.balancer.stochastic.runMaxSteps", true); loadBalancer.onConfigurationChange(conf); testWithCluster(numNodes, numRegions, numRegionsPerServer, replication, numTables, true, true); }