diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java index 995a5da111ba..c3a82f5cdca9 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java @@ -194,6 +194,7 @@ private void balance() { //if no new move option is generated, it means the cluster can //not be balanced any more , so just stop IterationResult iR = doIteration(); + metrics.incrementNumIterations(1); LOG.info("Result of this iteration of Container Balancer: {}", iR); if (iR == IterationResult.CAN_NOT_BALANCE_ANY_MORE) { stop(); @@ -267,15 +268,9 @@ private boolean initializeIteration() { datanodeUsageInfo.getDatanodeDetails())); this.totalNodesInCluster = datanodeUsageInfos.size(); - this.clusterCapacity = 0L; - this.clusterUsed = 0L; - this.clusterRemaining = 0L; - this.selectedContainers.clear(); - this.overUtilizedNodes.clear(); - this.underUtilizedNodes.clear(); - this.unBalancedNodes.clear(); - this.countDatanodesInvolvedPerIteration = 0; - this.sizeMovedPerIteration = 0; + + // reset some variables and metrics for this iteration + resetState(); clusterAvgUtilisation = calculateAvgUtilization(datanodeUsageInfos); if (LOG.isDebugEnabled()) { @@ -313,11 +308,7 @@ private boolean initializeIteration() { } if (Double.compare(utilization, upperLimit) > 0) { overUtilizedNodes.add(datanodeUsageInfo); - metrics.incrementDatanodesNumToBalance(1); - - metrics.setMaxDatanodeUtilizedPercentage(Math.max( - metrics.getMaxDatanodeUtilizedPercentage(), - ratioToPercent(utilization))); + metrics.incrementNumDatanodesUnbalanced(1); // amount of bytes greater than upper limit in this node Long overUtilizedBytes = ratioToBytes( @@ -328,7 +319,7 @@ private boolean initializeIteration() { totalOverUtilizedBytes += overUtilizedBytes; } else if (Double.compare(utilization, lowerLimit) < 0) { underUtilizedNodes.add(datanodeUsageInfo); - metrics.incrementDatanodesNumToBalance(1); + metrics.incrementNumDatanodesUnbalanced(1); // amount of bytes lesser than lower limit in this node Long underUtilizedBytes = ratioToBytes( @@ -341,7 +332,7 @@ private boolean initializeIteration() { withinThresholdUtilizedNodes.add(datanodeUsageInfo); } } - metrics.setDataSizeToBalanceGB( + metrics.incrementDataSizeUnbalancedGB( Math.max(totalOverUtilizedBytes, totalUnderUtilizedBytes) / OzoneConsts.GB); Collections.reverse(underUtilizedNodes); @@ -451,7 +442,7 @@ private void checkIterationMoveResults(Set selectedTargets) { ContainerInfo container = containerManager.getContainer(moveSelection.getContainerID()); this.sizeMovedPerIteration += container.getUsedBytes(); - metrics.incrementMovedContainersNum(1); + metrics.incrementNumMovedContainersInLatestIteration(1); LOG.info("Move completed for container {} to target {}", container.containerID(), moveSelection.getTargetNode().getUuidString()); @@ -462,7 +453,8 @@ private void checkIterationMoveResults(Set selectedTargets) { } } } catch (InterruptedException e) { - LOG.warn("Container move for container {} was interrupted.", + LOG.warn("Interrupted while waiting for container move result for " + + "container {}.", moveSelection.getContainerID(), e); Thread.currentThread().interrupt(); } catch (ExecutionException e) { @@ -475,7 +467,9 @@ private void checkIterationMoveResults(Set selectedTargets) { } countDatanodesInvolvedPerIteration = sourceToTargetMap.size() + selectedTargets.size(); - metrics.incrementDataSizeMovedGB( + metrics.incrementNumDatanodesInvolvedInLatestIteration( + countDatanodesInvolvedPerIteration); + metrics.incrementDataSizeMovedGBInLatestIteration( sizeMovedPerIteration / OzoneConsts.GB); LOG.info("Number of datanodes involved in this iteration: {}. Size moved " + "in this iteration: {}B.", @@ -740,6 +734,26 @@ private void incSizeSelectedForMoving(DatanodeDetails source, findTargetStrategy.increaseSizeEntering(target, size); } + /** + * Resets some variables and metrics for this iteration. + */ + private void resetState() { + this.clusterCapacity = 0L; + this.clusterUsed = 0L; + this.clusterRemaining = 0L; + this.selectedContainers.clear(); + this.overUtilizedNodes.clear(); + this.underUtilizedNodes.clear(); + this.unBalancedNodes.clear(); + this.countDatanodesInvolvedPerIteration = 0; + this.sizeMovedPerIteration = 0; + metrics.resetDataSizeMovedGBInLatestIteration(); + metrics.resetNumMovedContainersInLatestIteration(); + metrics.resetNumDatanodesInvolvedInLatestIteration(); + metrics.resetDataSizeUnbalancedGB(); + metrics.resetNumDatanodesUnbalanced(); + } + /** * Stops ContainerBalancer. */ diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java index 984787fdecc1..07998447ccad 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java @@ -23,8 +23,7 @@ import org.apache.hadoop.metrics2.annotation.Metric; import org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; -import org.apache.hadoop.metrics2.lib.MutableGaugeInt; -import org.apache.hadoop.metrics2.lib.MutableGaugeLong; +import org.apache.hadoop.metrics2.lib.MutableCounterLong; /** * Metrics related to Container Balancer running in SCM. @@ -37,27 +36,26 @@ public final class ContainerBalancerMetrics { private final MetricsSystem ms; - @Metric(about = "The total amount of used space in GigaBytes that needs to " + - "be balanced.") - private MutableGaugeLong dataSizeToBalanceGB; + @Metric(about = "Amount of Gigabytes that Container Balancer moved" + + " in the latest iteration.") + private MutableCounterLong dataSizeMovedGBInLatestIteration; - @Metric(about = "The amount of Giga Bytes that have been moved to achieve " + - "balance.") - private MutableGaugeLong dataSizeMovedGB; + @Metric(about = "Number of containers that Container Balancer moved" + + " in the latest iteration.") + private MutableCounterLong numMovedContainersInLatestIteration; - @Metric(about = "Number of containers that Container Balancer has moved" + - " until now.") - private MutableGaugeLong movedContainersNum; + @Metric(about = "Number of iterations that Container Balancer has run for.") + private MutableCounterLong numIterations; - @Metric(about = "The total number of datanodes that need to be balanced.") - private MutableGaugeLong datanodesNumToBalance; + @Metric(about = "Number of datanodes that were involved in balancing in the" + + " latest iteration.") + private MutableCounterLong numDatanodesInvolvedInLatestIteration; - @Metric(about = "Number of datanodes that Container Balancer has balanced " + - "until now.") - private MutableGaugeLong datanodesNumBalanced; + @Metric(about = "Amount of data in Gigabytes that is causing unbalance.") + private MutableCounterLong dataSizeUnbalancedGB; - @Metric(about = "Utilisation value of the current maximum utilised datanode.") - private MutableGaugeInt maxDatanodeUtilizedPercentage; + @Metric(about = "Number of unbalanced datanodes.") + private MutableCounterLong numDatanodesUnbalanced; /** * Create and register metrics named {@link ContainerBalancerMetrics#NAME} @@ -75,82 +73,101 @@ private ContainerBalancerMetrics(MetricsSystem ms) { this.ms = ms; } - public long getDataSizeToBalanceGB() { - return dataSizeToBalanceGB.value(); + /** + * Gets the amount of data moved by Container Balancer in the latest + * iteration. + * @return size in GB + */ + public long getDataSizeMovedGBInLatestIteration() { + return dataSizeMovedGBInLatestIteration.value(); + } + + public void incrementDataSizeMovedGBInLatestIteration(long valueToAdd) { + this.dataSizeMovedGBInLatestIteration.incr(valueToAdd); } - public void setDataSizeToBalanceGB(long size) { - this.dataSizeToBalanceGB.set(size); + public void resetDataSizeMovedGBInLatestIteration() { + dataSizeMovedGBInLatestIteration.incr( + -getDataSizeMovedGBInLatestIteration()); } - public long getDataSizeMovedGB() { - return dataSizeMovedGB.value(); + /** + * Gets the number of containers moved by Container Balancer in the latest + * iteration. + * @return number of containers + */ + public long getNumMovedContainersInLatestIteration() { + return numMovedContainersInLatestIteration.value(); } - public void setDataSizeMovedGB(long dataSizeMovedGB) { - this.dataSizeMovedGB.set(dataSizeMovedGB); + public void incrementNumMovedContainersInLatestIteration(long valueToAdd) { + this.numMovedContainersInLatestIteration.incr(valueToAdd); } - public long incrementDataSizeMovedGB(long valueToAdd) { - this.dataSizeMovedGB.incr(valueToAdd); - return this.dataSizeMovedGB.value(); + public void resetNumMovedContainersInLatestIteration() { + numMovedContainersInLatestIteration.incr( + -getNumMovedContainersInLatestIteration()); } - public long getMovedContainersNum() { - return movedContainersNum.value(); + /** + * Gets the number of iterations that Container Balancer has run for. + * @return number of iterations + */ + public long getNumIterations() { + return numIterations.value(); } - public void setMovedContainersNum(long movedContainersNum) { - this.movedContainersNum.set(movedContainersNum); + public void incrementNumIterations(long valueToAdd) { + numIterations.incr(valueToAdd); } - public long incrementMovedContainersNum(long valueToAdd) { - this.movedContainersNum.incr(valueToAdd); - return this.movedContainersNum.value(); + /** + * Gets number of datanodes that were involved in balancing in the latest + * iteration. + * @return number of datanodes + */ + public long getNumDatanodesInvolvedInLatestIteration() { + return numDatanodesInvolvedInLatestIteration.value(); } - public long getDatanodesNumToBalance() { - return datanodesNumToBalance.value(); + public void incrementNumDatanodesInvolvedInLatestIteration(long valueToAdd) { + numDatanodesInvolvedInLatestIteration.incr(valueToAdd); } - public void setDatanodesNumToBalance(long datanodesNumToBalance) { - this.datanodesNumToBalance.set(datanodesNumToBalance); + public void resetNumDatanodesInvolvedInLatestIteration() { + numDatanodesInvolvedInLatestIteration.incr( + -getNumDatanodesInvolvedInLatestIteration()); } /** - * Add specified valueToAdd to the number of datanodes that need to be - * balanced. - * - * @param valueToAdd number of datanodes to add + * Gets the amount of data in Gigabytes that is causing unbalance. + * @return size of data as a long value */ - public void incrementDatanodesNumToBalance(long valueToAdd) { - this.datanodesNumToBalance.incr(valueToAdd); + public long getDataSizeUnbalancedGB() { + return dataSizeUnbalancedGB.value(); } - public long getDatanodesNumBalanced() { - return datanodesNumBalanced.value(); + public void incrementDataSizeUnbalancedGB(long valueToAdd) { + dataSizeUnbalancedGB.incr(valueToAdd); } - public void setDatanodesNumBalanced(long datanodesNumBalanced) { - this.datanodesNumBalanced.set(datanodesNumBalanced); + public void resetDataSizeUnbalancedGB() { + dataSizeUnbalancedGB.incr(-getDataSizeUnbalancedGB()); } /** - * Add specified valueToAdd to datanodesNumBalanced. - * - * @param valueToAdd The value to add. - * @return The result after addition. + * Gets the number of datanodes that are unbalanced. + * @return long value */ - public long incrementDatanodesNumBalanced(long valueToAdd) { - datanodesNumBalanced.incr(valueToAdd); - return datanodesNumBalanced.value(); + public long getNumDatanodesUnbalanced() { + return numDatanodesUnbalanced.value(); } - public int getMaxDatanodeUtilizedPercentage() { - return maxDatanodeUtilizedPercentage.value(); + public void incrementNumDatanodesUnbalanced(long valueToAdd) { + numDatanodesUnbalanced.incr(valueToAdd); } - public void setMaxDatanodeUtilizedPercentage(int percentage) { - this.maxDatanodeUtilizedPercentage.set(percentage); + public void resetNumDatanodesUnbalanced() { + numDatanodesUnbalanced.incr(-getNumDatanodesUnbalanced()); } } diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java index b23482b91573..8d58af1925a0 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java @@ -53,6 +53,7 @@ import org.slf4j.LoggerFactory; import org.slf4j.event.Level; +import java.time.Duration; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -214,15 +215,12 @@ public void unBalancedNodesListShouldBeEmptyWhenClusterIsBalanced() { balancerConfiguration.setThreshold(99.99); containerBalancer.start(balancerConfiguration); - // waiting for balance completed. - // TODO: this is a temporary implementation for now - // modify this after balancer is fully completed - try { - Thread.sleep(100); - } catch (InterruptedException e) { } + sleepWhileBalancing(100); containerBalancer.stop(); + ContainerBalancerMetrics metrics = containerBalancer.getMetrics(); Assert.assertEquals(0, containerBalancer.getUnBalancedNodes().size()); + Assert.assertEquals(0, metrics.getNumDatanodesUnbalanced()); } /** @@ -239,16 +237,15 @@ public void containerBalancerShouldObeyMaxDatanodesToInvolveLimit() { balancerConfiguration.setIterations(1); containerBalancer.start(balancerConfiguration); - // waiting for balance completed. - // TODO: this is a temporary implementation for now - // modify this after balancer is fully completed - try { - Thread.sleep(1000); - } catch (InterruptedException e) { } + sleepWhileBalancing(500); + int number = percent * numberOfNodes / 100; + ContainerBalancerMetrics metrics = containerBalancer.getMetrics(); Assert.assertFalse( - containerBalancer.getCountDatanodesInvolvedPerIteration() > - (percent * numberOfNodes / 100)); + containerBalancer.getCountDatanodesInvolvedPerIteration() > number); + Assert.assertTrue(metrics.getNumDatanodesInvolvedInLatestIteration() > 0); + Assert.assertFalse( + metrics.getNumDatanodesInvolvedInLatestIteration() > number); containerBalancer.stop(); } @@ -305,16 +302,16 @@ public void containerBalancerShouldObeyMaxSizeToMoveLimit() { balancerConfiguration.setIterations(1); containerBalancer.start(balancerConfiguration); - // waiting for balance completed. - // TODO: this is a temporary implementation for now - // modify this after balancer is fully completed - try { - Thread.sleep(1000); - } catch (InterruptedException e) { } + sleepWhileBalancing(500); // balancer should not have moved more size than the limit Assert.assertFalse(containerBalancer.getSizeMovedPerIteration() > 10 * OzoneConsts.GB); + + long size = + containerBalancer.getMetrics().getDataSizeMovedGBInLatestIteration(); + Assert.assertTrue(size > 0); + Assert.assertFalse(size > 10); containerBalancer.stop(); } @@ -500,29 +497,25 @@ public void balancerShouldObeyMaxSizeEnteringTargetLimit() { @Test public void testMetrics() { + conf.set("hdds.datanode.du.refresh.period", "1ms"); + balancerConfiguration.setBalancingInterval(Duration.ofMillis(2)); balancerConfiguration.setThreshold(10); balancerConfiguration.setIterations(1); - balancerConfiguration.setMaxSizeEnteringTarget(10 * OzoneConsts.GB); - balancerConfiguration.setMaxSizeToMovePerIteration(100 * OzoneConsts.GB); + balancerConfiguration.setMaxSizeEnteringTarget(6 * OzoneConsts.GB); + // deliberately set max size per iteration to a low value, 6GB + balancerConfiguration.setMaxSizeToMovePerIteration(6 * OzoneConsts.GB); balancerConfiguration.setMaxDatanodesPercentageToInvolvePerIteration(100); containerBalancer.start(balancerConfiguration); + sleepWhileBalancing(500); - // waiting for balance completed. - // TODO: this is a temporary implementation for now - // modify this after balancer is fully completed - try { - Thread.sleep(500); - } catch (InterruptedException e) { } - - containerBalancer.stop(); ContainerBalancerMetrics metrics = containerBalancer.getMetrics(); Assert.assertEquals(determineExpectedUnBalancedNodes( balancerConfiguration.getThreshold()).size(), - metrics.getDatanodesNumToBalance()); - Assert.assertEquals(ContainerBalancer.ratioToPercent( - nodeUtilizations.get(nodeUtilizations.size() - 1)), - metrics.getMaxDatanodeUtilizedPercentage()); + metrics.getNumDatanodesUnbalanced()); + Assert.assertTrue(metrics.getDataSizeMovedGBInLatestIteration() <= 6); + Assert.assertEquals(1, metrics.getNumIterations()); + containerBalancer.stop(); } /**