Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
774fbbc
HDDS-11367. Improve ozone balancing status command output
juncevich Aug 30, 2024
00ad471
HDDS-11367. Improve ozone balancing status command output
juncevich Sep 1, 2024
bac5794
HDDS-11367. Add space between Gb and Mb in getPrettySize
juncevich Sep 19, 2024
9dbd39c
HDDS-11367. Fix review notices + add unit tests
juncevich Oct 3, 2024
4f6af05
Merge branch 'master' into HDDS-11367
juncevich Oct 3, 2024
570025f
HDDS-11367. Small fixes + improve robot test
juncevich Oct 7, 2024
0c04a2a
HDDS-11367. Add licences to files
juncevich Oct 8, 2024
f1f11ec
HDDS-11367. Fix review notice
juncevich Oct 8, 2024
68766f9
HDDS-11367. Improve balancer robot test
juncevich Oct 8, 2024
4138ad7
HDDS-11367. Fix review notices
juncevich Oct 10, 2024
4d916a5
HDDS-11367. Fix review notices
juncevich Oct 15, 2024
75a103c
HDDS-11367. Fix review notices
juncevich Oct 16, 2024
33af9b1
HDDS-11367. Improve javadocs
juncevich Oct 17, 2024
8ff20ee
HDDS-11367. fix timeouts in balancer test
juncevich Oct 17, 2024
4a7d8cd
Merge branch 'master' into HDDS-11367
juncevich Oct 18, 2024
5284e6b
HDDS-11367. fix robo test
juncevich Oct 18, 2024
051ea1d
HDDS-11367. fix robo test
juncevich Oct 18, 2024
49b6e14
HDDS-11367. fix robo test
juncevich Oct 19, 2024
1513b9c
HDDS-11367. fix robo test
juncevich Oct 19, 2024
16683a2
HDDS-11367. fix robo test
juncevich Oct 19, 2024
899f314
HDDS-11367. fix robo test
juncevich Oct 20, 2024
c8e21d9
HDDS-11367. fix review notices
juncevich Oct 24, 2024
b0cb760
HDDS-11367. Fix review comments
juncevich Oct 25, 2024
d682ccc
HDDS-11367. Add licence info to classes
juncevich Oct 25, 2024
2c4e1f4
HDDS-11367. Partly fix review notices
juncevich Oct 28, 2024
ad0c95d
HDDS-11367. Fix review notices
juncevich Oct 30, 2024
4243f98
HDDS-11367. Fix review notices
juncevich Oct 30, 2024
e9080b3
HDDS-11367. Fix review notices
juncevich Oct 30, 2024
dabd4c7
HDDS-11367. Fix review notices
juncevich Oct 30, 2024
c6562fc
HDDS-11367. Fix tests
juncevich Nov 1, 2024
361ba74
HDDS-11367. Fix tests
juncevich Nov 1, 2024
78caa97
HDDS-11367. Fix tests
juncevich Nov 1, 2024
6a68f45
Merge branch 'master' into HDDS-11367
juncevich Nov 2, 2024
bae2a6b
HDDS-11367. Fix tests
juncevich Nov 2, 2024
bea5d60
HDDS-11367. Fix tests
juncevich Nov 4, 2024
9ae7527
HDDS-11367. Fix tests
juncevich Nov 4, 2024
9e8213b
HDDS-11367. Add tests
juncevich Nov 5, 2024
786fc4f
HDDS-11367. Fix review comments.
juncevich Nov 20, 2024
e77ea14
HDDS-11367. Fix review comments.
juncevich Nov 20, 2024
6ccd658
HDDS-11367. Fix flaky TestContainerBalancerStatusInfo.testGetCurrentS…
juncevich Nov 22, 2024
6acad2b
Merge branch 'master' into HDDS-11367
juncevich Nov 25, 2024
15a8558
HDDS-11367. Remove unstarted balancing iteration
juncevich Nov 25, 2024
2e60bf3
HDDS-11367. Fix review notice. Refactor saving iteration statistic.
juncevich Nov 25, 2024
e41c219
Merge branch 'master' into HDDS-11367
juncevich Dec 4, 2024
4ff34c7
HDDS-11367. Fix review flaky test.
juncevich Dec 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -627,19 +627,20 @@ message ContainerBalancerStatusInfo {
// Status of a single Container Balancer iteration.
// The former *GB-suffixed fields are replaced by unit-neutral names that keep
// the same field numbers, so every field number is declared exactly once.
message ContainerBalancerTaskIterationStatusInfo {
  optional int32 iterationNumber = 1;
  optional string iterationResult = 2;
  // Raw sizes, no longer pre-divided into GB (presumably bytes - confirm against the producer).
  optional int64 sizeScheduledForMove = 3;
  optional int64 dataSizeMoved = 4;
  optional int64 containerMovesScheduled = 5;
  optional int64 containerMovesCompleted = 6;
  optional int64 containerMovesFailed = 7;
  optional int64 containerMovesTimeout = 8;
  // Per-datanode amount of data entering / leaving during the iteration.
  repeated NodeTransferInfo sizeEnteringNodes = 9;
  repeated NodeTransferInfo sizeLeavingNodes = 10;
  // Presumably seconds (the producer subtracts epoch seconds) - confirm.
  optional int64 iterationDuration = 11;
}

// Amount of data transferred to or from one datanode, identified by uuid.
// Field number 2 is declared once, under the unit-neutral name.
message NodeTransferInfo {
  optional string uuid = 1;
  // Raw size, no longer pre-divided into GB (presumably bytes - confirm against the producer).
  optional int64 dataVolume = 2;
}

message DecommissionScmRequestProto {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ public final class ContainerBalancerMetrics {
" in the latest iteration.")
private MutableCounterLong dataSizeMovedGBInLatestIteration;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this metric now? Or is it for backward compatibility?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For backward compatibility.


@Metric(about = "Amount of bytes that Container Balancer moved in the latest iteration.")
private MutableCounterLong dataSizeMovedBytesInLatestIteration;

@Metric(about = "Number of completed container moves performed by " +
"Container Balancer in the latest iteration.")
private MutableCounterLong numContainerMovesCompletedInLatestIteration;
Expand Down Expand Up @@ -131,14 +134,16 @@ void incrementNumContainerMovesScheduledInLatestIteration(long valueToAdd) {
this.numContainerMovesScheduledInLatestIteration.incr(valueToAdd);
}

/**
 * Resets the counter of container moves scheduled in the latest iteration
 * by subtracting the counter's current value from it.
 */
public void resetNumContainerMovesScheduledInLatestIteration() {
  final long scheduledSoFar = getNumContainerMovesScheduledInLatestIteration();
  numContainerMovesScheduledInLatestIteration.incr(-scheduledSoFar);
}

/**
* Gets the amount of data moved by Container Balancer in the latest
* iteration.
* Gets the amount of data moved by Container Balancer in the latest iteration.
* @return size in GB
*/
public long getDataSizeMovedGBInLatestIteration() {
Expand All @@ -154,6 +159,29 @@ public void resetDataSizeMovedGBInLatestIteration() {
-getDataSizeMovedGBInLatestIteration());
}

/**
 * Returns the current value of the "bytes moved in the latest iteration" counter.
 *
 * @return amount of data moved by Container Balancer in the latest iteration, in bytes
 */
public long getDataSizeMovedInLatestIteration() {
  final long movedBytes = this.dataSizeMovedBytesInLatestIteration.value();
  return movedBytes;
}

/**
 * Adds the given amount to the "bytes moved in the latest iteration" counter.
 *
 * @param bytes number of bytes to add to the counter
 */
public void incrementDataSizeMovedInLatestIteration(long bytes) {
  dataSizeMovedBytesInLatestIteration.incr(bytes);
}

/**
 * Resets the "bytes moved in the latest iteration" counter by subtracting
 * the counter's current value from it.
 */
public void resetDataSizeMovedInLatestIteration() {
  final long movedBytes = getDataSizeMovedInLatestIteration();
  dataSizeMovedBytesInLatestIteration.incr(-movedBytes);
}

/**
* Gets the number of container moves performed by Container Balancer in the
* latest iteration.
Expand All @@ -163,11 +191,6 @@ public long getNumContainerMovesCompletedInLatestIteration() {
return numContainerMovesCompletedInLatestIteration.value();
}

/**
 * Adds the given amount to the counter of container moves completed in the
 * latest iteration.
 *
 * @param valueToAdd number of completed moves to add
 */
public void incrementNumContainerMovesCompletedInLatestIteration(long valueToAdd) {
  numContainerMovesCompletedInLatestIteration.incr(valueToAdd);
}

public void incrementCurrentIterationContainerMoveMetric(
MoveManager.MoveResult result, long valueToAdd) {
if (result == null) {
Expand Down Expand Up @@ -204,9 +227,11 @@ public void incrementCurrentIterationContainerMoveMetric(
}
}

/**
 * Resets the counter of container moves completed in the latest iteration
 * by subtracting the counter's current value from it.
 */
public void resetNumContainerMovesCompletedInLatestIteration() {
  // A single subtraction is enough; repeating the statement would only subtract
  // whatever was added to the counter in between the two calls.
  numContainerMovesCompletedInLatestIteration.incr(-getNumContainerMovesCompletedInLatestIteration());
}

/**
Expand All @@ -218,14 +243,19 @@ public long getNumContainerMovesTimeoutInLatestIteration() {
return numContainerMovesTimeoutInLatestIteration.value();
}

/**
 * Adds the given amount to the counter of container moves that timed out
 * in the latest iteration.
 *
 * @param valueToAdd number of timed-out moves to add
 */
public void incrementNumContainerMovesTimeoutInLatestIteration(long valueToAdd) {
  numContainerMovesTimeoutInLatestIteration.incr(valueToAdd);
}

/**
 * Resets the counter of container moves that timed out in the latest
 * iteration by subtracting the counter's current value from it.
 */
public void resetNumContainerMovesTimeoutInLatestIteration() {
  // A single subtraction is enough; repeating the statement would only subtract
  // whatever was added to the counter in between the two calls.
  numContainerMovesTimeoutInLatestIteration.incr(-getNumContainerMovesTimeoutInLatestIteration());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@
package org.apache.hadoop.hdds.scm.container.balancer;

import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos;

import java.time.OffsetDateTime;
import java.util.List;
import java.util.stream.Collectors;

/**
* Info about balancer status.
Expand Down Expand Up @@ -51,4 +53,21 @@ public HddsProtos.ContainerBalancerConfigurationProto getConfiguration() {
public List<ContainerBalancerTaskIterationStatusInfo> getIterationsStatusInfo() {
return iterationsStatusInfo;
}

/**
 * Converts this status info into its protobuf message.
 * The start time is written as epoch seconds, the configuration is passed through as is,
 * and every iteration status is converted with its own {@code toProto()}.
 *
 * @return proto representation of this status info
 */
public StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfo toProto() {
  StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfo.Builder builder =
      StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfo.newBuilder();
  builder.setStartedAt(getStartedAt().toEpochSecond());
  builder.setConfiguration(getConfiguration());
  builder.addAllIterationsStatusInfo(
      getIterationsStatusInfo().stream()
          .map(ContainerBalancerTaskIterationStatusInfo::toProto)
          .collect(Collectors.toList()));
  return builder.build();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@

import java.io.IOException;
import java.time.Duration;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
Expand All @@ -60,8 +61,10 @@
import java.util.concurrent.TimeoutException;
import java.util.stream.Collectors;

import static java.time.OffsetDateTime.now;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_NODE_REPORT_INTERVAL;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_NODE_REPORT_INTERVAL_DEFAULT;
import static org.apache.hadoop.util.StringUtils.byteDesc;

/**
* Container balancer task performs move of containers between over- and
Expand Down Expand Up @@ -118,7 +121,7 @@ public class ContainerBalancerTask implements Runnable {
private int nextIterationIndex;
private boolean delayStart;
private List<ContainerBalancerTaskIterationStatusInfo> iterationsStatistic;

private OffsetDateTime currentIterationStarted;
/**
* Constructs ContainerBalancerTask with the specified arguments.
*
Expand Down Expand Up @@ -215,6 +218,7 @@ private void balance() {
// leader change or restart
int i = nextIterationIndex;
for (; i < iterations && isBalancerRunning(); i++) {
currentIterationStarted = now();
// reset some variables and metrics for this iteration
resetState();
if (config.getTriggerDuEnable()) {
Expand Down Expand Up @@ -262,7 +266,7 @@ private void balance() {
}

IterationResult iR = doIteration();
saveIterationStatistic(i, iR);
saveIterationStatistic(i + 1, iR);
metrics.incrementNumIterations(1);

LOG.info("Result of this iteration of Container Balancer: {}", iR);
Expand Down Expand Up @@ -307,53 +311,53 @@ private void balance() {
}

private void saveIterationStatistic(Integer iterationNumber, IterationResult iR) {
long iterationDuration = now().toEpochSecond() - currentIterationStarted.toEpochSecond();
ContainerBalancerTaskIterationStatusInfo iterationStatistic = new ContainerBalancerTaskIterationStatusInfo(
iterationNumber,
iR.name(),
getSizeScheduledForMoveInLatestIteration() / OzoneConsts.GB,
metrics.getDataSizeMovedGBInLatestIteration(),
metrics.getNumContainerMovesScheduledInLatestIteration(),
metrics.getNumContainerMovesCompletedInLatestIteration(),
metrics.getNumContainerMovesFailedInLatestIteration(),
metrics.getNumContainerMovesTimeoutInLatestIteration(),
findTargetStrategy.getSizeEnteringNodes()
.entrySet()
.stream()
.filter(Objects::nonNull)
.filter(datanodeDetailsLongEntry -> datanodeDetailsLongEntry.getValue() > 0)
.collect(
Collectors.toMap(
entry -> entry.getKey().getUuid(),
entry -> entry.getValue() / OzoneConsts.GB
)
),
findSourceStrategy.getSizeLeavingNodes()
.entrySet()
.stream()
.filter(Objects::nonNull)
.filter(datanodeDetailsLongEntry -> datanodeDetailsLongEntry.getValue() > 0)
.collect(
Collectors.toMap(
entry -> entry.getKey().getUuid(),
entry -> entry.getValue() / OzoneConsts.GB
)
)
iterationNumber,
iR.name(),
iterationDuration,
getSizeScheduledForMoveInLatestIteration(),
metrics.getDataSizeMovedInLatestIteration(),
metrics.getNumContainerMovesScheduledInLatestIteration(),
metrics.getNumContainerMovesCompletedInLatestIteration(),
metrics.getNumContainerMovesFailedInLatestIteration(),
metrics.getNumContainerMovesTimeoutInLatestIteration(),
findTargetStrategy.getSizeEnteringNodes()
.entrySet()
.stream()
.filter(Objects::nonNull)
.filter(datanodeDetailsLongEntry -> datanodeDetailsLongEntry.getValue() > 0)
.collect(
Collectors.toMap(
entry -> entry.getKey().getUuid(),
Map.Entry::getValue
)
),
findSourceStrategy.getSizeLeavingNodes()
.entrySet()
.stream()
.filter(Objects::nonNull)
.filter(datanodeDetailsLongEntry -> datanodeDetailsLongEntry.getValue() > 0)
.collect(
Collectors.toMap(
entry -> entry.getKey().getUuid(),
Map.Entry::getValue
)
)
);
iterationsStatistic.add(iterationStatistic);
}

public List<ContainerBalancerTaskIterationStatusInfo> getCurrentIterationsStatistic() {

int lastIterationNumber = iterationsStatistic.stream()
.mapToInt(ContainerBalancerTaskIterationStatusInfo::getIterationNumber)
.max()
.orElse(0);

int lastIterationNumber = iterationsStatistic.isEmpty() ? 0
: iterationsStatistic.get(iterationsStatistic.size() - 1).getIterationNumber();
long iterationDuration = getCurrentIterationDuration();
ContainerBalancerTaskIterationStatusInfo currentIterationStatistic = new ContainerBalancerTaskIterationStatusInfo(
lastIterationNumber + 1,
lastIterationNumber,
null,
getSizeScheduledForMoveInLatestIteration() / OzoneConsts.GB,
sizeActuallyMovedInLatestIteration / OzoneConsts.GB,
iterationDuration,
getSizeScheduledForMoveInLatestIteration(),
sizeActuallyMovedInLatestIteration,
metrics.getNumContainerMovesScheduledInLatestIteration(),
metrics.getNumContainerMovesCompletedInLatestIteration(),
metrics.getNumContainerMovesFailedInLatestIteration(),
Expand All @@ -365,7 +369,7 @@ public List<ContainerBalancerTaskIterationStatusInfo> getCurrentIterationsStatis
.filter(datanodeDetailsLongEntry -> datanodeDetailsLongEntry.getValue() > 0)
.collect(Collectors.toMap(
entry -> entry.getKey().getUuid(),
entry -> entry.getValue() / OzoneConsts.GB
Map.Entry::getValue
)
),
findSourceStrategy.getSizeLeavingNodes()
Expand All @@ -376,7 +380,7 @@ public List<ContainerBalancerTaskIterationStatusInfo> getCurrentIterationsStatis
.collect(
Collectors.toMap(
entry -> entry.getKey().getUuid(),
entry -> entry.getValue() / OzoneConsts.GB
Map.Entry::getValue
)
)
);
Expand All @@ -385,6 +389,14 @@ public List<ContainerBalancerTaskIterationStatusInfo> getCurrentIterationsStatis
return resultList;
}

/**
 * Computes how long the current iteration has been running.
 *
 * @return difference between the current epoch second and the epoch second of the recorded
 * iteration start, or {@code -1} when no iteration start has been recorded
 */
private long getCurrentIterationDuration() {
  if (currentIterationStarted == null) {
    return -1L;
  }
  final long nowSeconds = now().toEpochSecond();
  final long startedSeconds = currentIterationStarted.toEpochSecond();
  return nowSeconds - startedSeconds;
}

/**
* Logs the reason for stop and save configuration and stop the task.
*
Expand Down Expand Up @@ -708,26 +720,28 @@ private void checkIterationMoveResults() {
}
}

countDatanodesInvolvedPerIteration =
selectedSources.size() + selectedTargets.size();
metrics.incrementNumDatanodesInvolvedInLatestIteration(
countDatanodesInvolvedPerIteration);
metrics.incrementNumContainerMovesScheduled(
metrics.getNumContainerMovesScheduledInLatestIteration());
metrics.incrementNumContainerMovesCompleted(
metrics.getNumContainerMovesCompletedInLatestIteration());
metrics.incrementNumContainerMovesTimeout(
metrics.getNumContainerMovesTimeoutInLatestIteration());
metrics.incrementDataSizeMovedGBInLatestIteration(
sizeActuallyMovedInLatestIteration / OzoneConsts.GB);
metrics.incrementDataSizeMovedGB(
metrics.getDataSizeMovedGBInLatestIteration());
metrics.incrementNumContainerMovesFailed(
metrics.getNumContainerMovesFailedInLatestIteration());
countDatanodesInvolvedPerIteration = selectedSources.size() + selectedTargets.size();

metrics.incrementNumDatanodesInvolvedInLatestIteration(countDatanodesInvolvedPerIteration);

metrics.incrementNumContainerMovesScheduled(metrics.getNumContainerMovesScheduledInLatestIteration());

metrics.incrementNumContainerMovesCompleted(metrics.getNumContainerMovesCompletedInLatestIteration());

metrics.incrementNumContainerMovesTimeout(metrics.getNumContainerMovesTimeoutInLatestIteration());

metrics.incrementDataSizeMovedGBInLatestIteration(sizeActuallyMovedInLatestIteration / OzoneConsts.GB);

metrics.incrementDataSizeMovedInLatestIteration(sizeActuallyMovedInLatestIteration);

metrics.incrementDataSizeMovedGB(metrics.getDataSizeMovedGBInLatestIteration());

metrics.incrementNumContainerMovesFailed(metrics.getNumContainerMovesFailedInLatestIteration());

LOG.info("Iteration Summary. Number of Datanodes involved: {}. Size " +
"moved: {} ({} Bytes). Number of Container moves completed: {}.",
countDatanodesInvolvedPerIteration,
StringUtils.byteDesc(sizeActuallyMovedInLatestIteration),
byteDesc(sizeActuallyMovedInLatestIteration),
sizeActuallyMovedInLatestIteration,
metrics.getNumContainerMovesCompletedInLatestIteration());
}
Expand Down Expand Up @@ -1146,6 +1160,7 @@ private void resetState() {
this.sizeScheduledForMoveInLatestIteration = 0;
this.sizeActuallyMovedInLatestIteration = 0;
metrics.resetDataSizeMovedGBInLatestIteration();
metrics.resetDataSizeMovedInLatestIteration();
metrics.resetNumContainerMovesScheduledInLatestIteration();
metrics.resetNumContainerMovesCompletedInLatestIteration();
metrics.resetNumContainerMovesTimeoutInLatestIteration();
Expand Down
Loading