-
Notifications
You must be signed in to change notification settings - Fork 587
HDDS-7252. Polled source Datanodes are wrongly not re-considered for balancing in Container Balancer #6305
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
HDDS-7252. Polled source Datanodes are wrongly not re-considered for balancing in Container Balancer #6305
Changes from all commits
e11f214
b670262
b6fea1f
6177ce1
5d754a0
d8cfa4e
79e491a
e721ce5
8290f79
fa36822
efac536
834fd06
f982c2c
7c82f06
ee90c03
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -553,6 +553,10 @@ private boolean processMoveSelection(DatanodeDetails source, | |
| containerID, | ||
| containerToSourceMap.get(containerID), | ||
| containerToTargetMap.get(containerID)); | ||
| // add source back to queue as a different container can be selected in next run. | ||
| findSourceStrategy.addBackSourceDataNode(source); | ||
| // exclude the container which caused failure of move to avoid error in next run. | ||
| selectionCriteria.addToExcludeDueToFailContainers(moveSelection.getContainerID()); | ||
| return false; | ||
| } | ||
|
|
||
|
|
@@ -563,6 +567,10 @@ private boolean processMoveSelection(DatanodeDetails source, | |
| } catch (ContainerNotFoundException e) { | ||
| LOG.warn("Could not get container {} from Container Manager before " + | ||
| "starting a container move", containerID, e); | ||
| // add source back to queue as a different container can be selected in next run. | ||
| findSourceStrategy.addBackSourceDataNode(source); | ||
| // exclude the container which caused failure of move to avoid error in next run. | ||
| selectionCriteria.addToExcludeDueToFailContainers(moveSelection.getContainerID()); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What about the same container on another source? Does it also need to stay excluded? I think we need to reconsider this and make the container exclusion apply at the source level only.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I missed this comment - @sumitagrawl do you want to take another look? I think @Tejaskriya has updated the PR now.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the review! I have created a Map of DN and containers to be excluded for that DN. Could you please take a look at the recent changes?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
What's your reasoning behind this? If container manager can't find this container I think we should avoid this container for any DN.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If 2 different sources have the same container chosen to be moved to a target to reduce usage at the source,
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When a container is not found in the SCM, the problem is likely outside of balancer and will affect any other sources trying to move the same container later. So I think we should exclude the container for all sources when it's a |
||
| return false; | ||
| } | ||
| LOG.info("ContainerBalancer is trying to move container {} with size " + | ||
|
|
@@ -862,13 +870,23 @@ private boolean moveContainer(DatanodeDetails source, | |
| } catch (ContainerNotFoundException e) { | ||
| LOG.warn("Could not find Container {} for container move", | ||
| containerID, e); | ||
| // add source back to queue as a different container can be selected in next run. | ||
| findSourceStrategy.addBackSourceDataNode(source); | ||
| // exclude the container which caused failure of move to avoid error in next run. | ||
| selectionCriteria.addToExcludeDueToFailContainers(moveSelection.getContainerID()); | ||
| metrics.incrementNumContainerMovesFailedInLatestIteration(1); | ||
| return false; | ||
| } catch (NodeNotFoundException | TimeoutException | | ||
| ContainerReplicaNotFoundException e) { | ||
| } catch (NodeNotFoundException | TimeoutException e) { | ||
| LOG.warn("Container move failed for container {}", containerID, e); | ||
| metrics.incrementNumContainerMovesFailedInLatestIteration(1); | ||
| return false; | ||
| } catch (ContainerReplicaNotFoundException e) { | ||
| LOG.warn("Container move failed for container {}", containerID, e); | ||
| metrics.incrementNumContainerMovesFailedInLatestIteration(1); | ||
| // add source back to queue for replica not found only | ||
| // the container is not excluded as it is a replica related failure | ||
| findSourceStrategy.addBackSourceDataNode(source); | ||
| return false; | ||
| } | ||
|
|
||
| /* | ||
|
|
@@ -881,6 +899,16 @@ private boolean moveContainer(DatanodeDetails source, | |
| } else { | ||
| MoveManager.MoveResult result = future.join(); | ||
| moveSelectionToFutureMap.put(moveSelection, future); | ||
| if (result == MoveManager.MoveResult.REPLICATION_FAIL_NOT_EXIST_IN_SOURCE || | ||
| result == MoveManager.MoveResult.REPLICATION_FAIL_EXIST_IN_TARGET || | ||
| result == MoveManager.MoveResult.REPLICATION_FAIL_CONTAINER_NOT_CLOSED || | ||
| result == MoveManager.MoveResult.REPLICATION_FAIL_INFLIGHT_DELETION || | ||
| result == MoveManager.MoveResult.REPLICATION_FAIL_INFLIGHT_REPLICATION) { | ||
| // add source back to queue as a different container can be selected in next run. | ||
| // the container which caused failure of move is not excluded | ||
| // as it is an intermittent failure or a replica related failure | ||
| findSourceStrategy.addBackSourceDataNode(source); | ||
| } | ||
| return result == MoveManager.MoveResult.COMPLETED; | ||
| } | ||
| } else { | ||
|
|
@@ -1098,6 +1126,11 @@ Set<DatanodeDetails> getSelectedTargets() { | |
| return selectedTargets; | ||
| } | ||
|
|
||
| @VisibleForTesting | ||
| Set<DatanodeDetails> getSelectedSources() { | ||
| return selectedSources; | ||
| } | ||
|
|
||
| @VisibleForTesting | ||
| int getCountDatanodesInvolvedPerIteration() { | ||
| return countDatanodesInvolvedPerIteration; | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.