From b7adbd5475c43b823e2b838338ca87f3ed1acc06 Mon Sep 17 00:00:00 2001 From: VarshaRavi Date: Wed, 4 Sep 2024 12:59:18 +0530 Subject: [PATCH 1/5] HDDS-11380. Fixing the error message of node decommission to be more comprehensive --- .../hadoop/hdds/scm/node/NodeDecommissionManager.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java index d6f0e89c96d2..07703406e0a8 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java @@ -431,8 +431,9 @@ private synchronized boolean checkIfDecommissionPossible(List d int reqNodes = cif.getReplicationConfig().getRequiredNodes(); if ((inServiceTotal - numDecom) < reqNodes) { String errorMsg = "Insufficient nodes. Tried to decommission " + dns.size() + - " nodes of which " + numDecom + " nodes were valid. Cluster has " + inServiceTotal + - " IN-SERVICE nodes, " + reqNodes + " of which are required for minimum replication. "; + " nodes out of " + inServiceTotal + " IN-SERVICE nodes. Cannot decommission because we need " + + (reqNodes - (inServiceTotal - numDecom)) + " more IN-SERVICE nodes " + + "to maintain replication factor. "; LOG.info(errorMsg + "Failing due to datanode : {}, container : {}", dn, cid); errors.add(new DatanodeAdminError("AllHosts", errorMsg)); return false; @@ -595,8 +596,9 @@ private synchronized boolean checkIfMaintenancePossible(List dn } if ((inServiceTotal - numMaintenance) < minInService) { String errorMsg = "Insufficient nodes. Tried to start maintenance for " + dns.size() + - " nodes of which " + numMaintenance + " nodes were valid. Cluster has " + inServiceTotal + - " IN-SERVICE nodes, " + minInService + " of which are required for minimum replication. "; + " nodes out of " + inServiceTotal + " IN-SERVICE nodes. Cannot enter maintenance mode because " + + "we need " + (minInService - (inServiceTotal - numMaintenance)) + + " more IN-SERVICE nodes to maintain minimum replication. "; LOG.info(errorMsg + "Failing due to datanode : {}, container : {}", dn, cid); errors.add(new DatanodeAdminError("AllHosts", errorMsg)); return false; From 314313b3cbdbe877beed95b7d110234b8a73783d Mon Sep 17 00:00:00 2001 From: VarshaRavi Date: Thu, 5 Sep 2024 12:29:09 +0530 Subject: [PATCH 2/5] Fixing the error message based on review comments --- .../scm/node/NodeDecommissionManager.java | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java index 07703406e0a8..9a3b45d851e1 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java @@ -391,6 +391,8 @@ private synchronized boolean checkIfDecommissionPossible(List d int numDecom = dns.size(); List validDns = new ArrayList<>(dns); int inServiceTotal = nodeManager.getNodeCount(NodeStatus.inServiceHealthy()); + int unHealthyTotal = nodeManager.getNodeCount(NodeStatus.inServiceStale()) + + nodeManager.getNodeCount(NodeStatus.inServiceDead()); for (DatanodeDetails dn : dns) { try { NodeStatus nodeStatus = getNodeStatus(dn); @@ -398,10 +400,12 @@ private synchronized boolean checkIfDecommissionPossible(List d if (opState != NodeOperationalState.IN_SERVICE) { numDecom--; validDns.remove(dn); + LOG.warn("Cannot decommission " + dn.getHostName() + " because it is not IN-SERVICE"); } } catch (NodeNotFoundException ex) { numDecom--; validDns.remove(dn); + LOG.warn("Cannot decommission " + dn.getHostName() + " because it is not found in SCM"); } } @@ -431,9 +435,9 @@ private synchronized boolean checkIfDecommissionPossible(List d int reqNodes = cif.getReplicationConfig().getRequiredNodes(); if ((inServiceTotal - numDecom) < reqNodes) { String errorMsg = "Insufficient nodes. Tried to decommission " + dns.size() + - " nodes out of " + inServiceTotal + " IN-SERVICE nodes. Cannot decommission because we need " + - (reqNodes - (inServiceTotal - numDecom)) + " more IN-SERVICE nodes " + - "to maintain replication factor. "; + " nodes out of " + inServiceTotal + " IN-SERVICE HEALTHY and " + unHealthyTotal + + " UNHEALTHY nodes. Cannot decommission as a minimum of " + reqNodes + + " IN-SERVICE HEALTHY nodes are required to maintain replication after decommission. "; LOG.info(errorMsg + "Failing due to datanode : {}, container : {}", dn, cid); errors.add(new DatanodeAdminError("AllHosts", errorMsg)); return false; @@ -546,6 +550,8 @@ private synchronized boolean checkIfMaintenancePossible(List dn List validDns = dns.stream().collect(Collectors.toList()); Collections.copy(validDns, dns); int inServiceTotal = nodeManager.getNodeCount(NodeStatus.inServiceHealthy()); + int unHealthyTotal = nodeManager.getNodeCount(NodeStatus.inServiceStale()) + + nodeManager.getNodeCount(NodeStatus.inServiceDead()); for (DatanodeDetails dn : dns) { try { NodeStatus nodeStatus = getNodeStatus(dn); @@ -553,10 +559,12 @@ private synchronized boolean checkIfMaintenancePossible(List dn if (opState != NodeOperationalState.IN_SERVICE) { numMaintenance--; validDns.remove(dn); + LOG.warn(dn.getHostName() + " cannot enter maintenance because it is not IN-SERVICE"); } } catch (NodeNotFoundException ex) { numMaintenance--; validDns.remove(dn); + LOG.warn(dn.getHostName() + " cannot enter maintenance because it is not found in SCM"); } } @@ -596,9 +604,9 @@ private synchronized boolean checkIfMaintenancePossible(List dn } if ((inServiceTotal - numMaintenance) < minInService) { String errorMsg = "Insufficient nodes. Tried to start maintenance for " + dns.size() + - " nodes out of " + inServiceTotal + " IN-SERVICE nodes. Cannot enter maintenance mode because " + - "we need " + (minInService - (inServiceTotal - numMaintenance)) + - " more IN-SERVICE nodes to maintain minimum replication. "; + " nodes out of " + inServiceTotal + " IN-SERVICE HEALTHY and " + unHealthyTotal + + " UNHEALTHY nodes. Cannot enter maintenance mode as a minimum of " + minInService + + " IN-SERVICE HEALTHY nodes are required to maintain replication after maintenance. "; LOG.info(errorMsg + "Failing due to datanode : {}, container : {}", dn, cid); errors.add(new DatanodeAdminError("AllHosts", errorMsg)); return false; From cad52086d380623a1b63acf6dbe04c1c0d0aa9e1 Mon Sep 17 00:00:00 2001 From: VarshaRavi Date: Tue, 10 Sep 2024 11:21:42 +0530 Subject: [PATCH 3/5] Making changes based on review comments --- .../hdds/scm/node/NodeDecommissionManager.java | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java index 9a3b45d851e1..3bbfdb6f9ea0 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java @@ -391,8 +391,6 @@ private synchronized boolean checkIfDecommissionPossible(List d int numDecom = dns.size(); List validDns = new ArrayList<>(dns); int inServiceTotal = nodeManager.getNodeCount(NodeStatus.inServiceHealthy()); - int unHealthyTotal = nodeManager.getNodeCount(NodeStatus.inServiceStale()) + - nodeManager.getNodeCount(NodeStatus.inServiceDead()); for (DatanodeDetails dn : dns) { try { NodeStatus nodeStatus = getNodeStatus(dn); @@ -400,12 +398,12 @@ private synchronized boolean checkIfDecommissionPossible(List d if (opState != NodeOperationalState.IN_SERVICE) { numDecom--; validDns.remove(dn); - LOG.warn("Cannot decommission " + dn.getHostName() + " because it is not IN-SERVICE"); + LOG.warn("Cannot decommission {} because it is not IN-SERVICE", dn.getHostName()); } } catch (NodeNotFoundException ex) { numDecom--; validDns.remove(dn); - LOG.warn("Cannot decommission " + dn.getHostName() + " because it is not found in SCM"); + LOG.warn("Cannot decommission {} because it is not found in SCM", dn.getHostName()); } } @@ -434,6 +432,7 @@ private synchronized boolean checkIfDecommissionPossible(List d } int reqNodes = cif.getReplicationConfig().getRequiredNodes(); if ((inServiceTotal - numDecom) < reqNodes) { + int unHealthyTotal = nodeManager.getAllNodes().size() - inServiceTotal; String errorMsg = "Insufficient nodes. Tried to decommission " + dns.size() + " nodes out of " + inServiceTotal + " IN-SERVICE HEALTHY and " + unHealthyTotal + " UNHEALTHY nodes. Cannot decommission as a minimum of " + reqNodes + @@ -550,8 +549,6 @@ private synchronized boolean checkIfMaintenancePossible(List dn List validDns = dns.stream().collect(Collectors.toList()); Collections.copy(validDns, dns); int inServiceTotal = nodeManager.getNodeCount(NodeStatus.inServiceHealthy()); - int unHealthyTotal = nodeManager.getNodeCount(NodeStatus.inServiceStale()) + - nodeManager.getNodeCount(NodeStatus.inServiceDead()); for (DatanodeDetails dn : dns) { try { NodeStatus nodeStatus = getNodeStatus(dn); @@ -559,12 +556,12 @@ private synchronized boolean checkIfMaintenancePossible(List dn if (opState != NodeOperationalState.IN_SERVICE) { numMaintenance--; validDns.remove(dn); - LOG.warn(dn.getHostName() + " cannot enter maintenance because it is not IN-SERVICE"); + LOG.warn("{} cannot enter maintenance because it is not IN-SERVICE", dn.getHostName()); } } catch (NodeNotFoundException ex) { numMaintenance--; validDns.remove(dn); - LOG.warn(dn.getHostName() + " cannot enter maintenance because it is not found in SCM"); + LOG.warn("{} cannot enter maintenance because it is not found in SCM", dn.getHostName()); } } @@ -603,6 +600,7 @@ private synchronized boolean checkIfMaintenancePossible(List dn minInService = maintenanceReplicaMinimum; } if ((inServiceTotal - numMaintenance) < minInService) { + int unHealthyTotal = nodeManager.getAllNodes().size() - inServiceTotal; String errorMsg = "Insufficient nodes. Tried to start maintenance for " + dns.size() + " nodes out of " + inServiceTotal + " IN-SERVICE HEALTHY and " + unHealthyTotal + " UNHEALTHY nodes. Cannot enter maintenance mode as a minimum of " + minInService + From 608b9eba864801bc61aef0699c466992f2ea5229 Mon Sep 17 00:00:00 2001 From: VarshaRavi Date: Tue, 10 Sep 2024 15:00:25 +0530 Subject: [PATCH 4/5] Change the unhealthy keyword to not in-service or not healthy --- .../apache/hadoop/hdds/scm/node/NodeDecommissionManager.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java index 3bbfdb6f9ea0..e13a40e3a75f 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java @@ -435,7 +435,7 @@ private synchronized boolean checkIfDecommissionPossible(List d int unHealthyTotal = nodeManager.getAllNodes().size() - inServiceTotal; String errorMsg = "Insufficient nodes. Tried to decommission " + dns.size() + " nodes out of " + inServiceTotal + " IN-SERVICE HEALTHY and " + unHealthyTotal + - " UNHEALTHY nodes. Cannot decommission as a minimum of " + reqNodes + + " not IN-SERVICE or not HEALTHY nodes. Cannot decommission as a minimum of " + reqNodes + " IN-SERVICE HEALTHY nodes are required to maintain replication after decommission. "; LOG.info(errorMsg + "Failing due to datanode : {}, container : {}", dn, cid); errors.add(new DatanodeAdminError("AllHosts", errorMsg)); @@ -603,7 +603,7 @@ private synchronized boolean checkIfMaintenancePossible(List dn int unHealthyTotal = nodeManager.getAllNodes().size() - inServiceTotal; String errorMsg = "Insufficient nodes. Tried to start maintenance for " + dns.size() + " nodes out of " + inServiceTotal + " IN-SERVICE HEALTHY and " + unHealthyTotal + - " UNHEALTHY nodes. Cannot enter maintenance mode as a minimum of " + minInService + + " not IN-SERVICE or not HEALTHY nodes. Cannot enter maintenance mode as a minimum of " + minInService + " IN-SERVICE HEALTHY nodes are required to maintain replication after maintenance. "; LOG.info(errorMsg + "Failing due to datanode : {}, container : {}", dn, cid); errors.add(new DatanodeAdminError("AllHosts", errorMsg)); From cbd007f7bcc9666e8ce9a13483bf1249e85e403b Mon Sep 17 00:00:00 2001 From: VarshaRavi Date: Tue, 17 Sep 2024 12:33:05 +0530 Subject: [PATCH 5/5] Adding node count assertions for decomm and maintenance in existing tests under TestNodeDecommissionManager --- .../scm/node/TestNodeDecommissionManager.java | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionManager.java index 4511ffea5d2e..e5ab8c09e0c3 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionManager.java @@ -440,6 +440,10 @@ public void testInsufficientNodeDecommissionThrowsExceptionForRatis() throws error = decom.decommissionNodes(Arrays.asList(dns.get(1).getIpAddress(), dns.get(2).getIpAddress(), dns.get(3).getIpAddress(), dns.get(4).getIpAddress()), false); assertTrue(error.get(0).getHostname().contains("AllHosts")); + String errorMsg = String.format("%d IN-SERVICE HEALTHY and %d not IN-SERVICE or not HEALTHY nodes.", 5, 0); + assertTrue(error.get(0).getError().contains(errorMsg)); + errorMsg = String.format("Cannot decommission as a minimum of %d IN-SERVICE HEALTHY nodes are required", 3); + assertTrue(error.get(0).getError().contains(errorMsg)); assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE, nodeManager.getNodeStatus(dns.get(1)).getOperationalState()); assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE, @@ -489,6 +493,10 @@ public void testInsufficientNodeDecommissionThrowsExceptionForEc() throws error = decom.decommissionNodes(Arrays.asList(dns.get(1).getIpAddress()), false); assertTrue(error.get(0).getHostname().contains("AllHosts")); + String errorMsg = String.format("%d IN-SERVICE HEALTHY and %d not IN-SERVICE or not HEALTHY nodes.", 5, 0); + assertTrue(error.get(0).getError().contains(errorMsg)); + errorMsg = String.format("Cannot decommission as a minimum of %d IN-SERVICE HEALTHY nodes are required", 5); + assertTrue(error.get(0).getError().contains(errorMsg)); assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE, nodeManager.getNodeStatus(dns.get(1)).getOperationalState()); error = decom.decommissionNodes(Arrays.asList(dns.get(1).getIpAddress()), true); @@ -537,6 +545,10 @@ public void testInsufficientNodeDecommissionThrowsExceptionRatisAndEc() throws error = decom.decommissionNodes(Arrays.asList(dns.get(1).getIpAddress()), false); assertTrue(error.get(0).getHostname().contains("AllHosts")); + String errorMsg = String.format("%d IN-SERVICE HEALTHY and %d not IN-SERVICE or not HEALTHY nodes.", 5, 0); + assertTrue(error.get(0).getError().contains(errorMsg)); + errorMsg = String.format("Cannot decommission as a minimum of %d IN-SERVICE HEALTHY nodes are required", 5); + assertTrue(error.get(0).getError().contains(errorMsg)); assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE, nodeManager.getNodeStatus(dns.get(1)).getOperationalState()); error = decom.decommissionNodes(Arrays.asList(dns.get(1).getIpAddress()), true); @@ -637,6 +649,7 @@ public void testInsufficientNodeDecommissionChecksForNNF() throws error = decom.decommissionNodes(Arrays.asList(dns.get(0).getIpAddress(), dns.get(1).getIpAddress(), dns.get(2).getIpAddress()), false); assertFalse(error.get(0).getHostname().contains("AllHosts")); + assertTrue(error.get(0).getError().contains("The host was not found in SCM")); assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONING, nodeManager.getNodeStatus(dns.get(1)).getOperationalState()); assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONING, @@ -673,6 +686,11 @@ public void testInsufficientNodeMaintenanceThrowsExceptionForRatis() throws error = decom.startMaintenanceNodes(Arrays.asList(dns.get(1).getIpAddress(), dns.get(2).getIpAddress(), dns.get(3).getIpAddress(), dns.get(4).getIpAddress()), 100, false); assertTrue(error.get(0).getHostname().contains("AllHosts")); + String errorMsg = String.format("%d IN-SERVICE HEALTHY and %d not IN-SERVICE or not HEALTHY nodes.", 5, 0); + assertTrue(error.get(0).getError().contains(errorMsg)); + errorMsg = String.format("Cannot enter maintenance mode as a minimum of %d IN-SERVICE HEALTHY nodes are required", + 2); + assertTrue(error.get(0).getError().contains(errorMsg)); assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE, nodeManager.getNodeStatus(dns.get(1)).getOperationalState()); assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE, @@ -768,6 +786,11 @@ public void testInsufficientNodeMaintenanceThrowsExceptionForEc() throws error = decom.startMaintenanceNodes(Arrays.asList(dns.get(1).getIpAddress(), dns.get(2).getIpAddress()), 100, false); assertTrue(error.get(0).getHostname().contains("AllHosts")); + String errorMsg = String.format("%d IN-SERVICE HEALTHY and %d not IN-SERVICE or not HEALTHY nodes.", 5, 0); + assertTrue(error.get(0).getError().contains(errorMsg)); + errorMsg = String.format("Cannot enter maintenance mode as a minimum of %d IN-SERVICE HEALTHY nodes are required", + 4); + assertTrue(error.get(0).getError().contains(errorMsg)); assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE, nodeManager.getNodeStatus(dns.get(1)).getOperationalState()); assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE, @@ -869,6 +892,11 @@ public void testInsufficientNodeMaintenanceThrowsExceptionForRatisAndEc() throws // it should not be allowed as for EC, maintenance.remaining.redundancy is 2 => 3+2=5 DNs are required error = decom.startMaintenanceNodes(Arrays.asList(dns.get(1).getIpAddress()), 100, false); assertTrue(error.get(0).getHostname().contains("AllHosts")); + String errorMsg = String.format("%d IN-SERVICE HEALTHY and %d not IN-SERVICE or not HEALTHY nodes.", 5, 0); + assertTrue(error.get(0).getError().contains(errorMsg)); + errorMsg = String.format("Cannot enter maintenance mode as a minimum of %d IN-SERVICE HEALTHY nodes are required", + 5); + assertTrue(error.get(0).getError().contains(errorMsg)); assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE, nodeManager.getNodeStatus(dns.get(1)).getOperationalState());