From 5a3ef8aafc8390abececc191189b30c1bece3583 Mon Sep 17 00:00:00 2001
From: tejaskriya
Date: Wed, 15 May 2024 13:36:08 +0530
Subject: [PATCH] HDDS-10859. Improve error messages when decommission and maintenance fail-early

---
 .../scm/node/NodeDecommissionManager.java     | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java
index 96628aa6cf47..df224d1d4494 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java
@@ -327,7 +327,6 @@ public synchronized List decommissionNodes(
       boolean decommissionPossible = checkIfDecommissionPossible(dns, errors);
       if (!decommissionPossible) {
         LOG.error("Cannot decommission nodes as sufficient node are not available.");
-        errors.add(new DatanodeAdminError("AllHosts", "Sufficient nodes are not available."));
         return errors;
       }
     } else {
@@ -436,10 +435,11 @@ private synchronized boolean checkIfDecommissionPossible(List d
           }
           int reqNodes = cif.getReplicationConfig().getRequiredNodes();
           if ((inServiceTotal - numDecom) < reqNodes) {
-            LOG.info("Cannot decommission nodes. Tried to decommission {} nodes of which valid nodes = {}. " +
-                "Cluster state: In-service nodes = {}, nodes required for replication = {}. " +
-                "Failing due to datanode : {}, container : {}",
-                dns.size(), numDecom, inServiceTotal, reqNodes, dn, cid);
+            String errorMsg = "Insufficient nodes. Tried to decommission " + dns.size() +
+                " nodes of which " + numDecom + " nodes were valid. Cluster has " + inServiceTotal +
+                " IN-SERVICE nodes, " + reqNodes + " of which are required for minimum replication. ";
+            LOG.info(errorMsg + "Failing due to datanode : {}, container : {}", dn, cid);
+            errors.add(new DatanodeAdminError("AllHosts", errorMsg));
             return false;
           }
         }
@@ -495,7 +495,6 @@ public synchronized List startMaintenanceNodes(
       boolean maintenancePossible = checkIfMaintenancePossible(dns, errors);
       if (!maintenancePossible) {
         LOG.error("Cannot put nodes to maintenance as sufficient node are not available.");
-        errors.add(new DatanodeAdminError("AllHosts", "Sufficient nodes are not available."));
         return errors;
       }
     } else {
@@ -600,11 +599,11 @@ private synchronized boolean checkIfMaintenancePossible(List dn
             minInService = maintenanceReplicaMinimum;
           }
           if ((inServiceTotal - numMaintenance) < minInService) {
-            LOG.info("Cannot enter nodes into maintenance. Tried to start maintenance for {} nodes " +
-                "of which valid nodes = {}. " +
-                "Cluster state: In-service nodes = {}, nodes required for replication = {}. " +
-                "Failing due to datanode : {}, container : {}",
-                dns.size(), numMaintenance, inServiceTotal, minInService, dn, cid);
+            String errorMsg = "Insufficient nodes. Tried to start maintenance for " + dns.size() +
+                " nodes of which " + numMaintenance + " nodes were valid. Cluster has " + inServiceTotal +
+                " IN-SERVICE nodes, " + minInService + " of which are required for minimum replication. ";
+            LOG.info(errorMsg + "Failing due to datanode : {}, container : {}", dn, cid);
+            errors.add(new DatanodeAdminError("AllHosts", errorMsg));
             return false;
           }
         }