From e38efdeff41f03131a864253ac47acaca31fc131 Mon Sep 17 00:00:00 2001 From: "Doroszlai, Attila" Date: Fri, 6 Jan 2023 17:19:39 +0100 Subject: [PATCH 1/5] HDDS-7726. EC: Enhance datanode reconstruction log message --- .../ECReconstructionCommandInfo.java | 16 +++++++++++++--- .../ECReconstructionCoordinatorTask.java | 15 +++++++-------- .../commands/ReconstructECContainersCommand.java | 5 +++++ 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCommandInfo.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCommandInfo.java index f7d5e6a771ee..f10874722e13 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCommandInfo.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCommandInfo.java @@ -24,6 +24,8 @@ import java.util.Arrays; import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; /** * This class is to keep the required EC reconstruction info. @@ -77,16 +79,24 @@ public List getTargetDatanodes() { @Override public String toString() { + String src = sources.stream() + .map(Objects::toString) + .collect(Collectors.joining(", ")); + String target = targetDatanodes.stream() + .map(DatanodeDetails::getUuidString) + .collect(Collectors.joining(", ")); + return "ECReconstructionCommandInfo{" + "containerID=" + containerID - + ", ecReplicationConfig=" + ecReplicationConfig + + ", replication=" + ecReplicationConfig + ", missingContainerIndexes=" + Arrays .toString(missingContainerIndexes) - + ", sources=" + sources - + ", targetDatanodes=" + targetDatanodes + '}'; + + ", sources={" + src + "}" + + ", targets=[" + target + "]}"; } public long getTerm() { return term; } + } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinatorTask.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinatorTask.java index 5325314c256b..d75b6fbbb9cb 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinatorTask.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinatorTask.java @@ -19,6 +19,7 @@ import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.ozone.protocol.commands.ReconstructECContainersCommand.DatanodeDetailsAndReplicaIndex; +import org.apache.hadoop.util.Time; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -69,6 +70,7 @@ public void run() { // respective container. HDDS-6582 // 5. Close/finalize the recovered containers. long containerID = this.reconstructionCommandInfo.getContainerID(); + long start = Time.monotonicNow(); if (LOG.isDebugEnabled()) { LOG.debug("Starting the EC reconstruction of the container {}", containerID); @@ -88,7 +90,7 @@ public void run() { final long taskTerm = reconstructionCommandInfo.getTerm(); if (currentTerm.isPresent() && taskTerm < currentTerm.getAsLong()) { LOG.info("Ignoring {} since SCM leader has new term ({} < {})", - this, taskTerm, currentTerm.getAsLong()); + reconstructionCommandInfo, taskTerm, currentTerm.getAsLong()); return; } @@ -108,12 +110,10 @@ public void run() { reconstructionCommandInfo.getContainerID(), reconstructionCommandInfo.getEcReplicationConfig(), sourceNodeMap, targetNodeMap); - LOG.info("Completed the EC reconstruction of the container {}", - reconstructionCommandInfo.getContainerID()); + long elapsed = Time.monotonicNow() - start; + LOG.info("Completed {} in {} ms", reconstructionCommandInfo, elapsed); } catch (IOException e) { - LOG.warn( - "Failed to complete the reconstruction task for the container: " - + reconstructionCommandInfo.getContainerID(), e); + LOG.warn("Failed {}", reconstructionCommandInfo, e); } finally { this.inprogressCounter.remove(containerID); } @@ -121,7 +121,6 @@ public void run() { @Override public String toString() { - return "ECReconstructionCoordinatorTask{" + "reconstructionCommandInfo=" - + reconstructionCommandInfo + '}'; + return "ECReconstructionTask{info=" + reconstructionCommandInfo + '}'; } } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconstructECContainersCommand.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconstructECContainersCommand.java index 203e5e6bed07..b18f31426a73 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconstructECContainersCommand.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconstructECContainersCommand.java @@ -188,5 +188,10 @@ public boolean equals(Object o) { public int hashCode() { return Objects.hash(dnDetails, replicaIndex); } + + @Override + public String toString() { + return "[" + replicaIndex + "]:" + dnDetails.getUuidString(); + } } } From ead5752d03b9deae9063f0df4aa8507e47bce383 Mon Sep 17 00:00:00 2001 From: "Doroszlai, Attila" Date: Sat, 7 Jan 2023 12:00:20 +0100 Subject: [PATCH 2/5] Move source/target mapping to ECReconstructionCommandInfo --- .../ECReconstructionCommandInfo.java | 74 ++++++++++--------- .../ECReconstructionCoordinatorTask.java | 25 +------ 2 files changed, 44 insertions(+), 55 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCommandInfo.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCommandInfo.java index f10874722e13..31ef246867be 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCommandInfo.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCommandInfo.java @@ -23,21 +23,24 @@ import org.apache.hadoop.ozone.protocol.commands.ReconstructECContainersCommand.DatanodeDetailsAndReplicaIndex; import java.util.Arrays; -import java.util.List; -import java.util.Objects; -import java.util.stream.Collectors; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.stream.IntStream; + +import static java.util.Collections.unmodifiableSortedMap; +import static java.util.stream.Collectors.joining; +import static java.util.stream.Collectors.toMap; /** * This class is to keep the required EC reconstruction info. */ public class ECReconstructionCommandInfo { - private long containerID; - private ECReplicationConfig ecReplicationConfig; - private byte[] missingContainerIndexes; - private List - sources; - private List targetDatanodes; - private long deadlineMsSinceEpoch = 0; + private final SortedMap sourceNodeMap; + private final SortedMap targetNodeMap; + private final long containerID; + private final ECReplicationConfig ecReplicationConfig; + private final byte[] missingContainerIndexes; + private final long deadlineMsSinceEpoch; private final long term; public ECReconstructionCommandInfo(ReconstructECContainersCommand cmd) { @@ -46,10 +49,20 @@ public ECReconstructionCommandInfo(ReconstructECContainersCommand cmd) { this.missingContainerIndexes = Arrays.copyOf(cmd.getMissingContainerIndexes(), cmd.getMissingContainerIndexes().length); - this.sources = cmd.getSources(); - this.targetDatanodes = cmd.getTargetDatanodes(); this.deadlineMsSinceEpoch = cmd.getDeadline(); this.term = cmd.getTerm(); + + sourceNodeMap = cmd.getSources().stream() + .collect(toMap( + DatanodeDetailsAndReplicaIndex::getReplicaIndex, + DatanodeDetailsAndReplicaIndex::getDnDetails, + (v1, v2) -> v1, TreeMap::new)); + targetNodeMap = IntStream.range(0, cmd.getTargetDatanodes().size()) + .boxed() + .collect(toMap( + i -> (int) missingContainerIndexes[i], + i -> cmd.getTargetDatanodes().get(i), + (v1, v2) -> v1, TreeMap::new)); } public long getDeadline() { @@ -60,39 +73,32 @@ public long getContainerID() { return containerID; } - public byte[] getMissingContainerIndexes() { - return Arrays - .copyOf(missingContainerIndexes, missingContainerIndexes.length); - } - public ECReplicationConfig getEcReplicationConfig() { return ecReplicationConfig; } - public List getSources() { - return sources; + SortedMap getSourceNodeMap() { + return unmodifiableSortedMap(sourceNodeMap); } - public List getTargetDatanodes() { - return targetDatanodes; + SortedMap getTargetNodeMap() { + return unmodifiableSortedMap(targetNodeMap); } @Override public String toString() { - String src = sources.stream() - .map(Objects::toString) - .collect(Collectors.joining(", ")); - String target = targetDatanodes.stream() - .map(DatanodeDetails::getUuidString) - .collect(Collectors.joining(", ")); - - return "ECReconstructionCommandInfo{" + return "ECReconstructionCommand{" + "containerID=" + containerID - + ", replication=" + ecReplicationConfig - + ", missingContainerIndexes=" + Arrays - .toString(missingContainerIndexes) - + ", sources={" + src + "}" - + ", targets=[" + target + "]}"; + + ", replication=" + ecReplicationConfig.getReplication() + + ", missingIndexes=" + Arrays.toString(missingContainerIndexes) + + ", sources={" + toString(sourceNodeMap) + "}" + + ", targets={" + toString(targetNodeMap) + "}}"; + } + + private String toString(SortedMap nodeMap) { + return nodeMap.entrySet().stream() + .map(e -> e.getKey() + ":" + e.getValue().getUuidString()) + .collect(joining(",")); } public long getTerm() { diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinatorTask.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinatorTask.java index d75b6fbbb9cb..e25af56fc2f0 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinatorTask.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinatorTask.java @@ -17,8 +17,6 @@ */ package org.apache.hadoop.ozone.container.ec.reconstruction; -import org.apache.hadoop.hdds.protocol.DatanodeDetails; -import org.apache.hadoop.ozone.protocol.commands.ReconstructECContainersCommand.DatanodeDetailsAndReplicaIndex; import org.apache.hadoop.util.Time; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,17 +24,13 @@ import java.io.IOException; import java.time.Clock; import java.util.OptionalLong; -import java.util.SortedMap; -import java.util.TreeMap; import java.util.concurrent.ConcurrentHashMap; -import java.util.stream.Collectors; -import java.util.stream.IntStream; /** * This is the actual EC reconstruction coordination task. */ public class ECReconstructionCoordinatorTask implements Runnable { - static final Logger LOG = + private static final Logger LOG = LoggerFactory.getLogger(ECReconstructionCoordinatorTask.class); private final ConcurrentHashMap.KeySetView inprogressCounter; private final ECReconstructionCoordinator reconstructionCoordinator; @@ -94,22 +88,11 @@ public void run() { return; } - SortedMap sourceNodeMap = - reconstructionCommandInfo.getSources().stream().collect(Collectors - .toMap(DatanodeDetailsAndReplicaIndex::getReplicaIndex, - DatanodeDetailsAndReplicaIndex::getDnDetails, (v1, v2) -> v1, - TreeMap::new)); - SortedMap targetNodeMap = IntStream - .range(0, reconstructionCommandInfo.getTargetDatanodes().size()) - .boxed().collect(Collectors.toMap(i -> (int) reconstructionCommandInfo - .getMissingContainerIndexes()[i], - i -> reconstructionCommandInfo.getTargetDatanodes().get(i), - (v1, v2) -> v1, TreeMap::new)); - reconstructionCoordinator.reconstructECContainerGroup( reconstructionCommandInfo.getContainerID(), - reconstructionCommandInfo.getEcReplicationConfig(), sourceNodeMap, - targetNodeMap); + reconstructionCommandInfo.getEcReplicationConfig(), + reconstructionCommandInfo.getSourceNodeMap(), + reconstructionCommandInfo.getTargetNodeMap()); long elapsed = Time.monotonicNow() - start; LOG.info("Completed {} in {} ms", reconstructionCommandInfo, elapsed); } catch (IOException e) { From dc062062fcb78151969f9c3c11783dddf73ef955 Mon Sep 17 00:00:00 2001 From: "Doroszlai, Attila" Date: Sat, 7 Jan 2023 12:01:36 +0100 Subject: [PATCH 3/5] Remove unused toString --- .../protocol/commands/ReconstructECContainersCommand.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconstructECContainersCommand.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconstructECContainersCommand.java index b18f31426a73..203e5e6bed07 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconstructECContainersCommand.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/protocol/commands/ReconstructECContainersCommand.java @@ -188,10 +188,5 @@ public boolean equals(Object o) { public int hashCode() { return Objects.hash(dnDetails, replicaIndex); } - - @Override - public String toString() { - return "[" + replicaIndex + "]:" + dnDetails.getUuidString(); - } } } From e3c40d740385536265be928daa59178a85ba93fc Mon Sep 17 00:00:00 2001 From: "Doroszlai, Attila" Date: Mon, 9 Jan 2023 12:15:14 +0100 Subject: [PATCH 4/5] Add elapsed time in failure message --- .../ec/reconstruction/ECReconstructionCoordinatorTask.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinatorTask.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinatorTask.java index e25af56fc2f0..03d771da0489 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinatorTask.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCoordinatorTask.java @@ -96,7 +96,8 @@ public void run() { long elapsed = Time.monotonicNow() - start; LOG.info("Completed {} in {} ms", reconstructionCommandInfo, elapsed); } catch (IOException e) { - LOG.warn("Failed {}", reconstructionCommandInfo, e); + long elapsed = Time.monotonicNow() - start; + LOG.warn("Failed {} after {} ms", reconstructionCommandInfo, elapsed, e); } finally { this.inprogressCounter.remove(containerID); } From 104ab19132c331e4bcb0c0b6944ea63bc04adeb0 Mon Sep 17 00:00:00 2001 From: "Doroszlai, Attila" Date: Mon, 9 Jan 2023 12:37:36 +0100 Subject: [PATCH 5/5] Log hostname and IP instead of UUID --- .../ec/reconstruction/ECReconstructionCommandInfo.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCommandInfo.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCommandInfo.java index 31ef246867be..2851c735445f 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCommandInfo.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECReconstructionCommandInfo.java @@ -97,7 +97,7 @@ public String toString() { private String toString(SortedMap nodeMap) { return nodeMap.entrySet().stream() - .map(e -> e.getKey() + ":" + e.getValue().getUuidString()) + .map(e -> e.getKey() + ":" + e.getValue().getHostNameAndIP()) .collect(joining(",")); }