diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ECReplicationConfig.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ECReplicationConfig.java index a6dbd933ff1d..9709029634c8 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ECReplicationConfig.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ECReplicationConfig.java @@ -218,6 +218,11 @@ public String configFormat() { + "/" + data + "-" + parity + "-" + chunkKB(); } + @Override + public int getMinimumNodes() { + return data; + } + private String chunkKB() { return ecChunkSize / 1024 + "k"; } diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/RatisReplicationConfig.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/RatisReplicationConfig.java index 36d4d90e1afb..9c42e3d59b17 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/RatisReplicationConfig.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/RatisReplicationConfig.java @@ -126,4 +126,9 @@ public String toString() { public String configFormat() { return toString(); } + + @Override + public int getMinimumNodes() { + return 1; + } } diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ReplicationConfig.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ReplicationConfig.java index 7542409679b0..d82cd08c08e8 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ReplicationConfig.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/ReplicationConfig.java @@ -234,4 +234,6 @@ static ReplicationConfig parseWithoutFallback(ReplicationType type, String configFormat(); + /** Minimum number of nodes, below this data loss happens. 
*/ + int getMinimumNodes(); } diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/StandaloneReplicationConfig.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/StandaloneReplicationConfig.java index 9ca2dfb538a9..0b82ab8c8727 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/StandaloneReplicationConfig.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/client/StandaloneReplicationConfig.java @@ -128,4 +128,9 @@ public String toString() { public String configFormat() { return toString(); } + + @Override + public int getMinimumNodes() { + return replicationFactor.getNumber(); + } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReportHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReportHandler.java index daadcd824ec8..36a51c4e3cac 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReportHandler.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerReportHandler.java @@ -26,11 +26,13 @@ .StorageContainerDatanodeProtocolProtos.ContainerReportsProto; import org.apache.hadoop.hdds.scm.ScmConfig; import org.apache.hadoop.hdds.scm.container.report.ContainerReportValidator; +import org.apache.hadoop.hdds.scm.events.SCMEvents; import org.apache.hadoop.hdds.scm.ha.SCMContext; import org.apache.hadoop.hdds.scm.node.NodeManager; import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException; import org.apache.hadoop.hdds.scm.server.SCMDatanodeHeartbeatDispatcher .ContainerReportFromDatanode; +import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer; import org.apache.hadoop.hdds.server.events.EventHandler; import org.apache.hadoop.hdds.server.events.EventPublisher; import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException; @@ -199,6 +201,11 @@ public void onMessage(final ContainerReportFromDatanode reportFromDatanode, // list processMissingReplicas(datanodeDetails, expectedContainersInDatanode); containerManager.notifyContainerReportProcessing(true, true); + if (reportFromDatanode.isRegister()) { + publisher.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, + new SCMDatanodeProtocolServer.NodeRegistrationContainerReport(datanodeDetails, + reportFromDatanode.getReport())); + } } } catch (NodeNotFoundException ex) { containerManager.notifyContainerReportProcessing(true, false); diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java index 4fcf130117e9..6c155a6ec5f5 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/events/SCMEvents.java @@ -64,6 +64,13 @@ public final class SCMEvents { NodeRegistrationContainerReport.class, "Node_Registration_Container_Report"); + /** + * Event generated on DataNode Registration Container Report. + */ + public static final TypedEvent + CONTAINER_REGISTRATION_REPORT = new TypedEvent<>( + NodeRegistrationContainerReport.class, "Container_Registration_Report"); + /** * ContainerReports are sent out by Datanodes. This report is received by * SCMDatanodeHeartbeatDispatcher and Container_Report Event is generated. 
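As a minimal sketch of the new getMinimumNodes() contract added above (not part of the patch; only the wrapper class name is invented, and it assumes the existing ECReplicationConfig constructor and the getInstance factories of the Ratis and Standalone configs), an EC config reports its data-block count, Ratis reports 1, and Standalone reports the replication factor itself:

    import org.apache.hadoop.hdds.client.ECReplicationConfig;
    import org.apache.hadoop.hdds.client.RatisReplicationConfig;
    import org.apache.hadoop.hdds.client.StandaloneReplicationConfig;
    import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor;

    public final class MinimumNodesSketch {
      public static void main(String[] args) {
        // EC rs-3-2: with fewer than 3 replicas the data blocks can no longer be reconstructed.
        System.out.println(new ECReplicationConfig(3, 2).getMinimumNodes());        // 3
        // Ratis THREE: a single surviving replica still holds the complete data.
        System.out.println(RatisReplicationConfig
            .getInstance(ReplicationFactor.THREE).getMinimumNodes());               // 1
        // Standalone: every replica is needed, so the factor itself is the minimum.
        System.out.println(StandaloneReplicationConfig
            .getInstance(ReplicationFactor.ONE).getMinimumNodes());                 // 1
      }
    }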
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java index ae645858a33d..accd805602e3 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java @@ -19,16 +19,25 @@ import java.util.List; import java.util.Map; -import java.util.Optional; +import java.util.UUID; +import java.util.Set; +import java.util.HashSet; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Collectors; +import com.google.common.collect.Sets; import org.apache.hadoop.hdds.HddsConfigKeys; +import org.apache.hadoop.hdds.client.ReplicationConfig; import org.apache.hadoop.hdds.conf.ConfigurationSource; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; +import org.apache.hadoop.hdds.scm.container.ContainerID; import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.ContainerManager; +import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; import org.apache.hadoop.hdds.scm.events.SCMEvents; import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport; import org.apache.hadoop.hdds.server.events.EventQueue; @@ -50,10 +59,13 @@ public class ContainerSafeModeRule extends // Required cutoff % for containers with at least 1 reported replica. private double safeModeCutoff; // Containers read from scm db (excluding containers in ALLOCATED state). - private Map containerMap; - private double maxContainer; - - private AtomicLong containerWithMinReplicas = new AtomicLong(0); + private Set ratisContainers; + private Set ecContainers; + private Map> ecContainerDNsMap; + private double ratisMaxContainer; + private double ecMaxContainer; + private AtomicLong ratisContainerWithMinReplicas = new AtomicLong(0); + private AtomicLong ecContainerWithMinReplicas = new AtomicLong(0); private final ContainerManager containerManager; public ContainerSafeModeRule(String ruleName, EventQueue eventQueue, @@ -71,127 +83,268 @@ public ContainerSafeModeRule(String ruleName, EventQueue eventQueue, HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT + " value should be >= 0.0 and <= 1.0"); - containerMap = new ConcurrentHashMap<>(); - containers.forEach(container -> { - // There can be containers in OPEN/CLOSING state which were never - // created by the client. We are not considering these containers for - // now. These containers can be handled by tracking pipelines. 
- - Optional.ofNullable(container.getState()) - .filter(state -> (state == HddsProtos.LifeCycleState.QUASI_CLOSED || - state == HddsProtos.LifeCycleState.CLOSED) - && container.getNumberOfKeys() > 0) - .ifPresent(s -> containerMap.put(container.getContainerID(), - container)); - }); - maxContainer = containerMap.size(); - long cutOff = (long) Math.ceil(maxContainer * safeModeCutoff); - getSafeModeMetrics().setNumContainerWithOneReplicaReportedThreshold(cutOff); + ratisContainers = new HashSet<>(); + ecContainers = new HashSet<>(); + ecContainerDNsMap = new ConcurrentHashMap<>(); - LOG.info("containers with one replica threshold count {}", cutOff); + initializeRule(containers); } @Override protected TypedEvent getEventType() { - return SCMEvents.NODE_REGISTRATION_CONT_REPORT; + return SCMEvents.CONTAINER_REGISTRATION_REPORT; } - @Override protected synchronized boolean validate() { - return getCurrentContainerThreshold() >= safeModeCutoff; + return (getCurrentContainerThreshold() >= safeModeCutoff) && + (getCurrentECContainerThreshold() >= safeModeCutoff); } @VisibleForTesting public synchronized double getCurrentContainerThreshold() { - if (maxContainer == 0) { + if (ratisMaxContainer == 0) { + return 1; + } + return (ratisContainerWithMinReplicas.doubleValue() / ratisMaxContainer); + } + + @VisibleForTesting + public synchronized double getCurrentECContainerThreshold() { + if (ecMaxContainer == 0) { + return 1; + } + return (ecContainerWithMinReplicas.doubleValue() / ecMaxContainer); + } + + private synchronized double getEcMaxContainer() { + if (ecMaxContainer == 0) { + return 1; + } + return ecMaxContainer; + } + + private synchronized double getRatisMaxContainer() { + if (ratisMaxContainer == 0) { return 1; } - return (containerWithMinReplicas.doubleValue() / maxContainer); + return ratisMaxContainer; } @Override protected synchronized void process( NodeRegistrationContainerReport reportsProto) { + DatanodeDetails datanodeDetails = reportsProto.getDatanodeDetails(); + UUID datanodeUUID = datanodeDetails.getUuid(); + StorageContainerDatanodeProtocolProtos.ContainerReportsProto report = reportsProto.getReport(); - reportsProto.getReport().getReportsList().forEach(c -> { - if (containerMap.containsKey(c.getContainerID())) { - if (containerMap.remove(c.getContainerID()) != null) { - containerWithMinReplicas.getAndAdd(1); - getSafeModeMetrics() - .incCurrentContainersWithOneReplicaReportedCount(); - } + report.getReportsList().forEach(c -> { + long containerID = c.getContainerID(); + + + // If it is a Ratis container. + if (ratisContainers.contains(containerID)) { + recordReportedContainer(containerID, Boolean.FALSE); + ratisContainers.remove(containerID); + } + + // If it is an EC container. + if (ecContainers.contains(containerID)) { + putInContainerDNsMap(containerID, ecContainerDNsMap, datanodeUUID); + recordReportedContainer(containerID, Boolean.TRUE); } }); if (scmInSafeMode()) { SCMSafeModeManager.getLogger().info( - "SCM in safe mode. {} % containers have at least one" - + " reported replica.", - (containerWithMinReplicas.doubleValue() / maxContainer) * 100); + "SCM in safe mode. {} % containers [Ratis] have at least one" + + " reported replica, {} % containers [EC] have at N reported replica.", + ((ratisContainerWithMinReplicas.doubleValue() / getRatisMaxContainer()) * 100), + ((ecContainerWithMinReplicas.doubleValue() / getEcMaxContainer()) * 100) + ); } } + /** + * Record the reported Container. + * + * We will differentiate and count according to the type of Container. 
* + * @param containerID containerID + * @param isEcContainer true means EC container, false means Ratis container. + */ + private void recordReportedContainer(long containerID, boolean isEcContainer) { + + int uuids = 1; + if (isEcContainer && ecContainerDNsMap.containsKey(containerID)) { + uuids = ecContainerDNsMap.get(containerID).size(); + } + + int minReplica = getMinReplica(containerID); + if (uuids >= minReplica) { + if (isEcContainer) { + getSafeModeMetrics() + .incCurrentContainersWithECDataReplicaReportedCount(); + ecContainerWithMinReplicas.getAndAdd(1); + } else { + ratisContainerWithMinReplicas.getAndAdd(1); + getSafeModeMetrics() + .incCurrentContainersWithOneReplicaReportedCount(); + } + } + } + + /** + * Get the minimum replica count. + * + * If it is a Ratis container, the minimum replica count is 1. + * If it is an EC container, the minimum replica count is the number of data blocks in the replicationConfig. + * + * @param pContainerID containerID + * @return the minimum replica count. + */ + private int getMinReplica(long pContainerID) { + + try { + ContainerID containerID = ContainerID.valueOf(pContainerID); + ContainerInfo container = containerManager.getContainer(containerID); + ReplicationConfig replicationConfig = container.getReplicationConfig(); + return replicationConfig.getMinimumNodes(); + } catch (ContainerNotFoundException e) { + LOG.error("containerId = {} not found.", pContainerID, e); + } catch (Exception e) { + LOG.error("Failed to get the replication config of containerId = {}.", pContainerID, e); + } + + return 1; + } + + private void putInContainerDNsMap(long containerID, Map<Long, Set<UUID>> containerDNsMap, + UUID datanodeUUID) { + containerDNsMap.computeIfAbsent(containerID, key -> Sets.newHashSet()); + containerDNsMap.get(containerID).add(datanodeUUID); + } + @Override protected synchronized void cleanup() { - containerMap.clear(); + ratisContainers.clear(); + ecContainers.clear(); + ecContainerDNsMap.clear(); } @Override public String getStatusText() { - List<Long> sampleContainers = containerMap.keySet() - .stream() - .limit(SAMPLE_CONTAINER_DISPLAY_LIMIT) - .collect(Collectors.toList()); - String status = String.format("%% of containers with at least one reported" - + " replica (=%1.2f) >= safeModeCutoff (=%1.2f)", + // ratis container + String status = String.format( + "%1.2f%% of [Ratis] Containers(%s / %s) with at least one reported replica (=%1.2f) >= " + + "safeModeCutoff (=%1.2f);", + (ratisContainerWithMinReplicas.doubleValue() / getRatisMaxContainer()) * 100, + ratisContainerWithMinReplicas, (long) getRatisMaxContainer(), getCurrentContainerThreshold(), this.safeModeCutoff); - if (!sampleContainers.isEmpty()) { + Set<Long> sampleRatisContainers = ratisContainers.stream(). + limit(SAMPLE_CONTAINER_DISPLAY_LIMIT). + collect(Collectors.toSet()); + + if (!sampleRatisContainers.isEmpty()) { String sampleContainerText = - "Sample containers not satisfying the criteria : " + sampleContainers; + "Sample Ratis Containers not satisfying the criteria : " + sampleRatisContainers + ";"; status = status.concat("\n").concat(sampleContainerText); } + // ec container + String ecStatus = String.format( + "%1.2f%% of [EC] Containers(%s / %s) with at least N reported replicas (=%1.2f) >= " + + "safeModeCutoff (=%1.2f);", + (ecContainerWithMinReplicas.doubleValue() / getEcMaxContainer()) * 100, + ecContainerWithMinReplicas, (long) getEcMaxContainer(), + getCurrentECContainerThreshold(), this.safeModeCutoff); + status = status.concat("\n").concat(ecStatus); + + Set<Long> sampleEcContainers = ecContainerDNsMap.entrySet().stream(). 
+ filter(entry -> { + Long containerId = entry.getKey(); + int minReplica = getMinReplica(containerId); + Set allReplicas = entry.getValue(); + if (allReplicas.size() >= minReplica) { + return false; + } + return true; + }). + map(Map.Entry::getKey). + limit(SAMPLE_CONTAINER_DISPLAY_LIMIT). + collect(Collectors.toSet()); + + if (!sampleEcContainers.isEmpty()) { + String sampleECContainerText = + "Sample EC Containers not satisfying the criteria : " + sampleEcContainers + ";"; + status = status.concat("\n").concat(sampleECContainerText); + } + return status; } @Override public synchronized void refresh(boolean forceRefresh) { + List containers = containerManager.getContainers(); if (forceRefresh) { - reInitializeRule(); + initializeRule(containers); } else { if (!validate()) { - reInitializeRule(); + initializeRule(containers); } } } - private void reInitializeRule() { - containerMap.clear(); - containerManager.getContainers().forEach(container -> { + private boolean checkContainerState(LifeCycleState state) { + if (state == LifeCycleState.QUASI_CLOSED || state == LifeCycleState.CLOSED) { + return true; + } + return false; + } + + private void initializeRule(List containers) { + + // Clean up the related data in the map. + ratisContainers.clear(); + ecContainers.clear(); + + // Iterate through the container list to + // get the minimum replica count for each container. + containers.forEach(container -> { // There can be containers in OPEN/CLOSING state which were never // created by the client. We are not considering these containers for // now. These containers can be handled by tracking pipelines. - Optional.ofNullable(container.getState()) - .filter(state -> (state == HddsProtos.LifeCycleState.QUASI_CLOSED || - state == HddsProtos.LifeCycleState.CLOSED) - && container.getNumberOfKeys() > 0) - .ifPresent(s -> containerMap.put(container.getContainerID(), - container)); + LifeCycleState containerState = container.getState(); + HddsProtos.ReplicationType replicationType = container.getReplicationType(); + + if (checkContainerState(containerState) && container.getNumberOfKeys() > 0) { + // If it's of type Ratis + if (replicationType.equals(HddsProtos.ReplicationType.RATIS)) { + ratisContainers.add(container.getContainerID()); + } + + // If it's of type EC + if (replicationType.equals(HddsProtos.ReplicationType.EC)) { + ecContainers.add(container.getContainerID()); + } + } }); - maxContainer = containerMap.size(); - long cutOff = (long) Math.ceil(maxContainer * safeModeCutoff); + ratisMaxContainer = ratisContainers.size(); + ecMaxContainer = ecContainers.size(); - LOG.info("Refreshed one replica container threshold {}, " + - "currentThreshold {}", cutOff, containerWithMinReplicas.get()); - getSafeModeMetrics() - .setNumContainerWithOneReplicaReportedThreshold(cutOff); - } + long ratisCutOff = (long) Math.ceil(ratisMaxContainer * safeModeCutoff); + long ecCutOff = (long) Math.ceil(ecMaxContainer * safeModeCutoff); + + getSafeModeMetrics().setNumContainerWithOneReplicaReportedThreshold(ratisCutOff); + getSafeModeMetrics().setNumContainerWithECDataReplicaReportedThreshold(ecCutOff); + LOG.info("Refreshed Containers with one replica threshold count {}, " + + "with ec n replica threshold count {}.", ratisCutOff, ecCutOff); + } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java index a5ecdb23425b..39530de16b6b 100644 --- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java @@ -341,6 +341,17 @@ public double getCurrentContainerThreshold() { .getCurrentContainerThreshold(); } + @VisibleForTesting + public double getCurrentECContainerThreshold() { + return ((ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE)) + .getCurrentECContainerThreshold(); + } + + @VisibleForTesting + public ContainerSafeModeRule getContainerSafeModeRule() { + return (ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE); + } + @VisibleForTesting public HealthyPipelineSafeModeRule getHealthyPipelineSafeModeRule() { return (HealthyPipelineSafeModeRule) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java index 02bc10ba6e40..44c77ac3de85 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java @@ -36,8 +36,12 @@ public class SafeModeMetrics { // These all values will be set to some values when safemode is enabled. private @Metric MutableGaugeLong numContainerWithOneReplicaReportedThreshold; + private @Metric MutableGaugeLong + numContainerWithECDataReplicaReportedThreshold; private @Metric MutableCounterLong currentContainersWithOneReplicaReportedCount; + private @Metric MutableCounterLong + currentContainersWithECDataReplicaReportedCount; // When hdds.scm.safemode.pipeline-availability.check is set then only // below metrics will have some values, otherwise they will be zero. 
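To make the counting rule behind these new EC gauges concrete, a self-contained sketch follows (illustrative names only, not code from the patch): a Ratis container satisfies the rule after a single reported replica, while an EC container is only counted once at least getMinimumNodes() distinct datanodes, i.e. the data-block count, have reported it, which is what ecContainerDNsMap accumulates in ContainerSafeModeRule:

    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;
    import java.util.UUID;

    // Standalone illustration of the EC counting rule tracked by the gauges above.
    public final class EcSafeModeCountingSketch {
      private final Map<Long, Set<UUID>> ecContainerDNs = new HashMap<>();

      // Returns true once the EC container has been reported by at least dataBlockNum distinct DNs.
      boolean ecContainerSatisfied(long containerId, UUID reportingDn, int dataBlockNum) {
        ecContainerDNs.computeIfAbsent(containerId, k -> new HashSet<>()).add(reportingDn);
        return ecContainerDNs.get(containerId).size() >= dataBlockNum;
      }

      public static void main(String[] args) {
        EcSafeModeCountingSketch sketch = new EcSafeModeCountingSketch();
        long containerId = 1L;
        int dataBlockNum = 3;  // e.g. EC-3-2-1024k
        System.out.println(sketch.ecContainerSatisfied(containerId, UUID.randomUUID(), dataBlockNum)); // false
        System.out.println(sketch.ecContainerSatisfied(containerId, UUID.randomUUID(), dataBlockNum)); // false
        System.out.println(sketch.ecContainerSatisfied(containerId, UUID.randomUUID(), dataBlockNum)); // true
      }
    }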
@@ -75,10 +79,18 @@ public void setNumContainerWithOneReplicaReportedThreshold(long val) { this.numContainerWithOneReplicaReportedThreshold.set(val); } + public void setNumContainerWithECDataReplicaReportedThreshold(long val) { + this.numContainerWithECDataReplicaReportedThreshold.set(val); + } + public void incCurrentContainersWithOneReplicaReportedCount() { this.currentContainersWithOneReplicaReportedCount.incr(); } + public void incCurrentContainersWithECDataReplicaReportedCount() { + this.currentContainersWithECDataReplicaReportedCount.incr(); + } + MutableGaugeLong getNumHealthyPipelinesThreshold() { return numHealthyPipelinesThreshold; } @@ -100,6 +112,10 @@ MutableGaugeLong getNumContainerWithOneReplicaReportedThreshold() { return numContainerWithOneReplicaReportedThreshold; } + MutableGaugeLong getNumContainerWithECDataReplicaReportedThreshold() { + return numContainerWithECDataReplicaReportedThreshold; + } + MutableCounterLong getCurrentContainersWithOneReplicaReportedCount() { return currentContainersWithOneReplicaReportedCount; } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java index 6f5429a853bd..5a4dc505d84f 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeHeartbeatDispatcher.java @@ -306,12 +306,20 @@ public static class ContainerReportFromDatanode extends ReportFromDatanode implements ContainerReport, IEventInfo { private long createTime = Time.monotonicNow(); + // Used to identify whether container reporting is from a registration. 
+ private boolean isRegister = false; public ContainerReportFromDatanode(DatanodeDetails datanodeDetails, ContainerReportsProto report) { super(datanodeDetails, report); } + public ContainerReportFromDatanode(DatanodeDetails datanodeDetails, + ContainerReportsProto report, boolean isRegister) { + super(datanodeDetails, report); + this.isRegister = isRegister; + } + @Override public boolean equals(Object o) { return this == o; @@ -331,6 +339,10 @@ public long getCreateTime() { return createTime; } + public boolean isRegister() { + return isRegister; + } + @Override public String getEventId() { return getDatanodeDetails().toString() + ", {type: " + getType() diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeProtocolServer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeProtocolServer.java index 105e7ac34862..b03f737b3406 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeProtocolServer.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/SCMDatanodeProtocolServer.java @@ -251,7 +251,7 @@ public SCMRegisteredResponseProto register( == SCMRegisteredResponseProto.ErrorCode.success) { eventPublisher.fireEvent(CONTAINER_REPORT, new SCMDatanodeHeartbeatDispatcher.ContainerReportFromDatanode( - datanodeDetails, containerReportsProto)); + datanodeDetails, containerReportsProto, true)); eventPublisher.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, new NodeRegistrationContainerReport(datanodeDetails, containerReportsProto)); diff --git a/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm-overview.html b/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm-overview.html index 2748716e67f0..7bfe405850e2 100644 --- a/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm-overview.html +++ b/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm-overview.html @@ -387,7 +387,7 @@

Safemode rules statuses

{{typestat.key}} - {{typestat.value[0]}} + {{typestat.value[1]}} diff --git a/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm.js b/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm.js index fc216c068625..eca79852e43b 100644 --- a/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm.js +++ b/hadoop-hdds/server-scm/src/main/resources/webapps/scm/scm.js @@ -24,7 +24,7 @@ require: { overview: "^overview" }, - controller: function ($http,$scope) { + controller: function ($http,$scope,$sce) { var ctrl = this; $scope.reverse = false; $scope.columnName = "hostname"; @@ -140,6 +140,14 @@ $scope.lastIndex = Math.ceil(nodeStatusCopy.length / $scope.RecordsToDisplay); $scope.nodeStatus = nodeStatusCopy.slice(0, $scope.RecordsToDisplay); + $scope.formatValue = function(value) { + if (value && value.includes(';')) { + return $sce.trustAsHtml(value.replace(/;/g, '<br>
')); + } else { + return $sce.trustAsHtml(value); + } + }; + ctrl.nodemanagermetrics.NodeStatistics.forEach(({key, value}) => { if(key == "Min") { $scope.statistics.nodes.usages.min = value; diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/HddsTestUtils.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/HddsTestUtils.java index fe5459764c9b..787f83e1a832 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/HddsTestUtils.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/HddsTestUtils.java @@ -828,10 +828,36 @@ public static Pipeline getRandomPipeline() { */ public static List getContainerInfo(int numContainers) { List containerInfoList = new ArrayList<>(); + RatisReplicationConfig ratisReplicationConfig = + RatisReplicationConfig.getInstance(ReplicationFactor.THREE); for (int i = 0; i < numContainers; i++) { ContainerInfo.Builder builder = new ContainerInfo.Builder(); containerInfoList.add(builder .setContainerID(RandomUtils.nextLong()) + .setReplicationConfig(ratisReplicationConfig) + .build()); + } + return containerInfoList; + } + + /** + * Generate EC Container data. + * + * @param numContainers number of ContainerInfo to be included in list. + * @param data Data block Num. + * @param parity Parity block Num. + * @return {@literal List} + */ + public static List getECContainerInfo(int numContainers, int data, int parity) { + List containerInfoList = new ArrayList<>(); + ECReplicationConfig eCReplicationConfig = new ECReplicationConfig(data, parity); + for (int i = 0; i < numContainers; i++) { + ContainerInfo.Builder builder = new ContainerInfo.Builder(); + containerInfoList.add(builder + .setContainerID(RandomUtils.nextLong()) + .setOwner("test-owner") + .setPipelineID(PipelineID.randomId()) + .setReplicationConfig(eCReplicationConfig) .build()); } return containerInfoList; diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java index eedef0794459..05e231776592 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java @@ -20,6 +20,7 @@ import java.io.File; import java.io.IOException; import java.time.Clock; +import java.time.ZoneId; import java.time.ZoneOffset; import java.util.ArrayList; import java.util.Collections; @@ -40,7 +41,10 @@ import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; import org.apache.hadoop.hdds.scm.HddsTestUtils; import org.apache.hadoop.hdds.scm.container.ContainerInfo; +import org.apache.hadoop.hdds.scm.container.ContainerManager; +import org.apache.hadoop.hdds.scm.container.ContainerManagerImpl; import org.apache.hadoop.hdds.scm.container.MockNodeManager; +import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaPendingOps; import org.apache.hadoop.hdds.scm.events.SCMEvents; import org.apache.hadoop.hdds.scm.ha.SCMHAManagerStub; import org.apache.hadoop.hdds.scm.ha.SCMContext; @@ -53,6 +57,7 @@ import org.apache.hadoop.hdds.scm.pipeline.PipelineProvider; import org.apache.hadoop.hdds.scm.pipeline.PipelineManagerImpl; import org.apache.hadoop.hdds.scm.server.SCMDatanodeHeartbeatDispatcher; +import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer; import 
org.apache.hadoop.hdds.server.events.EventHandler; import org.apache.hadoop.hdds.server.events.EventPublisher; import org.apache.hadoop.hdds.server.events.EventQueue; @@ -74,6 +79,7 @@ import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.params.provider.Arguments.arguments; import static org.mockito.Mockito.mock; /** Test class for SCMSafeModeManager. @@ -100,8 +106,7 @@ public void setUp() throws IOException { config = new OzoneConfiguration(); config.setBoolean(HddsConfigKeys.HDDS_SCM_SAFEMODE_PIPELINE_CREATION, false); - config.set(HddsConfigKeys.OZONE_METADATA_DIRS, - tempDir.getAbsolutePath().toString()); + config.set(HddsConfigKeys.OZONE_METADATA_DIRS, tempDir.getAbsolutePath()); scmMetadataStore = new SCMMetadataStoreImpl(config); } @@ -139,8 +144,10 @@ private void testSafeMode(int numContainers) throws Exception { assertTrue(scmSafeModeManager.getInSafeMode()); validateRuleStatus("DatanodeSafeModeRule", "registered datanodes 0"); - queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, - HddsTestUtils.createNodeRegistrationContainerReport(containers)); + SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport = + HddsTestUtils.createNodeRegistrationContainerReport(containers); + queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, nodeRegistrationContainerReport); + queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, nodeRegistrationContainerReport); long cutOff = (long) Math.ceil(numContainers * config.getDouble( HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT, @@ -181,7 +188,7 @@ public void testSafeModeExitRule() throws Exception { assertTrue(scmSafeModeManager.getInSafeMode()); validateRuleStatus("ContainerSafeModeRule", - "% of containers with at least one reported"); + "0.00% of [Ratis] Containers(0 / 100) with at least one reported"); testContainerThreshold(containers.subList(0, 25), 0.25); assertEquals(25, scmSafeModeManager.getSafeModeMetrics() .getCurrentContainersWithOneReplicaReportedCount().value()); @@ -492,6 +499,79 @@ public void testContainerSafeModeRule() throws Exception { 100, 1000 * 5); } + // We simulate common EC types: EC-2-2-1024K, EC-3-2-1024K, EC-6-3-1024K. + static Stream processECDataParityCombination() { + Stream args = Stream.of(arguments(2, 2), + arguments(3, 2), arguments(6, 3)); + return args; + } + + @ParameterizedTest + @MethodSource("processECDataParityCombination") + public void testContainerSafeModeRuleEC(int data, int parity) throws Exception { + containers = new ArrayList<>(); + + // We generate 100 EC Containers. + containers.addAll(HddsTestUtils.getECContainerInfo(25 * 4, data, parity)); + + // Prepare the data for the container. + // We have prepared 25 containers in the CLOSED state and 75 containers in the OPEN state. + // Out of the 25 containers, only 20 containers have a NumberOfKeys greater than 0. 
+ for (ContainerInfo container : containers.subList(0, 25)) { + container.setState(HddsProtos.LifeCycleState.CLOSED); + container.setNumberOfKeys(10); + } + + for (ContainerInfo container : containers.subList(25, 100)) { + container.setState(HddsProtos.LifeCycleState.OPEN); + container.setNumberOfKeys(10); + } + + // Set the last 5 closed containers to be empty + for (ContainerInfo container : containers.subList(20, 25)) { + container.setNumberOfKeys(0); + } + + for (ContainerInfo container : containers) { + scmMetadataStore.getContainerTable().put(container.containerID(), container); + } + + // Declare SCMSafeModeManager and confirm entry into Safe Mode. + EventQueue eventQueue = new EventQueue(); + MockNodeManager nodeManager = new MockNodeManager(true, 0); + PipelineManager pipelineManager = PipelineManagerImpl.newPipelineManager( + config, + SCMHAManagerStub.getInstance(true), + nodeManager, + scmMetadataStore.getPipelineTable(), + eventQueue, + scmContext, + serviceManager, + Clock.system(ZoneOffset.UTC)); + + ContainerManager containerManager = new ContainerManagerImpl(config, + SCMHAManagerStub.getInstance(true), null, pipelineManager, + scmMetadataStore.getContainerTable(), + new ContainerReplicaPendingOps(Clock.system(ZoneId.systemDefault()))); + + scmSafeModeManager = new SCMSafeModeManager( + config, containers, containerManager, pipelineManager, queue, serviceManager, scmContext); + assertTrue(scmSafeModeManager.getInSafeMode()); + + // Only 20 containers are involved in the calculation, + // so when 10 containers complete registration, our threshold is 50%. + testECContainerThreshold(containers.subList(0, 10), 0.5, data); + assertTrue(scmSafeModeManager.getInSafeMode()); + + // When the registration of the remaining containers is completed, + // the threshold will reach 100%. + testECContainerThreshold(containers.subList(10, 20), 1.0, data); + + ContainerSafeModeRule containerSafeModeRule = + scmSafeModeManager.getContainerSafeModeRule(); + assertTrue(containerSafeModeRule.validate()); + } + private void testSafeModeDataNodes(int numOfDns) throws Exception { OzoneConfiguration conf = new OzoneConfiguration(config); conf.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, numOfDns); @@ -504,8 +584,10 @@ private void testSafeModeDataNodes(int numOfDns) throws Exception { // Register all DataNodes except last one and assert SCM is in safe mode. 
for (int i = 0; i < numOfDns - 1; i++) { - queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, - HddsTestUtils.createNodeRegistrationContainerReport(containers)); + SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport = + HddsTestUtils.createNodeRegistrationContainerReport(containers); + queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, nodeRegistrationContainerReport); + queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, nodeRegistrationContainerReport); assertTrue(scmSafeModeManager.getInSafeMode()); assertEquals(1, scmSafeModeManager.getCurrentContainerThreshold()); } @@ -525,14 +607,52 @@ private void testSafeModeDataNodes(int numOfDns) throws Exception { private void testContainerThreshold(List dnContainers, double expectedThreshold) throws Exception { + SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport = + HddsTestUtils.createNodeRegistrationContainerReport(dnContainers); queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, - HddsTestUtils.createNodeRegistrationContainerReport(dnContainers)); + nodeRegistrationContainerReport); + queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, + nodeRegistrationContainerReport); GenericTestUtils.waitFor(() -> { double threshold = scmSafeModeManager.getCurrentContainerThreshold(); return threshold == expectedThreshold; }, 100, 2000 * 9); } + /** + * Test ECContainer reaching SafeMode threshold. + * + * @param dnContainers + * The list of containers that need to reach the threshold. + * @param expectedThreshold + * The expected threshold. + * @param dataBlockNum + * The number of data blocks. For EC-3-2-1024K, + * we need 3 registration requests to ensure the EC Container is confirmed. + * For EC-6-3-1024K, we need 6 registration requests to ensure the EC Container is confirmed. + * @throws Exception The thrown exception message. + */ + private void testECContainerThreshold(List dnContainers, + double expectedThreshold, int dataBlockNum) throws Exception { + + // Step1. We need to ensure the number of confirmed EC data blocks + // based on the quantity of dataBlockNum. + for (int i = 0; i < dataBlockNum; i++) { + SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport = + HddsTestUtils.createNodeRegistrationContainerReport(dnContainers); + queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, + nodeRegistrationContainerReport); + queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, + nodeRegistrationContainerReport); + } + + // Step2. Wait for the threshold to be reached. 
+ GenericTestUtils.waitFor(() -> { + double threshold = scmSafeModeManager.getCurrentECContainerThreshold(); + return threshold == expectedThreshold; + }, 100, 2000 * 9); + } + @Test public void testSafeModePipelineExitRule() throws Exception { containers = new ArrayList<>(); @@ -571,8 +691,11 @@ public void testSafeModePipelineExitRule() throws Exception { config, containers, null, pipelineManager, queue, serviceManager, scmContext); - queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, - HddsTestUtils.createNodeRegistrationContainerReport(containers)); + SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport = + HddsTestUtils.createNodeRegistrationContainerReport(containers); + queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, nodeRegistrationContainerReport); + queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, nodeRegistrationContainerReport); + assertTrue(scmSafeModeManager.getInSafeMode()); firePipelineEvent(pipelineManager, pipeline); @@ -629,8 +752,10 @@ public void testPipelinesNotCreatedUntilPreCheckPasses() throws Exception { // Register all DataNodes except last one and assert SCM is in safe mode. for (int i = 0; i < numOfDns - 1; i++) { - queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, - HddsTestUtils.createNodeRegistrationContainerReport(containers)); + SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport = + HddsTestUtils.createNodeRegistrationContainerReport(containers); + queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, nodeRegistrationContainerReport); + queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, nodeRegistrationContainerReport); assertTrue(scmSafeModeManager.getInSafeMode()); assertFalse(scmSafeModeManager.getPreCheckComplete()); }