From 37e046ea7aad3e9a99f94e1566a7e49946c515e1 Mon Sep 17 00:00:00 2001 From: peterxcli Date: Sat, 22 Feb 2025 21:26:33 +0800 Subject: [PATCH 1/8] HDDS-11974. Split Container Safemode Rule into Ratis & EC Container Safemode Rules --- .../scm/safemode/ContainerSafeModeRule.java | 359 ------------------ .../scm/safemode/ECContainerSafeModeRule.java | 237 ++++++++++++ .../safemode/RatisContainerSafeModeRule.java | 207 ++++++++++ .../hdds/scm/safemode/SCMSafeModeManager.java | 22 +- .../scm/safemode/SafeModeRuleFactory.java | 7 +- .../scm/safemode/TestSCMSafeModeManager.java | 10 +- .../scm/safemode/TestSafeModeRuleFactory.java | 2 +- 7 files changed, 468 insertions(+), 376 deletions(-) delete mode 100644 hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java create mode 100644 hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java create mode 100644 hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java deleted file mode 100644 index 32cc639571fa..000000000000 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hdds.scm.safemode; - -import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT; -import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.Sets; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.UUID; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; -import org.apache.hadoop.hdds.client.ReplicationConfig; -import org.apache.hadoop.hdds.conf.ConfigurationSource; -import org.apache.hadoop.hdds.protocol.DatanodeDetails; -import org.apache.hadoop.hdds.protocol.proto.HddsProtos; -import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; -import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; -import org.apache.hadoop.hdds.scm.container.ContainerID; -import org.apache.hadoop.hdds.scm.container.ContainerInfo; -import org.apache.hadoop.hdds.scm.container.ContainerManager; -import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; -import org.apache.hadoop.hdds.scm.events.SCMEvents; -import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport; -import org.apache.hadoop.hdds.server.events.EventQueue; -import org.apache.hadoop.hdds.server.events.TypedEvent; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Class defining Safe mode exit criteria for Containers. - */ -public class ContainerSafeModeRule extends - SafeModeExitRule { - - public static final Logger LOG = LoggerFactory.getLogger(ContainerSafeModeRule.class); - - private static final String NAME = "ContainerSafeModeRule"; - - private final ContainerManager containerManager; - // Required cutoff % for containers with at least 1 reported replica. - private final double safeModeCutoff; - // Containers read from scm db (excluding containers in ALLOCATED state). - private final Set ratisContainers; - private final Set ecContainers; - private final Map> ecContainerDNsMap; - private final AtomicLong ratisContainerWithMinReplicas = new AtomicLong(0); - private final AtomicLong ecContainerWithMinReplicas = new AtomicLong(0); - - private double ratisMaxContainer; - private double ecMaxContainer; - - public ContainerSafeModeRule(final EventQueue eventQueue, - final ConfigurationSource conf, - final ContainerManager containerManager, - final SCMSafeModeManager manager) { - super(manager, NAME, eventQueue); - this.safeModeCutoff = getSafeModeCutoff(conf); - this.containerManager = containerManager; - this.ratisContainers = new HashSet<>(); - this.ecContainers = new HashSet<>(); - this.ecContainerDNsMap = new ConcurrentHashMap<>(); - initializeRule(); - } - - - private static double getSafeModeCutoff(ConfigurationSource conf) { - final double cutoff = conf.getDouble(HDDS_SCM_SAFEMODE_THRESHOLD_PCT, - HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT); - Preconditions.checkArgument((cutoff >= 0.0 && cutoff <= 1.0), - HDDS_SCM_SAFEMODE_THRESHOLD_PCT + - " value should be >= 0.0 and <= 1.0"); - return cutoff; - } - - @Override - protected TypedEvent getEventType() { - return SCMEvents.CONTAINER_REGISTRATION_REPORT; - } - - @Override - protected synchronized boolean validate() { - if (validateBasedOnReportProcessing()) { - return (getCurrentContainerThreshold() >= safeModeCutoff) && - (getCurrentECContainerThreshold() >= safeModeCutoff); - } - - // TODO: Split ContainerSafeModeRule into RatisContainerSafeModeRule and - // ECContainerSafeModeRule - final List containers = containerManager.getContainers( - ReplicationType.RATIS); - - return containers.stream() - .filter(this::isClosed) - .map(ContainerInfo::containerID) - .noneMatch(this::isMissing); - } - - /** - * Checks if the container has any replica. - */ - private boolean isMissing(ContainerID id) { - try { - return containerManager.getContainerReplicas(id).isEmpty(); - } catch (ContainerNotFoundException ex) { - /* - * This should never happen, in case this happens the container - * somehow got removed from SCM. - * Safemode rule doesn't have to log/fix this. We will just exclude this - * from the rule validation. - */ - return false; - - } - } - - @VisibleForTesting - public double getCurrentContainerThreshold() { - return ratisMaxContainer == 0 ? 1 : - (ratisContainerWithMinReplicas.doubleValue() / ratisMaxContainer); - } - - @VisibleForTesting - public double getCurrentECContainerThreshold() { - return ecMaxContainer == 0 ? 1 : - (ecContainerWithMinReplicas.doubleValue() / ecMaxContainer); - } - - - // TODO: Report processing logic will be removed in future. HDDS-11958. - @Override - protected synchronized void process( - final NodeRegistrationContainerReport reportsProto) { - final DatanodeDetails datanodeDetails = reportsProto.getDatanodeDetails(); - final UUID datanodeUUID = datanodeDetails.getUuid(); - StorageContainerDatanodeProtocolProtos.ContainerReportsProto report = reportsProto.getReport(); - - report.getReportsList().forEach(c -> { - long containerID = c.getContainerID(); - - - // If it is a Ratis container. - if (ratisContainers.contains(containerID)) { - recordReportedContainer(containerID, Boolean.FALSE); - ratisContainers.remove(containerID); - } - - // If it is an EC container. - if (ecContainers.contains(containerID)) { - putInContainerDNsMap(containerID, ecContainerDNsMap, datanodeUUID); - recordReportedContainer(containerID, Boolean.TRUE); - } - }); - - if (scmInSafeMode()) { - SCMSafeModeManager.getLogger().info( - "SCM in safe mode. {} % containers [Ratis] have at least one" - + " reported replica, {} % containers [EC] have at N reported replica.", - getCurrentContainerThreshold() * 100, getCurrentECContainerThreshold() * 100); - } - } - - /** - * Record the reported Container. - * - * We will differentiate and count according to the type of Container. - * - * @param containerID containerID - * @param isEcContainer true, means ECContainer, false, means not ECContainer. - */ - private void recordReportedContainer(long containerID, boolean isEcContainer) { - - int uuids = 1; - if (isEcContainer && ecContainerDNsMap.containsKey(containerID)) { - uuids = ecContainerDNsMap.get(containerID).size(); - } - - int minReplica = getMinReplica(containerID); - if (uuids >= minReplica) { - if (isEcContainer) { - getSafeModeMetrics() - .incCurrentContainersWithECDataReplicaReportedCount(); - ecContainerWithMinReplicas.getAndAdd(1); - } else { - ratisContainerWithMinReplicas.getAndAdd(1); - getSafeModeMetrics() - .incCurrentContainersWithOneReplicaReportedCount(); - } - } - } - - /** - * Get the minimum replica. - * - * If it is a Ratis Contianer, the minimum copy is 1. - * If it is an EC Container, the minimum copy will be the number of Data in replicationConfig. - * - * @param pContainerID containerID - * @return MinReplica. - */ - private int getMinReplica(long pContainerID) { - - try { - ContainerID containerID = ContainerID.valueOf(pContainerID); - ContainerInfo container = containerManager.getContainer(containerID); - ReplicationConfig replicationConfig = container.getReplicationConfig(); - return replicationConfig.getMinimumNodes(); - } catch (ContainerNotFoundException e) { - LOG.error("containerId = {} not found.", pContainerID, e); - } catch (Exception e) { - LOG.error("containerId = {} not found.", pContainerID, e); - } - - return 1; - } - - private void putInContainerDNsMap(long containerID, Map> containerDNsMap, - UUID datanodeUUID) { - containerDNsMap.computeIfAbsent(containerID, key -> Sets.newHashSet()); - containerDNsMap.get(containerID).add(datanodeUUID); - } - - @Override - protected synchronized void cleanup() { - ratisContainers.clear(); - ecContainers.clear(); - ecContainerDNsMap.clear(); - } - - @Override - public String getStatusText() { - - // ratis container - String status = String.format( - "%1.2f%% of [Ratis] Containers(%s / %s) with at least one reported replica (=%1.2f) >= " + - "safeModeCutoff (=%1.2f);", - getCurrentContainerThreshold() * 100, - ratisContainerWithMinReplicas, (long) ratisMaxContainer, - getCurrentContainerThreshold(), this.safeModeCutoff); - - Set sampleRatisContainers = ratisContainers.stream(). - limit(SAMPLE_CONTAINER_DISPLAY_LIMIT). - collect(Collectors.toSet()); - - if (!sampleRatisContainers.isEmpty()) { - String sampleContainerText = - "Sample Ratis Containers not satisfying the criteria : " + sampleRatisContainers + ";"; - status = status.concat("\n").concat(sampleContainerText); - } - - // ec container - String ecStatus = String.format( - "%1.2f%% of [EC] Containers(%s / %s) with at least N reported replica (=%1.2f) >= " + - "safeModeCutoff (=%1.2f);", - getCurrentECContainerThreshold() * 100, - ecContainerWithMinReplicas, (long) ecMaxContainer, - getCurrentECContainerThreshold(), this.safeModeCutoff); - status = status.concat("\n").concat(ecStatus); - - Set sampleEcContainers = ecContainerDNsMap.entrySet().stream(). - filter(entry -> { - Long containerId = entry.getKey(); - int minReplica = getMinReplica(containerId); - Set allReplicas = entry.getValue(); - if (allReplicas.size() >= minReplica) { - return false; - } - return true; - }). - map(Map.Entry::getKey). - limit(SAMPLE_CONTAINER_DISPLAY_LIMIT). - collect(Collectors.toSet()); - - if (!sampleEcContainers.isEmpty()) { - String sampleECContainerText = - "Sample EC Containers not satisfying the criteria : " + sampleEcContainers + ";"; - status = status.concat("\n").concat(sampleECContainerText); - } - - return status; - } - - - @Override - public synchronized void refresh(boolean forceRefresh) { - if (forceRefresh || !validate()) { - initializeRule(); - } - } - - private boolean isClosed(ContainerInfo container) { - final LifeCycleState state = container.getState(); - return state == LifeCycleState.QUASI_CLOSED || - state == LifeCycleState.CLOSED; - } - - private void initializeRule() { - final List containers = containerManager.getContainers(); - // Clean up the related data in the map. - ratisContainers.clear(); - ecContainers.clear(); - - // Iterate through the container list to - // get the minimum replica count for each container. - containers.forEach(container -> { - // There can be containers in OPEN/CLOSING state which were never - // created by the client. We are not considering these containers for - // now. These containers can be handled by tracking pipelines. - - HddsProtos.ReplicationType replicationType = container.getReplicationType(); - - if (isClosed(container) && container.getNumberOfKeys() > 0) { - // If it's of type Ratis - if (replicationType.equals(HddsProtos.ReplicationType.RATIS)) { - ratisContainers.add(container.getContainerID()); - } - - // If it's of type EC - if (replicationType.equals(HddsProtos.ReplicationType.EC)) { - ecContainers.add(container.getContainerID()); - } - } - }); - - ratisMaxContainer = ratisContainers.size(); - ecMaxContainer = ecContainers.size(); - - long ratisCutOff = (long) Math.ceil(ratisMaxContainer * safeModeCutoff); - long ecCutOff = (long) Math.ceil(ecMaxContainer * safeModeCutoff); - - getSafeModeMetrics().setNumContainerWithOneReplicaReportedThreshold(ratisCutOff); - getSafeModeMetrics().setNumContainerWithECDataReplicaReportedThreshold(ecCutOff); - - LOG.info("Refreshed Containers with one replica threshold count {}, " + - "with ec n replica threshold count {}.", ratisCutOff, ecCutOff); - } -} diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java new file mode 100644 index 000000000000..445bf3937688 --- /dev/null +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java @@ -0,0 +1,237 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hdds.scm.safemode; + +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.collect.Sets; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import org.apache.hadoop.hdds.client.ReplicationConfig; +import org.apache.hadoop.hdds.conf.ConfigurationSource; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; +import org.apache.hadoop.hdds.scm.container.ContainerID; +import org.apache.hadoop.hdds.scm.container.ContainerInfo; +import org.apache.hadoop.hdds.scm.container.ContainerManager; +import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; +import org.apache.hadoop.hdds.scm.events.SCMEvents; +import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport; +import org.apache.hadoop.hdds.server.events.EventQueue; +import org.apache.hadoop.hdds.server.events.TypedEvent; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Safe mode rule for EC containers. + */ +public class ECContainerSafeModeRule extends SafeModeExitRule { + + public static final Logger LOG = LoggerFactory.getLogger(ECContainerSafeModeRule.class); + private static final String NAME = "ECContainerSafeModeRule"; + + private final ContainerManager containerManager; + private final double safeModeCutoff; + private final Set ecContainers; + private final Map> ecContainerDNsMap; + private final AtomicLong ecContainerWithMinReplicas; + private double ecMaxContainer; + + public ECContainerSafeModeRule(EventQueue eventQueue, + ConfigurationSource conf, + ContainerManager containerManager, + SCMSafeModeManager manager) { + super(manager, NAME, eventQueue); + this.safeModeCutoff = getSafeModeCutoff(conf); + this.containerManager = containerManager; + this.ecContainers = new HashSet<>(); + this.ecContainerDNsMap = new ConcurrentHashMap<>(); + this.ecContainerWithMinReplicas = new AtomicLong(0); + initializeRule(); + } + + private static double getSafeModeCutoff(ConfigurationSource conf) { + final double cutoff = conf.getDouble(HDDS_SCM_SAFEMODE_THRESHOLD_PCT, + HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT); + Preconditions.checkArgument((cutoff >= 0.0 && cutoff <= 1.0), + HDDS_SCM_SAFEMODE_THRESHOLD_PCT + " value should be >= 0.0 and <= 1.0"); + return cutoff; + } + + @Override + protected TypedEvent getEventType() { + return SCMEvents.CONTAINER_REGISTRATION_REPORT; + } + + @Override + protected synchronized boolean validate() { + if (validateBasedOnReportProcessing()) { + return getCurrentContainerThreshold() >= safeModeCutoff; + } + return false; + } + + @VisibleForTesting + public double getCurrentContainerThreshold() { + return ecMaxContainer == 0 ? 1 : (ecContainerWithMinReplicas.doubleValue() / ecMaxContainer); + } + + /** + * Get the minimum replica. + * + * If it is a Ratis Contianer, the minimum copy is 1. + * If it is an EC Container, the minimum copy will be the number of Data in replicationConfig. + * + * @param pContainerID containerID + * @return MinReplica. + */ + private int getMinReplica(long pContainerID) { + + try { + ContainerID containerID = ContainerID.valueOf(pContainerID); + ContainerInfo container = containerManager.getContainer(containerID); + ReplicationConfig replicationConfig = container.getReplicationConfig(); + return replicationConfig.getMinimumNodes(); + } catch (ContainerNotFoundException e) { + LOG.error("containerId = {} not found.", pContainerID, e); + } catch (Exception e) { + LOG.error("containerId = {} not found.", pContainerID, e); + } + + return 1; + } + + @Override + protected void process(NodeRegistrationContainerReport report) { + DatanodeDetails datanodeDetails = report.getDatanodeDetails(); + UUID datanodeUUID = datanodeDetails.getUuid(); + + report.getReport().getReportsList().forEach(c -> { + long containerID = c.getContainerID(); + if (ecContainers.contains(containerID)) { + putInContainerDNsMap(containerID, ecContainerDNsMap, datanodeUUID); + recordReportedContainer(containerID, true); + } + }); + + if (scmInSafeMode()) { + SCMSafeModeManager.getLogger().info( + "SCM in safe mode. {} % containers [EC] have at N reported replica", + getCurrentContainerThreshold() * 100); + } + } + + private void putInContainerDNsMap(long containerID, + Map> containerDNsMap, + UUID datanodeUUID) { + containerDNsMap.computeIfAbsent(containerID, key -> Sets.newHashSet()); + containerDNsMap.get(containerID).add(datanodeUUID); + } + + /** + * Record the reported Container. + * + * We will differentiate and count according to the type of Container. + * + * @param containerID containerID + * @param isEcContainer true, means ECContainer, false, means not ECContainer. + */ + private void recordReportedContainer(long containerID, boolean isEcContainer) { + + int uuids = 1; + if (ecContainerDNsMap.containsKey(containerID)) { + uuids = ecContainerDNsMap.get(containerID).size(); + } + + int minReplica = getMinReplica(containerID); + if (uuids >= minReplica) { + getSafeModeMetrics() + .incCurrentContainersWithECDataReplicaReportedCount(); + ecContainerWithMinReplicas.getAndAdd(1); + } + } + + private void initializeRule() { + ecContainers.clear(); + ecContainerDNsMap.clear(); + containerManager.getContainers().forEach(container -> { + if (container.getReplicationType() == HddsProtos.ReplicationType.EC + && isClosed(container) + && container.getNumberOfKeys() > 0) { + ecContainers.add(container.getContainerID()); + } + }); + ecMaxContainer = ecContainers.size(); + long ecCutOff = (long) Math.ceil(ecMaxContainer * safeModeCutoff); + getSafeModeMetrics().setNumContainerWithECDataReplicaReportedThreshold(ecCutOff); + } + + private boolean isClosed(ContainerInfo container) { + final LifeCycleState state = container.getState(); + return state == LifeCycleState.QUASI_CLOSED || state == LifeCycleState.CLOSED; + } + + @Override + public String getStatusText() { + String status = String.format( + "%1.2f%% of [EC] Containers(%s / %s) with at least N reported replica (=%1.2f) >= " + + "safeModeCutoff (=%1.2f);", + getCurrentContainerThreshold() * 100, + ecContainerWithMinReplicas, (long) ecMaxContainer, + getCurrentContainerThreshold(), this.safeModeCutoff); + + Set sampleEcContainers = ecContainerDNsMap.entrySet().stream().filter(entry -> { + Long containerId = entry.getKey(); + int minReplica = getMinReplica(containerId); + Set allReplicas = entry.getValue(); + if (allReplicas.size() >= minReplica) { + return false; + } + return true; + }).map(Map.Entry::getKey).limit(SAMPLE_CONTAINER_DISPLAY_LIMIT).collect(Collectors.toSet()); + + if (!sampleEcContainers.isEmpty()) { + String sampleECContainerText = "Sample EC Containers not satisfying the criteria : " + sampleEcContainers + ";"; + status = status.concat("\n").concat(sampleECContainerText); + } + + return status; + } + + @Override + public synchronized void refresh(boolean forceRefresh) { + if (forceRefresh || !validate()) { + initializeRule(); + } + } + + @Override + protected void cleanup() { + ecContainers.clear(); + ecContainerDNsMap.clear(); + } +} diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java new file mode 100644 index 000000000000..9acf4edc3452 --- /dev/null +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hdds.scm.safemode; + +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import org.apache.hadoop.hdds.conf.ConfigurationSource; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; +import org.apache.hadoop.hdds.scm.container.ContainerID; +import org.apache.hadoop.hdds.scm.container.ContainerInfo; +import org.apache.hadoop.hdds.scm.container.ContainerManager; +import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; +import org.apache.hadoop.hdds.scm.events.SCMEvents; +import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport; +import org.apache.hadoop.hdds.server.events.EventQueue; +import org.apache.hadoop.hdds.server.events.TypedEvent; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * Class defining Safe mode exit criteria for Ratis Containers. + */ +public class RatisContainerSafeModeRule extends SafeModeExitRule { + + public static final Logger LOG = LoggerFactory.getLogger(RatisContainerSafeModeRule.class); + private static final String NAME = "RatisContainerSafeModeRule"; + + private final ContainerManager containerManager; + // Required cutoff % for containers with at least 1 reported replica. + private final double safeModeCutoff; + // Containers read from scm db (excluding containers in ALLOCATED state). + private final Set ratisContainers; + private final AtomicLong ratisContainerWithMinReplicas; + private double ratisMaxContainer; + + public RatisContainerSafeModeRule(EventQueue eventQueue, + ConfigurationSource conf, + ContainerManager containerManager, + SCMSafeModeManager manager) { + super(manager, NAME, eventQueue); + this.safeModeCutoff = getSafeModeCutoff(conf); + this.containerManager = containerManager; + this.ratisContainers = new HashSet<>(); + this.ratisContainerWithMinReplicas = new AtomicLong(0); + initializeRule(); + } + + private static double getSafeModeCutoff(ConfigurationSource conf) { + final double cutoff = conf.getDouble(HDDS_SCM_SAFEMODE_THRESHOLD_PCT, + HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT); + Preconditions.checkArgument((cutoff >= 0.0 && cutoff <= 1.0), + HDDS_SCM_SAFEMODE_THRESHOLD_PCT + " value should be >= 0.0 and <= 1.0"); + return cutoff; + } + + @Override + protected TypedEvent getEventType() { + return SCMEvents.CONTAINER_REGISTRATION_REPORT; + } + + @Override + protected synchronized boolean validate() { + if (validateBasedOnReportProcessing()) { + return (getCurrentContainerThreshold() >= safeModeCutoff); + } + + final List containers = containerManager.getContainers( + ReplicationType.RATIS); + + return containers.stream() + .filter(this::isClosed) + .map(ContainerInfo::containerID) + .noneMatch(this::isMissing); + } + + /** + * Checks if the container has any replica. + */ + private boolean isMissing(ContainerID id) { + try { + return containerManager.getContainerReplicas(id).isEmpty(); + } catch (ContainerNotFoundException ex) { + /* + * This should never happen, in case this happens the container + * somehow got removed from SCM. + * Safemode rule doesn't have to log/fix this. We will just exclude this + * from the rule validation. + */ + return false; + + } + } + + @VisibleForTesting + public double getCurrentContainerThreshold() { + return ratisMaxContainer == 0 ? 1 : (ratisContainerWithMinReplicas.doubleValue() / ratisMaxContainer); + } + + @Override + protected void process(NodeRegistrationContainerReport report) { + report.getReport().getReportsList().forEach(c -> { + long containerID = c.getContainerID(); + if (ratisContainers.contains(containerID)) { + recordReportedContainer(containerID); + ratisContainers.remove(containerID); + } + }); + + if (scmInSafeMode()) { + SCMSafeModeManager.getLogger().info( + "SCM in safe mode. {} % containers [Ratis] have at least one reported replica", + getCurrentContainerThreshold() * 100); + } + } + + /** + * Record the reported Container. + * + * We will differentiate and count according to the type of Container. + * + * @param containerID containerID + * @param isEcContainer true, means ECContainer, false, means not ECContainer. + */ + private void recordReportedContainer(long containerID) { + ratisContainerWithMinReplicas.getAndAdd(1); + getSafeModeMetrics() + .incCurrentContainersWithOneReplicaReportedCount(); + } + + private void initializeRule() { + ratisContainers.clear(); + containerManager.getContainers().forEach(container -> { + if (container.getReplicationType() == HddsProtos.ReplicationType.RATIS + && isClosed(container) + && container.getNumberOfKeys() > 0) { + ratisContainers.add(container.getContainerID()); + } + }); + ratisMaxContainer = ratisContainers.size(); + long ratisCutOff = (long) Math.ceil(ratisMaxContainer * safeModeCutoff); + getSafeModeMetrics().setNumContainerWithOneReplicaReportedThreshold(ratisCutOff); + } + + private boolean isClosed(ContainerInfo container) { + final LifeCycleState state = container.getState(); + return state == LifeCycleState.QUASI_CLOSED || state == LifeCycleState.CLOSED; + } + + @Override + public String getStatusText() { + String status = String.format( + "%1.2f%% of [Ratis] Containers(%s / %s) with at least one reported replica (=%1.2f) >= " + + "safeModeCutoff (=%1.2f);", + getCurrentContainerThreshold() * 100, + ratisContainerWithMinReplicas, (long) ratisMaxContainer, + getCurrentContainerThreshold(), this.safeModeCutoff); + + Set sampleRatisContainers = ratisContainers.stream().limit(SAMPLE_CONTAINER_DISPLAY_LIMIT) + .collect(Collectors.toSet()); + + if (!sampleRatisContainers.isEmpty()) { + String sampleContainerText = "Sample Ratis Containers not satisfying the criteria : " + sampleRatisContainers + + ";"; + status = status.concat("\n").concat(sampleContainerText); + } + + return status; + } + + @Override + public synchronized void refresh(boolean forceRefresh) { + if (forceRefresh || !validate()) { + initializeRule(); + } + } + + @Override + protected void cleanup() { + ratisContainers.clear(); + } +} diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java index ced9d2c31ee0..7f560c62d1e2 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java @@ -90,7 +90,8 @@ public class SCMSafeModeManager implements SafeModeManager { private Map exitRules = new HashMap<>(1); private Set preCheckRules = new HashSet<>(1); private ConfigurationSource config; - private static final String CONT_EXIT_RULE = "ContainerSafeModeRule"; + private static final String RATIS_CONTAINER_EXIT_RULE = "RatisContainerSafeModeRule"; + private static final String EC_CONTAINER_EXIT_RULE = "ECContainerSafeModeRule"; private static final String HEALTHY_PIPELINE_EXIT_RULE = "HealthyPipelineSafeModeRule"; private static final String ATLEAST_ONE_DATANODE_REPORTED_PIPELINE_EXIT_RULE = @@ -317,30 +318,29 @@ public static Logger getLogger() { return LOG; } - @VisibleForTesting public double getCurrentContainerThreshold() { - return ((ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE)) + return ((RatisContainerSafeModeRule) exitRules.get(RATIS_CONTAINER_EXIT_RULE)) .getCurrentContainerThreshold(); } - @VisibleForTesting public double getCurrentECContainerThreshold() { - return ((ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE)) - .getCurrentECContainerThreshold(); + return ((ECContainerSafeModeRule) exitRules.get(EC_CONTAINER_EXIT_RULE)) + .getCurrentContainerThreshold(); } - @VisibleForTesting - public ContainerSafeModeRule getContainerSafeModeRule() { - return (ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE); + public RatisContainerSafeModeRule getRatisContainerSafeModeRule() { + return (RatisContainerSafeModeRule) exitRules.get(RATIS_CONTAINER_EXIT_RULE); + } + + public ECContainerSafeModeRule getECContainerSafeModeRule() { + return (ECContainerSafeModeRule) exitRules.get(EC_CONTAINER_EXIT_RULE); } - @VisibleForTesting public HealthyPipelineSafeModeRule getHealthyPipelineSafeModeRule() { return (HealthyPipelineSafeModeRule) exitRules.get(HEALTHY_PIPELINE_EXIT_RULE); } - @VisibleForTesting public OneReplicaPipelineSafeModeRule getOneReplicaPipelineSafeModeRule() { return (OneReplicaPipelineSafeModeRule) exitRules.get(ATLEAST_ONE_DATANODE_REPORTED_PIPELINE_EXIT_RULE); diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java index a8c7fcc0597d..1214e9554387 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java @@ -65,12 +65,15 @@ private SafeModeRuleFactory(final ConfigurationSource config, private void loadRules() { // TODO: Use annotation to load the rules. (HDDS-11730) - SafeModeExitRule containerRule = new ContainerSafeModeRule(eventQueue, + SafeModeExitRule ratisContainerRule = new RatisContainerSafeModeRule(eventQueue, + config, containerManager, safeModeManager); + SafeModeExitRule ecContainerRule = new ECContainerSafeModeRule(eventQueue, config, containerManager, safeModeManager); SafeModeExitRule datanodeRule = new DataNodeSafeModeRule(eventQueue, config, safeModeManager); - safeModeRules.add(containerRule); + safeModeRules.add(ratisContainerRule); + safeModeRules.add(ecContainerRule); safeModeRules.add(datanodeRule); preCheckRules.add(datanodeRule); diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java index 049a69c5d981..326645ca131c 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java @@ -575,9 +575,13 @@ public void testContainerSafeModeRuleEC(int data, int parity) throws Exception { // the threshold will reach 100%. testECContainerThreshold(containers.subList(10, 20), 1.0, data); - ContainerSafeModeRule containerSafeModeRule = - scmSafeModeManager.getContainerSafeModeRule(); - assertTrue(containerSafeModeRule.validate()); + ECContainerSafeModeRule ecContainerSafeModeRule = + scmSafeModeManager.getECContainerSafeModeRule(); + assertTrue(ecContainerSafeModeRule.validate()); + + RatisContainerSafeModeRule ratisContainerSafeModeRule = + scmSafeModeManager.getRatisContainerSafeModeRule(); + assertTrue(ratisContainerSafeModeRule.validate()); } private void testSafeModeDataNodes(int numOfDns) throws Exception { diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java index 51890ba4ea12..028b39d21e30 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java @@ -54,7 +54,7 @@ public void testLoadedSafeModeRules() { // as the rules are hardcoded in SafeModeRuleFactory. // This will be fixed once we load rules using annotation. - assertEquals(4, factory.getSafeModeRules().size(), + assertEquals(5, factory.getSafeModeRules().size(), "The total safemode rules count doesn't match"); } From ea4805d7b75890610213e5db9a98eeb244a258b2 Mon Sep 17 00:00:00 2001 From: peterxcli Date: Sat, 22 Feb 2025 23:59:50 +0800 Subject: [PATCH 2/8] Add logging when container safe rule when initializing --- .../hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java | 2 ++ .../hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java | 2 ++ 2 files changed, 4 insertions(+) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java index 445bf3937688..ed851a2a129f 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java @@ -188,6 +188,8 @@ && isClosed(container) ecMaxContainer = ecContainers.size(); long ecCutOff = (long) Math.ceil(ecMaxContainer * safeModeCutoff); getSafeModeMetrics().setNumContainerWithECDataReplicaReportedThreshold(ecCutOff); + + LOG.info("Refreshed Containers with ec n replica threshold count {}.", ecCutOff); } private boolean isClosed(ContainerInfo container) { diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java index 9acf4edc3452..fed9f1012b31 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java @@ -165,6 +165,8 @@ && isClosed(container) ratisMaxContainer = ratisContainers.size(); long ratisCutOff = (long) Math.ceil(ratisMaxContainer * safeModeCutoff); getSafeModeMetrics().setNumContainerWithOneReplicaReportedThreshold(ratisCutOff); + + LOG.info("Refreshed Containers with one replica threshold count {}.", ratisCutOff); } private boolean isClosed(ContainerInfo container) { From d5cc309c8d9b5305a34569cbd399d9e764e12835 Mon Sep 17 00:00:00 2001 From: peterxcli Date: Wed, 26 Feb 2025 21:35:18 +0800 Subject: [PATCH 3/8] remove leftover comments --- .../hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java | 6 ------ .../hdds/scm/safemode/RatisContainerSafeModeRule.java | 3 --- 2 files changed, 9 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java index ed851a2a129f..e679c5ace60b 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java @@ -103,9 +103,6 @@ public double getCurrentContainerThreshold() { /** * Get the minimum replica. * - * If it is a Ratis Contianer, the minimum copy is 1. - * If it is an EC Container, the minimum copy will be the number of Data in replicationConfig. - * * @param pContainerID containerID * @return MinReplica. */ @@ -155,10 +152,7 @@ private void putInContainerDNsMap(long containerID, /** * Record the reported Container. * - * We will differentiate and count according to the type of Container. - * * @param containerID containerID - * @param isEcContainer true, means ECContainer, false, means not ECContainer. */ private void recordReportedContainer(long containerID, boolean isEcContainer) { diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java index fed9f1012b31..65c3855918ad 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java @@ -142,10 +142,7 @@ protected void process(NodeRegistrationContainerReport report) { /** * Record the reported Container. * - * We will differentiate and count according to the type of Container. - * * @param containerID containerID - * @param isEcContainer true, means ECContainer, false, means not ECContainer. */ private void recordReportedContainer(long containerID) { ratisContainerWithMinReplicas.getAndAdd(1); From 9740d445adfad20d79247a4610c34b149e5a3cec Mon Sep 17 00:00:00 2001 From: peterxcli Date: Wed, 26 Feb 2025 22:35:03 +0800 Subject: [PATCH 4/8] correct ec container without report validate logic --- .../scm/safemode/ECContainerSafeModeRule.java | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java index e679c5ace60b..e34d3d17e2e1 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java @@ -24,6 +24,7 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Sets; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.UUID; @@ -35,6 +36,7 @@ import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; import org.apache.hadoop.hdds.scm.container.ContainerID; import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.ContainerManager; @@ -92,7 +94,32 @@ protected synchronized boolean validate() { if (validateBasedOnReportProcessing()) { return getCurrentContainerThreshold() >= safeModeCutoff; } - return false; + + final List containers = containerManager.getContainers( + ReplicationType.EC); + + return containers.stream() + .filter(this::isClosed) + .map(ContainerInfo::containerID) + .noneMatch(this::isMissing); + } + + /** + * Checks if the container has at least the minimum required number of replicas. + */ + private boolean isMissing(ContainerID id) { + try { + int minReplica = getMinReplica(id.getId()); + return containerManager.getContainerReplicas(id).size() < minReplica; + } catch (ContainerNotFoundException ex) { + /* + * This should never happen, in case this happens the container + * somehow got removed from SCM. + * Safemode rule doesn't have to log/fix this. We will just exclude this + * from the rule validation. + */ + return false; + } } @VisibleForTesting From 40e69471a2b80823a5a01a82376898b8ab123e9e Mon Sep 17 00:00:00 2001 From: peterxcli Date: Tue, 11 Mar 2025 13:09:46 +0000 Subject: [PATCH 5/8] Addressed comment: some cleaup --- .../hdds/scm/safemode/ECContainerSafeModeRule.java | 9 +++------ .../hadoop/hdds/scm/safemode/SCMSafeModeManager.java | 6 ++++++ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java index e34d3d17e2e1..ad9133caf2de 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java @@ -158,7 +158,7 @@ protected void process(NodeRegistrationContainerReport report) { long containerID = c.getContainerID(); if (ecContainers.contains(containerID)) { putInContainerDNsMap(containerID, ecContainerDNsMap, datanodeUUID); - recordReportedContainer(containerID, true); + recordReportedContainer(containerID); } }); @@ -181,7 +181,7 @@ private void putInContainerDNsMap(long containerID, * * @param containerID containerID */ - private void recordReportedContainer(long containerID, boolean isEcContainer) { + private void recordReportedContainer(long containerID) { int uuids = 1; if (ecContainerDNsMap.containsKey(containerID)) { @@ -231,10 +231,7 @@ public String getStatusText() { Long containerId = entry.getKey(); int minReplica = getMinReplica(containerId); Set allReplicas = entry.getValue(); - if (allReplicas.size() >= minReplica) { - return false; - } - return true; + return allReplicas.size() < minReplica; }).map(Map.Entry::getKey).limit(SAMPLE_CONTAINER_DISPLAY_LIMIT).collect(Collectors.toSet()); if (!sampleEcContainers.isEmpty()) { diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java index 7f560c62d1e2..92c93a8f844d 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java @@ -318,29 +318,35 @@ public static Logger getLogger() { return LOG; } + @VisibleForTesting public double getCurrentContainerThreshold() { return ((RatisContainerSafeModeRule) exitRules.get(RATIS_CONTAINER_EXIT_RULE)) .getCurrentContainerThreshold(); } + @VisibleForTesting public double getCurrentECContainerThreshold() { return ((ECContainerSafeModeRule) exitRules.get(EC_CONTAINER_EXIT_RULE)) .getCurrentContainerThreshold(); } + @VisibleForTesting public RatisContainerSafeModeRule getRatisContainerSafeModeRule() { return (RatisContainerSafeModeRule) exitRules.get(RATIS_CONTAINER_EXIT_RULE); } + @VisibleForTesting public ECContainerSafeModeRule getECContainerSafeModeRule() { return (ECContainerSafeModeRule) exitRules.get(EC_CONTAINER_EXIT_RULE); } + @VisibleForTesting public HealthyPipelineSafeModeRule getHealthyPipelineSafeModeRule() { return (HealthyPipelineSafeModeRule) exitRules.get(HEALTHY_PIPELINE_EXIT_RULE); } + @VisibleForTesting public OneReplicaPipelineSafeModeRule getOneReplicaPipelineSafeModeRule() { return (OneReplicaPipelineSafeModeRule) exitRules.get(ATLEAST_ONE_DATANODE_REPORTED_PIPELINE_EXIT_RULE); From d2a973103c7ca72e90896a41f406816ba614fe3b Mon Sep 17 00:00:00 2001 From: peterxcli Date: Tue, 11 Mar 2025 13:11:06 +0000 Subject: [PATCH 6/8] Change to getContainer with replicattion type in initializeRule of ContainerSafeModeRules --- .../scm/safemode/ECContainerSafeModeRule.java | 11 +++-------- .../safemode/RatisContainerSafeModeRule.java | 11 +++-------- .../scm/safemode/TestSCMSafeModeManager.java | 19 ++++++++++--------- 3 files changed, 16 insertions(+), 25 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java index ad9133caf2de..eb8ed53db8c1 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java @@ -34,7 +34,6 @@ import org.apache.hadoop.hdds.client.ReplicationConfig; import org.apache.hadoop.hdds.conf.ConfigurationSource; import org.apache.hadoop.hdds.protocol.DatanodeDetails; -import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; import org.apache.hadoop.hdds.scm.container.ContainerID; @@ -199,13 +198,9 @@ private void recordReportedContainer(long containerID) { private void initializeRule() { ecContainers.clear(); ecContainerDNsMap.clear(); - containerManager.getContainers().forEach(container -> { - if (container.getReplicationType() == HddsProtos.ReplicationType.EC - && isClosed(container) - && container.getNumberOfKeys() > 0) { - ecContainers.add(container.getContainerID()); - } - }); + containerManager.getContainers(ReplicationType.EC).stream() + .filter(this::isClosed).filter(c -> c.getNumberOfKeys() > 0) + .map(ContainerInfo::getContainerID).forEach(ecContainers::add); ecMaxContainer = ecContainers.size(); long ecCutOff = (long) Math.ceil(ecMaxContainer * safeModeCutoff); getSafeModeMetrics().setNumContainerWithECDataReplicaReportedThreshold(ecCutOff); diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java index 65c3855918ad..e2ed583aad7a 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java @@ -28,7 +28,6 @@ import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Collectors; import org.apache.hadoop.hdds.conf.ConfigurationSource; -import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; import org.apache.hadoop.hdds.scm.container.ContainerID; @@ -152,13 +151,9 @@ private void recordReportedContainer(long containerID) { private void initializeRule() { ratisContainers.clear(); - containerManager.getContainers().forEach(container -> { - if (container.getReplicationType() == HddsProtos.ReplicationType.RATIS - && isClosed(container) - && container.getNumberOfKeys() > 0) { - ratisContainers.add(container.getContainerID()); - } - }); + containerManager.getContainers(ReplicationType.RATIS).stream() + .filter(this::isClosed).filter(c -> c.getNumberOfKeys() > 0) + .map(ContainerInfo::getContainerID).forEach(ratisContainers::add); ratisMaxContainer = ratisContainers.size(); long ratisCutOff = (long) Math.ceil(ratisMaxContainer * safeModeCutoff); getSafeModeMetrics().setNumContainerWithOneReplicaReportedThreshold(ratisCutOff); diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java index 326645ca131c..9a7e3d16a962 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java @@ -46,6 +46,7 @@ import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; import org.apache.hadoop.hdds.scm.HddsTestUtils; import org.apache.hadoop.hdds.scm.container.ContainerInfo; @@ -132,7 +133,7 @@ private void testSafeMode(int numContainers) throws Exception { container.setNumberOfKeys(10); } ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( config, containerManager, null, queue, serviceManager, scmContext); @@ -171,7 +172,7 @@ public void testSafeModeExitRule() throws Exception { container.setNumberOfKeys(10); } ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( config, containerManager, null, queue, serviceManager, scmContext); @@ -240,7 +241,7 @@ public void testHealthyPipelinePercentWithIncorrectValue(double healthyPercent, serviceManager, Clock.system(ZoneOffset.UTC)); ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, () -> new SCMSafeModeManager(conf, containerManager, pipelineManager, queue, serviceManager, scmContext)); @@ -306,7 +307,7 @@ public void testSafeModeExitRuleWithPipelineAvailabilityCheck( } ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( conf, containerManager, pipelineManager, queue, serviceManager, @@ -443,7 +444,7 @@ public void testDisableSafeMode() { conf.setBoolean(HddsConfigKeys.HDDS_SCM_SAFEMODE_ENABLED, false); PipelineManager pipelineManager = mock(PipelineManager.class); ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( conf, containerManager, pipelineManager, queue, serviceManager, scmContext); @@ -484,7 +485,7 @@ public void testContainerSafeModeRule() throws Exception { } ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( config, containerManager, null, queue, serviceManager, scmContext); @@ -588,7 +589,7 @@ private void testSafeModeDataNodes(int numOfDns) throws Exception { OzoneConfiguration conf = new OzoneConfiguration(config); conf.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, numOfDns); ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( conf, containerManager, null, queue, serviceManager, scmContext); @@ -701,7 +702,7 @@ public void testSafeModePipelineExitRule() throws Exception { pipeline = pipelineManager.getPipeline(pipeline.getId()); MockRatisPipelineProvider.markPipelineHealthy(pipeline); ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( config, containerManager, pipelineManager, queue, serviceManager, @@ -756,7 +757,7 @@ public void testPipelinesNotCreatedUntilPreCheckPasses() throws Exception { mockRatisProvider); ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( config, containerManager, pipelineManager, queue, serviceManager, From d5c65960a7c138eeaf2acb7b6edc769afd423152 Mon Sep 17 00:00:00 2001 From: peterxcli Date: Thu, 10 Apr 2025 08:53:01 +0000 Subject: [PATCH 7/8] clean up --- .../hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java index 1a2720a8f3e8..7ff3a6cfe27d 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java @@ -133,14 +133,11 @@ public double getCurrentContainerThreshold() { * @return MinReplica. */ private int getMinReplica(long pContainerID) { - try { ContainerID containerID = ContainerID.valueOf(pContainerID); ContainerInfo container = containerManager.getContainer(containerID); ReplicationConfig replicationConfig = container.getReplicationConfig(); return replicationConfig.getMinimumNodes(); - } catch (ContainerNotFoundException e) { - LOG.error("containerId = {} not found.", pContainerID, e); } catch (Exception e) { LOG.error("containerId = {} not found.", pContainerID, e); } From 2ac08b84b6ce74a4f74a2aed27072ac49e5bae6a Mon Sep 17 00:00:00 2001 From: peterxcli Date: Thu, 10 Apr 2025 16:49:14 +0000 Subject: [PATCH 8/8] Address review --- .../hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java | 6 +++--- .../hdds/scm/safemode/RatisContainerSafeModeRule.java | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java index 7ff3a6cfe27d..a6c30aace93f 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java @@ -54,6 +54,7 @@ public class ECContainerSafeModeRule extends SafeModeExitRule> containerDNsMap, UUID datanodeUUID) { - containerDNsMap.computeIfAbsent(containerID, key -> Sets.newHashSet()); - containerDNsMap.get(containerID).add(datanodeUUID); + containerDNsMap.computeIfAbsent(containerID, key -> Sets.newHashSet()).add(datanodeUUID); } /** diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java index d39d7ab8adf3..4015fd81b4df 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java @@ -134,7 +134,7 @@ protected void process(NodeRegistrationContainerReport report) { if (scmInSafeMode()) { SCMSafeModeManager.getLogger().info( "SCM in safe mode. {} % containers [Ratis] have at least one reported replica", - getCurrentContainerThreshold() * 100); + String.format("%.2f", getCurrentContainerThreshold() * 100)); } }