diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java deleted file mode 100644 index 4fec2e4a1798..000000000000 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hdds.scm.safemode; - -import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT; -import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.Sets; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.UUID; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; -import org.apache.hadoop.hdds.client.ReplicationConfig; -import org.apache.hadoop.hdds.conf.ConfigurationSource; -import org.apache.hadoop.hdds.protocol.DatanodeDetails; -import org.apache.hadoop.hdds.protocol.proto.HddsProtos; -import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; -import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; -import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; -import org.apache.hadoop.hdds.scm.container.ContainerID; -import org.apache.hadoop.hdds.scm.container.ContainerInfo; -import org.apache.hadoop.hdds.scm.container.ContainerManager; -import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; -import org.apache.hadoop.hdds.scm.events.SCMEvents; -import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport; -import org.apache.hadoop.hdds.server.events.EventQueue; -import org.apache.hadoop.hdds.server.events.TypedEvent; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Class defining Safe mode exit criteria for Containers. - */ -public class ContainerSafeModeRule extends - SafeModeExitRule { - - private static final Logger LOG = LoggerFactory.getLogger(ContainerSafeModeRule.class); - - private static final String NAME = "ContainerSafeModeRule"; - - private final ContainerManager containerManager; - // Required cutoff % for containers with at least 1 reported replica. - private final double safeModeCutoff; - // Containers read from scm db (excluding containers in ALLOCATED state). - private final Set ratisContainers; - private final Set ecContainers; - private final Map> ecContainerDNsMap; - private final AtomicLong ratisContainerWithMinReplicas = new AtomicLong(0); - private final AtomicLong ecContainerWithMinReplicas = new AtomicLong(0); - - private double ratisMaxContainer; - private double ecMaxContainer; - - public ContainerSafeModeRule(final EventQueue eventQueue, - final ConfigurationSource conf, - final ContainerManager containerManager, - final SCMSafeModeManager manager) { - super(manager, NAME, eventQueue); - this.safeModeCutoff = getSafeModeCutoff(conf); - this.containerManager = containerManager; - this.ratisContainers = new HashSet<>(); - this.ecContainers = new HashSet<>(); - this.ecContainerDNsMap = new ConcurrentHashMap<>(); - initializeRule(); - } - - - private static double getSafeModeCutoff(ConfigurationSource conf) { - final double cutoff = conf.getDouble(HDDS_SCM_SAFEMODE_THRESHOLD_PCT, - HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT); - Preconditions.checkArgument((cutoff >= 0.0 && cutoff <= 1.0), - HDDS_SCM_SAFEMODE_THRESHOLD_PCT + - " value should be >= 0.0 and <= 1.0"); - return cutoff; - } - - @Override - protected TypedEvent getEventType() { - return SCMEvents.CONTAINER_REGISTRATION_REPORT; - } - - @Override - protected synchronized boolean validate() { - if (validateBasedOnReportProcessing()) { - return (getCurrentContainerThreshold() >= safeModeCutoff) && - (getCurrentECContainerThreshold() >= safeModeCutoff); - } - - // TODO: Split ContainerSafeModeRule into RatisContainerSafeModeRule and - // ECContainerSafeModeRule - final List containers = containerManager.getContainers( - ReplicationType.RATIS); - - return containers.stream() - .filter(this::isClosed) - .map(ContainerInfo::containerID) - .noneMatch(this::isMissing); - } - - /** - * Checks if the container has any replica. - */ - private boolean isMissing(ContainerID id) { - try { - return containerManager.getContainerReplicas(id).isEmpty(); - } catch (ContainerNotFoundException ex) { - /* - * This should never happen, in case this happens the container - * somehow got removed from SCM. - * Safemode rule doesn't have to log/fix this. We will just exclude this - * from the rule validation. - */ - return false; - - } - } - - @VisibleForTesting - public double getCurrentContainerThreshold() { - return ratisMaxContainer == 0 ? 1 : - (ratisContainerWithMinReplicas.doubleValue() / ratisMaxContainer); - } - - @VisibleForTesting - public double getCurrentECContainerThreshold() { - return ecMaxContainer == 0 ? 1 : - (ecContainerWithMinReplicas.doubleValue() / ecMaxContainer); - } - - - // TODO: Report processing logic will be removed in future. HDDS-11958. - @Override - protected synchronized void process( - final NodeRegistrationContainerReport reportsProto) { - final DatanodeDetails datanodeDetails = reportsProto.getDatanodeDetails(); - final UUID datanodeUUID = datanodeDetails.getUuid(); - StorageContainerDatanodeProtocolProtos.ContainerReportsProto report = reportsProto.getReport(); - - report.getReportsList().forEach(c -> { - long containerID = c.getContainerID(); - - - // If it is a Ratis container. - if (ratisContainers.contains(containerID)) { - recordReportedContainer(containerID, Boolean.FALSE); - ratisContainers.remove(containerID); - } - - // If it is an EC container. - if (ecContainers.contains(containerID)) { - putInContainerDNsMap(containerID, ecContainerDNsMap, datanodeUUID); - recordReportedContainer(containerID, Boolean.TRUE); - } - }); - - if (scmInSafeMode()) { - SCMSafeModeManager.getLogger().info( - "SCM in safe mode. {} % containers [Ratis] have at least one" - + " reported replica, {} % containers [EC] have at N reported replica.", - getCurrentContainerThreshold() * 100, getCurrentECContainerThreshold() * 100); - } - } - - /** - * Record the reported Container. - * - * We will differentiate and count according to the type of Container. - * - * @param containerID containerID - * @param isEcContainer true, means ECContainer, false, means not ECContainer. - */ - private void recordReportedContainer(long containerID, boolean isEcContainer) { - - int uuids = 1; - if (isEcContainer && ecContainerDNsMap.containsKey(containerID)) { - uuids = ecContainerDNsMap.get(containerID).size(); - } - - int minReplica = getMinReplica(containerID); - if (uuids >= minReplica) { - if (isEcContainer) { - getSafeModeMetrics() - .incCurrentContainersWithECDataReplicaReportedCount(); - ecContainerWithMinReplicas.getAndAdd(1); - } else { - ratisContainerWithMinReplicas.getAndAdd(1); - getSafeModeMetrics() - .incCurrentContainersWithOneReplicaReportedCount(); - } - } - } - - /** - * Get the minimum replica. - * - * If it is a Ratis Contianer, the minimum copy is 1. - * If it is an EC Container, the minimum copy will be the number of Data in replicationConfig. - * - * @param pContainerID containerID - * @return MinReplica. - */ - private int getMinReplica(long pContainerID) { - - try { - ContainerID containerID = ContainerID.valueOf(pContainerID); - ContainerInfo container = containerManager.getContainer(containerID); - ReplicationConfig replicationConfig = container.getReplicationConfig(); - return replicationConfig.getMinimumNodes(); - } catch (ContainerNotFoundException e) { - LOG.error("containerId = {} not found.", pContainerID, e); - } catch (Exception e) { - LOG.error("containerId = {} not found.", pContainerID, e); - } - - return 1; - } - - private void putInContainerDNsMap(long containerID, Map> containerDNsMap, - UUID datanodeUUID) { - containerDNsMap.computeIfAbsent(containerID, key -> Sets.newHashSet()); - containerDNsMap.get(containerID).add(datanodeUUID); - } - - @Override - protected synchronized void cleanup() { - ratisContainers.clear(); - ecContainers.clear(); - ecContainerDNsMap.clear(); - } - - @Override - public String getStatusText() { - - // ratis container - String status = String.format( - "%1.2f%% of [Ratis] Containers(%s / %s) with at least one reported replica (=%1.2f) >= " + - "safeModeCutoff (=%1.2f);", - getCurrentContainerThreshold() * 100, - ratisContainerWithMinReplicas, (long) ratisMaxContainer, - getCurrentContainerThreshold(), this.safeModeCutoff); - - Set sampleRatisContainers = ratisContainers.stream(). - limit(SAMPLE_CONTAINER_DISPLAY_LIMIT). - collect(Collectors.toSet()); - - if (!sampleRatisContainers.isEmpty()) { - String sampleContainerText = - "Sample Ratis Containers not satisfying the criteria : " + sampleRatisContainers + ";"; - status = status.concat("\n").concat(sampleContainerText); - } - - // ec container - String ecStatus = String.format( - "%1.2f%% of [EC] Containers(%s / %s) with at least N reported replica (=%1.2f) >= " + - "safeModeCutoff (=%1.2f);", - getCurrentECContainerThreshold() * 100, - ecContainerWithMinReplicas, (long) ecMaxContainer, - getCurrentECContainerThreshold(), this.safeModeCutoff); - status = status.concat("\n").concat(ecStatus); - - Set sampleEcContainers = ecContainerDNsMap.entrySet().stream(). - filter(entry -> { - Long containerId = entry.getKey(); - int minReplica = getMinReplica(containerId); - Set allReplicas = entry.getValue(); - if (allReplicas.size() >= minReplica) { - return false; - } - return true; - }). - map(Map.Entry::getKey). - limit(SAMPLE_CONTAINER_DISPLAY_LIMIT). - collect(Collectors.toSet()); - - if (!sampleEcContainers.isEmpty()) { - String sampleECContainerText = - "Sample EC Containers not satisfying the criteria : " + sampleEcContainers + ";"; - status = status.concat("\n").concat(sampleECContainerText); - } - - return status; - } - - - @Override - public synchronized void refresh(boolean forceRefresh) { - if (forceRefresh || !validate()) { - initializeRule(); - } - } - - private boolean isClosed(ContainerInfo container) { - final LifeCycleState state = container.getState(); - return state == LifeCycleState.QUASI_CLOSED || - state == LifeCycleState.CLOSED; - } - - private void initializeRule() { - final List containers = containerManager.getContainers(); - // Clean up the related data in the map. - ratisContainers.clear(); - ecContainers.clear(); - - // Iterate through the container list to - // get the minimum replica count for each container. - containers.forEach(container -> { - // There can be containers in OPEN/CLOSING state which were never - // created by the client. We are not considering these containers for - // now. These containers can be handled by tracking pipelines. - - HddsProtos.ReplicationType replicationType = container.getReplicationType(); - - if (isClosed(container) && container.getNumberOfKeys() > 0) { - // If it's of type Ratis - if (replicationType.equals(HddsProtos.ReplicationType.RATIS)) { - ratisContainers.add(container.getContainerID()); - } - - // If it's of type EC - if (replicationType.equals(HddsProtos.ReplicationType.EC)) { - ecContainers.add(container.getContainerID()); - } - } - }); - - ratisMaxContainer = ratisContainers.size(); - ecMaxContainer = ecContainers.size(); - - long ratisCutOff = (long) Math.ceil(ratisMaxContainer * safeModeCutoff); - long ecCutOff = (long) Math.ceil(ecMaxContainer * safeModeCutoff); - - getSafeModeMetrics().setNumContainerWithOneReplicaReportedThreshold(ratisCutOff); - getSafeModeMetrics().setNumContainerWithECDataReplicaReportedThreshold(ecCutOff); - - LOG.info("Refreshed Containers with one replica threshold count {}, " + - "with ec n replica threshold count {}.", ratisCutOff, ecCutOff); - } -} diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java new file mode 100644 index 000000000000..a6c30aace93f --- /dev/null +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java @@ -0,0 +1,249 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hdds.scm.safemode; + +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.collect.Sets; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import org.apache.hadoop.hdds.client.ReplicationConfig; +import org.apache.hadoop.hdds.conf.ConfigurationSource; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; +import org.apache.hadoop.hdds.scm.container.ContainerID; +import org.apache.hadoop.hdds.scm.container.ContainerInfo; +import org.apache.hadoop.hdds.scm.container.ContainerManager; +import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; +import org.apache.hadoop.hdds.scm.events.SCMEvents; +import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport; +import org.apache.hadoop.hdds.server.events.EventQueue; +import org.apache.hadoop.hdds.server.events.TypedEvent; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Safe mode rule for EC containers. + */ +public class ECContainerSafeModeRule extends SafeModeExitRule { + + private static final Logger LOG = LoggerFactory.getLogger(ECContainerSafeModeRule.class); + private static final String NAME = "ECContainerSafeModeRule"; + private static final int DEFAULT_MIN_REPLICA = 1; + + private final ContainerManager containerManager; + private final double safeModeCutoff; + private final Set ecContainers; + private final Map> ecContainerDNsMap; + private final AtomicLong ecContainerWithMinReplicas; + private double ecMaxContainer; + + public ECContainerSafeModeRule(EventQueue eventQueue, + ConfigurationSource conf, + ContainerManager containerManager, + SCMSafeModeManager manager) { + super(manager, NAME, eventQueue); + this.safeModeCutoff = getSafeModeCutoff(conf); + this.containerManager = containerManager; + this.ecContainers = new HashSet<>(); + this.ecContainerDNsMap = new ConcurrentHashMap<>(); + this.ecContainerWithMinReplicas = new AtomicLong(0); + initializeRule(); + } + + private static double getSafeModeCutoff(ConfigurationSource conf) { + final double cutoff = conf.getDouble(HDDS_SCM_SAFEMODE_THRESHOLD_PCT, + HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT); + Preconditions.checkArgument((cutoff >= 0.0 && cutoff <= 1.0), + HDDS_SCM_SAFEMODE_THRESHOLD_PCT + " value should be >= 0.0 and <= 1.0"); + return cutoff; + } + + @Override + protected TypedEvent getEventType() { + return SCMEvents.CONTAINER_REGISTRATION_REPORT; + } + + @Override + protected synchronized boolean validate() { + if (validateBasedOnReportProcessing()) { + return getCurrentContainerThreshold() >= safeModeCutoff; + } + + final List containers = containerManager.getContainers( + ReplicationType.EC); + + return containers.stream() + .filter(this::isClosed) + .map(ContainerInfo::containerID) + .noneMatch(this::isMissing); + } + + /** + * Checks if the container has at least the minimum required number of replicas. + */ + private boolean isMissing(ContainerID id) { + try { + int minReplica = getMinReplica(id.getId()); + return containerManager.getContainerReplicas(id).size() < minReplica; + } catch (ContainerNotFoundException ex) { + /* + * This should never happen, in case this happens the container + * somehow got removed from SCM. + * Safemode rule doesn't have to log/fix this. We will just exclude this + * from the rule validation. + */ + return false; + } + } + + @VisibleForTesting + public double getCurrentContainerThreshold() { + return ecMaxContainer == 0 ? 1 : (ecContainerWithMinReplicas.doubleValue() / ecMaxContainer); + } + + /** + * Get the minimum replica. + * + * @param pContainerID containerID + * @return MinReplica. + */ + private int getMinReplica(long pContainerID) { + try { + ContainerID containerID = ContainerID.valueOf(pContainerID); + ContainerInfo container = containerManager.getContainer(containerID); + ReplicationConfig replicationConfig = container.getReplicationConfig(); + return replicationConfig.getMinimumNodes(); + } catch (Exception e) { + LOG.error("containerId = {} not found.", pContainerID, e); + } + + return DEFAULT_MIN_REPLICA; + } + + @Override + protected void process(NodeRegistrationContainerReport report) { + DatanodeDetails datanodeDetails = report.getDatanodeDetails(); + UUID datanodeUUID = datanodeDetails.getUuid(); + + report.getReport().getReportsList().forEach(c -> { + long containerID = c.getContainerID(); + if (ecContainers.contains(containerID)) { + putInContainerDNsMap(containerID, ecContainerDNsMap, datanodeUUID); + recordReportedContainer(containerID); + } + }); + + if (scmInSafeMode()) { + SCMSafeModeManager.getLogger().info( + "SCM in safe mode. {} % containers [EC] have at N reported replica", + getCurrentContainerThreshold() * 100); + } + } + + private void putInContainerDNsMap(long containerID, + Map> containerDNsMap, + UUID datanodeUUID) { + containerDNsMap.computeIfAbsent(containerID, key -> Sets.newHashSet()).add(datanodeUUID); + } + + /** + * Record the reported Container. + * + * @param containerID containerID + */ + private void recordReportedContainer(long containerID) { + + int uuids = 1; + if (ecContainerDNsMap.containsKey(containerID)) { + uuids = ecContainerDNsMap.get(containerID).size(); + } + + int minReplica = getMinReplica(containerID); + if (uuids >= minReplica) { + getSafeModeMetrics() + .incCurrentContainersWithECDataReplicaReportedCount(); + ecContainerWithMinReplicas.getAndAdd(1); + } + } + + private void initializeRule() { + ecContainers.clear(); + ecContainerDNsMap.clear(); + containerManager.getContainers(ReplicationType.EC).stream() + .filter(this::isClosed).filter(c -> c.getNumberOfKeys() > 0) + .map(ContainerInfo::getContainerID).forEach(ecContainers::add); + ecMaxContainer = ecContainers.size(); + long ecCutOff = (long) Math.ceil(ecMaxContainer * safeModeCutoff); + getSafeModeMetrics().setNumContainerWithECDataReplicaReportedThreshold(ecCutOff); + + LOG.info("Refreshed Containers with ec n replica threshold count {}.", ecCutOff); + } + + private boolean isClosed(ContainerInfo container) { + final LifeCycleState state = container.getState(); + return state == LifeCycleState.QUASI_CLOSED || state == LifeCycleState.CLOSED; + } + + @Override + public String getStatusText() { + String status = String.format( + "%1.2f%% of [EC] Containers(%s / %s) with at least N reported replica (=%1.2f) >= " + + "safeModeCutoff (=%1.2f);", + getCurrentContainerThreshold() * 100, + ecContainerWithMinReplicas, (long) ecMaxContainer, + getCurrentContainerThreshold(), this.safeModeCutoff); + + Set sampleEcContainers = ecContainerDNsMap.entrySet().stream().filter(entry -> { + Long containerId = entry.getKey(); + int minReplica = getMinReplica(containerId); + Set allReplicas = entry.getValue(); + return allReplicas.size() < minReplica; + }).map(Map.Entry::getKey).limit(SAMPLE_CONTAINER_DISPLAY_LIMIT).collect(Collectors.toSet()); + + if (!sampleEcContainers.isEmpty()) { + String sampleECContainerText = "Sample EC Containers not satisfying the criteria : " + sampleEcContainers + ";"; + status = status.concat("\n").concat(sampleECContainerText); + } + + return status; + } + + @Override + public synchronized void refresh(boolean forceRefresh) { + if (forceRefresh || !validate()) { + initializeRule(); + } + } + + @Override + protected void cleanup() { + ecContainers.clear(); + ecContainerDNsMap.clear(); + } +} diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java new file mode 100644 index 000000000000..4015fd81b4df --- /dev/null +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hdds.scm.safemode; + +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT; +import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import org.apache.hadoop.hdds.conf.ConfigurationSource; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; +import org.apache.hadoop.hdds.scm.container.ContainerID; +import org.apache.hadoop.hdds.scm.container.ContainerInfo; +import org.apache.hadoop.hdds.scm.container.ContainerManager; +import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; +import org.apache.hadoop.hdds.scm.events.SCMEvents; +import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport; +import org.apache.hadoop.hdds.server.events.EventQueue; +import org.apache.hadoop.hdds.server.events.TypedEvent; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * Class defining Safe mode exit criteria for Ratis Containers. + */ +public class RatisContainerSafeModeRule extends SafeModeExitRule { + + private static final Logger LOG = LoggerFactory.getLogger(RatisContainerSafeModeRule.class); + private static final String NAME = "RatisContainerSafeModeRule"; + + private final ContainerManager containerManager; + // Required cutoff % for containers with at least 1 reported replica. + private final double safeModeCutoff; + // Containers read from scm db (excluding containers in ALLOCATED state). + private final Set ratisContainers; + private final AtomicLong ratisContainerWithMinReplicas; + private double ratisMaxContainer; + + public RatisContainerSafeModeRule(EventQueue eventQueue, + ConfigurationSource conf, + ContainerManager containerManager, + SCMSafeModeManager manager) { + super(manager, NAME, eventQueue); + this.safeModeCutoff = getSafeModeCutoff(conf); + this.containerManager = containerManager; + this.ratisContainers = new HashSet<>(); + this.ratisContainerWithMinReplicas = new AtomicLong(0); + initializeRule(); + } + + private static double getSafeModeCutoff(ConfigurationSource conf) { + final double cutoff = conf.getDouble(HDDS_SCM_SAFEMODE_THRESHOLD_PCT, + HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT); + Preconditions.checkArgument((cutoff >= 0.0 && cutoff <= 1.0), + HDDS_SCM_SAFEMODE_THRESHOLD_PCT + " value should be >= 0.0 and <= 1.0"); + return cutoff; + } + + @Override + protected TypedEvent getEventType() { + return SCMEvents.CONTAINER_REGISTRATION_REPORT; + } + + @Override + protected synchronized boolean validate() { + if (validateBasedOnReportProcessing()) { + return (getCurrentContainerThreshold() >= safeModeCutoff); + } + + final List containers = containerManager.getContainers( + ReplicationType.RATIS); + + return containers.stream() + .filter(this::isClosed) + .map(ContainerInfo::containerID) + .noneMatch(this::isMissing); + } + + /** + * Checks if the container has any replica. + */ + private boolean isMissing(ContainerID id) { + try { + return containerManager.getContainerReplicas(id).isEmpty(); + } catch (ContainerNotFoundException ex) { + /* + * This should never happen, in case this happens the container + * somehow got removed from SCM. + * Safemode rule doesn't have to log/fix this. We will just exclude this + * from the rule validation. + */ + return false; + + } + } + + @VisibleForTesting + public double getCurrentContainerThreshold() { + return ratisMaxContainer == 0 ? 1 : (ratisContainerWithMinReplicas.doubleValue() / ratisMaxContainer); + } + + @Override + protected void process(NodeRegistrationContainerReport report) { + report.getReport().getReportsList().forEach(c -> { + long containerID = c.getContainerID(); + if (ratisContainers.contains(containerID)) { + recordReportedContainer(containerID); + ratisContainers.remove(containerID); + } + }); + + if (scmInSafeMode()) { + SCMSafeModeManager.getLogger().info( + "SCM in safe mode. {} % containers [Ratis] have at least one reported replica", + String.format("%.2f", getCurrentContainerThreshold() * 100)); + } + } + + /** + * Record the reported Container. + * + * @param containerID containerID + */ + private void recordReportedContainer(long containerID) { + ratisContainerWithMinReplicas.getAndAdd(1); + getSafeModeMetrics() + .incCurrentContainersWithOneReplicaReportedCount(); + } + + private void initializeRule() { + ratisContainers.clear(); + containerManager.getContainers(ReplicationType.RATIS).stream() + .filter(this::isClosed).filter(c -> c.getNumberOfKeys() > 0) + .map(ContainerInfo::getContainerID).forEach(ratisContainers::add); + ratisMaxContainer = ratisContainers.size(); + long ratisCutOff = (long) Math.ceil(ratisMaxContainer * safeModeCutoff); + getSafeModeMetrics().setNumContainerWithOneReplicaReportedThreshold(ratisCutOff); + + LOG.info("Refreshed Containers with one replica threshold count {}.", ratisCutOff); + } + + private boolean isClosed(ContainerInfo container) { + final LifeCycleState state = container.getState(); + return state == LifeCycleState.QUASI_CLOSED || state == LifeCycleState.CLOSED; + } + + @Override + public String getStatusText() { + String status = String.format( + "%1.2f%% of [Ratis] Containers(%s / %s) with at least one reported replica (=%1.2f) >= " + + "safeModeCutoff (=%1.2f);", + getCurrentContainerThreshold() * 100, + ratisContainerWithMinReplicas, (long) ratisMaxContainer, + getCurrentContainerThreshold(), this.safeModeCutoff); + + Set sampleRatisContainers = ratisContainers.stream().limit(SAMPLE_CONTAINER_DISPLAY_LIMIT) + .collect(Collectors.toSet()); + + if (!sampleRatisContainers.isEmpty()) { + String sampleContainerText = "Sample Ratis Containers not satisfying the criteria : " + sampleRatisContainers + + ";"; + status = status.concat("\n").concat(sampleContainerText); + } + + return status; + } + + @Override + public synchronized void refresh(boolean forceRefresh) { + if (forceRefresh || !validate()) { + initializeRule(); + } + } + + @Override + protected void cleanup() { + ratisContainers.clear(); + } +} diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java index 63784599251f..9dd79ca81595 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java @@ -91,8 +91,9 @@ public class SCMSafeModeManager implements SafeModeManager { private Map exitRules = new HashMap<>(1); private Set preCheckRules = new HashSet<>(1); private ConfigurationSource config; + private static final String RATIS_CONTAINER_EXIT_RULE = "RatisContainerSafeModeRule"; + private static final String EC_CONTAINER_EXIT_RULE = "ECContainerSafeModeRule"; private static final String DN_EXIT_RULE = "DataNodeSafeModeRule"; - private static final String CONT_EXIT_RULE = "ContainerSafeModeRule"; private static final String HEALTHY_PIPELINE_EXIT_RULE = "HealthyPipelineSafeModeRule"; private static final String ATLEAST_ONE_DATANODE_REPORTED_PIPELINE_EXIT_RULE = @@ -319,19 +320,24 @@ public static Logger getLogger() { @VisibleForTesting public double getCurrentContainerThreshold() { - return ((ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE)) + return ((RatisContainerSafeModeRule) exitRules.get(RATIS_CONTAINER_EXIT_RULE)) .getCurrentContainerThreshold(); } @VisibleForTesting public double getCurrentECContainerThreshold() { - return ((ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE)) - .getCurrentECContainerThreshold(); + return ((ECContainerSafeModeRule) exitRules.get(EC_CONTAINER_EXIT_RULE)) + .getCurrentContainerThreshold(); + } + + @VisibleForTesting + public RatisContainerSafeModeRule getRatisContainerSafeModeRule() { + return (RatisContainerSafeModeRule) exitRules.get(RATIS_CONTAINER_EXIT_RULE); } @VisibleForTesting - public ContainerSafeModeRule getContainerSafeModeRule() { - return (ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE); + public ECContainerSafeModeRule getECContainerSafeModeRule() { + return (ECContainerSafeModeRule) exitRules.get(EC_CONTAINER_EXIT_RULE); } @VisibleForTesting diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java index 65be38ae6ef9..96c22d06e7db 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java @@ -68,12 +68,15 @@ private SafeModeRuleFactory(final ConfigurationSource config, private void loadRules() { // TODO: Use annotation to load the rules. (HDDS-11730) - SafeModeExitRule containerRule = new ContainerSafeModeRule(eventQueue, + SafeModeExitRule ratisContainerRule = new RatisContainerSafeModeRule(eventQueue, + config, containerManager, safeModeManager); + SafeModeExitRule ecContainerRule = new ECContainerSafeModeRule(eventQueue, config, containerManager, safeModeManager); SafeModeExitRule datanodeRule = new DataNodeSafeModeRule(eventQueue, config, nodeManager, safeModeManager); - safeModeRules.add(containerRule); + safeModeRules.add(ratisContainerRule); + safeModeRules.add(ecContainerRule); safeModeRules.add(datanodeRule); preCheckRules.add(datanodeRule); diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java index f693df81dc3f..7a43792eb5db 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java @@ -44,6 +44,7 @@ import org.apache.hadoop.hdds.protocol.DatanodeDetails; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType; import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; import org.apache.hadoop.hdds.scm.HddsTestUtils; import org.apache.hadoop.hdds.scm.container.ContainerInfo; @@ -130,7 +131,7 @@ private void testSafeMode(int numContainers) throws Exception { container.setNumberOfKeys(10); } ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( config, containerManager, null, null, queue, serviceManager, scmContext); @@ -169,7 +170,7 @@ public void testSafeModeExitRule() throws Exception { container.setNumberOfKeys(10); } ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( config, containerManager, null, null, queue, serviceManager, scmContext); @@ -235,7 +236,7 @@ public void testHealthyPipelinePercentWithIncorrectValue(double healthyPercent, serviceManager, Clock.system(ZoneOffset.UTC)); ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, () -> new SCMSafeModeManager(conf, containerManager, pipelineManager, mockNodeManager, queue, serviceManager, scmContext)); @@ -301,7 +302,7 @@ public void testSafeModeExitRuleWithPipelineAvailabilityCheck( } ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( conf, containerManager, pipelineManager, mockNodeManager, queue, @@ -438,8 +439,8 @@ public void testDisableSafeMode() { conf.setBoolean(HddsConfigKeys.HDDS_SCM_SAFEMODE_ENABLED, false); PipelineManager pipelineManager = mock(PipelineManager.class); ContainerManager containerManager = mock(ContainerManager.class); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); NodeManager nodeManager = mock(SCMNodeManager.class); - when(containerManager.getContainers()).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( conf, containerManager, pipelineManager, nodeManager, queue, serviceManager, scmContext); @@ -480,7 +481,7 @@ public void testContainerSafeModeRule() throws Exception { } ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( config, containerManager, null, null, queue, serviceManager, scmContext); @@ -572,16 +573,20 @@ public void testContainerSafeModeRuleEC(int data, int parity) throws Exception { // the threshold will reach 100%. testECContainerThreshold(containers.subList(10, 20), 1.0, data); - ContainerSafeModeRule containerSafeModeRule = - scmSafeModeManager.getContainerSafeModeRule(); - assertTrue(containerSafeModeRule.validate()); + ECContainerSafeModeRule ecContainerSafeModeRule = + scmSafeModeManager.getECContainerSafeModeRule(); + assertTrue(ecContainerSafeModeRule.validate()); + + RatisContainerSafeModeRule ratisContainerSafeModeRule = + scmSafeModeManager.getRatisContainerSafeModeRule(); + assertTrue(ratisContainerSafeModeRule.validate()); } private void testSafeModeDataNodes(int numOfDns) throws Exception { OzoneConfiguration conf = new OzoneConfiguration(config); conf.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, numOfDns); ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( conf, containerManager, null, null, queue, serviceManager, scmContext); @@ -689,7 +694,7 @@ public void testSafeModePipelineExitRule() throws Exception { pipeline = pipelineManager.getPipeline(pipeline.getId()); MockRatisPipelineProvider.markPipelineHealthy(pipeline); ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( config, containerManager, pipelineManager, nodeManager, queue, @@ -738,7 +743,7 @@ public void testPipelinesNotCreatedUntilPreCheckPasses() throws Exception { mockRatisProvider); ContainerManager containerManager = mock(ContainerManager.class); - when(containerManager.getContainers()).thenReturn(containers); + when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers); scmSafeModeManager = new SCMSafeModeManager( config, containerManager, pipelineManager, nodeManager, queue, diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java index 1868d9e3cbcb..35917039d50c 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java @@ -55,7 +55,7 @@ public void testLoadedSafeModeRules() { // as the rules are hardcoded in SafeModeRuleFactory. // This will be fixed once we load rules using annotation. - assertEquals(4, factory.getSafeModeRules().size(), + assertEquals(5, factory.getSafeModeRules().size(), "The total safemode rules count doesn't match"); }