diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManagerReport.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManagerReport.java index 1dbbc7384321..a10846bd61d0 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManagerReport.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManagerReport.java @@ -74,7 +74,10 @@ public enum HealthState { "OpenUnhealthyContainers"), QUASI_CLOSED_STUCK( "Containers QuasiClosed with insufficient datanode origins", - "StuckQuasiClosedContainers"); + "StuckQuasiClosedContainers"), + OPEN_WITHOUT_PIPELINE( + "Containers in OPEN state without any healthy Pipeline", + "OpenContainersWithoutPipeline"); private String description; private String metricName; diff --git a/hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManagerReport.java b/hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManagerReport.java index 3bf2ef402315..f022a6030c03 100644 --- a/hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManagerReport.java +++ b/hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManagerReport.java @@ -112,6 +112,7 @@ void testJsonOutput() throws IOException { assertEquals(0, stats.get("EMPTY").longValue()); assertEquals(0, stats.get("OPEN_UNHEALTHY").longValue()); assertEquals(0, stats.get("QUASI_CLOSED_STUCK").longValue()); + assertEquals(0, stats.get("OPEN_WITHOUT_PIPELINE").longValue()); JsonNode samples = json.get("samples"); assertEquals(ARRAY, samples.get("UNDER_REPLICATED").getNodeType()); diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManagerImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManagerImpl.java index 72d90abe1f4f..62e1f0193561 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManagerImpl.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ContainerStateManagerImpl.java @@ -251,16 +251,12 @@ private void initialize() throws IOException { pipelineManager.addContainerToPipelineSCMStart( container.getPipelineID(), container.containerID()); } catch (PipelineNotFoundException ex) { + // We are ignoring this here. The container will be moved to + // CLOSING state by ReplicationManager's OpenContainerHandler + // For more info: HDDS-10231 LOG.warn("Found container {} which is in OPEN state with " + "pipeline {} that does not exist. Marking container for " + "closing.", container, container.getPipelineID()); - try { - updateContainerState(container.containerID().getProtobuf(), - LifeCycleEvent.FINALIZE); - } catch (InvalidStateTransitionException e) { - // This cannot happen. - LOG.warn("Unable to finalize Container {}.", container); - } } } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java index 979cff799fa5..a3661243be69 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java @@ -61,6 +61,7 @@ import org.apache.hadoop.hdds.scm.node.NodeManager; import org.apache.hadoop.hdds.scm.node.NodeStatus; import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException; +import org.apache.hadoop.hdds.scm.pipeline.PipelineNotFoundException; import org.apache.hadoop.hdds.scm.server.StorageContainerManager; import org.apache.hadoop.hdds.server.events.EventPublisher; import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException; @@ -1545,5 +1546,14 @@ private int getRemainingMaintenanceRedundancy(boolean isEC) { private static boolean isEC(ReplicationConfig replicationConfig) { return replicationConfig.getReplicationType() == EC; } + + public boolean hasHealthyPipeline(ContainerInfo container) { + try { + return scmContext.getScm().getPipelineManager() + .getPipeline(container.getPipelineID()) != null; + } catch (PipelineNotFoundException e) { + return false; + } + } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/OpenContainerHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/OpenContainerHandler.java index 2c0b405db972..21c3c76d3e97 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/OpenContainerHandler.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/OpenContainerHandler.java @@ -53,20 +53,26 @@ public boolean handle(ContainerCheckRequest request) { if (containerInfo.getState() == HddsProtos.LifeCycleState.OPEN) { LOG.debug("Checking open container {} in OpenContainerHandler", containerInfo); - if (!isOpenContainerHealthy( - containerInfo, request.getContainerReplicas())) { - // This is an unhealthy open container, so we need to trigger the - // close process on it. - LOG.debug("Container {} is open but unhealthy. Triggering close.", - containerInfo); - request.getReport().incrementAndSample( - ReplicationManagerReport.HealthState.OPEN_UNHEALTHY, + final boolean noPipeline = !replicationManager.hasHealthyPipeline(containerInfo); + // Minor optimization. If noPipeline is true, isOpenContainerHealthy will not + // be called. + final boolean unhealthy = noPipeline || !isOpenContainerHealthy(containerInfo, + request.getContainerReplicas()); + if (unhealthy) { + // For an OPEN container, we close the container + // if the container has no Pipeline or if the container is unhealthy. + LOG.info("Container {} is open but {}. Triggering close.", + containerInfo, noPipeline ? "has no Pipeline" : "unhealthy"); + + request.getReport().incrementAndSample(noPipeline ? + ReplicationManagerReport.HealthState.OPEN_WITHOUT_PIPELINE : + ReplicationManagerReport.HealthState.OPEN_UNHEALTHY, containerInfo.containerID()); + if (!request.isReadOnly()) { replicationManager .sendCloseContainerEvent(containerInfo.containerID()); } - return true; } // For open containers we do not want to do any further processing in RM // so return true to stop the command chain. diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java index c67008c097ba..47844f32fb0d 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManager.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto; import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto; +import org.apache.hadoop.hdds.scm.HddsTestUtils; import org.apache.hadoop.hdds.scm.PlacementPolicy; import org.apache.hadoop.hdds.scm.container.ContainerID; import org.apache.hadoop.hdds.scm.container.ContainerInfo; @@ -43,6 +44,9 @@ import org.apache.hadoop.hdds.scm.node.NodeManager; import org.apache.hadoop.hdds.scm.node.NodeStatus; import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException; +import org.apache.hadoop.hdds.scm.pipeline.PipelineManager; +import org.apache.hadoop.hdds.scm.server.StorageContainerManager; +import org.apache.hadoop.hdds.security.token.ContainerTokenGenerator; import org.apache.hadoop.hdds.server.events.EventPublisher; import org.apache.hadoop.ozone.protocol.commands.DeleteContainerCommand; import org.apache.hadoop.ozone.protocol.commands.ReconstructECContainersCommand; @@ -174,6 +178,16 @@ public void setup() throws IOException { // Ensure that RM will run when asked. when(scmContext.isLeaderReady()).thenReturn(true); when(scmContext.isInSafeMode()).thenReturn(false); + + PipelineManager pipelineManager = mock(PipelineManager.class); + when(pipelineManager.getPipeline(any())) + .thenReturn(HddsTestUtils.getRandomPipeline()); + + StorageContainerManager scm = mock(StorageContainerManager.class); + when(scm.getPipelineManager()).thenReturn(pipelineManager); + when(scm.getContainerTokenGenerator()).thenReturn(ContainerTokenGenerator.DISABLED); + + when(scmContext.getScm()).thenReturn(scm); } private ReplicationManager createReplicationManager() throws IOException { diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestOpenContainerHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestOpenContainerHandler.java index a950008ec9f7..dec61610d1ed 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestOpenContainerHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestOpenContainerHandler.java @@ -24,17 +24,20 @@ import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.ContainerReplica; import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport; +import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport.HealthState; import org.apache.hadoop.hdds.scm.container.replication.ContainerCheckRequest; import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager; import org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.mockito.Mockito; import java.util.Collections; import java.util.Set; import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.CLOSED; import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.OPEN; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.mockito.Mockito.mock; @@ -58,6 +61,7 @@ public void setup() { ratisReplicationConfig = RatisReplicationConfig.getInstance( HddsProtos.ReplicationFactor.THREE); replicationManager = mock(ReplicationManager.class); + Mockito.when(replicationManager.hasHealthyPipeline(any())).thenReturn(true); openContainerHandler = new OpenContainerHandler(replicationManager); } @@ -119,8 +123,36 @@ public void testOpenUnhealthyContainerIsClosed() { assertTrue(openContainerHandler.handle(readRequest)); verify(replicationManager, times(1)) .sendCloseContainerEvent(containerInfo.containerID()); + assertEquals(1, request.getReport().getStat(HealthState.OPEN_UNHEALTHY)); } + @Test + public void testOpenContainerWithoutPipelineIsClosed() { + Mockito.when(replicationManager.hasHealthyPipeline(any())).thenReturn(false); + ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo( + ecReplicationConfig, 1, OPEN); + Set containerReplicas = ReplicationTestUtil + .createReplicas(containerInfo.containerID(), + ContainerReplicaProto.State.OPEN, 1, 2, 3, 4); + ContainerCheckRequest request = new ContainerCheckRequest.Builder() + .setPendingOps(Collections.emptyList()) + .setReport(new ReplicationManagerReport()) + .setContainerInfo(containerInfo) + .setContainerReplicas(containerReplicas) + .build(); + ContainerCheckRequest readRequest = new ContainerCheckRequest.Builder() + .setPendingOps(Collections.emptyList()) + .setReport(new ReplicationManagerReport()) + .setContainerInfo(containerInfo) + .setContainerReplicas(containerReplicas) + .setReadOnly(true) + .build(); + assertTrue(openContainerHandler.handle(request)); + assertTrue(openContainerHandler.handle(readRequest)); + verify(replicationManager, times(1)) + .sendCloseContainerEvent(containerInfo.containerID()); + assertEquals(1, request.getReport().getStat(HealthState.OPEN_WITHOUT_PIPELINE)); + } @Test public void testClosedRatisContainerReturnsFalse() { ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo( @@ -178,5 +210,33 @@ public void testOpenUnhealthyRatisContainerIsClosed() { assertTrue(openContainerHandler.handle(request)); assertTrue(openContainerHandler.handle(readRequest)); verify(replicationManager, times(1)).sendCloseContainerEvent(any()); + assertEquals(1, request.getReport().getStat(HealthState.OPEN_UNHEALTHY)); + } + + @Test + public void testOpenRatisContainerWithoutPipelineIsClosed() { + Mockito.when(replicationManager.hasHealthyPipeline(any())).thenReturn(false); + ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo( + ratisReplicationConfig, 1, OPEN); + Set containerReplicas = ReplicationTestUtil + .createReplicas(containerInfo.containerID(), + ContainerReplicaProto.State.OPEN, 0, 0, 0); + ContainerCheckRequest request = new ContainerCheckRequest.Builder() + .setPendingOps(Collections.emptyList()) + .setReport(new ReplicationManagerReport()) + .setContainerInfo(containerInfo) + .setContainerReplicas(containerReplicas) + .build(); + ContainerCheckRequest readRequest = new ContainerCheckRequest.Builder() + .setPendingOps(Collections.emptyList()) + .setReport(new ReplicationManagerReport()) + .setContainerInfo(containerInfo) + .setContainerReplicas(containerReplicas) + .setReadOnly(true) + .build(); + assertTrue(openContainerHandler.handle(request)); + assertTrue(openContainerHandler.handle(readRequest)); + verify(replicationManager, times(1)).sendCloseContainerEvent(any()); + assertEquals(1, request.getReport().getStat(HealthState.OPEN_WITHOUT_PIPELINE)); } }