diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/QuasiClosedContainerHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/QuasiClosedContainerHandler.java index 1240a57a1fe7..b3e44246a299 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/QuasiClosedContainerHandler.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/QuasiClosedContainerHandler.java @@ -99,17 +99,33 @@ private boolean canForceCloseContainer(final ContainerInfo container, final Set replicas) { final int replicationFactor = container.getReplicationConfig().getRequiredNodes(); - final long uniqueQuasiClosedReplicaCount = replicas.stream() - .filter(r -> r.getState() == State.QUASI_CLOSED) + + final long uniqueQuasiClosedOrUnhealthyReplicaCount = replicas.stream() + .filter(r -> r.getState() == State.QUASI_CLOSED || r.getState() == State.UNHEALTHY) .map(ContainerReplica::getOriginDatanodeId) .distinct() .count(); + + long maxQCSeq = -1; + long maxUnhealthySeq = -1; + for (ContainerReplica r : replicas) { + if (r.getState() == State.QUASI_CLOSED) { + maxQCSeq = Math.max(maxQCSeq, r.getSequenceId()); + } else if (r.getState() == State.UNHEALTHY) { + maxUnhealthySeq = Math.max(maxUnhealthySeq, r.getSequenceId()); + } + } + // We can only force close the container if we have seen all the replicas from unique origins. // Due to unexpected behavior when writing to ratis containers, it is possible for blocks to be committed // on the ratis leader, but not on the followers. A failure on the leader can result in two replicas // without the latest transactions, which are then force closed. This can result in data loss. // Note that if the 3rd replica is permanently lost, the container will be stuck in QUASI_CLOSED state forever. - return uniqueQuasiClosedReplicaCount >= replicationFactor; + // It is possible to CLOSE a container that has one QC and the remaining UNHEALTHY, provided the QC is one of the + // replicas with the highest sequence ID. If an UNHEALTHY replica has a higher sequence ID, the container will + // remain in QUASI_CLOSED state. + return maxQCSeq > -1 && maxQCSeq >= maxUnhealthySeq + && uniqueQuasiClosedOrUnhealthyReplicaCount >= replicationFactor; } /** diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestQuasiClosedContainerHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestQuasiClosedContainerHandler.java index ce161127d9d7..fab3d8a18f94 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestQuasiClosedContainerHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestQuasiClosedContainerHandler.java @@ -177,6 +177,90 @@ public void testQuasiClosedWithAllUniqueOriginSendsForceClose() { .sendCloseContainerReplicaCommand(any(), any(), anyBoolean()); } + /** + * When a container is QUASI_CLOSED with some unhealthy and all 3 are reported with unique + * origins, it should be forced closed. + */ + @Test + public void testQuasiClosedWithAllUniqueOriginAndUnhealthySendsForceClose() { + ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo( + ratisReplicationConfig, 1, QUASI_CLOSED); + // These 3 replicas will have the same BCSID and unique origin node ids + Set containerReplicas = ReplicationTestUtil + .createReplicas(containerInfo.containerID(), + State.QUASI_CLOSED, 0, 0); + containerReplicas.addAll(ReplicationTestUtil + .createReplicas(containerInfo.containerID(), + State.UNHEALTHY, 0)); + ContainerCheckRequest request = new ContainerCheckRequest.Builder() + .setPendingOps(Collections.emptyList()) + .setReport(new ReplicationManagerReport()) + .setContainerInfo(containerInfo) + .setContainerReplicas(containerReplicas) + .build(); + ContainerCheckRequest readRequest = new ContainerCheckRequest.Builder() + .setPendingOps(Collections.emptyList()) + .setReport(new ReplicationManagerReport()) + .setContainerInfo(containerInfo) + .setContainerReplicas(containerReplicas) + .setReadOnly(true) + .build(); + + assertFalse(quasiClosedContainerHandler.handle(request)); + assertFalse(quasiClosedContainerHandler.handle(readRequest)); + verify(replicationManager, times(2)) + .sendCloseContainerReplicaCommand(any(), any(), anyBoolean()); + } + + /** + * If it's possible to force close replicas then only replicas with the + * highest Sequence ID (also known as BCSID) should be closed. + */ + @Test + public void testQuasiClosedWithUnhealthyHavingHighestSeq() { + final ContainerInfo containerInfo = + getContainer(HddsProtos.LifeCycleState.QUASI_CLOSED); + containerInfo.setUsedBytes(99); + final ContainerID id = containerInfo.containerID(); + + // create replicas with unique origin DNs + DatanodeDetails dnOne = randomDatanodeDetails(); + DatanodeDetails dnTwo = randomDatanodeDetails(); + DatanodeDetails dnThree = randomDatanodeDetails(); + + // 1001 is the highest sequence id + final ContainerReplica replicaOne = getReplicas( + id, State.QUASI_CLOSED, 1000L, dnOne.getUuid(), dnOne); + final ContainerReplica replicaTwo = getReplicas( + id, State.QUASI_CLOSED, 1000L, dnTwo.getUuid(), dnTwo); + final ContainerReplica replicaThree = getReplicas( + id, State.UNHEALTHY, 1001L, dnThree.getUuid(), dnThree); + Set containerReplicas = new HashSet<>(); + containerReplicas.add(replicaOne); + containerReplicas.add(replicaTwo); + containerReplicas.add(replicaThree); + + ContainerCheckRequest request = new ContainerCheckRequest.Builder() + .setPendingOps(Collections.emptyList()) + .setReport(new ReplicationManagerReport()) + .setContainerInfo(containerInfo) + .setContainerReplicas(containerReplicas) + .build(); + ContainerCheckRequest readRequest = new ContainerCheckRequest.Builder() + .setPendingOps(Collections.emptyList()) + .setReport(new ReplicationManagerReport()) + .setContainerInfo(containerInfo) + .setContainerReplicas(containerReplicas) + .setReadOnly(true) + .build(); + + assertFalse(quasiClosedContainerHandler.handle(request)); + assertFalse(quasiClosedContainerHandler.handle(readRequest)); + // verify no close commands are sent as the container cannot be closed. + verify(replicationManager, times(0)) + .sendCloseContainerReplicaCommand(eq(containerInfo), any(), anyBoolean()); + } + /** * The replicas are QUASI_CLOSED, but all of them have the same origin node * id. Since all replicas must have unique origin node ids, the handler diff --git a/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json b/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json index 3b7bfbc52b4c..c9e54ded4497 100644 --- a/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json +++ b/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json @@ -58,5 +58,58 @@ { "type": "closeContainerCommand", "datanode": "d2" } ], "commands": [] + }, + { "description": "Quasi-Closed with 2 replicas and unhealthy", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 10, + "replicas": [ + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d1", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d2", "sequenceId": 10, "isEmpty": false, "origin": "o2"}, + { "state": "UNHEALTHY", "index": 0, "datanode": "d3", "sequenceId": 10, "isEmpty": false, "origin": "o3"} + ], + "expectation": { "underReplicated": 1, "underReplicatedQueue": 1, "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 0, "unhealthy": 0 }, + "checkCommands": [ + { "type": "closeContainerCommand", "datanode": "d1" }, + { "type": "closeContainerCommand", "datanode": "d2" } + ], + "commands": [ + { "type": "replicateContainerCommand" } + ] + }, + { "description": "Quasi-Closed with 1 replica and two unhealthy", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 10, + "replicas": [ + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d1", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "UNHEALTHY", "index": 0, "datanode": "d2", "sequenceId": 10, "isEmpty": false, "origin": "o2"}, + { "state": "UNHEALTHY", "index": 0, "datanode": "d3", "sequenceId": 10, "isEmpty": false, "origin": "o3"} + ], + "expectation": { "underReplicated": 1, "underReplicatedQueue": 1, "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 0, "unhealthy": 0 }, + "checkCommands": [ + { "type": "closeContainerCommand", "datanode": "d1" } + ], + "commands": [ + { "type": "replicateContainerCommand", "datanode": "d1" }, + { "type": "replicateContainerCommand", "datanode": "d1" } + ] + }, + { "description": "Quasi-Closed with 2 replicas and unhealthy where unhealthy is highest BCSID", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 11, + "replicas": [ + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d1", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d2", "sequenceId": 10, "isEmpty": false, "origin": "o2"}, + { "state": "UNHEALTHY", "index": 0, "datanode": "d3", "sequenceId": 11, "isEmpty": false, "origin": "o3"} + ], + "expectation": { "underReplicated": 1, "underReplicatedQueue": 1, "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 1, "unhealthy": 0 }, + "checkCommands": [], + "commands": [ + { "type": "replicateContainerCommand" } + ] + }, + { "description": "Quasi-Closed with 3 QC and one unhealthy", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 11, + "replicas": [ + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d1", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d2", "sequenceId": 10, "isEmpty": false, "origin": "o2"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d3", "sequenceId": 10, "isEmpty": false, "origin": "o2"}, + { "state": "UNHEALTHY", "index": 0, "datanode": "d4", "sequenceId": 11, "isEmpty": false, "origin": "o3"} + ], + "expectation": { "underReplicated": 0, "underReplicatedQueue": 0, "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 1, "unhealthy": 0 }, + "checkCommands": [], + "commands": [] } ] \ No newline at end of file