diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/QuasiClosedContainerHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/QuasiClosedContainerHandler.java index 94a93af1074b..1240a57a1fe7 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/QuasiClosedContainerHandler.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/QuasiClosedContainerHandler.java @@ -104,7 +104,12 @@ private boolean canForceCloseContainer(final ContainerInfo container, .map(ContainerReplica::getOriginDatanodeId) .distinct() .count(); - return uniqueQuasiClosedReplicaCount > (replicationFactor / 2); + // We can only force close the container if we have seen all the replicas from unique origins. + // Due to unexpected behavior when writing to ratis containers, it is possible for blocks to be committed + // on the ratis leader, but not on the followers. A failure on the leader can result in two replicas + // without the latest transactions, which are then force closed. This can result in data loss. + // Note that if the 3rd replica is permanently lost, the container will be stuck in QUASI_CLOSED state forever. + return uniqueQuasiClosedReplicaCount >= replicationFactor; } /** diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestQuasiClosedContainerHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestQuasiClosedContainerHandler.java index e57879a90467..ce161127d9d7 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestQuasiClosedContainerHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestQuasiClosedContainerHandler.java @@ -108,13 +108,12 @@ public void testOpenContainerReturnsFalse() { } /** - * When a container is QUASI_CLOSED, and it has greater than 50% of its + * When a container is QUASI_CLOSED, and it has only 2 * replicas in QUASI_CLOSED state with unique origin node id, - * the handler should send force close commands to the replica(s) with - * highest BCSID. + * the handler should not force close it as all 3 unique replicas are needed. */ @Test - public void testQuasiClosedWithQuorumReturnsTrue() { + public void testQuasiClosedWithQuorumReturnsFalse() { ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo( ratisReplicationConfig, 1, QUASI_CLOSED); Set containerReplicas = ReplicationTestUtil @@ -140,14 +139,48 @@ public void testQuasiClosedWithQuorumReturnsTrue() { assertFalse(quasiClosedContainerHandler.handle(request)); assertFalse(quasiClosedContainerHandler.handle(readRequest)); - verify(replicationManager, times(2)) + verify(replicationManager, times(0)) + .sendCloseContainerReplicaCommand(any(), any(), anyBoolean()); + assertEquals(1, request.getReport().getStat( + ReplicationManagerReport.HealthState.QUASI_CLOSED_STUCK)); + } + + /** + * When a container is QUASI_CLOSED, and all 3 replicas are reported with unique + * origins, it should be forced closed. + */ + @Test + public void testQuasiClosedWithAllUniqueOriginSendsForceClose() { + ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo( + ratisReplicationConfig, 1, QUASI_CLOSED); + // These 3 replicas will have the same BCSID and unique origin node ids + Set containerReplicas = ReplicationTestUtil + .createReplicas(containerInfo.containerID(), + State.QUASI_CLOSED, 0, 0, 0); + ContainerCheckRequest request = new ContainerCheckRequest.Builder() + .setPendingOps(Collections.emptyList()) + .setReport(new ReplicationManagerReport()) + .setContainerInfo(containerInfo) + .setContainerReplicas(containerReplicas) + .build(); + ContainerCheckRequest readRequest = new ContainerCheckRequest.Builder() + .setPendingOps(Collections.emptyList()) + .setReport(new ReplicationManagerReport()) + .setContainerInfo(containerInfo) + .setContainerReplicas(containerReplicas) + .setReadOnly(true) + .build(); + + assertFalse(quasiClosedContainerHandler.handle(request)); + assertFalse(quasiClosedContainerHandler.handle(readRequest)); + verify(replicationManager, times(3)) .sendCloseContainerReplicaCommand(any(), any(), anyBoolean()); } /** * The replicas are QUASI_CLOSED, but all of them have the same origin node - * id. Since a quorum (greater than 50% of replicas with unique origin node - * ids in QUASI_CLOSED state) is not formed, the handler should return false. + * id. Since all replicas must have unique origin node ids, the handler + * should not force close it. */ @Test public void testHealthyQuasiClosedContainerReturnsFalse() { @@ -172,8 +205,7 @@ public void testHealthyQuasiClosedContainerReturnsFalse() { /** * Only one replica is in QUASI_CLOSED state. This fails the condition of - * having greater than 50% of replicas with unique origin nodes in - * QUASI_CLOSED state. The handler should return false. + * having all replicas with unique origin nodes in QUASI_CLOSED state. */ @Test public void testQuasiClosedWithTwoOpenReplicasReturnsFalse() { diff --git a/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json b/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json new file mode 100644 index 000000000000..3b7bfbc52b4c --- /dev/null +++ b/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json @@ -0,0 +1,62 @@ +[ + { "description": "Quasi-closed replicas with one open", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 12, + "replicas": [ + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d1", "sequenceId": 12, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d2", "sequenceId": 12, "isEmpty": false, "origin": "o2"}, + { "state": "OPEN", "index": 0, "datanode": "d3", "sequenceId": 12, "isEmpty": false, "origin": "o3"} + ], + "expectation": { "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 1}, + "checkCommands": [ + { "type": "closeContainerCommand", "datanode": "d3" } + ], + "commands": [] + }, + { "description": "Quasi-closed with 2 replicas", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 12, + "replicas": [ + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d1", "sequenceId": 12, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d2", "sequenceId": 12, "isEmpty": false, "origin": "o2"} + ], + "expectation": { "underReplicated": 1, "underReplicatedQueue": 1, "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 1}, + "checkCommands": [], + "commands": [ + { "type": "replicateContainerCommand" } + ] + }, + { "description": "Quasi-closed with 3 replicas 2 origins", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 12, + "replicas": [ + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d1", "sequenceId": 12, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d2", "sequenceId": 12, "isEmpty": false, "origin": "o2"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d3", "sequenceId": 12, "isEmpty": false, "origin": "o2"} + ], + "expectation": { "underReplicated": 0, "underReplicatedQueue": 0, "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 1}, + "checkCommands": [], + "commands": [] + }, + { "description": "Quasi-closed with 3 replicas 3 origins", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 12, + "replicas": [ + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d1", "sequenceId": 12, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d2", "sequenceId": 12, "isEmpty": false, "origin": "o2"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d3", "sequenceId": 12, "isEmpty": false, "origin": "o3"} + ], + "expectation": { "underReplicated": 0, "underReplicatedQueue": 0, "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 0 }, + "checkCommands": [ + { "type": "closeContainerCommand", "datanode": "d1" }, + { "type": "closeContainerCommand", "datanode": "d2" }, + { "type": "closeContainerCommand", "datanode": "d3" } + ], + "commands": [] + }, + { "description": "Quasi-closed with 3 replicas 3 origins different BCSID", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 12, + "replicas": [ + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d1", "sequenceId": 12, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d2", "sequenceId": 12, "isEmpty": false, "origin": "o2"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d3", "sequenceId": 11, "isEmpty": false, "origin": "o3"} + ], + "expectation": { "underReplicated": 0, "underReplicatedQueue": 0, "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 0 }, + "checkCommands": [ + { "type": "closeContainerCommand", "datanode": "d1" }, + { "type": "closeContainerCommand", "datanode": "d2" } + ], + "commands": [] + } +] \ No newline at end of file