From 02c37e1c70f62856038e8c87433ee4adf0eec787 Mon Sep 17 00:00:00 2001 From: djordje Date: Mon, 16 Jan 2023 15:18:53 +0100 Subject: [PATCH 1/8] HDDS-7210 Enable HealthState calculation when LifeCycleState is CLOSING or QUASI_CLOSED --- .../scm/container/replication/LegacyReplicationManager.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java index 80a61ee5db31..3dfe078ca6ce 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java @@ -434,7 +434,6 @@ protected void processContainer(ContainerInfo container, container, replica.getDatanodeDetails(), false); } } - return; } /* @@ -444,7 +443,6 @@ protected void processContainer(ContainerInfo container, if (state == LifeCycleState.QUASI_CLOSED) { if (canForceCloseContainer(container, replicas)) { forceCloseContainer(container, replicas); - return; } else { report.incrementAndSample(HealthState.QUASI_CLOSED_STUCK, container.containerID()); From fe3d513c3f554cf56e7c7d76a776309663d32762 Mon Sep 17 00:00:00 2001 From: djordje Date: Tue, 17 Jan 2023 14:38:52 +0100 Subject: [PATCH 2/8] HDDS-7210 Recalculate MISSING HealthState when LifeCycleState is CLOSING --- .../replication/LegacyReplicationManager.java | 15 +++++++++++++++ .../replication/TestLegacyReplicationManager.java | 7 ++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java index 3dfe078ca6ce..422227c80e3e 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java @@ -428,12 +428,16 @@ protected void processContainer(ContainerInfo container, * we have to resend close container command to the datanodes. */ if (state == LifeCycleState.CLOSING) { + if (isClosingContainerMissing(replicas)) { + report.incrementAndSample(HealthState.MISSING, container.containerID()); + } for (ContainerReplica replica: replicas) { if (replica.getState() != State.UNHEALTHY) { sendCloseCommand( container, replica.getDatanodeDetails(), false); } } + return; } /* @@ -443,6 +447,7 @@ protected void processContainer(ContainerInfo container, if (state == LifeCycleState.QUASI_CLOSED) { if (canForceCloseContainer(container, replicas)) { forceCloseContainer(container, replicas); + return; } else { report.incrementAndSample(HealthState.QUASI_CLOSED_STUCK, container.containerID()); @@ -1610,6 +1615,16 @@ private boolean isOpenContainerHealthy( .allMatch(r -> compareState(state, r.getState())); } + /** + * A closing container is missing if there is no open replicas + * @param replicas The replicas belonging to the container + * @return True if the container is healthy, false otherwise + */ + private boolean isClosingContainerMissing(Set replicas) { + return !replicas.stream() + .anyMatch(r -> compareState(LifeCycleState.OPEN, r.getState())); + } + public boolean isContainerReplicatingOrDeleting(ContainerID containerID) { return inflightReplication.containsKey(containerID) || inflightDeletion.containsKey(containerID); diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java index 8c46db43ac3d..61d7cc16ee81 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java @@ -383,6 +383,10 @@ public void testClosingContainer() throws IOException, TimeoutException { datanodeCommandHandler.getInvocationCount( SCMCommandProto.Type.closeContainerCommand)); + ReplicationManagerReport report = replicationManager.getContainerReport(); + Assertions.assertEquals(1, report.getStat(LifeCycleState.CLOSING)); + Assertions.assertEquals(0, report.getStat(ReplicationManagerReport.HealthState.MISSING)); + // Update the OPEN to CLOSING for (ContainerReplica replica: getReplicas(id, State.CLOSING, datanode)) { containerStateManager.updateContainerReplica(id, replica); @@ -393,8 +397,9 @@ public void testClosingContainer() throws IOException, TimeoutException { Assertions.assertEquals(currentCloseCommandCount + 6, datanodeCommandHandler.getInvocationCount( SCMCommandProto.Type.closeContainerCommand)); - ReplicationManagerReport report = replicationManager.getContainerReport(); + report = replicationManager.getContainerReport(); Assertions.assertEquals(1, report.getStat(LifeCycleState.CLOSING)); + Assertions.assertEquals(1, report.getStat(ReplicationManagerReport.HealthState.MISSING)); } @Test From bab561c0b63b66acf091a2c9936d29ba7bcf9869 Mon Sep 17 00:00:00 2001 From: djordje Date: Wed, 18 Jan 2023 10:44:45 +0100 Subject: [PATCH 3/8] HDDS-7210 Fix StyleCheck problems and add unit test --- .../replication/LegacyReplicationManager.java | 26 +++++++-- .../TestLegacyReplicationManager.java | 57 +++++++++++++++++-- 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java index 422227c80e3e..b915aeb87223 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java @@ -429,7 +429,8 @@ protected void processContainer(ContainerInfo container, */ if (state == LifeCycleState.CLOSING) { if (isClosingContainerMissing(replicas)) { - report.incrementAndSample(HealthState.MISSING, container.containerID()); + report.incrementAndSample(HealthState.MISSING, + container.containerID()); } for (ContainerReplica replica: replicas) { if (replica.getState() != State.UNHEALTHY) { @@ -447,6 +448,10 @@ protected void processContainer(ContainerInfo container, if (state == LifeCycleState.QUASI_CLOSED) { if (canForceCloseContainer(container, replicas)) { forceCloseContainer(container, replicas); + if (isQuasyCloseContainerMissing(replicas)) { + report.incrementAndSample(HealthState.MISSING, + container.containerID()); + } return; } else { report.incrementAndSample(HealthState.QUASI_CLOSED_STUCK, @@ -1616,13 +1621,24 @@ private boolean isOpenContainerHealthy( } /** - * A closing container is missing if there is no open replicas + * A closing container is missing if there is no replicas. * @param replicas The replicas belonging to the container - * @return True if the container is healthy, false otherwise + * @return True if the container is missing, false otherwise */ private boolean isClosingContainerMissing(Set replicas) { - return !replicas.stream() - .anyMatch(r -> compareState(LifeCycleState.OPEN, r.getState())); + return replicas.size() == 0; + } + + /** + * A quasy close container is missing if there is no replicas or + * all replicas are in quasy close state. + * @param replicas The replicas belonging to the container + * @return True if the container is missing, false otherwise + */ + private boolean isQuasyCloseContainerMissing(Set replicas) { + return replicas.size() == 0 || replicas.stream() + .allMatch(r -> + compareState(LifeCycleState.QUASI_CLOSED, r.getState())); } public boolean isContainerReplicatingOrDeleting(ContainerID containerID) { diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java index 61d7cc16ee81..aa07c2e77257 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java @@ -383,10 +383,6 @@ public void testClosingContainer() throws IOException, TimeoutException { datanodeCommandHandler.getInvocationCount( SCMCommandProto.Type.closeContainerCommand)); - ReplicationManagerReport report = replicationManager.getContainerReport(); - Assertions.assertEquals(1, report.getStat(LifeCycleState.CLOSING)); - Assertions.assertEquals(0, report.getStat(ReplicationManagerReport.HealthState.MISSING)); - // Update the OPEN to CLOSING for (ContainerReplica replica: getReplicas(id, State.CLOSING, datanode)) { containerStateManager.updateContainerReplica(id, replica); @@ -397,9 +393,60 @@ public void testClosingContainer() throws IOException, TimeoutException { Assertions.assertEquals(currentCloseCommandCount + 6, datanodeCommandHandler.getInvocationCount( SCMCommandProto.Type.closeContainerCommand)); + ReplicationManagerReport report = replicationManager.getContainerReport(); + Assertions.assertEquals(1, report.getStat(LifeCycleState.CLOSING)); + } + + /** + * Create closing container with 1 replica. + * Expectation: Missing containers 0. + * Remove the only replica. + * Expectation: Missing containers 1. + */ + @Test + public void testClosingLastContainer() + throws IOException, TimeoutException { + final ContainerInfo container = getContainer(LifeCycleState.CLOSING); + final ContainerID id = container.containerID(); + + containerStateManager.addContainer(container.getProtobuf()); + + // One replica in OPEN state + final Set replicas = getReplicas(id, State.OPEN, + randomDatanodeDetails()); + + for (ContainerReplica replica : replicas) { + containerStateManager.updateContainerReplica(id, replica); + } + + final int currentCloseCommandCount = datanodeCommandHandler + .getInvocationCount(SCMCommandProto.Type.closeContainerCommand); + + replicationManager.processAll(); + eventQueue.processAll(1000); + Assertions.assertEquals(currentCloseCommandCount + 1, + datanodeCommandHandler.getInvocationCount( + SCMCommandProto.Type.closeContainerCommand)); + + ReplicationManagerReport report = replicationManager.getContainerReport(); + Assertions.assertEquals(1, report.getStat(LifeCycleState.CLOSING)); + Assertions.assertEquals(0, + report.getStat(ReplicationManagerReport.HealthState.MISSING)); + + for (ContainerReplica replica : replicas) { + containerStateManager.removeContainerReplica(id, replica); + } + + replicationManager.processAll(); + eventQueue.processAll(1000); + Assertions.assertEquals(currentCloseCommandCount + 1, + datanodeCommandHandler.getInvocationCount( + SCMCommandProto.Type.closeContainerCommand)); + report = replicationManager.getContainerReport(); Assertions.assertEquals(1, report.getStat(LifeCycleState.CLOSING)); - Assertions.assertEquals(1, report.getStat(ReplicationManagerReport.HealthState.MISSING)); + Assertions.assertEquals(1, + report.getStat(ReplicationManagerReport.HealthState.MISSING)); } @Test From a66748d4bb4be01290ffc117ca5ae1aa88dc1a99 Mon Sep 17 00:00:00 2001 From: djordje Date: Thu, 19 Jan 2023 18:09:04 +0100 Subject: [PATCH 4/8] HDDS-7210 Execute the same code as for CLOSED container --- .../replication/LegacyReplicationManager.java | 99 +++++++++---------- .../TestLegacyReplicationManager.java | 14 ++- 2 files changed, 57 insertions(+), 56 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java index b915aeb87223..ad1c919008f6 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java @@ -422,16 +422,38 @@ protected void processContainer(ContainerInfo container, return; } + /* + * Before processing the container we have to reconcile the + * inflightReplication and inflightDeletion actions. + * + * We remove the entry from inflightReplication and inflightDeletion + * list, if the operation is completed or if it has timed out. + */ + updateInflightAction(container, inflightReplication, + action -> replicas.stream().anyMatch( + r -> r.getDatanodeDetails().equals(action.getDatanode())), + () -> metrics.incrNumReplicationCmdsTimeout(), + action -> updateCompletedReplicationMetrics(container, action)); + + updateInflightAction(container, inflightDeletion, + action -> replicas.stream().noneMatch( + r -> r.getDatanodeDetails().equals(action.getDatanode())), + () -> metrics.incrNumDeletionCmdsTimeout(), + action -> updateCompletedDeletionMetrics(container, action)); + + RatisContainerReplicaCount replicaSet = + getContainerReplicaCount(container, replicas); + ContainerPlacementStatus placementStatus = getPlacementStatus( + replicas, container.getReplicationConfig().getRequiredNodes()); + /* * If the container is in CLOSING state, the replicas can either * be in OPEN or in CLOSING state. In both of this cases * we have to resend close container command to the datanodes. */ if (state == LifeCycleState.CLOSING) { - if (isClosingContainerMissing(replicas)) { - report.incrementAndSample(HealthState.MISSING, - container.containerID()); - } + setHealthStateForClosing(replicaSet, placementStatus, container, + report); for (ContainerReplica replica: replicas) { if (replica.getState() != State.UNHEALTHY) { sendCloseCommand( @@ -448,10 +470,6 @@ protected void processContainer(ContainerInfo container, if (state == LifeCycleState.QUASI_CLOSED) { if (canForceCloseContainer(container, replicas)) { forceCloseContainer(container, replicas); - if (isQuasyCloseContainerMissing(replicas)) { - report.incrementAndSample(HealthState.MISSING, - container.containerID()); - } return; } else { report.incrementAndSample(HealthState.QUASI_CLOSED_STUCK, @@ -466,25 +484,6 @@ protected void processContainer(ContainerInfo container, return; } - /* - * Before processing the container we have to reconcile the - * inflightReplication and inflightDeletion actions. - * - * We remove the entry from inflightReplication and inflightDeletion - * list, if the operation is completed or if it has timed out. - */ - updateInflightAction(container, inflightReplication, - action -> replicas.stream().anyMatch( - r -> r.getDatanodeDetails().equals(action.getDatanode())), - () -> metrics.incrNumReplicationCmdsTimeout(), - action -> updateCompletedReplicationMetrics(container, action)); - - updateInflightAction(container, inflightDeletion, - action -> replicas.stream().noneMatch( - r -> r.getDatanodeDetails().equals(action.getDatanode())), - () -> metrics.incrNumDeletionCmdsTimeout(), - action -> updateCompletedDeletionMetrics(container, action)); - /* * If container is under deleting and all it's replicas are deleted, * then make the container as CLEANED, @@ -503,11 +502,6 @@ protected void processContainer(ContainerInfo container, return; } - RatisContainerReplicaCount replicaSet = - getContainerReplicaCount(container, replicas); - ContainerPlacementStatus placementStatus = getPlacementStatus( - replicas, container.getReplicationConfig().getRequiredNodes()); - /* * We don't have to take any action if the container is healthy. * @@ -1620,25 +1614,28 @@ private boolean isOpenContainerHealthy( .allMatch(r -> compareState(state, r.getState())); } - /** - * A closing container is missing if there is no replicas. - * @param replicas The replicas belonging to the container - * @return True if the container is missing, false otherwise - */ - private boolean isClosingContainerMissing(Set replicas) { - return replicas.size() == 0; - } - - /** - * A quasy close container is missing if there is no replicas or - * all replicas are in quasy close state. - * @param replicas The replicas belonging to the container - * @return True if the container is missing, false otherwise - */ - private boolean isQuasyCloseContainerMissing(Set replicas) { - return replicas.size() == 0 || replicas.stream() - .allMatch(r -> - compareState(LifeCycleState.QUASI_CLOSED, r.getState())); + private void setHealthStateForClosing(RatisContainerReplicaCount replicaSet, + ContainerPlacementStatus placementStatus, + ContainerInfo container, + ReplicationManagerReport report) { + boolean sufficientlyReplicated = replicaSet.isSufficientlyReplicated(); + boolean placementSatisfied = placementStatus.isPolicySatisfied(); + ContainerID containerID = container.containerID(); + if (!placementStatus.isPolicySatisfied()) { + report.incrementAndSample(HealthState.MIS_REPLICATED, containerID); + } + if (!replicaSet.isHealthy()) { + report.incrementAndSample(HealthState.UNHEALTHY, containerID); + } + if (!sufficientlyReplicated || !placementSatisfied) { + if (!inflightReplication.isFull() || !inflightDeletion.isFull()) { + if (replicaSet.isUnrecoverable()) { + report.incrementAndSample(HealthState.MISSING, containerID); + report.incrementAndSample(HealthState.UNDER_REPLICATED, + containerID); + } + } + } } public boolean isContainerReplicatingOrDeleting(ContainerID containerID) { diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java index aa07c2e77257..f8b536aa4c89 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java @@ -404,7 +404,7 @@ public void testClosingContainer() throws IOException, TimeoutException { * Expectation: Missing containers 1. */ @Test - public void testClosingLastContainer() + public void testClosingMissingContainer() throws IOException, TimeoutException { final ContainerInfo container = getContainer(LifeCycleState.CLOSING); final ContainerID id = container.containerID(); @@ -430,8 +430,8 @@ public void testClosingLastContainer() ReplicationManagerReport report = replicationManager.getContainerReport(); Assertions.assertEquals(1, report.getStat(LifeCycleState.CLOSING)); - Assertions.assertEquals(0, - report.getStat(ReplicationManagerReport.HealthState.MISSING)); + Assertions.assertEquals(0, report.getStat( + ReplicationManagerReport.HealthState.MISSING)); for (ContainerReplica replica : replicas) { containerStateManager.removeContainerReplica(id, replica); @@ -445,8 +445,12 @@ public void testClosingLastContainer() report = replicationManager.getContainerReport(); Assertions.assertEquals(1, report.getStat(LifeCycleState.CLOSING)); - Assertions.assertEquals(1, - report.getStat(ReplicationManagerReport.HealthState.MISSING)); + Assertions.assertEquals(1, report.getStat( + ReplicationManagerReport.HealthState.MISSING)); + Assertions.assertEquals(1, report.getStat( + ReplicationManagerReport.HealthState.UNDER_REPLICATED)); + Assertions.assertEquals(1, report.getStat( + ReplicationManagerReport.HealthState.UNHEALTHY)); } @Test From 5103c802310bbac0f72f27297064f9d2cb168edd Mon Sep 17 00:00:00 2001 From: djordje Date: Thu, 19 Jan 2023 19:17:45 +0100 Subject: [PATCH 5/8] HDDS-7210 Fix StyleCheck problems --- .../container/replication/LegacyReplicationManager.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java index ad1c919008f6..db428c29a579 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java @@ -430,14 +430,14 @@ protected void processContainer(ContainerInfo container, * list, if the operation is completed or if it has timed out. */ updateInflightAction(container, inflightReplication, - action -> replicas.stream().anyMatch( - r -> r.getDatanodeDetails().equals(action.getDatanode())), + action -> replicas.stream().anyMatch(r -> + r.getDatanodeDetails().equals(action.getDatanode())), () -> metrics.incrNumReplicationCmdsTimeout(), action -> updateCompletedReplicationMetrics(container, action)); updateInflightAction(container, inflightDeletion, - action -> replicas.stream().noneMatch( - r -> r.getDatanodeDetails().equals(action.getDatanode())), + action -> replicas.stream().noneMatch(r -> + r.getDatanodeDetails().equals(action.getDatanode())), () -> metrics.incrNumDeletionCmdsTimeout(), action -> updateCompletedDeletionMetrics(container, action)); From a47c397e06adfad39be5cfc6430262ee3efa8a96 Mon Sep 17 00:00:00 2001 From: djordje Date: Tue, 24 Jan 2023 16:48:30 +0100 Subject: [PATCH 6/8] HDDS-7210 Write safe setHealthStateForClosing function --- .../replication/LegacyReplicationManager.java | 83 ++++++++----------- .../TestLegacyReplicationManager.java | 2 +- 2 files changed, 37 insertions(+), 48 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java index db428c29a579..d3c156682302 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java @@ -422,38 +422,13 @@ protected void processContainer(ContainerInfo container, return; } - /* - * Before processing the container we have to reconcile the - * inflightReplication and inflightDeletion actions. - * - * We remove the entry from inflightReplication and inflightDeletion - * list, if the operation is completed or if it has timed out. - */ - updateInflightAction(container, inflightReplication, - action -> replicas.stream().anyMatch(r -> - r.getDatanodeDetails().equals(action.getDatanode())), - () -> metrics.incrNumReplicationCmdsTimeout(), - action -> updateCompletedReplicationMetrics(container, action)); - - updateInflightAction(container, inflightDeletion, - action -> replicas.stream().noneMatch(r -> - r.getDatanodeDetails().equals(action.getDatanode())), - () -> metrics.incrNumDeletionCmdsTimeout(), - action -> updateCompletedDeletionMetrics(container, action)); - - RatisContainerReplicaCount replicaSet = - getContainerReplicaCount(container, replicas); - ContainerPlacementStatus placementStatus = getPlacementStatus( - replicas, container.getReplicationConfig().getRequiredNodes()); - /* * If the container is in CLOSING state, the replicas can either * be in OPEN or in CLOSING state. In both of this cases * we have to resend close container command to the datanodes. */ if (state == LifeCycleState.CLOSING) { - setHealthStateForClosing(replicaSet, placementStatus, container, - report); + setHealthStateForClosing(replicas, container, report); for (ContainerReplica replica: replicas) { if (replica.getState() != State.UNHEALTHY) { sendCloseCommand( @@ -484,6 +459,25 @@ protected void processContainer(ContainerInfo container, return; } + /* + * Before processing the container we have to reconcile the + * inflightReplication and inflightDeletion actions. + * + * We remove the entry from inflightReplication and inflightDeletion + * list, if the operation is completed or if it has timed out. + */ + updateInflightAction(container, inflightReplication, + action -> replicas.stream().anyMatch(r -> + r.getDatanodeDetails().equals(action.getDatanode())), + () -> metrics.incrNumReplicationCmdsTimeout(), + action -> updateCompletedReplicationMetrics(container, action)); + + updateInflightAction(container, inflightDeletion, + action -> replicas.stream().noneMatch(r -> + r.getDatanodeDetails().equals(action.getDatanode())), + () -> metrics.incrNumDeletionCmdsTimeout(), + action -> updateCompletedDeletionMetrics(container, action)); + /* * If container is under deleting and all it's replicas are deleted, * then make the container as CLEANED, @@ -502,6 +496,11 @@ protected void processContainer(ContainerInfo container, return; } + RatisContainerReplicaCount replicaSet = + getContainerReplicaCount(container, replicas); + ContainerPlacementStatus placementStatus = getPlacementStatus( + replicas, container.getReplicationConfig().getRequiredNodes()); + /* * We don't have to take any action if the container is healthy. * @@ -1614,27 +1613,17 @@ private boolean isOpenContainerHealthy( .allMatch(r -> compareState(state, r.getState())); } - private void setHealthStateForClosing(RatisContainerReplicaCount replicaSet, - ContainerPlacementStatus placementStatus, - ContainerInfo container, - ReplicationManagerReport report) { - boolean sufficientlyReplicated = replicaSet.isSufficientlyReplicated(); - boolean placementSatisfied = placementStatus.isPolicySatisfied(); - ContainerID containerID = container.containerID(); - if (!placementStatus.isPolicySatisfied()) { - report.incrementAndSample(HealthState.MIS_REPLICATED, containerID); - } - if (!replicaSet.isHealthy()) { - report.incrementAndSample(HealthState.UNHEALTHY, containerID); - } - if (!sufficientlyReplicated || !placementSatisfied) { - if (!inflightReplication.isFull() || !inflightDeletion.isFull()) { - if (replicaSet.isUnrecoverable()) { - report.incrementAndSample(HealthState.MISSING, containerID); - report.incrementAndSample(HealthState.UNDER_REPLICATED, - containerID); - } - } + private void setHealthStateForClosing(Set replicas, + ContainerInfo container, + ReplicationManagerReport report) { + if (!replicas.stream(). + anyMatch(r -> compareState(LifeCycleState.OPEN, r.getState()) || + compareState(LifeCycleState.CLOSED, r.getState()))) { + report.incrementAndSample(HealthState.MISSING, container.containerID()); + report.incrementAndSample(HealthState.UNDER_REPLICATED, + container.containerID()); + report.incrementAndSample(HealthState.MIS_REPLICATED, + container.containerID()); } } diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java index f8b536aa4c89..27a33b53ea45 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestLegacyReplicationManager.java @@ -450,7 +450,7 @@ public void testClosingMissingContainer() Assertions.assertEquals(1, report.getStat( ReplicationManagerReport.HealthState.UNDER_REPLICATED)); Assertions.assertEquals(1, report.getStat( - ReplicationManagerReport.HealthState.UNHEALTHY)); + ReplicationManagerReport.HealthState.MIS_REPLICATED)); } @Test From 62defa5260f0ef33b29cb20432d86132bb80854c Mon Sep 17 00:00:00 2001 From: djordje Date: Tue, 24 Jan 2023 16:52:35 +0100 Subject: [PATCH 7/8] HDDS-7210 Update Style --- .../replication/LegacyReplicationManager.java | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java index d3c156682302..d0179e4a0130 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java @@ -467,16 +467,16 @@ protected void processContainer(ContainerInfo container, * list, if the operation is completed or if it has timed out. */ updateInflightAction(container, inflightReplication, - action -> replicas.stream().anyMatch(r -> - r.getDatanodeDetails().equals(action.getDatanode())), - () -> metrics.incrNumReplicationCmdsTimeout(), - action -> updateCompletedReplicationMetrics(container, action)); + action -> replicas.stream().anyMatch( + r -> r.getDatanodeDetails().equals(action.getDatanode())), + () -> metrics.incrNumReplicationCmdsTimeout(), + action -> updateCompletedReplicationMetrics(container, action)); updateInflightAction(container, inflightDeletion, - action -> replicas.stream().noneMatch(r -> - r.getDatanodeDetails().equals(action.getDatanode())), - () -> metrics.incrNumDeletionCmdsTimeout(), - action -> updateCompletedDeletionMetrics(container, action)); + action -> replicas.stream().noneMatch( + r -> r.getDatanodeDetails().equals(action.getDatanode())), + () -> metrics.incrNumDeletionCmdsTimeout(), + action -> updateCompletedDeletionMetrics(container, action)); /* * If container is under deleting and all it's replicas are deleted, @@ -497,9 +497,9 @@ protected void processContainer(ContainerInfo container, } RatisContainerReplicaCount replicaSet = - getContainerReplicaCount(container, replicas); + getContainerReplicaCount(container, replicas); ContainerPlacementStatus placementStatus = getPlacementStatus( - replicas, container.getReplicationConfig().getRequiredNodes()); + replicas, container.getReplicationConfig().getRequiredNodes()); /* * We don't have to take any action if the container is healthy. From a6c4450b613cb12c8dd172732951db9f28e67d19 Mon Sep 17 00:00:00 2001 From: djordje Date: Wed, 1 Feb 2023 19:43:50 +0100 Subject: [PATCH 8/8] HDDS-7210 Update condition --- .../scm/container/replication/LegacyReplicationManager.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java index d0179e4a0130..48deb0c53203 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/LegacyReplicationManager.java @@ -1616,9 +1616,7 @@ private boolean isOpenContainerHealthy( private void setHealthStateForClosing(Set replicas, ContainerInfo container, ReplicationManagerReport report) { - if (!replicas.stream(). - anyMatch(r -> compareState(LifeCycleState.OPEN, r.getState()) || - compareState(LifeCycleState.CLOSED, r.getState()))) { + if (replicas.size() == 0) { report.incrementAndSample(HealthState.MISSING, container.containerID()); report.incrementAndSample(HealthState.UNDER_REPLICATED, container.containerID());