From a35318f9b17b5715b300a8cf19bfffe2e317fb46 Mon Sep 17 00:00:00 2001 From: baoloongmao Date: Tue, 15 Sep 2020 10:21:01 +0800 Subject: [PATCH 1/8] HDDS-4244. Container deleted wrong replica cause mis-replicated. --- .../apache/hadoop/hdds/scm/container/ReplicationManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java index 3a9ad1bc8acb..8269cb1d8404 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java @@ -661,7 +661,7 @@ private void handleOverReplicatedContainer(final ContainerInfo container, eligibleReplicas.removeAll(unhealthyReplicas); Set replicaSet = new HashSet<>(eligibleReplicas); boolean misReplicated = - getPlacementStatus(replicaSet, replicationFactor) + !getPlacementStatus(replicaSet, replicationFactor) .isPolicySatisfied(); for (ContainerReplica r : eligibleReplicas) { if (excess <= 0) { From 2c560209277098e849aea6bcbeb83897e543bef3 Mon Sep 17 00:00:00 2001 From: baoloongmao Date: Tue, 15 Sep 2020 11:23:20 +0800 Subject: [PATCH 2/8] HDDS-4244. Container deleted wrong replica cause mis-replicated. --- .../hdds/scm/container/ReplicationManager.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java index 8269cb1d8404..0e1898c811ed 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java @@ -660,8 +660,8 @@ private void handleOverReplicatedContainer(final ContainerInfo container, if (excess > 0) { eligibleReplicas.removeAll(unhealthyReplicas); Set replicaSet = new HashSet<>(eligibleReplicas); - boolean misReplicated = - !getPlacementStatus(replicaSet, replicationFactor) + boolean satisfied = + getPlacementStatus(replicaSet, replicationFactor) .isPolicySatisfied(); for (ContainerReplica r : eligibleReplicas) { if (excess <= 0) { @@ -670,11 +670,11 @@ private void handleOverReplicatedContainer(final ContainerInfo container, // First remove the replica we are working on from the set, and then // check if the set is now mis-replicated. replicaSet.remove(r); - boolean nowMisRep = getPlacementStatus(replicaSet, replicationFactor) + boolean nowSatisfied = getPlacementStatus(replicaSet, replicationFactor) .isPolicySatisfied(); - if (misReplicated || !nowMisRep) { - // Remove the replica if the container was already mis-replicated - // OR if losing this replica does not make it become mis-replicated + if (!satisfied || nowSatisfied) { + // Remove the replica if the container was already unsatisfied + // OR if losing this replica still keep satisfied sendDeleteCommand(container, r.getDatanodeDetails(), true); excess -= 1; continue; From 5758e15e0af8c5a83ae81f54e5982d629c722e69 Mon Sep 17 00:00:00 2001 From: baoloongmao Date: Tue, 15 Sep 2020 11:41:06 +0800 Subject: [PATCH 3/8] Fix style --- .../apache/hadoop/hdds/scm/container/ReplicationManager.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java index 0e1898c811ed..ba2703f3bf33 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java @@ -670,8 +670,9 @@ private void handleOverReplicatedContainer(final ContainerInfo container, // First remove the replica we are working on from the set, and then // check if the set is now mis-replicated. replicaSet.remove(r); - boolean nowSatisfied = getPlacementStatus(replicaSet, replicationFactor) - .isPolicySatisfied(); + boolean nowSatisfied = + getPlacementStatus(replicaSet, replicationFactor) + .isPolicySatisfied(); if (!satisfied || nowSatisfied) { // Remove the replica if the container was already unsatisfied // OR if losing this replica still keep satisfied From 38233103f81973ce112bba7c03369f567293d1ba Mon Sep 17 00:00:00 2001 From: baoloongmao Date: Tue, 15 Sep 2020 17:25:17 +0800 Subject: [PATCH 4/8] add a test case --- .../scm/container/ReplicationManager.java | 15 ++++---- .../scm/container/TestReplicationManager.java | 38 +++++++++++++++++++ 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java index ba2703f3bf33..6b1e2afc52bc 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java @@ -660,9 +660,8 @@ private void handleOverReplicatedContainer(final ContainerInfo container, if (excess > 0) { eligibleReplicas.removeAll(unhealthyReplicas); Set replicaSet = new HashSet<>(eligibleReplicas); - boolean satisfied = - getPlacementStatus(replicaSet, replicationFactor) - .isPolicySatisfied(); + ContainerPlacementStatus ps = + getPlacementStatus(replicaSet, replicationFactor); for (ContainerReplica r : eligibleReplicas) { if (excess <= 0) { break; @@ -670,11 +669,13 @@ private void handleOverReplicatedContainer(final ContainerInfo container, // First remove the replica we are working on from the set, and then // check if the set is now mis-replicated. replicaSet.remove(r); - boolean nowSatisfied = - getPlacementStatus(replicaSet, replicationFactor) - .isPolicySatisfied(); - if (!satisfied || nowSatisfied) { + ContainerPlacementStatus nowPS = + getPlacementStatus(replicaSet, replicationFactor); + if ((!ps.isPolicySatisfied() + && nowPS.actualPlacementCount() == ps.actualPlacementCount()) + || (ps.isPolicySatisfied() && nowPS.isPolicySatisfied())) { // Remove the replica if the container was already unsatisfied + // and losing this replica keep actual placement count unchanged. // OR if losing this replica still keep satisfied sendDeleteCommand(container, r.getDatanodeDetails(), true); excess -= 1; diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java index 51692c3cde8e..575ac3247339 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java @@ -764,6 +764,44 @@ public void overReplicatedButRemovingMakesMisReplicated() replicaFive.getDatanodeDetails())); } + @Test + public void testOverReplicatedAndPolicySatisfied() throws + SCMException, ContainerNotFoundException, InterruptedException { + final ContainerInfo container = getContainer(LifeCycleState.CLOSED); + final ContainerID id = container.containerID(); + final UUID originNodeId = UUID.randomUUID(); + final ContainerReplica replicaOne = getReplicas( + id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); + final ContainerReplica replicaTwo = getReplicas( + id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); + final ContainerReplica replicaThree = getReplicas( + id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); + final ContainerReplica replicaFour = getReplicas( + id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); + + containerStateManager.loadContainer(container); + containerStateManager.updateContainerReplica(id, replicaOne); + containerStateManager.updateContainerReplica(id, replicaTwo); + containerStateManager.updateContainerReplica(id, replicaThree); + containerStateManager.updateContainerReplica(id, replicaFour); + + Mockito.when(containerPlacementPolicy.validateContainerPlacement( + Mockito.argThat(new ListOfNElements(3)), + Mockito.anyInt() + )).thenAnswer( + invocation -> new ContainerPlacementStatusDefault(2, 2, 3)); + + + final int currentDeleteCommandCount = datanodeCommandHandler + .getInvocationCount(SCMCommandProto.Type.deleteContainerCommand); + + replicationManager.processContainersNow(); + // Wait for EventQueue to call the event handler + Thread.sleep(100L); + Assert.assertEquals(currentDeleteCommandCount + 1, datanodeCommandHandler + .getInvocationCount(SCMCommandProto.Type.deleteContainerCommand)); + } + @After public void teardown() throws IOException { containerStateManager.close(); From 2fd1d9e5eba1b97e20cc2ea3f91346c279fd30ee Mon Sep 17 00:00:00 2001 From: baoloongmao Date: Wed, 16 Sep 2020 11:49:46 +0800 Subject: [PATCH 5/8] add an testOverReplicatedAndPolicyUnSatisfied test case --- .../scm/container/TestReplicationManager.java | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java index 575ac3247339..a9e6408a446f 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java @@ -791,7 +791,6 @@ public void testOverReplicatedAndPolicySatisfied() throws )).thenAnswer( invocation -> new ContainerPlacementStatusDefault(2, 2, 3)); - final int currentDeleteCommandCount = datanodeCommandHandler .getInvocationCount(SCMCommandProto.Type.deleteContainerCommand); @@ -802,6 +801,46 @@ public void testOverReplicatedAndPolicySatisfied() throws .getInvocationCount(SCMCommandProto.Type.deleteContainerCommand)); } + @Test + public void testOverReplicatedAndPolicyUnSatisfied() throws + SCMException, ContainerNotFoundException, InterruptedException { + final ContainerInfo container = getContainer(LifeCycleState.CLOSED); + final ContainerID id = container.containerID(); + final UUID originNodeId = UUID.randomUUID(); + final ContainerReplica replicaOne = getReplicas( + id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); + final ContainerReplica replicaTwo = getReplicas( + id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); + final ContainerReplica replicaThree = getReplicas( + id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); + final ContainerReplica replicaFour = getReplicas( + id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); + final ContainerReplica replicaFive = getReplicas( + id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); + + containerStateManager.loadContainer(container); + containerStateManager.updateContainerReplica(id, replicaOne); + containerStateManager.updateContainerReplica(id, replicaTwo); + containerStateManager.updateContainerReplica(id, replicaThree); + containerStateManager.updateContainerReplica(id, replicaFour); + containerStateManager.updateContainerReplica(id, replicaFive); + + Mockito.when(containerPlacementPolicy.validateContainerPlacement( + Mockito.argThat(new ListOfNElements(3)), + Mockito.anyInt() + )).thenAnswer( + invocation -> new ContainerPlacementStatusDefault(1, 2, 3)); + + final int currentDeleteCommandCount = datanodeCommandHandler + .getInvocationCount(SCMCommandProto.Type.deleteContainerCommand); + + replicationManager.processContainersNow(); + // Wait for EventQueue to call the event handler + Thread.sleep(100L); + Assert.assertEquals(currentDeleteCommandCount, datanodeCommandHandler + .getInvocationCount(SCMCommandProto.Type.deleteContainerCommand)); + } + @After public void teardown() throws IOException { containerStateManager.close(); From 8111f0f8456f13ffeee051eeafd2205452ba2142 Mon Sep 17 00:00:00 2001 From: baoloongmao Date: Wed, 16 Sep 2020 16:11:45 +0800 Subject: [PATCH 6/8] add an testOverReplicatedAndPolicyUnSatisfiedAndDeleted test case --- .../scm/container/TestReplicationManager.java | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java index a9e6408a446f..94e603018232 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java @@ -54,6 +54,7 @@ import java.util.Set; import java.util.UUID; import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -841,6 +842,47 @@ public void testOverReplicatedAndPolicyUnSatisfied() throws .getInvocationCount(SCMCommandProto.Type.deleteContainerCommand)); } + @Test + public void testOverReplicatedAndPolicyUnSatisfiedAndDeleted() throws + SCMException, ContainerNotFoundException, InterruptedException { + final ContainerInfo container = getContainer(LifeCycleState.CLOSED); + final ContainerID id = container.containerID(); + final UUID originNodeId = UUID.randomUUID(); + final ContainerReplica replicaOne = getReplicas( + id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); + final ContainerReplica replicaTwo = getReplicas( + id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); + final ContainerReplica replicaThree = getReplicas( + id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); + final ContainerReplica replicaFour = getReplicas( + id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); + final ContainerReplica replicaFive = getReplicas( + id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); + + containerStateManager.loadContainer(container); + containerStateManager.updateContainerReplica(id, replicaOne); + containerStateManager.updateContainerReplica(id, replicaTwo); + containerStateManager.updateContainerReplica(id, replicaThree); + containerStateManager.updateContainerReplica(id, replicaFour); + containerStateManager.updateContainerReplica(id, replicaFive); + + Mockito.when(containerPlacementPolicy.validateContainerPlacement( + Mockito.argThat(new FunctionMatcher(list -> + list != null && ((List) list).size() <= 4)), + Mockito.anyInt() + )).thenAnswer( + invocation -> new ContainerPlacementStatusDefault(1, 2, 3)); + + final int currentDeleteCommandCount = datanodeCommandHandler + .getInvocationCount(SCMCommandProto.Type.deleteContainerCommand); + + replicationManager.processContainersNow(); + // Wait for EventQueue to call the event handler + Thread.sleep(100L); + Assert.assertEquals(currentDeleteCommandCount + 2, datanodeCommandHandler + .getInvocationCount(SCMCommandProto.Type.deleteContainerCommand)); + } + @After public void teardown() throws IOException { containerStateManager.close(); @@ -908,4 +950,17 @@ public boolean matches(Object argument) { } } + class FunctionMatcher extends ArgumentMatcher { + + Function function; + + FunctionMatcher(Function function) { + this.function = function; + } + + @Override + public boolean matches(Object argument) { + return function.apply(argument); + } + } } \ No newline at end of file From c56cdd1b9b990546db2f08ba0b9f7c0e71e3299c Mon Sep 17 00:00:00 2001 From: baoloongmao Date: Wed, 16 Sep 2020 16:33:16 +0800 Subject: [PATCH 7/8] Remove the testOverReplicatedAndPolicyUnSatisfied test case --- .../scm/container/TestReplicationManager.java | 40 ------------------- 1 file changed, 40 deletions(-) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java index 94e603018232..a5fde48c5f70 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java @@ -802,46 +802,6 @@ public void testOverReplicatedAndPolicySatisfied() throws .getInvocationCount(SCMCommandProto.Type.deleteContainerCommand)); } - @Test - public void testOverReplicatedAndPolicyUnSatisfied() throws - SCMException, ContainerNotFoundException, InterruptedException { - final ContainerInfo container = getContainer(LifeCycleState.CLOSED); - final ContainerID id = container.containerID(); - final UUID originNodeId = UUID.randomUUID(); - final ContainerReplica replicaOne = getReplicas( - id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); - final ContainerReplica replicaTwo = getReplicas( - id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); - final ContainerReplica replicaThree = getReplicas( - id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); - final ContainerReplica replicaFour = getReplicas( - id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); - final ContainerReplica replicaFive = getReplicas( - id, State.CLOSED, 1000L, originNodeId, randomDatanodeDetails()); - - containerStateManager.loadContainer(container); - containerStateManager.updateContainerReplica(id, replicaOne); - containerStateManager.updateContainerReplica(id, replicaTwo); - containerStateManager.updateContainerReplica(id, replicaThree); - containerStateManager.updateContainerReplica(id, replicaFour); - containerStateManager.updateContainerReplica(id, replicaFive); - - Mockito.when(containerPlacementPolicy.validateContainerPlacement( - Mockito.argThat(new ListOfNElements(3)), - Mockito.anyInt() - )).thenAnswer( - invocation -> new ContainerPlacementStatusDefault(1, 2, 3)); - - final int currentDeleteCommandCount = datanodeCommandHandler - .getInvocationCount(SCMCommandProto.Type.deleteContainerCommand); - - replicationManager.processContainersNow(); - // Wait for EventQueue to call the event handler - Thread.sleep(100L); - Assert.assertEquals(currentDeleteCommandCount, datanodeCommandHandler - .getInvocationCount(SCMCommandProto.Type.deleteContainerCommand)); - } - @Test public void testOverReplicatedAndPolicyUnSatisfiedAndDeleted() throws SCMException, ContainerNotFoundException, InterruptedException { From ecc8c6d9a5fcc1b540a0ad2c93ea16fb6a916abc Mon Sep 17 00:00:00 2001 From: baoloongmao Date: Wed, 16 Sep 2020 16:34:38 +0800 Subject: [PATCH 8/8] Fix style --- .../hadoop/hdds/scm/container/TestReplicationManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java index a5fde48c5f70..b1e27c0816e8 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java @@ -912,7 +912,7 @@ public boolean matches(Object argument) { class FunctionMatcher extends ArgumentMatcher { - Function function; + private Function function; FunctionMatcher(Function function) { this.function = function;