From 22b2ebb3c16a60adf6dc2171e8864fb25b05213a Mon Sep 17 00:00:00 2001 From: S O'Donnell Date: Mon, 10 Mar 2025 12:02:40 +0000 Subject: [PATCH 1/8] Add template for QC Stuck Over Rep Handler --- ...uasiClosedStuckOverReplicationHandler.java | 43 +++++++++++++++++++ .../replication/ReplicationManager.java | 15 ++++++- 2 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java new file mode 100644 index 000000000000..74522a35fdd3 --- /dev/null +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hdds.scm.container.replication; + +import java.io.IOException; +import java.util.List; +import java.util.Set; +import org.apache.hadoop.hdds.conf.ConfigurationSource; +import org.apache.hadoop.hdds.scm.PlacementPolicy; +import org.apache.hadoop.hdds.scm.container.ContainerReplica; + +/** + * Class to correct over replicated QuasiClosed Stuck Ratis containers. + */ +public class QuasiClosedStuckOverReplicationHandler implements UnhealthyReplicationHandler { + + + public QuasiClosedStuckOverReplicationHandler(final PlacementPolicy placementPolicy, + final ConfigurationSource conf, final ReplicationManager replicationManager) { + } + + @Override + public int processAndSendCommands(Set replicas, List pendingOps, + ContainerHealthResult result, int remainingMaintenanceRedundancy) + throws IOException { + return 0; + } +} diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java index edaaf7ef2627..b98d29293807 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java @@ -184,6 +184,7 @@ public class ReplicationManager implements SCMService, ContainerReplicaPendingOp private final RatisOverReplicationHandler ratisOverReplicationHandler; private final RatisMisReplicationHandler ratisMisReplicationHandler; private final QuasiClosedStuckUnderReplicationHandler quasiClosedStuckUnderReplicationHandler; + private final QuasiClosedStuckOverReplicationHandler quasiClosedStuckOverReplicationHandler; private Thread underReplicatedProcessorThread; private Thread overReplicatedProcessorThread; private final UnderReplicatedProcessor underReplicatedProcessor; @@ -252,6 +253,8 @@ public ReplicationManager(final ConfigurationSource conf, ratisContainerPlacement, conf, this); quasiClosedStuckUnderReplicationHandler = new QuasiClosedStuckUnderReplicationHandler(ratisContainerPlacement, conf, this); + quasiClosedStuckOverReplicationHandler = + new QuasiClosedStuckOverReplicationHandler(ratisContainerPlacement, conf, this); underReplicatedProcessor = new UnderReplicatedProcessor(this, rmConf::getUnderReplicatedInterval); overReplicatedProcessor = @@ -781,8 +784,16 @@ int processOverReplicatedContainer( containerReplicaPendingOps.getPendingOps(containerID); final boolean isEC = isEC(result.getContainerInfo().getReplicationConfig()); - final UnhealthyReplicationHandler handler = isEC ? ecOverReplicationHandler - : ratisOverReplicationHandler; + UnhealthyReplicationHandler handler; + if (isEC) { + handler = ecOverReplicationHandler; + } else { + if (QuasiClosedStuckReplicationCheck.shouldHandleAsQuasiClosedStuck(result.getContainerInfo(), replicas)) { + handler = quasiClosedStuckUnderReplicationHandler; + } else { + handler = ratisOverReplicationHandler; + } + } return handler.processAndSendCommands(replicas, pendingOps, result, getRemainingMaintenanceRedundancy(isEC)); From b79741465f640aeb2e5cd8436dc43fabbc16b69f Mon Sep 17 00:00:00 2001 From: S O'Donnell Date: Mon, 10 Mar 2025 16:08:03 +0000 Subject: [PATCH 2/8] Add logic for over rep handling and edge case tests --- ...uasiClosedStuckOverReplicationHandler.java | 76 +++++++- .../replication/ReplicationManager.java | 3 +- .../replication/ReplicationTestUtil.java | 22 ++- ...uasiClosedStuckOverReplicationHandler.java | 176 ++++++++++++++++++ 4 files changed, 269 insertions(+), 8 deletions(-) create mode 100644 hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestQuasiClosedStuckOverReplicationHandler.java diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java index 74522a35fdd3..47e5e4caabfe 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java @@ -18,10 +18,11 @@ package org.apache.hadoop.hdds.scm.container.replication; import java.io.IOException; +import java.util.Comparator; import java.util.List; import java.util.Set; -import org.apache.hadoop.hdds.conf.ConfigurationSource; -import org.apache.hadoop.hdds.scm.PlacementPolicy; +import java.util.stream.Collectors; +import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.ContainerReplica; /** @@ -29,15 +30,80 @@ */ public class QuasiClosedStuckOverReplicationHandler implements UnhealthyReplicationHandler { + private static final org.slf4j.Logger LOG = + org.slf4j.LoggerFactory.getLogger(QuasiClosedStuckOverReplicationHandler.class); + private final ReplicationManager replicationManager; + private final ReplicationManagerMetrics metrics; - public QuasiClosedStuckOverReplicationHandler(final PlacementPolicy placementPolicy, - final ConfigurationSource conf, final ReplicationManager replicationManager) { + public QuasiClosedStuckOverReplicationHandler(final ReplicationManager replicationManager) { + this.replicationManager = replicationManager; + this.metrics = replicationManager.getMetrics(); } @Override public int processAndSendCommands(Set replicas, List pendingOps, ContainerHealthResult result, int remainingMaintenanceRedundancy) throws IOException { - return 0; + + ContainerInfo containerInfo = result.getContainerInfo(); + LOG.debug("Handling over replicated QuasiClosed Stuck Ratis container {}", containerInfo); + + int pendingDelete = 0; + for (ContainerReplicaOp op : pendingOps) { + if (op.getOpType() == ContainerReplicaOp.PendingOpType.ADD) { + pendingDelete++; + } + } + + if (pendingDelete > 0) { + LOG.debug("Container {} has pending delete operations. No more over replication will be scheduled until they " + + "complete", containerInfo); + return 0; + } + + QuasiClosedStuckReplicaCount replicaCount = + new QuasiClosedStuckReplicaCount(replicas, remainingMaintenanceRedundancy); + + List misReplicatedOrigins + = replicaCount.getOverReplicatedOrigins(); + + if (misReplicatedOrigins.isEmpty()) { + LOG.debug("Container {} is not over replicated", containerInfo); + return 0; + } + + int totalCommandsSent = 0; + IOException firstException = null; + for (QuasiClosedStuckReplicaCount.MisReplicatedOrigin origin : misReplicatedOrigins) { + List sortedReplicas = getEligibleReplicas(origin.getSources()); + for (int i = 0; i < origin.getReplicaDelta(); i++) { + try { + replicationManager.sendThrottledDeleteCommand( + containerInfo, 0, sortedReplicas.get(i).getDatanodeDetails(), true); + totalCommandsSent++; + } catch (CommandTargetOverloadedException e) { + LOG.debug("Unable to send delete command for container {} to {} as it has too many pending delete commands", + containerInfo, sortedReplicas.get(i).getDatanodeDetails()); + firstException = e; + } + } + } + + if (firstException != null) { + // Some nodes were overloaded when attempting to send commands. + if (totalCommandsSent > 0) { + metrics.incrPartialReplicationTotal(); + } + throw firstException; + } + return totalCommandsSent; + } + + private List getEligibleReplicas( + Set replicas) { + // sort replicas so that they can be selected in a deterministic way + return replicas.stream() + .sorted(Comparator.comparingLong(ContainerReplica::hashCode)) + .collect(Collectors.toList()); } } diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java index b98d29293807..5f927d97b713 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java @@ -253,8 +253,7 @@ public ReplicationManager(final ConfigurationSource conf, ratisContainerPlacement, conf, this); quasiClosedStuckUnderReplicationHandler = new QuasiClosedStuckUnderReplicationHandler(ratisContainerPlacement, conf, this); - quasiClosedStuckOverReplicationHandler = - new QuasiClosedStuckOverReplicationHandler(ratisContainerPlacement, conf, this); + quasiClosedStuckOverReplicationHandler = new QuasiClosedStuckOverReplicationHandler(this); underReplicatedProcessor = new UnderReplicatedProcessor(this, rmConf::getUnderReplicatedInterval); overReplicatedProcessor = diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java index 93c670e3ab4f..444affca7db9 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java @@ -504,9 +504,29 @@ public static void mockRMSendDeleteCommand(ReplicationManager mock, * @param commandsSent Set to add the command to rather than sending it. */ public static void mockRMSendThrottledDeleteCommand(ReplicationManager mock, - Set>> commandsSent) + Set>> commandsSent) + throws NotLeaderException, CommandTargetOverloadedException { + mockRMSendThrottledDeleteCommand(mock, commandsSent, new AtomicBoolean(false)); + } + + /** + * Given a Mockito mock of ReplicationManager, this method will mock the + * sendThrottledDeleteCommand method so that it adds the command created to + * the commandsSent set. + * @param mock Mock of ReplicationManager + * @param commandsSent Set to add the command to rather than sending it. + * @param throwOverloaded If the atomic boolean is true, throw a + * CommandTargetOverloadedException and set the boolean + * to false, instead of creating the replicate command. + */ + public static void mockRMSendThrottledDeleteCommand(ReplicationManager mock, + Set>> commandsSent, AtomicBoolean throwOverloaded) throws NotLeaderException, CommandTargetOverloadedException { doAnswer((Answer) invocationOnMock -> { + if (throwOverloaded.get()) { + throwOverloaded.set(false); + throw new CommandTargetOverloadedException("Overloaded"); + } ContainerInfo containerInfo = invocationOnMock.getArgument(0); int replicaIndex = invocationOnMock.getArgument(1); DatanodeDetails target = invocationOnMock.getArgument(2); diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestQuasiClosedStuckOverReplicationHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestQuasiClosedStuckOverReplicationHandler.java new file mode 100644 index 000000000000..45907c3da06a --- /dev/null +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestQuasiClosedStuckOverReplicationHandler.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hdds.scm.container.replication; + +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor.THREE; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.hadoop.hdds.client.RatisReplicationConfig; +import org.apache.hadoop.hdds.conf.OzoneConfiguration; +import org.apache.hadoop.hdds.protocol.DatanodeDetails; +import org.apache.hadoop.hdds.protocol.MockDatanodeDetails; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; +import org.apache.hadoop.hdds.scm.container.ContainerInfo; +import org.apache.hadoop.hdds.scm.container.ContainerReplica; +import org.apache.hadoop.hdds.scm.node.NodeManager; +import org.apache.hadoop.hdds.scm.node.NodeStatus; +import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException; +import org.apache.hadoop.ozone.container.common.SCMTestUtils; +import org.apache.hadoop.ozone.protocol.commands.SCMCommand; +import org.apache.ratis.protocol.exceptions.NotLeaderException; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +/** + * Test for QuasiClosedStuckOverReplicationHandler. + */ +public class TestQuasiClosedStuckOverReplicationHandler { + + private static final RatisReplicationConfig RATIS_REPLICATION_CONFIG = RatisReplicationConfig.getInstance(THREE); + private ContainerInfo container; + private NodeManager nodeManager; + private OzoneConfiguration conf; + private ReplicationManager replicationManager; + private ReplicationManagerMetrics metrics; + private Set>> commandsSent; + private QuasiClosedStuckOverReplicationHandler handler; + + @BeforeEach + void setup(@TempDir File testDir) throws NodeNotFoundException, + CommandTargetOverloadedException, NotLeaderException { + container = ReplicationTestUtil.createContainer( + HddsProtos.LifeCycleState.QUASI_CLOSED, RATIS_REPLICATION_CONFIG); + + nodeManager = mock(NodeManager.class); + conf = SCMTestUtils.getConf(testDir); + replicationManager = mock(ReplicationManager.class); + OzoneConfiguration ozoneConfiguration = new OzoneConfiguration(); + ozoneConfiguration.setBoolean("hdds.scm.replication.push", true); + when(replicationManager.getConfig()) + .thenReturn(ozoneConfiguration.getObject( + ReplicationManager.ReplicationManagerConfiguration.class)); + metrics = ReplicationManagerMetrics.create(replicationManager); + when(replicationManager.getMetrics()).thenReturn(metrics); + + /* + Return NodeStatus with NodeOperationalState as specified in + DatanodeDetails, and NodeState as HEALTHY. + */ + when( + replicationManager.getNodeStatus(any(DatanodeDetails.class))) + .thenAnswer(invocationOnMock -> { + DatanodeDetails dn = invocationOnMock.getArgument(0); + return new NodeStatus(dn.getPersistedOpState(), + HddsProtos.NodeState.HEALTHY); + }); + + commandsSent = new HashSet<>(); + ReplicationTestUtil.mockRMSendThrottledDeleteCommand( + replicationManager, commandsSent); + handler = new QuasiClosedStuckOverReplicationHandler(replicationManager); + } + + @Test + public void testReturnsZeroIfNotOverReplicated() throws IOException { + UUID origin = UUID.randomUUID(); + Set replicas = ReplicationTestUtil.createReplicasWithOriginAndOpState(container.containerID(), + StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.QUASI_CLOSED, + Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE)); + + int count = handler.processAndSendCommands(replicas, Collections.emptyList(), getOverReplicatedHealthResult(), 1); + assertEquals(0, count); + } + + @Test + public void testNoCommandsScheduledIfPendingOps() throws IOException { + UUID origin = UUID.randomUUID(); + Set replicas = ReplicationTestUtil.createReplicasWithOriginAndOpState(container.containerID(), + StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.QUASI_CLOSED, + Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE)); + List pendingOps = new ArrayList<>(); + pendingOps.add(ContainerReplicaOp.create( + ContainerReplicaOp.PendingOpType.DELETE, MockDatanodeDetails.randomDatanodeDetails(), 0)); + + int count = handler.processAndSendCommands(replicas, pendingOps, getOverReplicatedHealthResult(), 1); + assertEquals(0, count); + } + + @Test + public void testCommandScheduledForOverReplicatedContainer() throws IOException { + UUID origin = UUID.randomUUID(); + Set replicas = ReplicationTestUtil.createReplicasWithOriginAndOpState(container.containerID(), + StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.QUASI_CLOSED, + Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE)); + + int count = handler.processAndSendCommands(replicas, Collections.emptyList(), getOverReplicatedHealthResult(), 1); + assertEquals(1, count); + SCMCommand command = commandsSent.iterator().next().getRight(); + assertEquals(StorageContainerDatanodeProtocolProtos.SCMCommandProto.Type.deleteContainerCommand, command.getType()); + } + + @Test + public void testOverloadedExceptionContinuesAndThrows() throws NotLeaderException, CommandTargetOverloadedException { + UUID origin1 = UUID.randomUUID(); + UUID origin2 = UUID.randomUUID(); + Set replicas = ReplicationTestUtil.createReplicasWithOriginAndOpState(container.containerID(), + StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.QUASI_CLOSED, + Pair.of(origin1, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin1, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin1, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin2, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin2, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin2, HddsProtos.NodeOperationalState.IN_SERVICE)); + + ReplicationTestUtil.mockRMSendThrottledDeleteCommand(replicationManager, commandsSent, new AtomicBoolean(true)); + + assertThrows(CommandTargetOverloadedException.class, () -> + handler.processAndSendCommands(replicas, Collections.emptyList(), getOverReplicatedHealthResult(), 1)); + assertEquals(1, commandsSent.size()); + } + + + private ContainerHealthResult.OverReplicatedHealthResult getOverReplicatedHealthResult() { + ContainerHealthResult.OverReplicatedHealthResult + healthResult = mock(ContainerHealthResult.OverReplicatedHealthResult.class); + when(healthResult.getContainerInfo()).thenReturn(container); + return healthResult; + } + +} From 011450aae4e9662244f754daaccaea4053847a95 Mon Sep 17 00:00:00 2001 From: S O'Donnell Date: Wed, 12 Mar 2025 17:06:00 +0000 Subject: [PATCH 3/8] Added test scenarios --- ...uasiClosedStuckOverReplicationHandler.java | 4 +- .../replication/ReplicationManager.java | 2 +- .../replicationManagerTests/quasi_closed.json | 71 +++++++++++++++++++ 3 files changed, 74 insertions(+), 3 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java index 47e5e4caabfe..2a83c5d19018 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java @@ -75,7 +75,7 @@ public int processAndSendCommands(Set replicas, List sortedReplicas = getEligibleReplicas(origin.getSources()); + List sortedReplicas = getSortedReplicas(origin.getSources()); for (int i = 0; i < origin.getReplicaDelta(); i++) { try { replicationManager.sendThrottledDeleteCommand( @@ -99,7 +99,7 @@ public int processAndSendCommands(Set replicas, List getEligibleReplicas( + private List getSortedReplicas( Set replicas) { // sort replicas so that they can be selected in a deterministic way return replicas.stream() diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java index 5f927d97b713..09245b2bee7a 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java @@ -788,7 +788,7 @@ int processOverReplicatedContainer( handler = ecOverReplicationHandler; } else { if (QuasiClosedStuckReplicationCheck.shouldHandleAsQuasiClosedStuck(result.getContainerInfo(), replicas)) { - handler = quasiClosedStuckUnderReplicationHandler; + handler = quasiClosedStuckOverReplicationHandler; } else { handler = ratisOverReplicationHandler; } diff --git a/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json b/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json index 8b841635b730..31cb6f7c3b16 100644 --- a/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json +++ b/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json @@ -181,5 +181,76 @@ "commands": [ { "type": "replicateContainerCommand"} ] + }, + { "description": "Quasi-Closed stuck one origin over replicated", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 10, + "replicas": [ + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d1", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d2", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d3", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d4", "sequenceId": 10, "isEmpty": false, "origin": "o1"} + ], + "expectation": { "underReplicated": 0, "underReplicatedQueue": 0, "overReplicated": 1, "overReplicatedQueue": 1, "quasiClosedStuck": 1, "unhealthy": 0 }, + "checkCommands": [], + "commands": [ + { "type": "deleteContainerCommand" } + ] + }, + { "description": "Quasi-Closed stuck two origins over replicated", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 10, + "replicas": [ + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d1", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d2", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d3", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d4", "sequenceId": 10, "isEmpty": false, "origin": "o2"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d5", "sequenceId": 10, "isEmpty": false, "origin": "o2"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d6", "sequenceId": 10, "isEmpty": false, "origin": "o2"} + ], + "expectation": { "underReplicated": 0, "underReplicatedQueue": 0, "overReplicated": 1, "overReplicatedQueue": 1, "quasiClosedStuck": 1, "unhealthy": 0 }, + "checkCommands": [], + "commands": [ + { "type": "deleteContainerCommand" }, + { "type": "deleteContainerCommand" } + ] + }, + { "description": "Quasi-Closed stuck two origins not over replicated with maintenance", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 10, + "replicas": [ + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d1", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d2", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d3", "sequenceId": 10, "isEmpty": false, "origin": "o1", "operationalState": "IN_MAINTENANCE" }, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d4", "sequenceId": 10, "isEmpty": false, "origin": "o2"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d5", "sequenceId": 10, "isEmpty": false, "origin": "o2"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d6", "sequenceId": 10, "isEmpty": false, "origin": "o2", "operationalState": "IN_MAINTENANCE"} + ], + "expectation": { "underReplicated": 0, "underReplicatedQueue": 0, "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 1, "unhealthy": 0 }, + "checkCommands": [], + "commands": [] + }, + { "description": "Quasi-Closed stuck two origins not over replicated with decommission", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 10, + "replicas": [ + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d1", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d2", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d3", "sequenceId": 10, "isEmpty": false, "origin": "o1", "operationalState": "DECOMMISSIONED" }, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d4", "sequenceId": 10, "isEmpty": false, "origin": "o2"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d5", "sequenceId": 10, "isEmpty": false, "origin": "o2"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d6", "sequenceId": 10, "isEmpty": false, "origin": "o2", "operationalState": "DECOMMISSIONED"} + ], + "expectation": { "underReplicated": 0, "underReplicatedQueue": 0, "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 1, "unhealthy": 0 }, + "checkCommands": [], + "commands": [] + }, + { "description": "Quasi-Closed stuck two origins over replicated with maintenance", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 10, + "replicas": [ + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d1", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d2", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d3", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d4", "sequenceId": 10, "isEmpty": false, "origin": "o1", "operationalState": "IN_MAINTENANCE" }, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d5", "sequenceId": 10, "isEmpty": false, "origin": "o2"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d6", "sequenceId": 10, "isEmpty": false, "origin": "o2"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d7", "sequenceId": 10, "isEmpty": false, "origin": "o2", "operationalState": "IN_MAINTENANCE"} + ], + "expectation": { "underReplicated": 0, "underReplicatedQueue": 0, "overReplicated": 1, "overReplicatedQueue": 1, "quasiClosedStuck": 1, "unhealthy": 0 }, + "checkCommands": [], + "commands": [ + { "type": "deleteContainerCommand" } + ] } ] From 974aad917d723b5506d286d63bfe5207d596de7e Mon Sep 17 00:00:00 2001 From: S O'Donnell Date: Wed, 12 Mar 2025 18:25:41 +0000 Subject: [PATCH 4/8] Fix findbugs --- .../TestQuasiClosedStuckOverReplicationHandler.java | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestQuasiClosedStuckOverReplicationHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestQuasiClosedStuckOverReplicationHandler.java index 45907c3da06a..24fb83664683 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestQuasiClosedStuckOverReplicationHandler.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestQuasiClosedStuckOverReplicationHandler.java @@ -24,7 +24,6 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; @@ -42,15 +41,12 @@ import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos; import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.ContainerReplica; -import org.apache.hadoop.hdds.scm.node.NodeManager; import org.apache.hadoop.hdds.scm.node.NodeStatus; import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException; -import org.apache.hadoop.ozone.container.common.SCMTestUtils; import org.apache.hadoop.ozone.protocol.commands.SCMCommand; import org.apache.ratis.protocol.exceptions.NotLeaderException; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; /** * Test for QuasiClosedStuckOverReplicationHandler. @@ -59,21 +55,17 @@ public class TestQuasiClosedStuckOverReplicationHandler { private static final RatisReplicationConfig RATIS_REPLICATION_CONFIG = RatisReplicationConfig.getInstance(THREE); private ContainerInfo container; - private NodeManager nodeManager; - private OzoneConfiguration conf; private ReplicationManager replicationManager; private ReplicationManagerMetrics metrics; private Set>> commandsSent; private QuasiClosedStuckOverReplicationHandler handler; @BeforeEach - void setup(@TempDir File testDir) throws NodeNotFoundException, + void setup() throws NodeNotFoundException, CommandTargetOverloadedException, NotLeaderException { container = ReplicationTestUtil.createContainer( HddsProtos.LifeCycleState.QUASI_CLOSED, RATIS_REPLICATION_CONFIG); - nodeManager = mock(NodeManager.class); - conf = SCMTestUtils.getConf(testDir); replicationManager = mock(ReplicationManager.class); OzoneConfiguration ozoneConfiguration = new OzoneConfiguration(); ozoneConfiguration.setBoolean("hdds.scm.replication.push", true); From 9831a5b180ecbe90a2fe3d379f0bf34a2f907150 Mon Sep 17 00:00:00 2001 From: S O'Donnell Date: Thu, 13 Mar 2025 12:21:35 +0000 Subject: [PATCH 5/8] Fix pending op handling --- ...uasiClosedStuckOverReplicationHandler.java | 4 ++- ...uasiClosedStuckOverReplicationHandler.java | 31 ++++++++++--------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java index 2a83c5d19018..721b04ddc3f8 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java @@ -22,8 +22,10 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.ContainerReplica; +import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException; /** * Class to correct over replicated QuasiClosed Stuck Ratis containers. @@ -50,7 +52,7 @@ public int processAndSendCommands(Set replicas, List>> commandsSent; private QuasiClosedStuckOverReplicationHandler handler; + private UUID origin1 = UUID.randomUUID(); + private UUID origin2 = UUID.randomUUID(); @BeforeEach void setup() throws NodeNotFoundException, @@ -95,12 +97,12 @@ void setup() throws NodeNotFoundException, @Test public void testReturnsZeroIfNotOverReplicated() throws IOException { - UUID origin = UUID.randomUUID(); Set replicas = ReplicationTestUtil.createReplicasWithOriginAndOpState(container.containerID(), StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.QUASI_CLOSED, - Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE), - Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE), - Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE)); + Pair.of(origin1, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin1, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin2, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin2, HddsProtos.NodeOperationalState.IN_SERVICE)); int count = handler.processAndSendCommands(replicas, Collections.emptyList(), getOverReplicatedHealthResult(), 1); assertEquals(0, count); @@ -108,11 +110,14 @@ public void testReturnsZeroIfNotOverReplicated() throws IOException { @Test public void testNoCommandsScheduledIfPendingOps() throws IOException { - UUID origin = UUID.randomUUID(); Set replicas = ReplicationTestUtil.createReplicasWithOriginAndOpState(container.containerID(), StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.QUASI_CLOSED, - Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE), - Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE)); + Pair.of(origin1, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin1, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin1, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin2, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin2, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin2, HddsProtos.NodeOperationalState.IN_SERVICE)); List pendingOps = new ArrayList<>(); pendingOps.add(ContainerReplicaOp.create( ContainerReplicaOp.PendingOpType.DELETE, MockDatanodeDetails.randomDatanodeDetails(), 0)); @@ -123,13 +128,13 @@ public void testNoCommandsScheduledIfPendingOps() throws IOException { @Test public void testCommandScheduledForOverReplicatedContainer() throws IOException { - UUID origin = UUID.randomUUID(); Set replicas = ReplicationTestUtil.createReplicasWithOriginAndOpState(container.containerID(), StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.QUASI_CLOSED, - Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE), - Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE), - Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE), - Pair.of(origin, HddsProtos.NodeOperationalState.IN_SERVICE)); + Pair.of(origin1, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin1, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin1, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin2, HddsProtos.NodeOperationalState.IN_SERVICE), + Pair.of(origin2, HddsProtos.NodeOperationalState.IN_SERVICE)); int count = handler.processAndSendCommands(replicas, Collections.emptyList(), getOverReplicatedHealthResult(), 1); assertEquals(1, count); @@ -139,8 +144,6 @@ public void testCommandScheduledForOverReplicatedContainer() throws IOException @Test public void testOverloadedExceptionContinuesAndThrows() throws NotLeaderException, CommandTargetOverloadedException { - UUID origin1 = UUID.randomUUID(); - UUID origin2 = UUID.randomUUID(); Set replicas = ReplicationTestUtil.createReplicasWithOriginAndOpState(container.containerID(), StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.QUASI_CLOSED, Pair.of(origin1, HddsProtos.NodeOperationalState.IN_SERVICE), From 0e6717e8630972369949c5caee844b3d5b0c5858 Mon Sep 17 00:00:00 2001 From: S O'Donnell Date: Thu, 13 Mar 2025 12:23:31 +0000 Subject: [PATCH 6/8] Remove stale replicas before deleting over rep containers --- .../QuasiClosedStuckOverReplicationHandler.java | 15 ++++++++++++++- .../replicationManagerTests/quasi_closed.json | 13 +++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java index 721b04ddc3f8..3ff23b495e8c 100644 --- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java +++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/QuasiClosedStuckOverReplicationHandler.java @@ -63,8 +63,21 @@ public int processAndSendCommands(Set replicas, List healthyReplicas = replicas.stream() + .filter(replica -> { + try { + return replicationManager.getNodeStatus( + replica.getDatanodeDetails()).getHealth() == HddsProtos.NodeState.HEALTHY; + } catch (NodeNotFoundException e) { + return false; + } + }) + .collect(Collectors.toSet()); + QuasiClosedStuckReplicaCount replicaCount = - new QuasiClosedStuckReplicaCount(replicas, remainingMaintenanceRedundancy); + new QuasiClosedStuckReplicaCount(healthyReplicas, remainingMaintenanceRedundancy); List misReplicatedOrigins = replicaCount.getOverReplicatedOrigins(); diff --git a/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json b/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json index 31cb6f7c3b16..946f15ea4090 100644 --- a/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json +++ b/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json @@ -252,5 +252,18 @@ "commands": [ { "type": "deleteContainerCommand" } ] + }, + { "description": "Quasi-Closed stuck two origins over replicated with stale", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 10, + "replicas": [ + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d1", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d2", "sequenceId": 10, "isEmpty": false, "origin": "o1"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d3", "sequenceId": 10, "isEmpty": false, "origin": "o1", "healthState": "STALE" }, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d4", "sequenceId": 10, "isEmpty": false, "origin": "o2"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d5", "sequenceId": 10, "isEmpty": false, "origin": "o2"}, + { "state": "QUASI_CLOSED", "index": 0, "datanode": "d6", "sequenceId": 10, "isEmpty": false, "origin": "o2", "healthState": "STALE" } + ], + "expectation": { "underReplicated": 0, "underReplicatedQueue": 0, "overReplicated": 1, "overReplicatedQueue": 1, "quasiClosedStuck": 1, "unhealthy": 0 }, + "checkCommands": [], + "commands": [] } ] From 8f5c3ec6f41a1c281314003de864c5c223563732 Mon Sep 17 00:00:00 2001 From: S O'Donnell Date: Fri, 14 Mar 2025 11:59:46 +0000 Subject: [PATCH 7/8] Allow scenario tests to assert a command is sent to one of a list of DNs --- .../TestReplicationManagerScenarios.java | 37 ++++++++++++++----- .../replicationManagerTests/quasi_closed.json | 11 +++--- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerScenarios.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerScenarios.java index 6484fc3b976b..5acb58cab7f6 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerScenarios.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerScenarios.java @@ -130,6 +130,7 @@ private static List getTestFiles() throws URISyntaxException { private static List loadTestsInFile(URI testFile) throws IOException { + System.out.println("Loading test file: " + testFile); ObjectReader reader = new ObjectMapper().readerFor(Scenario.class); try (InputStream stream = testFile.toURL().openStream()) { try (MappingIterator iterator = reader.readValues(stream)) { @@ -332,11 +333,10 @@ private void assertExpectedCommands(Scenario scenario, boolean found = false; for (Pair> command : commandsSent) { if (command.getRight().getType() == expectedCommand.getType()) { - DatanodeDetails targetDatanode = expectedCommand.getTargetDatanode(); - if (targetDatanode != null) { + if (expectedCommand.hasExpectedDatanode()) { // We need to assert against the command the datanode is sent to DatanodeDetails commandDatanode = findDatanodeFromUUID(command.getKey()); - if (commandDatanode != null && commandDatanode.equals(targetDatanode)) { + if (commandDatanode != null && expectedCommand.isTargetExpected(commandDatanode)) { found = true; commandsSent.remove(command); break; @@ -550,6 +550,7 @@ public int getOverReplicatedQueue() { public static class ExpectedCommands { private SCMCommandProto.Type type; private String datanode; + private Set expectedDatanodes = new HashSet<>(); public void setDatanode(String datanode) { this.datanode = datanode; @@ -564,15 +565,33 @@ public SCMCommandProto.Type getType() { return type; } - public DatanodeDetails getTargetDatanode() { + public boolean hasExpectedDatanode() { + createExpectedDatanodes(); + return !expectedDatanodes.isEmpty(); + } + + public boolean isTargetExpected(DatanodeDetails dn) { + createExpectedDatanodes(); + return expectedDatanodes.contains(dn); + } + + private void createExpectedDatanodes() { + if (expectedDatanodes != null) { + return; + } + this.expectedDatanodes = new HashSet<>(); if (datanode == null) { - return null; + return; } - DatanodeDetails datanodeDetails = DATANODE_ALIASES.get(this.datanode); - if (datanodeDetails == null) { - fail("Unable to find a datanode for the alias: " + datanode + " in the expected commands."); + String[] nodes = datanode.split("\\|"); + for (String n : nodes) { + DatanodeDetails dn = DATANODE_ALIASES.get(n); + if (dn != null) { + expectedDatanodes.add(dn); + } else { + fail("Expected datanode not found: " + datanode); + } } - return datanodeDetails; } } diff --git a/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json b/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json index 946f15ea4090..3e6217d5dc88 100644 --- a/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json +++ b/hadoop-hdds/server-scm/src/test/resources/replicationManagerTests/quasi_closed.json @@ -179,8 +179,7 @@ "expectation": { "underReplicated": 1, "underReplicatedQueue": 1, "overReplicated": 0, "overReplicatedQueue": 0, "quasiClosedStuck": 1, "unhealthy": 0 }, "checkCommands": [], "commands": [ - { "type": "replicateContainerCommand"} - ] + { "type": "replicateContainerCommand", "datanode": "d4|d5" }] }, { "description": "Quasi-Closed stuck one origin over replicated", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 10, "replicas": [ @@ -192,7 +191,7 @@ "expectation": { "underReplicated": 0, "underReplicatedQueue": 0, "overReplicated": 1, "overReplicatedQueue": 1, "quasiClosedStuck": 1, "unhealthy": 0 }, "checkCommands": [], "commands": [ - { "type": "deleteContainerCommand" } + { "type": "deleteContainerCommand", "datanode": "d1|d2|d3|d4" } ] }, { "description": "Quasi-Closed stuck two origins over replicated", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 10, @@ -207,8 +206,8 @@ "expectation": { "underReplicated": 0, "underReplicatedQueue": 0, "overReplicated": 1, "overReplicatedQueue": 1, "quasiClosedStuck": 1, "unhealthy": 0 }, "checkCommands": [], "commands": [ - { "type": "deleteContainerCommand" }, - { "type": "deleteContainerCommand" } + { "type": "deleteContainerCommand", "datanode": "d1|d2|d3" }, + { "type": "deleteContainerCommand", "datanode": "d4|d5|d6" } ] }, { "description": "Quasi-Closed stuck two origins not over replicated with maintenance", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 10, @@ -250,7 +249,7 @@ "expectation": { "underReplicated": 0, "underReplicatedQueue": 0, "overReplicated": 1, "overReplicatedQueue": 1, "quasiClosedStuck": 1, "unhealthy": 0 }, "checkCommands": [], "commands": [ - { "type": "deleteContainerCommand" } + { "type": "deleteContainerCommand", "datanode": "d1|d2|d3" } ] }, { "description": "Quasi-Closed stuck two origins over replicated with stale", "containerState": "QUASI_CLOSED", "replicationConfig": "RATIS:THREE", "sequenceId": 10, From 6e523214d73b9e1a2a22df5ab46c689f94a59395 Mon Sep 17 00:00:00 2001 From: S O'Donnell Date: Fri, 14 Mar 2025 12:12:01 +0000 Subject: [PATCH 8/8] Make set start as null --- .../container/replication/TestReplicationManagerScenarios.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerScenarios.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerScenarios.java index 5acb58cab7f6..79f6299bace2 100644 --- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerScenarios.java +++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestReplicationManagerScenarios.java @@ -550,7 +550,7 @@ public int getOverReplicatedQueue() { public static class ExpectedCommands { private SCMCommandProto.Type type; private String datanode; - private Set expectedDatanodes = new HashSet<>(); + private Set expectedDatanodes; public void setDatanode(String datanode) { this.datanode = datanode;