From 90987b996e4da3666ceb1171f9c7c892eef5cfe3 Mon Sep 17 00:00:00 2001 From: Aswin Shakil Balasubramanian Date: Wed, 12 Jun 2024 20:15:12 +0530 Subject: [PATCH 1/3] HDDS-10923. Container Scanner should still scan unhealthy containers. --- .../common/interfaces/Container.java | 6 --- .../container/keyvalue/KeyValueContainer.java | 15 +----- .../BackgroundContainerMetadataScanner.java | 3 +- .../container/common/ContainerTestUtils.java | 1 - .../TestBackgroundContainerDataScanner.java | 47 ------------------ ...estBackgroundContainerMetadataScanner.java | 49 ------------------- .../TestContainerScannersAbstract.java | 5 -- .../TestOnDemandContainerDataScanner.java | 40 --------------- 8 files changed, 3 insertions(+), 163 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Container.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Container.java index 5fe148d6aaac..12c6835ee773 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Container.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Container.java @@ -225,12 +225,6 @@ ContainerReplicaProto getContainerReport() */ long getBlockCommitSequenceId(); - /** - * Returns if the container metadata should be checked. The result depends - * on the state of the container. - */ - boolean shouldScanMetadata(); - /** * check and report the structural integrity of the container. * @return true if the integrity checks pass diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainer.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainer.java index de43a29d9f6d..31c861ee42c0 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainer.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/KeyValueContainer.java @@ -933,18 +933,6 @@ public File getContainerDBFile() { } - @Override - public boolean shouldScanMetadata() { - boolean shouldScan = - getContainerState() != ContainerDataProto.State.UNHEALTHY; - if (!shouldScan && LOG.isDebugEnabled()) { - LOG.debug("Container {} in state {} should not have its metadata " + - "scanned.", - containerData.getContainerID(), containerData.getState()); - } - return shouldScan; - } - @Override public ScanResult scanMetaData() throws InterruptedException { long containerId = containerData.getContainerID(); @@ -958,7 +946,8 @@ public ScanResult scanMetaData() throws InterruptedException { public boolean shouldScanData() { boolean shouldScan = getContainerState() == ContainerDataProto.State.CLOSED - || getContainerState() == ContainerDataProto.State.QUASI_CLOSED; + || getContainerState() == ContainerDataProto.State.QUASI_CLOSED + || getContainerState() == ContainerDataProto.State.UNHEALTHY; if (!shouldScan && LOG.isDebugEnabled()) { LOG.debug("Container {} in state {} should not have its data scanned.", containerData.getContainerID(), containerData.getState()); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerMetadataScanner.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerMetadataScanner.java index 018870e21d66..659b31630bd0 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerMetadataScanner.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerMetadataScanner.java @@ -96,7 +96,6 @@ public ContainerMetadataScannerMetrics getMetrics() { private boolean shouldScan(Container container) { // Full data scan also does a metadata scan. If a full data scan was done // recently, we can skip this metadata scan. - return container.shouldScanMetadata() && - !ContainerUtils.recentlyScanned(container, minScanGap, LOG); + return !ContainerUtils.recentlyScanned(container, minScanGap, LOG); } } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/ContainerTestUtils.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/ContainerTestUtils.java index c63f82025e09..b5b578554b19 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/ContainerTestUtils.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/ContainerTestUtils.java @@ -214,7 +214,6 @@ public static void setupMockContainer( when(data.getContainerID()).thenReturn(containerIdSeq.getAndIncrement()); when(c.getContainerData()).thenReturn(data); when(c.shouldScanData()).thenReturn(shouldScanData); - when(c.shouldScanMetadata()).thenReturn(true); when(c.getContainerData().getVolume()).thenReturn(vol); try { diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java index e34bfe9396a5..5a2123480b25 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java @@ -19,11 +19,7 @@ */ package org.apache.hadoop.ozone.container.ozoneimpl; -import org.apache.hadoop.hdfs.util.Canceler; -import org.apache.hadoop.hdfs.util.DataTransferThrottler; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; -import org.apache.hadoop.ozone.container.common.interfaces.Container; -import org.apache.hadoop.ozone.container.common.interfaces.Container.ScanResult; import org.apache.ozone.test.GenericTestUtils; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.BeforeEach; @@ -35,9 +31,6 @@ import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; -import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.UNHEALTHY; -import static org.apache.hadoop.ozone.container.common.ContainerTestUtils.getUnhealthyScanResult; -import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; @@ -45,7 +38,6 @@ import static org.mockito.Mockito.any; import static org.mockito.Mockito.eq; import static org.mockito.Mockito.atLeastOnce; -import static org.mockito.Mockito.atMostOnce; import static org.mockito.Mockito.never; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @@ -152,45 +144,6 @@ public void testScanTimestampUpdated() throws Exception { eq(deletedContainer.getContainerData().getContainerID()), any()); } - @Test - @Override - public void testUnhealthyContainerNotRescanned() throws Exception { - Container unhealthy = mockKeyValueContainer(); - when(unhealthy.scanMetaData()).thenReturn(ScanResult.healthy()); - when(unhealthy.scanData(any(DataTransferThrottler.class), - any(Canceler.class))).thenReturn(getUnhealthyScanResult()); - - setContainers(unhealthy, healthy); - - // First iteration should find the unhealthy container. - scanner.runIteration(); - verifyContainerMarkedUnhealthy(unhealthy, atMostOnce()); - ContainerDataScannerMetrics metrics = scanner.getMetrics(); - assertEquals(1, metrics.getNumScanIterations()); - assertEquals(2, metrics.getNumContainersScanned()); - assertEquals(1, metrics.getNumUnHealthyContainers()); - - // The unhealthy container should have been moved to the unhealthy state. - verify(unhealthy.getContainerData(), atMostOnce()) - .setState(UNHEALTHY); - // Update the mock to reflect this. - when(unhealthy.getContainerState()).thenReturn(UNHEALTHY); - assertFalse(unhealthy.shouldScanData()); - - // Clear metrics to check the next run. - metrics.resetNumContainersScanned(); - metrics.resetNumUnhealthyContainers(); - - scanner.runIteration(); - // The only invocation of unhealthy on this container should have been from - // the previous scan. - verifyContainerMarkedUnhealthy(unhealthy, atMostOnce()); - // This iteration should skip the already unhealthy container. - assertEquals(2, metrics.getNumScanIterations()); - assertEquals(1, metrics.getNumContainersScanned()); - assertEquals(0, metrics.getNumUnHealthyContainers()); - } - /** * A datanode will have one background data scanner per volume. When the * volume fails, the scanner thread should be terminated. diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java index 9bb3a382c60e..9af5e36b3a18 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java @@ -19,11 +19,7 @@ */ package org.apache.hadoop.ozone.container.ozoneimpl; -import org.apache.hadoop.hdfs.util.Canceler; -import org.apache.hadoop.hdfs.util.DataTransferThrottler; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; -import org.apache.hadoop.ozone.container.common.interfaces.Container; -import org.apache.hadoop.ozone.container.common.interfaces.Container.ScanResult; import org.apache.ozone.test.GenericTestUtils; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.BeforeEach; @@ -35,16 +31,11 @@ import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; -import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.UNHEALTHY; -import static org.apache.hadoop.ozone.container.common.ContainerTestUtils.getUnhealthyScanResult; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.mockito.Mockito.any; import static org.mockito.Mockito.atLeastOnce; -import static org.mockito.Mockito.atMostOnce; import static org.mockito.Mockito.never; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @@ -127,46 +118,6 @@ public void testUnhealthyContainersDetected() throws Exception { verifyContainerMarkedUnhealthy(openContainer, never()); } - @Test - @Override - public void testUnhealthyContainerNotRescanned() throws Exception { - Container unhealthy = mockKeyValueContainer(); - when(unhealthy.scanMetaData()).thenReturn(getUnhealthyScanResult()); - when(unhealthy.scanData( - any(DataTransferThrottler.class), any(Canceler.class))) - .thenReturn(ScanResult.healthy()); - - setContainers(unhealthy, healthy); - - // First iteration should find the unhealthy container. - scanner.runIteration(); - verifyContainerMarkedUnhealthy(unhealthy, atMostOnce()); - ContainerMetadataScannerMetrics metrics = scanner.getMetrics(); - assertEquals(1, metrics.getNumScanIterations()); - assertEquals(2, metrics.getNumContainersScanned()); - assertEquals(1, metrics.getNumUnHealthyContainers()); - - // The unhealthy container should have been moved to the unhealthy state. - verify(unhealthy.getContainerData(), atMostOnce()) - .setState(UNHEALTHY); - // Update the mock to reflect this. - when(unhealthy.getContainerState()).thenReturn(UNHEALTHY); - assertFalse(unhealthy.shouldScanMetadata()); - - // Clear metrics to check the next run. - metrics.resetNumContainersScanned(); - metrics.resetNumUnhealthyContainers(); - - scanner.runIteration(); - // The only invocation of unhealthy on this container should have been from - // the previous scan. - verifyContainerMarkedUnhealthy(unhealthy, atMostOnce()); - // This iteration should skip the already unhealthy container. - assertEquals(2, metrics.getNumScanIterations()); - assertEquals(1, metrics.getNumContainersScanned()); - assertEquals(0, metrics.getNumUnHealthyContainers()); - } - /** * A datanode will have one metadata scanner thread for the whole process. * When a volume fails, any the containers queued for scanning in that volume diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestContainerScannersAbstract.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestContainerScannersAbstract.java index d892e916ce60..941472d86c02 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestContainerScannersAbstract.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestContainerScannersAbstract.java @@ -121,9 +121,6 @@ public abstract void testPreviouslyScannedContainerIsScanned() @Test public abstract void testShutdownDuringScan() throws Exception; - @Test - public abstract void testUnhealthyContainerNotRescanned() throws Exception; - // HELPER METHODS protected void setScannedTimestampOld(Container container) { @@ -170,8 +167,6 @@ protected Container mockKeyValueContainer() { // and test it. when(unhealthy.shouldScanData()).thenCallRealMethod(); assertTrue(unhealthy.shouldScanData()); - when(unhealthy.shouldScanMetadata()).thenCallRealMethod(); - assertTrue(unhealthy.shouldScanMetadata()); when(unhealthy.getContainerData().getVolume()).thenReturn(vol); diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java index 8334c7b078cb..feba1301bb1e 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java @@ -20,8 +20,6 @@ package org.apache.hadoop.ozone.container.ozoneimpl; import org.apache.commons.compress.utils.Lists; -import org.apache.hadoop.hdfs.util.Canceler; -import org.apache.hadoop.hdfs.util.DataTransferThrottler; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.ozone.container.common.interfaces.Container; import org.apache.hadoop.ozone.container.common.interfaces.Container.ScanResult; @@ -39,7 +37,6 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; -import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.UNHEALTHY; import static org.apache.hadoop.ozone.container.common.ContainerTestUtils.getUnhealthyScanResult; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -49,7 +46,6 @@ import static org.mockito.Mockito.any; import static org.mockito.Mockito.eq; import static org.mockito.Mockito.atLeastOnce; -import static org.mockito.Mockito.atMostOnce; import static org.mockito.Mockito.never; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.verifyNoInteractions; @@ -256,42 +252,6 @@ public void testShutdownDuringScan() throws Exception { verifyContainerMarkedUnhealthy(healthy, never()); } - @Test - @Override - public void testUnhealthyContainerNotRescanned() throws Exception { - Container unhealthy = mockKeyValueContainer(); - when(unhealthy.scanMetaData()).thenReturn(ScanResult.healthy()); - when(unhealthy.scanData( - any(DataTransferThrottler.class), any(Canceler.class))) - .thenReturn(getUnhealthyScanResult()); - - // First iteration should find the unhealthy container. - scanContainer(unhealthy); - verifyContainerMarkedUnhealthy(unhealthy, atMostOnce()); - OnDemandScannerMetrics metrics = OnDemandContainerDataScanner.getMetrics(); - assertEquals(1, metrics.getNumContainersScanned()); - assertEquals(1, metrics.getNumUnHealthyContainers()); - - // The unhealthy container should have been moved to the unhealthy state. - verify(unhealthy.getContainerData(), atMostOnce()) - .setState(UNHEALTHY); - // Update the mock to reflect this. - when(unhealthy.getContainerState()).thenReturn(UNHEALTHY); - assertFalse(unhealthy.shouldScanData()); - - // Clear metrics to check the next run. - metrics.resetNumContainersScanned(); - metrics.resetNumUnhealthyContainers(); - - scanContainer(unhealthy); - // The only invocation of unhealthy on this container should have been from - // the previous scan. - verifyContainerMarkedUnhealthy(unhealthy, atMostOnce()); - // This iteration should skip the already unhealthy container. - assertEquals(0, metrics.getNumContainersScanned()); - assertEquals(0, metrics.getNumUnHealthyContainers()); - } - private void scanContainer(Container container) throws Exception { OnDemandContainerDataScanner.init(conf, controller); Optional> scanFuture = From 3bb03a5bb17b927d6627901d91824f8a82fc598a Mon Sep 17 00:00:00 2001 From: Aswin Shakil Balasubramanian Date: Fri, 21 Jun 2024 13:06:22 +0530 Subject: [PATCH 2/3] HDDS-10923. Update Tests. --- .../TestBackgroundContainerDataScanner.java | 47 ++++++++++++++++++ ...estBackgroundContainerMetadataScanner.java | 48 +++++++++++++++++++ .../TestContainerScannersAbstract.java | 3 ++ .../TestOnDemandContainerDataScanner.java | 40 ++++++++++++++++ 4 files changed, 138 insertions(+) diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java index 5a2123480b25..202f38fddb78 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java @@ -19,7 +19,11 @@ */ package org.apache.hadoop.ozone.container.ozoneimpl; +import org.apache.hadoop.hdfs.util.Canceler; +import org.apache.hadoop.hdfs.util.DataTransferThrottler; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.ozone.container.common.interfaces.Container; +import org.apache.hadoop.ozone.container.common.interfaces.Container.ScanResult; import org.apache.ozone.test.GenericTestUtils; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.BeforeEach; @@ -31,6 +35,8 @@ import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; +import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.UNHEALTHY; +import static org.apache.hadoop.ozone.container.common.ContainerTestUtils.getUnhealthyScanResult; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; @@ -38,9 +44,11 @@ import static org.mockito.Mockito.any; import static org.mockito.Mockito.eq; import static org.mockito.Mockito.atLeastOnce; +import static org.mockito.Mockito.atMostOnce; import static org.mockito.Mockito.never; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; +import static org.mockito.internal.verification.VerificationModeFactory.atMost; /** * Unit tests for the background container data scanner. @@ -144,6 +152,45 @@ public void testScanTimestampUpdated() throws Exception { eq(deletedContainer.getContainerData().getContainerID()), any()); } + @Test + @Override + public void testUnhealthyContainerRescanned() throws Exception { + Container unhealthy = mockKeyValueContainer(); + when(unhealthy.scanMetaData()).thenReturn(ScanResult.healthy()); + when(unhealthy.scanData(any(DataTransferThrottler.class), + any(Canceler.class))).thenReturn(getUnhealthyScanResult()); + + setContainers(unhealthy, healthy); + + // First iteration should find the unhealthy container. + scanner.runIteration(); + verifyContainerMarkedUnhealthy(unhealthy, atMostOnce()); + ContainerDataScannerMetrics metrics = scanner.getMetrics(); + assertEquals(1, metrics.getNumScanIterations()); + assertEquals(2, metrics.getNumContainersScanned()); + assertEquals(1, metrics.getNumUnHealthyContainers()); + + // The unhealthy container should have been moved to the unhealthy state. + verify(unhealthy.getContainerData(), atMostOnce()) + .setState(UNHEALTHY); + // Update the mock to reflect this. + when(unhealthy.getContainerState()).thenReturn(UNHEALTHY); + assertTrue(unhealthy.shouldScanData()); + + // Clear metrics to check the next run. + metrics.resetNumContainersScanned(); + metrics.resetNumUnhealthyContainers(); + + scanner.runIteration(); + // The only invocation of unhealthy on this container should have been from + // the previous scan. + verifyContainerMarkedUnhealthy(unhealthy, atMost(2)); + // This iteration should scan the unhealthy container. + assertEquals(2, metrics.getNumScanIterations()); + assertEquals(2, metrics.getNumContainersScanned()); + assertEquals(1, metrics.getNumUnHealthyContainers()); + } + /** * A datanode will have one background data scanner per volume. When the * volume fails, the scanner thread should be terminated. diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java index 9af5e36b3a18..7a4b9759b68c 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java @@ -19,7 +19,11 @@ */ package org.apache.hadoop.ozone.container.ozoneimpl; +import org.apache.hadoop.hdfs.util.Canceler; +import org.apache.hadoop.hdfs.util.DataTransferThrottler; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.ozone.container.common.interfaces.Container; +import org.apache.hadoop.ozone.container.common.interfaces.Container.ScanResult; import org.apache.ozone.test.GenericTestUtils; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.BeforeEach; @@ -31,14 +35,19 @@ import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; +import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.UNHEALTHY; +import static org.apache.hadoop.ozone.container.common.ContainerTestUtils.getUnhealthyScanResult; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.Mockito.any; import static org.mockito.Mockito.atLeastOnce; +import static org.mockito.Mockito.atMostOnce; import static org.mockito.Mockito.never; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; +import static org.mockito.internal.verification.VerificationModeFactory.atMost; /** * Unit tests for the background container metadata scanner. @@ -118,6 +127,45 @@ public void testUnhealthyContainersDetected() throws Exception { verifyContainerMarkedUnhealthy(openContainer, never()); } + @Test + @Override + public void testUnhealthyContainerRescanned() throws Exception { + Container unhealthy = mockKeyValueContainer(); + when(unhealthy.scanMetaData()).thenReturn(getUnhealthyScanResult()); + when(unhealthy.scanData( + any(DataTransferThrottler.class), any(Canceler.class))) + .thenReturn(ScanResult.healthy()); + + setContainers(unhealthy, healthy); + + // First iteration should find the unhealthy container. + scanner.runIteration(); + verifyContainerMarkedUnhealthy(unhealthy, atMostOnce()); + ContainerMetadataScannerMetrics metrics = scanner.getMetrics(); + assertEquals(1, metrics.getNumScanIterations()); + assertEquals(2, metrics.getNumContainersScanned()); + assertEquals(1, metrics.getNumUnHealthyContainers()); + + // The unhealthy container should have been moved to the unhealthy state. + verify(unhealthy.getContainerData(), atMostOnce()) + .setState(UNHEALTHY); + // Update the mock to reflect this. + when(unhealthy.getContainerState()).thenReturn(UNHEALTHY); + + // Clear metrics to check the next run. + metrics.resetNumContainersScanned(); + metrics.resetNumUnhealthyContainers(); + + scanner.runIteration(); + // The only invocation of unhealthy on this container should have been from + // the previous scan. + verifyContainerMarkedUnhealthy(unhealthy, atMost(2)); + // This iteration should skip the already unhealthy container. + assertEquals(2, metrics.getNumScanIterations()); + assertEquals(2, metrics.getNumContainersScanned()); + assertEquals(1, metrics.getNumUnHealthyContainers()); + } + /** * A datanode will have one metadata scanner thread for the whole process. * When a volume fails, any the containers queued for scanning in that volume diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestContainerScannersAbstract.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestContainerScannersAbstract.java index 941472d86c02..5d7d8c3100bd 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestContainerScannersAbstract.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestContainerScannersAbstract.java @@ -121,6 +121,9 @@ public abstract void testPreviouslyScannedContainerIsScanned() @Test public abstract void testShutdownDuringScan() throws Exception; + @Test + public abstract void testUnhealthyContainerRescanned() throws Exception; + // HELPER METHODS protected void setScannedTimestampOld(Container container) { diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java index feba1301bb1e..24818c842b30 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java @@ -20,6 +20,8 @@ package org.apache.hadoop.ozone.container.ozoneimpl; import org.apache.commons.compress.utils.Lists; +import org.apache.hadoop.hdfs.util.Canceler; +import org.apache.hadoop.hdfs.util.DataTransferThrottler; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.ozone.container.common.interfaces.Container; import org.apache.hadoop.ozone.container.common.interfaces.Container.ScanResult; @@ -37,6 +39,7 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; +import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.UNHEALTHY; import static org.apache.hadoop.ozone.container.common.ContainerTestUtils.getUnhealthyScanResult; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; @@ -46,10 +49,12 @@ import static org.mockito.Mockito.any; import static org.mockito.Mockito.eq; import static org.mockito.Mockito.atLeastOnce; +import static org.mockito.Mockito.atMostOnce; import static org.mockito.Mockito.never; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.verifyNoInteractions; import static org.mockito.Mockito.when; +import static org.mockito.internal.verification.VerificationModeFactory.atMost; /** * Unit tests for the on-demand container scanner. @@ -252,6 +257,41 @@ public void testShutdownDuringScan() throws Exception { verifyContainerMarkedUnhealthy(healthy, never()); } + @Test + @Override + public void testUnhealthyContainerRescanned() throws Exception { + Container unhealthy = mockKeyValueContainer(); + when(unhealthy.scanMetaData()).thenReturn(ScanResult.healthy()); + when(unhealthy.scanData( + any(DataTransferThrottler.class), any(Canceler.class))) + .thenReturn(getUnhealthyScanResult()); + + // First iteration should find the unhealthy container. + scanContainer(unhealthy); + verifyContainerMarkedUnhealthy(unhealthy, atMostOnce()); + OnDemandScannerMetrics metrics = OnDemandContainerDataScanner.getMetrics(); + assertEquals(1, metrics.getNumContainersScanned()); + assertEquals(1, metrics.getNumUnHealthyContainers()); + + // The unhealthy container should have been moved to the unhealthy state. + verify(unhealthy.getContainerData(), atMostOnce()) + .setState(UNHEALTHY); + // Update the mock to reflect this. + when(unhealthy.getContainerState()).thenReturn(UNHEALTHY); + assertTrue(unhealthy.shouldScanData()); + + // Clear metrics to check the next run. + metrics.resetNumContainersScanned(); + metrics.resetNumUnhealthyContainers(); + + // When rescanned the metrics should increase as we scan + // UNHEALTHY containers as well. + scanContainer(unhealthy); + verifyContainerMarkedUnhealthy(unhealthy, atMost(2)); + assertEquals(1, metrics.getNumContainersScanned()); + assertEquals(1, metrics.getNumUnHealthyContainers()); + } + private void scanContainer(Container container) throws Exception { OnDemandContainerDataScanner.init(conf, controller); Optional> scanFuture = From dd55fc1ea170e8e3f899933f2987be2dca8c93a0 Mon Sep 17 00:00:00 2001 From: Aswin Shakil Balasubramanian Date: Tue, 25 Jun 2024 01:12:31 +0530 Subject: [PATCH 3/3] HDDS-10923. Fix unhealthy container count and update Tests. --- .../BackgroundContainerDataScanner.java | 6 +++-- .../BackgroundContainerMetadataScanner.java | 6 +++-- .../ozoneimpl/ContainerController.java | 8 ++++--- .../OnDemandContainerDataScanner.java | 8 ++++--- .../TestBackgroundContainerDataScanner.java | 20 ++++++++--------- ...estBackgroundContainerMetadataScanner.java | 22 +++++++++---------- .../TestOnDemandContainerDataScanner.java | 17 ++++++++------ 7 files changed, 49 insertions(+), 38 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerDataScanner.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerDataScanner.java index 327f01922431..84d21a2017e1 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerDataScanner.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerDataScanner.java @@ -98,8 +98,10 @@ public void scanContainer(Container c) if (!result.isHealthy()) { LOG.error("Corruption detected in container [{}]. Marking it UNHEALTHY.", containerId, result.getException()); - metrics.incNumUnHealthyContainers(); - controller.markContainerUnhealthy(containerId, result); + boolean containerMarkedUnhealthy = controller.markContainerUnhealthy(containerId, result); + if (containerMarkedUnhealthy) { + metrics.incNumUnHealthyContainers(); + } } metrics.incNumContainersScanned(); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerMetadataScanner.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerMetadataScanner.java index 659b31630bd0..4f76efd08345 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerMetadataScanner.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/BackgroundContainerMetadataScanner.java @@ -79,8 +79,10 @@ public void scanContainer(Container container) if (!result.isHealthy()) { LOG.error("Corruption detected in container [{}]. Marking it UNHEALTHY.", containerID, result.getException()); - metrics.incNumUnHealthyContainers(); - controller.markContainerUnhealthy(containerID, result); + boolean containerMarkedUnhealthy = controller.markContainerUnhealthy(containerID, result); + if (containerMarkedUnhealthy) { + metrics.incNumUnHealthyContainers(); + } } // Do not update the scan timestamp after the scan since this was just a diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerController.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerController.java index feb86f351975..f9aa315664a7 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerController.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerController.java @@ -114,15 +114,17 @@ public void markContainerForClose(final long containerId) * @param reason The reason the container was marked unhealthy * @throws IOException in case of exception */ - public void markContainerUnhealthy(final long containerId, ScanResult reason) + public boolean markContainerUnhealthy(final long containerId, ScanResult reason) throws IOException { - Container container = containerSet.getContainer(containerId); - if (container != null) { + Container container = getContainer(containerId); + if (container != null && container.getContainerState() != State.UNHEALTHY) { getHandler(container).markContainerUnhealthy(container, reason); + return true; } else { LOG.warn("Container {} not found, may be deleted, skip mark UNHEALTHY", containerId); } + return false; } /** diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OnDemandContainerDataScanner.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OnDemandContainerDataScanner.java index 44884c5c2900..2cfd7930cb70 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OnDemandContainerDataScanner.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OnDemandContainerDataScanner.java @@ -144,9 +144,11 @@ private static void performOnDemandScan(Container container) { if (!result.isHealthy()) { LOG.error("Corruption detected in container [{}]." + "Marking it UNHEALTHY.", containerId, result.getException()); - instance.metrics.incNumUnHealthyContainers(); - instance.containerController.markContainerUnhealthy(containerId, - result); + boolean containerMarkedUnhealthy = instance.containerController + .markContainerUnhealthy(containerId, result); + if (containerMarkedUnhealthy) { + instance.metrics.incNumUnHealthyContainers(); + } } instance.metrics.incNumContainersScanned(); diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java index 202f38fddb78..06509af9f426 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerDataScanner.java @@ -42,13 +42,13 @@ import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.any; +import static org.mockito.Mockito.atMost; import static org.mockito.Mockito.eq; import static org.mockito.Mockito.atLeastOnce; import static org.mockito.Mockito.atMostOnce; import static org.mockito.Mockito.never; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; -import static org.mockito.internal.verification.VerificationModeFactory.atMost; /** * Unit tests for the background container data scanner. @@ -101,7 +101,7 @@ public void testScannerMetrics() { ContainerDataScannerMetrics metrics = scanner.getMetrics(); assertEquals(1, metrics.getNumScanIterations()); assertEquals(2, metrics.getNumContainersScanned()); - assertEquals(1, metrics.getNumUnHealthyContainers()); + assertEquals(0, metrics.getNumUnHealthyContainers()); } @Test @@ -159,7 +159,8 @@ public void testUnhealthyContainerRescanned() throws Exception { when(unhealthy.scanMetaData()).thenReturn(ScanResult.healthy()); when(unhealthy.scanData(any(DataTransferThrottler.class), any(Canceler.class))).thenReturn(getUnhealthyScanResult()); - + when(controller.markContainerUnhealthy(eq(unhealthy.getContainerData().getContainerID()), + any())).thenReturn(true); setContainers(unhealthy, healthy); // First iteration should find the unhealthy container. @@ -177,17 +178,16 @@ public void testUnhealthyContainerRescanned() throws Exception { when(unhealthy.getContainerState()).thenReturn(UNHEALTHY); assertTrue(unhealthy.shouldScanData()); - // Clear metrics to check the next run. - metrics.resetNumContainersScanned(); - metrics.resetNumUnhealthyContainers(); - + when(controller.markContainerUnhealthy(eq(unhealthy.getContainerData().getContainerID()), + any())).thenReturn(false); scanner.runIteration(); - // The only invocation of unhealthy on this container should have been from - // the previous scan. + // The invocation of unhealthy on this container will also happen in the + // next iteration. verifyContainerMarkedUnhealthy(unhealthy, atMost(2)); // This iteration should scan the unhealthy container. assertEquals(2, metrics.getNumScanIterations()); - assertEquals(2, metrics.getNumContainersScanned()); + assertEquals(4, metrics.getNumContainersScanned()); + // numUnHealthyContainers metrics is not incremented in the 2nd iteration. assertEquals(1, metrics.getNumUnHealthyContainers()); } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java index 7a4b9759b68c..c4c40de247f7 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestBackgroundContainerMetadataScanner.java @@ -41,13 +41,14 @@ import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.any; import static org.mockito.Mockito.atLeastOnce; +import static org.mockito.Mockito.atMost; import static org.mockito.Mockito.atMostOnce; import static org.mockito.Mockito.never; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; -import static org.mockito.internal.verification.VerificationModeFactory.atMost; /** * Unit tests for the background container metadata scanner. @@ -100,7 +101,7 @@ public void testScannerMetrics() { ContainerMetadataScannerMetrics metrics = scanner.getMetrics(); assertEquals(1, metrics.getNumScanIterations()); assertEquals(4, metrics.getNumContainersScanned()); - assertEquals(1, metrics.getNumUnHealthyContainers()); + assertEquals(0, metrics.getNumUnHealthyContainers()); } @Test @@ -135,7 +136,8 @@ public void testUnhealthyContainerRescanned() throws Exception { when(unhealthy.scanData( any(DataTransferThrottler.class), any(Canceler.class))) .thenReturn(ScanResult.healthy()); - + when(controller.markContainerUnhealthy(eq(unhealthy.getContainerData().getContainerID()), + any())).thenReturn(true); setContainers(unhealthy, healthy); // First iteration should find the unhealthy container. @@ -151,18 +153,16 @@ public void testUnhealthyContainerRescanned() throws Exception { .setState(UNHEALTHY); // Update the mock to reflect this. when(unhealthy.getContainerState()).thenReturn(UNHEALTHY); - - // Clear metrics to check the next run. - metrics.resetNumContainersScanned(); - metrics.resetNumUnhealthyContainers(); - + when(controller.markContainerUnhealthy(eq(unhealthy.getContainerData().getContainerID()), + any())).thenReturn(false); scanner.runIteration(); - // The only invocation of unhealthy on this container should have been from - // the previous scan. + // The invocation of unhealthy on this container will also happen in the + // next iteration. verifyContainerMarkedUnhealthy(unhealthy, atMost(2)); // This iteration should skip the already unhealthy container. assertEquals(2, metrics.getNumScanIterations()); - assertEquals(2, metrics.getNumContainersScanned()); + assertEquals(4, metrics.getNumContainersScanned()); + // numUnHealthyContainers metrics is not incremented in the 2nd iteration. assertEquals(1, metrics.getNumUnHealthyContainers()); } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java index 24818c842b30..c08dc935ad45 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/ozoneimpl/TestOnDemandContainerDataScanner.java @@ -47,6 +47,7 @@ import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.any; +import static org.mockito.Mockito.atMost; import static org.mockito.Mockito.eq; import static org.mockito.Mockito.atLeastOnce; import static org.mockito.Mockito.atMostOnce; @@ -54,7 +55,6 @@ import static org.mockito.Mockito.verify; import static org.mockito.Mockito.verifyNoInteractions; import static org.mockito.Mockito.when; -import static org.mockito.internal.verification.VerificationModeFactory.atMost; /** * Unit tests for the on-demand container scanner. @@ -176,7 +176,7 @@ public void testScannerMetrics() throws Exception { OnDemandScannerMetrics metrics = OnDemandContainerDataScanner.getMetrics(); //Containers with shouldScanData = false shouldn't increase // the number of scanned containers - assertEquals(1, metrics.getNumUnHealthyContainers()); + assertEquals(0, metrics.getNumUnHealthyContainers()); assertEquals(2, metrics.getNumContainersScanned()); } @@ -265,6 +265,8 @@ public void testUnhealthyContainerRescanned() throws Exception { when(unhealthy.scanData( any(DataTransferThrottler.class), any(Canceler.class))) .thenReturn(getUnhealthyScanResult()); + when(controller.markContainerUnhealthy(eq(unhealthy.getContainerData().getContainerID()), + any())).thenReturn(true); // First iteration should find the unhealthy container. scanContainer(unhealthy); @@ -279,16 +281,17 @@ public void testUnhealthyContainerRescanned() throws Exception { // Update the mock to reflect this. when(unhealthy.getContainerState()).thenReturn(UNHEALTHY); assertTrue(unhealthy.shouldScanData()); - - // Clear metrics to check the next run. - metrics.resetNumContainersScanned(); - metrics.resetNumUnhealthyContainers(); + when(controller.markContainerUnhealthy(eq(unhealthy.getContainerData().getContainerID()), + any())).thenReturn(false); // When rescanned the metrics should increase as we scan // UNHEALTHY containers as well. scanContainer(unhealthy); + // The invocation of unhealthy on this container will also happen in the + // next iteration. verifyContainerMarkedUnhealthy(unhealthy, atMost(2)); - assertEquals(1, metrics.getNumContainersScanned()); + assertEquals(2, metrics.getNumContainersScanned()); + // numUnHealthyContainers metrics is not incremented in the 2nd iteration. assertEquals(1, metrics.getNumUnHealthyContainers()); }