diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java index 822510512663..7b73ba2d39e1 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/volume/StorageVolume.java @@ -614,6 +614,15 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused) return VolumeCheckResult.HEALTHY; } + // The disk read/write check needs free space: require twice the health check file size. + // If less than that is available, skip the read/write check, add a passing (true) entry + // to the IO test sliding window and report HEALTHY, so a full disk is not treated as a failed volume. + int minimumDiskSpace = healthCheckFileSize * 2; + if (volumeInfo.get().getCurrentUsage().getAvailable() < minimumDiskSpace) { + ioTestSlidingWindow.add(true); + return VolumeCheckResult.HEALTHY; + } + // Since IO errors may be intermittent, volume remains healthy until the // threshold of failures is crossed. boolean diskChecksPassed = DiskCheckUtil.checkReadWrite(storageDir, @@ -625,6 +634,14 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused) " interrupted."); } + // Writes keep happening, so the disk may have become full while the check above was running. + // If the check failed, look at the available space again; if it is now below the minimum, + // keep the volume healthy (recording a true entry) so that reads can still be served. + if (!diskChecksPassed && volumeInfo.get().getCurrentUsage().getAvailable() < minimumDiskSpace) { + ioTestSlidingWindow.add(true); + return VolumeCheckResult.HEALTHY; + } + // Move the sliding window of IO test results forward 1 by adding the // latest entry and removing the oldest entry from the window. // Update the failure counter for the new window. 
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeHealthChecks.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeHealthChecks.java index eddf80ef4246..9e16e4f9b72a 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeHealthChecks.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeHealthChecks.java @@ -110,6 +110,53 @@ public boolean checkExistence(File storageDir) { assertEquals(VolumeCheckResult.FAILED, result); } + @ParameterizedTest + @MethodSource("volumeBuilders") + public void testVolumeFullHealth(StorageVolume.Builder builder) throws Exception { + verifyFullVolumeHealthWithDiskReadWriteStatus(builder, true, false); + } + + + public void verifyFullVolumeHealthWithDiskReadWriteStatus(StorageVolume.Builder builder, boolean... checkResult) + throws Exception { + + for (boolean result : checkResult) { + StorageVolume volume = builder.build(); + + VolumeUsage usage = volume.getVolumeInfo().get().getUsageForTesting(); + DatanodeConfiguration dnConf = CONF.getObject(DatanodeConfiguration.class); + int minimumDiskSpace = dnConf.getVolumeHealthCheckFileSize() * 2; + // Use up space so that just under twice VolumeHealthCheckFileSize remains available. 
+ usage.incrementUsedSpace(usage.getCurrentUsage().getAvailable() - minimumDiskSpace + 1); + usage.realUsage(); + DiskCheckUtil.DiskChecks ioFailure = new DiskCheckUtil.DiskChecks() { + @Override + public boolean checkReadWrite(File storageDir, File testFileDir, + int numBytesToWrite) { + return result; + } + }; + DiskCheckUtil.setTestImpl(ioFailure); + // The volume stays healthy because it does not have enough space to run the read/write check + assertEquals(VolumeCheckResult.HEALTHY, volume.check(false)); + // The volume stays HEALTHY on the second check as well + assertEquals(VolumeCheckResult.HEALTHY, volume.check(false)); + + // Now free up enough space for the read/write check to run + usage.decrementUsedSpace(minimumDiskSpace + 1); + + // volumeIOFailureTolerance is 1, so the first check is always HEALTHY + assertEquals(VolumeCheckResult.HEALTHY, volume.check(false)); + if (result) { + // The volume stays healthy because the read/write check passes + assertEquals(VolumeCheckResult.HEALTHY, volume.check(false)); + } else { + // The second check fails the volume because the read/write check has failed again + assertEquals(VolumeCheckResult.FAILED, volume.check(false)); + } + } + } + @ParameterizedTest @MethodSource("volumeBuilders") public void testCheckPermissions(StorageVolume.Builder builder)