-
Notifications
You must be signed in to change notification settings - Fork 587
HDDS-12239. Volume should not be marked as unhealthy when disk full #7830
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
64e4685
b620c21
caaa427
4404202
49d1956
b6f9c70
354efd1
b24600f
fb07fb0
1bdcbac
a9a3b46
16e4c88
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -58,9 +58,13 @@ public HddsVolume chooseVolume(List<HddsVolume> volumes, | |
| throw new DiskOutOfSpaceException("No more available volumes"); | ||
| } | ||
|
|
||
| List<HddsVolume> volumesWithWriteAllowed = | ||
| volumes.stream().filter(k -> k.getStorageState() != StorageVolume.VolumeState.READ_ONLY) | ||
| .collect(Collectors.toList()); | ||
|
|
||
| AvailableSpaceFilter filter = new AvailableSpaceFilter(maxContainerSize); | ||
|
|
||
| List<HddsVolume> volumesWithEnoughSpace = volumes.stream() | ||
| List<HddsVolume> volumesWithEnoughSpace = volumesWithWriteAllowed.stream() | ||
| .filter(filter) | ||
| .collect(Collectors.toList()); | ||
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -220,14 +220,16 @@ public void checkAllVolumes(StorageVolumeChecker checker) | |
| Set<? extends StorageVolume> failedVolumes; | ||
| try { | ||
| failedVolumes = checker.checkAllVolumes(allVolumes); | ||
| if (failedVolumes.size() > 0) { | ||
| LOG.warn("checkAllVolumes got {} failed volumes - {}", | ||
| failedVolumes.size(), failedVolumes); | ||
| } | ||
| } catch (InterruptedException e) { | ||
| Thread.currentThread().interrupt(); | ||
| throw new IOException("Interrupted while running disk check", e); | ||
| } | ||
|
|
||
| if (failedVolumes.size() > 0) { | ||
| LOG.warn("checkAllVolumes got {} failed volumes - {}", | ||
| failedVolumes.size(), failedVolumes); | ||
| if (failedVolumeMap.size() > 0 || failedVolumes.size() > 0) { | ||
sadanand48 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| handleVolumeFailures(failedVolumes); | ||
| } else { | ||
| LOG.debug("checkAllVolumes encountered no failures"); | ||
|
|
@@ -482,7 +484,7 @@ public StorageLocationReport[] getStorageReport() { | |
| rootDir = volumeInfo.get().getRootDir(); | ||
| SpaceUsageSource usage = volumeInfo.get().getCurrentUsage(); | ||
| scmUsed = usage.getUsedSpace(); | ||
| remaining = usage.getAvailable(); | ||
| remaining = volume.getStorageState() == HddsVolume.VolumeState.READ_ONLY ? 0 : usage.getAvailable(); | ||
|
||
| capacity = usage.getCapacity(); | ||
| committed = (volume instanceof HddsVolume) ? | ||
| ((HddsVolume) volume).getCommittedBytes() : 0; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,7 @@ | |
| import java.io.IOException; | ||
| import java.util.List; | ||
| import java.util.concurrent.atomic.AtomicInteger; | ||
| import java.util.stream.Collectors; | ||
|
|
||
| import static org.apache.hadoop.ozone.container.common.volume.VolumeChoosingUtil.logIfSomeVolumesOutOfSpace; | ||
| import static org.apache.hadoop.ozone.container.common.volume.VolumeChoosingUtil.throwDiskOutOfSpace; | ||
|
|
@@ -51,6 +52,10 @@ public HddsVolume chooseVolume(List<HddsVolume> volumes, | |
| throw new DiskOutOfSpaceException("No more available volumes"); | ||
| } | ||
|
|
||
| List<HddsVolume> volumesWithWriteAllowed = | ||
| volumes.stream().filter(k -> k.getStorageState() != StorageVolume.VolumeState.READ_ONLY) | ||
| .collect(Collectors.toList()); | ||
|
|
||
| AvailableSpaceFilter filter = new AvailableSpaceFilter(maxContainerSize); | ||
|
|
||
| // since volumes could've been removed because of the failure | ||
|
|
@@ -61,7 +66,7 @@ public HddsVolume chooseVolume(List<HddsVolume> volumes, | |
| int startVolumeIndex = currentVolumeIndex; | ||
|
|
||
| while (true) { | ||
| final HddsVolume volume = volumes.get(currentVolumeIndex); | ||
| final HddsVolume volume = volumesWithWriteAllowed.get(currentVolumeIndex); | ||
|
||
| // adjust for remaining capacity in Open containers | ||
| boolean hasEnoughSpace = filter.test(volume); | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -106,7 +106,8 @@ public enum VolumeState { | |
| NON_EXISTENT, | ||
| INCONSISTENT, | ||
| NOT_FORMATTED, | ||
| NOT_INITIALIZED | ||
| NOT_INITIALIZED, | ||
| READ_ONLY | ||
| } | ||
|
|
||
| private volatile VolumeState state; | ||
|
|
@@ -599,35 +600,47 @@ private void cleanTmpDiskCheckDir() { | |
| @Override | ||
| public synchronized VolumeCheckResult check(@Nullable Boolean unused) | ||
| throws Exception { | ||
| DiskCheckUtil.ReadWriteStatus readWriteStatus = DiskCheckUtil.checkPermissions(storageDir); | ||
|
|
||
| boolean directoryChecksPassed = | ||
| DiskCheckUtil.checkExistence(storageDir) && | ||
| DiskCheckUtil.checkPermissions(storageDir); | ||
| DiskCheckUtil.checkExistence(storageDir); | ||
| // If the directory is not present or has incorrect permissions, fail the | ||
| // volume immediately. This is not an intermittent error. | ||
| if (!directoryChecksPassed) { | ||
| if (!directoryChecksPassed || readWriteStatus == DiskCheckUtil.ReadWriteStatus.READ_FAIL) { | ||
| if (Thread.currentThread().isInterrupted()) { | ||
| throw new InterruptedException("Directory check of volume " + this + | ||
| " interrupted."); | ||
| } | ||
| return VolumeCheckResult.FAILED; | ||
| } | ||
|
|
||
| if (readWriteStatus == DiskCheckUtil.ReadWriteStatus.WRITE_FAIL) { | ||
| setState(VolumeState.READ_ONLY); | ||
|
||
| return VolumeCheckResult.HEALTHY; | ||
| } | ||
|
|
||
| // If IO test count is set to 0, IO tests for disk health are disabled. | ||
| if (ioTestCount == 0) { | ||
| return VolumeCheckResult.HEALTHY; | ||
errose28 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| // Since IO errors may be intermittent, volume remains healthy until the | ||
| // threshold of failures is crossed. | ||
| boolean diskChecksPassed = DiskCheckUtil.checkReadWrite(storageDir, | ||
| readWriteStatus = DiskCheckUtil.checkReadWrite(storageDir, | ||
| diskCheckDir, healthCheckFileSize); | ||
| if (readWriteStatus == DiskCheckUtil.ReadWriteStatus.WRITE_FAIL) { | ||
| // Mark volume as READ only | ||
| setState(VolumeState.READ_ONLY); | ||
| return VolumeCheckResult.HEALTHY; | ||
| } | ||
| if (Thread.currentThread().isInterrupted()) { | ||
| // Thread interrupt may have caused IO operations to abort. Do not | ||
| // consider this a failure. | ||
| throw new InterruptedException("IO check of volume " + this + | ||
| " interrupted."); | ||
| } | ||
|
|
||
| boolean diskChecksPassed = readWriteStatus == DiskCheckUtil.ReadWriteStatus.READ_WRITE_OK; | ||
| // Move the sliding window of IO test results forward 1 by adding the | ||
| // latest entry and removing the oldest entry from the window. | ||
| // Update the failure counter for the new window. | ||
|
|
@@ -640,8 +653,7 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused) | |
| currentIOFailureCount.decrementAndGet(); | ||
| } | ||
|
|
||
| // If the failure threshold has been crossed, fail the volume without | ||
| // further scans. | ||
| // If the failure threshold has been crossed, mark volume as READ only | ||
| // Once the volume is failed, it will not be checked anymore. | ||
| // The failure counts can be left as is. | ||
| if (currentIOFailureCount.get() > ioFailureTolerance) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This naming is more consistent with the container scanner, which uses ScanResult. FWIW, we moved the scan result to a separate class in the reconciliation branch. Volume scan is simpler than container scan, though, so I think an enum is good for now.