From 169c01912e8936c09efffbf6f046cbcd795d997f Mon Sep 17 00:00:00 2001 From: Swaminathan Balachandran Date: Wed, 16 Jul 2025 04:54:50 -0400 Subject: [PATCH 1/5] HDDS-13449. Incorrect Interrupt Handling for DirectoryDeletingService and KeyDeletingService Change-Id: Icb1730e2968da6a8eff1b22dfcc80f0d36fc28e4 --- .../om/service/DirectoryDeletingService.java | 20 +++++++++++++++---- .../ozone/om/service/KeyDeletingService.java | 11 ++++++---- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/DirectoryDeletingService.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/DirectoryDeletingService.java index f3fdebedd7f8..f679787937d7 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/DirectoryDeletingService.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/DirectoryDeletingService.java @@ -226,6 +226,15 @@ public BackgroundTaskQueue getTasks() { @Override public void shutdown() { + deletionThreadPool.shutdown(); + try { + if (!deletionThreadPool.awaitTermination(60, TimeUnit.SECONDS)) { + deletionThreadPool.shutdownNow(); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + deletionThreadPool.shutdownNow(); + } super.shutdown(); } @@ -238,7 +247,7 @@ void optimizeDirDeletesAndSubmitRequest( long remainingBufLimit, KeyManager keyManager, CheckedFunction, Boolean, IOException> reclaimableDirChecker, CheckedFunction, Boolean, IOException> reclaimableFileChecker, - UUID expectedPreviousSnapshotId, long rnCnt) { + UUID expectedPreviousSnapshotId, long rnCnt) throws InterruptedException { // Optimization to handle delete sub-dir and keys to remove quickly // This case will be useful to handle when depth of directory is high @@ -434,7 +443,7 @@ private OzoneManagerProtocolProtos.PurgePathRequest wrapPurgeRequest( } private OzoneManagerProtocolProtos.OMResponse submitPurgePaths(List requests, - String snapTableKey, UUID expectedPreviousSnapshotId) { + String snapTableKey, UUID expectedPreviousSnapshotId) throws InterruptedException { OzoneManagerProtocolProtos.PurgeDirectoriesRequest.Builder purgeDirRequest = OzoneManagerProtocolProtos.PurgeDirectoriesRequest.newBuilder(); @@ -460,7 +469,7 @@ private OzoneManagerProtocolProtos.OMResponse submitPurgePaths(List> totalExclusiveSizeMap, long runCount) { + Map> totalExclusiveSizeMap, long runCount) throws InterruptedException { OmSnapshotManager omSnapshotManager = getOzoneManager().getOmSnapshotManager(); IOzoneManagerLock lock = getOzoneManager().getMetadataManager().getLock(); String snapshotTableKey = currentSnapshotInfo == null ? null : currentSnapshotInfo.getTableKey(); diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/KeyDeletingService.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/KeyDeletingService.java index 781184b86291..70313ec81e80 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/KeyDeletingService.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/KeyDeletingService.java @@ -118,7 +118,7 @@ public AtomicLong getDeletedKeyCount() { Pair processKeyDeletes(List keyBlocksList, Map keysToModify, List renameEntries, - String snapTableKey, UUID expectedPreviousSnapshotId) throws IOException { + String snapTableKey, UUID expectedPreviousSnapshotId) throws IOException, InterruptedException { long startTime = Time.monotonicNow(); Pair purgeResult = Pair.of(0, false); if (LOG.isDebugEnabled()) { @@ -157,7 +157,7 @@ Pair processKeyDeletes(List keyBlocksList, */ private Pair submitPurgeKeysRequest(List results, Map keysToModify, List renameEntriesToBeDeleted, - String snapTableKey, UUID expectedPreviousSnapshotId) { + String snapTableKey, UUID expectedPreviousSnapshotId) throws InterruptedException { List purgeKeys = new ArrayList<>(); // Put all keys to be purged in a list @@ -243,7 +243,7 @@ private Pair submitPurgeKeysRequest(List Date: Wed, 16 Jul 2025 04:58:06 -0400 Subject: [PATCH 2/5] HDDS-13449. Incorrect Interrupt Handling for DirectoryDeletingService and KeyDeletingService Change-Id: I5cf3604db059bff39716de98b63dff61d8ffdadd --- .../hadoop/ozone/om/service/DirectoryDeletingService.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/DirectoryDeletingService.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/DirectoryDeletingService.java index f679787937d7..9401ac0037f6 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/DirectoryDeletingService.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/DirectoryDeletingService.java @@ -687,9 +687,13 @@ public BackgroundTaskResult call() { : omSnapshot.get().getKeyManager(); processDeletedDirsForStore(snapInfo, keyManager, ratisByteLimit, run); } - } catch (IOException | ExecutionException | InterruptedException e) { + } catch (IOException | ExecutionException e) { LOG.error("Error while running delete files background task for store {}. Will retry at next run.", snapInfo, e); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + LOG.error("Interruption running delete directory background task for store {}.", + snapInfo, e); } } // By design, no one cares about the results of this call back. From c755de7650a5746d1b33512d467f39e1b9e9db40 Mon Sep 17 00:00:00 2001 From: Swaminathan Balachandran Date: Thu, 17 Jul 2025 05:44:01 -0400 Subject: [PATCH 3/5] HDDS-13449. Fix build Change-Id: I061bc7b2fc4ead98027fd0c1be84b9a87cd269b2 --- .../apache/hadoop/ozone/om/service/TestKeyDeletingService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/service/TestKeyDeletingService.java b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/service/TestKeyDeletingService.java index ba6f644c4967..134e9b5f7310 100644 --- a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/service/TestKeyDeletingService.java +++ b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/service/TestKeyDeletingService.java @@ -788,7 +788,7 @@ void cleanup() { @Test @DisplayName("Should not update keys when purge request times out during key deletion") - public void testFailingModifiedKeyPurge() throws IOException { + public void testFailingModifiedKeyPurge() throws IOException, InterruptedException { try (MockedStatic mocked = mockStatic(OzoneManagerRatisUtils.class, CALLS_REAL_METHODS)) { From b7e2682058b7f5f3fd78380c1cc342e236a1f494 Mon Sep 17 00:00:00 2001 From: Swaminathan Balachandran Date: Thu, 17 Jul 2025 10:03:24 -0400 Subject: [PATCH 4/5] HDDS-13449. Fix test Change-Id: I484558936456504adb29f8a914c47ea86e725b27 --- .../hadoop/hdds/utils/BackgroundService.java | 4 ++ .../om/service/DirectoryDeletingService.java | 37 +++++++++++++------ 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/utils/BackgroundService.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/utils/BackgroundService.java index 7ddd20bde35c..e7e713eb6732 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/utils/BackgroundService.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/utils/BackgroundService.java @@ -125,6 +125,10 @@ protected synchronized void setInterval(long newInterval, TimeUnit newUnit) { this.unit = newUnit; } + protected long getIntervalMillis() { + return this.unit.toMillis(interval); + } + public abstract BackgroundTaskQueue getTasks(); /** diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/DirectoryDeletingService.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/DirectoryDeletingService.java index 2becac867ad3..4ee29a5aee2d 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/DirectoryDeletingService.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/service/DirectoryDeletingService.java @@ -38,6 +38,7 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; +import java.util.concurrent.ForkJoinPool; import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; @@ -152,7 +153,7 @@ public class DirectoryDeletingService extends AbstractKeyDeletingService { private int ratisByteLimit; private final SnapshotChainManager snapshotChainManager; private final boolean deepCleanSnapshots; - private final ExecutorService deletionThreadPool; + private ExecutorService deletionThreadPool; private final int numberOfParallelThreadsPerStore; private final AtomicLong deletedDirsCount; private final AtomicLong movedDirsCount; @@ -168,9 +169,8 @@ public DirectoryDeletingService(long interval, TimeUnit unit, OMConfigKeys.OZONE_OM_RATIS_LOG_APPENDER_QUEUE_BYTE_LIMIT_DEFAULT, StorageUnit.BYTES); this.numberOfParallelThreadsPerStore = dirDeletingServiceCorePoolSize; - this.deletionThreadPool = new ThreadPoolExecutor(0, numberOfParallelThreadsPerStore, interval, unit, - new LinkedBlockingDeque<>(Integer.MAX_VALUE)); - + this.deletionThreadPool = new ThreadPoolExecutor(0, numberOfParallelThreadsPerStore, + interval, unit, new LinkedBlockingDeque<>(Integer.MAX_VALUE)); // always go to 90% of max limit for request as other header will be added this.ratisByteLimit = (int) (limit * 0.9); registerReconfigCallbacks(ozoneManager.getReconfigurationHandler()); @@ -226,18 +226,33 @@ public BackgroundTaskQueue getTasks() { @Override public void shutdown() { - deletionThreadPool.shutdown(); - try { - if (!deletionThreadPool.awaitTermination(60, TimeUnit.SECONDS)) { + if (deletionThreadPool != null) { + deletionThreadPool.shutdown(); + try { + if (!deletionThreadPool.awaitTermination(60, TimeUnit.SECONDS)) { + deletionThreadPool.shutdownNow(); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); deletionThreadPool.shutdownNow(); } - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - deletionThreadPool.shutdownNow(); } super.shutdown(); } + @Override + public synchronized void start() { + if (deletionThreadPool == null || deletionThreadPool.isShutdown() || deletionThreadPool.isTerminated()) { + this.deletionThreadPool = new ThreadPoolExecutor(0, numberOfParallelThreadsPerStore, + super.getIntervalMillis(), TimeUnit.MILLISECONDS, new LinkedBlockingDeque<>(Integer.MAX_VALUE)); + } + super.start(); + } + + private boolean isThreadPoolActive(ExecutorService threadPoolExecutor) { + return threadPoolExecutor != null && !threadPoolExecutor.isShutdown() && !threadPoolExecutor.isTerminated(); + } + @SuppressWarnings("checkstyle:ParameterNumber") void optimizeDirDeletesAndSubmitRequest( long dirNum, long subDirNum, long subFileNum, @@ -539,7 +554,7 @@ private void processDeletedDirsForStore(SnapshotInfo currentSnapshotInfo, KeyMan } catch (Throwable e) { return false; } - }, deletionThreadPool); + }, isThreadPoolActive(deletionThreadPool) ? deletionThreadPool : ForkJoinPool.commonPool()); processedAllDeletedDirs = future.thenCombine(future, (a, b) -> a && b); } // If AOS or all directories have been processed for snapshot, update snapshot size delta and deep clean flag From aeaad3e633597ddd9c0229294129751136cab198 Mon Sep 17 00:00:00 2001 From: Swaminathan Balachandran Date: Thu, 17 Jul 2025 12:52:06 -0400 Subject: [PATCH 5/5] HDDS-13449. Fix test Change-Id: If4c7149676882019e2e68ad02af028f33047a16a --- .../java/org/apache/hadoop/hdds/utils/BackgroundService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/utils/BackgroundService.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/utils/BackgroundService.java index e7e713eb6732..9051ad1c7375 100644 --- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/utils/BackgroundService.java +++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/utils/BackgroundService.java @@ -125,7 +125,7 @@ protected synchronized void setInterval(long newInterval, TimeUnit newUnit) { this.unit = newUnit; } - protected long getIntervalMillis() { + protected synchronized long getIntervalMillis() { return this.unit.toMillis(interval); }