Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
155 commits
Select commit Hold shift + click to select a range
6e241f6
HDDS-13765. SnapshotLocalData yaml should also track snapshotId
swamirishi Oct 8, 2025
a869500
HDDS-13627. In memory Manager for Snapshot Local Data
swamirishi Oct 8, 2025
252d338
HDDS-13627. In memory Manager for Snapshot Local Data
swamirishi Oct 9, 2025
4099bc6
HDDS-13767. Refactor SnapshotLocalDataYaml related code into OmSnapsh…
swamirishi Oct 9, 2025
e02670c
HDDS-13767. Fix pmd
swamirishi Oct 9, 2025
5a66cfc
Merge remote-tracking branch 'origin/HDDS-13767' into HEAD
swamirishi Oct 9, 2025
79580e9
HDDS-13627. Fix checkstyle
swamirishi Oct 9, 2025
2a331ef
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 9, 2025
c4f69e2
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 10, 2025
afbc592
HDDS-13627. Add tests
swamirishi Oct 10, 2025
70ac2c7
HDDS-13783. Implement locks for OmSnapshotLocalDataManager
swamirishi Oct 12, 2025
b554cc7
HDDS-13783. Implement locks for OmSnapshotLocalDataManager
swamirishi Oct 12, 2025
49eccfa
HDDS-13783. Refactor inline variable
swamirishi Oct 12, 2025
51eda04
HDDS-13627. Refactor map data structure
swamirishi Oct 13, 2025
25f766c
Merge remote-tracking branch 'origin/HDDS-13627' into HEAD
swamirishi Oct 13, 2025
96689fa
HDDS-13783. Add more condition to upsert
swamirishi Oct 13, 2025
0674299
HDDS-13783. Add java doc comment
swamirishi Oct 13, 2025
5d9fc49
HDDS-13783. Add java doc comment
swamirishi Oct 13, 2025
2d88176
HDDS-13783. Implement full lock
swamirishi Oct 13, 2025
a3c4c69
HDDS-13783. Refactor and move modify method into WritableOmSnapshotLo…
swamirishi Oct 13, 2025
686d0c7
HDDS-13783. Make full lock non static
swamirishi Oct 13, 2025
491a54b
HDDS-13783. Fix remove
swamirishi Oct 13, 2025
5e69ee9
HDDS-13627. Fix findbugs
swamirishi Oct 13, 2025
d36622a
HDDS-13785. Remove orphan versions from SnapshotLocalData Yaml file
swamirishi Oct 14, 2025
ee213d1
HDDS-13785. Fix findbugs
swamirishi Oct 14, 2025
81871b2
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 14, 2025
a95604e
HDDS-13627. Fix tests
swamirishi Oct 14, 2025
5a90fcf
HDDS-13627. remove checksum interface
swamirishi Oct 14, 2025
20d7d6a
HDDS-13627. Fix test failures
swamirishi Oct 14, 2025
ae655cb
HDDS-13785. Set defrag flag on previous snapshotId update
swamirishi Oct 14, 2025
25fa6ae
Merge remote-tracking branch 'origin/HDDS-13627' into HEAD
swamirishi Oct 14, 2025
d419283
HDDS-13783. Fix findbugs
swamirishi Oct 14, 2025
8a44308
HDDS-13783. Fix pmd
swamirishi Oct 14, 2025
cb94c36
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 14, 2025
e26052c
Merge remote-tracking branch 'origin/HDDS-13627' into HEAD
swamirishi Oct 14, 2025
4d272d1
HDDS-13783. Fix lock release
swamirishi Oct 14, 2025
2a38f59
HDDS-13627. address review comments
swamirishi Oct 14, 2025
ca098cf
HDDS-13783. Make graph updates synchronized
swamirishi Oct 15, 2025
67d4b3d
HDDS-13627. Make add version with dependents package private
swamirishi Oct 15, 2025
9838cda
Merge remote-tracking branch 'origin/HDDS-13627' into HEAD
swamirishi Oct 15, 2025
6a19dbb
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 15, 2025
665f411
HDDS-13783. Fix checkstyle
swamirishi Oct 15, 2025
2894e40
HDDS-13783. Fix merge conflict
swamirishi Oct 15, 2025
ea0ab16
HDDS-13783. Add write version api
swamirishi Oct 15, 2025
915562b
HDDS-13797. Refactor OzoneManagerLock Resource class to handle handle…
swamirishi Oct 15, 2025
1c0d0ac
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 15, 2025
24da3eb
HDDS-13797. Update interface
swamirishi Oct 15, 2025
8f3774a
HDDS-13798. Implement PoolBasedHierarchicalResourceLockManager for Hi…
swamirishi Oct 15, 2025
503cd4e
Merge remote-tracking branch 'origin/HDDS-13798' into HEAD
swamirishi Oct 15, 2025
6865fad
HDDS-13797. Revert move of Leveled Resource and Resource enum/interface
swamirishi Oct 15, 2025
903ecd1
Merge remote-tracking branch 'origin/HDDS-13797' into HEAD
swamirishi Oct 15, 2025
06d1e99
Merge remote-tracking branch 'origin/HDDS-13798' into HEAD
swamirishi Oct 15, 2025
60a7728
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 15, 2025
4711517
HDDS-13798. Fix pmd findbugs
swamirishi Oct 15, 2025
af8754c
Merge remote-tracking branch 'origin/HDDS-13798' into HEAD
swamirishi Oct 15, 2025
655a724
HDDS-13798. Fix pmd findbugs
swamirishi Oct 15, 2025
6386c1b
Merge remote-tracking branch 'origin/HDDS-13798' into HEAD
swamirishi Oct 15, 2025
2bc6134
HDDS-13798. Fix ozone-default.xml
swamirishi Oct 16, 2025
0de7c62
Merge remote-tracking branch 'origin/HDDS-13798' into HEAD
swamirishi Oct 16, 2025
8e8c534
HDDS-13798. Stop lock data manager on metadata stop
swamirishi Oct 16, 2025
f148f24
HDDS-13798. Update tests
swamirishi Oct 16, 2025
da030c0
HDDS-13798. Rename class
swamirishi Oct 16, 2025
6af6498
Merge remote-tracking branch 'origin/HDDS-13798' into HEAD
swamirishi Oct 16, 2025
b281569
HDDS-13783. Add tests
swamirishi Oct 16, 2025
1ad24b4
HDDS-13783. Fix checkstyle
swamirishi Oct 16, 2025
d629911
HDDS-13783. Fix findbugs
swamirishi Oct 16, 2025
2aecde4
Merge remote-tracking branch 'origin/HDDS-13783' into HEAD
swamirishi Oct 16, 2025
8eeb44b
HDDS-13783. Fix pmd
swamirishi Oct 16, 2025
d9301b3
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 16, 2025
06e7d37
HDDS-13785. Fix merge issue
swamirishi Oct 16, 2025
efd6c51
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 16, 2025
fab85ea
HDDS-13785. Fix checkstyle
swamirishi Oct 16, 2025
c73a355
HDDS-13783. Fix test
swamirishi Oct 16, 2025
76b99e2
Merge remote-tracking branch 'origin/HDDS-13783' into HEAD
swamirishi Oct 16, 2025
1d39bee
HDDS-13785. Fix test
swamirishi Oct 16, 2025
54f1508
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 16, 2025
b1a3834
HDDS-13785. Fix test
swamirishi Oct 16, 2025
1986bbe
HDDS-13785. Fix conditions
swamirishi Oct 16, 2025
52be3dd
HDDS-13783. Allow version resolution to null
swamirishi Oct 16, 2025
5f50a04
Merge remote-tracking branch 'origin/HDDS-13783' into HEAD
swamirishi Oct 16, 2025
908c47d
HDDS-13785. Fix tests
swamirishi Oct 17, 2025
278605a
HDDS-13783. Add dirty bit
swamirishi Oct 17, 2025
40265e1
Merge remote-tracking branch 'origin/HDDS-13783' into HEAD
swamirishi Oct 17, 2025
ac4719b
Merge
swamirishi Oct 17, 2025
cf19dce
HDDS-13783. Address review comments
swamirishi Oct 17, 2025
34097de
HDDS-13783. Address review comments
swamirishi Oct 17, 2025
c46ddc2
HDDS-13783. Address review comments
swamirishi Oct 17, 2025
99afc02
HDDS-13783. Address review comments
swamirishi Oct 17, 2025
fcc630e
Merge remote-tracking branch 'origin/HDDS-13783' into HEAD
swamirishi Oct 17, 2025
6f144e2
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 17, 2025
4600c96
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 18, 2025
48ec0bb
HDDS-13810. Fix Build Issue because of unused dependency
swamirishi Oct 18, 2025
f524cad
Merge remote-tracking branch 'origin/HDDS-13810' into HEAD
swamirishi Oct 18, 2025
cb31b7c
Revert "HDDS-13810. Fix Build Issue because of unused dependency"
swamirishi Oct 19, 2025
02dd061
Merge remote-tracking branch 'origin/HDDS-13783' into HEAD
swamirishi Oct 19, 2025
ff90af8
HDDS-13785. Add unit tests
swamirishi Oct 20, 2025
8b014dd
HDDS-13783. Add case for commit key in middle of chain
swamirishi Oct 20, 2025
57662c6
HDDS-13783. Convert set to list of predecessors
swamirishi Oct 20, 2025
bcc0fc8
Merge remote-tracking branch 'origin/HDDS-13783' into HEAD
swamirishi Oct 20, 2025
79a46f4
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 23, 2025
cd24a81
Merge remote-tracking branch 'origin/HDDS-13783' into HEAD
swamirishi Oct 23, 2025
5f0bb91
HDDS-13833. Add transactionInfo field in SnapshotLocalData and update…
swamirishi Oct 24, 2025
3de4346
Merge remote-tracking branch 'origin/HDDS-13833' into HEAD
swamirishi Oct 24, 2025
5b55a59
HDDS-13785. Merge with HDDS-13833
swamirishi Oct 24, 2025
aa6facf
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 24, 2025
cc35056
HDDS-13783. Make local data graph synchrnous
swamirishi Oct 25, 2025
9c1689c
Merge remote-tracking branch 'origin/HDDS-13783' into HEAD
swamirishi Oct 25, 2025
3f59895
HDDS-13785. Use internal lock on orphan block cleanup
swamirishi Oct 25, 2025
95341dd
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 27, 2025
616bef3
HDDS-13783. Fix NPE with concurrentHashMap
swamirishi Oct 27, 2025
b0023d1
HDDS-13830. Snapshot Rocks DB directory path computation based on loc…
swamirishi Oct 28, 2025
36b6fb3
HDDS-13830. Add test
swamirishi Oct 28, 2025
4596386
HDDS-13783. Add comments for localDataGraph
swamirishi Oct 29, 2025
7af6521
Merge remote-tracking branch 'origin/HDDS-13783' into HEAD
swamirishi Oct 29, 2025
e19dae2
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 29, 2025
8c1373a
Merge remote-tracking branch 'origin/HDDS-13833' into HEAD
swamirishi Oct 29, 2025
25ee4e2
Merge remote-tracking branch 'origin/HDDS-13783' into HEAD
swamirishi Oct 29, 2025
c6e3914
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 29, 2025
fd4bfdb
HDDS-13785. Add test for handling needs defrag
swamirishi Oct 30, 2025
49f4424
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 30, 2025
613d106
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 30, 2025
8a29736
HDDS-13833. Fix checkstyle
swamirishi Oct 30, 2025
cca2dbf
Merge remote-tracking branch 'origin/HDDS-13833' into HEAD
swamirishi Oct 30, 2025
a810cc1
HDDS-13785. Fix findbugs
swamirishi Oct 30, 2025
78c1036
HDDS-13859. OmSnapshotLocalDataManager should handle needsDefrag flag…
swamirishi Oct 31, 2025
a2bbea5
Merge remote-tracking branch 'origin/HDDS-13859' into HEAD
swamirishi Oct 31, 2025
bf4746f
HDDS-13859. Fix Test
swamirishi Oct 31, 2025
09d955c
HDDS-13859. Add comments
swamirishi Oct 31, 2025
cde567d
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 31, 2025
49c662a
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 31, 2025
2cf1bce
HDDS-13859. Fix test after merge master
swamirishi Oct 31, 2025
7afc8f5
Merge remote-tracking branch 'origin/HDDS-13859' into HEAD
swamirishi Oct 31, 2025
5849dac
HDDS-13785. Fix tests after merge
swamirishi Oct 31, 2025
e58ff09
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 31, 2025
83b887e
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Oct 31, 2025
519495a
HDDS-13785. Address review comments
swamirishi Oct 31, 2025
c125250
HDDS-13785. Address review comments
swamirishi Oct 31, 2025
408e213
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Nov 1, 2025
715b2f0
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Nov 1, 2025
ec59b89
HDDS-13785. Address review comments
swamirishi Nov 1, 2025
b0b6d6a
HDDS-13785. Address review comments
swamirishi Nov 1, 2025
a759807
HDDS-13785. Change catch exception
swamirishi Nov 1, 2025
808b174
HDDS-13785. Address review comments
swamirishi Nov 1, 2025
c829a8b
HDDS-13830. Fix test
swamirishi Nov 1, 2025
8e91e47
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Nov 1, 2025
bfd341c
HDDS-13830. Fix test
swamirishi Nov 1, 2025
4ccd3fc
HDDS-13830. Fix test
swamirishi Nov 1, 2025
41b7cfb
HDDS-13830. Fix pmd
swamirishi Nov 1, 2025
3a8c8f6
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Nov 2, 2025
d0422ae
HDDS-13830. Fix mrge issue
swamirishi Nov 2, 2025
4ff8cea
Merge remote-tracking branch 'origin/HDDS-13830' into HEAD
swamirishi Nov 2, 2025
55c68bd
Merge remote-tracking branch 'apache/master' into HEAD
swamirishi Nov 3, 2025
018571c
HDDS-13785. Address review comments
swamirishi Nov 5, 2025
6cd54dd
HDDS-13785. Address review comments
swamirishi Nov 5, 2025
261a669
HDDS-13785. Remove unnecessary read lock
swamirishi Nov 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions hadoop-hdds/common/src/main/resources/ozone-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4853,4 +4853,9 @@
<value>10000</value>
<description>Maximum number of lock objects that could be present in the pool.</description>
</property>
<property>
<name>ozone.om.snapshot.local.data.manager.service.interval</name>
<value>5m</value>
<description>Interval for cleaning up orphan snapshot local data versions corresponding to snapshots</description>
</property>
</configuration>
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,9 @@ public final class OMConfigKeys {
public static final String OZONE_OM_HIERARCHICAL_RESOURCE_LOCKS_HARD_LIMIT =
"ozone.om.hierarchical.resource.locks.hard.limit";
public static final int OZONE_OM_HIERARCHICAL_RESOURCE_LOCKS_HARD_LIMIT_DEFAULT = 10000;
public static final String OZONE_OM_SNAPSHOT_LOCAL_DATA_MANAGER_SERVICE_INTERVAL =
"ozone.om.snapshot.local.data.manager.service.interval";
public static final String OZONE_OM_SNAPSHOT_LOCAL_DATA_MANAGER_SERVICE_INTERVAL_DEFAULT = "5m";

/**
* Never constructed.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,10 @@ public final class OmSnapshotManager implements AutoCloseable {
private final AtomicInteger inFlightSnapshotCount = new AtomicInteger(0);

public OmSnapshotManager(OzoneManager ozoneManager) throws IOException {
this.snapshotLocalDataManager = new OmSnapshotLocalDataManager(ozoneManager.getMetadataManager());
boolean isFilesystemSnapshotEnabled =
ozoneManager.isFilesystemSnapshotEnabled();
OmMetadataManagerImpl omMetadataManager = (OmMetadataManagerImpl) ozoneManager.getMetadataManager();
this.snapshotLocalDataManager = new OmSnapshotLocalDataManager(ozoneManager.getMetadataManager(),
omMetadataManager.getSnapshotChainManager(), ozoneManager.getConfiguration());
boolean isFilesystemSnapshotEnabled = ozoneManager.isFilesystemSnapshotEnabled();
LOG.info("Ozone filesystem snapshot feature is {}.",
isFilesystemSnapshotEnabled ? "enabled" : "disabled");

Expand Down Expand Up @@ -344,6 +345,16 @@ public OmSnapshotManager(OzoneManager ozoneManager) throws IOException {
}
}

public static boolean isSnapshotPurged(SnapshotChainManager chainManager, OMMetadataManager omMetadataManager,
UUID snapshotId, TransactionInfo transactionInfo) throws IOException {
String tableKey = chainManager.getTableKey(snapshotId);
if (tableKey == null) {
return true;
}
return !omMetadataManager.getSnapshotInfoTable().isExist(tableKey) && transactionInfo != null &&
isTransactionFlushedToDisk(omMetadataManager, transactionInfo);
}

/**
* Help reject OM startup if snapshot feature is disabled
* but there are snapshots remaining in this OM. Note: snapshots that are
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

package org.apache.hadoop.ozone.om.snapshot;

import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_LOCAL_DATA_MANAGER_SERVICE_INTERVAL;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_LOCAL_DATA_MANAGER_SERVICE_INTERVAL_DEFAULT;
import static org.apache.hadoop.ozone.om.OmSnapshotLocalDataYaml.YAML_FILE_EXTENSION;

import com.google.common.annotations.VisibleForTesting;
Expand All @@ -41,17 +43,21 @@
import java.util.Stack;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.utils.Scheduler;
import org.apache.hadoop.hdds.utils.TransactionInfo;
import org.apache.hadoop.hdds.utils.db.RDBStore;
import org.apache.hadoop.ozone.om.OMMetadataManager;
import org.apache.hadoop.ozone.om.OmSnapshotLocalData;
import org.apache.hadoop.ozone.om.OmSnapshotLocalData.VersionMeta;
import org.apache.hadoop.ozone.om.OmSnapshotLocalDataYaml;
import org.apache.hadoop.ozone.om.OmSnapshotManager;
import org.apache.hadoop.ozone.om.SnapshotChainManager;
import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
import org.apache.hadoop.ozone.om.lock.FlatResource;
import org.apache.hadoop.ozone.om.lock.HierarchicalResourceLockManager;
Expand All @@ -73,6 +79,7 @@
public class OmSnapshotLocalDataManager implements AutoCloseable {

private static final Logger LOG = LoggerFactory.getLogger(OmSnapshotLocalDataManager.class);
private static final String LOCAL_DATA_MANAGER_SERVICE_NAME = "OmSnapshotLocalDataManagerService";

private final ObjectSerializer<OmSnapshotLocalData> snapshotLocalDataSerializer;
// In-memory DAG of snapshot-version dependencies. Each node represents a
Expand Down Expand Up @@ -101,8 +108,13 @@ public class OmSnapshotLocalDataManager implements AutoCloseable {
private final ReadWriteLock internalLock;
// Locks should be always acquired by iterating through the snapshot chain to avoid deadlocks.
private HierarchicalResourceLockManager locks;
private Map<UUID, Integer> snapshotToBeCheckedForOrphans;
private Scheduler scheduler;
private volatile boolean closed;

public OmSnapshotLocalDataManager(OMMetadataManager omMetadataManager) throws IOException {
public OmSnapshotLocalDataManager(OMMetadataManager omMetadataManager,
SnapshotChainManager snapshotChainManager,
OzoneConfiguration configuration) throws IOException {
this.localDataGraph = GraphBuilder.directed().build();
this.omMetadataManager = omMetadataManager;
this.snapshotLocalDataSerializer = new YamlSerializer<OmSnapshotLocalData>(
Expand All @@ -116,7 +128,7 @@ public void computeAndSetChecksum(Yaml yaml, OmSnapshotLocalData data) throws IO
this.versionNodeMap = new ConcurrentHashMap<>();
this.fullLock = new ReentrantReadWriteLock();
this.internalLock = new ReentrantReadWriteLock();
init();
init(configuration, snapshotChainManager);
}

@VisibleForTesting
Expand Down Expand Up @@ -216,7 +228,7 @@ private LocalDataVersionNode getVersionNode(UUID snapshotId, int version) {

private void addSnapshotVersionMeta(UUID snapshotId, SnapshotVersionsMeta snapshotVersionsMeta)
throws IOException {
if (!versionNodeMap.containsKey(snapshotId)) {
if (!versionNodeMap.containsKey(snapshotId) && !snapshotVersionsMeta.getSnapshotVersions().isEmpty()) {
for (LocalDataVersionNode versionNode : snapshotVersionsMeta.getSnapshotVersions().values()) {
validateVersionAddition(versionNode);
LocalDataVersionNode previousVersionNode =
Expand Down Expand Up @@ -261,8 +273,33 @@ void addVersionNodeWithDependents(OmSnapshotLocalData snapshotLocalData) throws
}
}

private void init() throws IOException {
private void incrementOrphanCheckCount(UUID snapshotId) {
if (snapshotId != null) {
this.snapshotToBeCheckedForOrphans.compute(snapshotId, (k, v) -> v == null ? 1 : (v + 1));
}
}

private void decrementOrphanCheckCount(UUID snapshotId, int decrementBy) {
this.snapshotToBeCheckedForOrphans.compute(snapshotId, (k, v) -> {
if (v == null) {
return null;
}
int newValue = v - decrementBy;
if (newValue <= 0) {
return null;
}
return newValue;
});
}

@VisibleForTesting
Map<UUID, Integer> getSnapshotToBeCheckedForOrphans() {
return snapshotToBeCheckedForOrphans;
}

private void init(OzoneConfiguration configuration, SnapshotChainManager chainManager) throws IOException {
this.locks = omMetadataManager.getHierarchicalLockManager();
this.snapshotToBeCheckedForOrphans = new ConcurrentHashMap<>();
RDBStore store = (RDBStore) omMetadataManager.getStore();
String checkpointPrefix = store.getDbLocation().getName();
File snapshotDir = new File(store.getSnapshotsParentDir());
Expand All @@ -283,6 +320,74 @@ private void init() throws IOException {
}
addVersionNodeWithDependents(snapshotLocalData);
}
for (UUID snapshotId : versionNodeMap.keySet()) {
incrementOrphanCheckCount(snapshotId);
}
long snapshotLocalDataManagerServiceInterval = configuration.getTimeDuration(
OZONE_OM_SNAPSHOT_LOCAL_DATA_MANAGER_SERVICE_INTERVAL,
OZONE_OM_SNAPSHOT_LOCAL_DATA_MANAGER_SERVICE_INTERVAL_DEFAULT,
TimeUnit.MILLISECONDS);
if (snapshotLocalDataManagerServiceInterval > 0) {
this.scheduler = new Scheduler(LOCAL_DATA_MANAGER_SERVICE_NAME, true, 1);
this.scheduler.scheduleWithFixedDelay(
() -> {
try {
checkOrphanSnapshotVersions(omMetadataManager, chainManager);
} catch (Exception e) {
LOG.error("Exception while checking orphan snapshot versions", e);
}
}, snapshotLocalDataManagerServiceInterval, snapshotLocalDataManagerServiceInterval, TimeUnit.MILLISECONDS);
}

}

private void checkOrphanSnapshotVersions(OMMetadataManager metadataManager, SnapshotChainManager chainManager)
throws IOException {
for (Map.Entry<UUID, Integer> entry : snapshotToBeCheckedForOrphans.entrySet()) {
UUID snapshotId = entry.getKey();
int countBeforeCheck = entry.getValue();
checkOrphanSnapshotVersions(metadataManager, chainManager, snapshotId);
decrementOrphanCheckCount(snapshotId, countBeforeCheck);
Comment on lines +348 to +350
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the count always decremented to 0?

Copy link
Contributor Author

@swamirishi swamirishi Nov 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, If there was another update just after the snapshot versions were updated or while the entries were updated then we don't want to decrement it completely to 0. For instance when a snapshot has been purged but it has not flushed because of double buffer thread we know we have to delete it but only after flush so the commit can just increment the value again meaning the orphan check needs to be performed again in some time.

}
}

@VisibleForTesting
void checkOrphanSnapshotVersions(OMMetadataManager metadataManager, SnapshotChainManager chainManager,
UUID snapshotId) throws IOException {
LOG.info("Checking orphan snapshot versions for snapshot {}", snapshotId);
try (WritableOmSnapshotLocalDataProvider snapshotLocalDataProvider = new WritableOmSnapshotLocalDataProvider(
snapshotId)) {
OmSnapshotLocalData snapshotLocalData = snapshotLocalDataProvider.getSnapshotLocalData();
boolean isSnapshotPurged = OmSnapshotManager.isSnapshotPurged(chainManager, metadataManager, snapshotId,
snapshotLocalData.getTransactionInfo());
for (Map.Entry<Integer, LocalDataVersionNode> integerLocalDataVersionNodeEntry : getVersionNodeMap()
.get(snapshotId).getSnapshotVersions().entrySet()) {
Comment on lines +363 to +364
Copy link

Copilot AI Nov 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential NullPointerException if getVersionNodeMap().get(snapshotId) returns null. This can occur if the snapshot is removed from the map between the time it's added to snapshotToBeCheckedForOrphans and when this method executes. Add a null check before iterating.

Suggested change
for (Map.Entry<Integer, LocalDataVersionNode> integerLocalDataVersionNodeEntry : getVersionNodeMap()
.get(snapshotId).getSnapshotVersions().entrySet()) {
LocalDataVersionNodeMap versionNodeMap = getVersionNodeMap().get(snapshotId);
if (versionNodeMap == null) {
// The snapshotId is no longer present; skip orphan check for this snapshot.
return;
}
for (Map.Entry<Integer, LocalDataVersionNode> integerLocalDataVersionNodeEntry : versionNodeMap.getSnapshotVersions().entrySet()) {

Copilot uses AI. Check for mistakes.
Copy link
Contributor Author

@swamirishi swamirishi Nov 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This cannot happen since this happens inside a write lock. The inmemory structure would always be same as the file

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are you sure about this? Looks like write lock is held later inside commit() --> upsert(). Right at here there's no lock.

LocalDataVersionNode versionEntry = integerLocalDataVersionNodeEntry.getValue();
// remove the version entry if it is not referenced by any other snapshot version node. For version node 0
// a newly created snapshot version could point to a version with indegree 0 in such a scenario a version 0
// node can be only deleted if the snapshot is also purged.
internalLock.readLock().lock();
try {
boolean toRemove = localDataGraph.inDegree(versionEntry) == 0
&& ((versionEntry.getVersion() != 0 && versionEntry.getVersion() != snapshotLocalData.getVersion())
|| isSnapshotPurged);
if (toRemove) {
LOG.info("Removing snapshot Id : {} version: {} from local data, snapshotLocalDataVersion : {}, " +
"snapshotPurged: {}, inDegree : {}", snapshotId, versionEntry.getVersion(),
snapshotLocalData.getVersion(), isSnapshotPurged, localDataGraph.inDegree(versionEntry));
snapshotLocalDataProvider.removeVersion(versionEntry.getVersion());
}
} finally {
internalLock.readLock().unlock();
}
}
// If Snapshot is purged but not flushed completely to disk then this needs to wait for the next iteration
// which can be done by incrementing the orphan check count for the snapshotId.
if (!snapshotLocalData.getVersionSstFileInfos().isEmpty() && snapshotLocalData.getTransactionInfo() != null) {
incrementOrphanCheckCount(snapshotId);
}
snapshotLocalDataProvider.commit();
}
}

/**
Expand Down Expand Up @@ -326,13 +431,19 @@ private void validateVersionAddition(LocalDataVersionNode versionNode) throws IO
}

@Override
public void close() {
if (snapshotLocalDataSerializer != null) {
try {
snapshotLocalDataSerializer.close();
} catch (IOException e) {
LOG.error("Failed to close snapshot local data serializer", e);
public synchronized void close() {
if (!closed) {
if (snapshotLocalDataSerializer != null) {
try {
snapshotLocalDataSerializer.close();
} catch (IOException e) {
LOG.error("Failed to close snapshot local data serializer", e);
}
}
if (scheduler != null) {
scheduler.close();
}
closed = true;
}
}

Expand Down Expand Up @@ -730,30 +841,66 @@ public synchronized void commit() throws IOException {
// Need to update the disk state if and only if the dirty bit is set.
if (isDirty()) {
String filePath = getSnapshotLocalPropertyYamlPath(super.snapshotId);
String tmpFilePath = filePath + ".tmp";
File tmpFile = new File(tmpFilePath);
boolean tmpFileExists = tmpFile.exists();
if (tmpFileExists) {
tmpFileExists = !tmpFile.delete();
}
if (tmpFileExists) {
throw new IOException("Unable to delete tmp file " + tmpFilePath);
File snapshotLocalDataFile = new File(filePath);
if (!localDataVersionNodes.getSnapshotVersions().isEmpty()) {
String tmpFilePath = filePath + ".tmp";
File tmpFile = new File(tmpFilePath);
boolean tmpFileExists = tmpFile.exists();
if (tmpFileExists) {
tmpFileExists = !tmpFile.delete();
}
if (tmpFileExists) {
throw new IOException("Unable to delete tmp file " + tmpFilePath);
}
snapshotLocalDataSerializer.save(new File(tmpFilePath), super.snapshotLocalData);
Files.move(tmpFile.toPath(), Paths.get(filePath), StandardCopyOption.ATOMIC_MOVE,
StandardCopyOption.REPLACE_EXISTING);
} else if (snapshotLocalDataFile.exists()) {
LOG.info("Deleting YAML file corresponding to snapshotId: {} in path : {}",
super.snapshotId, snapshotLocalDataFile.getAbsolutePath());
if (!snapshotLocalDataFile.delete()) {
throw new IOException("Unable to delete file " + snapshotLocalDataFile.getAbsolutePath());
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

else: clean: no prior snapshot versions and no local meta file. nothing to do here.

}
snapshotLocalDataSerializer.save(new File(tmpFilePath), super.snapshotLocalData);
Files.move(tmpFile.toPath(), Paths.get(filePath), StandardCopyOption.ATOMIC_MOVE,
StandardCopyOption.REPLACE_EXISTING);
upsertNode(super.snapshotId, localDataVersionNodes);
SnapshotVersionsMeta previousVersionMeta = upsertNode(super.snapshotId, localDataVersionNodes);
checkForOphanVersionsAndIncrementCount(super.snapshotId, previousVersionMeta, localDataVersionNodes,
getSnapshotLocalData().getTransactionInfo() != null);
// Reset dirty bit
resetDirty();
}
}

private void upsertNode(UUID snapshotId, SnapshotVersionsMeta snapshotVersions) throws IOException {
private void checkForOphanVersionsAndIncrementCount(UUID snapshotId, SnapshotVersionsMeta previousVersionsMeta,
SnapshotVersionsMeta currentVersionMeta, boolean isPurgeTransactionSet) {
if (previousVersionsMeta != null) {
Map<Integer, LocalDataVersionNode> currentVersionNodeMap = currentVersionMeta.getSnapshotVersions();
Map<Integer, LocalDataVersionNode> previousVersionNodeMap = previousVersionsMeta.getSnapshotVersions();
boolean versionsRemoved = previousVersionNodeMap.keySet().stream()
.anyMatch(version -> !currentVersionNodeMap.containsKey(version));

// The previous snapshotId could have become an orphan entry or could have orphan versions.(In case of
// version removals)
if (versionsRemoved || !Objects.equals(previousVersionsMeta.getPreviousSnapshotId(),
currentVersionMeta.getPreviousSnapshotId())) {
incrementOrphanCheckCount(previousVersionsMeta.getPreviousSnapshotId());
}
// If the transactionInfo set, this means the snapshot has been purged and the entire YAML file could have
// become an orphan. Otherwise if the version is updated it
// could mean that there could be some orphan version present within the
// same snapshot.
if (isPurgeTransactionSet || previousVersionsMeta.getVersion() != currentVersionMeta.getVersion()) {
incrementOrphanCheckCount(snapshotId);
}
}
}

private SnapshotVersionsMeta upsertNode(UUID snapshotId, SnapshotVersionsMeta snapshotVersions) throws IOException {
internalLock.writeLock().lock();
try {
SnapshotVersionsMeta existingSnapVersions = getVersionNodeMap().remove(snapshotId);
Map<Integer, LocalDataVersionNode> existingVersions = existingSnapVersions == null ? Collections.emptyMap() :
existingSnapVersions.getSnapshotVersions();
Map<Integer, LocalDataVersionNode> newVersions = snapshotVersions.getSnapshotVersions();
Map<Integer, List<LocalDataVersionNode>> predecessors = new HashMap<>();
// Track all predecessors of the existing versions and remove the node from the graph.
for (Map.Entry<Integer, LocalDataVersionNode> existingVersion : existingVersions.entrySet()) {
Expand All @@ -763,14 +910,16 @@ private void upsertNode(UUID snapshotId, SnapshotVersionsMeta snapshotVersions)
predecessors.put(existingVersion.getKey(), new ArrayList<>(localDataGraph.predecessors(existingVersionNode)));
localDataGraph.removeNode(existingVersionNode);
}

// Add the nodes to be added in the graph and map.
addSnapshotVersionMeta(snapshotId, snapshotVersions);
// Reconnect all the predecessors for existing nodes.
for (Map.Entry<Integer, LocalDataVersionNode> entry : snapshotVersions.getSnapshotVersions().entrySet()) {
for (Map.Entry<Integer, LocalDataVersionNode> entry : newVersions.entrySet()) {
for (LocalDataVersionNode predecessor : predecessors.getOrDefault(entry.getKey(), Collections.emptyList())) {
localDataGraph.putEdge(predecessor, entry.getValue());
}
}
return existingSnapVersions;
} finally {
internalLock.writeLock().unlock();
}
Expand Down
Loading