Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
a4d878e
HDDS-12596. OM fs snapshot max limit is not enforced
peterxcli Mar 25, 2025
f14a095
fix compile error
peterxcli Mar 25, 2025
fe3780c
Should set default snapshot limit for TestOMSnapshotRequestResponse b…
peterxcli Mar 25, 2025
39d87be
Set default filesystem snapshot max limit in TestOMKeyRequest setup
peterxcli Mar 25, 2025
8338f29
Addressed comment
peterxcli Mar 27, 2025
0834515
Move snapshot validation to preExecute and add new TOO_MANY_SNAPSHOTS…
peterxcli Mar 28, 2025
3c2727d
Use snapshotChain size to validete snapshot limit
peterxcli Mar 30, 2025
7b52f1a
Cleanup test
peterxcli Apr 3, 2025
2607c3e
Count inflight snapshot, too
peterxcli Apr 3, 2025
78bc4c4
fix test
peterxcli Apr 3, 2025
3b367e4
Merge remote-tracking branch 'upstream/master' into hdds12596-om-fs-s…
peterxcli Apr 3, 2025
902b507
Revert "Count inflight snapshot, too"
peterxcli Apr 8, 2025
65922fb
Revert "fix test"
peterxcli Apr 8, 2025
3cb97fe
Rename `validateSnapshotLimit` to `snapshotLimitCheck` in snapshotMan…
peterxcli Apr 8, 2025
0d2d15b
Merge remote-tracking branch 'upstream/master' into hdds12596-om-fs-s…
peterxcli Apr 15, 2025
8f0bab2
Add snaphot inflight count
peterxcli Apr 15, 2025
6465e9f
Reset snapshot inflight count when om become leader
peterxcli Apr 15, 2025
385eeb1
add sync block on inFlightSnapshotCount
peterxcli Apr 25, 2025
ede525e
Merge remote-tracking branch 'upstream/master' into hdds12596-om-fs-s…
peterxcli Apr 25, 2025
55cb418
use AtomicReference for exception handling
peterxcli Apr 25, 2025
1106481
move snapshotLimitCheck to the place that wont have further error
peterxcli Apr 26, 2025
be31e15
(CI affected by HDDS-12173 flakiness)Merge remote-tracking branch 'up…
peterxcli Apr 26, 2025
a537ddd
Merge remote-tracking branch 'upstream/master' into hdds12596-om-fs-s…
peterxcli May 2, 2025
c900d9b
Fix error in TestOmSnapshotManager by changing mock from Table to Typ…
peterxcli May 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,8 @@ public enum ResultCodes {
TOO_MANY_BUCKETS,
KEY_UNDER_LEASE_RECOVERY,
KEY_ALREADY_CLOSED,
KEY_UNDER_LEASE_SOFT_LIMIT_PERIOD
KEY_UNDER_LEASE_SOFT_LIMIT_PERIOD,

TOO_MANY_SNAPSHOTS,
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,29 @@ public void testKeyAndSnapshotDeletionService() throws IOException, InterruptedE
checkSnapshotIsPurgedFromDB(omFollower, tableKey);
}

@Test
public void testSnapshotInFlightCount() throws Exception {
// snapshot inflight count should be reset to 0 when leader changes

// first do some snapshot creations
String snapshotName1 = UUID.randomUUID().toString();
store.createSnapshot(volumeName, bucketName, snapshotName1);

// then shutdown the leader
OzoneManager omLeader = cluster.getOMLeader();
cluster.shutdownOzoneManager(omLeader);

// wait for the new leader to be elected
cluster.waitForLeaderOM();

// check the inflight count on the new leader is 0
OzoneManager newLeader = cluster.getOMLeader();
assertEquals(0, newLeader.getOmSnapshotManager().getInFlightSnapshotCount());

// restart the previous shutdowned node
cluster.restartOzoneManager(omLeader, true);
}

private void createSnapshot(String volName, String buckName, String snapName) throws IOException {
store.createSnapshot(volName, buckName, snapName);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,7 @@ enum Status {
KEY_ALREADY_CLOSED = 96;
KEY_UNDER_LEASE_SOFT_LIMIT_PERIOD = 97;

TOO_MANY_SNAPSHOTS = 98;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
import static org.apache.hadoop.ozone.OzoneConsts.OM_SNAPSHOT_CHECKPOINT_DIR;
import static org.apache.hadoop.ozone.OzoneConsts.OM_SNAPSHOT_DIFF_DB_NAME;
import static org.apache.hadoop.ozone.OzoneConsts.OM_SNAPSHOT_INDICATOR;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_FS_SNAPSHOT_MAX_LIMIT;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_FS_SNAPSHOT_MAX_LIMIT_DEFAULT;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_CACHE_CLEANUP_SERVICE_RUN_INTERVAL;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_CACHE_CLEANUP_SERVICE_RUN_INTERVAL_DEFAULT;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE;
Expand Down Expand Up @@ -63,6 +65,8 @@
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.hdds.StringUtils;
Expand Down Expand Up @@ -173,6 +177,9 @@ public final class OmSnapshotManager implements AutoCloseable {
// Soft limit of the snapshot cache size.
private final int softCacheSize;

private int fsSnapshotMaxLimit;
private final AtomicInteger inFlightSnapshotCount = new AtomicInteger(0);

public OmSnapshotManager(OzoneManager ozoneManager) {

boolean isFilesystemSnapshotEnabled =
Expand Down Expand Up @@ -247,6 +254,9 @@ public OmSnapshotManager(OzoneManager ozoneManager) {
this.softCacheSize = ozoneManager.getConfiguration().getInt(
OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE,
OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE_DEFAULT);

fsSnapshotMaxLimit = ozoneManager.getConfiguration().getInt(OZONE_OM_FS_SNAPSHOT_MAX_LIMIT,
OZONE_OM_FS_SNAPSHOT_MAX_LIMIT_DEFAULT);

CacheLoader<UUID, OmSnapshot> loader = createCacheLoader();

Expand Down Expand Up @@ -859,6 +869,42 @@ private void validateSnapshotsExistAndActive(final String volumeName,
checkSnapshotActive(toSnapInfo, false);
}

public void snapshotLimitCheck() throws IOException, OMException {
OmMetadataManagerImpl omMetadataManager = (OmMetadataManagerImpl) ozoneManager.getMetadataManager();
SnapshotChainManager snapshotChainManager = omMetadataManager.getSnapshotChainManager();
int currentSnapshotNum = snapshotChainManager.getGlobalSnapshotChain().size();

AtomicReference<OMException> exceptionRef = new AtomicReference<>(null);
inFlightSnapshotCount.updateAndGet(count -> {
if (currentSnapshotNum + count >= fsSnapshotMaxLimit) {
exceptionRef.set(new OMException(
String.format("Snapshot limit of %d reached. Cannot create more snapshots. " +
"Current snapshots: %d, In-flight creations: %d",
fsSnapshotMaxLimit, currentSnapshotNum, count) +
" If you already deleted some snapshots, " +
"please wait for the background service to complete the cleanup.",
OMException.ResultCodes.TOO_MANY_SNAPSHOTS));
return count;
}
return count + 1;
});
if (exceptionRef.get() != null) {
throw exceptionRef.get();
}
}

public void decrementInFlightSnapshotCount() {
inFlightSnapshotCount.decrementAndGet();
}

public void resetInFlightSnapshotCount() {
inFlightSnapshotCount.set(0);
}

public int getInFlightSnapshotCount() {
return inFlightSnapshotCount.get();
}

private int getIndexFromToken(final String token) throws IOException {
if (isBlank(token)) {
return 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,11 @@ public SnapshotInfo getLatestSnapshot() {
return snapshotInfo;
}

@Override
public void notifyLeaderReady() {
ozoneManager.getOmSnapshotManager().resetInFlightSnapshotCount();
}

@Override
public void notifyLeaderChanged(RaftGroupMemberId groupMemberId,
RaftPeerId newLeaderId) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ public OMRequest preExecute(OzoneManager ozoneManager) throws IOException {
"Only bucket owners and Ozone admins can create snapshots",
OMException.ResultCodes.PERMISSION_DENIED);
}
// verify snapshot limit
ozoneManager.getOmSnapshotManager().snapshotLimitCheck();
CreateSnapshotRequest.Builder createSnapshotRequest = omRequest.getCreateSnapshotRequest().toBuilder()
.setSnapshotId(toProtobuf(UUID.randomUUID()))
.setVolumeName(volumeName)
Expand Down Expand Up @@ -188,7 +190,7 @@ public OMClientResponse validateAndUpdateCache(OzoneManager ozoneManager, Execut
// pre-replicated key size counter in OmBucketInfo.
snapshotInfo.setReferencedSize(estimateBucketDataSize(omBucketInfo));

addSnapshotInfoToSnapshotChainAndCache(omMetadataManager, context.getIndex());
addSnapshotInfoToSnapshotChainAndCache(ozoneManager, omMetadataManager, context.getIndex());

omResponse.setCreateSnapshotResponse(
CreateSnapshotResponse.newBuilder()
Expand Down Expand Up @@ -250,6 +252,7 @@ public OMClientResponse validateAndUpdateCache(OzoneManager ozoneManager, Execut
* it was removed at T-5.
*/
private void addSnapshotInfoToSnapshotChainAndCache(
OzoneManager ozoneManager,
OmMetadataManagerImpl omMetadataManager,
long transactionLogIndex
) throws IOException {
Expand Down Expand Up @@ -288,6 +291,8 @@ private void addSnapshotInfoToSnapshotChainAndCache(
removeSnapshotInfoFromSnapshotChainManager(snapshotChainManager,
snapshotInfo);
throw new IOException(exception.getMessage(), exception);
} finally {
ozoneManager.getOmSnapshotManager().decrementInFlightSnapshotCount();
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import static org.apache.hadoop.ozone.om.snapshot.OmSnapshotUtils.getINode;
import static org.apache.hadoop.ozone.om.snapshot.OmSnapshotUtils.truncateFileName;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
Expand All @@ -54,6 +55,7 @@
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand All @@ -66,6 +68,7 @@
import org.apache.hadoop.hdds.utils.db.DBStore;
import org.apache.hadoop.hdds.utils.db.RDBBatchOperation;
import org.apache.hadoop.hdds.utils.db.RDBStore;
import org.apache.hadoop.hdds.utils.db.Table;
import org.apache.hadoop.hdds.utils.db.TypedTable;
import org.apache.hadoop.ozone.om.exceptions.OMException;
import org.apache.hadoop.ozone.om.helpers.OmBucketInfo;
Expand All @@ -76,6 +79,7 @@
import org.apache.ozone.test.GenericTestUtils;
import org.apache.ozone.test.GenericTestUtils.LogCapturer;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
Expand All @@ -90,6 +94,9 @@
class TestOmSnapshotManager {

private OzoneManager om;
private SnapshotChainManager snapshotChainManager;
private OmMetadataManagerImpl omMetadataManager;
private OmSnapshotManager omSnapshotManager;
private static final String CANDIDATE_DIR_NAME = OM_DB_NAME +
SNAPSHOT_CANDIDATE_DIR;
private File leaderDir;
Expand All @@ -114,16 +121,37 @@ void init(@TempDir File tempDir) throws Exception {
OMConfigKeys.OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE, 1);
configuration.setBoolean(
OMConfigKeys.OZONE_OM_SNAPSHOT_ROCKSDB_METRICS_ENABLED, false);
// Allow 2 fs snapshots
configuration.setInt(
OMConfigKeys.OZONE_OM_FS_SNAPSHOT_MAX_LIMIT, 2);

OmTestManagers omTestManagers = new OmTestManagers(configuration);
om = omTestManagers.getOzoneManager();
omMetadataManager = (OmMetadataManagerImpl) om.getMetadataManager();
omSnapshotManager = om.getOmSnapshotManager();
snapshotChainManager = omMetadataManager.getSnapshotChainManager();
}

@AfterAll
void stop() {
om.stop();
}

@AfterEach
void cleanup() throws IOException {
Table<String, SnapshotInfo> snapshotInfoTable = omMetadataManager.getSnapshotInfoTable();

Iterator<UUID> iter = snapshotChainManager.iterator(true);
while (iter.hasNext()) {
UUID snapshotId = iter.next();
String snapshotInfoKey = snapshotChainManager.getTableKey(snapshotId);
SnapshotInfo snapshotInfo = snapshotInfoTable.get(snapshotInfoKey);
snapshotChainManager.deleteSnapshot(snapshotInfo);
snapshotInfoTable.delete(snapshotInfoKey);
}
omSnapshotManager.invalidateCache();
}

@Test
public void testSnapshotFeatureFlagSafetyCheck() throws IOException {
// Verify that the snapshot feature config safety check method
Expand All @@ -150,11 +178,11 @@ public void testCloseOnEviction() throws IOException,
final TypedTable<String, OmBucketInfo> bucketTable = mock(TypedTable.class);
final TypedTable<String, SnapshotInfo> snapshotInfoTable = mock(TypedTable.class);
HddsWhiteboxTestUtils.setInternalState(
om.getMetadataManager(), VOLUME_TABLE, volumeTable);
omMetadataManager, VOLUME_TABLE, volumeTable);
HddsWhiteboxTestUtils.setInternalState(
om.getMetadataManager(), BUCKET_TABLE, bucketTable);
omMetadataManager, BUCKET_TABLE, bucketTable);
HddsWhiteboxTestUtils.setInternalState(
om.getMetadataManager(), SNAPSHOT_INFO_TABLE, snapshotInfoTable);
omMetadataManager, SNAPSHOT_INFO_TABLE, snapshotInfoTable);

final String volumeName = UUID.randomUUID().toString();
final String dbVolumeKey = om.getMetadataManager().getVolumeKey(volumeName);
Expand Down Expand Up @@ -184,16 +212,15 @@ public void testCloseOnEviction() throws IOException,
when(snapshotInfoTable.get(first.getTableKey())).thenReturn(first);
when(snapshotInfoTable.get(second.getTableKey())).thenReturn(second);

((OmMetadataManagerImpl) om.getMetadataManager()).getSnapshotChainManager().addSnapshot(first);
((OmMetadataManagerImpl) om.getMetadataManager()).getSnapshotChainManager().addSnapshot(second);
snapshotChainManager.addSnapshot(first);
snapshotChainManager.addSnapshot(second);
RDBBatchOperation rdbBatchOperation = new RDBBatchOperation();
// create the first snapshot checkpoint
OmSnapshotManager.createOmSnapshotCheckpoint(om.getMetadataManager(),
first, rdbBatchOperation);
om.getMetadataManager().getStore().commitBatchOperation(rdbBatchOperation);

// retrieve it and setup store mock
OmSnapshotManager omSnapshotManager = om.getOmSnapshotManager();
OmSnapshot firstSnapshot = omSnapshotManager
.getActiveSnapshot(first.getVolumeName(), first.getBucketName(), first.getName())
.get();
Expand Down Expand Up @@ -229,6 +256,37 @@ public void testCloseOnEviction() throws IOException,
}, 100, 30_000);
}

@Test
public void testValidateSnapshotLimit() throws IOException {
TypedTable<String, SnapshotInfo> snapshotInfoTable = mock(TypedTable.class);
HddsWhiteboxTestUtils.setInternalState(
omMetadataManager, SNAPSHOT_INFO_TABLE, snapshotInfoTable);

SnapshotInfo first = createSnapshotInfo("vol1", "buck1");
SnapshotInfo second = createSnapshotInfo("vol1", "buck1");

first.setGlobalPreviousSnapshotId(null);
first.setPathPreviousSnapshotId(null);
second.setGlobalPreviousSnapshotId(first.getSnapshotId());
second.setPathPreviousSnapshotId(first.getSnapshotId());

when(snapshotInfoTable.get(first.getTableKey())).thenReturn(first);
when(snapshotInfoTable.get(second.getTableKey())).thenReturn(second);

snapshotChainManager.addSnapshot(first);
assertDoesNotThrow(() -> omSnapshotManager.snapshotLimitCheck());
omSnapshotManager.decrementInFlightSnapshotCount();

snapshotChainManager.addSnapshot(second);

OMException exception = assertThrows(OMException.class, () -> omSnapshotManager.snapshotLimitCheck());
assertEquals(OMException.ResultCodes.TOO_MANY_SNAPSHOTS, exception.getResult());

snapshotChainManager.deleteSnapshot(second);

assertDoesNotThrow(() -> omSnapshotManager.snapshotLimitCheck());
}

@BeforeEach
void setupData(@TempDir File testDir) throws IOException {
// Set up the leader with the following files:
Expand Down Expand Up @@ -332,16 +390,12 @@ public void testHardLinkCreation() throws IOException {
@Test
public void testGetSnapshotInfo() throws IOException {
SnapshotInfo s1 = createSnapshotInfo("vol", "buck");
UUID latestGlobalSnapId =
((OmMetadataManagerImpl) om.getMetadataManager()).getSnapshotChainManager()
.getLatestGlobalSnapshotId();
UUID latestGlobalSnapId = snapshotChainManager.getLatestGlobalSnapshotId();
UUID latestPathSnapId =
((OmMetadataManagerImpl) om.getMetadataManager()).getSnapshotChainManager()
.getLatestPathSnapshotId(String.join("/", "vol", "buck"));
snapshotChainManager.getLatestPathSnapshotId(String.join("/", "vol", "buck"));
s1.setPathPreviousSnapshotId(latestPathSnapId);
s1.setGlobalPreviousSnapshotId(latestGlobalSnapId);
((OmMetadataManagerImpl) om.getMetadataManager()).getSnapshotChainManager()
.addSnapshot(s1);
snapshotChainManager.addSnapshot(s1);
OMException ome = assertThrows(OMException.class,
() -> om.getOmSnapshotManager().getSnapshot(s1.getSnapshotId()));
assertEquals(OMException.ResultCodes.FILE_NOT_FOUND, ome.getResult());
Expand All @@ -350,6 +404,12 @@ public void testGetSnapshotInfo() throws IOException {
ome = assertThrows(OMException.class,
() -> om.getOmSnapshotManager().getSnapshot(s2.getSnapshotId()));
assertEquals(OMException.ResultCodes.FILE_NOT_FOUND, ome.getResult());

// add to make cleanup work
TypedTable<String, SnapshotInfo> snapshotInfoTable = mock(TypedTable.class);
HddsWhiteboxTestUtils.setInternalState(
omMetadataManager, SNAPSHOT_INFO_TABLE, snapshotInfoTable);
when(snapshotInfoTable.get(s1.getTableKey())).thenReturn(s1);
}

/*
Expand Down
Loading