Skip to content
Merged
16 changes: 6 additions & 10 deletions hadoop-hdds/common/src/main/resources/ozone-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3541,15 +3541,6 @@
However this parameter allows the namespace to support non-S3 compatible characters.
</description>
</property>
<property>
<name>ozone.om.snapshot.cache.max.size</name>
<value>10</value>
<tag>OZONE, OM</tag>
<description>
Size of the OM Snapshot LRU cache. This is the maximum number of open OM Snapshot RocksDb instances
that will be held in memory at any time.
</description>
</property>

<property>
<name>ozone.om.container.location.cache.size</name>
Expand Down Expand Up @@ -3687,7 +3678,12 @@
<value>10</value>
<tag>OZONE, OM</tag>
<description>
Maximum number of entries allowed in the snapshot cache.
Size of the OM Snapshot LRU cache.

This is a soft limit of open OM Snapshot RocksDB instances that will be
held. The actual number of cached instance could exceed this limit if
more than this number of snapshot instances are still in-use by snapDiff
or other tasks.
</description>
</property>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
import org.apache.hadoop.ozone.client.OzoneVolume;
import org.apache.hadoop.ozone.client.io.OzoneInputStream;
import org.apache.hadoop.ozone.client.io.OzoneOutputStream;
import org.apache.hadoop.ozone.om.exceptions.OMException;
import org.apache.hadoop.ozone.om.helpers.BucketLayout;
import org.apache.hadoop.ozone.om.helpers.OmKeyArgs;
import org.apache.hadoop.ozone.om.helpers.OpenKeySession;
Expand All @@ -59,6 +58,7 @@
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
Expand Down Expand Up @@ -411,15 +411,19 @@ public void testBlockSnapshotFSAccessAfterDeletion() throws Exception {
deleteSnapshot(snapshotName);

// Can't access keys in snapshot anymore with FS API. Should throw exception
final String errorMsg = "no longer active";
LambdaTestUtils.intercept(OMException.class, errorMsg,
final String errorMsg1 = "no longer active";
LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg1,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 on checking exception.

() -> o3fs.listStatus(snapshotRoot));
LambdaTestUtils.intercept(OMException.class, errorMsg,
LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg1,
() -> o3fs.listStatus(snapshotParent));

LambdaTestUtils.intercept(OMException.class, errorMsg,
// Note: Different error message due to inconsistent FNFE client-side
// handling in BasicOzoneClientAdapterImpl#getFileStatus
// TODO: Reconciliation?
final String errorMsg2 = "No such file or directory";
LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg2,
() -> o3fs.getFileStatus(snapshotKey1));
LambdaTestUtils.intercept(OMException.class, errorMsg,
LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg2,
() -> o3fs.getFileStatus(snapshotKey2));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,19 @@ public void close() throws IOException {
omMetadataManager.getStore().close();
}

@Override
protected void finalize() throws Throwable {
// Verify that the DB handle has been closed, log warning otherwise
// https://softwareengineering.stackexchange.com/a/288724
if (!omMetadataManager.getStore().isClosed()) {
LOG.warn("{} is not closed properly. snapshotName: {}",
// Print hash code for debugging
omMetadataManager.getStore().toString(),
snapshotName);
}
super.finalize();
}

@VisibleForTesting
public OMMetadataManager getMetadataManager() {
return omMetadataManager;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@
import org.apache.hadoop.ozone.om.exceptions.OMException;
import org.apache.hadoop.ozone.om.helpers.RepeatedOmKeyInfo;
import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
import org.apache.hadoop.ozone.om.helpers.SnapshotInfo.SnapshotStatus;
import org.apache.hadoop.ozone.om.service.SnapshotDeletingService;
import org.apache.hadoop.ozone.om.service.SnapshotDiffCleanupService;
import org.apache.hadoop.ozone.om.snapshot.SnapshotDiffJob;
import org.apache.hadoop.ozone.om.snapshot.SnapshotDiffManager;
Expand Down Expand Up @@ -89,8 +87,8 @@
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIFF_DB_DIR;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIFF_REPORT_MAX_PAGE_SIZE;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIFF_REPORT_MAX_PAGE_SIZE_DEFAULT;
import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.FILE_NOT_FOUND;
import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.INVALID_KEY_NAME;
import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.checkSnapshotActive;
import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.dropColumnFamilyHandle;

/**
Expand Down Expand Up @@ -156,6 +154,9 @@ public final class OmSnapshotManager implements AutoCloseable {

private final int maxPageSize;

// Soft limit of the snapshot cache size.
private final int softCacheSize;

public OmSnapshotManager(OzoneManager ozoneManager) {
this.options = new ManagedDBOptions();
this.options.setCreateIfMissing(true);
Expand Down Expand Up @@ -204,29 +205,42 @@ public OmSnapshotManager(OzoneManager ozoneManager) {
.getStore()
.getRocksDBCheckpointDiffer();

// size of lru cache
int cacheSize = ozoneManager.getConfiguration().getInt(
// snapshot cache size, soft-limit
this.softCacheSize = ozoneManager.getConfiguration().getInt(
OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE,
OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE_DEFAULT);

CacheLoader<String, OmSnapshot> loader = createCacheLoader();

RemovalListener<String, OmSnapshot> removalListener
= notification -> {
try {
// close snapshot's rocksdb on eviction
LOG.debug("Closing snapshot: {}", notification.getKey());
// TODO: [SNAPSHOT] HDDS-7935.Close only when refcount reaches zero?
notification.getValue().close();
} catch (IOException e) {
LOG.error("Failed to close snapshot: {} {}",
notification.getKey(), e);
}
};
RemovalListener<String, OmSnapshot> removalListener = notification -> {
try {
final String snapshotTableKey = notification.getKey();
final OmSnapshot omSnapshot = notification.getValue();
if (omSnapshot != null) {
// close snapshot's rocksdb on eviction
LOG.debug("Closing OmSnapshot '{}' due to {}",
snapshotTableKey, notification.getCause());
omSnapshot.close();
} else {
// Cache value would never be null in RemovalListener.

// When a soft reference is GC'ed by the JVM, this RemovalListener
// would be called. But the cache value should still exist at that
// point. Thus in-theory this condition won't be triggered by JVM GC
throw new IllegalStateException("Unexpected: OmSnapshot is null");
}
} catch (IOException e) {
LOG.error("Failed to close OmSnapshot: {}", notification.getKey(), e);
}
};

// init LRU cache
snapshotCache = CacheBuilder.newBuilder()
.maximumSize(cacheSize)
this.snapshotCache = CacheBuilder.newBuilder()
// Indicating OmSnapshot instances are softly referenced from the cache.
// If no thread is holding a strong reference to an OmSnapshot instance
// (e.g. SnapDiff), the instance could be garbage collected by JVM at
// its discretion.
.softValues()
.removalListener(removalListener)
.build(loader);

Expand Down Expand Up @@ -265,31 +279,12 @@ private CacheLoader<String, OmSnapshot> createCacheLoader() {
@Nonnull
public OmSnapshot load(@Nonnull String snapshotTableKey)
throws IOException {
SnapshotInfo snapshotInfo;
// see if the snapshot exists
snapshotInfo = getSnapshotInfo(snapshotTableKey);
SnapshotInfo snapshotInfo = getSnapshotInfo(snapshotTableKey);

// Block snapshot from loading when it is no longer active e.g. DELETED,
// unless this is called from SnapshotDeletingService.
// TODO: [SNAPSHOT] However, snapshotCache.get() from other requests
// (not from SDS) would be able to piggyback off of this because
// snapshot still in cache won't trigger loader again.
// This needs proper addressal in e.g. HDDS-7935
// by introducing another cache just for SDS.
// While the snapshotCache would host ACTIVE snapshots only.
if (!snapshotInfo.getSnapshotStatus().equals(
SnapshotStatus.SNAPSHOT_ACTIVE)) {
if (isCalledFromSnapshotDeletingService()) {
LOG.debug("Permitting {} to load snapshot {} in status: {}",
SnapshotDeletingService.class.getSimpleName(),
snapshotInfo.getTableKey(),
snapshotInfo.getSnapshotStatus());
} else {
throw new OMException("Unable to load snapshot. " +
"Snapshot with table key '" + snapshotTableKey +
"' is no longer active", FILE_NOT_FOUND);
}
}
checkSnapshotActive(snapshotInfo);

CacheValue<SnapshotInfo> cacheValue =
ozoneManager.getMetadataManager().getSnapshotInfoTable()
Expand All @@ -308,7 +303,7 @@ public OmSnapshot load(@Nonnull String snapshotTableKey)
snapshotMetadataManager = new OmMetadataManagerImpl(conf,
snapshotInfo.getCheckpointDirName(), isSnapshotInCache);
} catch (IOException e) {
LOG.error("Failed to retrieve snapshot: {}, {}", snapshotTableKey, e);
LOG.error("Failed to retrieve snapshot: {}", snapshotTableKey);
throw e;
}

Expand Down Expand Up @@ -342,24 +337,6 @@ private CodecRegistry createCodecRegistryForSnapDiff() {
return registry;
}

/**
* Helper method to check whether the loader is called from
* SnapshotDeletingTask (return true) or not (return false).
*/
private boolean isCalledFromSnapshotDeletingService() {

StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace();
for (StackTraceElement elem : stackTrace) {
// Allow as long as loader is called from SDS. e.g. SnapshotDeletingTask
if (elem.getClassName().startsWith(
SnapshotDeletingService.class.getName())) {
return true;
}
}

return false;
}

/**
* Get snapshot instance LRU cache.
* @return LoadingCache
Expand Down Expand Up @@ -449,7 +426,7 @@ private static void deleteKeysInSnapshotScopeFromDTableInternal(

try (TableIterator<String,
? extends Table.KeyValue<String, RepeatedOmKeyInfo>>
keyIter = omMetadataManager.getDeletedTable().iterator()) {
keyIter = omMetadataManager.getDeletedTable().iterator()) {

keyIter.seek(beginKey);
// Continue only when there are entries of snapshot (bucket) scope
Expand Down Expand Up @@ -516,6 +493,15 @@ public IOmMetadataReader checkForSnapshot(String volumeName,
String snapshotTableKey = SnapshotInfo.getTableKey(volumeName,
bucketName, snapshotName);

// Block FS API reads when snapshot is not active.
checkSnapshotActive(ozoneManager, snapshotTableKey);

// Warn if actual cache size exceeds the soft limit already.
if (snapshotCache.size() > softCacheSize) {
LOG.warn("Snapshot cache size ({}) exceeds configured soft-limit ({}).",
snapshotCache.size(), softCacheSize);
}

// retrieve the snapshot from the cache
try {
return snapshotCache.get(snapshotTableKey);
Expand All @@ -537,7 +523,7 @@ public static String getSnapshotPrefix(String snapshotName) {
}

public static String getSnapshotPath(OzoneConfiguration conf,
SnapshotInfo snapshotInfo) {
SnapshotInfo snapshotInfo) {
return OMStorage.getOmDbDir(conf) +
OM_KEY_PREFIX + OM_SNAPSHOT_CHECKPOINT_DIR + OM_KEY_PREFIX +
OM_DB_NAME + snapshotInfo.getCheckpointDirName();
Expand Down Expand Up @@ -574,7 +560,7 @@ public SnapshotDiffResponse getSnapshotDiffReport(final String volume,
final OmSnapshot fs = snapshotCache.get(fsKey);
final OmSnapshot ts = snapshotCache.get(tsKey);
return snapshotDiffManager.getSnapshotDiffReport(volume, bucket, fs, ts,
fsInfo, tsInfo, index, pageSize, forceFullDiff);
fsInfo, tsInfo, index, pageSize, forceFullDiff);
} catch (ExecutionException e) {
throw new IOException(e.getCause());
}
Expand All @@ -583,12 +569,10 @@ public SnapshotDiffResponse getSnapshotDiffReport(final String volume,
private void verifySnapshotInfoForSnapDiff(final SnapshotInfo fromSnapshot,
final SnapshotInfo toSnapshot)
throws IOException {
if ((fromSnapshot.getSnapshotStatus() != SnapshotStatus.SNAPSHOT_ACTIVE) ||
(toSnapshot.getSnapshotStatus() != SnapshotStatus.SNAPSHOT_ACTIVE)) {
// TODO: [SNAPSHOT] Throw custom snapshot exception.
throw new IOException("Cannot generate snapshot diff for non-active " +
"snapshots.");
}
// Block SnapDiff if either of the snapshots is not active.
checkSnapshotActive(fromSnapshot);
checkSnapshotActive(toSnapshot);
// Check snapshot creation time
if (fromSnapshot.getCreationTime() > toSnapshot.getCreationTime()) {
throw new IOException("fromSnapshot:" + fromSnapshot.getName() +
" should be older than to toSnapshot:" + toSnapshot.getName());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,10 +184,7 @@ public OMClientResponse validateAndUpdateCache(OzoneManager ozoneManager,
omClientResponse = new OMSnapshotDeleteResponse(
omResponse.build(), tableKey, snapshotInfo);

// Evict the snapshot entry from cache, and close the snapshot DB
// Nothing happens if the key doesn't exist in cache (snapshot not loaded)
ozoneManager.getOmSnapshotManager().getSnapshotCache()
.invalidate(tableKey);
// No longer need to invalidate the entry in the snapshot cache here.

} catch (IOException ex) {
exception = ex;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_FORCE_FULL_DIFF;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_FORCE_FULL_DIFF_DEFAULT;
import static org.apache.hadoop.ozone.om.OmSnapshotManager.DELIMITER;
import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.checkSnapshotActive;
import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.dropColumnFamilyHandle;
import static org.apache.hadoop.ozone.OzoneConsts.OZONE_URI_DELIMITER;
import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.getSnapshotInfo;
Expand Down Expand Up @@ -1147,6 +1148,11 @@ private synchronized void loadJobOnStartUp(final String jobKey,

String fsKey = SnapshotInfo.getTableKey(volume, bucket, fromSnapshot);
String tsKey = SnapshotInfo.getTableKey(volume, bucket, toSnapshot);

// Block SnapDiff if either one of the snapshots is not active
checkSnapshotActive(ozoneManager, fsKey);
checkSnapshotActive(ozoneManager, tsKey);

try {
submitSnapDiffJob(jobKey, jobId, volume, bucket, snapshotCache.get(fsKey),
snapshotCache.get(tsKey), fsInfo, tsInfo, forceFullDiff);
Expand Down
Loading