Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,7 @@ public final class OzoneConfigKeys {
"org.apache.hadoop.ozone.om.TrashPolicyOzone";


// TODO: [SNAPSHOT] Document this in ozone-default.xml
public static final String OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE =
"ozone.om.snapshot.cache.max.size";
public static final int OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE_DEFAULT = 10;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
import org.apache.hadoop.ozone.client.OzoneVolume;
import org.apache.hadoop.ozone.client.io.OzoneInputStream;
import org.apache.hadoop.ozone.client.io.OzoneOutputStream;
import org.apache.hadoop.ozone.om.exceptions.OMException;
import org.apache.hadoop.ozone.om.helpers.BucketLayout;
import org.apache.hadoop.ozone.om.helpers.OmKeyArgs;
import org.apache.hadoop.ozone.om.helpers.OpenKeySession;
Expand All @@ -59,6 +58,7 @@
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
Expand Down Expand Up @@ -411,15 +411,19 @@ public void testBlockSnapshotFSAccessAfterDeletion() throws Exception {
deleteSnapshot(snapshotName);

// Can't access keys in snapshot anymore with FS API. Should throw exception
final String errorMsg = "no longer active";
LambdaTestUtils.intercept(OMException.class, errorMsg,
final String errorMsg1 = "no longer active";
LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg1,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 on checking exception.

() -> o3fs.listStatus(snapshotRoot));
LambdaTestUtils.intercept(OMException.class, errorMsg,
LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg1,
() -> o3fs.listStatus(snapshotParent));

LambdaTestUtils.intercept(OMException.class, errorMsg,
// Note: Different error message due to inconsistent FNFE client-side
// handling in BasicOzoneClientAdapterImpl#getFileStatus
// TODO: Reconciliation?
final String errorMsg2 = "No such file or directory";
LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg2,
() -> o3fs.getFileStatus(snapshotKey1));
LambdaTestUtils.intercept(OMException.class, errorMsg,
LambdaTestUtils.intercept(FileNotFoundException.class, errorMsg2,
() -> o3fs.getFileStatus(snapshotKey2));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,19 @@ public void close() throws IOException {
omMetadataManager.getStore().close();
}

@Override
protected void finalize() throws Throwable {
// Verify that the DB handle has been closed, log warning otherwise
// https://softwareengineering.stackexchange.com/a/288724
if (!omMetadataManager.getStore().isClosed()) {
LOG.warn("{} is not closed properly. snapshotName: {}",
// Print hash code for debugging
omMetadataManager.getStore().toString(),
snapshotName);
}
super.finalize();
}

@VisibleForTesting
public OMMetadataManager getMetadataManager() {
return omMetadataManager;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@
import org.apache.hadoop.ozone.om.exceptions.OMException;
import org.apache.hadoop.ozone.om.helpers.RepeatedOmKeyInfo;
import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
import org.apache.hadoop.ozone.om.helpers.SnapshotInfo.SnapshotStatus;
import org.apache.hadoop.ozone.om.service.SnapshotDeletingService;
import org.apache.hadoop.ozone.om.service.SnapshotDiffCleanupService;
import org.apache.hadoop.ozone.om.snapshot.SnapshotDiffJob;
import org.apache.hadoop.ozone.om.snapshot.SnapshotDiffManager;
Expand All @@ -84,8 +82,8 @@
import static org.apache.hadoop.ozone.OzoneConsts.OM_SNAPSHOT_DIFF_DB_NAME;
import static org.apache.hadoop.ozone.OzoneConsts.OM_SNAPSHOT_INDICATOR;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_SNAPSHOT_DIFF_DB_DIR;
import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.FILE_NOT_FOUND;
import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.INVALID_KEY_NAME;
import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.checkSnapshotActive;
import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.dropColumnFamilyHandle;

/**
Expand Down Expand Up @@ -150,6 +148,9 @@ public final class OmSnapshotManager implements AutoCloseable {
private final CodecRegistry codecRegistry;
private final SnapshotDiffCleanupService snapshotDiffCleanupService;

// Soft limit of the snapshot cache size.
private final int softCacheSize;

public OmSnapshotManager(OzoneManager ozoneManager) {
this.options = new ManagedDBOptions();
this.options.setCreateIfMissing(true);
Expand Down Expand Up @@ -194,29 +195,42 @@ public OmSnapshotManager(OzoneManager ozoneManager) {
.getStore()
.getRocksDBCheckpointDiffer();

// size of lru cache
int cacheSize = ozoneManager.getConfiguration().getInt(
// snapshot cache size, soft-limit
this.softCacheSize = ozoneManager.getConfiguration().getInt(
OzoneConfigKeys.OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE,
OzoneConfigKeys.OZONE_OM_SNAPSHOT_CACHE_MAX_SIZE_DEFAULT);

CacheLoader<String, OmSnapshot> loader = createCacheLoader();

RemovalListener<String, OmSnapshot> removalListener
= notification -> {
try {
// close snapshot's rocksdb on eviction
LOG.debug("Closing snapshot: {}", notification.getKey());
// TODO: [SNAPSHOT] HDDS-7935.Close only when refcount reaches zero?
notification.getValue().close();
} catch (IOException e) {
LOG.error("Failed to close snapshot: {} {}",
notification.getKey(), e);
}
};
RemovalListener<String, OmSnapshot> removalListener = notification -> {
try {
final String snapshotTableKey = notification.getKey();
final OmSnapshot omSnapshot = notification.getValue();
if (omSnapshot != null) {
// close snapshot's rocksdb on eviction
LOG.debug("Closing OmSnapshot '{}' due to {}",
snapshotTableKey, notification.getCause());
omSnapshot.close();
} else {
// Cache value would never be null in RemovalListener.

// When a soft reference is GC'ed by the JVM, this RemovalListener
// would be called. But the cache value should still exist at that
// point. Thus in-theory this condition won't be triggered by JVM GC
throw new IllegalStateException("Unexpected: OmSnapshot is null");
}
} catch (IOException e) {
LOG.error("Failed to close OmSnapshot: {}", notification.getKey(), e);
}
};

// init LRU cache
snapshotCache = CacheBuilder.newBuilder()
.maximumSize(cacheSize)
this.snapshotCache = CacheBuilder.newBuilder()
// Indicating OmSnapshot instances are softly referenced from the cache.
// If no thread is holding a strong reference to an OmSnapshot instance
// (e.g. SnapDiff), the instance could be garbage collected by JVM at
// its discretion.
.softValues()
.removalListener(removalListener)
.build(loader);

Expand Down Expand Up @@ -249,31 +263,12 @@ private CacheLoader<String, OmSnapshot> createCacheLoader() {
@Nonnull
public OmSnapshot load(@Nonnull String snapshotTableKey)
throws IOException {
SnapshotInfo snapshotInfo;
// see if the snapshot exists
snapshotInfo = getSnapshotInfo(snapshotTableKey);
SnapshotInfo snapshotInfo = getSnapshotInfo(snapshotTableKey);

// Block snapshot from loading when it is no longer active e.g. DELETED,
// unless this is called from SnapshotDeletingService.
// TODO: [SNAPSHOT] However, snapshotCache.get() from other requests
// (not from SDS) would be able to piggyback off of this because
// snapshot still in cache won't trigger loader again.
// This needs proper addressal in e.g. HDDS-7935
// by introducing another cache just for SDS.
// While the snapshotCache would host ACTIVE snapshots only.
if (!snapshotInfo.getSnapshotStatus().equals(
SnapshotStatus.SNAPSHOT_ACTIVE)) {
if (isCalledFromSnapshotDeletingService()) {
LOG.debug("Permitting {} to load snapshot {} in status: {}",
SnapshotDeletingService.class.getSimpleName(),
snapshotInfo.getTableKey(),
snapshotInfo.getSnapshotStatus());
} else {
throw new OMException("Unable to load snapshot. " +
"Snapshot with table key '" + snapshotTableKey +
"' is no longer active", FILE_NOT_FOUND);
}
}
checkSnapshotActive(snapshotInfo);

CacheValue<SnapshotInfo> cacheValue =
ozoneManager.getMetadataManager().getSnapshotInfoTable()
Expand All @@ -292,7 +287,7 @@ public OmSnapshot load(@Nonnull String snapshotTableKey)
snapshotMetadataManager = new OmMetadataManagerImpl(conf,
snapshotInfo.getCheckpointDirName(), isSnapshotInCache);
} catch (IOException e) {
LOG.error("Failed to retrieve snapshot: {}, {}", snapshotTableKey, e);
LOG.error("Failed to retrieve snapshot: {}", snapshotTableKey);
throw e;
}

Expand Down Expand Up @@ -326,24 +321,6 @@ private CodecRegistry createCodecRegistryForSnapDiff() {
return registry;
}

/**
* Helper method to check whether the loader is called from
* SnapshotDeletingTask (return true) or not (return false).
*/
private boolean isCalledFromSnapshotDeletingService() {

StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace();
for (StackTraceElement elem : stackTrace) {
// Allow as long as loader is called from SDS. e.g. SnapshotDeletingTask
if (elem.getClassName().startsWith(
SnapshotDeletingService.class.getName())) {
return true;
}
}

return false;
}

/**
* Get snapshot instance LRU cache.
* @return LoadingCache
Expand Down Expand Up @@ -433,7 +410,7 @@ private static void deleteKeysInSnapshotScopeFromDTableInternal(

try (TableIterator<String,
? extends Table.KeyValue<String, RepeatedOmKeyInfo>>
keyIter = omMetadataManager.getDeletedTable().iterator()) {
keyIter = omMetadataManager.getDeletedTable().iterator()) {

keyIter.seek(beginKey);
// Continue only when there are entries of snapshot (bucket) scope
Expand Down Expand Up @@ -500,6 +477,15 @@ public IOmMetadataReader checkForSnapshot(String volumeName,
String snapshotTableKey = SnapshotInfo.getTableKey(volumeName,
bucketName, snapshotName);

// Block FS API reads when snapshot is not active.
checkSnapshotActive(ozoneManager, snapshotTableKey);

// Warn if actual cache size exceeds the soft limit already.
if (snapshotCache.size() > softCacheSize) {
LOG.warn("Snapshot cache size ({}) exceeds configured soft-limit ({}).",
snapshotCache.size(), softCacheSize);
}

// retrieve the snapshot from the cache
try {
return snapshotCache.get(snapshotTableKey);
Expand All @@ -521,7 +507,7 @@ public static String getSnapshotPrefix(String snapshotName) {
}

public static String getSnapshotPath(OzoneConfiguration conf,
SnapshotInfo snapshotInfo) {
SnapshotInfo snapshotInfo) {
return OMStorage.getOmDbDir(conf) +
OM_KEY_PREFIX + OM_SNAPSHOT_CHECKPOINT_DIR + OM_KEY_PREFIX +
OM_DB_NAME + snapshotInfo.getCheckpointDirName();
Expand Down Expand Up @@ -561,7 +547,7 @@ public SnapshotDiffResponse getSnapshotDiffReport(final String volume,
final OmSnapshot fs = snapshotCache.get(fsKey);
final OmSnapshot ts = snapshotCache.get(tsKey);
return snapshotDiffManager.getSnapshotDiffReport(volume, bucket, fs, ts,
fsInfo, tsInfo, index, pageSize, forceFullDiff);
fsInfo, tsInfo, index, pageSize, forceFullDiff);
} catch (ExecutionException e) {
throw new IOException(e.getCause());
}
Expand All @@ -570,12 +556,10 @@ public SnapshotDiffResponse getSnapshotDiffReport(final String volume,
private void verifySnapshotInfoForSnapDiff(final SnapshotInfo fromSnapshot,
final SnapshotInfo toSnapshot)
throws IOException {
if ((fromSnapshot.getSnapshotStatus() != SnapshotStatus.SNAPSHOT_ACTIVE) ||
(toSnapshot.getSnapshotStatus() != SnapshotStatus.SNAPSHOT_ACTIVE)) {
// TODO: [SNAPSHOT] Throw custom snapshot exception.
throw new IOException("Cannot generate snapshot diff for non-active " +
"snapshots.");
}
// Block SnapDiff if either of the snapshots is not active.
checkSnapshotActive(fromSnapshot);
checkSnapshotActive(toSnapshot);
// Check snapshot creation time
if (fromSnapshot.getCreationTime() > toSnapshot.getCreationTime()) {
throw new IOException("fromSnapshot:" + fromSnapshot.getName() +
" should be older than to toSnapshot:" + toSnapshot.getName());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,10 +184,7 @@ public OMClientResponse validateAndUpdateCache(OzoneManager ozoneManager,
omClientResponse = new OMSnapshotDeleteResponse(
omResponse.build(), tableKey, snapshotInfo);

// Evict the snapshot entry from cache, and close the snapshot DB
// Nothing happens if the key doesn't exist in cache (snapshot not loaded)
ozoneManager.getOmSnapshotManager().getSnapshotCache()
.invalidate(tableKey);
// No longer need to invalidate the entry in the snapshot cache here.

} catch (IOException ex) {
exception = ex;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@

import static org.apache.hadoop.ozone.OzoneConsts.OM_KEY_PREFIX;
import static org.apache.hadoop.ozone.om.OmSnapshotManager.DELIMITER;
import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.checkSnapshotActive;
import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.dropColumnFamilyHandle;
import static org.apache.hadoop.ozone.OzoneConsts.OZONE_URI_DELIMITER;
import static org.apache.hadoop.ozone.om.snapshot.SnapshotUtils.getSnapshotInfo;
Expand Down Expand Up @@ -1055,6 +1056,11 @@ private synchronized void loadJobOnStartUp(final String jobKey,

String fsKey = SnapshotInfo.getTableKey(volume, bucket, fromSnapshot);
String tsKey = SnapshotInfo.getTableKey(volume, bucket, toSnapshot);

// Block SnapDiff if either one of the snapshots is not active
checkSnapshotActive(ozoneManager, fsKey);
checkSnapshotActive(ozoneManager, tsKey);

try {
submitSnapDiffJob(jobKey, jobId, volume, bucket, snapshotCache.get(fsKey),
snapshotCache.get(tsKey), fsInfo, tsInfo, forceFullDiff);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,14 @@
import org.apache.hadoop.ozone.om.OzoneManager;
import org.apache.hadoop.ozone.om.exceptions.OMException;
import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
import org.apache.hadoop.ozone.om.helpers.SnapshotInfo.SnapshotStatus;
import org.apache.hadoop.ozone.om.service.SnapshotDeletingService;
import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.RocksDBException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.FILE_NOT_FOUND;
import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.KEY_NOT_FOUND;

/**
Expand Down Expand Up @@ -83,4 +86,51 @@ public static void dropColumnFamilyHandle(
throw new RuntimeException(exception);
}
}


/**
* Throws OMException FILE_NOT_FOUND if snapshot is not in active status.
* @param snapshotTableKey snapshot table key
*/
public static void checkSnapshotActive(OzoneManager ozoneManager,
String snapshotTableKey)
throws IOException {
checkSnapshotActive(getSnapshotInfo(ozoneManager, snapshotTableKey));
}

public static void checkSnapshotActive(SnapshotInfo snapInfo)
throws OMException {

if (snapInfo.getSnapshotStatus() != SnapshotStatus.SNAPSHOT_ACTIVE) {
if (isCalledFromSnapshotDeletingService()) {
LOG.debug("Permitting {} to load snapshot {} even in status: {}",
SnapshotDeletingService.class.getSimpleName(),
snapInfo.getTableKey(),
snapInfo.getSnapshotStatus());
} else {
throw new OMException("Unable to load snapshot. " +
"Snapshot with table key '" + snapInfo.getTableKey() +
"' is no longer active", FILE_NOT_FOUND);
}
}
}

/**
* Helper method to check whether the loader is called from
* SnapshotDeletingTask (return true) or not (return false).
*/
private static boolean isCalledFromSnapshotDeletingService() {

StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace();
for (StackTraceElement elem : stackTrace) {
// Allow as long as loader is called from SDS. e.g. SnapshotDeletingTask
if (elem.getClassName().startsWith(
SnapshotDeletingService.class.getName())) {
return true;
}
}

return false;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,11 @@ public void testCloseOnEviction() throws IOException {
.checkForSnapshot(second.getVolumeName(),
second.getBucketName(), getSnapshotPrefix(second.getName()));

// As a workaround, invalidate all cache entries in order to trigger
// instances close in this test case, since JVM GC most likely would not
// have triggered and closed the instances yet at this point.
omSnapshotManager.getSnapshotCache().invalidateAll();

// confirm store was closed
verify(firstSnapshotStore, timeout(3000).times(1)).close();
}
Expand Down