Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
8423ac7
HDDS-13009. Background snapshot defrag service
smengcl Oct 7, 2025
838cd43
Make `RDBSstFileWriter` public to be used in defrag service.
smengcl Aug 19, 2025
61b9ffc
Implement `delete(key)` in `RDBSstFileWriter`.
smengcl Aug 19, 2025
73625a8
Add config `ozone.snapshot.defrag.service.timeout`, `ozone.snapshot.d…
smengcl Aug 19, 2025
f442502
Add comments/TODOs for defrag service interval/timeout
smengcl Oct 6, 2025
8e37e5d
Implement SnapshotDefragService
smengcl Aug 19, 2025
08cde3d
Make it compile
smengcl Oct 7, 2025
2cd2cbf
Checkstyle
smengcl Oct 7, 2025
6a8adb5
Add a test case for debugging
smengcl Aug 19, 2025
0a131dd
Handle yaml previousSnapshotId null
smengcl Oct 9, 2025
668799d
Use snapshot chain iterator in findFirstSnapshotNeedingDefrag
smengcl Oct 9, 2025
93db705
Merge remote-tracking branch 'asf' into HDDS-13009-defrag-service
smengcl Oct 9, 2025
135100e
Handle yaml snapId null so that it won't cause NPE. Null check should…
smengcl Oct 9, 2025
f5fd06a
Remove findPreviousDefraggedSnapshot
smengcl Oct 9, 2025
e970a3d
Remove TODO that is not applicable
smengcl Oct 9, 2025
c3de774
Add new configs to ozone-default.xml
smengcl Oct 9, 2025
92256eb
Clean up test case
smengcl Oct 9, 2025
9d57b89
Logging and TODO cleanups.
smengcl Oct 9, 2025
4a3e01a
Remove POC defrag logic and test case, keeping the skeleton
smengcl Oct 9, 2025
a17a5b6
Address applicable comments from copilot
smengcl Oct 9, 2025
2d29d7f
Revert the change that makes RDBSstFileWriter and RocksDatabase public.
smengcl Oct 9, 2025
c11eda5
Empty performFullDefragmentation and performIncrementalDefragmentatio…
smengcl Oct 9, 2025
d6a07c4
Remove unused fields
smengcl Oct 10, 2025
66d6378
Override
smengcl Oct 10, 2025
5f1c393
Checkstyle
smengcl Oct 10, 2025
9d5166a
Config
smengcl Oct 10, 2025
1c71318
Revert the revert that makes RDBSstFileWriter public.
smengcl Oct 13, 2025
d6d7c6a
Refactor. context: https://github.com/apache/ozone/pull/9133#discussi…
smengcl Oct 13, 2025
808f0bb
getSnapshot(snapshotTableKey, false); remove updateSnapshotMetadata
smengcl Oct 13, 2025
02f18ce
Use MultiSnapshotLocks.
smengcl Oct 13, 2025
2a51f60
Use `getSnapshot(UUID)`; pmd
smengcl Oct 14, 2025
eee47f3
Revert package-private
smengcl Oct 14, 2025
48c47e4
Merge remote-tracking branch 'asf' into HDDS-13780-skeleton-defrag-se…
smengcl Oct 14, 2025
f8e3077
Use new API `getOmSnapshotLocalData(SnapshotInfo)` that comes with th…
smengcl Oct 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,15 @@ public final class OzoneConfigKeys {
OZONE_SNAPSHOT_SST_FILTERING_SERVICE_TIMEOUT_DEFAULT = "300s";
// 300s for default

public static final String OZONE_SNAPSHOT_DEFRAG_SERVICE_TIMEOUT =
"ozone.snapshot.defrag.service.timeout";
public static final String
OZONE_SNAPSHOT_DEFRAG_SERVICE_TIMEOUT_DEFAULT = "300s";
// TODO: Adjust timeout as needed.
// One concern would be that snapdiff can take a long time.
// If the snapdiff wait time counts toward the timeout, the timeout behavior becomes nondeterministic.
// -- So don't wait? Trigger and check later?

public static final String OZONE_SNAPSHOT_DELETING_SERVICE_INTERVAL =
"ozone.snapshot.deleting.service.interval";
public static final String
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,7 @@ public final class OzoneConsts {
public static final String OM_SNAPSHOT_DIR = "db.snapshots";
public static final String OM_SNAPSHOT_CHECKPOINT_DIR = OM_SNAPSHOT_DIR
+ OM_KEY_PREFIX + "checkpointState";
public static final String OM_SNAPSHOT_CHECKPOINT_DEFRAGGED_DIR = "checkpointStateDefragged";
public static final String OM_SNAPSHOT_DIFF_DIR = OM_SNAPSHOT_DIR
+ OM_KEY_PREFIX + "diffState";

Expand Down
22 changes: 22 additions & 0 deletions hadoop-hdds/common/src/main/resources/ozone-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3758,13 +3758,28 @@
Snapshot Deleting Service per run.
</description>
</property>
<property>
<name>ozone.snapshot.defrag.limit.per.task</name>
<value>1</value>
<tag>OZONE, PERFORMANCE, OM</tag>
<description>The maximum number of snapshots that would be defragmented in
each task run of snapshot defragmentation service.
</description>
</property>
<property>
<name>ozone.snapshot.filtering.service.interval</name>
<value>1m</value>
<tag>OZONE, PERFORMANCE, OM</tag>
<description>Time interval of the SST File filtering service from Snapshot.
</description>
</property>
<property>
<name>ozone.snapshot.defrag.service.interval</name>
<value>-1</value>
<tag>OZONE, PERFORMANCE, OM</tag>
<description>Task interval of snapshot defragmentation service.
A non-positive value (the default, -1) disables the service.
</description>
</property>
<property>
<name>ozone.om.snapshot.checkpoint.dir.creation.poll.timeout</name>
<value>20s</value>
Expand All @@ -3781,6 +3796,13 @@
<description>A timeout value of sst filtering service.
</description>
</property>
<property>
<name>ozone.snapshot.defrag.service.timeout</name>
<value>300s</value>
<tag>OZONE, PERFORMANCE, OM</tag>
<description>Timeout value of a run of snapshot defragmentation service.
</description>
</property>

<property>
<name>ozone.filesystem.snapshot.enabled</name>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@
/**
* DumpFileWriter using rocksdb sst files.
*/
class RDBSstFileWriter implements Closeable {
public class RDBSstFileWriter implements Closeable {

private ManagedSstFileWriter sstFileWriter;
private File sstFile;
private AtomicLong keyCounter;
private ManagedOptions emptyOption = new ManagedOptions();
private final ManagedEnvOptions emptyEnvOptions = new ManagedEnvOptions();

RDBSstFileWriter(File externalFile) throws RocksDatabaseException {
public RDBSstFileWriter(File externalFile) throws RocksDatabaseException {
this.sstFileWriter = new ManagedSstFileWriter(emptyEnvOptions, emptyOption);
this.keyCounter = new AtomicLong(0);
this.sstFile = externalFile;
Expand All @@ -60,6 +60,17 @@ public void put(byte[] key, byte[] value) throws RocksDatabaseException {
}
}

/**
 * Adds a delete entry for {@code key} to the SST file being written and
 * bumps the key counter once the entry has been accepted.
 *
 * @param key the key to record a deletion for
 * @throws RocksDatabaseException if the underlying writer rejects the
 *         operation; {@code closeOnFailure()} is invoked before rethrowing
 */
public void delete(byte[] key) throws RocksDatabaseException {
  try {
    sstFileWriter.delete(key);
  } catch (RocksDBException cause) {
    // Release the writer first, then report which file and key size failed.
    closeOnFailure();
    final String message = "Failed to delete key (length=" + key.length
        + "), sstFile=" + sstFile.getAbsolutePath();
    throw new RocksDatabaseException(message, cause);
  }
  // Only reached when the delete entry was written successfully.
  keyCounter.incrementAndGet();
}

@Override
public void close() throws RocksDatabaseException {
if (sstFileWriter != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -428,11 +428,22 @@ public final class OMConfigKeys {
"ozone.snapshot.deleting.limit.per.task";
public static final int SNAPSHOT_DELETING_LIMIT_PER_TASK_DEFAULT = 10;

// Snapshot defragmentation service configuration
public static final String SNAPSHOT_DEFRAG_LIMIT_PER_TASK =
"ozone.snapshot.defrag.limit.per.task";
public static final int SNAPSHOT_DEFRAG_LIMIT_PER_TASK_DEFAULT = 1;

public static final String OZONE_SNAPSHOT_SST_FILTERING_SERVICE_INTERVAL =
"ozone.snapshot.filtering.service.interval";
public static final String
OZONE_SNAPSHOT_SST_FILTERING_SERVICE_INTERVAL_DEFAULT = "60s";

public static final String OZONE_SNAPSHOT_DEFRAG_SERVICE_INTERVAL =
"ozone.snapshot.defrag.service.interval";
public static final String
OZONE_SNAPSHOT_DEFRAG_SERVICE_INTERVAL_DEFAULT = "-1";
// TODO: Disabled by default. Do not enable by default until upgrade handling is complete.

public static final String
OZONE_SNAPSHOT_CHECKPOINT_DIR_CREATION_POLL_TIMEOUT =
"ozone.om.snapshot.checkpoint.dir.creation.poll.timeout";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,12 @@ DeleteKeysResult getPendingDeletionSubFiles(long volumeId,
*/
SstFilteringService getSnapshotSstFilteringService();

/**
 * Returns the instance of Snapshot Defrag service.
 * NOTE(review): the KeyManagerImpl implementation documents that this can be
 * null when the service is not initialized -- confirm callers handle that.
 * @return Background service.
 */
SnapshotDefragService getSnapshotDefragService();

/**
* Returns the instance of Snapshot Deleting service.
* @return Background service.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT_DEFAULT;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_SCM_BLOCK_SIZE;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_SCM_BLOCK_SIZE_DEFAULT;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_SNAPSHOT_DEFRAG_SERVICE_TIMEOUT;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_SNAPSHOT_DEFRAG_SERVICE_TIMEOUT_DEFAULT;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_SNAPSHOT_DELETING_SERVICE_INTERVAL;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_SNAPSHOT_DELETING_SERVICE_INTERVAL_DEFAULT;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_SNAPSHOT_DELETING_SERVICE_TIMEOUT;
Expand Down Expand Up @@ -58,6 +60,8 @@
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_OPEN_KEY_CLEANUP_SERVICE_TIMEOUT_DEFAULT;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_SNAPSHOT_DEEP_CLEANING_ENABLED;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_SNAPSHOT_DEEP_CLEANING_ENABLED_DEFAULT;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_SNAPSHOT_DEFRAG_SERVICE_INTERVAL;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_SNAPSHOT_DEFRAG_SERVICE_INTERVAL_DEFAULT;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_SNAPSHOT_SST_FILTERING_SERVICE_INTERVAL;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_SNAPSHOT_SST_FILTERING_SERVICE_INTERVAL_DEFAULT;
import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_THREAD_NUMBER_DIR_DELETION;
Expand Down Expand Up @@ -202,6 +206,7 @@ public class KeyManagerImpl implements KeyManager {
private KeyDeletingService keyDeletingService;

private SstFilteringService snapshotSstFilteringService;
private SnapshotDefragService snapshotDefragService;
private SnapshotDeletingService snapshotDeletingService;

private final KeyProviderCryptoExtension kmsProvider;
Expand Down Expand Up @@ -310,6 +315,11 @@ public void start(OzoneConfiguration configuration) {
startSnapshotSstFilteringService(configuration);
}

if (snapshotDefragService == null &&
ozoneManager.isFilesystemSnapshotEnabled()) {
startSnapshotDefragService(configuration);
}

if (snapshotDeletingService == null &&
ozoneManager.isFilesystemSnapshotEnabled()) {

Expand Down Expand Up @@ -393,6 +403,42 @@ public void stopSnapshotSstFilteringService() {
}
}

/**
 * Start the snapshot defrag service if interval is not set to disabled value.
 * <p>
 * The service is started only when {@link #isDefragSvcEnabled()} reports it
 * as enabled and the interval read from {@code conf} is positive. The second
 * check matters because {@link #isDefragSvcEnabled()} consults the
 * OzoneManager's own configuration, which is not necessarily the same object
 * as {@code conf}.
 *
 * @param conf configuration from which the service interval and timeout
 *             are read
 */
public void startSnapshotDefragService(OzoneConfiguration conf) {
  if (!isDefragSvcEnabled()) {
    LOG.info("SnapshotDefragService is disabled. Snapshot defragmentation will not run periodically.");
    return;
  }

  final long serviceInterval = conf.getTimeDuration(
      OZONE_SNAPSHOT_DEFRAG_SERVICE_INTERVAL,
      OZONE_SNAPSHOT_DEFRAG_SERVICE_INTERVAL_DEFAULT,
      TimeUnit.MILLISECONDS);
  if (serviceInterval <= 0) {
    // any interval <= 0 causes IllegalArgumentException from scheduleWithFixedDelay,
    // so never hand such a value to the service even if the OM config says "enabled".
    LOG.warn("SnapshotDefragService is not started: {} resolves to a non-positive interval ({} ms) "
        + "in the supplied configuration.", OZONE_SNAPSHOT_DEFRAG_SERVICE_INTERVAL, serviceInterval);
    return;
  }

  final long serviceTimeout = conf.getTimeDuration(
      OZONE_SNAPSHOT_DEFRAG_SERVICE_TIMEOUT,
      OZONE_SNAPSHOT_DEFRAG_SERVICE_TIMEOUT_DEFAULT,
      TimeUnit.MILLISECONDS);

  snapshotDefragService =
      new SnapshotDefragService(serviceInterval, TimeUnit.MILLISECONDS,
          serviceTimeout, ozoneManager, conf);
  snapshotDefragService.start();
}

/**
 * Shuts down the snapshot defrag service and drops the reference to it.
 * When no service instance is held, only an informational message is logged.
 */
public void stopSnapshotDefragService() {
  if (snapshotDefragService == null) {
    LOG.info("SnapshotDefragService is already stopped or not started.");
    return;
  }
  snapshotDefragService.shutdown();
  snapshotDefragService = null;
}

private void startCompactionService(OzoneConfiguration configuration,
boolean isCompactionServiceEnabled) {
if (compactionService == null && isCompactionServiceEnabled) {
Expand All @@ -419,7 +465,7 @@ KeyProviderCryptoExtension getKMSProvider() {
}

@Override
public void stop() throws IOException {
public void stop() {
if (keyDeletingService != null) {
keyDeletingService.shutdown();
keyDeletingService = null;
Expand All @@ -436,6 +482,10 @@ public void stop() throws IOException {
snapshotSstFilteringService.shutdown();
snapshotSstFilteringService = null;
}
if (snapshotDefragService != null) {
snapshotDefragService.shutdown();
snapshotDefragService = null;
}
if (snapshotDeletingService != null) {
snapshotDeletingService.shutdown();
snapshotDeletingService = null;
Expand All @@ -450,6 +500,16 @@ public void stop() throws IOException {
}
}

/**
 * Get the SnapshotDefragService instance currently held by this key manager.
 *
 * @return SnapshotDefragService instance, or null if not initialized
 *         (the service was never started, e.g. because it is disabled,
 *         or it has been stopped)
 */
@Override
public SnapshotDefragService getSnapshotDefragService() {
return snapshotDefragService;
}

private OmBucketInfo getBucketInfo(String volumeName, String bucketName)
throws IOException {
String bucketKey = metadataManager.getBucketKey(volumeName, bucketName);
Expand Down Expand Up @@ -973,7 +1033,16 @@ public boolean isSstFilteringSvcEnabled() {
// any interval <= 0 causes IllegalArgumentException from scheduleWithFixedDelay
return serviceInterval > 0;
}


/**
 * Tells whether the snapshot defrag service is enabled, which is the case
 * when its interval in the OzoneManager configuration is positive.
 *
 * @return true if the configured defrag service interval is greater than zero
 */
public boolean isDefragSvcEnabled() {
  final long intervalMillis = ozoneManager.getConfiguration().getTimeDuration(
      OZONE_SNAPSHOT_DEFRAG_SERVICE_INTERVAL,
      OZONE_SNAPSHOT_DEFRAG_SERVICE_INTERVAL_DEFAULT,
      TimeUnit.MILLISECONDS);
  // any interval <= 0 causes IllegalArgumentException from scheduleWithFixedDelay,
  // so such values are treated as "service disabled".
  return intervalMillis > 0;
}

@Override
public OmMultipartUploadList listMultipartUploads(String volumeName,
String bucketName,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,11 @@ private final class ConstructSnapshotLocalData extends AbstractConstruct {
public Object construct(Node node) {
MappingNode mnode = (MappingNode) node;
Map<Object, Object> nodes = constructMapping(mnode);
UUID snapId = UUID.fromString((String) nodes.get(OzoneConsts.OM_SLD_SNAP_ID));
UUID prevSnapId = UUID.fromString((String) nodes.get(OzoneConsts.OM_SLD_PREV_SNAP_ID));
OmSnapshotLocalData snapshotLocalData = new OmSnapshotLocalData(snapId, Collections.emptyList(),
prevSnapId);
final String snapIdStr = (String) nodes.get(OzoneConsts.OM_SLD_SNAP_ID);
UUID snapId = UUID.fromString(snapIdStr);
final String prevSnapIdStr = (String) nodes.get(OzoneConsts.OM_SLD_PREV_SNAP_ID);
UUID prevSnapId = prevSnapIdStr != null ? UUID.fromString(prevSnapIdStr) : null;
OmSnapshotLocalData snapshotLocalData = new OmSnapshotLocalData(snapId, Collections.emptyList(), prevSnapId);

// Set version from YAML
Integer version = (Integer) nodes.get(OzoneConsts.OM_SLD_VERSION);
Expand Down
Loading