@@ -17,6 +17,7 @@

package org.apache.hadoop.hdds.utils;

import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
@@ -40,8 +41,9 @@ public MapBackedTableIterator(TreeMap<String, V> values, String prefix) {
@Override
public void seekToFirst() {
this.itr = this.values.entrySet().stream()
.filter(e -> prefix == null || e.getKey().startsWith(prefix))
.map(e -> Table.newKeyValue(e.getKey(), e.getValue())).iterator();
.filter(e -> prefix == null || e.getKey().startsWith(prefix)).map(
e -> Table.newKeyValue(e.getKey(), e.getValue(),
e.getValue().toString().getBytes(StandardCharsets.UTF_8).length)).iterator();
}

@Override
@@ -53,8 +55,8 @@ public void seekToLast() {
public Table.KeyValue<String, V> seek(String s) {
this.itr = this.values.entrySet().stream()
.filter(e -> prefix == null || e.getKey().startsWith(prefix))
.filter(e -> e.getKey().compareTo(s) >= 0)
.map(e -> Table.newKeyValue(e.getKey(), e.getValue())).iterator();
.filter(e -> e.getKey().compareTo(s) >= 0).map(e -> Table.newKeyValue(e.getKey(), e.getValue(),
e.getValue().toString().getBytes(StandardCharsets.UTF_8).length)).iterator();
Map.Entry<String, V> firstEntry = values.ceilingEntry(s);
return firstEntry == null ? null : Table.newKeyValue(firstEntry.getKey(), firstEntry.getValue());
}
@@ -29,6 +29,7 @@
import java.util.UUID;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.conf.StorageUnit;
import org.apache.hadoop.hdds.utils.IOUtils;
import org.apache.hadoop.ozone.MiniOzoneCluster;
import org.apache.hadoop.ozone.TestDataUtil;
@@ -56,6 +57,7 @@ public class TestKeyPurging {
private static final int NUM_KEYS = 10;
private static final int KEY_SIZE = 100;
private OzoneClient client;
private int ratisLimit;

@BeforeEach
public void setup() throws Exception {
@@ -74,6 +76,11 @@ public void setup() throws Exception {
client = OzoneClientFactory.getRpcClient(conf);
store = client.getObjectStore();
om = cluster.getOzoneManager();
int limit = (int) conf.getStorageSize(
OMConfigKeys.OZONE_OM_RATIS_LOG_APPENDER_QUEUE_BYTE_LIMIT,
OMConfigKeys.OZONE_OM_RATIS_LOG_APPENDER_QUEUE_BYTE_LIMIT_DEFAULT,
StorageUnit.BYTES);
ratisLimit = (int) (limit * 0.9);
}

@AfterEach
@@ -126,7 +133,7 @@ public void testKeysPurgingByKeyDeletingService() throws Exception {
GenericTestUtils.waitFor(
() -> {
try {
return keyManager.getPendingDeletionKeys((kv) -> true, Integer.MAX_VALUE)
return keyManager.getPendingDeletionKeys((kv) -> true, Integer.MAX_VALUE, ratisLimit)
.getKeyBlocksList().isEmpty();
} catch (IOException e) {
return false;
@@ -124,7 +124,7 @@ ListKeysResult listKeys(String volumeName, String bucketName, String startKey,
* @throws IOException if an I/O error occurs while fetching the keys.
*/
PendingKeysDeletion getPendingDeletionKeys(
CheckedFunction<Table.KeyValue<String, OmKeyInfo>, Boolean, IOException> filter, int count)
CheckedFunction<Table.KeyValue<String, OmKeyInfo>, Boolean, IOException> filter, int count, int ratisByteLimit)
throws IOException;

/**
@@ -142,7 +142,7 @@ PendingKeysDeletion getPendingDeletionKeys(
*/
PendingKeysDeletion getPendingDeletionKeys(
String volume, String bucket, String startKey,
CheckedFunction<Table.KeyValue<String, OmKeyInfo>, Boolean, IOException> filter, int count)
CheckedFunction<Table.KeyValue<String, OmKeyInfo>, Boolean, IOException> filter, int count, int ratisByteLimit)
throws IOException;

/**
@@ -156,7 +156,7 @@ PendingKeysDeletion getPendingDeletionKeys(
*/
List<Table.KeyValue<String, String>> getRenamesKeyEntries(
String volume, String bucket, String startKey,
CheckedFunction<Table.KeyValue<String, String>, Boolean, IOException> filter, int count)
CheckedFunction<Table.KeyValue<String, String>, Boolean, IOException> filter, int count, int ratisLimit)
throws IOException;


@@ -190,7 +190,7 @@ CheckedFunction<KeyManager, OmKeyInfo, IOException> getPreviousSnapshotOzoneKeyI
List<Table.KeyValue<String, List<OmKeyInfo>>> getDeletedKeyEntries(
String volume, String bucket, String startKey,
CheckedFunction<Table.KeyValue<String, RepeatedOmKeyInfo>, Boolean, IOException> filter,
int count) throws IOException;
int count, int ratisLimit) throws IOException;

/**
* Returns the names of up to {@code count} open keys whose age is
@@ -710,17 +710,18 @@ public ListKeysResult listKeys(String volumeName, String bucketName,

@Override
public PendingKeysDeletion getPendingDeletionKeys(
final CheckedFunction<KeyValue<String, OmKeyInfo>, Boolean, IOException> filter, final int count)
throws IOException {
return getPendingDeletionKeys(null, null, null, filter, count);
final CheckedFunction<KeyValue<String, OmKeyInfo>, Boolean, IOException> filter, final int count,
int ratisByteLimit) throws IOException {
return getPendingDeletionKeys(null, null, null, filter, count, ratisByteLimit);
}

@Override
public PendingKeysDeletion getPendingDeletionKeys(
String volume, String bucket, String startKey,
CheckedFunction<KeyValue<String, OmKeyInfo>, Boolean, IOException> filter,
int count) throws IOException {
int count, int ratisByteLimit) throws IOException {
List<BlockGroup> keyBlocksList = Lists.newArrayList();
long serializedSize = 0;
Map<String, RepeatedOmKeyInfo> keysToModify = new HashMap<>();
// Bucket prefix would be empty if volume is empty i.e. either null or "".
Optional<String> bucketPrefix = getBucketPrefix(volume, bucket, false);
@@ -741,6 +742,7 @@ public PendingKeysDeletion getPendingDeletionKeys(
List<BlockGroup> blockGroupList = Lists.newArrayList();
// Multiple keys with the same path can be queued in one DB entry
RepeatedOmKeyInfo infoList = kv.getValue();
boolean flag = false;
for (OmKeyInfo info : infoList.getOmKeyInfoList()) {

// Skip the key if the filter doesn't allow the file to be deleted.
@@ -750,12 +752,23 @@
.map(b -> new BlockID(b.getContainerID(), b.getLocalID()))).collect(Collectors.toList());
BlockGroup keyBlocks = BlockGroup.newBuilder().setKeyName(kv.getKey())
.addAllBlockIDs(blockIDS).build();
serializedSize += keyBlocks.getProto().getSerializedSize();
if (serializedSize > ratisByteLimit) {
flag = true;
LOG.info(
"Total size of cumulative keys in a cycle crossed 90% ratis limit, serialized size: {}",
serializedSize);

Contributor:

This should be a common occurrence, and this log will flood the logs. I would recommend adding a metric that can be charted for measuring progress.

Contributor (Author):

Based on my testing, this scenario doesn't appear to be very common. In a typical production case, a key might have around 10 blocks. From test results, 20,000 such keys occupy roughly 2 MB (1.8 MB to be exact). Given that 90% of the default Ratis limit is about 29 MB, it would take approximately 300,000 keys in a single iteration to reach that limit, which seems unlikely in most practical scenarios.

Moreover, to successfully delete 300,000 keys in one iteration, the following configurations would need to be tuned:

hdds.scm.block.deletion.per-interval.max (on SCM)

hdds.datanode.block.deleting.limit.per.interval (on Datanodes)

Given the above, it makes sense to downgrade this log message to DEBUG level.

Let me know your thoughts; open to suggestions.

cc @ashishkumar50
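As a quick sanity check of the arithmetic in the reply above, here is a small sketch that reproduces the estimate. It assumes the default ozone.om.ratis.log.appender.queue.byte-limit of 32 MB (consistent with the "about 29 MB" figure quoted at 90%) and takes the ~1.8 MB per 20,000 serialized keys from the author's test results, so treat the output as an order-of-magnitude estimate rather than a measured value.

// Rough estimate of how many deleted keys fit under the 90% Ratis byte limit.
// The 32 MB default and the 1.8 MB / 20,000-key figure are assumptions taken from
// the discussion above, not values read from a live cluster.
public class RatisLimitEstimate {
  public static void main(String[] args) {
    long appenderQueueByteLimit = 32L * 1024 * 1024;             // assumed default: 32 MB
    long ratisByteLimit = (long) (appenderQueueByteLimit * 0.9); // 90% threshold, ~28.8 MB
    double bytesPerKey = (1.8 * 1024 * 1024) / 20_000;           // ~94 bytes per serialized key
    long keysPerIteration = (long) (ratisByteLimit / bytesPerKey);
    System.out.printf("90%% limit: %d bytes, roughly %d keys needed to cross it%n",
        ratisByteLimit, keysPerIteration);
    // Prints roughly 320,000 keys, in line with the ~300,000 estimate above.
  }
}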
break;
}
blockGroupList.add(keyBlocks);
currentCount++;
} else {
notReclaimableKeyInfo.addOmKeyInfo(info);
}
}
if (flag) {
break;
}

List<OmKeyInfo> notReclaimableKeyInfoList = notReclaimableKeyInfo.getOmKeyInfoList();

@@ -775,8 +788,9 @@ private <V, R> List<KeyValue<String, R>> getTableEntries(String startKey,
TableIterator<String, ? extends KeyValue<String, V>> tableIterator,
Function<V, R> valueFunction,
CheckedFunction<KeyValue<String, V>, Boolean, IOException> filter,
int size) throws IOException {
int size, int ratisLimit) throws IOException {
List<KeyValue<String, R>> entries = new ArrayList<>();
int consumedSize = 0;
/* Seek to the start key if it's not null. The next key in queue is ensured to start with the bucket
prefix, {@link org.apache.hadoop.hdds.utils.db.Table#iterator(bucketPrefix)} would ensure this.
*/
@@ -789,8 +803,13 @@
while (tableIterator.hasNext() && currentCount < size) {
KeyValue<String, V> kv = tableIterator.next();
if (kv != null && filter.apply(kv)) {
entries.add(Table.newKeyValue(kv.getKey(), valueFunction.apply(kv.getValue())));
consumedSize += kv.getValueByteSize();
entries.add(Table.newKeyValue(kv.getKey(), valueFunction.apply(kv.getValue()), kv.getValueByteSize()));
currentCount++;
if (consumedSize > ratisLimit) {
LOG.info("Serialized size exceeded the ratis limit, current serailized size : {}", consumedSize);
break;
}
}
}
return entries;
@@ -811,11 +830,12 @@ private Optional<String> getBucketPrefix(String volumeName, String bucketName, b
@Override
public List<KeyValue<String, String>> getRenamesKeyEntries(
String volume, String bucket, String startKey,
CheckedFunction<KeyValue<String, String>, Boolean, IOException> filter, int size) throws IOException {
CheckedFunction<KeyValue<String, String>, Boolean, IOException> filter, int size, int ratisLimit)
throws IOException {
Optional<String> bucketPrefix = getBucketPrefix(volume, bucket, false);
try (TableIterator<String, ? extends KeyValue<String, String>>
renamedKeyIter = metadataManager.getSnapshotRenamedTable().iterator(bucketPrefix.orElse(""))) {
return getTableEntries(startKey, renamedKeyIter, Function.identity(), filter, size);
return getTableEntries(startKey, renamedKeyIter, Function.identity(), filter, size, ratisLimit);
}
}

@@ -861,11 +881,11 @@ private <T> CheckedFunction<KeyManager, T, IOException> getPreviousSnapshotOzone
public List<KeyValue<String, List<OmKeyInfo>>> getDeletedKeyEntries(
String volume, String bucket, String startKey,
CheckedFunction<KeyValue<String, RepeatedOmKeyInfo>, Boolean, IOException> filter,
int size) throws IOException {
int size, int ratisLimit) throws IOException {
Optional<String> bucketPrefix = getBucketPrefix(volume, bucket, false);
try (TableIterator<String, ? extends KeyValue<String, RepeatedOmKeyInfo>>
delKeyIter = metadataManager.getDeletedTable().iterator(bucketPrefix.orElse(""))) {
return getTableEntries(startKey, delKeyIter, RepeatedOmKeyInfo::cloneOmKeyInfoList, filter, size);
return getTableEntries(startKey, delKeyIter, RepeatedOmKeyInfo::cloneOmKeyInfoList, filter, size, ratisLimit);
}
}

@@ -272,8 +272,9 @@ void optimizeDirDeletesAndSubmitRequest(
break;
}
}
if (!purgePathRequestList.isEmpty()) {
submitPurgePaths(purgePathRequestList, snapTableKey, expectedPreviousSnapshotId);
if (purgePathRequestList.isEmpty() ||
submitPurgePaths(purgePathRequestList, snapTableKey, expectedPreviousSnapshotId) == null) {
return;
}

if (dirNum != 0 || subDirNum != 0 || subFileNum != 0) {
@@ -40,6 +40,7 @@
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.hdds.HddsUtils;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.conf.StorageUnit;
import org.apache.hadoop.hdds.scm.protocol.ScmBlockLocationProtocol;
import org.apache.hadoop.hdds.utils.BackgroundTask;
import org.apache.hadoop.hdds.utils.BackgroundTaskQueue;
@@ -88,6 +89,7 @@ public class KeyDeletingService extends AbstractKeyDeletingService {
private final AtomicLong deletedKeyCount;
private final boolean deepCleanSnapshots;
private final SnapshotChainManager snapshotChainManager;
private int ratisByteLimit;

public KeyDeletingService(OzoneManager ozoneManager,
ScmBlockLocationProtocol scmClient, long serviceInterval,
@@ -104,6 +106,12 @@ public KeyDeletingService(OzoneManager ozoneManager,
this.deepCleanSnapshots = deepCleanSnapshots;
this.snapshotChainManager = ((OmMetadataManagerImpl)ozoneManager.getMetadataManager()).getSnapshotChainManager();
this.scmClient = scmClient;
int limit = (int) ozoneManager.getConfiguration().getStorageSize(
OMConfigKeys.OZONE_OM_RATIS_LOG_APPENDER_QUEUE_BYTE_LIMIT,
OMConfigKeys.OZONE_OM_RATIS_LOG_APPENDER_QUEUE_BYTE_LIMIT_DEFAULT,
StorageUnit.BYTES);
// always go to 90% of the max limit for the request, as other headers will be added
this.ratisByteLimit = (int) (limit * 0.9);
}

/**
@@ -316,7 +324,7 @@ private OzoneManagerProtocolProtos.SetSnapshotPropertyRequest getSetSnapshotRequ
* @param keyManager KeyManager of the underlying store.
*/
private void processDeletedKeysForStore(SnapshotInfo currentSnapshotInfo, KeyManager keyManager,
int remainNum) throws IOException {
int remainNum, int ratisLimit) throws IOException {
String volume = null, bucket = null, snapshotTableKey = null;
if (currentSnapshotInfo != null) {
volume = currentSnapshotInfo.getVolumeName();
@@ -348,16 +356,24 @@ private void processDeletedKeysForStore(SnapshotInfo currentSnapshotInfo, KeyMan
ReclaimableRenameEntryFilter renameEntryFilter = new ReclaimableRenameEntryFilter(
getOzoneManager(), omSnapshotManager, snapshotChainManager, currentSnapshotInfo,
keyManager, lock)) {
List<String> renamedTableEntries =
keyManager.getRenamesKeyEntries(volume, bucket, null, renameEntryFilter, remainNum).stream()
.map(Table.KeyValue::getKey)
.collect(Collectors.toList());
List<Table.KeyValue<String, String>> renameKeyEntries =
keyManager.getRenamesKeyEntries(volume, bucket, null, renameEntryFilter, remainNum, ratisLimit);

List<String> renamedTableEntries = new ArrayList<>(renameKeyEntries.size());
int serializedSize = 0;

for (Table.KeyValue<String, String> kv : renameKeyEntries) {
renamedTableEntries.add(kv.getKey());
serializedSize += kv.getValueByteSize();
}

remainNum -= renamedTableEntries.size();
ratisLimit -= serializedSize;

// Get pending keys that can be deleted
PendingKeysDeletion pendingKeysDeletion = currentSnapshotInfo == null
? keyManager.getPendingDeletionKeys(reclaimableKeyFilter, remainNum)
: keyManager.getPendingDeletionKeys(volume, bucket, null, reclaimableKeyFilter, remainNum);
PendingKeysDeletion pendingKeysDeletion = currentSnapshotInfo == null ?
keyManager.getPendingDeletionKeys(reclaimableKeyFilter, remainNum, ratisLimit) :
keyManager.getPendingDeletionKeys(volume, bucket, null, reclaimableKeyFilter, remainNum, ratisLimit);
List<BlockGroup> keyBlocksList = pendingKeysDeletion.getKeyBlocksList();
//submit purge requests if there are renamed entries to be purged or keys to be purged.
if (!renamedTableEntries.isEmpty() || keyBlocksList != null && !keyBlocksList.isEmpty()) {
@@ -449,7 +465,7 @@ public BackgroundTaskResult call() {
snapInfo.getName())) {
KeyManager keyManager = snapInfo == null ? getOzoneManager().getKeyManager()
: omSnapshot.get().getKeyManager();
processDeletedKeysForStore(snapInfo, keyManager, remainNum);
processDeletedKeysForStore(snapInfo, keyManager, remainNum, ratisByteLimit);
}
} catch (IOException e) {
LOG.error("Error while running delete files background task for store {}. Will retry at next run.",
@@ -177,15 +177,16 @@ public BackgroundTaskResult call() throws InterruptedException {
// Get all entries from deletedKeyTable.
List<Table.KeyValue<String, List<OmKeyInfo>>> deletedKeyEntries =
snapshotKeyManager.getDeletedKeyEntries(snapInfo.getVolumeName(), snapInfo.getBucketName(),
null, (kv) -> true, remaining);
null, (kv) -> true, remaining, ratisByteLimit);
moveCount += deletedKeyEntries.size();
// Get all entries from deletedDirTable.
List<Table.KeyValue<String, OmKeyInfo>> deletedDirEntries = snapshotKeyManager.getDeletedDirEntries(
snapInfo.getVolumeName(), snapInfo.getBucketName(), remaining - moveCount);
moveCount += deletedDirEntries.size();
// Get all entries from snapshotRenamedTable.
List<Table.KeyValue<String, String>> renameEntries = snapshotKeyManager.getRenamesKeyEntries(
snapInfo.getVolumeName(), snapInfo.getBucketName(), null, (kv) -> true, remaining - moveCount);
List<Table.KeyValue<String, String>> renameEntries =
snapshotKeyManager.getRenamesKeyEntries(snapInfo.getVolumeName(), snapInfo.getBucketName(), null,
(kv) -> true, remaining - moveCount, ratisByteLimit);
moveCount += renameEntries.size();
if (moveCount > 0) {
List<SnapshotMoveKeyInfos> deletedKeys = new ArrayList<>(deletedKeyEntries.size());