Skip to content

Commit 9addf0b

Browse files
Stop Blocking Snapshot Deletes Due to Concurrency Limits (#71050)
Limiting the number of concurrent snapshots is useful in preventing excessive memory use. For deletes that aren't actively executing memory use is negligible. We also only have at the most two delete snapshot entries per repository (a currently executing one and a queued up one that that new delete requests get batched into). Thus there is no good reason to prevent snapshot deletes via the concurrent operations limit to limit memory use. On the other hand though, not allowing deletes and specifically aborts to exceed the concurrency limits makes it impossible for a user that already has the maximum number of snapshot create operations running to abort any of them.
1 parent c8415a7 commit 9addf0b

File tree

2 files changed

+26
-17
lines changed

2 files changed

+26
-17
lines changed

server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1112,41 +1112,52 @@ public void testConcurrentOperationsLimit() throws Exception {
11121112

11131113
blockNodeOnAnyFiles(repoName, masterName);
11141114
int blockedSnapshots = 0;
1115-
boolean blockedDelete = false;
11161115
final List<ActionFuture<CreateSnapshotResponse>> snapshotFutures = new ArrayList<>();
11171116
ActionFuture<AcknowledgedResponse> deleteFuture = null;
11181117
for (int i = 0; i < limitToTest; ++i) {
1119-
if (blockedDelete || randomBoolean()) {
1118+
if (deleteFuture != null || randomBoolean()) {
11201119
snapshotFutures.add(startFullSnapshot(repoName, "snap-" + i));
11211120
++blockedSnapshots;
11221121
} else {
1123-
blockedDelete = true;
11241122
deleteFuture = startDeleteSnapshot(repoName, randomFrom(snapshotNames));
11251123
}
11261124
}
11271125
awaitNumberOfSnapshotsInProgress(blockedSnapshots);
1128-
if (blockedDelete) {
1126+
if (deleteFuture != null) {
11291127
awaitNDeletionsInProgress(1);
11301128
}
11311129
waitForBlock(masterName, repoName);
11321130

1133-
final String expectedFailureMessage = "Cannot start another operation, already running [" + limitToTest +
1134-
"] operations and the current limit for concurrent snapshot operations is set to [" + limitToTest + "]";
1135-
final ConcurrentSnapshotExecutionException csen1 = expectThrows(ConcurrentSnapshotExecutionException.class,
1131+
final ConcurrentSnapshotExecutionException cse = expectThrows(ConcurrentSnapshotExecutionException.class,
11361132
() -> client().admin().cluster().prepareCreateSnapshot(repoName, "expected-to-fail").execute().actionGet());
1137-
assertThat(csen1.getMessage(), containsString(expectedFailureMessage));
1138-
if (blockedDelete == false || limitToTest == 1) {
1139-
final ConcurrentSnapshotExecutionException csen2 = expectThrows(ConcurrentSnapshotExecutionException.class,
1140-
() -> client().admin().cluster().prepareDeleteSnapshot(repoName, "*").execute().actionGet());
1141-
assertThat(csen2.getMessage(), containsString(expectedFailureMessage));
1133+
assertThat(cse.getMessage(), containsString("Cannot start another operation, already running [" + limitToTest +
1134+
"] operations and the current limit for concurrent snapshot operations is set to [" + limitToTest + "]"));
1135+
boolean deleteAndAbortAll = false;
1136+
if (deleteFuture == null && randomBoolean()) {
1137+
deleteFuture = client().admin().cluster().prepareDeleteSnapshot(repoName, "*").execute();
1138+
deleteAndAbortAll = true;
1139+
if (randomBoolean()) {
1140+
awaitNDeletionsInProgress(1);
1141+
}
11421142
}
11431143

11441144
unblockNode(repoName, masterName);
11451145
if (deleteFuture != null) {
11461146
assertAcked(deleteFuture.get());
11471147
}
1148-
for (ActionFuture<CreateSnapshotResponse> snapshotFuture : snapshotFutures) {
1149-
assertSuccessful(snapshotFuture);
1148+
1149+
if (deleteAndAbortAll) {
1150+
awaitNumberOfSnapshotsInProgress(0);
1151+
for (ActionFuture<CreateSnapshotResponse> snapshotFuture : snapshotFutures) {
1152+
// just check that the futures resolve, whether or not things worked out with the snapshot actually finalizing or failing
1153+
// due to the abort does not matter
1154+
assertBusy(() -> assertTrue(snapshotFuture.isDone()));
1155+
}
1156+
assertThat(getRepositoryData(repoName).getSnapshotIds(), empty());
1157+
} else {
1158+
for (ActionFuture<CreateSnapshotResponse> snapshotFuture : snapshotFutures) {
1159+
assertSuccessful(snapshotFuture);
1160+
}
11501161
}
11511162
}
11521163

server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1685,10 +1685,8 @@ public ClusterState execute(ClusterState currentState) {
16851685
reusedExistingDelete = true;
16861686
return currentState;
16871687
}
1688-
final List<SnapshotId> toDelete = List.copyOf(snapshotIdsRequiringCleanup);
1689-
ensureBelowConcurrencyLimit(repoName, toDelete.get(0).getName(), snapshots, deletionsInProgress);
16901688
newDelete = new SnapshotDeletionsInProgress.Entry(
1691-
toDelete,
1689+
List.copyOf(snapshotIdsRequiringCleanup),
16921690
repoName,
16931691
threadPool.absoluteTimeInMillis(),
16941692
repositoryData.getGenId(),

0 commit comments

Comments
 (0)