Skip to content

Commit e3a1f2b

Browse files
Improve Snapshot Abort Behavior (#54256)
This commit improves the behavior of aborting snapshots and by that fixes some extremely rare test failures. Improvements: 1. When aborting a snapshot while it is in the `INIT` stage we do not need to ever delete anything from the repository because nothing is written to the repo during INIT any more (in the past running deletes for these snapshots made sense because we were writing `snap-` and `meta-` blobs during the `INIT` step). 2. Do not try to finalize snapshots that never moved past `INIT`. Same reason as with the first step. If we never moved past `INIT` no data was written to the repo so no need to now write a useless entry for the aborted snapshot to `index-N`. This is especially true, since the reason the snapshot was aborted during `INIT` was a delete call so the useless empty snapshot just added to `index-N` would be removed by the subsequent delete that is still waiting anyway. 3. if after aborting a snapshot we wait for it to finish we should not try deleting it if it failed. If the snapshot failed it means it did not become part of the most recent `RepositoryData` so a delete for it will needlessly fail with a confusing message about that snapshot being missing or concurrent repository modification. I moved to throw the snapshot missing exception here because that seems the most user friendly. This allows the user to simply ignore `404` returns from the delete API when using it to make sure a snapshot is aborted+deleted. Marking this as a non-issue since it doesn't have any negative repercussions other than confusing exceptions on some snapshot aborts. Closes #52843
1 parent 58af49b commit e3a1f2b

File tree

2 files changed

+30
-18
lines changed

2 files changed

+30
-18
lines changed

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -885,7 +885,8 @@ public void finalizeSnapshot(final SnapshotId snapshotId,
885885
final Map<String, Object> userMetadata,
886886
Version repositoryMetaVersion,
887887
final ActionListener<SnapshotInfo> listener) {
888-
888+
assert repositoryStateId > RepositoryData.UNKNOWN_REPO_GEN :
889+
"Must finalize based on a valid repository generation but received [" + repositoryStateId + "]";
889890
final Collection<IndexId> indices = shardGenerations.indices();
890891
// Once we are done writing the updated index-N blob we remove the now unreferenced index-${uuid} blobs in each shard
891892
// directory if all nodes are at least at version SnapshotsService#SHARD_GEN_IN_REPO_DATA_VERSION

server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -106,9 +106,8 @@
106106
* <ul>
107107
* <li>On the master node the {@link #createSnapshot(CreateSnapshotRequest, ActionListener)} is called and makes sure that
108108
* no snapshot is currently running and registers the new snapshot in cluster state</li>
109-
* <li>When cluster state is updated
110-
* the {@link #beginSnapshot} method kicks in and initializes
111-
* the snapshot in the repository and then populates list of shards that needs to be snapshotted in cluster state</li>
109+
* <li>When the cluster state is updated the {@link #beginSnapshot} method kicks in and populates the list of shards that need to be
110+
* snapshotted in cluster state</li>
112111
* <li>Each data node is watching for these shards and when new shards scheduled for snapshotting appear in the cluster state, data nodes
113112
* start processing them through {@link SnapshotShardsService#startNewSnapshots} method</li>
114113
* <li>Once shard snapshot is created data node updates state of the shard in the cluster state using
@@ -1006,10 +1005,16 @@ private void endSnapshot(SnapshotsInProgress.Entry entry, MetaData metaData) {
10061005
if (endingSnapshots.add(entry.snapshot()) == false) {
10071006
return;
10081007
}
1008+
final Snapshot snapshot = entry.snapshot();
1009+
if (entry.repositoryStateId() == RepositoryData.UNKNOWN_REPO_GEN) {
1010+
logger.debug("[{}] was aborted before starting", snapshot);
1011+
removeSnapshotFromClusterState(entry.snapshot(), null,
1012+
new SnapshotException(snapshot, "Aborted on initialization"));
1013+
return;
1014+
}
10091015
threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(new AbstractRunnable() {
10101016
@Override
10111017
protected void doRun() {
1012-
final Snapshot snapshot = entry.snapshot();
10131018
final Repository repository = repositoriesService.repository(snapshot.getRepository());
10141019
final String failure = entry.failure();
10151020
logger.trace("[{}] finalizing snapshot in repository, state: [{}], failure[{}]", snapshot, entry.state(), failure);
@@ -1205,12 +1210,15 @@ public void deleteSnapshot(final String repositoryName, final String snapshotNam
12051210
*/
12061211
private void deleteSnapshot(final Snapshot snapshot, final ActionListener<Void> listener, final long repositoryStateId,
12071212
final boolean immediatePriority) {
1208-
logger.info("deleting snapshot [{}]", snapshot);
12091213
Priority priority = immediatePriority ? Priority.IMMEDIATE : Priority.NORMAL;
1214+
logger.info("deleting snapshot [{}] assuming repository generation [{}] and with priority [{}]",
1215+
snapshot, repositoryStateId, priority);
12101216
clusterService.submitStateUpdateTask("delete snapshot", new ClusterStateUpdateTask(priority) {
12111217

12121218
boolean waitForSnapshot = false;
12131219

1220+
boolean abortedDuringInit = false;
1221+
12141222
@Override
12151223
public ClusterState execute(ClusterState currentState) {
12161224
SnapshotDeletionsInProgress deletionsInProgress = currentState.custom(SnapshotDeletionsInProgress.TYPE);
@@ -1270,6 +1278,7 @@ public ClusterState execute(ClusterState currentState) {
12701278
shards = snapshotEntry.shards();
12711279
assert shards.isEmpty();
12721280
failure = "Snapshot was aborted during initialization";
1281+
abortedDuringInit = true;
12731282
} else if (state == State.STARTED) {
12741283
// snapshot is started - mark every non completed shard as aborted
12751284
final ImmutableOpenMap.Builder<ShardId, ShardSnapshotStatus> shardsBuilder = ImmutableOpenMap.builder();
@@ -1334,19 +1343,21 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS
13341343
);
13351344
},
13361345
e -> {
1337-
logger.warn("deleted snapshot failed - deleting files", e);
1338-
threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(() -> {
1339-
try {
1340-
deleteSnapshot(snapshot.getRepository(), snapshot.getSnapshotId().getName(), listener, true);
1341-
} catch (SnapshotMissingException smex) {
1342-
logger.info(() -> new ParameterizedMessage(
1343-
"Tried deleting in-progress snapshot [{}], but it could not be found after failing to abort.",
1344-
smex.getSnapshotName()), e);
1345-
listener.onFailure(new SnapshotException(snapshot,
1346-
"Tried deleting in-progress snapshot [" + smex.getSnapshotName() + "], but it " +
1347-
"could not be found after failing to abort.", smex));
1346+
if (abortedDuringInit) {
1347+
logger.debug(() -> new ParameterizedMessage("Snapshot [{}] was aborted during INIT", snapshot), e);
1348+
listener.onResponse(null);
1349+
} else {
1350+
if (ExceptionsHelper.unwrap(e, NotMasterException.class, FailedToCommitClusterStateException.class)
1351+
!= null) {
1352+
logger.warn("master failover before deleted snapshot could complete", e);
1353+
// Just pass the exception to the transport handler as is so it is retried on the new master
1354+
listener.onFailure(e);
1355+
} else {
1356+
logger.warn("deleted snapshot failed", e);
1357+
listener.onFailure(
1358+
new SnapshotMissingException(snapshot.getRepository(), snapshot.getSnapshotId(), e));
13481359
}
1349-
});
1360+
}
13501361
}
13511362
));
13521363
} else {

0 commit comments

Comments
 (0)