Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
import org.elasticsearch.cluster.SnapshotDeletionsInProgress;
import org.elasticsearch.cluster.SnapshotsInProgress;
import org.elasticsearch.cluster.metadata.MetadataDeleteIndexService;
import org.elasticsearch.cluster.metadata.ProjectId;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.concurrent.ListenableFuture;
Expand Down Expand Up @@ -2288,6 +2290,76 @@ private ActionFuture<CreateSnapshotResponse> startFullSnapshotFromMasterClient(S
.execute();
}

/**
 * This test ensures that deleting a snapshot with paused shards works fine when there are shards queued behind it
 * as well as creating new snapshots concurrently. The snapshot state machine should propagate correctly.
 */
public void testConcurrentDeletePausedSnapshotAndCreateSnapshot() throws Exception {
    // One dedicated master (so we can block finalization there) and one data node (so we can pause its shard snapshot).
    final String masterNode = internalCluster().startMasterOnlyNode();
    final String dataNode = internalCluster().startDataOnlyNode();
    ensureStableCluster(2);
    final String repoName = randomRepoName();
    // "mock" repository type supports the block/unblock hooks used below
    createRepository(repoName, "mock");

    final String indexName = randomIndexName();
    // single shard, no replicas: exactly one shard snapshot per snapshot, all on the one data node
    createIndexWithContent(indexName, indexSettingsNoReplicas(1).build());

    final var snap0 = randomSnapshotName();
    final var snap1 = randomSnapshotName();
    final var snap2 = randomSnapshotName();
    final var snap3 = randomSnapshotName();

    // snap0: block finalization on the master so that later deletion stays WAITING
    blockMasterOnWriteIndexFile(repoName);
    final var snap0Future = startFullSnapshot(repoName, snap0);
    waitForBlock(masterNode, repoName);

    // Add data so that snap1's shard snapshot has new segment files to write (allowing data node block)
    indexDoc(indexName, "another_id", "foo", "bar");

    // snap1: block on data node, then pause for node removal
    final var snap1Future = startFullSnapshotBlockedOnDataNode(snap1, repoName, dataNode);

    // Shutdown and unblock the data node so that the shard snapshot moves to PAUSED_FOR_NODE_REMOVAL
    final var clusterService = internalCluster().getCurrentMasterNodeInstance(ClusterService.class);
    putShutdownForRemovalMetadata(dataNode, clusterService);
    unblockNode(repoName, dataNode);
    // Wait until at least one of snap1's shard snapshots has actually reached PAUSED_FOR_NODE_REMOVAL
    // before proceeding, so the later delete really aborts a paused shard.
    awaitClusterState(state -> {
        for (var entry : SnapshotsInProgress.get(state).forRepo(ProjectId.DEFAULT, repoName)) {
            if (entry.snapshot().getSnapshotId().getName().equals(snap1)) {
                for (var shard : entry.shards().values()) {
                    if (shard.state() == SnapshotsInProgress.ShardState.PAUSED_FOR_NODE_REMOVAL) {
                        return true;
                    }
                }
            }
        }
        return false;
    });

    // snap2: shard is QUEUED behind snap1's active PAUSED_FOR_NODE_REMOVAL shard
    final var snap2Future = startFullSnapshot(repoName, snap2);
    // snap0 (finalizing), snap1 (paused) and snap2 (queued) are all in progress now
    awaitNumberOfSnapshotsInProgress(3);

    // Delete snap1: PAUSED_FOR_NODE_REMOVAL → ABORTED in abort(), deletion is WAITING because snap0 is still finalizing
    final ActionFuture<AcknowledgedResponse> deleteFuture = startDeleteSnapshot(repoName, snap1);
    awaitNDeletionsInProgress(1);

    // snap3: start yet another snapshot and its shard correctly gets QUEUED
    final var snap3Future = startFullSnapshot(repoName, snap3);

    // Clean up: clear shutdown so shard snapshots can run, then unblock finalization
    clearShutdownMetadata(clusterService);
    unblockNode(repoName, masterNode);

    // All snapshots except the deleted one should complete successfully
    assertSuccessful(snap0Future);
    // snap1 was aborted by the deletion, so its creation future reports FAILED rather than throwing
    assertThat(snap1Future.get().getSnapshotInfo().state(), is(SnapshotState.FAILED));
    assertThat(deleteFuture.get().isAcknowledged(), is(true));
    assertSuccessful(snap2Future);
    assertSuccessful(snap3Future);
}

private void createIndexWithContent(String indexName, String nodeInclude, String nodeExclude) {
createIndexWithContent(
indexName,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1226,9 +1226,7 @@ public Entry abort() {
allQueued &= status.state() == ShardState.QUEUED;
if (status.state().completed() == false) {
final String nodeId = status.nodeId();
final var newState = (nodeId == null || status.state() == ShardState.PAUSED_FOR_NODE_REMOVAL)
? ShardState.FAILED
: ShardState.ABORTED;
final var newState = nodeId == null ? ShardState.FAILED : ShardState.ABORTED;
status = new ShardSnapshotStatus(nodeId, newState, status.generation(), "aborted by snapshot deletion");
}
completed &= status.state().completed();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,15 @@ public synchronized void moveToFailed(final long endTime, final String failure)
}
}

/**
 * Performs the {@link Stage#PAUSED} → {@link Stage#FAILURE} transition. The earlier
 * PAUSING → PAUSED step already ran the abort listeners and recorded the timing fields,
 * so flipping the stage is all that remains to be done here.
 */
public void moveFromPausedToFailed() {
    final boolean wasPaused = stage.compareAndSet(Stage.PAUSED, Stage.FAILURE);
    assert wasPaused : "expected stage PAUSED but got " + stage.get();
}

public ShardGeneration generation() {
return generation.get();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,19 @@ private void handleUpdatedSnapshotsInProgressEntry(String localNodeId, boolean r
(outcomeInfoString) -> {}
);
}
} else if (snapshotStatus.isPaused()) {
// Shard was paused for node removal then aborted by snapshot deletion.
// abortIfNotCompleted won't transition from PAUSED, so report FAILED directly.
logger.debug("snapshot [{}] is deleted after PAUSED, updating shard snapshot for {} to FAILED", snapshot, sid);
snapshotStatus.moveFromPausedToFailed();
notifyUnsuccessfulSnapshotShard(
snapshot,
sid,
ShardState.FAILED,
shard.getValue().reason(),
shard.getValue().generation(),
(outcomeInfoString) -> {}
);
} else {
snapshotStatus.abortIfNotCompleted("snapshot has been aborted", notifyOnAbortTaskRunner::enqueueTask);
}
Expand Down