From 662bac4da37f47a667bf0c8ef598d9464f15f5ef Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Wed, 28 Jan 2026 20:55:27 +1100 Subject: [PATCH 1/2] Ensure paused shard snapshot can be deleted (#141408) When a shard snapshot is paused due to node shutdown, the associated snapshot can be deleted before the shard snapshot transitions to another state. When this happens, we ensure such a shard snapshot is deleted directly without going back to the data node where it gets incorrectly ignored. (cherry picked from commit a2b9254918174bffb1630161f91640a2c61eca9c) # Conflicts: # server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotShutdownIT.java --- docs/changelog/141408.yaml | 5 ++ .../snapshots/SnapshotShutdownIT.java | 53 +++++++++++++++++++ .../cluster/SnapshotsInProgress.java | 10 ++-- 3 files changed, 62 insertions(+), 6 deletions(-) create mode 100644 docs/changelog/141408.yaml diff --git a/docs/changelog/141408.yaml b/docs/changelog/141408.yaml new file mode 100644 index 0000000000000..f9dd51a77edfa --- /dev/null +++ b/docs/changelog/141408.yaml @@ -0,0 +1,5 @@ +area: Snapshot/Restore +issues: [] +pr: 141408 +summary: Ensure paused shard snapshot can be deleted +type: bug diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotShutdownIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotShutdownIT.java index 71cfa1510a5ae..b8253e31defc2 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotShutdownIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotShutdownIT.java @@ -25,6 +25,7 @@ import org.elasticsearch.cluster.SnapshotDeletionsInProgress; import org.elasticsearch.cluster.SnapshotsInProgress; import org.elasticsearch.cluster.metadata.IndexMetadata; +import org.elasticsearch.cluster.metadata.ProjectId; import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata; import 
org.elasticsearch.cluster.routing.ShardRoutingState; import org.elasticsearch.cluster.service.ClusterService; @@ -51,6 +52,7 @@ import static org.elasticsearch.snapshots.SnapshotShutdownProgressTracker.SNAPSHOT_PROGRESS_DURING_SHUTDOWN_LOG_INTERVAL_SETTING; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.empty; +import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.hasSize; import static org.hamcrest.Matchers.oneOf; @@ -657,6 +659,57 @@ && switch (shardEntry.getValue().state()) { resetMockLog(); } + public void testDeleteSnapshotWithPausedShardSnapshots() throws Exception { + final var originalNode = internalCluster().startDataOnlyNode(); + final var indexName = randomIndexName(); + createIndexWithContent(indexName, indexSettings(1, 0).put(REQUIRE_NODE_NAME_SETTING, originalNode).build()); + + final var repoName = randomRepoName(); + createRepository(repoName, "mock"); + + // Start the snapshot and block it on the data node + final String snapshotName = randomSnapshotName(); + final var snapshotFuture = startFullSnapshotBlockedOnDataNode(snapshotName, repoName, originalNode); + + // Mark data node for shutdown and ensure shard snapshot is paused + final var clusterService = internalCluster().getCurrentMasterNodeInstance(ClusterService.class); + final var shardSnapshotsPausedListener = ClusterServiceUtils.addTemporaryStateListener(clusterService, state -> { + final var snapshotEntry = SnapshotsInProgress.get(state) + .forRepo(ProjectId.DEFAULT, repoName) + .stream() + .filter(entry -> entry.snapshot().getSnapshotId().getName().equals(snapshotName)) + .findFirst() + .orElseThrow(() -> new AssertionError("Snapshot [" + snapshotName + "] not found")); + + return snapshotEntry.shards() + .values() + .stream() + .allMatch(shardSnapshotStatus -> shardSnapshotStatus.state() == SnapshotsInProgress.ShardState.PAUSED_FOR_NODE_REMOVAL); + }); + putShutdownForRemovalMetadata(originalNode, clusterService); + 
unblockAllDataNodes(repoName); + safeAwait(shardSnapshotsPausedListener); + + // Delete the snapshot and ensure it is successfully and clears all snapshot operations from cluster state + final var snapshotClearedListener = ClusterServiceUtils.addTemporaryStateListener( + clusterService, + state -> SnapshotsInProgress.get(state).isEmpty() && SnapshotDeletionsInProgress.get(state).getEntries().isEmpty() + ); + assertTrue(safeGet(startDeleteSnapshot(repoName, snapshotName)).isAcknowledged()); + safeAwait(snapshotClearedListener); + + // Snapshot creation has failed snapshot response + final var createSnapshotResponse = safeGet(snapshotFuture); + assertThat(createSnapshotResponse.getSnapshotInfo().state(), equalTo(SnapshotState.FAILED)); + assertThat(createSnapshotResponse.getSnapshotInfo().reason(), containsString("Snapshot was aborted by deletion")); + + // No snapshot is in the repository + final List snapshotInfos = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, repoName).get().getSnapshots(); + assertThat(snapshotInfos, empty()); + + clearShutdownMetadata(clusterService); + } + private static void addUnassignedShardsWatcher(ClusterService clusterService, String indexName) { ClusterServiceUtils.addTemporaryStateListener(clusterService, state -> { final var indexRoutingTable = state.routingTable().index(indexName); diff --git a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java index 7e70f9a0ab5c1..7f9e699c9631c 100644 --- a/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java +++ b/server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java @@ -1226,12 +1226,10 @@ public Entry abort() { allQueued &= status.state() == ShardState.QUEUED; if (status.state().completed() == false) { final String nodeId = status.nodeId(); - status = new ShardSnapshotStatus( - nodeId, - nodeId == null ? 
ShardState.FAILED : ShardState.ABORTED, - status.generation(), - "aborted by snapshot deletion" - ); + final var newState = (nodeId == null || status.state() == ShardState.PAUSED_FOR_NODE_REMOVAL) + ? ShardState.FAILED + : ShardState.ABORTED; + status = new ShardSnapshotStatus(nodeId, newState, status.generation(), "aborted by snapshot deletion"); } completed &= status.state().completed(); shardsBuilder.put(shardEntry.getKey(), status); From fb9c6b6a3db99ce8e3acdc936eb0b8d3e4e88fec Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Thu, 29 Jan 2026 09:39:43 +1100 Subject: [PATCH 2/2] import --- .../java/org/elasticsearch/snapshots/SnapshotShutdownIT.java | 1 + 1 file changed, 1 insertion(+) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotShutdownIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotShutdownIT.java index b8253e31defc2..79e5563b5d9a8 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotShutdownIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotShutdownIT.java @@ -44,6 +44,7 @@ import org.elasticsearch.test.transport.MockTransportService; import java.util.Collection; +import java.util.List; import java.util.concurrent.CountDownLatch; import java.util.concurrent.CyclicBarrier; import java.util.concurrent.ExecutionException;