Skip to content

Commit 4d543a5

Browse files
Add Snapshot Resiliency Test for Master Failover during Delete (#54866) (#55456)
* Add Snapshot Resiliency Test for Master Failover during Delete We only have very indirect coverage of master failovers during snaphot delete at the moment. This comment adds a direct test of this scenario and also an assertion that makes sure we are not leaking any snapshot completion listeners in the snapshots service in this scenario. This gives us better coverage of scenarios like #54256 and makes the diff to the upcoming more consistent snapshot delete implementation in #54705 smaller.
1 parent dc703d7 commit 4d543a5

File tree

2 files changed

+65
-4
lines changed

2 files changed

+65
-4
lines changed

server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,19 @@ public void applyClusterState(ClusterChangedEvent event) {
624624
} catch (Exception e) {
625625
logger.warn("Failed to update snapshot state ", e);
626626
}
627+
assert assertConsistentWithClusterState(event.state());
628+
}
629+
630+
private boolean assertConsistentWithClusterState(ClusterState state) {
631+
final SnapshotsInProgress snapshotsInProgress = state.custom(SnapshotsInProgress.TYPE);
632+
if (snapshotsInProgress != null && snapshotsInProgress.entries().isEmpty() == false) {
633+
final Set<Snapshot> runningSnapshots =
634+
snapshotsInProgress.entries().stream().map(SnapshotsInProgress.Entry::snapshot).collect(Collectors.toSet());
635+
final Set<Snapshot> snapshotListenerKeys = snapshotCompletionListeners.keySet();
636+
assert runningSnapshots.containsAll(snapshotListenerKeys) : "Saw completion listeners for unknown snapshots in "
637+
+ snapshotListenerKeys + " but running snapshots are " + runningSnapshots;
638+
}
639+
return true;
627640
}
628641

629642
/**

server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java

Lines changed: 52 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@
9696
import org.elasticsearch.cluster.ClusterStateListener;
9797
import org.elasticsearch.cluster.ESAllocationTestCase;
9898
import org.elasticsearch.cluster.NodeConnectionsService;
99+
import org.elasticsearch.cluster.SnapshotDeletionsInProgress;
99100
import org.elasticsearch.cluster.SnapshotsInProgress;
100101
import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
101102
import org.elasticsearch.cluster.action.index.NodeMappingRefreshAction;
@@ -411,6 +412,49 @@ public void testSnapshotWithNodeDisconnects() {
411412
assertThat(snapshotIds, hasSize(1));
412413
}
413414

415+
public void testSnapshotDeleteWithMasterFailover() {
416+
final int dataNodes = randomIntBetween(2, 10);
417+
final int masterNodes = randomFrom(3, 5);
418+
setupTestCluster(masterNodes, dataNodes);
419+
420+
String repoName = "repo";
421+
String snapshotName = "snapshot";
422+
final String index = "test";
423+
final int shards = randomIntBetween(1, 10);
424+
425+
final boolean waitForSnapshot = randomBoolean();
426+
final StepListener<CreateSnapshotResponse> createSnapshotResponseStepListener = new StepListener<>();
427+
continueOrDie(createRepoAndIndex(repoName, index, shards), createIndexResponse ->
428+
testClusterNodes.randomMasterNodeSafe().client.admin().cluster().prepareCreateSnapshot(repoName, snapshotName)
429+
.setWaitForCompletion(waitForSnapshot).execute(createSnapshotResponseStepListener));
430+
431+
final AtomicBoolean snapshotDeleteResponded = new AtomicBoolean(false);
432+
continueOrDie(createSnapshotResponseStepListener, createSnapshotResponse -> {
433+
scheduleNow(this::disconnectOrRestartMasterNode);
434+
testClusterNodes.randomDataNodeSafe().client.admin().cluster()
435+
.prepareDeleteSnapshot(repoName, snapshotName).execute(ActionListener.wrap(() -> snapshotDeleteResponded.set(true)));
436+
});
437+
438+
runUntil(() -> testClusterNodes.randomMasterNode().map(master -> {
439+
if (snapshotDeleteResponded.get() == false) {
440+
return false;
441+
}
442+
final SnapshotDeletionsInProgress snapshotDeletionsInProgress =
443+
master.clusterService.state().custom(SnapshotDeletionsInProgress.TYPE);
444+
return snapshotDeletionsInProgress == null || snapshotDeletionsInProgress.getEntries().isEmpty();
445+
}).orElse(false), TimeUnit.MINUTES.toMillis(1L));
446+
447+
clearDisruptionsAndAwaitSync();
448+
449+
final TestClusterNodes.TestClusterNode randomMaster = testClusterNodes.randomMasterNode()
450+
.orElseThrow(() -> new AssertionError("expected to find at least one active master node"));
451+
SnapshotsInProgress finalSnapshotsInProgress = randomMaster.clusterService.state().custom(SnapshotsInProgress.TYPE);
452+
assertThat(finalSnapshotsInProgress.entries(), empty());
453+
final Repository repository = randomMaster.repositoriesService.repository(repoName);
454+
Collection<SnapshotId> snapshotIds = getRepositoryData(repository).getSnapshotIds();
455+
assertThat(snapshotIds, hasSize(0));
456+
}
457+
414458
public void testConcurrentSnapshotCreateAndDelete() {
415459
setupTestCluster(randomFrom(1, 3, 5), randomIntBetween(2, 10));
416460

@@ -1159,6 +1203,8 @@ private final class TestClusterNode {
11591203

11601204
private final ClusterService clusterService;
11611205

1206+
private final NodeConnectionsService nodeConnectionsService;
1207+
11621208
private final RepositoriesService repositoriesService;
11631209

11641210
private final SnapshotsService snapshotsService;
@@ -1310,6 +1356,8 @@ public void onFailure(final Exception e) {
13101356
new BatchedRerouteService(clusterService, allocationService::reroute),
13111357
threadPool
13121358
);
1359+
nodeConnectionsService =
1360+
new NodeConnectionsService(clusterService.getSettings(), threadPool, transportService);
13131361
final MetadataMappingService metadataMappingService = new MetadataMappingService(clusterService, indicesService);
13141362
indicesClusterStateService = new IndicesClusterStateService(
13151363
settings,
@@ -1467,6 +1515,7 @@ public void stop() {
14671515
testClusterNodes.disconnectNode(this);
14681516
indicesService.close();
14691517
clusterService.close();
1518+
nodeConnectionsService.stop();
14701519
indicesClusterStateService.close();
14711520
if (coordinator != null) {
14721521
coordinator.close();
@@ -1491,10 +1540,9 @@ public void start(ClusterState initialState) {
14911540
new BatchedRerouteService(clusterService, allocationService::reroute), ElectionStrategy.DEFAULT_INSTANCE);
14921541
masterService.setClusterStatePublisher(coordinator);
14931542
coordinator.start();
1494-
masterService.start();
1495-
clusterService.getClusterApplierService().setNodeConnectionsService(
1496-
new NodeConnectionsService(clusterService.getSettings(), threadPool, transportService));
1497-
clusterService.getClusterApplierService().start();
1543+
clusterService.getClusterApplierService().setNodeConnectionsService(nodeConnectionsService);
1544+
nodeConnectionsService.start();
1545+
clusterService.start();
14981546
indicesService.start();
14991547
indicesClusterStateService.start();
15001548
coordinator.startInitialJoin();

0 commit comments

Comments
 (0)