Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,19 @@ public void applyClusterState(ClusterChangedEvent event) {
} catch (Exception e) {
logger.warn("Failed to update snapshot state ", e);
}
assert assertConsistentWithClusterState(event.state());
}

private boolean assertConsistentWithClusterState(ClusterState state) {
final SnapshotsInProgress snapshotsInProgress = state.custom(SnapshotsInProgress.TYPE);
if (snapshotsInProgress != null && snapshotsInProgress.entries().isEmpty() == false) {
final Set<Snapshot> runningSnapshots =
snapshotsInProgress.entries().stream().map(SnapshotsInProgress.Entry::snapshot).collect(Collectors.toSet());
final Set<Snapshot> snapshotListenerKeys = snapshotCompletionListeners.keySet();
assert runningSnapshots.containsAll(snapshotListenerKeys) : "Saw completion listeners for unknown snapshots in "
+ snapshotListenerKeys + " but running snapshots are " + runningSnapshots;
}
return true;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
import org.elasticsearch.cluster.ClusterStateListener;
import org.elasticsearch.cluster.ESAllocationTestCase;
import org.elasticsearch.cluster.NodeConnectionsService;
import org.elasticsearch.cluster.SnapshotDeletionsInProgress;
import org.elasticsearch.cluster.SnapshotsInProgress;
import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
import org.elasticsearch.cluster.action.index.NodeMappingRefreshAction;
Expand Down Expand Up @@ -407,6 +408,49 @@ public void testSnapshotWithNodeDisconnects() {
assertThat(snapshotIds, hasSize(1));
}

public void testSnapshotDeleteWithMasterFailOvers() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Failover

final int dataNodes = randomIntBetween(2, 10);
final int masterNodes = randomFrom(3, 5);
setupTestCluster(masterNodes, dataNodes);

String repoName = "repo";
String snapshotName = "snapshot";
final String index = "test";
final int shards = randomIntBetween(1, 10);

final boolean waitForSnapshot = randomBoolean();
final StepListener<CreateSnapshotResponse> createSnapshotResponseStepListener = new StepListener<>();
continueOrDie(createRepoAndIndex(repoName, index, shards), createIndexResponse ->
testClusterNodes.randomMasterNodeSafe().client.admin().cluster().prepareCreateSnapshot(repoName, snapshotName)
.setWaitForCompletion(waitForSnapshot).execute(createSnapshotResponseStepListener));

final AtomicBoolean snapshotDeleteResponded = new AtomicBoolean(false);
continueOrDie(createSnapshotResponseStepListener, createSnapshotResponse -> {
scheduleNow(this::disconnectOrRestartMasterNode);
testClusterNodes.randomDataNodeSafe().client.admin().cluster()
.prepareDeleteSnapshot(repoName, snapshotName).execute(ActionListener.wrap(() -> snapshotDeleteResponded.set(true)));
});

runUntil(() -> testClusterNodes.randomMasterNode().map(master -> {
if (snapshotDeleteResponded.get() == false) {
return false;
}
final SnapshotDeletionsInProgress snapshotDeletionsInProgress =
master.clusterService.state().custom(SnapshotDeletionsInProgress.TYPE);
return snapshotDeletionsInProgress == null || snapshotDeletionsInProgress.getEntries().isEmpty();
}).orElse(false), TimeUnit.MINUTES.toMillis(1L));

clearDisruptionsAndAwaitSync();

final TestClusterNodes.TestClusterNode randomMaster = testClusterNodes.randomMasterNode()
.orElseThrow(() -> new AssertionError("expected to find at least one active master node"));
SnapshotsInProgress finalSnapshotsInProgress = randomMaster.clusterService.state().custom(SnapshotsInProgress.TYPE);
assertThat(finalSnapshotsInProgress.entries(), empty());
final Repository repository = randomMaster.repositoriesService.repository(repoName);
Collection<SnapshotId> snapshotIds = getRepositoryData(repository).getSnapshotIds();
assertThat(snapshotIds, either(hasSize(1)).or(hasSize(0)));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't it be always hasSize(0) when waitForSnapshot is true?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right :) I shouldn't have mindlessly copied that from the concurrent snapshots branch. Thanks for spotting :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As a matter of fact, thanks to recent fixes this is always 0. Even on master fail-over the deletes are now properly retried :) Adjusted tests accordingly. Interestingly enough, this created one strange spot for one, one in a million seed where the cleanup logic would take multiple minutes to complete on the fake threadpool (so it's just fake minutes) but still interesting => that's why I had to up the timeout there.
I'm investigating why that is now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tlrx This was a really stupid bug ... forgot to start the node connections service. This had some strange side effects since it resulted in some transport handlers never failing, causing some CS publications on the failing over master to never complete, causing this test to only move on once the failing master was again removed from the cluster after the 1.5m publication timeout ... behaves much better now.
Should be good for review with 3370c84 now :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM :) Thanks Armin

}

public void testConcurrentSnapshotCreateAndDelete() {
setupTestCluster(randomFrom(1, 3, 5), randomIntBetween(2, 10));

Expand Down