Commit f0d8c78
Fix Inconsistent Shard Failure Count in Failed Snapshots (#51416) (#51426)
* Fix Inconsistent Shard Failure Count in Failed Snapshots

This fix was necessary to allow for the test enhancement below: we were not adding shard failure entries to a failed snapshot for those snapshot entries that were never attempted because the snapshot failed during the init stage and wasn't partial. This caused the never-attempted shards to be counted towards the successful shard count, which seems wrong and broke the repository consistency tests.

Also, this change adjusts the snapshot resiliency tests to run another snapshot at the end of each test run to guarantee that a correct `index.latest` blob exists after each run.

Closes #47550
1 parent bf53ca3 commit f0d8c78
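
To make the accounting issue described above concrete, here is a minimal, hypothetical sketch (simplified names and a standalone class, not the Elasticsearch code path): because the successful-shard count is derived by subtracting the recorded failure entries from the total, a shard that was never attempted but got no failure entry ends up counted as successful.

import java.util.ArrayList;
import java.util.List;

// Hypothetical illustration of the bug described in the commit message, not code from this commit.
public class ShardAccountingSketch {
    public static void main(String[] args) {
        int totalShards = 5;
        List<String> shardFailures = new ArrayList<>();
        shardFailures.add("shard-0: node left");   // the one shard that genuinely failed
        // shards 1 and 2 were never attempted because the snapshot failed during INIT;
        // before the fix, no failure entry was recorded for them
        int successfulShards = totalShards - shardFailures.size();
        // Prints "successful shards: 4" even though only 2 shards actually completed successfully.
        System.out.println("successful shards: " + successfulShards);
    }
}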

File tree

2 files changed: +20 additions, −3 deletions

server/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java

Lines changed: 6 additions & 1 deletion
@@ -1090,8 +1090,13 @@ protected void doRun() {
                 for (ObjectObjectCursor<ShardId, ShardSnapshotStatus> shardStatus : entry.shards()) {
                     ShardId shardId = shardStatus.key;
                     ShardSnapshotStatus status = shardStatus.value;
-                    if (status.state().failed()) {
+                    final ShardState state = status.state();
+                    if (state.failed()) {
                         shardFailures.add(new SnapshotShardFailure(status.nodeId(), shardId, status.reason()));
+                    } else if (state.completed() == false) {
+                        shardFailures.add(new SnapshotShardFailure(status.nodeId(), shardId, "skipped"));
+                    } else {
+                        assert state == ShardState.SUCCESS;
                     }
                 }
                 final ShardGenerations shardGenerations = buildGenerations(entry, metaData);
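
Read on its own, the new branch sorts every shard of the failed snapshot into exactly one of three buckets. A standalone sketch of that classification, using a simplified hypothetical ShardState rather than the Elasticsearch types:

import java.util.ArrayList;
import java.util.List;

// Standalone sketch of the classification added above (hypothetical simplified types, not the
// Elasticsearch classes): every shard of a failed snapshot now contributes either a real failure,
// a synthetic "skipped" failure, or must have completed successfully.
public class ShardFailureClassificationSketch {

    enum ShardState {
        SUCCESS(true, false), FAILED(true, true), INIT(false, false);

        private final boolean completed;
        private final boolean failed;

        ShardState(boolean completed, boolean failed) {
            this.completed = completed;
            this.failed = failed;
        }

        boolean completed() { return completed; }
        boolean failed() { return failed; }
    }

    static List<String> collectFailures(List<ShardState> shards) {
        List<String> failures = new ArrayList<>();
        for (ShardState state : shards) {
            if (state.failed()) {
                failures.add("failed");              // carries the real failure reason
            } else if (state.completed() == false) {
                failures.add("skipped");             // never attempted, e.g. snapshot failed during INIT
            } else {
                assert state == ShardState.SUCCESS;  // only successful shards contribute no entry
            }
        }
        return failures;
    }

    public static void main(String[] args) {
        List<ShardState> shards = List.of(ShardState.SUCCESS, ShardState.FAILED, ShardState.INIT);
        System.out.println(collectFailures(shards)); // prints [failed, skipped]
    }
}

The point of the synthetic "skipped" entry is that shards which were never attempted now show up in shardFailures instead of being implicitly counted as successful.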

server/src/test/java/org/elasticsearch/snapshots/SnapshotResiliencyTests.java

Lines changed: 14 additions & 2 deletions
@@ -216,6 +216,7 @@
 import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.instanceOf;
 import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.iterableWithSize;
 import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.mockito.Mockito.mock;

@@ -249,8 +250,19 @@ public void verifyReposThenStopServices() {
         clearDisruptionsAndAwaitSync();

         final StepListener<CleanupRepositoryResponse> cleanupResponse = new StepListener<>();
-        client().admin().cluster().cleanupRepository(
-            new CleanupRepositoryRequest("repo"), cleanupResponse);
+        final StepListener<CreateSnapshotResponse> createSnapshotResponse = new StepListener<>();
+        // Create another snapshot and then clean up the repository to verify that the repository works correctly no matter the
+        // failures seen during the previous test.
+        client().admin().cluster().prepareCreateSnapshot("repo", "last-snapshot")
+            .setWaitForCompletion(true).execute(createSnapshotResponse);
+        continueOrDie(createSnapshotResponse, r -> {
+            final SnapshotInfo snapshotInfo = r.getSnapshotInfo();
+            // Snapshot can fail because some tests leave indices in a red state because data nodes were stopped
+            assertThat(snapshotInfo.state(), either(is(SnapshotState.SUCCESS)).or(is(SnapshotState.FAILED)));
+            assertThat(snapshotInfo.shardFailures(), iterableWithSize(snapshotInfo.failedShards()));
+            assertThat(snapshotInfo.successfulShards(), is(snapshotInfo.totalShards() - snapshotInfo.failedShards()));
+            client().admin().cluster().cleanupRepository(new CleanupRepositoryRequest("repo"), cleanupResponse);
+        });
         final AtomicBoolean cleanedUp = new AtomicBoolean(false);
         continueOrDie(cleanupResponse, r -> cleanedUp.set(true));
