Skip to content

Commit 91e1ff0

Browse files
committed
Handle translog missing while preparing recovery
If a file-based recovery completes phase one successfully but a network partition happens before the translog is opened, then during the retry loop the recovery target will attempt a sequence-number-based recovery because the index files are present. However, as the translog was never opened, it will be missing on disk, leading to a no-such-file exception while preparing for the sequence-number-based recovery. We should not let this fail the recovery, but instead proceed to attempt another file-based recovery.
1 parent b9200cf commit 91e1ff0

File tree

1 file changed

+28
-9
lines changed

1 file changed

+28
-9
lines changed

core/src/main/java/org/elasticsearch/indices/recovery/PeerRecoveryTargetService.java

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import org.elasticsearch.index.IndexNotFoundException;
4242
import org.elasticsearch.index.engine.RecoveryEngineException;
4343
import org.elasticsearch.index.mapper.MapperException;
44+
import org.elasticsearch.index.seqno.SequenceNumbers;
4445
import org.elasticsearch.index.seqno.SequenceNumbersService;
4546
import org.elasticsearch.index.shard.IllegalIndexShardStateException;
4647
import org.elasticsearch.index.shard.IndexEventListener;
@@ -338,16 +339,20 @@ private Optional<StartRecoveryRequest> getStartRecoveryRequest(final RecoveryTar
338339
final long startingSeqNo;
339340
if (metadataSnapshot.get().size() > 0) {
340341
startingSeqNo = getStartingSeqNo(recoveryTarget);
341-
logger.trace(
342-
"{} preparing for sequence number-based recovery starting at local checkpoint [{}] from [{}]",
343-
recoveryTarget.shardId(),
344-
startingSeqNo,
345-
recoveryTarget.sourceNode());
346342
} else {
347-
logger.trace("{} preparing for file-based recovery from [{}]", recoveryTarget.shardId(), recoveryTarget.sourceNode());
348343
startingSeqNo = SequenceNumbersService.UNASSIGNED_SEQ_NO;
349344
}
350345

346+
if (startingSeqNo == SequenceNumbersService.UNASSIGNED_SEQ_NO) {
347+
logger.trace("{} preparing for file-based recovery from [{}]", recoveryTarget.shardId(), recoveryTarget.sourceNode());
348+
} else {
349+
logger.trace(
350+
"{} preparing for sequence number-based recovery starting at local checkpoint [{}] from [{}]",
351+
recoveryTarget.shardId(),
352+
startingSeqNo,
353+
recoveryTarget.sourceNode());
354+
}
355+
351356
logger.trace("{} preparing shard for peer recovery", recoveryTarget.shardId());
352357
recoveryTarget.indexShard().prepareForIndexRecovery();
353358

@@ -370,9 +375,23 @@ private Optional<StartRecoveryRequest> getStartRecoveryRequest(final RecoveryTar
370375
return Optional.of(request);
371376
}
372377

373-
public static long getStartingSeqNo(RecoveryTarget recoveryTarget) throws IOException {
374-
final long globalCheckpoint = Translog.readGlobalCheckpoint(recoveryTarget.indexShard().shardPath().resolveTranslog());
375-
return recoveryTarget.store().loadSeqNoStats(globalCheckpoint).getLocalCheckpoint() + 1;
378+
/**
379+
* Get the starting sequence number for a sequence-number-based recovery request.
380+
*
381+
* @param recoveryTarget the target of the recovery
382+
* @return the starting sequence number or {@link SequenceNumbersService#UNASSIGNED_SEQ_NO} if obtaining the starting sequence number
383+
* failed, in which case the caller prepares for a file-based recovery instead
384+
*/
385+
public static long getStartingSeqNo(final RecoveryTarget recoveryTarget) {
386+
try {
387+
final long globalCheckpoint = Translog.readGlobalCheckpoint(recoveryTarget.indexShard().shardPath().resolveTranslog());
388+
return recoveryTarget.store().loadSeqNoStats(globalCheckpoint).getLocalCheckpoint() + 1;
389+
} catch (final IOException e) {
390+
// this can happen, for example, if phase one of the recovery completed successfully but a network partition happened before the
391+
// translog on the recovery target was opened; the recovery then enters a retry loop and, seeing that the index files are now on
392+
// disk, proceeds to attempt a sequence-number-based recovery even though the translog is missing
393+
return SequenceNumbersService.UNASSIGNED_SEQ_NO;
394+
}
376395
}
377396

378397
public interface RecoveryListener {

0 commit comments

Comments
 (0)