-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Guarantee that translog generations are seqNo conflict free #24825
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
01ea4a9
1d88441
0da7ded
6e3876a
07d0076
e164882
3dc649d
0dbfc3e
ec14051
2c84f2c
c3945b5
d3abba9
9aac1f7
797b11c
d2a12c2
496d1c0
0ae4d11
984a03a
2ee3508
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -446,6 +446,7 @@ public WriteReplicaResult<BulkShardRequest> shardOperationOnReplica(BulkShardReq | |
|
|
||
| public static Translog.Location performOnReplica(BulkShardRequest request, IndexShard replica) throws Exception { | ||
| Translog.Location location = null; | ||
| final long primaryTerm = request.primaryTerm(); | ||
| for (int i = 0; i < request.items().length; i++) { | ||
| BulkItemRequest item = request.items()[i]; | ||
| final Engine.Result operationResult; | ||
|
|
@@ -457,10 +458,12 @@ public static Translog.Location performOnReplica(BulkShardRequest request, Index | |
| switch (docWriteRequest.opType()) { | ||
| case CREATE: | ||
| case INDEX: | ||
| operationResult = executeIndexRequestOnReplica(primaryResponse, (IndexRequest) docWriteRequest, replica); | ||
| operationResult = | ||
| executeIndexRequestOnReplica(primaryResponse, (IndexRequest) docWriteRequest, primaryTerm, replica); | ||
| break; | ||
| case DELETE: | ||
| operationResult = executeDeleteRequestOnReplica(primaryResponse, (DeleteRequest) docWriteRequest, replica); | ||
| operationResult = | ||
| executeDeleteRequestOnReplica(primaryResponse, (DeleteRequest) docWriteRequest, primaryTerm, replica); | ||
| break; | ||
| default: | ||
| throw new IllegalStateException("Unexpected request operation type on replica: " | ||
|
|
@@ -529,13 +532,13 @@ private static Translog.Location locationToSync(Translog.Location current, | |
| * {@link RetryOnReplicaException} if the operation needs to be re-tried. | ||
| */ | ||
| private static Engine.IndexResult executeIndexRequestOnReplica( | ||
| DocWriteResponse primaryResponse, | ||
| IndexRequest request, | ||
| IndexShard replica) throws IOException { | ||
| DocWriteResponse primaryResponse, | ||
| IndexRequest request, | ||
| long primaryTerm, IndexShard replica) throws IOException { | ||
|
|
||
| final Engine.Index operation; | ||
| try { | ||
| operation = prepareIndexOperationOnReplica(primaryResponse, request, replica); | ||
| operation = prepareIndexOperationOnReplica(primaryResponse, request, primaryTerm, replica); | ||
| } catch (MapperParsingException e) { | ||
| return new Engine.IndexResult(e, primaryResponse.getVersion(), primaryResponse.getSeqNo()); | ||
| } | ||
|
|
@@ -553,6 +556,7 @@ private static Engine.IndexResult executeIndexRequestOnReplica( | |
| static Engine.Index prepareIndexOperationOnReplica( | ||
| DocWriteResponse primaryResponse, | ||
| IndexRequest request, | ||
| long primaryTerm, | ||
| IndexShard replica) { | ||
|
|
||
| final ShardId shardId = replica.shardId(); | ||
|
|
@@ -565,7 +569,7 @@ static Engine.Index prepareIndexOperationOnReplica( | |
| final VersionType versionType = request.versionType().versionTypeForReplicationAndRecovery(); | ||
| assert versionType.validateVersionForWrites(version); | ||
|
|
||
| return replica.prepareIndexOnReplica(sourceToParse, seqNo, version, versionType, | ||
| return replica.prepareIndexOnReplica(sourceToParse, seqNo, primaryTerm, version, versionType, | ||
| request.getAutoGeneratedTimestamp(), request.isRetry()); | ||
| } | ||
|
|
||
|
|
@@ -647,7 +651,7 @@ private static Engine.DeleteResult executeDeleteRequestOnPrimary(DeleteRequest r | |
| } | ||
|
|
||
| private static Engine.DeleteResult executeDeleteRequestOnReplica(DocWriteResponse primaryResponse, DeleteRequest request, | ||
| IndexShard replica) throws Exception { | ||
| final long primaryTerm, IndexShard replica) throws Exception { | ||
| if (replica.indexSettings().isSingleType()) { | ||
| // We need to wait for the replica to have the mappings | ||
| Mapping update; | ||
|
|
@@ -667,7 +671,7 @@ private static Engine.DeleteResult executeDeleteRequestOnReplica(DocWriteRespons | |
| final long version = primaryResponse.getVersion(); | ||
| assert versionType.validateVersionForWrites(version); | ||
| final Engine.Delete delete = replica.prepareDeleteOnReplica(request.type(), request.id(), | ||
| primaryResponse.getSeqNo(), request.primaryTerm(), version, versionType); | ||
| primaryResponse.getSeqNo(), primaryTerm, version, versionType); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's trappy that the IndexRequest/DeleteRequest object might not have a primaryTerm properly set if it's wrapped in a BulkShardRequest. Maybe we could override |
||
| return replica.delete(delete); | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -522,10 +522,11 @@ public Engine.Index prepareIndexOnPrimary(SourceToParse source, long version, Ve | |
| } | ||
| } | ||
|
|
||
| public Engine.Index prepareIndexOnReplica(SourceToParse source, long seqNo, long version, VersionType versionType, long autoGeneratedIdTimestamp, | ||
| boolean isRetry) { | ||
| public Engine.Index prepareIndexOnReplica(SourceToParse source, long seqNo, long primaryTerm, long version, VersionType versionType, | ||
|
||
| long autoGeneratedIdTimestamp, boolean isRetry) { | ||
| try { | ||
| verifyReplicationTarget(); | ||
| assert primaryTerm == this.primaryTerm : "op term [ " + primaryTerm + " ] != shard term [" + this.primaryTerm + "]"; | ||
|
||
| return prepareIndex(docMapper(source.type()), source, seqNo, primaryTerm, version, versionType, | ||
| Engine.Operation.Origin.REPLICA, autoGeneratedIdTimestamp, isRetry); | ||
| } catch (Exception e) { | ||
|
|
@@ -597,6 +598,7 @@ public Engine.Delete prepareDeleteOnPrimary(String type, String id, long version | |
| public Engine.Delete prepareDeleteOnReplica(String type, String id, long seqNo, long primaryTerm, | ||
| long version, VersionType versionType) { | ||
| verifyReplicationTarget(); | ||
| assert primaryTerm == this.primaryTerm : "op term [ " + primaryTerm + " ] != shard term [" + this.primaryTerm + "]"; | ||
|
||
| final Term uid = extractUidForDelete(type, id); | ||
| return prepareDelete(type, id, uid, seqNo, primaryTerm, version, versionType, Engine.Operation.Origin.REPLICA); | ||
| } | ||
|
|
@@ -1879,8 +1881,9 @@ public void acquireReplicaOperationPermit( | |
| indexShardOperationPermits.blockOperations(30, TimeUnit.MINUTES, () -> { | ||
| assert operationPrimaryTerm > primaryTerm; | ||
|
||
| primaryTerm = operationPrimaryTerm; | ||
| getEngine().getTranslog().rollGeneration(); | ||
| }); | ||
| } catch (final InterruptedException | TimeoutException e) { | ||
| } catch (final InterruptedException | TimeoutException | IOException | AlreadyClosedException e) { | ||
|
||
| onPermitAcquired.onFailure(e); | ||
| return; | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -910,11 +910,11 @@ public Index(Engine.Index index, Engine.IndexResult indexResult) { | |
| this.autoGeneratedIdTimestamp = index.getAutoGeneratedIdTimestamp(); | ||
| } | ||
|
|
||
| public Index(String type, String id, byte[] source) { | ||
| public Index(String type, String id, long seqNo, byte[] source) { | ||
| this.type = type; | ||
| this.id = id; | ||
| this.source = new BytesArray(source); | ||
| this.seqNo = 0; | ||
| this.seqNo = seqNo; | ||
| version = Versions.MATCH_ANY; | ||
| versionType = VersionType.INTERNAL; | ||
| routing = null; | ||
|
|
@@ -1037,9 +1037,11 @@ public int hashCode() { | |
| @Override | ||
| public String toString() { | ||
| return "Index{" + | ||
| "id='" + id + '\'' + | ||
| ", type='" + type + '\'' + | ||
| '}'; | ||
| "id='" + id + '\'' + | ||
| ", type='" + type + '\'' + | ||
| ", seqNo=" + seqNo + | ||
| ", primaryTerm=" + primaryTerm + | ||
| '}'; | ||
| } | ||
|
|
||
| public long getAutoGeneratedIdTimestamp() { | ||
|
|
@@ -1079,8 +1081,8 @@ public Delete(Engine.Delete delete, Engine.DeleteResult deleteResult) { | |
| } | ||
|
|
||
| /** utility for testing */ | ||
| public Delete(String type, String id, Term uid) { | ||
| this(type, id, uid, 0, 0, Versions.MATCH_ANY, VersionType.INTERNAL); | ||
| public Delete(String type, String id, long seqNo, Term uid) { | ||
| this(type, id, uid, seqNo, 0, Versions.MATCH_ANY, VersionType.INTERNAL); | ||
| } | ||
|
|
||
| public Delete(String type, String id, Term uid, long seqNo, long primaryTerm, long version, VersionType versionType) { | ||
|
|
@@ -1180,10 +1182,11 @@ public int hashCode() { | |
| @Override | ||
| public String toString() { | ||
| return "Delete{" + | ||
| "uid=" + uid + | ||
| '}'; | ||
| "uid=" + uid + | ||
| ", seqNo=" + seqNo + | ||
| ", primaryTerm=" + primaryTerm + | ||
| '}'; | ||
| } | ||
|
|
||
| } | ||
|
|
||
| public static class NoOp implements Operation { | ||
|
|
@@ -1260,9 +1263,16 @@ public int hashCode() { | |
| return 31 * 31 * 31 + 31 * 31 * Long.hashCode(seqNo) + 31 * Long.hashCode(primaryTerm) + reason().hashCode(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This hashCode looks odd...
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Agreed. I generated a new one. |
||
| } | ||
|
|
||
| @Override | ||
| public String toString() { | ||
| return "NoOp{" + | ||
| "seqNo=" + seqNo + | ||
| ", primaryTerm=" + primaryTerm + | ||
| ", reason='" + reason + '\'' + | ||
| '}'; | ||
| } | ||
| } | ||
|
|
||
|
|
||
| public enum Durability { | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,7 +24,9 @@ | |
| import org.apache.lucene.store.OutputStreamDataOutput; | ||
| import org.apache.lucene.util.BytesRef; | ||
| import org.apache.lucene.util.IOUtils; | ||
| import org.elasticsearch.common.bytes.BytesArray; | ||
| import org.elasticsearch.common.bytes.BytesReference; | ||
| import org.elasticsearch.common.collect.Tuple; | ||
| import org.elasticsearch.common.io.Channels; | ||
| import org.elasticsearch.common.unit.ByteSizeValue; | ||
| import org.elasticsearch.index.seqno.SequenceNumbers; | ||
|
|
@@ -39,6 +41,8 @@ | |
| import java.nio.channels.FileChannel; | ||
| import java.nio.file.Path; | ||
| import java.nio.file.StandardOpenOption; | ||
| import java.util.HashMap; | ||
| import java.util.Map; | ||
| import java.util.concurrent.atomic.AtomicBoolean; | ||
| import java.util.function.LongSupplier; | ||
|
|
||
|
|
@@ -71,6 +75,8 @@ public class TranslogWriter extends BaseTranslogReader implements Closeable { | |
| // lock order synchronized(syncLock) -> synchronized(this) | ||
| private final Object syncLock = new Object(); | ||
|
|
||
| private final Map<Long, Tuple<BytesReference, Exception>> seenSequenceNumbers; | ||
|
|
||
| private TranslogWriter( | ||
| final ChannelFactory channelFactory, | ||
| final ShardId shardId, | ||
|
|
@@ -90,6 +96,13 @@ private TranslogWriter( | |
| assert initialCheckpoint.maxSeqNo == SequenceNumbersService.NO_OPS_PERFORMED : initialCheckpoint.maxSeqNo; | ||
| this.maxSeqNo = initialCheckpoint.maxSeqNo; | ||
| this.globalCheckpointSupplier = globalCheckpointSupplier; | ||
| boolean assertionsEnabled = false; | ||
|
||
| assert assertionsEnabled = true; | ||
| if (assertionsEnabled) { | ||
| seenSequenceNumbers = new HashMap<>(); | ||
| } else { | ||
| seenSequenceNumbers = null; | ||
| } | ||
| } | ||
|
|
||
| static int getHeaderLength(String translogUUID) { | ||
|
|
@@ -195,9 +208,30 @@ public synchronized Translog.Location add(final BytesReference data, final long | |
|
|
||
| operationCounter++; | ||
|
|
||
| assert assertSeqNoNotSeen(seqNo, data); | ||
|
|
||
| return new Translog.Location(generation, offset, data.length()); | ||
| } | ||
|
|
||
| private boolean assertSeqNoNotSeen(long seqNo, BytesReference data) throws IOException { | ||
|
||
| if (seqNo == SequenceNumbersService.UNASSIGNED_SEQ_NO) { | ||
| // nothing to do | ||
| } else if (seenSequenceNumbers.containsKey(seqNo)) { | ||
| final Tuple<BytesReference, Exception> previous = seenSequenceNumbers.get(seqNo); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why not
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I did it this way so that it can easily be integrated into a "caused by" clause and have a proper message. Do you have a better suggestion there? |
||
| if (previous.v1().equals(data) == false) { | ||
| Translog.Operation newOp = Translog.readOperation(new BufferedChecksumStreamInput(data.streamInput())); | ||
| Translog.Operation prvOp = Translog.readOperation(new BufferedChecksumStreamInput(previous.v1().streamInput())); | ||
| throw new AssertionError( | ||
| "seqNo [" + seqNo + "] was processed twice in generation [" + generation + "], with different data. " + | ||
| "prvOp [" + prvOp + "], newOp [" + newOp + "]", previous.v2()); | ||
| } | ||
| } else { | ||
| seenSequenceNumbers.put(seqNo, | ||
| new Tuple<>(new BytesArray(data.toBytesRef(), true), new RuntimeException("stack capture previous op"))); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why not |
||
| } | ||
| return true; | ||
| } | ||
|
|
||
| /** | ||
| * write all buffered ops to disk and fsync file. | ||
| * | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you maybe un-indent this into a single line?