elastic · jasontedor · Dec 17, 2016 · Dec 15, 2016 · Dec 16, 2016 · Dec 16, 2016
diff --git a/core/src/main/java/org/elasticsearch/action/bulk/TransportShardBulkAction.java b/core/src/main/java/org/elasticsearch/action/bulk/TransportShardBulkAction.java
@@ -50,6 +50,7 @@
 import org.elasticsearch.index.engine.EngineClosedException;
 import org.elasticsearch.index.engine.VersionConflictEngineException;
 import org.elasticsearch.index.mapper.MapperParsingException;
+import org.elasticsearch.index.seqno.GlobalCheckpointSyncAction;
 import org.elasticsearch.index.seqno.SequenceNumbersService;
 import org.elasticsearch.index.shard.IndexShard;
 import org.elasticsearch.index.shard.IndexShardClosedException;
@@ -150,6 +151,7 @@ private Translog.Location executeBulkItemRequest(IndexMetaData metaData, IndexSh
                         final long version = indexResult.getVersion();
                         indexRequest.version(version);
                         indexRequest.versionType(indexRequest.versionType().versionTypeForReplicationAndRecovery());
+                        indexRequest.seqNo(indexResult.getSeqNo());
                         assert indexRequest.versionType().validateVersionForWrites(indexRequest.version());
                         response = new IndexResponse(primary.shardId(), indexRequest.type(), indexRequest.id(), indexResult.getSeqNo(),
                             indexResult.getVersion(), indexResult.isCreated());
@@ -173,6 +175,7 @@ private Translog.Location executeBulkItemRequest(IndexMetaData metaData, IndexSh
                         // update the request with the version so it will go to the replicas
                         deleteRequest.versionType(deleteRequest.versionType().versionTypeForReplicationAndRecovery());
                         deleteRequest.version(deleteResult.getVersion());
+                        deleteRequest.seqNo(deleteResult.getSeqNo());
                         assert deleteRequest.versionType().validateVersionForWrites(deleteRequest.version());
                         response = new DeleteResponse(request.shardId(), deleteRequest.type(), deleteRequest.id(), deleteResult.getSeqNo(),
                             deleteResult.getVersion(), deleteResult.isFound());
@@ -182,6 +185,7 @@ private Translog.Location executeBulkItemRequest(IndexMetaData metaData, IndexSh
                     break;
                 default: throw new IllegalStateException("unexpected opType [" + itemRequest.opType() + "] found");
             }
+
             // update the bulk item request because update request execution can mutate the bulk item request
             request.items()[requestIndex] = replicaRequest;
             if (operationResult == null) { // in case of noop update operation
@@ -282,6 +286,7 @@ private UpdateResultHolder executeUpdateRequest(UpdateRequest updateRequest, Ind
                         final long version = updateOperationResult.getVersion();
                         indexRequest.version(version);
                         indexRequest.versionType(indexRequest.versionType().versionTypeForReplicationAndRecovery());
+                        indexRequest.seqNo(updateOperationResult.getSeqNo());
                         assert indexRequest.versionType().validateVersionForWrites(indexRequest.version());
                     }
                     break;
@@ -292,6 +297,7 @@ private UpdateResultHolder executeUpdateRequest(UpdateRequest updateRequest, Ind
                         // update the request with the version so it will go to the replicas
                         deleteRequest.versionType(deleteRequest.versionType().versionTypeForReplicationAndRecovery());
                         deleteRequest.version(updateOperationResult.getVersion());
+                        deleteRequest.seqNo(updateOperationResult.getSeqNo());
                         assert deleteRequest.versionType().validateVersionForWrites(deleteRequest.version());
                     }
                     break;
@@ -342,6 +348,10 @@ private UpdateResultHolder executeUpdateRequest(UpdateRequest updateRequest, Ind
                         replicaRequest = new BulkItemRequest(request.items()[requestIndex].id(), updateDeleteRequest);
                         break;
                 }
+                assert (replicaRequest.request() instanceof IndexRequest
+                    && ((IndexRequest) replicaRequest.request()).seqNo() != SequenceNumbersService.UNASSIGNED_SEQ_NO) ||
+                    (replicaRequest.request() instanceof DeleteRequest
+                        && ((DeleteRequest) replicaRequest.request()).seqNo() != SequenceNumbersService.UNASSIGNED_SEQ_NO);
                 // successful operation
                 break; // out of retry loop
             } else if (updateOperationResult.getFailure() instanceof VersionConflictEngineException == false) {
@@ -364,10 +374,10 @@ protected WriteReplicaResult shardOperationOnReplica(BulkShardRequest request, I
                     switch (docWriteRequest.opType()) {
                         case CREATE:
                         case INDEX:
-                            operationResult = executeIndexRequestOnReplica(((IndexRequest) docWriteRequest), replica);
+                            operationResult = executeIndexRequestOnReplica((IndexRequest) docWriteRequest, replica);
                             break;
                         case DELETE:
-                            operationResult = executeDeleteRequestOnReplica(((DeleteRequest) docWriteRequest), replica);
+                            operationResult = executeDeleteRequestOnReplica((DeleteRequest) docWriteRequest, replica);
                             break;
                         default:
                             throw new IllegalStateException("Unexpected request operation type on replica: "

diff --git a/core/src/main/java/org/elasticsearch/action/delete/TransportDeleteAction.java b/core/src/main/java/org/elasticsearch/action/delete/TransportDeleteAction.java
@@ -129,6 +129,7 @@ protected WritePrimaryResult shardOperationOnPrimary(DeleteRequest request, Inde
             // update the request with the version so it will go to the replicas
             request.versionType(request.versionType().versionTypeForReplicationAndRecovery());
             request.version(result.getVersion());
+            request.seqNo(result.getSeqNo());
             assert request.versionType().validateVersionForWrites(request.version());
             response = new DeleteResponse(
                 primary.shardId(),

diff --git a/core/src/main/java/org/elasticsearch/index/engine/InternalEngine.java b/core/src/main/java/org/elasticsearch/index/engine/InternalEngine.java
@@ -175,8 +175,18 @@ public InternalEngine(EngineConfig engineConfig) throws EngineException {
                         throw new IllegalArgumentException(openMode.toString());
                 }
                 logger.trace("recovered [{}]", seqNoStats);
-                indexWriter = writer;
                 seqNoService = sequenceNumberService(shardId, engineConfig.getIndexSettings(), seqNoStats);
+                // norelease
+                /*
+                 * We have no guarantees that all operations above the local checkpoint are in the Lucene commit or the translog. This means
+                 * that there might be operations greater than the local checkpoint that will not be replayed. Here we force the local
+                 * checkpoint to the maximum sequence number in the commit (at the potential expense of correctness).
+                 */
+                while (seqNoService.getLocalCheckpoint() < seqNoService.getMaxSeqNo()) {
+                    final long next = seqNoService.getLocalCheckpoint() + 1;
+                    seqNoService.markSeqNoAsCompleted(next);
+                }
+                indexWriter = writer;
                 translog = openTranslog(engineConfig, writer, seqNoService::getGlobalCheckpoint);
                 assert translog.getGeneration() != null;
             } catch (IOException | TranslogCorruptedException e) {
@@ -638,16 +648,23 @@ private IndexResult innerIndex(Index index) throws IOException {
                 }
             }
             final long expectedVersion = index.version();
-            if (checkVersionConflict(index, currentVersion, expectedVersion, deleted)) {
-                // skip index operation because of version conflict on recovery
-                indexResult = new IndexResult(expectedVersion, SequenceNumbersService.UNASSIGNED_SEQ_NO, false);
-            } else {
-                final long seqNo;
-                if (index.origin() == Operation.Origin.PRIMARY) {
+            final boolean conflict = checkVersionConflict(index, currentVersion, expectedVersion, deleted);
+
+            final long seqNo;
+            if (index.origin() == Operation.Origin.PRIMARY) {
+                if (!conflict) {
                     seqNo = seqNoService.generateSeqNo();
                 } else {
-                    seqNo = index.seqNo();
+                    seqNo = SequenceNumbersService.UNASSIGNED_SEQ_NO;
                 }
+            } else {
+                seqNo = index.seqNo();
+            }
+
+            if (conflict) {
+                // skip index operation because of version conflict on recovery
+                indexResult = new IndexResult(expectedVersion, seqNo, false);
+            } else {
                 updatedVersion = index.versionType().updateVersion(currentVersion, expectedVersion);
                 index.parsedDoc().version().setLongValue(updatedVersion);
 
@@ -764,16 +781,24 @@ private DeleteResult innerDelete(Delete delete) throws IOException {
             }
 
             final long expectedVersion = delete.version();
-            if (checkVersionConflict(delete, currentVersion, expectedVersion, deleted)) {
-                // skip executing delete because of version conflict on recovery
-                deleteResult = new DeleteResult(expectedVersion, SequenceNumbersService.UNASSIGNED_SEQ_NO, true);
-            } else {
-                final long seqNo;
-                if (delete.origin() == Operation.Origin.PRIMARY) {
+
+            final boolean conflict = checkVersionConflict(delete, currentVersion, expectedVersion, deleted);
+
+            final long seqNo;
+            if (delete.origin() == Operation.Origin.PRIMARY) {
+                if (!conflict) {
                     seqNo = seqNoService.generateSeqNo();
                 } else {
-                    seqNo = delete.seqNo();
+                    seqNo = SequenceNumbersService.UNASSIGNED_SEQ_NO;
                 }
+            } else {
+                seqNo = delete.seqNo();
+            }
+
+            if (conflict) {
+                // skip executing delete because of version conflict on recovery
+                deleteResult = new DeleteResult(expectedVersion, seqNo, true);
+            } else {
                 updatedVersion = delete.versionType().updateVersion(currentVersion, expectedVersion);
                 found = deleteIfFound(delete.uid(), currentVersion, deleted, versionValue);
                 deleteResult = new DeleteResult(updatedVersion, seqNo, found);

diff --git a/core/src/main/java/org/elasticsearch/index/seqno/GlobalCheckpointService.java b/core/src/main/java/org/elasticsearch/index/seqno/GlobalCheckpointService.java
@@ -149,12 +149,14 @@ public synchronized long getCheckpoint() {
      * updates the global checkpoint on a replica shard (after it has been updated by the primary).
      */
     synchronized void updateCheckpointOnReplica(long globalCheckpoint) {
+        /*
+         * The global checkpoint here is local knowledge that is updated under the mandate of the primary. It can happen that the primary's
+         * information lags behind that of a replica (e.g., if a replica is promoted to primary but has stale info relative to other
+         * replica shards). In these cases, the locally known global checkpoint could be higher than the one synced by the lagging primary.
+         */
         if (this.globalCheckpoint <= globalCheckpoint) {
             this.globalCheckpoint = globalCheckpoint;
             logger.trace("global checkpoint updated from primary to [{}]", globalCheckpoint);
-        } else {
-            throw new IllegalArgumentException("global checkpoint from primary should never decrease. current [" +
-                this.globalCheckpoint + "], got [" + globalCheckpoint + "]");
         }
     }
 

diff --git a/core/src/main/java/org/elasticsearch/indices/recovery/PeerRecoveryTargetService.java b/core/src/main/java/org/elasticsearch/indices/recovery/PeerRecoveryTargetService.java
@@ -312,9 +312,9 @@ class FinalizeRecoveryRequestHandler implements TransportRequestHandler<Recovery
 
         @Override
         public void messageReceived(RecoveryFinalizeRecoveryRequest request, TransportChannel channel) throws Exception {
-            try (RecoveriesCollection.RecoveryRef recoveryRef = onGoingRecoveries.getRecoverySafe(request.recoveryId(), request.shardId()))
-            {
-                recoveryRef.status().finalizeRecovery();
+            try (RecoveriesCollection.RecoveryRef recoveryRef =
+                     onGoingRecoveries.getRecoverySafe(request.recoveryId(), request.shardId())) {
+                recoveryRef.status().finalizeRecovery(request.globalCheckpoint());
             }
             channel.sendResponse(TransportResponse.Empty.INSTANCE);
         }

diff --git a/core/src/main/java/org/elasticsearch/indices/recovery/RecoveryFinalizeRecoveryRequest.java b/core/src/main/java/org/elasticsearch/indices/recovery/RecoveryFinalizeRecoveryRequest.java
@@ -19,8 +19,10 @@
 
 package org.elasticsearch.indices.recovery;
 
+import org.elasticsearch.Version;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.index.seqno.SequenceNumbersService;
 import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.transport.TransportRequest;
 
@@ -29,15 +31,16 @@
 public class RecoveryFinalizeRecoveryRequest extends TransportRequest {
 
     private long recoveryId;
-
     private ShardId shardId;
+    private long globalCheckpoint;
 
     public RecoveryFinalizeRecoveryRequest() {
     }
 
-    RecoveryFinalizeRecoveryRequest(long recoveryId, ShardId shardId) {
+    RecoveryFinalizeRecoveryRequest(final long recoveryId, final ShardId shardId, final long globalCheckpoint) {
         this.recoveryId = recoveryId;
         this.shardId = shardId;
+        this.globalCheckpoint = globalCheckpoint;
     }
 
     public long recoveryId() {
@@ -48,17 +51,30 @@ public ShardId shardId() {
         return shardId;
     }
 
+    public long globalCheckpoint() {
+        return globalCheckpoint;
+    }
+
     @Override
     public void readFrom(StreamInput in) throws IOException {
         super.readFrom(in);
         recoveryId = in.readLong();
         shardId = ShardId.readShardId(in);
+        if (in.getVersion().onOrAfter(Version.V_6_0_0_alpha1_UNRELEASED)) {
+            globalCheckpoint = in.readZLong();
+        } else {
+            globalCheckpoint = SequenceNumbersService.UNASSIGNED_SEQ_NO;
+        }
     }
 
     @Override
     public void writeTo(StreamOutput out) throws IOException {
         super.writeTo(out);
         out.writeLong(recoveryId);
         shardId.writeTo(out);
+        if (out.getVersion().onOrAfter(Version.V_6_0_0_alpha1_UNRELEASED)) {
+            out.writeZLong(globalCheckpoint);
+        }
     }
+
 }
diff --git a/core/src/main/java/org/elasticsearch/indices/recovery/RecoverySourceHandler.java b/core/src/main/java/org/elasticsearch/indices/recovery/RecoverySourceHandler.java
@@ -391,8 +391,8 @@ public void finalizeRecovery() {
         StopWatch stopWatch = new StopWatch().start();
         logger.trace("[{}][{}] finalizing recovery to {}", indexName, shardId, request.targetNode());
         cancellableThreads.execute(() -> {
-            recoveryTarget.finalizeRecovery();
             shard.markAllocationIdAsInSync(recoveryTarget.getTargetAllocationId());
+            recoveryTarget.finalizeRecovery(shard.getGlobalCheckpoint());
         });
 
         if (request.isPrimaryRelocation()) {

diff --git a/core/src/main/java/org/elasticsearch/indices/recovery/RecoveryTarget.java b/core/src/main/java/org/elasticsearch/indices/recovery/RecoveryTarget.java
@@ -333,7 +333,8 @@ public void prepareForTranslogOperations(int totalTranslogOps, long maxUnsafeAut
     }
 
     @Override
-    public void finalizeRecovery() {
+    public void finalizeRecovery(final long globalCheckpoint) {
         final IndexShard indexShard = indexShard();
+        indexShard.updateGlobalCheckpointOnReplica(globalCheckpoint); // apply the recovery source's global checkpoint before finalizing
         indexShard.finalizeRecovery();
     }

diff --git a/core/src/main/java/org/elasticsearch/indices/recovery/RecoveryTargetHandler.java b/core/src/main/java/org/elasticsearch/indices/recovery/RecoveryTargetHandler.java
@@ -39,11 +39,12 @@ public interface RecoveryTargetHandler {
     void prepareForTranslogOperations(int totalTranslogOps, long maxUnsafeAutoIdTimestamp) throws IOException;
 
     /**
-     * The finalize request clears unreferenced translog files, refreshes the engine now that
-     * new segments are available, and enables garbage collection of
-     * tombstone files.
-     **/
-    void finalizeRecovery();
+     * The finalize request refreshes the engine now that new segments are available, enables garbage collection of tombstone files, and
+     * updates the global checkpoint.
+     *
+     * @param globalCheckpoint the global checkpoint on the recovery source
+     */
+    void finalizeRecovery(long globalCheckpoint);
 
     /**
      * Blockingly waits for cluster state with at least clusterStateVersion to be available
@@ -82,4 +83,5 @@ void writeFileChunk(StoreFileMetaData fileMetaData, long position, BytesReferenc
      * @return the allocation id of the target shard.
      */
     String getTargetAllocationId();
+
 }
diff --git a/core/src/main/java/org/elasticsearch/indices/recovery/RemoteRecoveryTargetHandler.java b/core/src/main/java/org/elasticsearch/indices/recovery/RemoteRecoveryTargetHandler.java
@@ -86,9 +86,9 @@ public void prepareForTranslogOperations(int totalTranslogOps, long maxUnsafeAut
     }
 
     @Override
-    public void finalizeRecovery() {
+    public void finalizeRecovery(final long globalCheckpoint) {
         transportService.submitRequest(targetNode, PeerRecoveryTargetService.Actions.FINALIZE,
-            new RecoveryFinalizeRecoveryRequest(recoveryId, shardId),
+            new RecoveryFinalizeRecoveryRequest(recoveryId, shardId, globalCheckpoint),
             TransportRequestOptions.builder().withTimeout(recoverySettings.internalActionLongTimeout()).build(),
             EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
     }