Skip to content

Commit 23ddab2

Browse files
[Segment Replication][Remote Store] Remove commits when remote store is enabled (#8050) (#8753)
* remove commits + fix failing test
* fix failing tests
* fix precommit failure
* remove logs
* address review comments

---------

(cherry picked from commit 2f830be)

Signed-off-by: Poojita Raj <[email protected]>
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent c0da9f0 commit 23ddab2

File tree

3 files changed

+83
-42
lines changed

3 files changed

+83
-42
lines changed

server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java

Lines changed: 20 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,7 @@ public class NRTReplicationEngine extends Engine implements LifecycleAware {
5858
private final CompletionStatsCache completionStatsCache;
5959
private final LocalCheckpointTracker localCheckpointTracker;
6060
private final WriteOnlyTranslogManager translogManager;
61+
private final boolean shouldCommit;
6162

6263
private volatile long lastReceivedGen = SequenceNumbers.NO_OPS_PERFORMED;
6364

@@ -116,6 +117,7 @@ public void onAfterTranslogSync() {
116117
engineConfig.getPrimaryModeSupplier()
117118
);
118119
this.translogManager = translogManagerRef;
120+
this.shouldCommit = engineConfig.getIndexSettings().isRemoteStoreEnabled() == false;
119121
} catch (IOException e) {
120122
IOUtils.closeWhileHandlingException(store::decRef, readerManager, translogManagerRef);
121123
throw new EngineCreationFailureException(shardId, "failed to create engine", e);
@@ -165,7 +167,9 @@ public synchronized void updateSegments(final SegmentInfos infos) throws IOExcep
165167
* @throws IOException - When there is an IO error committing the SegmentInfos.
166168
*/
167169
private void commitSegmentInfos(SegmentInfos infos) throws IOException {
168-
store.commitSegmentInfos(infos, localCheckpointTracker.getMaxSeqNo(), localCheckpointTracker.getProcessedCheckpoint());
170+
if (shouldCommit) {
171+
store.commitSegmentInfos(infos, localCheckpointTracker.getMaxSeqNo(), localCheckpointTracker.getProcessedCheckpoint());
172+
}
169173
this.lastCommittedSegmentInfos = store.readLastCommittedSegmentsInfo();
170174
translogManager.syncTranslog();
171175
}
@@ -426,15 +430,21 @@ protected final void closeNoLock(String reason, CountDownLatch closedLatch) {
426430
assert rwl.isWriteLockedByCurrentThread() || failEngineLock.isHeldByCurrentThread()
427431
: "Either the write lock must be held or the engine must be currently be failing itself";
428432
try {
429-
final SegmentInfos latestSegmentInfos = getLatestSegmentInfos();
430-
/*
431-
This is a workaround solution which decreases the chances of conflict on replica nodes when same file is copied
432-
from two different primaries during failover. Increasing counter helps in avoiding this conflict as counter is
433-
used to generate new segment file names. The ideal solution is to identify the counter from previous primary.
434-
*/
435-
latestSegmentInfos.counter = latestSegmentInfos.counter + SI_COUNTER_INCREMENT;
436-
latestSegmentInfos.changed();
437-
commitSegmentInfos(latestSegmentInfos);
433+
// if remote store is enabled, all segments durably persisted
434+
if (shouldCommit) {
435+
final SegmentInfos latestSegmentInfos = getLatestSegmentInfos();
436+
/*
437+
This is a workaround solution which decreases the chances of conflict on replica nodes when same file is copied
438+
from two different primaries during failover. Increasing counter helps in avoiding this conflict as counter is
439+
used to generate new segment file names. The ideal solution is to identify the counter from previous primary.
440+
*/
441+
latestSegmentInfos.counter = latestSegmentInfos.counter + SI_COUNTER_INCREMENT;
442+
latestSegmentInfos.changed();
443+
commitSegmentInfos(latestSegmentInfos);
444+
} else {
445+
store.directory().sync(List.of(store.directory().listAll()));
446+
store.directory().syncMetaData();
447+
}
438448
IOUtils.close(readerManager, translogManager, store::decRef);
439449
} catch (Exception e) {
440450
logger.warn("failed to close engine", e);

server/src/main/java/org/opensearch/index/shard/IndexShard.java

Lines changed: 25 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -4670,31 +4670,34 @@ public void syncSegmentsFromRemoteSegmentStore(boolean overrideLocal, boolean re
46704670
indexInput,
46714671
remoteSegmentMetadata.getGeneration()
46724672
);
4673+
// Replicas never need a local commit
46734674
if (shouldCommit) {
4674-
long processedLocalCheckpoint = Long.parseLong(infosSnapshot.getUserData().get(LOCAL_CHECKPOINT_KEY));
4675-
// Following code block makes sure to use SegmentInfosSnapshot in the remote store if generation differs
4676-
// with local filesystem. If local filesystem already has segments_N+2 and infosSnapshot has generation N,
4677-
// after commit, there would be 2 files that would be created segments_N+1 and segments_N+2. With the
4678-
// policy of preserving only the latest commit, we will delete segments_N+1 which in fact is the part of the latest
4679-
// commit.
4680-
Optional<String> localMaxSegmentInfos = localSegmentFiles.stream()
4681-
.filter(file -> file.startsWith(IndexFileNames.SEGMENTS))
4682-
.max(Comparator.comparingLong(SegmentInfos::generationFromSegmentsFileName));
4683-
if (localMaxSegmentInfos.isPresent()
4684-
&& infosSnapshot.getGeneration() < SegmentInfos.generationFromSegmentsFileName(localMaxSegmentInfos.get())
4685-
- 1) {
4686-
// If remote translog is not enabled, local translog will be created with different UUID.
4687-
// This fails in Store.trimUnsafeCommits() as translog UUID of checkpoint and SegmentInfos needs
4688-
// to be same. Following code block make sure to have the same UUID.
4689-
if (indexSettings.isRemoteTranslogStoreEnabled() == false) {
4690-
SegmentInfos localSegmentInfos = store.readLastCommittedSegmentsInfo();
4691-
Map<String, String> userData = new HashMap<>(infosSnapshot.getUserData());
4692-
userData.put(TRANSLOG_UUID_KEY, localSegmentInfos.userData.get(TRANSLOG_UUID_KEY));
4693-
infosSnapshot.setUserData(userData, false);
4675+
if (this.shardRouting.primary()) {
4676+
long processedLocalCheckpoint = Long.parseLong(infosSnapshot.getUserData().get(LOCAL_CHECKPOINT_KEY));
4677+
// Following code block makes sure to use SegmentInfosSnapshot in the remote store if generation differs
4678+
// with local filesystem. If local filesystem already has segments_N+2 and infosSnapshot has generation N,
4679+
// after commit, there would be 2 files that would be created segments_N+1 and segments_N+2. With the
4680+
// policy of preserving only the latest commit, we will delete segments_N+1 which in fact is the part of the
4681+
// latest commit.
4682+
Optional<String> localMaxSegmentInfos = localSegmentFiles.stream()
4683+
.filter(file -> file.startsWith(IndexFileNames.SEGMENTS))
4684+
.max(Comparator.comparingLong(SegmentInfos::generationFromSegmentsFileName));
4685+
if (localMaxSegmentInfos.isPresent()
4686+
&& infosSnapshot.getGeneration() < SegmentInfos.generationFromSegmentsFileName(localMaxSegmentInfos.get())
4687+
- 1) {
4688+
// If remote translog is not enabled, local translog will be created with different UUID.
4689+
// This fails in Store.trimUnsafeCommits() as translog UUID of checkpoint and SegmentInfos needs
4690+
// to be same. Following code block make sure to have the same UUID.
4691+
if (indexSettings.isRemoteTranslogStoreEnabled() == false) {
4692+
SegmentInfos localSegmentInfos = store.readLastCommittedSegmentsInfo();
4693+
Map<String, String> userData = new HashMap<>(infosSnapshot.getUserData());
4694+
userData.put(TRANSLOG_UUID_KEY, localSegmentInfos.userData.get(TRANSLOG_UUID_KEY));
4695+
infosSnapshot.setUserData(userData, false);
4696+
}
4697+
storeDirectory.deleteFile(localMaxSegmentInfos.get());
46944698
}
4695-
storeDirectory.deleteFile(localMaxSegmentInfos.get());
4699+
store.commitSegmentInfos(infosSnapshot, processedLocalCheckpoint, processedLocalCheckpoint);
46964700
}
4697-
store.commitSegmentInfos(infosSnapshot, processedLocalCheckpoint, processedLocalCheckpoint);
46984701
} else {
46994702
finalizeReplication(infosSnapshot);
47004703
}

server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java

Lines changed: 38 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,14 @@ public class NRTReplicationEngineTests extends EngineTestCase {
4747
Settings.builder().put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT).build()
4848
);
4949

50+
private static final IndexSettings REMOTE_STORE_INDEX_SETTINGS = IndexSettingsModule.newIndexSettings(
51+
"index",
52+
Settings.builder()
53+
.put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT)
54+
.put(IndexMetadata.SETTING_REMOTE_STORE_ENABLED, "true")
55+
.build()
56+
);
57+
5058
public void testCreateEngine() throws IOException {
5159
final AtomicLong globalCheckpoint = new AtomicLong(SequenceNumbers.NO_OPS_PERFORMED);
5260
try (
@@ -129,6 +137,29 @@ public void testUpdateSegments_replicaReceivesSISWithHigherGen() throws IOExcept
129137
}
130138
}
131139

140+
public void testUpdateSegments_replicaReceivesSISWithHigherGen_remoteStoreEnabled() throws IOException {
141+
final AtomicLong globalCheckpoint = new AtomicLong(SequenceNumbers.NO_OPS_PERFORMED);
142+
143+
try (
144+
final Store nrtEngineStore = createStore(REMOTE_STORE_INDEX_SETTINGS, newDirectory());
145+
final NRTReplicationEngine nrtEngine = buildNrtReplicaEngine(globalCheckpoint, nrtEngineStore, REMOTE_STORE_INDEX_SETTINGS)
146+
) {
147+
// assume we start at the same gen.
148+
assertEquals(2, nrtEngine.getLatestSegmentInfos().getGeneration());
149+
assertEquals(nrtEngine.getLatestSegmentInfos().getGeneration(), nrtEngine.getLastCommittedSegmentInfos().getGeneration());
150+
assertEquals(engine.getLatestSegmentInfos().getGeneration(), nrtEngine.getLatestSegmentInfos().getGeneration());
151+
152+
// flush the primary engine - we don't need any segments, just force a new commit point.
153+
engine.flush(true, true);
154+
assertEquals(3, engine.getLatestSegmentInfos().getGeneration());
155+
156+
// When remote store is enabled, we don't commit on replicas since all segments are durably persisted in the store
157+
nrtEngine.updateSegments(engine.getLatestSegmentInfos());
158+
assertEquals(2, nrtEngine.getLastCommittedSegmentInfos().getGeneration());
159+
assertEquals(2, nrtEngine.getLatestSegmentInfos().getGeneration());
160+
}
161+
}
162+
132163
public void testUpdateSegments_replicaReceivesSISWithLowerGen() throws IOException {
133164
// if the replica is already at segments_N that is received, it will commit segments_N+1.
134165
final AtomicLong globalCheckpoint = new AtomicLong(SequenceNumbers.NO_OPS_PERFORMED);
@@ -312,18 +343,11 @@ public void testCommitSegmentInfos() throws Exception {
312343
}
313344
}
314345

315-
private NRTReplicationEngine buildNrtReplicaEngine(AtomicLong globalCheckpoint, Store store) throws IOException {
346+
private NRTReplicationEngine buildNrtReplicaEngine(AtomicLong globalCheckpoint, Store store, IndexSettings settings)
347+
throws IOException {
316348
Lucene.cleanLuceneIndex(store.directory());
317349
final Path translogDir = createTempDir();
318-
final EngineConfig replicaConfig = config(
319-
defaultSettings,
320-
store,
321-
translogDir,
322-
NoMergePolicy.INSTANCE,
323-
null,
324-
null,
325-
globalCheckpoint::get
326-
);
350+
final EngineConfig replicaConfig = config(settings, store, translogDir, NoMergePolicy.INSTANCE, null, null, globalCheckpoint::get);
327351
if (Lucene.indexExists(store.directory()) == false) {
328352
store.createEmpty(replicaConfig.getIndexSettings().getIndexVersionCreated().luceneVersion);
329353
final String translogUuid = Translog.createEmptyTranslog(
@@ -336,4 +360,8 @@ private NRTReplicationEngine buildNrtReplicaEngine(AtomicLong globalCheckpoint,
336360
}
337361
return new NRTReplicationEngine(replicaConfig);
338362
}
363+
364+
private NRTReplicationEngine buildNrtReplicaEngine(AtomicLong globalCheckpoint, Store store) throws IOException {
365+
return buildNrtReplicaEngine(globalCheckpoint, store, defaultSettings);
366+
}
339367
}

0 commit comments

Comments
 (0)