diff --git a/.idea/runConfigurations/Debug_OpenSearch.xml b/.idea/runConfigurations/Debug_OpenSearch.xml deleted file mode 100644 index c18046f873477..0000000000000 --- a/.idea/runConfigurations/Debug_OpenSearch.xml +++ /dev/null @@ -1,15 +0,0 @@ - - - - \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 2baa708bbb6eb..a4a476808b80d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -55,6 +55,21 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Add streaming cardinality aggregator ([#19484](https://github.com/opensearch-project/OpenSearch/pull/19484)) - Disable request cache for streaming aggregation queries ([#19520](https://github.com/opensearch-project/OpenSearch/pull/19520)) +- Add support for a ForkJoinPool type ([#19008](https://github.com/opensearch-project/OpenSearch/pull/19008)) +- Add separate shard limit validation for local and remote indices ([#19532](https://github.com/opensearch-project/OpenSearch/pull/19532)) +- Use Lucene `pack` method for `half_float` and `unsigned_long` when using `ApproximatePointRangeQuery`. +- Add a mapper for context aware segments grouping criteria ([#19233](https://github.com/opensearch-project/OpenSearch/pull/19233)) +- Return full error for GRPC error response ([#19568](https://github.com/opensearch-project/OpenSearch/pull/19568)) +- Add support for repository with Server side encryption enabled and client side encryption as well based on a flag. ([#19630](https://github.com/opensearch-project/OpenSearch/pull/19630)) +- Add pluggable gRPC interceptors with explicit ordering ([#19005](https://github.com/opensearch-project/OpenSearch/pull/19005)) +- Add BindableServices extension point to transport-grpc-spi ([#19304](https://github.com/opensearch-project/OpenSearch/pull/19304)) +- Add metrics for the merged segment warmer feature ([#18929](https://github.com/opensearch-project/OpenSearch/pull/18929)) +- Add pointer based lag metric in pull-based ingestion ([#19635](https://github.com/opensearch-project/OpenSearch/pull/19635)) +- Introduced internal API for retrieving metadata about requested indices from transport actions ([#18523](https://github.com/opensearch-project/OpenSearch/pull/18523)) +- Add cluster defaults for merge autoThrottle, maxMergeThreads, and maxMergeCount; Add segment size filter to the merged segment warmer ([#19629](https://github.com/opensearch-project/OpenSearch/pull/19629)) +- Add build-tooling to run in FIPS environment ([#18921](https://github.com/opensearch-project/OpenSearch/pull/18921)) +- Add SMILE/CBOR/YAML document format support to Bulk GRPC endpoint ([#19744](https://github.com/opensearch-project/OpenSearch/pull/19744)) + ### Changed - Refactor `if-else` chains to use `Java 17 pattern matching switch expressions`(([#18965](https://github.com/opensearch-project/OpenSearch/pull/18965)) - Add CompletionStage variants to methods in the Client Interface and default to ActionListener impl ([#18998](https://github.com/opensearch-project/OpenSearch/pull/18998)) diff --git a/gradle/run.gradle b/gradle/run.gradle index 3b5a3eebab756..a2d96d31ad096 100644 --- a/gradle/run.gradle +++ b/gradle/run.gradle @@ -40,6 +40,34 @@ testClusters { testDistribution = 'archive' if (numZones > 1) numberOfZones = numZones if (numNodes > 1) numberOfNodes = numNodes + // S3 repository configuration + if (findProperty("enableS3")) { + plugin(':plugins:repository-s3') + if (findProperty("s3Endpoint")) { + setting 's3.client.default.endpoint', findProperty("s3Endpoint") + } + setting
's3.client.default.region', findProperty("s3Region") ?: 'us-east-1' + keystore 's3.client.default.access_key', findProperty("s3AccessKey") ?: System.getenv("AWS_ACCESS_KEY_ID") ?: 'test' + keystore 's3.client.default.secret_key', findProperty("s3SecretKey") ?: System.getenv("AWS_SECRET_ACCESS_KEY") ?: 'test' + + + // Remote store configuration + setting 'node.attr.remote_store.segment.repository', 'my-s3-repo' + setting 'node.attr.remote_store.translog.repository', 'my-s3-repo' + setting 'node.attr.remote_store.state.repository', 'my-s3-repo' + setting 'cluster.remote_store.state.enabled', 'true' + setting 'node.attr.remote_store.repository.my-s3-repo.type', 's3' + setting 'node.attr.remote_store.repository.my-s3-repo.settings.bucket', 'local-opensearch-bucket' + setting 'node.attr.remote_store.repository.my-s3-repo.settings.base_path', 'raghraaj-local-1230' + + // SSE-KMS configuration + if (findProperty("enableSseKms")) { + setting 'node.attr.remote_store.repository.my-s3-repo.settings.server_side_encryption_type', 'aws:kms' + setting 'node.attr.remote_store.repository.my-s3-repo.settings.server_side_encryption_kms_key_id', 'arn:aws:kms:us-east-1:389347062219:key/006ef490-5452-4f4d-8da3-a0cb7344ab59' + setting 'node.attr.remote_store.repository.my-s3-repo.settings.server_side_encryption_bucket_key_enabled', findProperty("sseBucketKeyEnabled") ?: 'true' + setting 'node.attr.remote_store.repository.my-s3-repo.settings.server_side_encryption_encryption_context', '{"identifier":"mustang"}' + } + } if (findProperty("installedPlugins")) { installedPlugins = Eval.me(installedPlugins) diff --git a/modules/parquet-data-format/src/internalClusterTest/java/com/parquet/parquetdataformat/ParquetSegmentReplicationIT.java b/modules/parquet-data-format/src/internalClusterTest/java/com/parquet/parquetdataformat/ParquetSegmentReplicationIT.java index a80ec3f158fa6..80090c0b178d2 100644 --- a/modules/parquet-data-format/src/internalClusterTest/java/com/parquet/parquetdataformat/ParquetSegmentReplicationIT.java +++ b/modules/parquet-data-format/src/internalClusterTest/java/com/parquet/parquetdataformat/ParquetSegmentReplicationIT.java @@ -314,80 +314,80 @@ public void testFormatAwareMetadataReplication() throws Exception { /** * Tests that replica can recover from remote store with Parquet files. 
*/ -// public void testReplicaRecoveryWithParquetFiles() throws Exception { -// internalCluster().startClusterManagerOnlyNode(); -// internalCluster().startDataOnlyNodes(2); -// createReplicationIndex(INDEX_NAME, 1); -// -// // Index documents -// for (int i = 0; i < 20; i++) { -// client().prepareIndex(INDEX_NAME) -// .setId(String.valueOf(i)) -// .setSource("id", String.valueOf(i), "field", "recovery" + i, "value", (long) i) -// .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE) -// .get(); -// } -// -// String primaryNode = getPrimaryNodeName(INDEX_NAME); -// String replicaNode = getReplicaNodeName(INDEX_NAME); -// -// // Wait for initial replication -// assertBusy(() -> { -// IndexShard primaryShard = getIndexShard(primaryNode, INDEX_NAME); -// IndexShard replicaShard = getIndexShard(replicaNode, INDEX_NAME); -// assertEquals( -// primaryShard.getLatestReplicationCheckpoint().getSegmentInfosVersion(), -// replicaShard.getLatestReplicationCheckpoint().getSegmentInfosVersion() -// ); -// }, 30, TimeUnit.SECONDS); -// -// // Stop replica node to simulate failure -// internalCluster().restartNode(replicaNode, new InternalTestCluster.RestartCallback() { -// @Override -// public Settings onNodeStopped(String nodeName) throws Exception { -// // Index more documents on primary while replica is down -// try { -// for (int i = 20; i < 40; i++) { -// client().prepareIndex(INDEX_NAME) -// .setId(String.valueOf(i)) -// .setSource("id", String.valueOf(i), "field", "after_failure" + i, "value", (long) i) -// .get(); -// } -// client().admin().indices().prepareRefresh(INDEX_NAME).get(); -// } catch (Exception e) { -// throw new RuntimeException(e); -// } -// return super.onNodeStopped(nodeName); -// } -// }); -// -// ensureGreen(INDEX_NAME); -// -// // Verify replica recovered with Parquet files -// assertBusy(() -> { -// IndexShard primaryShard = getIndexShard(primaryNode, INDEX_NAME); -// IndexShard replicaShard = getIndexShard(replicaNode, INDEX_NAME); -// -// // Verify checkpoints match after recovery -// assertEquals( -// "Replica should catch up after recovery", -// primaryShard.getLatestReplicationCheckpoint().getSegmentInfosVersion(), -// replicaShard.getLatestReplicationCheckpoint().getSegmentInfosVersion() -// ); -// -// // Verify replica has Parquet files -// RemoteSegmentStoreDirectory replicaRemoteDir = replicaShard.getRemoteDirectory(); -// Map replicaSegments = -// replicaRemoteDir.getSegmentsUploadedToRemoteStore(); -// -// Set formats = replicaSegments.keySet().stream() -// .map(file -> new FileMetadata(file).dataFormat()) -// .collect(Collectors.toSet()); -// -// assertTrue("Recovered replica should have Parquet files", formats.contains("parquet")); -// -// }, 60, TimeUnit.SECONDS); -// } + public void testReplicaRecoveryWithParquetFiles() throws Exception { + internalCluster().startClusterManagerOnlyNode(); + internalCluster().startDataOnlyNodes(2); + createReplicationIndex(INDEX_NAME, 1); + + // Index documents + for (int i = 0; i < 20; i++) { + client().prepareIndex(INDEX_NAME) + .setId(String.valueOf(i)) + .setSource("id", String.valueOf(i), "field", "recovery" + i, "value", (long) i) + .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE) + .get(); + } + + String primaryNode = getPrimaryNodeName(INDEX_NAME); + String replicaNode = getReplicaNodeName(INDEX_NAME); + + // Wait for initial replication + assertBusy(() -> { + IndexShard primaryShard = getIndexShard(primaryNode, INDEX_NAME); + IndexShard replicaShard = getIndexShard(replicaNode, INDEX_NAME); + assertEquals( + 
primaryShard.getLatestReplicationCheckpoint().getSegmentInfosVersion(), + replicaShard.getLatestReplicationCheckpoint().getSegmentInfosVersion() + ); + }, 30, TimeUnit.SECONDS); + + // Stop replica node to simulate failure + internalCluster().restartNode(replicaNode, new InternalTestCluster.RestartCallback() { + @Override + public Settings onNodeStopped(String nodeName) throws Exception { + // Index more documents on primary while replica is down + try { + for (int i = 20; i < 40; i++) { + client().prepareIndex(INDEX_NAME) + .setId(String.valueOf(i)) + .setSource("id", String.valueOf(i), "field", "after_failure" + i, "value", (long) i) + .get(); + } + client().admin().indices().prepareRefresh(INDEX_NAME).get(); + } catch (Exception e) { + throw new RuntimeException(e); + } + return super.onNodeStopped(nodeName); + } + }); + + ensureGreen(INDEX_NAME); + + // Verify replica recovered with Parquet files + assertBusy(() -> { + IndexShard primaryShard = getIndexShard(primaryNode, INDEX_NAME); + IndexShard replicaShard = getIndexShard(replicaNode, INDEX_NAME); + + // Verify checkpoints match after recovery + assertEquals( + "Replica should catch up after recovery", + primaryShard.getLatestReplicationCheckpoint().getSegmentInfosVersion(), + replicaShard.getLatestReplicationCheckpoint().getSegmentInfosVersion() + ); + + // Verify replica has Parquet files + RemoteSegmentStoreDirectory replicaRemoteDir = replicaShard.getRemoteDirectory(); + Map<String, UploadedSegmentMetadata> replicaSegments = + replicaRemoteDir.getSegmentsUploadedToRemoteStore(); + + Set<String> formats = replicaSegments.keySet().stream() + .map(file -> new FileMetadata(file).dataFormat()) + .collect(Collectors.toSet()); + + assertTrue("Recovered replica should have Parquet files", formats.contains("parquet")); + + }, 60, TimeUnit.SECONDS); + } /** * Tests that ReplicationCheckpoint contains format-aware metadata.
diff --git a/modules/parquet-data-format/src/test/java/com/parquet/parquetdataformat/vsr/VSRManagerTests.java b/modules/parquet-data-format/src/test/java/com/parquet/parquetdataformat/vsr/VSRManagerTests.java index 046cc94f5433d..3c13cc511eb83 100644 --- a/modules/parquet-data-format/src/test/java/com/parquet/parquetdataformat/vsr/VSRManagerTests.java +++ b/modules/parquet-data-format/src/test/java/com/parquet/parquetdataformat/vsr/VSRManagerTests.java @@ -9,6 +9,7 @@ package com.parquet.parquetdataformat.vsr; import com.parquet.parquetdataformat.bridge.ArrowExport; +import com.parquet.parquetdataformat.bridge.ParquetFileMetadata; import com.parquet.parquetdataformat.bridge.RustBridge; import com.parquet.parquetdataformat.memory.ArrowBufferPool; import com.parquet.parquetdataformat.writer.ParquetDocumentInput; @@ -90,8 +91,8 @@ public void testVSRManagerInitializationAndActiveVSR() throws Exception { // Flush before close (transitions VSR to FROZEN) FlushIn flushIn = Mockito.mock(FlushIn.class); - String flushResult = vsrManager.flush(flushIn); - assertEquals("Flush should return filename", testFileName, flushResult); + ParquetFileMetadata flushResult = vsrManager.flush(flushIn); + assertNotNull("Flush should return metadata", flushResult); assertEquals("VSR should be FROZEN after flush", VSRState.FROZEN, vsrManager.getActiveManagedVSR().getState()); // Now close should succeed @@ -125,8 +126,8 @@ public void testDocumentAdditionThroughVSRManager() throws Exception { // Follow proper VSRManager lifecycle: Write → Flush → Close // Flush before close (transitions VSR to FROZEN) FlushIn flushIn = Mockito.mock(FlushIn.class); - String flushResult = vsrManager.flush(flushIn); - assertEquals("Flush should return filename", testFileName, flushResult); + ParquetFileMetadata flushResult = vsrManager.flush(flushIn); + assertNotNull("Flush should return metadata", flushResult); assertEquals("VSR should be FROZEN after flush", VSRState.FROZEN, vsrManager.getActiveManagedVSR().getState()); // Now close should succeed @@ -142,9 +143,9 @@ public void testFlushThroughVSRManager() throws Exception { // Flush through VSRManager (create mock FlushIn) FlushIn flushIn = Mockito.mock(FlushIn.class); - String result = vsrManager.flush(flushIn); + ParquetFileMetadata result = vsrManager.flush(flushIn); - assertEquals("Flush should return filename", testFileName, result); + assertNotNull("Flush should return metadata", result); // VSR should be FROZEN after flush assertEquals("VSR should be FROZEN after flush", @@ -166,9 +167,9 @@ public void testVSRManagerStateTransitionWorkflow() throws Exception { // 3. 
Flush - should transition VSR to FROZEN FlushIn flushIn = Mockito.mock(FlushIn.class); - String flushResult = vsrManager.flush(flushIn); + ParquetFileMetadata flushResult = vsrManager.flush(flushIn); - assertEquals("Flush should return filename", testFileName, flushResult); + assertNotNull("Flush should return metadata", flushResult); assertEquals("VSR should be FROZEN after flush", VSRState.FROZEN, vsrManager.getActiveManagedVSR().getState()); assertTrue("VSR should be immutable when frozen", vsrManager.getActiveManagedVSR().isImmutable()); diff --git a/plugins/engine-datafusion/build.gradle b/plugins/engine-datafusion/build.gradle index bb26ebd449612..edff9b4cba2ff 100644 --- a/plugins/engine-datafusion/build.gradle +++ b/plugins/engine-datafusion/build.gradle @@ -176,6 +176,12 @@ test { systemProperty 'java.library.path', file('src/main/resources/native').absolutePath } +internalClusterTest { + // Add same JVM arguments for integration tests + jvmArgs += ["--add-opens", "java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED"] + systemProperty 'java.library.path', file('src/main/resources/native').absolutePath +} + yamlRestTest { systemProperty 'tests.security.manager', 'false' // Disable yamlRestTest since this plugin doesn't have REST API endpoints diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/search/DatafusionReader.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/search/DatafusionReader.java index fb9a522adda07..ff5f1a3e7203e 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/search/DatafusionReader.java +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/search/DatafusionReader.java @@ -24,6 +24,8 @@ import java.util.Arrays; import java.util.Collection; import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/search/DatafusionReaderManager.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/search/DatafusionReaderManager.java index 9d60ee01ec006..471797fcdd6fa 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/search/DatafusionReaderManager.java +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/search/DatafusionReaderManager.java @@ -12,7 +12,6 @@ import java.util.HashSet; import java.util.Set; import java.util.function.Consumer; -import org.apache.lucene.search.ReferenceManager; import org.opensearch.index.engine.CatalogSnapshotAwareRefreshListener; import org.opensearch.index.engine.EngineReaderManager; import org.opensearch.index.engine.FileDeletionListener; diff --git a/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionReaderManagerTests.java b/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionReaderManagerTests.java index 284cd5bc5e857..8f5c329e0935a 100644 --- a/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionReaderManagerTests.java +++ b/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionReaderManagerTests.java @@ -31,8 +31,10 @@ import org.opensearch.env.Environment; import org.opensearch.index.engine.exec.*; import org.opensearch.index.engine.exec.coord.CatalogSnapshot; +import org.opensearch.index.engine.exec.coord.CompositeEngineCatalogSnapshot; import 
org.opensearch.index.engine.exec.coord.CompositeEngine; import org.opensearch.index.engine.exec.coord.IndexFileDeleter; +import org.opensearch.index.engine.exec.coord.Segment; import org.opensearch.index.shard.ShardPath; import org.opensearch.search.aggregations.SearchResultsCollector; import org.opensearch.test.OpenSearchTestCase; @@ -103,14 +105,14 @@ public void testInitialReaderCreation() throws IOException { DatafusionReaderManager readerManager = engine.getReferenceManager(INTERNAL); Path parquetDir = shardPath.getDataPath().resolve("parquet"); - CatalogSnapshot.Segment segment = new CatalogSnapshot.Segment(1); + Segment segment = new Segment(1); WriterFileSet writerFileSet = new WriterFileSet(parquetDir, 1, 4); writerFileSet.add(parquetDir + "/parquet_file_generation_0.parquet"); writerFileSet.add(parquetDir + "/parquet_file_generation_1.parquet"); segment.addSearchableFiles(getMockDataFormat().name(), writerFileSet); readerManager.afterRefresh(true, - () -> getCatalogSnapshotRef(new CatalogSnapshot(1, 1, List.of(segment), new HashMap<>(), noOpFileDeleterSupplier))); + () -> getCatalogSnapshotRef(new CompositeEngineCatalogSnapshot(1, 1, List.of(segment), new HashMap<>(), noOpFileDeleterSupplier))); DatafusionSearcher searcher = engine.acquireSearcher("test"); DatafusionReader reader = searcher.getReader(); @@ -134,13 +136,13 @@ public void testMultipleSearchersShareSameReader() throws IOException { DatafusionReaderManager readerManager = engine.getReferenceManager(INTERNAL); Path parquetDir = shardPath.getDataPath().resolve("parquet"); - CatalogSnapshot.Segment segment = new CatalogSnapshot.Segment(1); + Segment segment = new Segment(1); WriterFileSet writerFileSet = new WriterFileSet(parquetDir, 1, 2); writerFileSet.add(parquetDir + "/parquet_file_generation_0.parquet"); segment.addSearchableFiles(getMockDataFormat().name(), writerFileSet); readerManager.afterRefresh(true, - () -> getCatalogSnapshotRef(new CatalogSnapshot(1, 1, List.of(segment), new HashMap<>(), noOpFileDeleterSupplier))); + () -> getCatalogSnapshotRef(new CompositeEngineCatalogSnapshot(1, 1, List.of(segment), new HashMap<>(), noOpFileDeleterSupplier))); DatafusionSearcher searcher1 = engine.acquireSearcher("test1"); DatafusionSearcher searcher2 = engine.acquireSearcher("test2"); @@ -165,13 +167,13 @@ public void testReaderSurvivesPartialSearcherClose() throws IOException { DatafusionReaderManager readerManager = engine.getReferenceManager(INTERNAL); Path parquetDir = shardPath.getDataPath().resolve("parquet"); - CatalogSnapshot.Segment segment = new CatalogSnapshot.Segment(1); + Segment segment = new Segment(1); WriterFileSet writerFileSet = new WriterFileSet(parquetDir, 1, 2); writerFileSet.add(parquetDir + "/parquet_file_generation_0.parquet"); segment.addSearchableFiles(getMockDataFormat().name(), writerFileSet); readerManager.afterRefresh(true, - () -> getCatalogSnapshotRef(new CatalogSnapshot(1, 1, List.of(segment), new HashMap<>(), noOpFileDeleterSupplier))); + () -> getCatalogSnapshotRef(new CompositeEngineCatalogSnapshot(1, 1, List.of(segment), new HashMap<>(), noOpFileDeleterSupplier))); DatafusionSearcher searcher1 = engine.acquireSearcher("test1"); DatafusionSearcher searcher2 = engine.acquireSearcher("test2"); @@ -197,14 +199,14 @@ public void testRefreshCreatesNewReader() throws IOException { Path parquetDir = shardPath.getDataPath().resolve("parquet"); // Initial refresh - CatalogSnapshot.Segment segment1 = new CatalogSnapshot.Segment(1); + Segment segment1 = new Segment(1); WriterFileSet 
writerFileSet1 = new WriterFileSet(parquetDir, 1, 2); addFilesToShardPath(shardPath, "parquet_file_generation_0.parquet"); writerFileSet1.add(parquetDir + "/parquet_file_generation_0.parquet"); segment1.addSearchableFiles(getMockDataFormat().name(), writerFileSet1); readerManager.afterRefresh(true, - () -> getCatalogSnapshotRef(new CatalogSnapshot(1, 1, List.of(segment1), new HashMap<>(), noOpFileDeleterSupplier))); + () -> getCatalogSnapshotRef(new CompositeEngineCatalogSnapshot(1, 1, List.of(segment1), new HashMap<>(), noOpFileDeleterSupplier))); DatafusionSearcher searcher1 = engine.acquireSearcher("test1"); DatafusionReader reader1 = searcher1.getReader(); @@ -212,14 +214,14 @@ public void testRefreshCreatesNewReader() throws IOException { // Add new file and refresh addFilesToShardPath(shardPath, "parquet_file_generation_1.parquet"); - CatalogSnapshot.Segment segment2 = new CatalogSnapshot.Segment(2); + Segment segment2 = new Segment(2); WriterFileSet writerFileSet2 = new WriterFileSet(parquetDir, 2, 4); writerFileSet2.add(parquetDir + "/parquet_file_generation_0.parquet"); writerFileSet2.add(parquetDir + "/parquet_file_generation_1.parquet"); segment2.addSearchableFiles(getMockDataFormat().name(), writerFileSet2); readerManager.afterRefresh(true, - () -> getCatalogSnapshotRef(new CatalogSnapshot(2, 2, List.of(segment2), new HashMap<>(), noOpFileDeleterSupplier))); + () -> getCatalogSnapshotRef(new CompositeEngineCatalogSnapshot(2, 2, List.of(segment2), new HashMap<>(), noOpFileDeleterSupplier))); DatafusionSearcher searcher2 = engine.acquireSearcher("test2"); DatafusionReader reader2 = searcher2.getReader(); @@ -246,13 +248,13 @@ public void testDecRefAfterCloseThrowsException() throws IOException { DatafusionReaderManager readerManager = engine.getReferenceManager(INTERNAL); Path parquetDir = shardPath.getDataPath().resolve("parquet"); - CatalogSnapshot.Segment segment = new CatalogSnapshot.Segment(1); + Segment segment = new Segment(1); WriterFileSet writerFileSet = new WriterFileSet(parquetDir, 1, 4); writerFileSet.add(parquetDir + "/parquet_file_generation_2.parquet"); segment.addSearchableFiles(getMockDataFormat().name(), writerFileSet); readerManager.afterRefresh(true, - () -> getCatalogSnapshotRef(new CatalogSnapshot(1, 1, List.of(segment), new HashMap<>(), noOpFileDeleterSupplier))); + () -> getCatalogSnapshotRef(new CompositeEngineCatalogSnapshot(1, 1, List.of(segment), new HashMap<>(), noOpFileDeleterSupplier))); DatafusionSearcher searcher = engine.acquireSearcher("test"); DatafusionReader reader = searcher.getReader(); @@ -276,14 +278,14 @@ public void testReaderClosesAfterSearchRelease() throws IOException { DatafusionReaderManager readerManager = engine.getReferenceManager(INTERNAL); Path parquetDir = shardPath.getDataPath().resolve("parquet"); - CatalogSnapshot.Segment segment = new CatalogSnapshot.Segment(1); + Segment segment = new Segment(1); WriterFileSet writerFileSet = new WriterFileSet(parquetDir, 1, 6); writerFileSet.add(parquetDir + "/parquet_file_generation_2.parquet"); writerFileSet.add(parquetDir + "/parquet_file_generation_1.parquet"); segment.addSearchableFiles(getMockDataFormat().name(), writerFileSet); readerManager.afterRefresh(true, - () -> getCatalogSnapshotRef(new CatalogSnapshot(1, 1, List.of(segment), new HashMap<>(), noOpFileDeleterSupplier))); + () -> getCatalogSnapshotRef(new CompositeEngineCatalogSnapshot(1, 1, List.of(segment), new HashMap<>(), noOpFileDeleterSupplier))); // DatafusionReader readerR1 = readerManager.acquire(); 
DatafusionSearcher datafusionSearcherS1 = engine.acquireSearcher("Search"); @@ -299,14 +301,14 @@ public void testReaderClosesAfterSearchRelease() throws IOException { addFilesToShardPath(shardPath, "parquet_file_generation_0.parquet"); // now trigger refresh to have new Reader with F2, F3 - CatalogSnapshot.Segment segment2 = new CatalogSnapshot.Segment(2); + Segment segment2 = new Segment(2); WriterFileSet writerFileSet2 = new WriterFileSet(parquetDir, 2, 4); writerFileSet2.add(parquetDir + "/parquet_file_generation_1.parquet"); writerFileSet2.add(parquetDir + "/parquet_file_generation_0.parquet"); segment2.addSearchableFiles(getMockDataFormat().name(), writerFileSet2); readerManager.afterRefresh(true, - () -> getCatalogSnapshotRef(new CatalogSnapshot(2, 2, List.of(segment2), new HashMap<>(), noOpFileDeleterSupplier))); + () -> getCatalogSnapshotRef(new CompositeEngineCatalogSnapshot(2, 2, List.of(segment2), new HashMap<>(), noOpFileDeleterSupplier))); // now check if new Reader is created with F2, F3 // DatafusionReader readerR2 = readerManager.acquire(); @@ -345,13 +347,13 @@ public void testSearch() throws Exception { // Initial refresh - files are in the parquet subdirectory Path parquetDir = shardPath.getDataPath().resolve("parquet"); - CatalogSnapshot.Segment segment1 = new CatalogSnapshot.Segment(0); + Segment segment1 = new Segment(0); WriterFileSet writerFileSet1 = new WriterFileSet(parquetDir, 0, 2); writerFileSet1.add(parquetDir + "/parquet_file_generation_0.parquet"); segment1.addSearchableFiles(getMockDataFormat().name(), writerFileSet1); readerManager.afterRefresh(true, - () -> getCatalogSnapshotRef(new CatalogSnapshot(1, 1, List.of(segment1), new HashMap<>(), noOpFileDeleterSupplier))); + () -> getCatalogSnapshotRef(new CompositeEngineCatalogSnapshot(1, 1, List.of(segment1), new HashMap<>(), noOpFileDeleterSupplier))); DatafusionSearcher searcher1 = engine.acquireSearcher("search"); DatafusionReader reader1 = searcher1.getReader(); @@ -375,13 +377,13 @@ public void testSearch() throws Exception { logger.info("AFTER REFRESH"); addFilesToShardPath(shardPath, "parquet_file_generation_1.parquet"); - CatalogSnapshot.Segment segment2 = new CatalogSnapshot.Segment(1); + Segment segment2 = new Segment(1); WriterFileSet writerFileSet2 = new WriterFileSet(parquetDir, 1, 2); writerFileSet2.add(parquetDir + "/parquet_file_generation_1.parquet"); segment2.addSearchableFiles(getMockDataFormat().name(), writerFileSet2); readerManager.afterRefresh(true, - () -> getCatalogSnapshotRef(new CatalogSnapshot(2, 1, List.of(segment2), new HashMap<>(), noOpFileDeleterSupplier))); + () -> getCatalogSnapshotRef(new CompositeEngineCatalogSnapshot(2, 1, List.of(segment2), new HashMap<>(), noOpFileDeleterSupplier))); expectedResults = new HashMap<>(); expectedResults.put("min", 3L); diff --git a/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionRemoteStoreRecoveryTests.java b/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionRemoteStoreRecoveryTests.java new file mode 100644 index 0000000000000..ee7c5ac15cd23 --- /dev/null +++ b/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionRemoteStoreRecoveryTests.java @@ -0,0 +1,583 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.datafusion; + +import com.parquet.parquetdataformat.ParquetDataFormatPlugin; +import org.opensearch.action.admin.cluster.remotestore.restore.RestoreRemoteStoreRequest; +import org.opensearch.action.support.PlainActionFuture; +import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.common.settings.Settings; +import org.opensearch.core.xcontent.MediaTypeRegistry; +import org.opensearch.index.engine.exec.FileMetadata; +import org.opensearch.index.shard.IndexShard; +import org.opensearch.index.store.RemoteSegmentStoreDirectory; +import org.opensearch.index.store.UploadedSegmentMetadata; +import org.opensearch.index.store.remote.metadata.RemoteSegmentMetadata; +import org.opensearch.indices.replication.common.ReplicationType; +import org.opensearch.plugins.Plugin; +import org.opensearch.test.OpenSearchIntegTestCase; +import org.opensearch.test.junit.annotations.TestLogging; +import org.junit.Before; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +import org.opensearch.index.store.CompositeStoreDirectory; + +import static org.opensearch.gateway.remote.RemoteClusterStateService.REMOTE_CLUSTER_STATE_ENABLED_SETTING; +import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked; + +/** + * Integration tests for DataFusion engine remote store recovery scenarios. + * Tests format-aware metadata preservation, CatalogSnapshot recovery, and + * remote store recovery validation with Parquet/Arrow files. + */ +@TestLogging( + value = "org.opensearch.index.shard:DEBUG,org.opensearch.index.store:DEBUG,org.opensearch.datafusion:DEBUG", + reason = "Validate DataFusion recovery with format-aware metadata" +) +@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, numDataNodes = 0) +public class DataFusionRemoteStoreRecoveryTests extends OpenSearchIntegTestCase { + + protected static final String REPOSITORY_NAME = "test-remote-store-repo"; + protected static final String INDEX_NAME = "datafusion-test-index"; + + protected Path repositoryPath; + + @Override + protected Collection<Class<? extends Plugin>> nodePlugins() { + return List.of(DataFusionPlugin.class, ParquetDataFormatPlugin.class); + } + + @Before + public void setup() { + repositoryPath = randomRepoPath().toAbsolutePath(); + } + + @Override + protected Settings nodeSettings(int nodeOrdinal) { + return Settings.builder() + .put(super.nodeSettings(nodeOrdinal)) + .put(remoteStoreClusterSettings(REPOSITORY_NAME, repositoryPath)) + .put(REMOTE_CLUSTER_STATE_ENABLED_SETTING.getKey(), true) + .build(); + } + + @Override + public Settings indexSettings() { + return Settings.builder() + .put(super.indexSettings()) + .put("index.queries.cache.enabled", false) + .put("index.refresh_interval", -1) + .put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) + .put("index.optimized.enabled", true) + .build(); + } + + @Override + protected void beforeIndexDeletion() throws Exception { + logger.info("--> Skipping beforeIndexDeletion cleanup to avoid DataFusion engine type conflicts"); + } + + @Override + protected void ensureClusterSizeConsistency() {} + + @Override + protected void ensureClusterStateConsistency() {} + + private IndexShard getIndexShard(String nodeName,
String indexName) { + return internalCluster().getInstance(org.opensearch.indices.IndicesService.class, nodeName) + .indexServiceSafe(internalCluster().clusterService(nodeName).state().metadata().index(indexName).getIndex()) + .getShard(0); + } + + private void validateRemoteStoreSegments(IndexShard shard, String stageName) { + RemoteSegmentStoreDirectory remoteDir = shard.getRemoteDirectory(); + assertNotNull("RemoteSegmentStoreDirectory should not be null", remoteDir); + + Map<String, UploadedSegmentMetadata> uploadedSegmentsRaw = remoteDir.getSegmentsUploadedToRemoteStore(); + if (uploadedSegmentsRaw.isEmpty()) { + logger.warn("--> No segments uploaded yet at stage: {}", stageName); + return; + } + + Map<FileMetadata, UploadedSegmentMetadata> uploadedSegments = uploadedSegmentsRaw.entrySet().stream() + .collect(Collectors.toMap(e -> new FileMetadata(e.getKey()), Map.Entry::getValue)); + + for (FileMetadata fileMetadata : uploadedSegments.keySet()) { + assertNotNull("FileMetadata should have format information", fileMetadata.dataFormat()); + assertFalse("Format should not be empty", fileMetadata.dataFormat().isEmpty()); + } + } + + private long validateLocalShardFiles(IndexShard shard, String stageName) { + try { + CompositeStoreDirectory compositeDir = shard.store().compositeStoreDirectory(); + if (compositeDir != null) { + FileMetadata[] allFiles = compositeDir.listFileMetadata(); + return Arrays.stream(allFiles).filter(fm -> "parquet".equals(fm.dataFormat())).count(); + } else { + String[] files = shard.store().directory().listAll(); + return Arrays.stream(files).filter(f -> f.contains("parquet") || f.endsWith(".parquet")).count(); + } + } catch (IOException e) { + logger.warn("--> Failed to list local shard files at stage {}: {}", stageName, e.getMessage()); + return -1; + } + } + + private void validateCatalogSnapshot(IndexShard shard, String stageName) { + RemoteSegmentStoreDirectory remoteDir = shard.getRemoteDirectory(); + assertNotNull("RemoteSegmentStoreDirectory should not be null", remoteDir); + + try { + RemoteSegmentMetadata metadata = remoteDir.readLatestMetadataFile(); + if (metadata == null) { + logger.warn("--> RemoteSegmentMetadata not found at stage {}", stageName); + return; + } + + byte[] catalogSnapshotBytes = metadata.getSegmentInfosBytes(); + if (catalogSnapshotBytes != null) { + assertTrue("CatalogSnapshot bytes should not be empty", catalogSnapshotBytes.length > 0); + } + + var checkpoint = metadata.getReplicationCheckpoint(); + if (checkpoint != null) { + assertTrue("Checkpoint version should be positive", checkpoint.getSegmentInfosVersion() > 0); + } + } catch (IOException e) { + logger.warn("--> Failed to read metadata at stage {}: {}", stageName, e.getMessage()); + } + } + + /** + * Tests DataFusion engine recovery from remote store with format-aware metadata preservation.
+ */ + public void testDataFusionWithRemoteStoreRecovery() throws Exception { + internalCluster().startClusterManagerOnlyNodes(1); + internalCluster().startDataOnlyNodes(1); + ensureStableCluster(2); + + String mappings = "{ \"properties\": { \"message\": { \"type\": \"long\" }, \"message2\": { \"type\": \"long\" }, \"message3\": { \"type\": \"long\" } } }"; + assertAcked(client().admin().indices().prepareCreate(INDEX_NAME).setSettings(indexSettings()).setMapping(mappings).get()); + ensureGreen(INDEX_NAME); + + client().prepareIndex(INDEX_NAME).setId("1").setSource("{ \"message\": 4, \"message2\": 3, \"message3\": 4 }", MediaTypeRegistry.JSON).get(); + client().prepareIndex(INDEX_NAME).setId("2").setSource("{ \"message\": 3, \"message2\": 4, \"message3\": 5 }", MediaTypeRegistry.JSON).get(); + client().prepareIndex(INDEX_NAME).setId("3").setSource("{ \"message\": 5, \"message2\": 2, \"message3\": 3 }", MediaTypeRegistry.JSON).get(); + + client().admin().indices().prepareRefresh(INDEX_NAME).get(); + client().admin().indices().prepareFlush(INDEX_NAME).get(); + + String dataNodeName = internalCluster().getDataNodeNames().iterator().next(); + IndexShard indexShard = getIndexShard(dataNodeName, INDEX_NAME); + validateRemoteStoreSegments(indexShard, "before recovery"); + validateCatalogSnapshot(indexShard, "before recovery"); + + // Capture state before recovery for comparison + long docCountBeforeRecovery = indexShard.docStats().getCount(); + long localFilesBeforeRecovery = validateLocalShardFiles(indexShard, "before recovery"); + + String clusterUUID = clusterService().state().metadata().clusterUUID(); + internalCluster().stopRandomDataNode(); + ensureRed(INDEX_NAME); + + internalCluster().startDataOnlyNode(); + ensureStableCluster(2); + + assertAcked(client().admin().indices().prepareClose(INDEX_NAME)); + client().admin().cluster().restoreRemoteStore(new RestoreRemoteStoreRequest().indices(INDEX_NAME).restoreAllShards(true), PlainActionFuture.newFuture()); + ensureGreen(INDEX_NAME); + client().admin().indices().prepareFlush(INDEX_NAME).setForce(true).get(); + + assertEquals("Cluster UUID should remain same", clusterUUID, clusterService().state().metadata().clusterUUID()); + + String newDataNodeName = internalCluster().getDataNodeNames().iterator().next(); + IndexShard recoveredIndexShard = getIndexShard(newDataNodeName, INDEX_NAME); + validateRemoteStoreSegments(recoveredIndexShard, "after recovery"); + validateCatalogSnapshot(recoveredIndexShard, "after recovery"); + + long localFilesAfterRecovery = validateLocalShardFiles(recoveredIndexShard, "after recovery"); + assertTrue("Should have local files after recovery", localFilesAfterRecovery >= 0); + + client().admin().indices().prepareRefresh(INDEX_NAME).get(); + long docCountAfterRecovery = recoveredIndexShard.docStats().getCount(); + + // Verify before/after comparison + assertEquals("Doc count should be same before and after recovery", docCountBeforeRecovery, docCountAfterRecovery); + assertEquals("Local file count should be same before and after recovery", localFilesBeforeRecovery, localFilesAfterRecovery); + + assertAcked(client().admin().indices().prepareDelete(INDEX_NAME).get()); + } + + /** + * Tests DataFusion recovery with multiple Parquet generation files. 
+ */ + public void testDataFusionRecoveryWithMultipleParquetGenerations() throws Exception { + internalCluster().startClusterManagerOnlyNodes(1); + internalCluster().startDataOnlyNodes(1); + ensureStableCluster(2); + + String mappings = "{ \"properties\": { \"message\": { \"type\": \"long\" }, \"message2\": { \"type\": \"long\" }, \"generation\": { \"type\": \"keyword\" } } }"; + assertAcked(client().admin().indices().prepareCreate(INDEX_NAME).setSettings(indexSettings()).setMapping(mappings).get()); + ensureGreen(INDEX_NAME); + + String dataNodeName = internalCluster().getDataNodeNames().iterator().next(); + IndexShard indexShard = getIndexShard(dataNodeName, INDEX_NAME); + + int numGenerations = 4; + for (int gen = 1; gen <= numGenerations; gen++) { + for (int i = 1; i <= 3; i++) { + client().prepareIndex(INDEX_NAME).setId("gen" + gen + "_doc" + i) + .setSource("{ \"message\": " + (gen * 100 + i) + ", \"message2\": " + (gen * 200 + i) + ", \"generation\": \"gen" + gen + "\" }", MediaTypeRegistry.JSON).get(); + } + client().admin().indices().prepareFlush(INDEX_NAME).get(); + client().admin().indices().prepareRefresh(INDEX_NAME).get(); + Thread.sleep(500); + } + + validateRemoteStoreSegments(indexShard, "before recovery"); + RemoteSegmentStoreDirectory remoteDir = indexShard.getRemoteDirectory(); + Map<FileMetadata, UploadedSegmentMetadata> uploadedSegments = remoteDir.getSegmentsUploadedToRemoteStore().entrySet().stream() + .collect(Collectors.toMap(e -> new FileMetadata(e.getKey()), Map.Entry::getValue)); + long parquetFileCount = uploadedSegments.keySet().stream().filter(fm -> "parquet".equals(fm.dataFormat())).count(); + assertTrue("Should have multiple Parquet generation files", parquetFileCount >= numGenerations); + + // Capture state before recovery for comparison + long docCountBeforeRecovery = indexShard.docStats().getCount(); + long localFilesBeforeRecovery = validateLocalShardFiles(indexShard, "before recovery"); + + String clusterUUID = clusterService().state().metadata().clusterUUID(); + internalCluster().stopRandomDataNode(); + ensureRed(INDEX_NAME); + + internalCluster().startDataOnlyNode(); + ensureStableCluster(2); + + assertAcked(client().admin().indices().prepareClose(INDEX_NAME)); + client().admin().cluster().restoreRemoteStore(new RestoreRemoteStoreRequest().indices(INDEX_NAME).restoreAllShards(true), PlainActionFuture.newFuture()); + ensureGreen(INDEX_NAME); + + String newDataNodeName = internalCluster().getDataNodeNames().iterator().next(); + IndexShard recoveredIndexShard = getIndexShard(newDataNodeName, INDEX_NAME); + validateRemoteStoreSegments(recoveredIndexShard, "after recovery"); + + RemoteSegmentStoreDirectory recoveredRemoteDir = recoveredIndexShard.getRemoteDirectory(); + Map<FileMetadata, UploadedSegmentMetadata> recoveredSegments = recoveredRemoteDir.getSegmentsUploadedToRemoteStore().entrySet().stream() + .collect(Collectors.toMap(e -> new FileMetadata(e.getKey()), Map.Entry::getValue)); + long recoveredParquetFileCount = recoveredSegments.keySet().stream().filter(fm -> "parquet".equals(fm.dataFormat())).count(); + assertEquals("Should recover same number of Parquet files", parquetFileCount, recoveredParquetFileCount); + + long localFilesAfterRecovery = validateLocalShardFiles(recoveredIndexShard, "after recovery"); + assertTrue("Should have local files after recovery", localFilesAfterRecovery >= 0); + + client().admin().indices().prepareRefresh(INDEX_NAME).get(); + long docCountAfterRecovery = recoveredIndexShard.docStats().getCount(); + + // Verify before/after comparison + assertEquals("Doc count should be same before and after
recovery", docCountBeforeRecovery, docCountAfterRecovery); + assertEquals("Local file count should be same before and after recovery", localFilesBeforeRecovery, localFilesAfterRecovery); + assertEquals("Cluster UUID should remain same", clusterUUID, clusterService().state().metadata().clusterUUID()); + } + + /** + * Tests DataFusion replica promotion to primary with Parquet format preservation. + */ + public void testDataFusionReplicaPromotionToPrimary() throws Exception { + internalCluster().startClusterManagerOnlyNodes(1); + internalCluster().startDataOnlyNodes(2); + ensureStableCluster(3); + + String mappings = "{ \"properties\": { \"message\": { \"type\": \"long\" }, \"phase\": { \"type\": \"keyword\" } } }"; + assertAcked(client().admin().indices().prepareCreate(INDEX_NAME) + .setSettings(Settings.builder().put(indexSettings()).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1).build()) + .setMapping(mappings).get()); + ensureGreen(INDEX_NAME); + + for (int i = 1; i <= 5; i++) { + client().prepareIndex(INDEX_NAME).setId("primary_doc" + i) + .setSource("{ \"message\": " + (i * 100) + ", \"phase\": \"primary\" }", MediaTypeRegistry.JSON).get(); + } + client().admin().indices().prepareFlush(INDEX_NAME).get(); + client().admin().indices().prepareRefresh(INDEX_NAME).get(); + ensureGreen(INDEX_NAME); + + var clusterState = clusterService().state(); + var shardRouting = clusterState.routingTable().index(INDEX_NAME).shard(0); + String primaryNodeId = shardRouting.primaryShard().currentNodeId(); + String replicaNodeId = shardRouting.replicaShards().get(0).currentNodeId(); + + String primaryNodeName = null, replicaNodeName = null; + for (String nodeName : internalCluster().getNodeNames()) { + String nodeId = internalCluster().clusterService(nodeName).localNode().getId(); + if (nodeId.equals(primaryNodeId)) primaryNodeName = nodeName; + else if (nodeId.equals(replicaNodeId)) replicaNodeName = nodeName; + } + + IndexShard replicaShard = internalCluster().getInstance(org.opensearch.indices.IndicesService.class, replicaNodeName) + .indexServiceSafe(resolveIndex(INDEX_NAME)).getShard(0); + Thread.sleep(2000); + validateRemoteStoreSegments(replicaShard, "replica before promotion"); + + // Capture state before promotion for comparison + long docCountBeforePromotion = replicaShard.docStats().getCount(); + long localFilesBeforePromotion = validateLocalShardFiles(replicaShard, "replica before promotion"); + + internalCluster().stopRandomNode(org.opensearch.test.InternalTestCluster.nameFilter(primaryNodeName)); + ensureStableCluster(2); + ensureYellow(INDEX_NAME); + + IndexShard promotedShard = internalCluster().getInstance(org.opensearch.indices.IndicesService.class, replicaNodeName) + .indexServiceSafe(resolveIndex(INDEX_NAME)).getShard(0); + assertTrue("Former replica should now be primary", promotedShard.routingEntry().primary()); + validateRemoteStoreSegments(promotedShard, "after promotion"); + + Set formats = promotedShard.getRemoteDirectory().getSegmentsUploadedToRemoteStore().entrySet().stream() + .map(e -> new FileMetadata(e.getKey()).dataFormat()).collect(Collectors.toSet()); + assertTrue("Promoted primary should have Parquet files", formats.contains("parquet")); + + for (int i = 1; i <= 3; i++) { + client().prepareIndex(INDEX_NAME).setId("promoted_doc" + i) + .setSource("{ \"message\": " + (i * 200) + ", \"phase\": \"promoted\" }", MediaTypeRegistry.JSON).get(); + } + client().admin().indices().prepareFlush(INDEX_NAME).get(); + client().admin().indices().prepareRefresh(INDEX_NAME).get(); + + 
long localFilesAfterPromotion = validateLocalShardFiles(promotedShard, "after promotion and new docs"); + assertTrue("Should have local files after promotion", localFilesAfterPromotion >= 0); + + // Verify final state (5 original + 3 new docs) + assertEquals("Final document count should match", 8, promotedShard.docStats().getCount()); + // Local files should increase after adding new docs + assertTrue("Local files should exist after new writes", localFilesAfterPromotion >= localFilesBeforePromotion); + } + + /** + * Tests cluster recovery from remote translog when no flush/refresh is performed. + */ + public void testClusterRecoveryFromTranslogWithoutFlush() throws Exception { + internalCluster().startClusterManagerOnlyNodes(1); + internalCluster().startDataOnlyNodes(1); + ensureStableCluster(2); + + String mappings = "{ \"properties\": { \"value\": { \"type\": \"long\" }, \"name\": { \"type\": \"keyword\" } } }"; + assertAcked(client().admin().indices().prepareCreate(INDEX_NAME) + .setSettings(Settings.builder().put(indexSettings()).put("index.translog.durability", "request").build()) + .setMapping(mappings).get()); + ensureGreen(INDEX_NAME); + + int numDocs = 10; + for (int i = 1; i <= numDocs; i++) { + client().prepareIndex(INDEX_NAME).setId("doc" + i) + .setSource("{ \"value\": " + (i * 100) + ", \"name\": \"doc" + i + "\" }", MediaTypeRegistry.JSON).get(); + } + // Intentionally NOT calling flush or refresh - documents exist only in translog + Thread.sleep(1000); + + String dataNodeName = internalCluster().getDataNodeNames().iterator().next(); + IndexShard indexShard = getIndexShard(dataNodeName, INDEX_NAME); + assertTrue("Translog should have uncommitted operations", indexShard.translogStats().getUncommittedOperations() >= numDocs); + + String clusterUUID = clusterService().state().metadata().clusterUUID(); + internalCluster().stopRandomDataNode(); + ensureRed(INDEX_NAME); + + internalCluster().startDataOnlyNode(); + ensureStableCluster(2); + + assertAcked(client().admin().indices().prepareClose(INDEX_NAME)); + client().admin().cluster().restoreRemoteStore(new RestoreRemoteStoreRequest().indices(INDEX_NAME).restoreAllShards(true), PlainActionFuture.newFuture()); + ensureGreen(INDEX_NAME); + client().admin().indices().prepareRefresh(INDEX_NAME).get(); + + String newDataNodeName = internalCluster().getDataNodeNames().iterator().next(); + IndexShard recoveredShard = getIndexShard(newDataNodeName, INDEX_NAME); + + assertBusy(() -> assertTrue("Translog should have processed operations", + recoveredShard.translogStats().estimatedNumberOfOperations() >= 0), 30, TimeUnit.SECONDS); + + long parquetFilesAfterRecovery = validateLocalShardFiles(recoveredShard, "after recovery"); + assertTrue("Should have local files after recovery", parquetFilesAfterRecovery >= 0); + assertEquals("Document count should match", numDocs, recoveredShard.docStats().getCount()); + assertEquals("Cluster UUID should remain same", clusterUUID, clusterService().state().metadata().clusterUUID()); + + assertAcked(client().admin().indices().prepareDelete(INDEX_NAME).get()); + } + + /** + * Tests replica promotion to primary with translog replay for uncommitted operations. 
+ */ + public void testReplicaPromotionWithTranslogReplay() throws Exception { + internalCluster().startClusterManagerOnlyNodes(1); + internalCluster().startDataOnlyNodes(2); + ensureStableCluster(3); + + String mappings = "{ \"properties\": { \"value\": { \"type\": \"long\" }, \"phase\": { \"type\": \"keyword\" } } }"; + assertAcked(client().admin().indices().prepareCreate(INDEX_NAME) + .setSettings(Settings.builder().put(indexSettings()).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1) + .put("index.translog.durability", "request").build()) + .setMapping(mappings).get()); + ensureGreen(INDEX_NAME); + + int initialDocs = 5; + for (int i = 1; i <= initialDocs; i++) { + client().prepareIndex(INDEX_NAME).setId("initial_doc" + i) + .setSource("{ \"value\": " + (i * 100) + ", \"phase\": \"initial\" }", MediaTypeRegistry.JSON).get(); + } + client().admin().indices().prepareFlush(INDEX_NAME).get(); + client().admin().indices().prepareRefresh(INDEX_NAME).get(); + ensureGreen(INDEX_NAME); + + int uncommittedDocs = 7; + for (int i = 1; i <= uncommittedDocs; i++) { + client().prepareIndex(INDEX_NAME).setId("uncommitted_doc" + i) + .setSource("{ \"value\": " + (i * 200) + ", \"phase\": \"uncommitted\" }", MediaTypeRegistry.JSON).get(); + } + // Intentionally NOT calling flush or refresh - docs exist only in translog + Thread.sleep(1000); + + var clusterState = clusterService().state(); + var shardRouting = clusterState.routingTable().index(INDEX_NAME).shard(0); + String primaryNodeId = shardRouting.primaryShard().currentNodeId(); + String replicaNodeId = shardRouting.replicaShards().get(0).currentNodeId(); + + String primaryNodeName = null, replicaNodeName = null; + for (String nodeName : internalCluster().getNodeNames()) { + String nodeId = internalCluster().clusterService(nodeName).localNode().getId(); + if (nodeId.equals(primaryNodeId)) primaryNodeName = nodeName; + else if (nodeId.equals(replicaNodeId)) replicaNodeName = nodeName; + } + assertNotNull("Primary node name should be found", primaryNodeName); + assertNotNull("Replica node name should be found", replicaNodeName); + + IndexShard primaryShard = internalCluster().getInstance(org.opensearch.indices.IndicesService.class, primaryNodeName) + .indexServiceSafe(resolveIndex(INDEX_NAME)).getShard(0); + assertTrue("Primary should have uncommitted translog operations", primaryShard.translogStats().getUncommittedOperations() >= uncommittedDocs); + + IndexShard replicaShard = internalCluster().getInstance(org.opensearch.indices.IndicesService.class, replicaNodeName) + .indexServiceSafe(resolveIndex(INDEX_NAME)).getShard(0); + long replicaFilesBeforePromotion = validateLocalShardFiles(replicaShard, "replica before promotion"); + + String finalReplicaNodeName = replicaNodeName; + internalCluster().stopRandomNode(org.opensearch.test.InternalTestCluster.nameFilter(primaryNodeName)); + ensureStableCluster(2); + + assertBusy(() -> { + var health = client().admin().cluster().prepareHealth(INDEX_NAME).get(); + assertTrue("Index should not be red", health.getStatus() != org.opensearch.cluster.health.ClusterHealthStatus.RED); + }, 30, TimeUnit.SECONDS); + ensureYellow(INDEX_NAME); + + IndexShard promotedShard = internalCluster().getInstance(org.opensearch.indices.IndicesService.class, finalReplicaNodeName) + .indexServiceSafe(resolveIndex(INDEX_NAME)).getShard(0); + assertTrue("Former replica should now be primary", promotedShard.routingEntry().primary()); + + assertBusy(() -> assertTrue("Translog should have processed operations", + 
promotedShard.translogStats().estimatedNumberOfOperations() >= 0), 30, TimeUnit.SECONDS); + + validateRemoteStoreSegments(promotedShard, "after promotion"); + long promotedFilesAfterPromotion = validateLocalShardFiles(promotedShard, "after promotion"); + assertTrue("Promoted primary should have local files", promotedFilesAfterPromotion >= 0); + + client().admin().indices().prepareRefresh(INDEX_NAME).get(); + assertEquals("Document count should include all documents", initialDocs + uncommittedDocs, promotedShard.docStats().getCount()); + + int newDocs = 3; + for (int i = 1; i <= newDocs; i++) { + client().prepareIndex(INDEX_NAME).setId("post_promotion_doc" + i) + .setSource("{ \"value\": " + (i * 300) + ", \"phase\": \"post_promotion\" }", MediaTypeRegistry.JSON).get(); + } + client().admin().indices().prepareFlush(INDEX_NAME).get(); + + assertAcked(client().admin().indices().prepareDelete(INDEX_NAME).get()); + } + + /** + * Tests DataFusion primary restart with extra local commits. + */ + public void testDataFusionPrimaryRestartWithExtraCommits() throws Exception { + internalCluster().startClusterManagerOnlyNodes(1); + internalCluster().startDataOnlyNodes(1); + ensureStableCluster(2); + + String mappings = "{ \"properties\": { \"message\": { \"type\": \"long\" }, \"stage\": { \"type\": \"keyword\" } } }"; + assertAcked(client().admin().indices().prepareCreate(INDEX_NAME).setSettings(indexSettings()).setMapping(mappings).get()); + ensureGreen(INDEX_NAME); + + for (int i = 1; i <= 4; i++) { + client().prepareIndex(INDEX_NAME).setId("initial_doc" + i) + .setSource("{ \"message\": " + (i * 100) + ", \"stage\": \"initial\" }", MediaTypeRegistry.JSON).get(); + } + client().admin().indices().prepareFlush(INDEX_NAME).get(); + client().admin().indices().prepareRefresh(INDEX_NAME).get(); + + String dataNodeName = internalCluster().getDataNodeNames().iterator().next(); + IndexShard indexShard = getIndexShard(dataNodeName, INDEX_NAME); + validateRemoteStoreSegments(indexShard, "initial upload"); + + // Capture state before extra docs and restart for comparison + long docCountAfterInitial = indexShard.docStats().getCount(); + long localFilesAfterInitial = validateLocalShardFiles(indexShard, "after initial flush"); + + for (int i = 1; i <= 3; i++) { + client().prepareIndex(INDEX_NAME).setId("extra_doc" + i) + .setSource("{ \"message\": " + (i * 300) + ", \"stage\": \"extra\" }", MediaTypeRegistry.JSON).get(); + } + + try { + org.apache.lucene.index.SegmentInfos latestCommit = org.apache.lucene.index.SegmentInfos.readLatestCommit(indexShard.store().directory()); + latestCommit.commit(indexShard.store().directory()); + latestCommit.commit(indexShard.store().directory()); + } catch (Exception e) { + logger.warn("--> Could not create extra commits: {}", e.getMessage()); + } + + String nodeToRestart = internalCluster().getDataNodeNames().iterator().next(); + internalCluster().restartNode(nodeToRestart, new org.opensearch.test.InternalTestCluster.RestartCallback() { + @Override + public Settings onNodeStopped(String nodeName) throws Exception { + return super.onNodeStopped(nodeName); + } + }); + ensureStableCluster(2); + ensureGreen(INDEX_NAME); + + String restartedNodeName = internalCluster().getDataNodeNames().iterator().next(); + IndexShard recoveredShard = getIndexShard(restartedNodeName, INDEX_NAME); + validateRemoteStoreSegments(recoveredShard, "after restart"); + + long localFilesAfterRecovery = validateLocalShardFiles(recoveredShard, "after restart"); + assertTrue("Should have local files after 
restart", localFilesAfterRecovery >= 0); + + client().admin().indices().prepareRefresh(INDEX_NAME).get(); + long docCountAfterRestart = recoveredShard.docStats().getCount(); + + // Verify doc count: initial 4 + extra 3 = 7 + assertEquals("Document count should match total docs after restart", 7, docCountAfterRestart); + // Local files should be at least as many as after initial flush + assertTrue("Local files should be preserved after restart", localFilesAfterRecovery >= localFilesAfterInitial); + + client().prepareIndex(INDEX_NAME).setId("post_recovery_doc") + .setSource("{ \"message\": 999, \"stage\": \"post_recovery\" }", MediaTypeRegistry.JSON).get(); + client().admin().indices().prepareFlush(INDEX_NAME).get(); + client().admin().indices().prepareRefresh(INDEX_NAME).get(); + + assertEquals("Final document count should match", 8, recoveredShard.docStats().getCount()); + } +} diff --git a/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionServiceTests.java b/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionServiceTests.java index f6b5c176e41bb..08bb2b2bebc30 100644 --- a/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionServiceTests.java +++ b/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionServiceTests.java @@ -150,7 +150,7 @@ public void testQueryPhaseExecutor() throws IOException { Index index = new Index("index-7", "index-7"); final Path path = Path.of(resourceUrl.toURI()).resolve("index-7").resolve("0"); ShardPath shardPath = new ShardPath(false, path, path, new ShardId(index, 0)); - DatafusionEngine engine = new DatafusionEngine(DataFormat.CSV, List.of(new FileMetadata(DataFormat.CSV.toString(), "generation-1.parquet")), service, shardPath); + DatafusionEngine engine = new DatafusionEngine(DataFormat.CSV, List.of(new FileMetadata(DataFormat.CSV.getName(), "generation-1.parquet")), service, shardPath); datafusionSearcher = engine.acquireSearcher("search"); byte[] protoContent; @@ -289,7 +289,6 @@ public void testQueryThenFetchE2ETest() throws IOException, URISyntaxException, final Path path = Path.of(resourceUrl.toURI()).resolve("index-7").resolve("0"); ShardPath shardPath = new ShardPath(false, path, path, new ShardId(index, 0)); DatafusionEngine engine = new DatafusionEngine(DataFormat.CSV, List.of(new FileMetadata(DataFormat.CSV.toString(), "generation-1.parquet"), new FileMetadata(DataFormat.CSV.toString(), "generation-2.parquet")), service, shardPath); - SearchRequest searchRequest = new SearchRequest().allowPartialSearchResults(true).source(new SearchSourceBuilder().size(9).fetchSource(List.of("message").toArray(String[]::new), null)); ShardSearchRequest shardSearchRequest = new ShardSearchRequest( OriginalIndices.NONE, diff --git a/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionSingleNodeTests.java b/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionSingleNodeTests.java index 505a55e1514ec..98c0939122b84 100644 --- a/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionSingleNodeTests.java +++ b/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionSingleNodeTests.java @@ -31,12 +31,14 @@ import java.util.List; import java.util.Locale; + @OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST) public class DataFusionSingleNodeTests extends OpenSearchSingleNodeTestCase { private static final String INDEX_MAPPING_JSON = 
"clickbench_index_mapping.json"; private static final String DATA = "clickbench.json"; private final String indexName = "hits"; + private static final String REPOSITORY_NAME = "test-remote-store-repo"; @Override protected Collection> getPlugins() { @@ -54,6 +56,8 @@ public void testClickBenchQueries() throws IOException { .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) .put("index.refresh_interval", -1) + .put("index.replication.type", "SEGMENT") + .put("index.optimized.enabled", true)// Enable segment replication for remote store .build(), mappings ); @@ -76,8 +80,8 @@ public void testClickBenchQueries() throws IOException { XContentParser parser = createParser(JsonXContent.jsonXContent, sourceFile); source.parseXContent(parser); + SearchResponse response = client().prepareSearch(indexName).setSource(source).get(); - // TODO: Match expected results... System.out.println(response); } diff --git a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3BlobStore.java b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3BlobStore.java index 5b677a49694a2..677b3d3fdfd5d 100644 --- a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3BlobStore.java +++ b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3BlobStore.java @@ -230,7 +230,7 @@ public boolean serverSideEncryptionBucketKey() { * null as the S3 client ignores null header values */ public String serverSideEncryptionEncryptionContext() { - return serverSideEncryptionEncryptionContext.isEmpty() + return serverSideEncryptionEncryptionContext == null || serverSideEncryptionEncryptionContext.isEmpty() ? null : Base64.getEncoder().encodeToString(serverSideEncryptionEncryptionContext.getBytes(StandardCharsets.UTF_8)); } @@ -239,7 +239,7 @@ public String serverSideEncryptionEncryptionContext() { * Returns the expected bucket owner if set, else null as the S3 client ignores null header values */ public String expectedBucketOwner() { - return expectedBucketOwner.isEmpty() ? null : expectedBucketOwner; + return expectedBucketOwner == null || expectedBucketOwner.isEmpty() ? null : expectedBucketOwner; } public long bufferSizeInBytes() { diff --git a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3Repository.java b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3Repository.java index 8d8de283f75bb..a8a46065092de 100644 --- a/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3Repository.java +++ b/plugins/repository-s3/src/main/java/org/opensearch/repositories/s3/S3Repository.java @@ -683,4 +683,10 @@ protected void doClose() { } super.doClose(); } + + @Override + public boolean isSeverSideEncryptionEnabled() { + // s3 is always server side encrypted. 
+ return true; + } } diff --git a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3RepositoryTests.java b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3RepositoryTests.java index 49c6a31e32816..d75598fb6b782 100644 --- a/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3RepositoryTests.java +++ b/plugins/repository-s3/src/test/java/org/opensearch/repositories/s3/S3RepositoryTests.java @@ -33,6 +33,7 @@ package org.opensearch.repositories.s3; import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.ServerSideEncryption; import org.opensearch.cluster.metadata.RepositoryMetadata; import org.opensearch.common.blobstore.BlobStoreException; @@ -175,6 +176,18 @@ public void testValidateHttpLClientType_Invalid_Values() { } } + public void testIsSeverSideEncryptionEnabled_When_AWSKMS_Type() { + Settings settings = Settings.builder() + .put(S3Repository.SERVER_SIDE_ENCRYPTION_TYPE_SETTING.getKey(), ServerSideEncryption.AWS_KMS.toString()) + .build(); + final RepositoryMetadata metadata = new RepositoryMetadata("dummy-repo", "mock", settings); + try (S3Repository s3Repo = createS3Repo(metadata)) { + + // Don't expect any Exception + assertTrue(s3Repo.isSeverSideEncryptionEnabled()); + } + } + private S3Repository createS3Repo(RepositoryMetadata metadata) { return new S3Repository( metadata, diff --git a/server/src/main/java/org/opensearch/action/admin/cluster/remotestore/metadata/TransportRemoteStoreMetadataAction.java b/server/src/main/java/org/opensearch/action/admin/cluster/remotestore/metadata/TransportRemoteStoreMetadataAction.java index 16c29d7586a98..c72820d8df19a 100644 --- a/server/src/main/java/org/opensearch/action/admin/cluster/remotestore/metadata/TransportRemoteStoreMetadataAction.java +++ b/server/src/main/java/org/opensearch/action/admin/cluster/remotestore/metadata/TransportRemoteStoreMetadataAction.java @@ -198,7 +198,9 @@ private Map> getSegmentMetadata( IndexMetadata.INDEX_REMOTE_SEGMENT_STORE_REPOSITORY_SETTING.get(indexMetadata.getSettings()), index.getUUID(), shardId, - indexSettings.getRemoteStorePathStrategy() + indexSettings.getRemoteStorePathStrategy(), + null, + RemoteStoreUtils.isServerSideEncryptionEnabledIndex(indexSettings.getIndexMetadata()) ); Map segmentMetadataMapWithFilenames = remoteDirectory.readLatestNMetadataFiles(5); @@ -257,7 +259,8 @@ private Map> getTranslogMetadataFiles( tracker, indexSettings.getRemoteStorePathStrategy(), new RemoteStoreSettings(clusterService.getSettings(), clusterService.getClusterSettings()), - RemoteStoreUtils.determineTranslogMetadataEnabled(indexMetadata) + RemoteStoreUtils.determineTranslogMetadataEnabled(indexMetadata), + RemoteStoreUtils.isServerSideEncryptionEnabledIndex(indexSettings.getIndexMetadata()) ); Map metadataMap = manager.readLatestNMetadataFiles(5); diff --git a/server/src/main/java/org/opensearch/cluster/metadata/IndexMetadata.java b/server/src/main/java/org/opensearch/cluster/metadata/IndexMetadata.java index 4d53a547db714..95d2e13c6d417 100644 --- a/server/src/main/java/org/opensearch/cluster/metadata/IndexMetadata.java +++ b/server/src/main/java/org/opensearch/cluster/metadata/IndexMetadata.java @@ -992,6 +992,7 @@ public Iterator> settings() { public static final String KEY_PRIMARY_TERMS = "primary_terms"; public static final String REMOTE_STORE_CUSTOM_KEY = "remote_store"; public static final String TRANSLOG_METADATA_KEY = "translog_metadata"; + public static final String REMOTE_STORE_SSE_ENABLED_INDEX_KEY 
= "sse_enabled_index"; public static final String CONTEXT_KEY = "context"; public static final String INGESTION_SOURCE_KEY = "ingestion_source"; public static final String INGESTION_STATUS_KEY = "ingestion_status"; diff --git a/server/src/main/java/org/opensearch/cluster/metadata/MetadataCreateIndexService.java b/server/src/main/java/org/opensearch/cluster/metadata/MetadataCreateIndexService.java index a889091140d12..7bc8c9ccb0855 100644 --- a/server/src/main/java/org/opensearch/cluster/metadata/MetadataCreateIndexService.java +++ b/server/src/main/java/org/opensearch/cluster/metadata/MetadataCreateIndexService.java @@ -632,7 +632,8 @@ static Optional validateOverlap(Set requestSettings, Settings co IndexMetadata buildAndValidateTemporaryIndexMetadata( final Settings aggregatedIndexSettings, final CreateIndexClusterStateUpdateRequest request, - final int routingNumShards + final int routingNumShards, + final ClusterState clusterState ) { final boolean isHiddenAfterTemplates = IndexMetadata.INDEX_HIDDEN_SETTING.get(aggregatedIndexSettings); @@ -642,7 +643,7 @@ IndexMetadata buildAndValidateTemporaryIndexMetadata( tmpImdBuilder.setRoutingNumShards(routingNumShards); tmpImdBuilder.settings(aggregatedIndexSettings); tmpImdBuilder.system(isSystem); - addRemoteStoreCustomMetadata(tmpImdBuilder, true); + addRemoteStoreCustomMetadata(tmpImdBuilder, true, clusterState); if (request.context() != null) { tmpImdBuilder.context(request.context()); @@ -661,7 +662,9 @@ IndexMetadata buildAndValidateTemporaryIndexMetadata( * @param tmpImdBuilder index metadata builder. * @param assertNullOldType flag to verify that the old remote store path type is null */ - public void addRemoteStoreCustomMetadata(IndexMetadata.Builder tmpImdBuilder, boolean assertNullOldType) { + public void addRemoteStoreCustomMetadata(IndexMetadata.Builder tmpImdBuilder, boolean assertNullOldType, ClusterState clusterState) { + + boolean isRestoreFromSnapshot = !assertNullOldType; if (remoteStoreCustomMetadataResolver == null) { return; } @@ -676,6 +679,24 @@ public void addRemoteStoreCustomMetadata(IndexMetadata.Builder tmpImdBuilder, bo boolean isTranslogMetadataEnabled = remoteStoreCustomMetadataResolver.isTranslogMetadataEnabled(); remoteCustomData.put(IndexMetadata.TRANSLOG_METADATA_KEY, Boolean.toString(isTranslogMetadataEnabled)); + Optional remoteNode = clusterState.nodes() + .getNodes() + .values() + .stream() + .filter(DiscoveryNode::isRemoteStoreNode) + .findFirst(); + + String sseEnabledIndex = existingCustomData == null + ? null + : existingCustomData.get(IndexMetadata.REMOTE_STORE_SSE_ENABLED_INDEX_KEY); + if (isRestoreFromSnapshot && sseEnabledIndex != null) { + remoteCustomData.put(IndexMetadata.REMOTE_STORE_SSE_ENABLED_INDEX_KEY, sseEnabledIndex); + } else if (remoteNode.isPresent() + && !isRestoreFromSnapshot + && remoteStoreCustomMetadataResolver.isRemoteStoreRepoServerSideEncryptionEnabled()) { + remoteCustomData.put(IndexMetadata.REMOTE_STORE_SSE_ENABLED_INDEX_KEY, Boolean.toString(true)); + } + // Determine the path type for use using the remoteStorePathResolver. 
RemoteStorePathStrategy newPathStrategy = remoteStoreCustomMetadataResolver.getPathStrategy(); remoteCustomData.put(PathType.NAME, newPathStrategy.getType().name()); @@ -730,7 +751,7 @@ private ClusterState applyCreateIndexRequestWithV1Templates( clusterService.getClusterSettings() ); int routingNumShards = getIndexNumberOfRoutingShards(aggregatedIndexSettings, null); - IndexMetadata tmpImd = buildAndValidateTemporaryIndexMetadata(aggregatedIndexSettings, request, routingNumShards); + IndexMetadata tmpImd = buildAndValidateTemporaryIndexMetadata(aggregatedIndexSettings, request, routingNumShards, currentState); return applyCreateIndexWithTemporaryService( currentState, @@ -795,7 +816,7 @@ private ClusterState applyCreateIndexRequestWithV2Template( clusterService.getClusterSettings() ); int routingNumShards = getIndexNumberOfRoutingShards(aggregatedIndexSettings, null); - IndexMetadata tmpImd = buildAndValidateTemporaryIndexMetadata(aggregatedIndexSettings, request, routingNumShards); + IndexMetadata tmpImd = buildAndValidateTemporaryIndexMetadata(aggregatedIndexSettings, request, routingNumShards, currentState); return applyCreateIndexWithTemporaryService( currentState, @@ -879,7 +900,7 @@ private ClusterState applyCreateIndexRequestWithExistingMetadata( clusterService.getClusterSettings() ); final int routingNumShards = getIndexNumberOfRoutingShards(aggregatedIndexSettings, sourceMetadata); - IndexMetadata tmpImd = buildAndValidateTemporaryIndexMetadata(aggregatedIndexSettings, request, routingNumShards); + IndexMetadata tmpImd = buildAndValidateTemporaryIndexMetadata(aggregatedIndexSettings, request, routingNumShards, currentState); return applyCreateIndexWithTemporaryService( currentState, @@ -1177,8 +1198,8 @@ public static void updateRemoteStoreSettings( .findFirst(); if (remoteNode.isPresent()) { - translogRepo = RemoteStoreNodeAttribute.getTranslogRepoName(remoteNode.get().getAttributes()); segmentRepo = RemoteStoreNodeAttribute.getSegmentRepoName(remoteNode.get().getAttributes()); + translogRepo = RemoteStoreNodeAttribute.getTranslogRepoName(remoteNode.get().getAttributes()); if (segmentRepo != null) { settingsBuilder.put(SETTING_REMOTE_STORE_ENABLED, true).put(SETTING_REMOTE_SEGMENT_STORE_REPOSITORY, segmentRepo); if (translogRepo != null) { diff --git a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java index 9b33bde32cb54..9efcff47166f0 100644 --- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java @@ -807,6 +807,8 @@ public void apply(Settings value, Settings current, Settings previous) { RemoteStoreSettings.CLUSTER_REMOTE_STORE_PINNED_TIMESTAMP_ENABLED, RemoteStoreSettings.CLUSTER_REMOTE_STORE_SEGMENTS_PATH_PREFIX, RemoteStoreSettings.CLUSTER_REMOTE_STORE_TRANSLOG_PATH_PREFIX, + // Server Side encryption enabled + RemoteStoreSettings.CLUSTER_SERVER_SIDE_ENCRYPTION_ENABLED, // Snapshot related Settings BlobStoreRepository.SNAPSHOT_SHARD_PATH_PREFIX_SETTING, diff --git a/server/src/main/java/org/opensearch/index/IndexService.java b/server/src/main/java/org/opensearch/index/IndexService.java index 972b1c54d300f..939f6d2944073 100644 --- a/server/src/main/java/org/opensearch/index/IndexService.java +++ b/server/src/main/java/org/opensearch/index/IndexService.java @@ -84,6 +84,7 @@ import org.opensearch.index.query.QueryShardContext; import 
org.opensearch.index.query.SearchIndexNameMatcher; import org.opensearch.index.remote.RemoteStoreStatsTrackerFactory; +import org.opensearch.index.remote.RemoteStoreUtils; import org.opensearch.index.seqno.RetentionLeaseSyncer; import org.opensearch.index.shard.IndexEventListener; import org.opensearch.index.shard.IndexShard; @@ -719,7 +720,8 @@ public synchronized IndexShard createShard( this.indexSettings.getUUID(), shardId, this.indexSettings.getRemoteStorePathStrategy(), - this.indexSettings.getRemoteStoreSegmentPathPrefix() + this.indexSettings.getRemoteStoreSegmentPathPrefix(), + RemoteStoreUtils.isServerSideEncryptionEnabledIndex(this.indexSettings.getIndexMetadata()) ); } // When an instance of Store is created, a shardlock is created which is released on closing the instance of store. @@ -736,7 +738,9 @@ protected void closeInternal() { // Do nothing for shard lock on remote store } }; - CompositeStoreDirectory remoteCompositeStoreDirectory = createCompositeStoreDirectory(path); + CompositeStoreDirectory remoteCompositeStoreDirectory = this.indexSettings.isOptimizedIndex() + ? createCompositeStoreDirectory(shardId, path) + : null; remoteStore = new Store(shardId, this.indexSettings, remoteDirectory, remoteStoreLock, Store.OnClose.EMPTY, path, remoteCompositeStoreDirectory); } else { // Disallow shards with remote store based settings to be created on non-remote store enabled nodes @@ -767,7 +771,9 @@ protected void closeInternal() { directory = directoryFactory.newDirectory(this.indexSettings, path); } - CompositeStoreDirectory compositeStoreDirectory = createCompositeStoreDirectory(path); + CompositeStoreDirectory compositeStoreDirectory = this.indexSettings.isOptimizedIndex() + ? createCompositeStoreDirectory(shardId, path) + : null; store = new Store( shardId, @@ -1366,11 +1372,12 @@ final IndexStorePlugin.DirectoryFactory getDirectoryFactory() { * Creates CompositeStoreDirectory using the factory if available, otherwise fallback to Store's internal creation. * This method centralizes the directory creation logic and enables plugin-based format discovery. */ - private CompositeStoreDirectory createCompositeStoreDirectory(ShardPath shardPath) throws IOException { + private CompositeStoreDirectory createCompositeStoreDirectory(ShardId shardId, ShardPath shardPath) throws IOException { if (compositeStoreDirectoryFactory != null) { logger.debug("Using CompositeStoreDirectoryFactory to create directory for shard path: {}", shardPath); return compositeStoreDirectoryFactory.newCompositeStoreDirectory( indexSettings, + shardId, shardPath, pluginsService ); diff --git a/server/src/main/java/org/opensearch/index/engine/CombinedDeletionPolicy.java b/server/src/main/java/org/opensearch/index/engine/CombinedDeletionPolicy.java index 338112745eb54..4589455ab5d6e 100644 --- a/server/src/main/java/org/opensearch/index/engine/CombinedDeletionPolicy.java +++ b/server/src/main/java/org/opensearch/index/engine/CombinedDeletionPolicy.java @@ -175,10 +175,15 @@ public SafeCommitInfo getSafeCommitInfo() { * Index files of the capturing commit point won't be released until the commit reference is closed. * * @param acquiringSafeCommit captures the most recent safe commit point if true; otherwise captures the most recent commit point. 
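+     * @return the captured index commit; its files are retained until the returned commit reference is released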
+ * @throws EngineNotInitializedException if the deletion policy has not been initialized yet (no commits exist) */ public synchronized IndexCommit acquireIndexCommit(boolean acquiringSafeCommit) { - assert safeCommit != null : "Safe commit is not initialized yet"; - assert lastCommit != null : "Last commit is not initialized yet"; + if (safeCommit == null) { + throw new EngineNotInitializedException("Safe commit is not initialized yet - deletion policy has not processed any commits"); + } + if (lastCommit == null) { + throw new EngineNotInitializedException("Last commit is not initialized yet - deletion policy has not processed any commits"); + } final IndexCommit snapshotting = acquiringSafeCommit ? safeCommit : lastCommit; snapshottedCommits.merge(snapshotting, 1, Integer::sum); // increase refCount return new SnapshotIndexCommit(snapshotting); diff --git a/server/src/main/java/org/opensearch/index/engine/Engine.java b/server/src/main/java/org/opensearch/index/engine/Engine.java index f9898382ffbdc..92938e6728192 100644 --- a/server/src/main/java/org/opensearch/index/engine/Engine.java +++ b/server/src/main/java/org/opensearch/index/engine/Engine.java @@ -84,6 +84,9 @@ import org.opensearch.index.engine.exec.bridge.IndexingThrottler; import org.opensearch.index.engine.exec.bridge.StatsHolder; import org.opensearch.index.engine.exec.composite.CompositeDataFormatWriter; +import org.opensearch.index.engine.exec.coord.CatalogSnapshot; +import org.opensearch.index.engine.exec.coord.CompositeEngine; +import org.opensearch.index.engine.exec.coord.SegmentInfosCatalogSnapshot; import org.opensearch.index.mapper.IdFieldMapper; import org.opensearch.index.mapper.Mapping; import org.opensearch.index.mapper.ParseContext.Document; @@ -301,6 +304,19 @@ public long getMaxSeqNoFromSegmentInfos(SegmentInfos segmentInfos) throws IOExce } } + @Override + public CompositeEngine.ReleasableRef acquireSnapshot() { + GatedCloseable segmentInfosCloseable = getSegmentInfosSnapshot(); + return new CompositeEngine.ReleasableRef( + new SegmentInfosCatalogSnapshot(segmentInfosCloseable.get()) + ) { + @Override + public void close() throws Exception { + segmentInfosCloseable.close(); + } + }; + } + /** * Get max sequence number that is part of given searcher. Sequence number is part of each document that is indexed. 
* This method fetches the _id of last indexed document that was part of the given searcher and diff --git a/server/src/main/java/org/opensearch/index/engine/InternalEngine.java b/server/src/main/java/org/opensearch/index/engine/InternalEngine.java index 4bfa1cbef71ab..814b5848c5be3 100644 --- a/server/src/main/java/org/opensearch/index/engine/InternalEngine.java +++ b/server/src/main/java/org/opensearch/index/engine/InternalEngine.java @@ -559,7 +559,7 @@ public final boolean assertSearcherIsWarmedUp(String source, SearcherScope scope case "segments_stats": break; default: -// assert externalReaderManager.isWarmedUp : "searcher was not warmed up yet for source[" + source + "]"; + // assert externalReaderManager.isWarmedUp : "searcher was not warmed up yet for source[" + source + "]"; } } return true; diff --git a/server/src/main/java/org/opensearch/index/engine/NRTReplicationCompositeEngine.java b/server/src/main/java/org/opensearch/index/engine/NRTReplicationCompositeEngine.java new file mode 100644 index 0000000000000..06c19e3786172 --- /dev/null +++ b/server/src/main/java/org/opensearch/index/engine/NRTReplicationCompositeEngine.java @@ -0,0 +1,466 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.index.engine; + +import org.apache.logging.log4j.Logger; +import org.apache.lucene.index.IndexCommit; +import org.apache.lucene.index.SegmentInfos; +import org.apache.lucene.search.ReferenceManager; +import org.apache.lucene.store.AlreadyClosedException; +import org.opensearch.common.concurrent.GatedCloseable; +import org.opensearch.common.logging.Loggers; +import org.opensearch.common.util.concurrent.ReleasableLock; +import org.opensearch.common.util.io.IOUtils; +import org.opensearch.core.index.shard.ShardId; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.engine.exec.coord.CatalogSnapshot; +import org.opensearch.index.engine.exec.coord.CatalogSnapshotManager; +import org.opensearch.index.engine.exec.coord.CompositeEngine; +import org.opensearch.index.engine.exec.coord.CompositeEngineCatalogSnapshot; +import org.opensearch.index.engine.exec.coord.SegmentInfosCatalogSnapshot; +import org.opensearch.index.engine.exec.commit.LuceneCommitEngine; +import org.opensearch.index.engine.exec.FileMetadata; +import org.opensearch.index.engine.SearchExecEngine; +import org.opensearch.index.mapper.MapperService; +import org.opensearch.index.seqno.LocalCheckpointTracker; +import org.opensearch.index.seqno.SeqNoStats; +import org.opensearch.index.seqno.SequenceNumbers; +import org.opensearch.index.shard.ShardPath; +import org.opensearch.index.translog.Translog; +import org.opensearch.index.translog.TranslogCorruptedException; +import org.opensearch.index.translog.TranslogDeletionPolicy; +import org.opensearch.index.translog.DefaultTranslogDeletionPolicy; +import org.opensearch.index.translog.TranslogException; +import org.opensearch.index.translog.TranslogManager; +import org.opensearch.index.translog.TranslogOperationHelper; +import org.opensearch.index.translog.WriteOnlyTranslogManager; +import org.opensearch.index.translog.listener.TranslogEventListener; +import org.opensearch.plugins.PluginsService; +import org.opensearch.plugins.SearchEnginePlugin; +import org.opensearch.plugins.spi.vectorized.DataFormat; +import org.opensearch.search.suggest.completion.CompletionStats; + +import 
java.io.Closeable; +import java.io.IOException; +import java.util.*; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; +import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.function.BiFunction; +import java.util.stream.Collectors; + +import static org.opensearch.index.seqno.SequenceNumbers.MAX_SEQ_NO; + +/** + * Engine implementation for replica shards with optimized (multi-format) indices. + * Combines segment replication behavior (translog-only writes) with multi-format support. + * + * Similar to NRTReplicationEngine but for optimized indices using CatalogSnapshot. + */ +public class NRTReplicationCompositeEngine extends CompositeEngine { + + private final ShardId shardId; + private final Logger logger; + private final WriteOnlyTranslogManager translogManager; + private final LocalCheckpointTracker localCheckpointTracker; + private final AtomicBoolean isClosed = new AtomicBoolean(false); + private final List refreshListeners = new ArrayList<>(); + private final ReentrantReadWriteLock rwl = new ReentrantReadWriteLock(); + private final ReleasableLock readLock = new ReleasableLock(rwl.readLock()); + private final ReleasableLock writeLock = new ReleasableLock(rwl.writeLock()); + private final Lock flushLock = new ReentrantLock(); + private final Map>> readEngines = new HashMap<>(); + + private volatile long lastReceivedPrimaryGen = SequenceNumbers.NO_OPS_PERFORMED; + + public NRTReplicationCompositeEngine( + EngineConfig engineConfig, + MapperService mapperService, + PluginsService pluginsService, + IndexSettings indexSettings, + ShardPath shardPath, + BiFunction localCheckpointTrackerSupplier, + TranslogEventListener translogEventListener + ) { + super(engineConfig, mapperService, pluginsService, indexSettings, shardPath, localCheckpointTrackerSupplier, translogEventListener); + this.shardId = engineConfig.getShardId(); + this.logger = Loggers.getLogger(NRTReplicationCompositeEngine.class, shardId); + + store.incRef(); + WriteOnlyTranslogManager translogManagerRef = null; + CatalogSnapshotManager catalogSnapshotManagerRef = null; + boolean success = false; + + try { + // Read last committed segment infos + final SegmentInfos lastCommittedSegmentInfos = store.readLastCommittedSegmentsInfo(); + final Map userData = lastCommittedSegmentInfos.getUserData(); + final String translogUUID = Objects.requireNonNull(userData.get(Translog.TRANSLOG_UUID_KEY)); + + // Initialize local checkpoint tracker + final SequenceNumbers.CommitInfo commitInfo = SequenceNumbers.loadSeqNoInfoFromLuceneCommit(userData.entrySet()); + this.localCheckpointTracker = localCheckpointTrackerSupplier.apply(commitInfo.maxSeqNo, commitInfo.localCheckpoint); + + // Register ONLY internal refresh listeners (not external) + for (ReferenceManager.RefreshListener listener : engineConfig.getInternalRefreshListener()) { + this.refreshListeners.add(listener); + } + + // Create write-only translog manager + TranslogEventListener internalTranslogEventListener = new TranslogEventListener() { + @Override + public void onFailure(String reason, Exception ex) { + failEngine(reason, ex); + } + + @Override + public void onAfterTranslogSync() { + try { + translogManager.trimUnreferencedReaders(); + } catch (IOException ex) { + throw new TranslogException(shardId, "failed to trim unreferenced translog readers", ex); + } + } + }; + + translogManagerRef = new WriteOnlyTranslogManager( + 
engineConfig.getTranslogConfig(), + engineConfig.getPrimaryTermSupplier(), + engineConfig.getGlobalCheckpointSupplier(), + new DefaultTranslogDeletionPolicy( + engineConfig.getIndexSettings().getTranslogRetentionSize().getBytes(), + engineConfig.getIndexSettings().getTranslogRetentionAge().getMillis(), + engineConfig.getIndexSettings().getTranslogRetentionTotalFiles() + ), + shardId, + readLock, + this::getLocalCheckpointTracker, + translogUUID, + internalTranslogEventListener, + this, + engineConfig.getTranslogFactory(), + engineConfig.getStartedPrimarySupplier(), + TranslogOperationHelper.create(engineConfig) + ); + this.translogManager = translogManagerRef; + + success = true; + } catch (IOException | TranslogCorruptedException e) { + throw new EngineCreationFailureException(shardId, "failed to create NRTReplicationCompositeEngine", e); + } finally { + if (!success) { + if (translogManagerRef != null) { + try { + translogManagerRef.close(); + } catch (Exception e) { + logger.warn("Failed to close translog manager", e); + } + } + // CatalogSnapshotManager doesn't implement Closeable + if (isClosed.get() == false) { + store.decRef(); + } + } + } + } + + // Translog-only operations (from NRTReplicationEngine) + + @Override + public Engine.IndexResult index(Engine.Index index) throws IOException { + ensureOpen(); + Engine.IndexResult indexResult = new Engine.IndexResult(index.version(), index.primaryTerm(), index.seqNo(), false); + final Translog.Location location = translogManager.add(new Translog.Index(index, indexResult)); + indexResult.setTranslogLocation(location); + indexResult.setTook(System.nanoTime() - index.startTime()); + indexResult.freeze(); + localCheckpointTracker.advanceMaxSeqNo(index.seqNo()); + return indexResult; + } + + @Override + public Engine.DeleteResult delete(Engine.Delete delete) throws IOException { + ensureOpen(); + Engine.DeleteResult deleteResult = new Engine.DeleteResult(delete.version(), delete.primaryTerm(), delete.seqNo(), true); + final Translog.Location location = translogManager.add(new Translog.Delete(delete, deleteResult)); + deleteResult.setTranslogLocation(location); + deleteResult.setTook(System.nanoTime() - delete.startTime()); + deleteResult.freeze(); + localCheckpointTracker.advanceMaxSeqNo(delete.seqNo()); + return deleteResult; + } + + @Override + public Engine.NoOpResult noOp(Engine.NoOp noOp) throws IOException { + ensureOpen(); + Engine.NoOpResult noOpResult = new Engine.NoOpResult(noOp.primaryTerm(), noOp.seqNo()); + final Translog.Location location = translogManager.add(new Translog.NoOp(noOp.seqNo(), noOp.primaryTerm(), noOp.reason())); + noOpResult.setTranslogLocation(location); + noOpResult.setTook(System.nanoTime() - noOp.startTime()); + noOpResult.freeze(); + localCheckpointTracker.advanceMaxSeqNo(noOp.seqNo()); + return noOpResult; + } + + /** + * Updates segments from primary using CatalogSnapshot. + * CRITICAL: Invokes refresh listeners to update replication checkpoint. 
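+     * Also flushes and rolls the translog generation whenever the incoming primary generation differs from the last received one.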
+ */ + public synchronized void updateSegments(final CatalogSnapshot catalogSnapshot) throws IOException { + try (ReleasableLock lock = writeLock.acquire()) { + ensureOpen(); + + final long maxSeqNo = Long.parseLong(catalogSnapshot.getUserData().get(MAX_SEQ_NO)); + final long incomingGeneration = catalogSnapshot.getGeneration(); + + // For replicas, catalog is managed externally - just track the generation + // The catalog snapshot is already applied by IndexShard.finalizeReplication() + + // Invoke refresh listeners + invokeRefreshListeners(true); + + // Flush if generation changed + if (incomingGeneration != this.lastReceivedPrimaryGen) { + flush(false, true); + translogManager.getDeletionPolicy().setLocalCheckpointOfSafeCommit(maxSeqNo); + translogManager.rollTranslogGeneration(); + } + + this.lastReceivedPrimaryGen = incomingGeneration; + localCheckpointTracker.fastForwardProcessedSeqNo(maxSeqNo); + } + } + + public void finalizeReplication(CatalogSnapshot catalogSnapshot, ShardPath shardPath) throws IOException { + catalogSnapshotManager.applyReplicationChanges(catalogSnapshot, shardPath); + + if (catalogSnapshot != null) { + long maxGenerationInSnapshot = catalogSnapshot.getLastWriterGeneration(); + engine.updateWriterGenerationIfNeeded(maxGenerationInSnapshot); + } + + updateSearchEngine(); + updateSegments(catalogSnapshot); + } + + private void invokeRefreshListeners(boolean didRefresh) { + // Call beforeRefresh + refreshListeners.forEach(listener -> { + try { + listener.beforeRefresh(); + } catch (IOException e) { + logger.error("refresh listener beforeRefresh failed", e); + throw new RuntimeException(e); + } + }); + + // Call afterRefresh - ReplicationCheckpointUpdater runs here + refreshListeners.forEach(listener -> { + try { + listener.afterRefresh(didRefresh); + } catch (IOException e) { + logger.error("refresh listener afterRefresh failed", e); + throw new RuntimeException(e); + } + }); + } + + @Override + public void refresh(String source) throws EngineException { + // No-op for replicas + } + + @Override + public TranslogManager translogManager() { + return translogManager; + } + + @Override + public void writeIndexingBuffer() throws EngineException { + // No-op + } + + @Override + public boolean shouldPeriodicallyFlush() { + return false; + } + + @Override + public void flush(boolean force, boolean waitIfOngoing) throws EngineException { + ensureOpen(); + if (engineConfig.getIndexSettings().isWarmIndex()) { + return; + } + try (final ReleasableLock lock = readLock.acquire()) { + ensureOpen(); + if (flushLock.tryLock() == false) { + if (waitIfOngoing == false) { + return; + } + flushLock.lock(); + } + try { + // For replicas, flush is minimal - just update translog deletion policy + translogManager.getDeletionPolicy().setLocalCheckpointOfSafeCommit( + localCheckpointTracker.getProcessedCheckpoint() + ); + } catch (Exception e) { + maybeFailEngine("flush", e); + throw new FlushFailedEngineException(shardId, e); + } finally { + flushLock.unlock(); + } + } + } + + // Checkpoint methods + + @Override + public long getPersistedLocalCheckpoint() { + return localCheckpointTracker.getPersistedCheckpoint(); + } + + @Override + public long getProcessedLocalCheckpoint() { + return localCheckpointTracker.getProcessedCheckpoint(); + } + + @Override + public SeqNoStats getSeqNoStats(long globalCheckpoint) { + return localCheckpointTracker.getStats(globalCheckpoint); + } + + @Override + public long getLastSyncedGlobalCheckpoint() { + return translogManager.getLastSyncedGlobalCheckpoint(); 
+ } + + // Metadata methods + + @Override + public long getIndexThrottleTimeInMillis() { + return 0; + } + + @Override + public boolean isThrottled() { + return false; + } + + @Override + public long getIndexBufferRAMBytesUsed() { + return 0; + } + + // Throttling methods + + @Override + public void activateThrottling() { + // No-op + } + + @Override + public void deactivateThrottling() { + // No-op + } + + // Unsupported operations for replicas + + @Override + public void forceMerge( + boolean flush, + int maxNumSegments, + boolean onlyExpungeDeletes, + boolean upgrade, + boolean upgradeOnlyAncientSegments, + String forceMergeUUID + ) throws EngineException, IOException { + // No-op - replicas don't merge + } + + @Override + public int fillSeqNoGaps(long primaryTerm) throws IOException { + return 0; + } + + @Override + public SafeCommitInfo getSafeCommitInfo() { + return new SafeCommitInfo(localCheckpointTracker.getProcessedCheckpoint(), 0); + } + + @Override + protected void closeNoLock(String reason, CountDownLatch closedLatch) { + if (isClosed.compareAndSet(false, true)) { + try { + logger.debug("closing NRTReplicationCompositeEngine, reason: {}", reason); + + // Close translog manager + if (translogManager != null) { + translogManager.close(); + } + store.decRef(); + } catch (Exception e) { + logger.error("failed to close NRTReplicationCompositeEngine", e); + } + } + super.closeNoLock(reason, closedLatch); + } + + public LocalCheckpointTracker getLocalCheckpointTracker() { + return localCheckpointTracker; + } + + @Override + public void updateMaxUnsafeAutoIdTimestamp(long newTimestamp) { + // No-op + } + + @Override + public long getMaxSeqNoOfUpdatesOrDeletes() { + return SequenceNumbers.UNASSIGNED_SEQ_NO; + } + + @Override + public void advanceMaxSeqNoOfUpdatesOrDeletes(long seqNo) { + // No-op for replicas + } + + @Override + public Translog.Snapshot newChangesSnapshot( + String source, + long fromSeqNo, + long toSeqNo, + boolean requiredFullRange, + boolean accurateCount + ) throws IOException { + throw new UnsupportedOperationException("Not implemented"); + } + + @Override + public int countNumberOfHistoryOperations(String source, long fromSeqNo, long toSeqNumber) throws IOException { + return 0; + } + + @Override + public boolean hasCompleteOperationHistory(String reason, long startingSeqNo) { + return false; + } + + @Override + public long getMinRetainedSeqNo() { + return localCheckpointTracker.getProcessedCheckpoint(); + } + + @Override + public Closeable acquireHistoryRetentionLock() { + return () -> {}; + } +} diff --git a/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java b/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java index b97d9931d1139..14f92c7738d45 100644 --- a/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java +++ b/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java @@ -438,7 +438,7 @@ public SafeCommitInfo getSafeCommitInfo() { } @Override - protected final void closeNoLock(String reason, CountDownLatch closedLatch) { + protected void closeNoLock(String reason, CountDownLatch closedLatch) { if (isClosed.compareAndSet(false, true)) { assert rwl.isWriteLockedByCurrentThread() || failEngineLock.isHeldByCurrentThread() : "Either the write lock must be held or the engine must be currently be failing itself"; diff --git a/server/src/main/java/org/opensearch/index/engine/exec/FileMetadata.java b/server/src/main/java/org/opensearch/index/engine/exec/FileMetadata.java index 
c1a732707b220..f9e0e85f0b5da 100644 --- a/server/src/main/java/org/opensearch/index/engine/exec/FileMetadata.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/FileMetadata.java @@ -15,6 +15,7 @@ public class FileMetadata { public static final String DELIMITER = ":::"; + private static final String METADATA_KEY = "metadata"; private final String file; private final String dataFormat; @@ -25,12 +26,16 @@ public FileMetadata(String dataFormat, String file) { } public FileMetadata(String dataFormatAwareFile) { - String[] parts = dataFormatAwareFile.split(DELIMITER); - if (parts.length != 2) { - throw new IllegalArgumentException("Expected FileMetadata string to have 2 parts: " + dataFormatAwareFile); + if (!dataFormatAwareFile.contains(DELIMITER) && dataFormatAwareFile.startsWith(METADATA_KEY)) { + this.dataFormat = "metadata"; + this.file = dataFormatAwareFile; + return; } + String[] parts = dataFormatAwareFile.split(DELIMITER); + this.dataFormat = (parts.length == 1) + ? "lucene" + : parts[1]; this.file = parts[0]; - this.dataFormat = parts[1]; } public String serialize() { diff --git a/server/src/main/java/org/opensearch/index/engine/exec/RefreshInput.java b/server/src/main/java/org/opensearch/index/engine/exec/RefreshInput.java index b772e3ef4ed7a..320847dae9cfc 100644 --- a/server/src/main/java/org/opensearch/index/engine/exec/RefreshInput.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/RefreshInput.java @@ -8,6 +8,8 @@ package org.opensearch.index.engine.exec; +import org.opensearch.index.engine.exec.coord.Segment; + import org.opensearch.index.engine.exec.coord.CatalogSnapshot; import java.util.ArrayList; @@ -15,7 +17,7 @@ public class RefreshInput { - private List existingSegments; + private List existingSegments; private final List writerFiles; public RefreshInput() { @@ -23,7 +25,7 @@ public RefreshInput() { this.existingSegments = new ArrayList<>(); } - public void setExistingSegments(List existingSegments) { + public void setExistingSegments(List existingSegments) { this.existingSegments = existingSegments; } @@ -35,7 +37,7 @@ public List getWriterFiles() { return writerFiles; } - public List getExistingSegments() { + public List getExistingSegments() { return existingSegments; } } diff --git a/server/src/main/java/org/opensearch/index/engine/exec/RefreshResult.java b/server/src/main/java/org/opensearch/index/engine/exec/RefreshResult.java index 2df905c49d4bc..809165608b15d 100644 --- a/server/src/main/java/org/opensearch/index/engine/exec/RefreshResult.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/RefreshResult.java @@ -8,6 +8,8 @@ package org.opensearch.index.engine.exec; +import org.opensearch.index.engine.exec.coord.Segment; + import org.opensearch.index.engine.exec.coord.CatalogSnapshot; import java.util.ArrayList; @@ -15,17 +17,17 @@ public class RefreshResult { - private List refreshedSegments; + private List refreshedSegments; public RefreshResult() { this.refreshedSegments = new ArrayList<>(); } - public List getRefreshedSegments() { + public List getRefreshedSegments() { return refreshedSegments; } - public void setRefreshedSegments(List refreshedSegments) { + public void setRefreshedSegments(List refreshedSegments) { this.refreshedSegments = refreshedSegments; } } diff --git a/server/src/main/java/org/opensearch/index/engine/exec/WriterFileSet.java b/server/src/main/java/org/opensearch/index/engine/exec/WriterFileSet.java index 6996ca35305ba..932b12126b5ae 100644 --- 
a/server/src/main/java/org/opensearch/index/engine/exec/WriterFileSet.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/WriterFileSet.java @@ -60,6 +60,7 @@ public WriterFileSet withDirectory(String newDirectory) { public void writeTo(StreamOutput out) throws IOException { out.writeString(directory); out.writeLong(writerGeneration); + out.writeVInt((int) numRows); out.writeVInt(files.size()); for (String file : files) { out.writeString(file); diff --git a/server/src/main/java/org/opensearch/index/engine/exec/bridge/Indexer.java b/server/src/main/java/org/opensearch/index/engine/exec/bridge/Indexer.java index 46e20f943e860..9559a84f59a6f 100644 --- a/server/src/main/java/org/opensearch/index/engine/exec/bridge/Indexer.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/bridge/Indexer.java @@ -9,17 +9,19 @@ package org.opensearch.index.engine.exec.bridge; import org.apache.logging.log4j.Logger; +import org.apache.lucene.index.IndexCommit; import org.opensearch.ExceptionsHelper; import org.opensearch.common.Nullable; import org.opensearch.common.annotation.PublicApi; +import org.opensearch.common.concurrent.GatedCloseable; import org.opensearch.common.unit.TimeValue; import org.opensearch.core.common.unit.ByteSizeValue; -import org.opensearch.index.engine.Engine; -import org.opensearch.index.engine.EngineException; -import org.opensearch.index.engine.SafeCommitInfo; -import org.opensearch.index.engine.Segment; +import org.opensearch.index.engine.*; import org.opensearch.index.engine.exec.composite.CompositeDataFormatWriter; +import org.opensearch.index.engine.exec.coord.CatalogSnapshot; +import org.opensearch.index.engine.exec.coord.CompositeEngine; import org.opensearch.index.seqno.SequenceNumbers; +import org.opensearch.index.shard.ShardPath; import org.opensearch.index.translog.Translog; import org.opensearch.index.translog.TranslogManager; @@ -31,7 +33,13 @@ import static org.opensearch.index.engine.Engine.HISTORY_UUID_KEY; @PublicApi(since = "1.0.0") -public interface Indexer { +public interface Indexer extends LifecycleAware { + + /** + * Returns the engine configuration for this indexer. + * @return the engine configuration + */ + EngineConfig config(); /** * Perform document index operation on the engine @@ -191,6 +199,18 @@ void forceMerge( */ void refresh(String source) throws EngineException; + /** + * Finalizes replication by applying catalog snapshot changes. + * Default no-op implementation for engines that don't support replication. + * + * @param catalogSnapshot the catalog snapshot to apply + * @param shardPath the shard path + * @throws IOException if finalization fails + */ + default void finalizeReplication(CatalogSnapshot catalogSnapshot, ShardPath shardPath) throws IOException { + // No-op by default + } + /** * Commits the data and state to disk, resulting in documents being persisted onto the underlying formats. */ @@ -221,6 +241,8 @@ Translog.Snapshot newChangesSnapshot(String source, long fromSeqNo, long toSeqNo void failEngine(String reason, @Nullable Exception failure); + CompositeEngine.ReleasableRef acquireSnapshot(); + /** * If the specified throwable contains a fatal error in the throwable graph, such a fatal error will be thrown. 
Callers should ensure * that there are no catch statements that would catch an error in the stack as the fatal error here should go uncaught and be handled @@ -303,6 +325,8 @@ default boolean assertPrimaryIncomingSequenceNumber(final Engine.Operation.Origi return true; } + GatedCloseable acquireSafeIndexCommit() throws EngineException; + /** * the status of the current doc version in engine, compared to the version in an incoming * operation diff --git a/server/src/main/java/org/opensearch/index/engine/exec/commit/LuceneCommitEngine.java b/server/src/main/java/org/opensearch/index/engine/exec/commit/LuceneCommitEngine.java index 6d18035373027..11f25d2e7e2f1 100644 --- a/server/src/main/java/org/opensearch/index/engine/exec/commit/LuceneCommitEngine.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/commit/LuceneCommitEngine.java @@ -17,6 +17,7 @@ import org.opensearch.common.collect.MapBuilder; import org.opensearch.common.concurrent.GatedCloseable; import org.opensearch.common.logging.Loggers; +import org.opensearch.common.util.io.IOUtils; import org.opensearch.index.engine.CombinedDeletionPolicy; import org.opensearch.index.engine.CommitStats; import org.opensearch.index.engine.EngineException; @@ -37,12 +38,12 @@ public class LuceneCommitEngine implements Committer { private final Logger logger; - private final IndexWriter indexWriter; + private IndexWriter indexWriter; private final CombinedDeletionPolicy combinedDeletionPolicy; private final Store store; private volatile SegmentInfos lastCommittedSegmentInfos; - public LuceneCommitEngine(Store store, TranslogDeletionPolicy translogDeletionPolicy, LongSupplier globalCheckpointSupplier) + public LuceneCommitEngine(Store store, TranslogDeletionPolicy translogDeletionPolicy, LongSupplier globalCheckpointSupplier, boolean primaryMode) throws IOException { this.logger = Loggers.getLogger(LuceneCommitEngine.class, store.shardId()); this.combinedDeletionPolicy = new CombinedDeletionPolicy(logger, translogDeletionPolicy, null, globalCheckpointSupplier); @@ -50,7 +51,9 @@ public LuceneCommitEngine(Store store, TranslogDeletionPolicy translogDeletionPo indexWriterConfig.setIndexDeletionPolicy(combinedDeletionPolicy); this.store = store; this.lastCommittedSegmentInfos = store.readLastCommittedSegmentsInfo(); - this.indexWriter = new IndexWriter(store.directory(), indexWriterConfig); + if (primaryMode) { + this.indexWriter = new IndexWriter(store.directory(), indexWriterConfig); + } } @Override @@ -136,6 +139,6 @@ public GatedCloseable acquireSafeIndexCommit() throws EngineExcepti @Override public void close() throws IOException { - this.indexWriter.close(); + IOUtils.close(indexWriter); } } diff --git a/server/src/main/java/org/opensearch/index/engine/exec/composite/CompositeIndexingExecutionEngine.java b/server/src/main/java/org/opensearch/index/engine/exec/composite/CompositeIndexingExecutionEngine.java index 76b13e90f6437..08603a3401629 100644 --- a/server/src/main/java/org/opensearch/index/engine/exec/composite/CompositeIndexingExecutionEngine.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/composite/CompositeIndexingExecutionEngine.java @@ -8,6 +8,8 @@ package org.opensearch.index.engine.exec.composite; +import org.opensearch.index.engine.exec.coord.Segment; + import java.util.Collections; import java.util.LinkedList; import java.util.concurrent.atomic.AtomicLong; @@ -79,6 +81,26 @@ public long getNextWriterGeneration() { return writerGeneration.getAndIncrement(); } + /** + * Updates the writer generation 
counter to be at least minGeneration + 1. + * This is used during replication/recovery to ensure the replica's writer generation + * is always greater than any replicated file's generation, preventing file name collisions. + * + * @param minGeneration The minimum generation value from replicated files + */ + public void updateWriterGenerationIfNeeded(long minGeneration) { + writerGeneration.updateAndGet(current -> Math.max(current, minGeneration + 1)); + } + + /** + * Gets the current writer generation without incrementing. + * + * @return The current writer generation value + */ + public long getCurrentWriterGeneration() { + return writerGeneration.get(); + } + @Override public List supportedFieldTypes() { throw new UnsupportedOperationException(); @@ -114,11 +136,11 @@ public RefreshResult refresh(RefreshInput ignore) throws IOException { RefreshResult finalResult; try { List dataFormatWriters = dataFormatWriterPool.checkoutAll(); - List refreshedSegment = ignore.getExistingSegments(); - List newSegmentList = new ArrayList<>(); + List refreshedSegment = ignore.getExistingSegments(); + List newSegmentList = new ArrayList<>(); // flush to disk for (CompositeDataFormatWriter dataFormatWriter : dataFormatWriters) { - CatalogSnapshot.Segment newSegment = new CatalogSnapshot.Segment(dataFormatWriter.getWriterGeneration()); + Segment newSegment = new Segment(dataFormatWriter.getWriterGeneration()); FileInfos fileInfos = dataFormatWriter.flush(null); fileInfos.getWriterFilesMap().forEach((key, value) -> { newSegment.addSearchableFiles(key.name(), value); diff --git a/server/src/main/java/org/opensearch/index/engine/exec/coord/CatalogSnapshot.java b/server/src/main/java/org/opensearch/index/engine/exec/coord/CatalogSnapshot.java index 2945001caf1c2..2bfcaf5c91396 100644 --- a/server/src/main/java/org/opensearch/index/engine/exec/coord/CatalogSnapshot.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/coord/CatalogSnapshot.java @@ -8,298 +8,73 @@ package org.opensearch.index.engine.exec.coord; -import org.opensearch.common.annotation.ExperimentalApi; -import org.opensearch.common.io.stream.BytesStreamOutput; import org.opensearch.common.util.concurrent.AbstractRefCounted; -import org.opensearch.core.common.io.stream.*; +import org.opensearch.core.common.io.stream.StreamInput; +import org.opensearch.core.common.io.stream.StreamOutput; +import org.opensearch.core.common.io.stream.Writeable; import org.opensearch.index.engine.exec.FileMetadata; import org.opensearch.index.engine.exec.WriterFileSet; -import java.io.*; +import java.io.IOException; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Base64; import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.function.Supplier; -@ExperimentalApi -public class CatalogSnapshot extends AbstractRefCounted implements Writeable, Cloneable { +public abstract class CatalogSnapshot extends AbstractRefCounted implements Writeable, Cloneable { + // Static constants public static final String CATALOG_SNAPSHOT_KEY = "_catalog_snapshot_"; public static final String LAST_COMPOSITE_WRITER_GEN_KEY = "_last_composite_writer_gen_"; public static final String CATALOG_SNAPSHOT_ID = "_id"; - private final long id; - private long version; - private Map userData; - private long lastWriterGeneration; - private final Map> dfGroupedSearchableFiles; - private List segmentList; - private Supplier indexFileDeleterSupplier; - 
private Map catalogSnapshotMap; - public CatalogSnapshot(long id, long version, List segmentList, Map catalogSnapshotMap, Supplier indexFileDeleterSupplier) { - super("catalog_snapshot_" + id); - this.id = id; - this.segmentList = segmentList; - this.version = version; - this.userData = new HashMap<>(); - this.dfGroupedSearchableFiles = new HashMap<>(); - this.lastWriterGeneration = -1; + protected final long generation; + protected long version; - segmentList.forEach(segment -> segment.getDFGroupedSearchableFiles().forEach((dataFormat, writerFiles) -> { - dfGroupedSearchableFiles.computeIfAbsent(dataFormat, k -> new ArrayList<>()).add(writerFiles); - this.lastWriterGeneration = Math.max(this.lastWriterGeneration, writerFiles.getWriterGeneration()); - })); - this.catalogSnapshotMap = catalogSnapshotMap; - this.indexFileDeleterSupplier = indexFileDeleterSupplier; - // Whenever a new CatalogSnapshot is created add its files to the IndexFileDeleter - indexFileDeleterSupplier.get().addFileReferences(this); + public CatalogSnapshot(String name, long generation, long version) { + super(name); + this.generation = generation; + this.version = version; } public CatalogSnapshot(StreamInput in) throws IOException { super("catalog_snapshot"); - this.id = in.readLong(); + this.generation = in.readLong(); this.version = in.readLong(); - - // Read userData map - int userDataSize = in.readVInt(); - this.userData = new HashMap<>(); - for (int i = 0; i < userDataSize; i++) { - String key = in.readString(); - String value = in.readString(); - userData.put(key, value); - } - - this.lastWriterGeneration = in.readLong(); - - int segmentCount = in.readVInt(); - this.segmentList = new ArrayList<>(segmentCount); - for (int i = 0; i < segmentCount; i++) { - segmentList.add(new Segment(in)); - } - - // Rebuild dfGroupedSearchableFiles from segmentList - this.dfGroupedSearchableFiles = new HashMap<>(); - segmentList.forEach(segment -> segment.getDFGroupedSearchableFiles().forEach((dataFormat, writerFiles) -> { - dfGroupedSearchableFiles.computeIfAbsent(dataFormat, k -> new ArrayList<>()).add(writerFiles); - })); - } - - public void remapPaths(Path newShardDataPath) { - List remappedSegments = new ArrayList<>(); - for (Segment segment : segmentList) { - Segment remappedSegment = new Segment(segment.getGeneration()); - for (Map.Entry entry : segment.getDFGroupedSearchableFiles().entrySet()) { - String dataFormat = entry.getKey(); - // TODO this path resolution should be handled by core components - Path newDataFormatSpecificShardPath = newShardDataPath.resolve(dataFormat); - WriterFileSet originalFileSet = entry.getValue(); - WriterFileSet remappedFileSet = originalFileSet.withDirectory(newDataFormatSpecificShardPath.toString()); - remappedSegment.addSearchableFiles(dataFormat, remappedFileSet); - } - remappedSegments.add(remappedSegment); - } - dfGroupedSearchableFiles.clear(); - this.segmentList = remappedSegments; - segmentList.forEach(segment -> segment.getDFGroupedSearchableFiles().forEach((dataFormat, writerFiles) -> { - dfGroupedSearchableFiles.computeIfAbsent(dataFormat, k -> new ArrayList<>()).add(writerFiles); - })); } @Override public void writeTo(StreamOutput out) throws IOException { - out.writeLong(id); + out.writeLong(generation); out.writeLong(version); - - // Write userData map - if (userData == null) { - out.writeVInt(0); - } else { - out.writeVInt(userData.size()); - for (Map.Entry entry : userData.entrySet()) { - out.writeString(entry.getKey()); - out.writeString(entry.getValue()); - } - } - - 
out.writeLong(lastWriterGeneration); - - out.writeVInt(segmentList != null ? segmentList.size() : 0); - if (segmentList != null) { - for (Segment segment : segmentList) { - segment.writeTo(out); - } - } - } - - public String serializeToString() throws IOException { - try (BytesStreamOutput out = new BytesStreamOutput()) { - this.writeTo(out); - return Base64.getEncoder().encodeToString(out.bytes().toBytesRef().bytes); - } - } - - public static CatalogSnapshot deserializeFromString(String serializedData) throws IOException { - byte[] bytes = Base64.getDecoder().decode(serializedData); - try (BytesStreamInput in = new BytesStreamInput(bytes)) { - return new CatalogSnapshot(in); - } - } - - public Collection getSearchableFiles(String dataFormat) { - if (dfGroupedSearchableFiles.containsKey(dataFormat)) { - return dfGroupedSearchableFiles.get(dataFormat); - } - return Collections.emptyList(); - } - - public List getSegments() { - return segmentList; - } - - public Collection getFileMetadataList() throws IOException { - Collection segments = getSegments(); - Collection allFileMetadata = new ArrayList<>(); - - for (Segment segment : segments) { - segment.dfGroupedSearchableFiles.forEach((dataFormatName, writerFileSet) -> { - for (String filePath : writerFileSet.getFiles()) { - File file = new File(filePath); - String fileName = file.getName(); - FileMetadata fileMetadata = new FileMetadata( - dataFormatName, - fileName - ); - allFileMetadata.add(fileMetadata); - } - }); - } - - return allFileMetadata; } public long getGeneration() { - return id; + return generation; } public long getVersion() { return version; } - /** - * Returns user data associated with this catalog snapshot. - * - * @return map of user data key-value pairs - */ - public Map getUserData() { - return userData; - } - - public void changed() { - version++; - } - - @Override - protected void closeInternal() { - // Notify to FileDeleter to remove references of files referenced in this CatalogSnapshot - indexFileDeleterSupplier.get().removeFileReferences(this); - // Remove entry from catalogSnapshotMap - catalogSnapshotMap.remove(this.id); - } - - public long getId() { - return id; - } - - public long getLastWriterGeneration() { - return lastWriterGeneration; - } - - public Set getDataFormats() { - return dfGroupedSearchableFiles.keySet(); - } - - // used only when catalog snapshot is created from last commited segment and hence the object is not initialized with the deleter and map - public void setIndexFileDeleterSupplier(Supplier supplier) { - if (this.indexFileDeleterSupplier == null) { - this.indexFileDeleterSupplier = supplier; - } - } - - public void setCatalogSnapshotMap(Map catalogSnapshotMap) { - this.catalogSnapshotMap = catalogSnapshotMap; - } - - @Override - public String toString() { - return "CatalogSnapshot{" + "id=" + id + ", version=" + version + ", dfGroupedSearchableFiles=" + dfGroupedSearchableFiles + ", List of Segment= " + segmentList + ", userData=" + userData +'}'; - } + // Abstract methods that subclasses must implement + public abstract Collection getFileMetadataList() throws IOException; + public abstract Map getUserData(); + public abstract long getId(); + public abstract List getSegments(); + public abstract Collection getSearchableFiles(String dataFormat); + public abstract Set getDataFormats(); + public abstract long getLastWriterGeneration(); + public abstract String serializeToString() throws IOException; + public abstract void remapPaths(Path newShardDataPath); + public abstract void 
setIndexFileDeleterSupplier(java.util.function.Supplier supplier); + public abstract void setCatalogSnapshotMap(Map catalogSnapshotMap); public CatalogSnapshot cloneNoAcquire() { // Still using the clone call since Lucene call requires clone. This will allow a SegmentsInfos backed CatalogSnapshot to use the same method in calls. return this; } - public static class Segment implements Serializable, Writeable { - - private final long generation; - private final Map dfGroupedSearchableFiles; - - public Segment(long generation) { - this.dfGroupedSearchableFiles = new HashMap<>(); - this.generation = generation; - } - - public Segment(StreamInput in) throws IOException { - this.generation = in.readLong(); - this.dfGroupedSearchableFiles = new HashMap<>(); - int mapSize = in.readVInt(); - for (int i = 0; i < mapSize; i++) { - String dataFormat = in.readString(); - WriterFileSet writerFileSet = new WriterFileSet(in); - dfGroupedSearchableFiles.put(dataFormat, writerFileSet); - } - } - - public void addSearchableFiles(String dataFormat, WriterFileSet writerFileSetGroup) { - dfGroupedSearchableFiles.put(dataFormat, writerFileSetGroup); - } - - public Map getDFGroupedSearchableFiles() { - return dfGroupedSearchableFiles; - } - - public Collection getSearchableFiles(String df) { - List searchableFiles = new ArrayList<>(); - String directory = dfGroupedSearchableFiles.get(df).getDirectory(); - for(String file : dfGroupedSearchableFiles.get(df).getFiles()) { - searchableFiles.add(new FileMetadata(df , file)); - } - return searchableFiles; - } - - public long getGeneration() { - return generation; - } - - @Override - public void writeTo(StreamOutput out) throws IOException { - out.writeLong(generation); - out.writeVInt(dfGroupedSearchableFiles.size()); - for (Map.Entry entry : dfGroupedSearchableFiles.entrySet()) { - out.writeString(entry.getKey()); - entry.getValue().writeTo(out); - } - } - - @Override - public String toString() { - return "Segment{" + "generation=" + generation + ", dfGroupedSearchableFiles=" + dfGroupedSearchableFiles + '}'; - } - } + public abstract void setUserData(Map userData, boolean b); } diff --git a/server/src/main/java/org/opensearch/index/engine/exec/coord/CatalogSnapshotManager.java b/server/src/main/java/org/opensearch/index/engine/exec/coord/CatalogSnapshotManager.java index a8f5043a2dd53..ebaba77ffdbf7 100644 --- a/server/src/main/java/org/opensearch/index/engine/exec/coord/CatalogSnapshotManager.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/coord/CatalogSnapshotManager.java @@ -8,6 +8,10 @@ package org.opensearch.index.engine.exec.coord; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.index.engine.exec.coord.Segment; + import org.opensearch.index.engine.exec.DataFormat; import org.opensearch.index.engine.exec.RefreshResult; import org.opensearch.index.engine.exec.WriterFileSet; @@ -30,26 +34,32 @@ public class CatalogSnapshotManager { - private CatalogSnapshot latestCatalogSnapshot; + private static final Logger logger = LogManager.getLogger(CatalogSnapshotManager.class); + + private CompositeEngineCatalogSnapshot latestCatalogSnapshot; private final Committer compositeEngineCommitter; - private final Map catalogSnapshotMap; + private final Map catalogSnapshotMap; private final AtomicReference indexFileDeleter; public CatalogSnapshotManager(CompositeEngine compositeEngine, Committer compositeEngineCommitter, ShardPath shardPath) throws IOException { catalogSnapshotMap = new HashMap<>(); 
this.compositeEngineCommitter = compositeEngineCommitter; indexFileDeleter = new AtomicReference<>(); - getLastCommittedCatalogSnapshot().ifPresent(lastCommittedCatalogSnapshot -> { + + Optional lastCommittedOpt = getLastCommittedCatalogSnapshot(); + + lastCommittedOpt.ifPresent(lastCommittedCatalogSnapshot -> { latestCatalogSnapshot = lastCommittedCatalogSnapshot; catalogSnapshotMap.put(latestCatalogSnapshot.getId(), latestCatalogSnapshot); latestCatalogSnapshot.remapPaths(shardPath.getDataPath()); }); + indexFileDeleter.set(new IndexFileDeleter(compositeEngine, latestCatalogSnapshot, shardPath)); if(latestCatalogSnapshot != null) { latestCatalogSnapshot.setIndexFileDeleterSupplier(indexFileDeleter::get); latestCatalogSnapshot.setCatalogSnapshotMap(catalogSnapshotMap); } else { - latestCatalogSnapshot = new CatalogSnapshot(1, 1, new ArrayList<>(), catalogSnapshotMap, indexFileDeleter::get); + latestCatalogSnapshot = new CompositeEngineCatalogSnapshot(1, 1, new ArrayList<>(), catalogSnapshotMap, indexFileDeleter::get); catalogSnapshotMap.put(latestCatalogSnapshot.getId(), latestCatalogSnapshot); } } @@ -67,21 +77,29 @@ public void close() { public synchronized void applyRefreshResult(RefreshResult refreshResult) { commitCatalogSnapshot( - new CatalogSnapshot( + new CompositeEngineCatalogSnapshot( latestCatalogSnapshot.getId() + 1, latestCatalogSnapshot.getVersion() + 1, refreshResult.getRefreshedSegments(), catalogSnapshotMap, - indexFileDeleter::get) + indexFileDeleter::get + ) ); } public synchronized void applyReplicationChanges(CatalogSnapshot catalogSnapshot, ShardPath shardPath) { - CatalogSnapshot oldSnapshot = latestCatalogSnapshot; + CompositeEngineCatalogSnapshot oldSnapshot = latestCatalogSnapshot; if (catalogSnapshot != null) { - catalogSnapshot.incRef(); catalogSnapshot.remapPaths(shardPath.getDataPath()); - latestCatalogSnapshot = catalogSnapshot; + + CompositeEngineCatalogSnapshot newSnapshot = (CompositeEngineCatalogSnapshot) catalogSnapshot; + + newSnapshot.setIndexFileDeleterSupplier(indexFileDeleter::get); + newSnapshot.setCatalogSnapshotMap(catalogSnapshotMap); + + indexFileDeleter.get().addFileReferences(newSnapshot); + + latestCatalogSnapshot = newSnapshot; catalogSnapshotMap.put(latestCatalogSnapshot.getId(), latestCatalogSnapshot); } if (oldSnapshot != null) { @@ -91,16 +109,16 @@ public synchronized void applyReplicationChanges(CatalogSnapshot catalogSnapshot public synchronized void applyMergeResults(MergeResult mergeResult, OneMerge oneMerge) { - List segmentList = latestCatalogSnapshot.getSegments(); + List segmentList = new ArrayList<>(latestCatalogSnapshot.getSegments()); - CatalogSnapshot.Segment segmentToAdd = getSegment(mergeResult.getMergedWriterFileSet()); - Set segmentsToRemove = new HashSet<>(oneMerge.getSegmentsToMerge()); + Segment segmentToAdd = getSegment(mergeResult.getMergedWriterFileSet()); + Set segmentsToRemove = new HashSet<>(oneMerge.getSegmentsToMerge()); boolean inserted = false; int newSegIdx = 0; for (int segIdx = 0, cnt = segmentList.size(); segIdx < cnt; segIdx++) { assert segIdx >= newSegIdx; - CatalogSnapshot.Segment currSegment = segmentList.get(segIdx); + Segment currSegment = segmentList.get(segIdx); if(segmentsToRemove.contains(currSegment)) { if (!inserted) { segmentList.set(segIdx, segmentToAdd); @@ -124,13 +142,13 @@ public synchronized void applyMergeResults(MergeResult mergeResult, OneMerge one if (!inserted) { segmentList.add(0, segmentToAdd); } - CatalogSnapshot newCatSnap = new CatalogSnapshot(latestCatalogSnapshot.getId() 
+ 1, latestCatalogSnapshot.getVersion() + 1, segmentList, catalogSnapshotMap, indexFileDeleter::get); + CompositeEngineCatalogSnapshot newCatSnap = new CompositeEngineCatalogSnapshot(latestCatalogSnapshot.getId() + 1, latestCatalogSnapshot.getVersion() + 1, segmentList, catalogSnapshotMap, indexFileDeleter::get); // Commit new catalog snapshot commitCatalogSnapshot(newCatSnap); } - private synchronized void commitCatalogSnapshot(CatalogSnapshot newCatSnap) { + private synchronized void commitCatalogSnapshot(CompositeEngineCatalogSnapshot newCatSnap) { catalogSnapshotMap.put(newCatSnap.getId(), newCatSnap); if (latestCatalogSnapshot != null) { latestCatalogSnapshot.decRef(); @@ -139,8 +157,8 @@ private synchronized void commitCatalogSnapshot(CatalogSnapshot newCatSnap) { compositeEngineCommitter.addLuceneIndexes(latestCatalogSnapshot); } - private CatalogSnapshot.Segment getSegment(Map writerFileSetMap) { - CatalogSnapshot.Segment segment = new CatalogSnapshot.Segment(0); + private Segment getSegment(Map writerFileSetMap) { + Segment segment = new Segment(0); for(DataFormat dataFormat : writerFileSetMap.keySet()) { segment.addSearchableFiles(dataFormat.name(), writerFileSetMap.get(dataFormat)); @@ -148,11 +166,15 @@ private CatalogSnapshot.Segment getSegment(Map writer return segment; } - private Optional getLastCommittedCatalogSnapshot() throws IOException { + private Optional getLastCommittedCatalogSnapshot() throws IOException { Map lastCommittedData = compositeEngineCommitter.getLastCommittedData(); + if (lastCommittedData.containsKey(CATALOG_SNAPSHOT_KEY)) { - return Optional.of(CatalogSnapshot.deserializeFromString(lastCommittedData.get(CATALOG_SNAPSHOT_KEY))); + String serializedSnapshot = lastCommittedData.get(CATALOG_SNAPSHOT_KEY); + CompositeEngineCatalogSnapshot snapshot = CompositeEngineCatalogSnapshot.deserializeFromString(serializedSnapshot); + return Optional.of(snapshot); } + return Optional.empty(); } diff --git a/server/src/main/java/org/opensearch/index/engine/exec/coord/CompositeEngine.java b/server/src/main/java/org/opensearch/index/engine/exec/coord/CompositeEngine.java index ab731ba79863f..207c89213d60b 100644 --- a/server/src/main/java/org/opensearch/index/engine/exec/coord/CompositeEngine.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/coord/CompositeEngine.java @@ -10,11 +10,13 @@ import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.message.ParameterizedMessage; +import org.apache.lucene.index.SegmentInfos; import org.apache.lucene.index.IndexCommit; import org.apache.lucene.search.ReferenceManager; import org.apache.lucene.store.AlreadyClosedException; import org.opensearch.common.Nullable; import org.opensearch.common.SetOnce; +import org.opensearch.common.TriConsumer; import org.opensearch.common.annotation.ExperimentalApi; import org.opensearch.common.concurrent.GatedCloseable; import org.opensearch.common.lease.Releasable; @@ -47,6 +49,7 @@ import org.opensearch.index.engine.Segment; import org.opensearch.index.engine.SegmentsStats; import org.opensearch.index.engine.VersionValue; +import org.opensearch.index.engine.exec.FileMetadata; import org.opensearch.index.engine.exec.FileStats; import org.opensearch.index.engine.exec.RefreshInput; import org.opensearch.index.engine.exec.RefreshResult; @@ -74,12 +77,14 @@ import org.opensearch.index.shard.DocsStats; import org.opensearch.index.shard.ShardPath; import org.opensearch.index.store.Store; +import org.opensearch.index.translog.Checkpoint; import 
org.opensearch.index.translog.DefaultTranslogDeletionPolicy; import org.opensearch.index.translog.InternalTranslogManager; import org.opensearch.index.translog.Translog; import org.opensearch.index.translog.TranslogCorruptedException; import org.opensearch.index.translog.TranslogDeletionPolicy; import org.opensearch.index.translog.TranslogException; +import org.opensearch.index.translog.TranslogHeader; import org.opensearch.index.translog.TranslogManager; import org.opensearch.index.translog.TranslogOperationHelper; import org.opensearch.index.translog.listener.CompositeTranslogEventListener; @@ -87,15 +92,15 @@ import org.opensearch.indices.pollingingest.PollingIngestStats; import org.opensearch.plugins.PluginsService; import org.opensearch.plugins.SearchEnginePlugin; -import org.opensearch.plugins.spi.vectorized.DataFormat; import org.opensearch.search.suggest.completion.CompletionStats; +import org.opensearch.plugins.spi.vectorized.DataFormat; import java.io.Closeable; import java.io.IOException; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; -import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -109,7 +114,6 @@ import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import java.util.concurrent.locks.ReentrantReadWriteLock; -import java.util.function.BiConsumer; import java.util.function.BiFunction; import java.util.function.Consumer; import java.util.function.Function; @@ -118,13 +122,13 @@ import static org.opensearch.index.engine.Engine.HISTORY_UUID_KEY; import static org.opensearch.index.engine.Engine.MAX_UNSAFE_AUTO_ID_TIMESTAMP_COMMIT_ID; -import static org.opensearch.index.engine.exec.coord.CatalogSnapshot.CATALOG_SNAPSHOT_ID; import static org.opensearch.index.engine.exec.coord.CatalogSnapshot.CATALOG_SNAPSHOT_KEY; import static org.opensearch.index.engine.exec.coord.CatalogSnapshot.LAST_COMPOSITE_WRITER_GEN_KEY; @ExperimentalApi public class CompositeEngine implements LifecycleAware, Closeable, Indexer, CheckpointState, IndexingThrottler, StatsHolder { + private static final Consumer PRE_REFRESH_LISTENER_CONSUMER = refreshListener -> { try { refreshListener.beforeRefresh(); @@ -139,10 +143,11 @@ public class CompositeEngine implements LifecycleAware, Closeable, Indexer, Chec throw new RuntimeException(e); } }; - private static final BiConsumer>, CatalogSnapshotAwareRefreshListener> - POST_REFRESH_CATALOG_SNAPSHOT_AWARE_LISTENER_CONSUMER = (catalogSnapshot, catalogSnapshotAwareRefreshListener) -> { + private static final TriConsumer>, CatalogSnapshotAwareRefreshListener, Boolean> + POST_REFRESH_CATALOG_SNAPSHOT_AWARE_LISTENER_CONSUMER = (catalogSnapshot, catalogSnapshotAwareRefreshListener, didRefresh) -> { try { - catalogSnapshotAwareRefreshListener.afterRefresh(true, catalogSnapshot); + // Wrap in Supplier as required by CatalogSnapshotAwareRefreshListener interface + catalogSnapshotAwareRefreshListener.afterRefresh(didRefresh, catalogSnapshot); } catch (IOException e) { throw new RuntimeException(e); } @@ -150,9 +155,9 @@ public class CompositeEngine implements LifecycleAware, Closeable, Indexer, Chec private static final Function extractSegmentName = name -> name.substring(name.lastIndexOf('_'), name.lastIndexOf('.')); private final ShardId shardId; - private final CompositeIndexingExecutionEngine engine; - private final EngineConfig engineConfig; - private final Store store; + protected final CompositeIndexingExecutionEngine 
engine; + protected final EngineConfig engineConfig; + protected final Store store; private final Logger logger; private final Committer compositeEngineCommitter; private final TranslogManager translogManager; @@ -200,7 +205,7 @@ public class CompositeEngine implements LifecycleAware, Closeable, Indexer, Chec // The value of this marker never goes backwards, and is tracked/updated differently on primary and replica. private final AtomicLong maxSeqNoOfUpdatesOrDeletes; private final IndexingStrategyPlanner indexingStrategyPlanner; - private final CatalogSnapshotManager catalogSnapshotManager; + protected final CatalogSnapshotManager catalogSnapshotManager; private ReleasableRef lastCommitedCatalogSnapshotRef; private final EventListener eventListener; @@ -231,9 +236,30 @@ public CompositeEngine( this.localCheckpointTracker = createLocalCheckpointTracker(localCheckpointTrackerSupplier); this.lastRefreshedCheckpointListener = new LastRefreshedCheckpointListener(localCheckpointTracker); refreshListeners.add(lastRefreshedCheckpointListener); - - final Map userData = store.readLastCommittedSegmentsInfo().getUserData(); - String translogUUID = Objects.requireNonNull(userData.get(Translog.TRANSLOG_UUID_KEY)); + Map userData; + String translogUUID; + // Recover the translog UUID and commit user data from the last local commit; if the local store is empty (remote store recovery), bootstrap them from the translog checkpoint instead + try { + final SegmentInfos segmentInfos = store.readLastCommittedSegmentsInfo(); + userData = segmentInfos.getUserData(); + translogUUID = Objects.requireNonNull(userData.get(Translog.TRANSLOG_UUID_KEY)); + } catch (java.io.FileNotFoundException e) { + // Local store is empty (remote store recovery scenario) + final Path translogPath = engineConfig.getTranslogConfig().getTranslogPath(); + final Checkpoint checkpoint = Checkpoint.read(translogPath.resolve(Translog.CHECKPOINT_FILE_NAME)); + final Path translogFile = translogPath.resolve(Translog.getFilename(checkpoint.getGeneration())); + try (java.nio.channels.FileChannel channel = java.nio.channels.FileChannel.open(translogFile, java.nio.file.StandardOpenOption.READ)) { + final TranslogHeader translogHeader = TranslogHeader.read(translogFile, channel); + translogUUID = translogHeader.getTranslogUUID(); + + // Create initial empty commit for LuceneCommitEngine + store.createEmpty(engineConfig.getIndexSettings().getIndexVersionCreated().luceneVersion, translogUUID); + + // Now read the userData from the newly created commit + userData = store.readLastCommittedSegmentsInfo().getUserData(); + logger.debug("Created initial empty commit with translog UUID: {}", translogUUID); + } + } TranslogEventListener internalTranslogEventListener = new TranslogEventListener() { @Override public void onAfterTranslogSync() { @@ -265,7 +291,7 @@ public void onFailure(String reason, Exception ex) { this.translogManager = translogManagerRef; // initialize committer and composite indexing execution engine - committerRef = new LuceneCommitEngine(store, translogDeletionPolicy, translogManager::getLastSyncedGlobalCheckpoint); + committerRef = new LuceneCommitEngine(store, translogDeletionPolicy, translogManager::getLastSyncedGlobalCheckpoint, !config().isReadOnlyReplica()); this.compositeEngineCommitter = committerRef; final AtomicLong lastCommittedWriterGeneration = new AtomicLong(-1); Map lastCommittedData = this.compositeEngineCommitter.getLastCommittedData(); @@ -285,7 +311,13 @@ public void onFailure(String reason, Exception ex) { //Initialize CatalogSnapshotManager before loadWriterFiles to ensure stale files are cleaned 
up before loading this.catalogSnapshotManager = new CatalogSnapshotManager(this, committerRef, shardPath); try (CompositeEngine.ReleasableRef catalogSnapshotReleasableRef = catalogSnapshotManager.acquireSnapshot()) { - this.engine.loadWriterFiles(catalogSnapshotReleasableRef.getRef()); + CatalogSnapshot loadedSnapshot = catalogSnapshotReleasableRef.getRef(); + this.engine.loadWriterFiles(loadedSnapshot); + + if (loadedSnapshot != null) { + long snapshotLastWriterGen = loadedSnapshot.getLastWriterGeneration(); + engine.updateWriterGenerationIfNeeded(snapshotLastWriterGen); + } } catch (Exception e) { failEngine("unable to close releasable catalog snapshot while bootstrapping composite engine", e); } @@ -313,6 +345,7 @@ public void onFailure(String reason, Exception ex) { // Refresh here so that catalog snapshot gets initialized // TODO : any better way to do this ? + initializeRefreshListeners(engineConfig); refresh("start"); // TODO : how to extend this for Lucene ? where engine is a r/w engine // Create read specific engines for each format which is associated with shard @@ -320,8 +353,20 @@ public void onFailure(String reason, Exception ex) { for (SearchEnginePlugin searchEnginePlugin : searchEnginePlugins) { for (DataFormat dataFormat : searchEnginePlugin.getSupportedFormats()) { List> currentSearchEngines = readEngines.getOrDefault(dataFormat, new ArrayList<>()); + + // Get FileMetadata filtered by data format from current catalog snapshot + Collection formatFiles; + try (ReleasableRef snapshotRef = acquireSnapshot()) { + CatalogSnapshot snapshot = snapshotRef.getRef(); + formatFiles = snapshot.getFileMetadataList().stream() + .filter(fm -> fm.dataFormat().equals(dataFormat.getName())) + .collect(Collectors.toList()); + } catch (Exception e) { + throw new EngineCreationFailureException(shardId, "failed to acquire catalog snapshot for read engine creation", e); + } + SearchExecEngine newSearchEngine = - searchEnginePlugin.createEngine(dataFormat, Collections.emptyList(), shardPath); + searchEnginePlugin.createEngine(dataFormat, formatFiles, shardPath); currentSearchEngines.add(newSearchEngine); readEngines.put(dataFormat, currentSearchEngines); @@ -342,10 +387,7 @@ public void onFailure(String reason, Exception ex) { } } } - catalogSnapshotAwareRefreshListeners.forEach(refreshListener -> POST_REFRESH_CATALOG_SNAPSHOT_AWARE_LISTENER_CONSUMER.accept( - this::acquireSnapshot, - refreshListener - )); + invokeRefreshListeners(true); success = true; } catch (IOException | TranslogCorruptedException e) { throw new EngineCreationFailureException(shardId, "failed to create engine", e); @@ -359,9 +401,6 @@ public void onFailure(String reason, Exception ex) { } } logger.trace("created new CompositeEngine"); - - initializeRefreshListeners(engineConfig); - } private LocalCheckpointTracker createLocalCheckpointTracker( @@ -369,11 +408,26 @@ private LocalCheckpointTracker createLocalCheckpointTracker( ) throws IOException { final long maxSeqNo; final long localCheckpoint; - final SequenceNumbers.CommitInfo seqNoStats = - SequenceNumbers.loadSeqNoInfoFromLuceneCommit(store.readLastCommittedSegmentsInfo().getUserData().entrySet()); - maxSeqNo = seqNoStats.maxSeqNo; - localCheckpoint = seqNoStats.localCheckpoint; - logger.trace("recovered maximum sequence number [{}] and local checkpoint [{}]", maxSeqNo, localCheckpoint); + + try { + final SequenceNumbers.CommitInfo seqNoStats = + SequenceNumbers.loadSeqNoInfoFromLuceneCommit(store.readLastCommittedSegmentsInfo().getUserData().entrySet()); + maxSeqNo = 
seqNoStats.maxSeqNo; + localCheckpoint = seqNoStats.localCheckpoint; + logger.trace("recovered maximum sequence number [{}] and local checkpoint [{}]", maxSeqNo, localCheckpoint); + } catch (org.apache.lucene.index.IndexNotFoundException e) { + // Local store is empty (remote store recovery scenario) + // Initialize with NO_OPS_PERFORMED (-1) - checkpoint will be restored from CatalogSnapshot during first flush + logger.debug( + "Local store is empty during engine initialization, initializing checkpoint tracker with NO_OPS_PERFORMED. " + + "This is expected during remote store recovery where local store has not been initialized yet." + ); + return localCheckpointTrackerSupplier.apply( + SequenceNumbers.NO_OPS_PERFORMED, + SequenceNumbers.NO_OPS_PERFORMED + ); + } + return localCheckpointTrackerSupplier.apply(maxSeqNo, localCheckpoint); } @@ -392,6 +446,11 @@ protected TranslogDeletionPolicy getTranslogDeletionPolicy(EngineConfig engineCo ); } + public final EngineConfig config() + { + return engineConfig; + } + protected TranslogManager createTranslogManager( String translogUUID, TranslogDeletionPolicy translogDeletionPolicy, @@ -421,12 +480,13 @@ public void ensureOpen() { } } - LocalCheckpointTracker getLocalCheckpointTracker() { + public LocalCheckpointTracker getLocalCheckpointTracker() { return localCheckpointTracker; } + public void updateSearchEngine() throws IOException { - catalogSnapshotAwareRefreshListeners.forEach(ref -> { + catalogSnapshotAwareRefreshListeners.forEach(ref -> { try { ref.afterRefresh(true, catalogSnapshotManager::acquireSnapshot); } catch (IOException e) { @@ -705,23 +765,25 @@ public void deactivateThrottling() { } public synchronized void refresh(String source) throws EngineException { + final long localCheckpointBeforeRefresh = localCheckpointTracker.getProcessedCheckpoint(); + boolean refreshed = false; try (CompositeEngine.ReleasableRef catalogSnapshotReleasableRef = catalogSnapshotManager.acquireSnapshot()) { refreshListeners.forEach(PRE_REFRESH_LISTENER_CONSUMER); RefreshInput refreshInput = new RefreshInput(); refreshInput.setExistingSegments(new ArrayList<>(catalogSnapshotReleasableRef.getRef().getSegments())); RefreshResult refreshResult = engine.refresh(refreshInput); - if (refreshResult == null) { - return; + if (refreshResult != null) { + catalogSnapshotManager.applyRefreshResult(refreshResult); + refreshed = true; + } + + invokeRefreshListeners(refreshed); + + // Call checkpoint listener's afterRefresh to update refreshed checkpoint + if (refreshed) { + triggerPossibleMerges(); // trigger merges } - catalogSnapshotManager.applyRefreshResult(refreshResult); - catalogSnapshotAwareRefreshListeners.forEach(refreshListener -> POST_REFRESH_CATALOG_SNAPSHOT_AWARE_LISTENER_CONSUMER.accept( - this::acquireSnapshot, - refreshListener - )); - - refreshListeners.forEach(POST_REFRESH_LISTENER_CONSUMER); - triggerPossibleMerges(); // trigger merges } catch (Exception ex) { try { failEngine("refresh failed source[" + source + "]", ex); @@ -730,11 +792,28 @@ public synchronized void refresh(String source) throws EngineException { } throw new RefreshFailedEngineException(shardId, ex); } + + assert refreshed == false || lastRefreshedCheckpoint() >= localCheckpointBeforeRefresh : "refresh checkpoint was not advanced; " + + "local_checkpoint=" + + localCheckpointBeforeRefresh + + " refresh_checkpoint=" + + lastRefreshedCheckpoint(); + } + + private void invokeRefreshListeners(boolean didRefresh) { + catalogSnapshotAwareRefreshListeners.forEach(refreshListener -> 
POST_REFRESH_CATALOG_SNAPSHOT_AWARE_LISTENER_CONSUMER.apply( + this::acquireSnapshot, + refreshListener, + didRefresh + )); + + refreshListeners.forEach(POST_REFRESH_LISTENER_CONSUMER); } public synchronized void applyMergeChanges(MergeResult mergeResult, OneMerge oneMerge) { try { catalogSnapshotManager.applyMergeResults(mergeResult, oneMerge); + invokeRefreshListeners(true); } catch (Exception ex) { try { logger.error( @@ -755,11 +834,6 @@ public void triggerPossibleMerges() { mergeScheduler.triggerMerges(); } - public void finalizeReplication(CatalogSnapshot catalogSnapshot, ShardPath shardPath) throws IOException { - catalogSnapshotManager.applyReplicationChanges(catalogSnapshot, shardPath); - updateSearchEngine(); - } - // This should get wired into searcher acquireSnapshot for initializing reader context later // this now becomes equivalent of the reader // Each search side specific impl can decide on how to init specific reader instances using this pit snapshot provided by writers @@ -832,7 +906,7 @@ public List segments(boolean verbose) { lastCommitedCatalogSnapshotRef.getRef() .getSegments() .stream() - .map(CatalogSnapshot.Segment::getGeneration) + .map(org.opensearch.index.engine.exec.coord.Segment::getGeneration) .collect(Collectors.toCollection(() -> committedSegments)); } Map segmentStats = getPrimaryReadEngine().fetchSegmentStats(); @@ -909,33 +983,45 @@ public void flush(boolean force, boolean waitIfOngoing) throws EngineException { boolean shouldPeriodicallyFlush = shouldPeriodicallyFlush(); if (force || shouldFlush() || shouldPeriodicallyFlush || getProcessedLocalCheckpoint() > Long.parseLong( readLastCommittedData().get(SequenceNumbers.LOCAL_CHECKPOINT_KEY))) { + translogManager.ensureCanFlush(); + try { translogManager.rollTranslogGeneration(); logger.trace("starting commit for flush; commitTranslog=true"); CompositeEngine.ReleasableRef catalogSnapshotToFlushRef = catalogSnapshotManager.acquireSnapshot(); final CatalogSnapshot catalogSnapshotToFlush = catalogSnapshotToFlushRef.getRef(); - System.out.println("FLUSH called, current snapshot to commit : " + catalogSnapshotToFlush.getId() - + ", previous commited snapshot : " + ((lastCommitedCatalogSnapshotRef != null) - ? lastCommitedCatalogSnapshotRef.getRef().getId() - : -1)); - final String serializedCatalogSnapshot = catalogSnapshotToFlush.serializeToString(); - final long lastWriterGeneration = catalogSnapshotToFlush.getLastWriterGeneration(); + + // FIX: Use MAX of engine's current counter and snapshot's lastWriterGeneration + // to ensure we never reuse a generation after restart. 
+ // Engine counter - 1 = last assigned generation (counter points to NEXT generation) + final long engineLastAssignedGen = engine.getCurrentWriterGeneration() - 1; + final long snapshotLastWriterGen = catalogSnapshotToFlush.getLastWriterGeneration(); + final long lastWriterGeneration = Math.max(engineLastAssignedGen, snapshotLastWriterGen); + final long localCheckpoint = localCheckpointTracker.getProcessedCheckpoint(); - final long id = catalogSnapshotToFlush.getId(); + + // Create commitData with checkpoint information BEFORE serializing CatalogSnapshot + // This ensures CatalogSnapshot.userData contains the correct checkpoint values + final Map commitData = new HashMap<>(7); + commitData.put(Translog.TRANSLOG_UUID_KEY, translogManager.getTranslogUUID()); + commitData.put(SequenceNumbers.LOCAL_CHECKPOINT_KEY, Long.toString(localCheckpoint)); + commitData.put(SequenceNumbers.MAX_SEQ_NO, Long.toString(localCheckpointTracker.getMaxSeqNo())); + commitData.put(MAX_UNSAFE_AUTO_ID_TIMESTAMP_COMMIT_ID, Long.toString(maxUnsafeAutoIdTimestamp.get())); + commitData.put(HISTORY_UUID_KEY, historyUUID); + commitData.put(LAST_COMPOSITE_WRITER_GEN_KEY, Long.toString(lastWriterGeneration)); + + // Copy checkpoint data to CatalogSnapshot.userData BEFORE serialization + // This preserves checkpoint state for recovery scenarios (e.g., replica promotion) + catalogSnapshotToFlush.setUserData(commitData, false); + + // Now serialize CatalogSnapshot with checkpoint data in userData + final String serializedCatalogSnapshot = catalogSnapshotToFlush.serializeToString(); + commitData.put(CATALOG_SNAPSHOT_KEY, serializedCatalogSnapshot); + compositeEngineCommitter.commit( - () -> { - final Map commitData = new HashMap<>(7); - commitData.put(CATALOG_SNAPSHOT_ID, Long.toString(id)); - commitData.put(Translog.TRANSLOG_UUID_KEY, translogManager.getTranslogUUID()); - commitData.put(SequenceNumbers.LOCAL_CHECKPOINT_KEY, Long.toString(localCheckpoint)); - commitData.put(SequenceNumbers.MAX_SEQ_NO, Long.toString(localCheckpointTracker.getMaxSeqNo())); - commitData.put(MAX_UNSAFE_AUTO_ID_TIMESTAMP_COMMIT_ID, Long.toString(maxUnsafeAutoIdTimestamp.get())); - commitData.put(HISTORY_UUID_KEY, historyUUID); - commitData.put(CATALOG_SNAPSHOT_KEY, serializedCatalogSnapshot); - commitData.put(LAST_COMPOSITE_WRITER_GEN_KEY, Long.toString(lastWriterGeneration)); - return commitData.entrySet().iterator(); - }, catalogSnapshotToFlush + () -> commitData.entrySet().iterator(), + catalogSnapshotToFlush ); logger.trace("finished commit for flush"); if (lastCommitedCatalogSnapshotRef != null && lastCommitedCatalogSnapshotRef.getRef() != null) @@ -1065,7 +1151,7 @@ public Translog.Snapshot newChangesSnapshot( boolean requiredFullRange, boolean accurateCount ) throws IOException { - return null; + return translogManager.newChangesSnapshot(fromSeqNo, toSeqNo, requiredFullRange); } @Override @@ -1127,7 +1213,7 @@ private boolean failOnTragicEvent(AlreadyClosedException ex) { return engineFailed; } - private boolean maybeFailEngine(String source, Exception e) { + protected boolean maybeFailEngine(String source, Exception e) { // Check for AlreadyClosedException -- ACE is a very special // exception that should only be thrown in a tragic event. we pass on the checks to failOnTragicEvent which will // throw and AssertionError if the tragic event condition is not met. @@ -1206,34 +1292,33 @@ private void awaitPendingClose() { * called while the write lock is hold or in a disaster condition ie. if the engine * is failed. 
*/ - private void closeNoLock(String reason, CountDownLatch closedLatch) { + protected void closeNoLock(String reason, CountDownLatch closedLatch) { if (isClosed.compareAndSet(false, true)) { assert rwl.isWriteLockedByCurrentThread() || failEngineLock.isHeldByCurrentThread() : "Either the write lock must be held or the engine must be currently be failing itself"; try { - try { IOUtils.close(engine, translogManager, compositeEngineCommitter); } catch (Exception e) { logger.warn("Failed to close translog", e); - } - } catch (Exception e) { - logger.warn("failed to close translog manager", e); - } finally { - try { - store.decRef(); - logger.debug("engine closed [{}]", reason); } finally { - closedLatch.countDown(); + try { + store.decRef(); + logger.debug("engine closed [{}]", reason); + } finally { + closedLatch.countDown(); + } } - } } } + + /** * Acquires the most recent safe index commit snapshot from the currently running engine. * All index files referenced by this commit won't be freed until the commit/snapshot is closed. * This method is required for replica recovery operations. */ + @Override public GatedCloseable acquireSafeIndexCommit() throws EngineException { ensureOpen(); if (compositeEngineCommitter instanceof LuceneCommitEngine) { diff --git a/server/src/main/java/org/opensearch/index/engine/exec/coord/CompositeEngineCatalogSnapshot.java b/server/src/main/java/org/opensearch/index/engine/exec/coord/CompositeEngineCatalogSnapshot.java new file mode 100644 index 0000000000000..262263d653372 --- /dev/null +++ b/server/src/main/java/org/opensearch/index/engine/exec/coord/CompositeEngineCatalogSnapshot.java @@ -0,0 +1,250 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.index.engine.exec.coord; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.common.annotation.ExperimentalApi; +import org.opensearch.common.io.stream.BytesStreamOutput; +import org.opensearch.core.common.io.stream.BytesStreamInput; +import org.opensearch.core.common.io.stream.StreamInput; +import org.opensearch.core.common.io.stream.StreamOutput; +import org.opensearch.index.engine.exec.FileMetadata; +import org.opensearch.index.engine.exec.WriterFileSet; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Base64; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.Supplier; + +@ExperimentalApi +public class CompositeEngineCatalogSnapshot extends CatalogSnapshot { + + private static final Logger logger = LogManager.getLogger(CompositeEngineCatalogSnapshot.class); + + public static final String CATALOG_SNAPSHOT_KEY = "_catalog_snapshot_"; + public static final String LAST_COMPOSITE_WRITER_GEN_KEY = "_last_composite_writer_gen_"; + private Map userData; + private long lastWriterGeneration; + private final Map> dfGroupedSearchableFiles; + private List segmentList; + private Supplier indexFileDeleterSupplier; + private Map catalogSnapshotMap; + + public CompositeEngineCatalogSnapshot(long id, long version, List segmentList, Map catalogSnapshotMap, Supplier indexFileDeleterSupplier) { + super("catalog_snapshot_" + id, id, version); + this.segmentList = segmentList; + this.userData = new HashMap<>(); + this.dfGroupedSearchableFiles = new HashMap<>(); + this.lastWriterGeneration = -1; + + segmentList.forEach(segment -> segment.getDFGroupedSearchableFiles().forEach((dataFormat, writerFiles) -> { + dfGroupedSearchableFiles.computeIfAbsent(dataFormat, k -> new ArrayList<>()).add(writerFiles); + this.lastWriterGeneration = Math.max(this.lastWriterGeneration, writerFiles.getWriterGeneration()); + })); + this.catalogSnapshotMap = catalogSnapshotMap; + this.indexFileDeleterSupplier = indexFileDeleterSupplier; + // Whenever a new CatalogSnapshot is created add its files to the IndexFileDeleter + indexFileDeleterSupplier.get().addFileReferences(this); + } + + public CompositeEngineCatalogSnapshot(StreamInput in) throws IOException { + super(in); + + // Read userData map + int userDataSize = in.readVInt(); + this.userData = new HashMap<>(); + for (int i = 0; i < userDataSize; i++) { + String key = in.readString(); + String value = in.readString(); + userData.put(key, value); + } + + this.lastWriterGeneration = in.readLong(); + + int segmentCount = in.readVInt(); + this.segmentList = new ArrayList<>(segmentCount); + for (int i = 0; i < segmentCount; i++) { + segmentList.add(new Segment(in)); + } + + // Rebuild dfGroupedSearchableFiles from segmentList + this.dfGroupedSearchableFiles = new HashMap<>(); + segmentList.forEach(segment -> segment.getDFGroupedSearchableFiles().forEach((dataFormat, writerFiles) -> { + dfGroupedSearchableFiles.computeIfAbsent(dataFormat, k -> new ArrayList<>()).add(writerFiles); + })); + } + + public void remapPaths(Path newShardDataPath) { + List remappedSegments = new ArrayList<>(); + for (Segment segment : segmentList) { + Segment remappedSegment = new Segment(segment.getGeneration()); + for (Map.Entry entry : segment.getDFGroupedSearchableFiles().entrySet()) { + String dataFormat = entry.getKey(); + // TODO this path resolution should be handled by core components + Path newDataFormatSpecificShardPath = newShardDataPath.resolve(dataFormat); + 
WriterFileSet originalFileSet = entry.getValue(); + WriterFileSet remappedFileSet = originalFileSet.withDirectory(newDataFormatSpecificShardPath.toString()); + remappedSegment.addSearchableFiles(dataFormat, remappedFileSet); + } + remappedSegments.add(remappedSegment); + } + dfGroupedSearchableFiles.clear(); + this.segmentList = remappedSegments; + segmentList.forEach(segment -> segment.getDFGroupedSearchableFiles().forEach((dataFormat, writerFiles) -> { + dfGroupedSearchableFiles.computeIfAbsent(dataFormat, k -> new ArrayList<>()).add(writerFiles); + })); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + super.writeTo(out); + + // Write userData map + if (userData == null) { + out.writeVInt(0); + } else { + out.writeVInt(userData.size()); + for (Map.Entry entry : userData.entrySet()) { + out.writeString(entry.getKey()); + out.writeString(entry.getValue()); + } + } + + out.writeLong(lastWriterGeneration); + + out.writeVInt(segmentList != null ? segmentList.size() : 0); + if (segmentList != null) { + for (Segment segment : segmentList) { + segment.writeTo(out); + } + } + } + + public String serializeToString() throws IOException { + try (BytesStreamOutput out = new BytesStreamOutput()) { + this.writeTo(out); + return Base64.getEncoder().encodeToString(out.bytes().toBytesRef().bytes); + } + } + + public static CompositeEngineCatalogSnapshot deserializeFromString(String serializedData) throws IOException { + byte[] bytes = Base64.getDecoder().decode(serializedData); + try (BytesStreamInput in = new BytesStreamInput(bytes)) { + return new CompositeEngineCatalogSnapshot(in); + } + } + + public Collection getSearchableFiles(String dataFormat) { + if (dfGroupedSearchableFiles.containsKey(dataFormat)) { + return dfGroupedSearchableFiles.get(dataFormat); + } + return Collections.emptyList(); + } + + public List getSegments() { + return segmentList; + } + + public Collection getFileMetadataList() throws IOException { + Collection segments = getSegments(); + Collection allFileMetadata = new ArrayList<>(); + + for (Segment segment : segments) { + segment.getDFGroupedSearchableFiles().forEach((dataFormatName, writerFileSet) -> { + for (String filePath : writerFileSet.getFiles()) { + File file = new File(filePath); + String fileName = file.getName(); + FileMetadata fileMetadata = new FileMetadata( + dataFormatName, + fileName + ); + allFileMetadata.add(fileMetadata); + } + }); + } + + return allFileMetadata; + } + + /** + * Returns user data associated with this catalog snapshot. 
+ * + * @return map of user data key-value pairs + */ + public Map getUserData() { + return userData; + } + + @Override + protected void closeInternal() { + // Notify to FileDeleter to remove references of files referenced in this CatalogSnapshot + indexFileDeleterSupplier.get().removeFileReferences(this); + // Remove entry from catalogSnapshotMap + catalogSnapshotMap.remove(generation); + } + + public long getLastWriterGeneration() { + return lastWriterGeneration; + } + + public Set getDataFormats() { + return dfGroupedSearchableFiles.keySet(); + } + + // used only when catalog snapshot is created from last committed segment and hence the object is not initialized with the deleter and map + public void setIndexFileDeleterSupplier(Supplier supplier) { + if (this.indexFileDeleterSupplier == null) { + this.indexFileDeleterSupplier = supplier; + } + } + + @Override + public void setCatalogSnapshotMap(Map catalogSnapshotMap) { + this.catalogSnapshotMap = (Map) catalogSnapshotMap; + } + + @Override + public void setUserData(Map userData, boolean b) { + if (userData == null) { + this.userData = Collections.emptyMap(); + } else { + this.userData = new HashMap<>(userData); + } + } + + @Override + public long getId() { + return generation; + } + + @Override + public CompositeEngineCatalogSnapshot clone() { + CompositeEngineCatalogSnapshot cloned = new CompositeEngineCatalogSnapshot( + this.generation, + this.version, + new ArrayList<>(this.segmentList), + this.catalogSnapshotMap, + this.indexFileDeleterSupplier + ); + cloned.userData = new HashMap<>(this.userData); + cloned.lastWriterGeneration = this.lastWriterGeneration; + return cloned; + } + + @Override + public String toString() { + return "CompositeEngineCatalogSnapshot{" + "id=" + generation + ", version=" + version + ", dfGroupedSearchableFiles=" + dfGroupedSearchableFiles + ", segmentList=" + segmentList + ", userData=" + userData + '}'; + } +} diff --git a/server/src/main/java/org/opensearch/index/engine/exec/coord/IndexFileDeleter.java b/server/src/main/java/org/opensearch/index/engine/exec/coord/IndexFileDeleter.java index d365187b1e487..25a720cd56703 100644 --- a/server/src/main/java/org/opensearch/index/engine/exec/coord/IndexFileDeleter.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/coord/IndexFileDeleter.java @@ -83,8 +83,10 @@ private Map> segregateFilesByFormat(CatalogSnapshot s Collection dfFiles = new HashSet<>(); Collection fileSets = snapshot.getSearchableFiles(dataFormat); for (WriterFileSet fileSet : fileSets) { + Path directory = Path.of(fileSet.getDirectory()); for (String file : fileSet.getFiles()) { - dfFiles.add(fileSet.getDirectory() + "/" + file); + // ToDo: @Shreyansh update this to relative path + dfFiles.add(directory.resolve(file).toAbsolutePath().normalize().toString()); } } dfSegregatedFiles.put(dataFormat, dfFiles); @@ -100,15 +102,18 @@ private void deleteUnreferencedFiles(ShardPath shardPath) throws IOException { String dataFormat = entry.getKey(); Collection referencedFiles = entry.getValue().keySet(); Collection filesToDelete = new HashSet<>(); - // TODO - Currently hardcoding to get all parquet files in data path. Fix this - try (DirectoryStream stream = Files.newDirectoryStream(shardPath.getDataPath(), "*.parquet")) { + Path dataFormatPath = shardPath.getDataPath().resolve(dataFormat); + if (!Files.exists(dataFormatPath)) continue; + + try (DirectoryStream stream = Files.newDirectoryStream(dataFormatPath, "*." 
+ dataFormat)) { StreamSupport.stream(stream.spliterator(), false) - .map(Path::toString) + .map(p -> p.toAbsolutePath().normalize().toString()) .filter((file) -> (!referencedFiles.contains(file))) .forEach(filesToDelete::add); } - filesToDelete = filesToDelete.stream().map(file -> shardPath.getDataPath().resolve(file).toString()).collect(Collectors.toSet()); - dfFilesToDelete.put(dataFormat, filesToDelete); + if (!filesToDelete.isEmpty()) { + dfFilesToDelete.put(dataFormat, filesToDelete); + } } deleteUnreferencedFiles(dfFilesToDelete); } diff --git a/server/src/main/java/org/opensearch/index/engine/exec/coord/Segment.java b/server/src/main/java/org/opensearch/index/engine/exec/coord/Segment.java new file mode 100644 index 0000000000000..48fa6645b7757 --- /dev/null +++ b/server/src/main/java/org/opensearch/index/engine/exec/coord/Segment.java @@ -0,0 +1,82 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.index.engine.exec.coord; + +import org.opensearch.core.common.io.stream.StreamInput; +import org.opensearch.core.common.io.stream.StreamOutput; +import org.opensearch.core.common.io.stream.Writeable; +import org.opensearch.index.engine.exec.FileMetadata; +import org.opensearch.index.engine.exec.WriterFileSet; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Represents a segment in the catalog snapshot containing files grouped by data format. + */ +public class Segment implements Serializable, Writeable { + + private final long generation; + private final Map dfGroupedSearchableFiles; + + public Segment(long generation) { + this.dfGroupedSearchableFiles = new HashMap<>(); + this.generation = generation; + } + + public Segment(StreamInput in) throws IOException { + this.generation = in.readLong(); + this.dfGroupedSearchableFiles = new HashMap<>(); + int mapSize = in.readVInt(); + for (int i = 0; i < mapSize; i++) { + String dataFormat = in.readString(); + WriterFileSet writerFileSet = new WriterFileSet(in); + dfGroupedSearchableFiles.put(dataFormat, writerFileSet); + } + } + + public void addSearchableFiles(String dataFormat, WriterFileSet writerFileSetGroup) { + dfGroupedSearchableFiles.put(dataFormat, writerFileSetGroup); + } + + public Map getDFGroupedSearchableFiles() { + return dfGroupedSearchableFiles; + } + + public Collection getSearchableFiles(String df) { + List searchableFiles = new ArrayList<>(); + WriterFileSet fileSet = dfGroupedSearchableFiles.get(df); + if (fileSet != null) { + String directory = fileSet.getDirectory(); + for (String file : fileSet.getFiles()) { + searchableFiles.add(new FileMetadata(df, file)); + } + } + return searchableFiles; + } + + public long getGeneration() { + return generation; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeLong(generation); + out.writeVInt(dfGroupedSearchableFiles.size()); + for (Map.Entry entry : dfGroupedSearchableFiles.entrySet()) { + out.writeString(entry.getKey()); + entry.getValue().writeTo(out); + } + } +} diff --git a/server/src/main/java/org/opensearch/index/engine/exec/coord/SegmentInfosCatalogSnapshot.java b/server/src/main/java/org/opensearch/index/engine/exec/coord/SegmentInfosCatalogSnapshot.java index 03883a7bb001a..5521d987de952 
100644 --- a/server/src/main/java/org/opensearch/index/engine/exec/coord/SegmentInfosCatalogSnapshot.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/coord/SegmentInfosCatalogSnapshot.java @@ -16,20 +16,24 @@ import org.opensearch.core.common.io.stream.StreamInput; import org.opensearch.core.common.io.stream.StreamOutput; import org.opensearch.index.engine.exec.FileMetadata; +import org.opensearch.index.engine.exec.WriterFileSet; import java.io.IOException; +import java.nio.file.Path; import java.util.Collection; import java.util.List; import java.util.Map; -import java.util.function.Supplier; +import java.util.Set; import java.util.stream.Collectors; public class SegmentInfosCatalogSnapshot extends CatalogSnapshot { + private static final String CATALOG_SNAPSHOT_KEY = "_segment_infos_catalog_snapshot_"; + private final SegmentInfos segmentInfos; - public SegmentInfosCatalogSnapshot(long id, long version, List segmentList, Map catalogSnapshotMap, Supplier indexFileDeleterSupplier, SegmentInfos segmentInfos) { - super(id, version, segmentList, catalogSnapshotMap, indexFileDeleterSupplier); + public SegmentInfosCatalogSnapshot(SegmentInfos segmentInfos) { + super(CATALOG_SNAPSHOT_KEY + segmentInfos.getGeneration(), segmentInfos.getGeneration(), segmentInfos.getVersion()); this.segmentInfos = segmentInfos; } @@ -55,10 +59,76 @@ public void writeTo(StreamOutput out) throws IOException { @Override public Collection getFileMetadataList() throws IOException { - return segmentInfos.files(true).stream().map(file -> new FileMetadata(file, "lucene")).collect(Collectors.toList()); + return segmentInfos.files(true).stream().map(file -> new FileMetadata("lucene", file)).collect(Collectors.toList()); } public SegmentInfos getSegmentInfos() { return segmentInfos; } + + @Override + public Map getUserData() { + return segmentInfos.getUserData(); + } + + @Override + public long getId() { + return generation; + } + + @Override + public List getSegments() { + throw new UnsupportedOperationException("SegmentInfosCatalogSnapshot does not support getSegments()"); + } + + @Override + public Collection getSearchableFiles(String dataFormat) { + throw new UnsupportedOperationException("SegmentInfosCatalogSnapshot does not support getSearchableFiles()"); + } + + @Override + public Set getDataFormats() { + throw new UnsupportedOperationException("SegmentInfosCatalogSnapshot does not support getDataFormats()"); + } + + @Override + public long getLastWriterGeneration() { + return -1; + } + + @Override + public String serializeToString() throws IOException { + throw new UnsupportedOperationException("SegmentInfosCatalogSnapshot does not support serializeToString()"); + } + + @Override + public void remapPaths(Path newShardDataPath) { + // No-op for SegmentInfosCatalogSnapshot + } + + @Override + public void setIndexFileDeleterSupplier(java.util.function.Supplier supplier) { + // No-op for SegmentInfosCatalogSnapshot + } + + @Override + public void setCatalogSnapshotMap(Map catalogSnapshotMap) { + // No-op for SegmentInfosCatalogSnapshot + } + + @Override + public SegmentInfosCatalogSnapshot clone() { + return new SegmentInfosCatalogSnapshot(segmentInfos); + } + + @Override + protected void closeInternal() { + // TODO no op since SegmentInfosCatalogSnapshot is not refcounted + } + + @Override + public void setUserData(Map userData, boolean b) + { + // TODO no op since SegmentInfosCatalogSnapshot is not refcounted + } } diff --git 
a/server/src/main/java/org/opensearch/index/engine/exec/merge/CompositeMergeHandler.java b/server/src/main/java/org/opensearch/index/engine/exec/merge/CompositeMergeHandler.java index e9aaeffebca5e..6786e041ca9ea 100644 --- a/server/src/main/java/org/opensearch/index/engine/exec/merge/CompositeMergeHandler.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/merge/CompositeMergeHandler.java @@ -8,6 +8,8 @@ package org.opensearch.index.engine.exec.merge; +import org.opensearch.index.engine.exec.coord.Segment; + import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.message.ParameterizedMessage; import org.opensearch.common.logging.Loggers; @@ -50,12 +52,12 @@ public Collection findForceMerges(int maxSegmentCount) { try (CompositeEngine.ReleasableRef catalogSnapshotReleasableRef = compositeEngine.acquireSnapshot()) { CatalogSnapshot catalogSnapshot = catalogSnapshotReleasableRef.getRef(); - List segmentList = catalogSnapshot.getSegments(); - List> mergeCandidates = + List segmentList = catalogSnapshot.getSegments(); + List> mergeCandidates = mergePolicy.findForceMergeCandidates(segmentList, maxSegmentCount); // Process merge candidates - for (List mergeGroup : mergeCandidates) { + for (List mergeGroup : mergeCandidates) { oneMerges.add(new OneMerge(mergeGroup)); } } catch (Exception e) { @@ -71,12 +73,12 @@ public Collection findMerges() { try (CompositeEngine.ReleasableRef catalogSnapshotReleasableRef = compositeEngine.acquireSnapshot()) { CatalogSnapshot catalogSnapshot = catalogSnapshotReleasableRef.getRef(); - List segmentList = catalogSnapshot.getSegments(); - List> mergeCandidates = + List segmentList = catalogSnapshot.getSegments(); + List> mergeCandidates = mergePolicy.findMergeCandidates(segmentList); // Process merge candidates - for (List mergeGroup : mergeCandidates) { + for (List mergeGroup : mergeCandidates) { oneMerges.add(new OneMerge(mergeGroup)); } } catch (Exception e) { diff --git a/server/src/main/java/org/opensearch/index/engine/exec/merge/CompositeMergePolicy.java b/server/src/main/java/org/opensearch/index/engine/exec/merge/CompositeMergePolicy.java index c6f0c88a1ab88..f53e5efa0aba0 100644 --- a/server/src/main/java/org/opensearch/index/engine/exec/merge/CompositeMergePolicy.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/merge/CompositeMergePolicy.java @@ -8,6 +8,8 @@ package org.opensearch.index.engine.exec.merge; +import org.opensearch.index.engine.exec.coord.Segment; + import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.message.ParameterizedMessage; import org.apache.lucene.codecs.Codec; @@ -66,8 +68,8 @@ public void close() throws IOException { }; } - public List> findForceMergeCandidates(List segments, int maxSegmentCount) throws IOException { - Map segmentMap = new HashMap<>(); + public List> findForceMergeCandidates(List segments, int maxSegmentCount) throws IOException { + Map segmentMap = new HashMap<>(); SegmentInfos segmentInfos = convertToSegmentInfos(segments, segmentMap); Map segmentsToMerge = new HashMap<>(); @@ -84,8 +86,8 @@ public List> findForceMergeCandidates(List> findMergeCandidates(List segments) throws IOException { - Map segmentMap = new HashMap<>(); + public List> findMergeCandidates(List segments) throws IOException { + Map segmentMap = new HashMap<>(); SegmentInfos segmentInfos = convertToSegmentInfos(segments, segmentMap); try { @@ -100,12 +102,12 @@ public List> findMergeCandidates(List segments, - Map segmentMap + List segments, + Map segmentMap ) throws IOException { 
SegmentInfos segmentInfos = new SegmentInfos(Version.LATEST.major); - for (CatalogSnapshot.Segment segment : segments) { + for (Segment segment : segments) { SegmentWrapper wrapper = new SegmentWrapper(segment, calculateTotalSize(segment), calculateNumDocs(segment)); segmentInfos.add(wrapper); segmentMap.put(wrapper, segment); @@ -114,15 +116,15 @@ private SegmentInfos convertToSegmentInfos( return segmentInfos; } - private List> convertMergeSpecification( + private List> convertMergeSpecification( MergePolicy.MergeSpecification mergeSpecification, - Map segmentMap + Map segmentMap ) { - List> merges = new ArrayList<>(); + List> merges = new ArrayList<>(); if (mergeSpecification != null) { for (MergePolicy.OneMerge merge : mergeSpecification.merges) { - List segmentMerge = new ArrayList<>(); + List segmentMerge = new ArrayList<>(); for (SegmentCommitInfo segment : merge.segments) { segmentMerge.add(segmentMap.get(segment)); } @@ -153,7 +155,7 @@ public Set getMergingSegments() { return Collections.unmodifiableSet(mergingSegments); } - private long calculateNumDocs(CatalogSnapshot.Segment segment) { + private long calculateNumDocs(Segment segment) { try { return segment.getDFGroupedSearchableFiles().values() .stream() @@ -166,7 +168,7 @@ private long calculateNumDocs(CatalogSnapshot.Segment segment) { return 0; } - private long calculateTotalSize(CatalogSnapshot.Segment segment) { + private long calculateTotalSize(Segment segment) { try { return segment.getDFGroupedSearchableFiles().values() .stream() @@ -179,9 +181,9 @@ private long calculateTotalSize(CatalogSnapshot.Segment segment) { return 0; } - public synchronized void addMergingSegment(Collection segments) { + public synchronized void addMergingSegment(Collection segments) { try { - for (CatalogSnapshot.Segment segment : segments) { + for (Segment segment : segments) { SegmentWrapper wrapper = new SegmentWrapper(segment, calculateTotalSize(segment), calculateNumDocs(segment)); mergingSegments.add(wrapper); } @@ -191,11 +193,11 @@ public synchronized void addMergingSegment(Collection s } } - public synchronized void removeMergingSegment(Collection segments) { + public synchronized void removeMergingSegment(Collection segments) { List segmentToRemove = new ArrayList<>(); try { - for (CatalogSnapshot.Segment segment : segments) { + for (Segment segment : segments) { SegmentWrapper wrapper = new SegmentWrapper(segment, calculateTotalSize(segment), calculateNumDocs(segment)); segmentToRemove.add(wrapper); } @@ -209,7 +211,7 @@ public synchronized void removeMergingSegment(Collection dataFormatMergerMap; private final Deque mergingSegments = new ArrayDeque<>(); - private final Set currentlyMergingSegments = new HashSet<>(); + private final Set currentlyMergingSegments = new HashSet<>(); private final Logger logger; private final ShardId shardId; @@ -77,7 +79,7 @@ public synchronized void updatePendingMerges() { Collection oneMerges = findMerges(); for (OneMerge oneMerge : oneMerges) { boolean isValidMerge = true; - for (CatalogSnapshot.Segment segment : oneMerge.getSegmentsToMerge()) { + for (Segment segment : oneMerge.getSegmentsToMerge()) { if (currentlyMergingSegments.contains(segment)) { isValidMerge = false; break; @@ -92,8 +94,8 @@ public synchronized void updatePendingMerges() { public synchronized void registerMerge(OneMerge merge) { try (CompositeEngine.ReleasableRef catalogSnapshotReleasableRef = compositeEngine.acquireSnapshot()) { // Validate segments exist in catalog - List catalogSegments = 
catalogSnapshotReleasableRef.getRef().getSegments(); - for (CatalogSnapshot.Segment mergeSegment : merge.getSegmentsToMerge()) { + List catalogSegments = catalogSnapshotReleasableRef.getRef().getSegments(); + for (Segment mergeSegment : merge.getSegmentsToMerge()) { if (!catalogSegments.contains(mergeSegment)) { return; } @@ -201,7 +203,7 @@ private void cleanupStaleMergedFiles(Map mergedWriter private List getFilesToMerge(OneMerge oneMerge, DataFormat dataFormat) { List writerFileSets = new ArrayList<>(); - for (CatalogSnapshot.Segment segment : oneMerge.getSegmentsToMerge()) { + for (Segment segment : oneMerge.getSegmentsToMerge()) { writerFileSets.add(segment.getDFGroupedSearchableFiles().get(dataFormat.name())); } return writerFileSets; diff --git a/server/src/main/java/org/opensearch/index/engine/exec/merge/OneMerge.java b/server/src/main/java/org/opensearch/index/engine/exec/merge/OneMerge.java index d3a015573a85d..991cdb255273c 100644 --- a/server/src/main/java/org/opensearch/index/engine/exec/merge/OneMerge.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/merge/OneMerge.java @@ -9,23 +9,25 @@ package org.opensearch.index.engine.exec.merge; import org.opensearch.index.engine.exec.WriterFileSet; +import org.opensearch.index.engine.exec.coord.Segment; + import org.opensearch.index.engine.exec.coord.CatalogSnapshot; import java.util.Collections; import java.util.List; public class OneMerge { - private final List segmentsToMerge; + private final List segmentsToMerge; private final long totalSize; private final long totalNumDocs; - public OneMerge(List segmentsToMerge) { + public OneMerge(List segmentsToMerge) { this.segmentsToMerge = Collections.unmodifiableList(segmentsToMerge); this.totalSize = calculateTotalSizeInBytes(); this.totalNumDocs = calculateTotalNumDocs(); } - public List getSegmentsToMerge() { + public List getSegmentsToMerge() { return segmentsToMerge; } diff --git a/server/src/main/java/org/opensearch/index/engine/exec/text/TextEngine.java b/server/src/main/java/org/opensearch/index/engine/exec/text/TextEngine.java index d1b320a625729..8f43091693274 100644 --- a/server/src/main/java/org/opensearch/index/engine/exec/text/TextEngine.java +++ b/server/src/main/java/org/opensearch/index/engine/exec/text/TextEngine.java @@ -8,6 +8,8 @@ package org.opensearch.index.engine.exec.text; +import org.opensearch.index.engine.exec.coord.Segment; + import org.opensearch.index.engine.exec.DataFormat; import org.opensearch.index.engine.exec.DocumentInput; import org.opensearch.index.engine.exec.FileInfos; @@ -78,7 +80,7 @@ public void deleteFiles(Map> filesToDelete) throws IO public RefreshResult refresh(RefreshInput refreshInput) throws IOException { openFiles.addAll(refreshInput.getWriterFiles()); RefreshResult refreshResult = new RefreshResult(); - CatalogSnapshot.Segment segment = new CatalogSnapshot.Segment(0); + Segment segment = new Segment(0); openFiles.forEach(file -> segment.addSearchableFiles(DataFormat.TEXT.name(), file)); refreshResult.setRefreshedSegments(List.of(segment)); return refreshResult; diff --git a/server/src/main/java/org/opensearch/index/remote/RemoteStoreCustomMetadataResolver.java b/server/src/main/java/org/opensearch/index/remote/RemoteStoreCustomMetadataResolver.java index e8a0dda5a699e..ed537f5005b72 100644 --- a/server/src/main/java/org/opensearch/index/remote/RemoteStoreCustomMetadataResolver.java +++ b/server/src/main/java/org/opensearch/index/remote/RemoteStoreCustomMetadataResolver.java @@ -14,6 +14,7 @@ import 
org.opensearch.index.remote.RemoteStoreEnums.PathHashAlgorithm; import org.opensearch.index.remote.RemoteStoreEnums.PathType; import org.opensearch.indices.RemoteStoreSettings; +import org.opensearch.node.remotestore.RemoteStoreNodeAttribute; import org.opensearch.repositories.RepositoriesService; import org.opensearch.repositories.Repository; import org.opensearch.repositories.RepositoryMissingException; @@ -71,4 +72,19 @@ public boolean isTranslogMetadataEnabled() { && blobStoreRepository.blobStore().isBlobMetadataEnabled(); } + public boolean isRemoteStoreRepoServerSideEncryptionEnabled() { + BlobStoreRepository segmentRepository, translogRepository; + try { + segmentRepository = (BlobStoreRepository) repositoriesServiceSupplier.get() + .repository(RemoteStoreNodeAttribute.getRemoteStoreSegmentRepo(settings)); + translogRepository = (BlobStoreRepository) repositoriesServiceSupplier.get() + .repository(RemoteStoreNodeAttribute.getRemoteStoreTranslogRepo(settings)); + } catch (RepositoryMissingException ex) { + throw new IllegalArgumentException("Repository should be created before creating index with remote_store enabled setting", ex); + } + return Version.V_3_3_0.compareTo(minNodeVersionSupplier.get()) <= 0 + && remoteStoreSettings.isClusterServerSideEncryptionEnabled() + && segmentRepository.isSeverSideEncryptionEnabled() + && translogRepository.isSeverSideEncryptionEnabled(); + } } diff --git a/server/src/main/java/org/opensearch/index/remote/RemoteStoreUtils.java b/server/src/main/java/org/opensearch/index/remote/RemoteStoreUtils.java index 32a1ca0e5d5ab..3b3b9b729bc39 100644 --- a/server/src/main/java/org/opensearch/index/remote/RemoteStoreUtils.java +++ b/server/src/main/java/org/opensearch/index/remote/RemoteStoreUtils.java @@ -248,6 +248,11 @@ public static Map determineRemoteStoreCustomMetadataDuringMigrat return remoteCustomData; } + public static boolean isServerSideEncryptionEnabledIndex(IndexMetadata indexMetadata) { + Map remoteCustomData = indexMetadata.getCustomData(IndexMetadata.REMOTE_STORE_CUSTOM_KEY); + return remoteCustomData != null && "true".equalsIgnoreCase(remoteCustomData.get(IndexMetadata.REMOTE_STORE_SSE_ENABLED_INDEX_KEY)); + } + /** * Fetches segment and translog repository names from remote store node attributes. * Returns a blank {@link HashMap} if the cluster does not contain any remote nodes. 
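For reference, the following is a minimal, self-contained sketch (not part of this change) of how the per-index SSE flag introduced above is intended to round-trip: the resolver's isRemoteStoreRepoServerSideEncryptionEnabled() gates whether the flag is recorded in the index's remote-store custom metadata at creation time, and RemoteStoreUtils.isServerSideEncryptionEnabledIndex() later reads it back. The class, method names, and the plain Map used in place of IndexMetadata custom data below are illustrative assumptions only.

// Illustrative sketch; mirrors the semantics of the new helpers using a plain Map
// instead of IndexMetadata custom data. Key value is a hypothetical stand-in for
// IndexMetadata.REMOTE_STORE_SSE_ENABLED_INDEX_KEY.
import java.util.HashMap;
import java.util.Map;

final class SseFlagSketch {
    static final String SSE_ENABLED_INDEX_KEY = "sse_enabled_index";

    // At index creation: stamp the flag only when the cluster/repository-level check passed.
    static Map<String, String> stampSseFlag(Map<String, String> remoteCustomData, boolean repoSseEnabled) {
        Map<String, String> updated = new HashMap<>(remoteCustomData);
        updated.put(SSE_ENABLED_INDEX_KEY, Boolean.toString(repoSseEnabled));
        return updated;
    }

    // Later reads: same null-safe, case-insensitive check as isServerSideEncryptionEnabledIndex().
    static boolean isSseEnabledIndex(Map<String, String> remoteCustomData) {
        return remoteCustomData != null && "true".equalsIgnoreCase(remoteCustomData.get(SSE_ENABLED_INDEX_KEY));
    }

    public static void main(String[] args) {
        Map<String, String> custom = stampSseFlag(new HashMap<>(), true);
        System.out.println(isSseEnabledIndex(custom)); // true
        System.out.println(isSseEnabledIndex(null));   // false
    }
}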
diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 24596eee8ae37..0ac0f2548ce29 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -48,11 +48,7 @@ import org.apache.lucene.search.QueryCachingPolicy; import org.apache.lucene.search.ReferenceManager; import org.apache.lucene.search.Sort; -import org.apache.lucene.store.AlreadyClosedException; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FilterDirectory; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.*; import org.apache.lucene.util.ThreadInterruptedException; import org.apache.lucene.util.Version; import org.opensearch.ExceptionsHelper; @@ -62,6 +58,7 @@ import org.opensearch.action.admin.indices.forcemerge.ForceMergeRequest; import org.opensearch.action.admin.indices.streamingingestion.state.ShardIngestionState; import org.opensearch.action.admin.indices.upgrade.post.UpgradeRequest; +import org.opensearch.action.support.PlainActionFuture; import org.opensearch.action.support.replication.PendingReplicationActions; import org.opensearch.action.support.replication.ReplicationResponse; import org.opensearch.cluster.metadata.DataStream; @@ -98,6 +95,7 @@ import org.opensearch.common.settings.Settings; import org.opensearch.common.unit.TimeValue; import org.opensearch.common.util.BigArrays; +import org.opensearch.common.util.CancellableThreads; import org.opensearch.common.util.concurrent.AbstractAsyncTask; import org.opensearch.common.util.concurrent.AbstractRunnable; import org.opensearch.common.util.concurrent.AsyncIOProcessor; @@ -138,6 +136,7 @@ import org.opensearch.index.engine.IngestionEngine; import org.opensearch.index.engine.MergedSegmentWarmerFactory; import org.opensearch.index.engine.NRTReplicationEngine; +import org.opensearch.index.engine.NRTReplicationCompositeEngine; import org.opensearch.index.engine.ReadOnlyEngine; import org.opensearch.index.engine.RefreshFailedEngineException; import org.opensearch.index.engine.SafeCommitInfo; @@ -151,6 +150,7 @@ import org.opensearch.index.engine.exec.composite.CompositeDataFormatWriter; import org.opensearch.index.engine.exec.coord.CatalogSnapshot; import org.opensearch.index.engine.exec.coord.CompositeEngine; +import org.opensearch.index.engine.exec.coord.SegmentInfosCatalogSnapshot; import org.opensearch.index.fielddata.FieldDataStats; import org.opensearch.index.fielddata.ShardFieldData; import org.opensearch.index.flush.FlushStats; @@ -171,6 +171,7 @@ import org.opensearch.index.remote.RemoteSegmentStats; import org.opensearch.index.remote.RemoteStorePathStrategy; import org.opensearch.index.remote.RemoteStoreStatsTrackerFactory; +import org.opensearch.index.remote.RemoteStoreUtils; import org.opensearch.index.search.stats.SearchStats; import org.opensearch.index.search.stats.ShardSearchStats; import org.opensearch.index.seqno.LocalCheckpointTracker; @@ -183,8 +184,14 @@ import org.opensearch.index.seqno.SequenceNumbers; import org.opensearch.index.shard.PrimaryReplicaSyncer.ResyncTask; import org.opensearch.index.similarity.SimilarityService; -import org.opensearch.index.store.*; +import org.opensearch.index.store.CompositeStoreDirectory; +import org.opensearch.index.store.RemoteSegmentStoreDirectory; +import org.opensearch.index.store.RemoteStoreFileDownloader; +import 
org.opensearch.index.store.Store; import org.opensearch.index.store.Store.MetadataSnapshot; +import org.opensearch.index.store.StoreFileMetadata; +import org.opensearch.index.store.StoreStats; +import org.opensearch.index.store.UploadedSegmentMetadata; import org.opensearch.index.store.remote.metadata.RemoteSegmentMetadata; import org.opensearch.index.translog.RemoteBlobStoreInternalTranslogFactory; import org.opensearch.index.translog.RemoteFsTranslog; @@ -210,6 +217,7 @@ import org.opensearch.indices.recovery.RecoverySettings; import org.opensearch.indices.recovery.RecoveryState; import org.opensearch.indices.recovery.RecoveryTarget; +import org.opensearch.indices.replication.CompositeStoreDirectoryStatsWrapper; import org.opensearch.indices.replication.checkpoint.MergedSegmentCheckpoint; import org.opensearch.indices.replication.checkpoint.MergedSegmentPublisher; import org.opensearch.indices.replication.checkpoint.ReferencedSegmentsCheckpoint; @@ -260,6 +268,7 @@ import java.util.stream.Collectors; import java.util.stream.StreamSupport; +import static org.opensearch.action.support.PlainActionFuture.newFuture; import static org.opensearch.index.seqno.RetentionLeaseActions.RETAIN_ALL; import static org.opensearch.index.seqno.SequenceNumbers.LOCAL_CHECKPOINT_KEY; import static org.opensearch.index.seqno.SequenceNumbers.MAX_SEQ_NO; @@ -551,7 +560,7 @@ public boolean shouldCache(Query query) { this.remoteStoreStatsTrackerFactory = remoteStoreStatsTrackerFactory; this.recoverySettings = recoverySettings; this.remoteStoreSettings = remoteStoreSettings; - this.fileDownloader = new RemoteStoreFileDownloader(shardRouting.shardId(), threadPool, recoverySettings); + this.fileDownloader = new RemoteStoreFileDownloader(shardRouting.shardId(), threadPool, recoverySettings, isOptimizedIndex()); this.shardMigrationState = getShardMigrationState(indexSettings, seedRemote); this.discoveryNodes = discoveryNodes; this.segmentReplicationStatsProvider = segmentReplicationStatsProvider; @@ -1174,7 +1183,7 @@ public Engine.IndexResult applyIndexOperationOnReplica( Engine.Operation.Origin.REPLICA, sourceToParse, id, - null + getIndexer()::documentInput ); } @@ -1533,9 +1542,14 @@ public DocsStats docStats() { * @throws AlreadyClosedException if shard is closed */ public CommitStats commitStats() { - return getStatsHolder().commitStats(); + final StatsHolder statsHolder = getStatsHolderOrNull(); + if (statsHolder == null) { + throw new AlreadyClosedException("engine is closed"); + } + return statsHolder.commitStats(); } + /** * @return {@link SeqNoStats} * @throws AlreadyClosedException if shard is closed @@ -1830,6 +1844,10 @@ public void finalizeReplication(SegmentInfos infos) throws IOException { * @throws IOException if an error occurs during replication finalization */ public void finalizeReplication(CatalogSnapshot catalogSnapshot, ReplicationCheckpoint replicationCheckpoint) throws IOException { + if (catalogSnapshot instanceof SegmentInfosCatalogSnapshot) { + finalizeReplication(((SegmentInfosCatalogSnapshot) catalogSnapshot).getSegmentInfos()); + return; + } if (Thread.holdsLock(mutex)) { throw new IllegalStateException("finalizeReplication must not be called under mutex - potential deadlock risk"); } @@ -1931,7 +1949,7 @@ public GatedCloseable acquireSafeIndexCommit() throws EngineExcepti final IndexShardState state = this.state; // one time volatile read // we allow snapshot on closed index shard, since we want to do one after we close the shard and before we close the engine if (state == 
IndexShardState.STARTED || state == IndexShardState.CLOSED) { - return getIndexingExecutionCoordinator().acquireSafeIndexCommit(); + return getIndexer().acquireSafeIndexCommit(); } else { throw new IllegalIndexShardStateException(shardId, state, "snapshot is not allowed"); } @@ -1991,6 +2009,9 @@ public Tuple, ReplicationCheckpoint> getLatestSegme * TODO: SegRep changes for decoupling. looks to depend on codec. */ ReplicationCheckpoint computeReplicationCheckpoint(CatalogSnapshot catalogSnapshot) throws IOException { + if (catalogSnapshot instanceof SegmentInfosCatalogSnapshot) { + return computeReplicationCheckpoint(((SegmentInfosCatalogSnapshot) catalogSnapshot).getSegmentInfos()); + } if (catalogSnapshot == null) { return ReplicationCheckpoint.empty(shardId); } @@ -2011,7 +2032,7 @@ ReplicationCheckpoint computeReplicationCheckpoint(CatalogSnapshot catalogSnapsh catalogSnapshot.getVersion(), formatAwareMetadataMap.values().stream().mapToLong(StoreFileMetadata::length).sum(), formatAwareMetadataMap, - getEngine().config().getCodec().getName() + getIndexer().config().getCodec().getName() ); logger.trace("Recomputed ReplicationCheckpoint from CatalogSnapshot for shard {}", checkpoint); return checkpoint; @@ -2022,16 +2043,26 @@ ReplicationCheckpoint computeReplicationCheckpoint(CatalogSnapshot catalogSnapsh * Creates a mapping from FileMetadata to StoreFileMetadata preserving format information. */ private Map extractFormatAwareMetadata(CatalogSnapshot catalogSnapshot) throws IOException { + if (!isOptimizedIndex()) { + return getSegmentMetadataMap().entrySet().stream().collect( + Collectors.toMap( + e -> new FileMetadata("lucene", e.getKey()), + Map.Entry::getValue + ) + ); + } Map formatAwareMap = new HashMap<>(); - if(catalogSnapshot == null){ + if (catalogSnapshot == null) { return formatAwareMap; } for (FileMetadata fileMetadata : catalogSnapshot.getFileMetadataList()) { try { - long fileLength = store.compositeStoreDirectory().fileLength(fileMetadata); - long checksum = store.compositeStoreDirectory().calculateChecksum(fileMetadata); + Directory storeDirectory = isOptimizedIndex() ? store.compositeStoreDirectory() : store().directory(); + String fileName = isOptimizedIndex() ? fileMetadata.serialize() : fileMetadata.file(); + long fileLength = storeDirectory.fileLength(fileName); + long checksum = ((CompositeStoreDirectory) storeDirectory).calculateChecksum(fileMetadata); StoreFileMetadata storeFileMetadata = new StoreFileMetadata( fileMetadata.file(), @@ -2248,8 +2279,15 @@ public Store.MetadataSnapshot snapshotStoreMetadata() throws IOException { logger.debug("CompositeEngine deletion policy not initialized during peer recovery, falling back to direct store access for shard [{}]", shardId); wrappedIndexCommit = null; } + } else { + // Use regular Engine for non-optimized indices + Engine engine = currentEngineReference.get(); + if (engine != null) { + wrappedIndexCommit = engine.acquireSafeIndexCommit(); + } } if (wrappedIndexCommit == null) { + // Only use direct store access when no engine is running return store.getMetadata(null, true); } } @@ -3021,7 +3059,17 @@ private void innerOpenEngineAndTranslog(LongSupplier globalCheckpointSupplier, b // we must create a new engine under mutex (see IndexShard#snapshotStoreMetadata). final Engine newEngine = engineFactory.newReadWriteEngine(config); if (indexSettings.isOptimizedIndex()) { - CompositeEngine compositeEngine = new CompositeEngine( + CompositeEngine compositeEngine = config.isReadOnlyReplica() + ? 
new NRTReplicationCompositeEngine( + config, + mapperService, + pluginsService, + indexSettings, + path, + LocalCheckpointTracker::new, + TranslogEventListener.NOOP_TRANSLOG_EVENT_LISTENER + ) + : new CompositeEngine( config, mapperService, pluginsService, @@ -3030,6 +3078,7 @@ private void innerOpenEngineAndTranslog(LongSupplier globalCheckpointSupplier, b LocalCheckpointTracker::new, TranslogEventListener.NOOP_TRANSLOG_EVENT_LISTENER ); + // Don't set currentCompositeEngineReference for replicas currentCompositeEngineReference.set(compositeEngine); } onNewEngine(newEngine); @@ -3068,6 +3117,7 @@ private boolean assertSequenceNumbersInCommit() throws IOException { + "] is different than engine [" + getHistoryUUID() + "]"; + assert userData.containsKey(Engine.MAX_UNSAFE_AUTO_ID_TIMESTAMP_COMMIT_ID) : "opening index which was created post 5.5.0 but " + Engine.MAX_UNSAFE_AUTO_ID_TIMESTAMP_COMMIT_ID + " is not found in commit"; @@ -3247,9 +3297,14 @@ public long getIndexBufferRAMBytesUsed() { } public long getNativeBytesUsed() { - return getIndexer().getNativeBytesUsed(); + Indexer indexer = getIndexer(); + if (indexer == null) { + return 0; + } + return indexer.getNativeBytesUsed(); } + public void addShardFailureCallback(Consumer onShardFailure) { this.shardEventListener.delegates.add(onShardFailure); } @@ -4157,7 +4212,7 @@ public CheckpointState getCheckpointState() { } public StatsHolder getStatsHolder() { - return indexSettings.isOptimizedIndex() ? getIndexingExecutionCoordinator(): currentEngineReference.get(); + return indexSettings.isOptimizedIndex() ? getIndexingExecutionCoordinator() : currentEngineReference.get(); } public IndexingThrottler getIndexingThrottler() { @@ -4529,6 +4584,10 @@ public boolean isRemoteTranslogEnabled() { return indexSettings() != null && (indexSettings().isRemoteTranslogStoreEnabled()); } + public boolean isOptimizedIndex() { + return indexSettings().isOptimizedIndex(); + } + /** * This checks if we are in state to upload to remote store. Until the cluster-manager informs the shard through * cluster state, the shard will not be in STARTED state. 
This method is used to prevent pre-emptive segment or @@ -5340,13 +5399,29 @@ public void afterRefresh(boolean didRefresh) throws IOException { } private void updateReplicationCheckpoint() { - try (CompositeEngine.ReleasableRef catalogSnapshotRef = getCatalogSnapshotFromEngine()) { - final ReplicationCheckpoint checkpoint = computeReplicationCheckpoint(catalogSnapshotRef.getRef()); - replicationTracker.setLatestReplicationCheckpoint(checkpoint); - logger.trace("Updated replication checkpoint from CatalogSnapshot: shard={}, checkpoint={}", shardId, checkpoint); - } catch (Exception e) { - logger.error("Error computing replication checkpoint from catalog snapshot for shard [{}]", shardId, e); - // throw new OpenSearchException("Error computing replication checkpoint from catalog snapshot", e); + if (isOptimizedIndex()) { + CompositeEngine compositeEngine = currentCompositeEngineReference.get(); + // Use CompositeEngine's CatalogSnapshot for optimized indices + try (CompositeEngine.ReleasableRef catalogSnapshotRef = compositeEngine.acquireSnapshot()) { + final ReplicationCheckpoint checkpoint = computeReplicationCheckpoint(catalogSnapshotRef.getRef()); + replicationTracker.setLatestReplicationCheckpoint(checkpoint); + } catch (Exception e) { + logger.error("Error computing replication checkpoint from catalog snapshot for shard [{}]", shardId, e); + } + } else { + // Fall back to standard engine for non-optimized segment replication + Engine engine = getEngineOrNull(); + if (engine == null) { + logger.debug("Skipping replication checkpoint update - engine not initialized yet for shard [{}]", shardId); + return; + } + try (GatedCloseable segmentInfosSnapshot = engine.getSegmentInfosSnapshot()) { + final ReplicationCheckpoint checkpoint = computeReplicationCheckpoint(segmentInfosSnapshot.get()); + replicationTracker.setLatestReplicationCheckpoint(checkpoint); + logger.trace("Updated replication checkpoint from SegmentInfos: shard={}, checkpoint={}", shardId, checkpoint); + } catch (Exception e) { + logger.error("Error computing replication checkpoint from engine for shard [{}]", shardId, e); + } } } @@ -5447,17 +5522,7 @@ public void close() throws IOException { if ((indexSettings.isRemoteTranslogStoreEnabled() || this.isRemoteSeeded()) && shardRouting.primary()) { syncRemoteTranslogAndUpdateGlobalCheckpoint(); } - newEngineReference.set(engineFactory.newReadWriteEngine(newEngineConfig(replicationTracker))); - onNewEngine(newEngineReference.get()); - } - final TranslogRecoveryRunner translogRunner = (snapshot) -> runTranslogRecovery( - newEngineReference.get(), - snapshot, - Engine.Operation.Origin.LOCAL_RESET, - () -> { - // TODO: add a dedicate recovery stats for the reset translog - } - ); + } // When the new engine is created, translogs are synced from remote store onto local. Since remote store is the source // of truth for translog, we play all translogs that exists locally. Otherwise, the recoverUpto happens upto global checkpoint. @@ -5466,13 +5531,66 @@ public void close() throws IOException { long recoverUpto = this.isRemoteTranslogEnabled() || indexSettings().isSegRepEnabledOrRemoteNode() ? 
Long.MAX_VALUE : globalCheckpoint; - newEngineReference.get() - .translogManager() - .recoverFromTranslog(translogRunner, newEngineReference.get().getProcessedLocalCheckpoint(), recoverUpto); - newEngineReference.get().refresh("reset_engine"); + + // Only create CompositeEngine for optimized indices + CompositeEngine newCompositeEngine; + if (indexSettings.isOptimizedIndex()) { + // Create NEW CompositeEngine OUTSIDE synchronized block with fresh translog + newCompositeEngine = new CompositeEngine( + newEngineConfig(replicationTracker), + mapperService, + pluginsService, + indexSettings, + path, + LocalCheckpointTracker::new, + TranslogEventListener.NOOP_TRANSLOG_EVENT_LISTENER + ); + + final TranslogRecoveryRunner translogRunner = (snapshot) -> runTranslogRecovery( + newCompositeEngine, + snapshot, + Engine.Operation.Origin.LOCAL_RESET, + () -> { + // TODO: add a dedicate recovery stats for the reset translog + } + ); + + // Recover the NEW CompositeEngine's translog FIRST + newCompositeEngine + .translogManager() + .recoverFromTranslog(translogRunner, newCompositeEngine.getProcessedLocalCheckpoint(), recoverUpto); + newCompositeEngine.refresh("reset_engine"); + } else { + newCompositeEngine = null; + } + + // Create InternalEngine AFTER translog recovery so it reads the updated commit with correct checkpoints + final Engine newEngine = engineFactory.newReadWriteEngine(newEngineConfig(replicationTracker)); + newEngineReference.set(newEngine); + + if (!indexSettings.isOptimizedIndex()) { + synchronized (engineMutex) { + onNewEngine(newEngineReference.get()); + } + final TranslogRecoveryRunner translogRunner = (snapshot) -> runTranslogRecovery( + newEngineReference.get(), + snapshot, + Engine.Operation.Origin.LOCAL_RESET, + () -> { + // TODO: add a dedicate recovery stats for the reset translog + } + ); + newEngineReference.get() + .translogManager() + .recoverFromTranslog(translogRunner, newEngineReference.get().getProcessedLocalCheckpoint(), recoverUpto); + newEngineReference.get().refresh("reset_engine"); + } + synchronized (engineMutex) { verifyNotClosed(); - IOUtils.close(currentEngineReference.getAndSet(newEngineReference.get())); + IOUtils.close(currentEngineReference.getAndSet(newEngineReference.get()), currentCompositeEngineReference.getAndSet(newCompositeEngine)); + + // onNewEngine must be called inside synchronized(engineMutex) block for both optimized and non-optimized indices // We set active because we are now writing operations to the engine; this way, // if we go idle after some time and become inactive, we still give sync'd flush a chance to run. 
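The reset path above is order-sensitive for optimized indices: the new CompositeEngine replays its translog and refreshes before the InternalEngine is created, so the latter opens against the updated commit, and both references are then swapped under the engine mutex. A rough sketch of just that ordering, with trivial stand-in types (the non-optimized branch and closing of the old engines are elided):

import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Supplier;

public final class EngineResetOrderSketch {
    // Minimal stand-ins; only the calls relevant to the ordering are modeled.
    interface CompositeEngine { void recoverFromTranslog(); void refresh(String source); }
    interface Engine { }

    static void reset(boolean optimizedIndex,
                      Supplier<CompositeEngine> newCompositeEngine,
                      Supplier<Engine> newInternalEngine,
                      AtomicReference<Engine> engineRef,
                      AtomicReference<CompositeEngine> compositeRef,
                      Object engineMutex) {
        CompositeEngine composite = null;
        if (optimizedIndex) {
            // 1. Build the new CompositeEngine and replay its translog first ...
            composite = newCompositeEngine.get();
            composite.recoverFromTranslog();
            composite.refresh("reset_engine");
        }
        // 2. ... so the InternalEngine created afterwards reads the updated commit.
        Engine engine = newInternalEngine.get();
        // 3. Swap both references under the engine mutex (closing the previous engines is elided).
        synchronized (engineMutex) {
            engineRef.set(engine);
            compositeRef.set(composite);
        }
    }
}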
active.set(true); @@ -5497,7 +5615,8 @@ public void deleteTranslogFilesFromRemoteTranslog() throws IOException { getThreadPool(), indexSettings.getRemoteStorePathStrategy(), remoteStoreSettings, - indexSettings().isTranslogMetadataEnabled() + indexSettings().isTranslogMetadataEnabled(), + RemoteStoreUtils.isServerSideEncryptionEnabledIndex(indexSettings.getIndexMetadata()) ); } @@ -5520,7 +5639,8 @@ public void syncTranslogFilesFromRemoteTranslog() throws IOException { shardId, indexSettings.getRemoteStorePathStrategy(), indexSettings().isTranslogMetadataEnabled(), - 0 + 0, + RemoteStoreUtils.isServerSideEncryptionEnabledIndex(indexSettings.getIndexMetadata()) ); } @@ -5530,6 +5650,24 @@ public void syncTranslogFilesFromGivenRemoteTranslog( RemoteStorePathStrategy remoteStorePathStrategy, boolean isTranslogMetadataEnabled, long timestamp + ) throws IOException { + this.syncTranslogFilesFromGivenRemoteTranslog( + repository, + shardId, + remoteStorePathStrategy, + isTranslogMetadataEnabled, + timestamp, + false + ); + } + + public void syncTranslogFilesFromGivenRemoteTranslog( + Repository repository, + ShardId shardId, + RemoteStorePathStrategy remoteStorePathStrategy, + boolean isTranslogMetadataEnabled, + long timestamp, + boolean isServerSideEncryptionEnabled ) throws IOException { RemoteFsTranslog.download( repository, @@ -5541,7 +5679,8 @@ public void syncTranslogFilesFromGivenRemoteTranslog( logger, shouldSeedRemoteStore(), isTranslogMetadataEnabled, - timestamp + timestamp, + isServerSideEncryptionEnabled ); } @@ -5570,36 +5709,51 @@ public void syncSegmentsFromRemoteSegmentStore(boolean overrideLocal, final Runn // are uploaded to the remote segment store. RemoteSegmentMetadata remoteSegmentMetadata = remoteDirectory.init(); - Map uploadedSegments = remoteDirectory - .getSegmentsUploadedToRemoteStore() - .entrySet() - .stream() - .filter(entry -> entry.getKey().startsWith(IndexFileNames.SEGMENTS) == false) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + Map uploadedSegments = remoteDirectory.getSegmentsUploadedToRemoteStore(); + Map filteredSegments = new HashMap<>(); + for (Map.Entry entry : uploadedSegments.entrySet()) { + if (!entry.getKey().startsWith(IndexFileNames.SEGMENTS)) { + filteredSegments.put(entry.getKey(), entry.getValue()); + } + } store.incRef(); remoteStore.incRef(); try { final Directory storeDirectory; if (recoveryState.getStage() == RecoveryState.Stage.INDEX) { - storeDirectory = new StoreRecovery.StatsDirectoryWrapper(store.directory(), recoveryState.getIndex()); - for (String file : uploadedSegments.keySet()) { - long checksum = Long.parseLong(uploadedSegments.get(file).getChecksum()); + Store.StoreDirectory directory = isOptimizedIndex() ? 
store().compositeStoreDirectory() : (Store.StoreDirectory) store().directory(); + storeDirectory = new StoreRecovery.StatsDirectoryWrapper(directory, recoveryState.getIndex()); + for (String file : filteredSegments.keySet()) { + long checksum = Long.parseLong(filteredSegments.get(file).getChecksum()); + boolean fileExistsLocally; + + // Parse FileMetadata from serialized key to get actual filename FileMetadata fileMetadata = new FileMetadata(file); - if (overrideLocal || localDirectoryContains(storeDirectory, fileMetadata, checksum) == false) { - recoveryState.getIndex().addFileDetail(fileMetadata.file(), uploadedSegments.get(file).getLength(), false); + if (isOptimizedIndex() && directory instanceof CompositeStoreDirectory) { + fileExistsLocally = localDirectoryContains((CompositeStoreDirectory) directory, fileMetadata, checksum); + } else { + // For non-optimized indices, use the actual filename from FileMetadata + fileExistsLocally = localDirectoryContainsFile(storeDirectory, fileMetadata.file(), checksum); + } + + if (overrideLocal || !fileExistsLocally) { + recoveryState.getIndex().addFileDetail(file, filteredSegments.get(file).getLength(), false); } else { - recoveryState.getIndex().addFileDetail(fileMetadata.file(), uploadedSegments.get(file).getLength(), true); + recoveryState.getIndex().addFileDetail(file, filteredSegments.get(file).getLength(), true); } } } else { - storeDirectory = store.directory(); + storeDirectory = isOptimizedIndex() + ? store().compositeStoreDirectory() + : store.directory(); } if (indexSettings.isWarmIndex() == false) { - // ToDo:@Kamal update while restore implementation - // copySegmentFiles(storeDirectory, remoteDirectory, null, uploadedSegments, overrideLocal, onFileSync); + copySegmentFiles(storeDirectory, remoteDirectory, null, filteredSegments, overrideLocal, onFileSync); } if (remoteSegmentMetadata != null) { + // Remote store always stores Lucene SegmentInfos format (for both optimized and non-optimized indices) + // For optimized indices, the CatalogSnapshot is embedded within userData of the SegmentInfos final SegmentInfos infosSnapshot = store.buildSegmentInfos( remoteSegmentMetadata.getSegmentInfosBytes(), remoteSegmentMetadata.getGeneration() @@ -5631,7 +5785,6 @@ public void syncSegmentsFromRemoteSegmentStore(boolean overrideLocal, final Runn } /** - * ToDo: @Kamal, Implement this API during Restore flow * Downloads segments from given remote segment store for a specific commit. * @param overrideLocal flag to override local segment files with those in remote store * @param sourceRemoteDirectory RemoteSegmentDirectory Instance from which we need to sync segments @@ -5643,47 +5796,158 @@ public void syncSegmentsFromGivenRemoteSegmentStore( RemoteSegmentMetadata remoteSegmentMetadata, boolean pinnedTimestamp ) throws IOException { - throw new UnsupportedOperationException("Not implemented yet"); + logger.trace("Downloading segments from given remote segment store"); + RemoteSegmentStoreDirectory remoteDirectory = null; + if (remoteStore != null) { + remoteDirectory = getRemoteDirectory(); + remoteDirectory.init(); + remoteStore.incRef(); + } + Map uploadedSegments = sourceRemoteDirectory + .getSegmentsUploadedToRemoteStore(); + store.incRef(); + try { + final Directory storeDirectory; + if (recoveryState.getStage() == RecoveryState.Stage.INDEX) { + // Fix: Add isOptimizedIndex() check for optimized indices + Store.StoreDirectory directory = isOptimizedIndex() + ? 
store().compositeStoreDirectory() + : (Store.StoreDirectory) store().directory(); + storeDirectory = new StoreRecovery.StatsDirectoryWrapper(directory, recoveryState.getIndex()); + for (String file : uploadedSegments.keySet()) { + long checksum = Long.parseLong(uploadedSegments.get(file).getChecksum()); + boolean fileExistsLocally; + + // Fix: Use format-aware checksum for optimized indices + FileMetadata fileMetadata = new FileMetadata(file); + if (isOptimizedIndex() && directory instanceof CompositeStoreDirectory) { + fileExistsLocally = localDirectoryContains((CompositeStoreDirectory) directory, fileMetadata, checksum); + } else { + fileExistsLocally = localDirectoryContainsFile(storeDirectory, fileMetadata.file(), checksum); + } + + if (overrideLocal || !fileExistsLocally) { + recoveryState.getIndex().addFileDetail(file, uploadedSegments.get(file).getLength(), false); + } else { + recoveryState.getIndex().addFileDetail(file, uploadedSegments.get(file).getLength(), true); + } + } + } else { + storeDirectory = isOptimizedIndex() + ? store().compositeStoreDirectory() + : store.directory(); + } + + String segmentsNFile = copySegmentFiles( + storeDirectory, + sourceRemoteDirectory, + remoteDirectory, + uploadedSegments, + overrideLocal, + () -> {} + ); + if (pinnedTimestamp) { + final SegmentInfos infosSnapshot = store.buildSegmentInfos( + remoteSegmentMetadata.getSegmentInfosBytes(), + remoteSegmentMetadata.getGeneration() + ); + long processedLocalCheckpoint = Long.parseLong(infosSnapshot.getUserData().get(LOCAL_CHECKPOINT_KEY)); + // delete any other commits, we want to start the engine only from a new commit made with the downloaded infos bytes. + // Extra segments will be wiped on engine open. + for (String file : List.of(store.directory().listAll())) { + if (file.startsWith(IndexFileNames.SEGMENTS)) { + store.deleteQuiet(file); + } + } + assert Arrays.stream(store.directory().listAll()).filter(f -> f.startsWith(IndexFileNames.SEGMENTS)).findAny().isEmpty() + || indexSettings.isWarmIndex() : "There should not be any segments file in the dir"; + store.commitSegmentInfos(infosSnapshot, processedLocalCheckpoint, processedLocalCheckpoint); + } else if (segmentsNFile != null) { + try ( + ChecksumIndexInput indexInput = new BufferedChecksumIndexInput( + storeDirectory.openInput(segmentsNFile, IOContext.READONCE) + ) + ) { + long commitGeneration = SegmentInfos.generationFromSegmentsFileName(segmentsNFile); + SegmentInfos infosSnapshot = SegmentInfos.readCommit(store.directory(), indexInput, commitGeneration); + long processedLocalCheckpoint = Long.parseLong(infosSnapshot.getUserData().get(LOCAL_CHECKPOINT_KEY)); + if (remoteStore != null) { + store.commitSegmentInfos(infosSnapshot, processedLocalCheckpoint, processedLocalCheckpoint); + } else { + store.directory().sync(infosSnapshot.files(true)); + store.directory().syncMetaData(); + } + } + } + } catch (IOException e) { + throw new IndexShardRecoveryException(shardId, "Exception while copying segment files from remote segment store", e); + } finally { + store.decRef(); + if (remoteStore != null) { + remoteStore.decRef(); + } + } } - // ToDo: Needs to be updated while Replication flow implementation + /** + * Unified method to copy segment files from remote store. + * Handles both optimized (multiformat) and non-optimized (plain Lucene) indices. + * For optimized indices, keys in uploadedSegments are serialized FileMetadata strings like "segment_1.si:::lucene". + * For non-optimized indices, keys are plain filenames like "segment_1.si". 
+ */ private String copySegmentFiles( - CompositeStoreDirectory storeDirectory, + Directory storeDirectory, RemoteSegmentStoreDirectory sourceRemoteDirectory, RemoteSegmentStoreDirectory targetRemoteDirectory, - Map uploadedSegments, + Map uploadedSegments, boolean overrideLocal, final Runnable onFileSync - ) throws IOException { + ) throws IOException { Set toDownloadSegments = new HashSet<>(); Set skippedSegments = new HashSet<>(); String segmentNFile = null; try { if (overrideLocal) { - for (FileMetadata file : storeDirectory.listFileMetadata()) { + for (String file : storeDirectory.listAll()) { storeDirectory.deleteFile(file); } } - for (FileMetadata file : uploadedSegments.keySet()) { - long checksum = Long.parseLong(uploadedSegments.get(file).getChecksum()); - if (overrideLocal || localDirectoryContains(storeDirectory, file, checksum) == false) { - toDownloadSegments.add(file.file()); - } else { - skippedSegments.add(file.file()); - } + for (String file : uploadedSegments.keySet()) { + long checksum = Long.parseLong(uploadedSegments.get(file).getChecksum()); + boolean fileExistsLocally; + + // For optimized indices with multiformat support (e.g., Parquet files), + // use format-aware checksum validation since Parquet files don't have Lucene codec footers + FileMetadata fileMetadata = new FileMetadata(file); + if (isOptimizedIndex() && storeDirectory instanceof CompositeStoreDirectory) { + fileExistsLocally = localDirectoryContains((CompositeStoreDirectory) storeDirectory, fileMetadata, checksum); + } else if (storeDirectory instanceof StoreRecovery.StatsDirectoryWrapper + && ((StoreRecovery.StatsDirectoryWrapper) storeDirectory).getDelegate() instanceof CompositeStoreDirectory) { + // Handle case where storeDirectory is wrapped in StatsDirectoryWrapper + fileExistsLocally = localDirectoryContains( + (CompositeStoreDirectory) ((StoreRecovery.StatsDirectoryWrapper) storeDirectory).getDelegate(), + fileMetadata, checksum); + } else { + fileExistsLocally = localDirectoryContainsFile(storeDirectory, fileMetadata.file(), checksum); + } - if (file.file().startsWith(IndexFileNames.SEGMENTS)) { + if (overrideLocal || !fileExistsLocally) { + toDownloadSegments.add(file); + } else { + skippedSegments.add(file); + } + + if (file.startsWith(IndexFileNames.SEGMENTS)) { assert segmentNFile == null : "There should be only one SegmentInfosSnapshot file"; - segmentNFile = file.file(); + segmentNFile = file; } } if (toDownloadSegments.isEmpty() == false) { try { - // ToDo: @Kamal, Implement while restore flow implementation. 
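The optimized/non-optimized split described above ultimately decides which name a FileMetadata contributes as a directory key: the serialized, format-qualified form for optimized (composite-store) indices versus the plain Lucene filename otherwise, with non-optimized entries tagged as the "lucene" format. A tiny illustration with a simplified stand-in type; the serialize() encoding shown is invented for the demo and is not the real FileMetadata wire format:

public final class DirectoryKeySketch {
    // Simplified stand-in for the real FileMetadata; serialize() here just joins format and
    // file name with "::" purely for illustration.
    record FileMetadata(String dataFormat, String file) {
        String serialize() { return dataFormat + "::" + file; }
    }

    // Mirrors RemoteStoreRefreshListener#fromFileMetadata and the uploader's copyFrom key choice.
    static String directoryKey(FileMetadata fm, boolean isOptimizedIndex) {
        return isOptimizedIndex ? fm.serialize() : fm.file();
    }

    public static void main(String[] args) {
        FileMetadata luceneFile = new FileMetadata("lucene", "_0.cfe");
        System.out.println(directoryKey(luceneFile, true));  // lucene::_0.cfe (illustrative encoding only)
        System.out.println(directoryKey(luceneFile, false)); // _0.cfe
    }
}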
- // fileDownloader.download(sourceRemoteDirectory, storeDirectory, targetRemoteDirectory, toDownloadSegments, onFileSync); + fileDownloader.download(sourceRemoteDirectory, storeDirectory, targetRemoteDirectory, toDownloadSegments, onFileSync); } catch (Exception e) { throw new IOException("Error occurred when downloading segments from remote store", e); } @@ -5696,40 +5960,72 @@ private String copySegmentFiles( return segmentNFile; } - // ToDo: @Kamal boolean localDirectoryContains(CompositeStoreDirectory localDirectory, FileMetadata fileMetadata, long checksum) throws IOException { - throw new UnsupportedOperationException("Not implemented yet"); - } + try { + // Use existing CompositeStoreDirectory checksum calculation (format-aware) + long localChecksum = localDirectory.calculateChecksum(fileMetadata); - // ToDo: @Kamal - @Deprecated - boolean localDirectoryContains(Directory localDirectory, FileMetadata fileMetadata, long checksum) throws IOException { - try (IndexInput indexInput = localDirectory.openInput(fileMetadata.file(), IOContext.READONCE)) { - if (checksum == CodecUtil.retrieveChecksum(indexInput)) { + if (checksum == localChecksum) { return true; } else { - logger.warn("Checksum mismatch between local and remote segment file: {}, will override local file", fileMetadata); + logger.warn("Checksum mismatch for file: {}, format: {}, expected: {}, local: {}, will override", + fileMetadata.file(), fileMetadata.dataFormat(), checksum, localChecksum); // If there is a checksum mismatch and we are not serving reads it is safe to go ahead and delete the file now. // Outside of engine resets this method will be invoked during recovery so this is safe. if (isReadAllowed() == false) { - localDirectory.deleteFile(fileMetadata.file()); + localDirectory.deleteFile(fileMetadata); } else { // segment conflict with remote store while the shard is serving reads. 
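Both recovery paths and copySegmentFiles reduce to the same per-file decision: reuse a local copy whose checksum matches the remote entry, otherwise mark the file for download. A compact, self-contained sketch of that decision under assumed types (the checksum lookup is a placeholder, not the real Store/Directory API):

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;

public final class DownloadDecisionSketch {
    // localChecksum yields the local file's checksum, or empty if the file does not exist locally.
    static List<String> filesToDownload(Map<String, Long> remoteChecksums,
                                        Function<String, Optional<Long>> localChecksum,
                                        boolean overrideLocal) {
        List<String> toDownload = new ArrayList<>();
        for (Map.Entry<String, Long> entry : remoteChecksums.entrySet()) {
            boolean existsLocally = localChecksum.apply(entry.getKey())
                .map(c -> c.equals(entry.getValue()))
                .orElse(false);
            if (overrideLocal || !existsLocally) {
                toDownload.add(entry.getKey()); // would be handed to fileDownloader.download(...)
            }
        }
        return toDownload;
    }

    public static void main(String[] args) {
        Map<String, Long> remote = Map.of("_0.si", 42L, "_1.si", 7L);
        System.out.println(filesToDownload(remote, f -> "_0.si".equals(f) ? Optional.of(42L) : Optional.empty(), false)); // [_1.si]
    }
}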
failShard("Local copy of segment " + fileMetadata.file() + " has a different checksum than the version in remote store", null); } } } catch (NoSuchFileException | FileNotFoundException e) { - logger.debug("File {} does not exist in local FS, downloading from remote store", fileMetadata.file()); + logger.debug("File {} with format {} does not exist in local FS, downloading from remote store", + fileMetadata.file(), fileMetadata.dataFormat()); } catch (IOException e) { - logger.warn("Exception while reading checksum of file: {}, this can happen if file is corrupted", fileMetadata.file()); - // For any other exception on reading checksum, we delete the file to re-download again - localDirectory.deleteFile(fileMetadata.file()); + // Check if root cause is "file not found" - MultiFormatStoreException wraps the original exception + Throwable cause = e.getCause(); + if (cause instanceof NoSuchFileException || cause instanceof FileNotFoundException) { + logger.debug("File {} with format {} does not exist in local FS (wrapped exception), downloading from remote store", + fileMetadata.file(), fileMetadata.dataFormat()); + } else { + logger.warn("Exception while reading checksum of file: {}, format: {}, this can happen if file is corrupted", + fileMetadata.file(), fileMetadata.dataFormat(), e); + // For any other exception on reading checksum, we delete the file to re-download again + try { + localDirectory.deleteFile(fileMetadata); + } catch (NoSuchFileException | FileNotFoundException ignored) { + // File already doesn't exist, nothing to delete + } + } } return false; } + boolean localDirectoryContainsFile(Directory localDirectory, String fileName, long checksum) throws IOException { + try (IndexInput indexInput = localDirectory.openInput(fileName, IOContext.READONCE)) { + if (checksum == CodecUtil.retrieveChecksum(indexInput)) { + return true; + } else { + logger.warn("Checksum mismatch between local and remote segment file: {}, will override local file", fileName); + if (isReadAllowed() == false) { + localDirectory.deleteFile(fileName); + } else { + failShard("Local copy of segment " + fileName + " has a different checksum than the version in remote store", null); + } + } + } catch (NoSuchFileException | FileNotFoundException e) { + logger.debug("File {} does not exist in local FS, downloading from remote store", fileName); + } catch (IOException e) { + logger.warn("Exception while reading checksum of file: {}, this can happen if file is corrupted", fileName, e); + localDirectory.deleteFile(fileName); + } + return false; + } + + /** * Returns the maximum sequence number of either update or delete operations have been processed in this shard @@ -5740,7 +6036,7 @@ boolean localDirectoryContains(Directory localDirectory, FileMetadata fileMetada * executing that replication request on a replica. 
*/ public long getMaxSeqNoOfUpdatesOrDeletes() { - return getEngine().getMaxSeqNoOfUpdatesOrDeletes(); + return getIndexer().getMaxSeqNoOfUpdatesOrDeletes(); } /** @@ -5788,7 +6084,7 @@ public GatedCloseable getSegmentInfosSnapshot() { public CompositeEngine.ReleasableRef getCatalogSnapshotFromEngine() { try { - return getIndexingExecutionCoordinator().acquireSnapshot(); + return getIndexer().acquireSnapshot(); } catch (Exception e) { throw new OpenSearchException("Error occurred while getting catalog snapshot", e); } diff --git a/server/src/main/java/org/opensearch/index/shard/ReleasableRetryableRefreshListener.java b/server/src/main/java/org/opensearch/index/shard/ReleasableRetryableRefreshListener.java index 1628e2cfc567c..ad95c7b792259 100644 --- a/server/src/main/java/org/opensearch/index/shard/ReleasableRetryableRefreshListener.java +++ b/server/src/main/java/org/opensearch/index/shard/ReleasableRetryableRefreshListener.java @@ -73,7 +73,6 @@ public final void afterRefresh(boolean didRefresh) throws IOException { if (closed.get()) { return; } - runAfterRefreshExactlyOnce(didRefresh); runAfterRefreshWithPermit(didRefresh, () -> {}); } diff --git a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java index f9f9c83a16839..dddbd059eb712 100644 --- a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java +++ b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java @@ -9,7 +9,11 @@ package org.opensearch.index.shard; import org.apache.logging.log4j.Logger; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.store.Directory; import org.apache.lucene.store.FilterDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; import org.opensearch.action.LatchedActionListener; import org.opensearch.action.bulk.BackoffPolicy; import org.opensearch.cluster.routing.RecoverySource; @@ -19,6 +23,7 @@ import org.opensearch.core.action.ActionListener; import org.opensearch.index.engine.InternalEngine; import org.opensearch.index.engine.exec.FileMetadata; +import org.opensearch.index.engine.exec.bridge.Indexer; import org.opensearch.index.engine.exec.coord.CatalogSnapshot; import org.opensearch.index.engine.exec.coord.CompositeEngine; import org.opensearch.index.remote.RemoteSegmentTransferTracker; @@ -79,7 +84,7 @@ public final class RemoteStoreRefreshListener extends ReleasableRetryableRefresh public static final Set EXCLUDE_FILES = Set.of("write.lock"); private final IndexShard indexShard; - private final CompositeStoreDirectory compositeStoreDirectory; + private final Directory storeDirectory; private final RemoteSegmentStoreDirectory remoteDirectory; private final RemoteSegmentTransferTracker segmentTracker; private final Map localSegmentChecksumMap; @@ -98,10 +103,10 @@ public RemoteStoreRefreshListener( super(indexShard.getThreadPool()); logger = Loggers.getLogger(getClass(), indexShard.shardId()); this.indexShard = indexShard; - this.compositeStoreDirectory = indexShard.store().compositeStoreDirectory(); + this.storeDirectory = indexShard.isOptimizedIndex() ? 
indexShard.store().compositeStoreDirectory() : indexShard.store().directory(); this.remoteDirectory = (RemoteSegmentStoreDirectory) ((FilterDirectory) ((FilterDirectory) indexShard.remoteStore().directory()) .getDelegate()).getDelegate(); - remoteStoreUploader = new RemoteStoreUploaderService(indexShard, compositeStoreDirectory, remoteDirectory); + remoteStoreUploader = new RemoteStoreUploaderService(indexShard, storeDirectory, this.remoteDirectory, indexShard.isOptimizedIndex()); localSegmentChecksumMap = new HashMap<>(); RemoteSegmentMetadata remoteSegmentMetadata = null; if (indexShard.routingEntry().primary()) { @@ -240,6 +245,7 @@ private boolean syncSegments() { CompositeEngine.ReleasableRef catalogSnapshotRef = indexShard.getCatalogSnapshotFromEngine(); CatalogSnapshot catalogSnapshot = catalogSnapshotRef.getRef(); + final ReplicationCheckpoint checkpoint = indexShard.computeReplicationCheckpoint(catalogSnapshot); if (checkpoint.getPrimaryTerm() != indexShard.getOperationPrimaryTerm()) { throw new IllegalStateException( @@ -260,13 +266,10 @@ private boolean syncSegments() { // Log format-aware statistics Map formatCounts = localFilesPostRefresh.stream() .collect(Collectors.groupingBy( - fm -> fm.dataFormat(), + FileMetadata::dataFormat, Collectors.counting() )); - logger.debug("Format-aware segment upload initiated: totalFiles={}, formatBreakdown={}", - localFilesPostRefresh.size(), formatCounts); - Map fileMetadataToSizeMap = updateLocalSizeMapAndTracker(localFilesPostRefresh); CountDownLatch latch = new CountDownLatch(1); @@ -275,10 +278,8 @@ private boolean syncSegments() { @Override public void onResponse(Void unused) { try { - logger.debug("New segments upload successful"); // Start metadata file upload uploadMetadata(localFilesPostRefresh, catalogSnapshot, checkpoint); - logger.debug("Metadata upload successful"); clearStaleFilesFromLocalSegmentChecksumMap(localFilesPostRefresh); onSuccessfulSegmentsSync( refreshTimeMs, @@ -424,8 +425,10 @@ private void onSuccessfulSegmentsSync( updateRemoteRefreshTimeAndSeqNo(refreshTimeMs, refreshClockTimeMs, refreshSeqNo); // Reset the backoffDelayIterator for the future failures resetBackOffDelayIterator(); - // Set the minimum sequence number for keeping translog - indexShard.getIndexer().translogManager().setMinSeqNoToKeep(lastRefreshedCheckpoint + 1); + Indexer indexer = indexShard.getIndexer(); + if (indexer != null) { + indexer.translogManager().setMinSeqNoToKeep(lastRefreshedCheckpoint + 1); + } // Publishing the new checkpoint which is used for remote store + segrep indexes checkpointPublisher.publish(indexShard, checkpoint); logger.debug("onSuccessfulSegmentsSync lastRefreshedCheckpoint={} checkpoint={}", lastRefreshedCheckpoint, checkpoint); @@ -469,22 +472,19 @@ private boolean isRefreshAfterCommitSafe() { return false; } - // ToDo:@Kamal Update MaxSeqNo void uploadMetadata(Collection localFilesPostRefresh, CatalogSnapshot catalogSnapshot, ReplicationCheckpoint replicationCheckpoint) throws IOException { final long maxSeqNo = indexShard.getIndexer().currentOngoingRefreshCheckpoint(); + final Map segmentUserData = indexShard.store().readLastCommittedSegmentsInfo().getUserData(); + CatalogSnapshot catalogSnapshotCloned = catalogSnapshot.cloneNoAcquire(); // Create mutable copy and update checkpoint fields while preserving ALL existing metadata - catalogSnapshotCloned.getUserData().put(LOCAL_CHECKPOINT_KEY, String.valueOf(maxSeqNo)); - catalogSnapshotCloned.getUserData().put(SequenceNumbers.MAX_SEQ_NO, Long.toString(maxSeqNo)); - - 
// Log for verification during debugging - logger.debug("Uploading metadata with userData: translog_uuid={}, history_uuid={}, all_keys={}", - catalogSnapshotCloned.getUserData().get(Translog.TRANSLOG_UUID_KEY), - catalogSnapshotCloned.getUserData().get(org.opensearch.index.engine.Engine.HISTORY_UUID_KEY), - catalogSnapshotCloned.getUserData().keySet()); + final Map userData = new HashMap<>(segmentUserData); + userData.put(LOCAL_CHECKPOINT_KEY, String.valueOf(maxSeqNo)); + userData.put(SequenceNumbers.MAX_SEQ_NO, Long.toString(maxSeqNo)); + catalogSnapshotCloned.setUserData(userData, false); Translog.TranslogGeneration translogGeneration = indexShard.getIndexer().translogManager().getTranslogGeneration(); if (translogGeneration == null) { @@ -492,9 +492,9 @@ void uploadMetadata(Collection localFilesPostRefresh, CatalogSnaps } else { long translogFileGeneration = translogGeneration.translogFileGeneration; remoteDirectory.uploadMetadata( - localFilesPostRefresh.stream().map(FileMetadata::serialize).collect(Collectors.toList()), + localFilesPostRefresh.stream().map(this::fromFileMetadata).collect(Collectors.toList()), catalogSnapshotCloned, - compositeStoreDirectory, + storeDirectory, translogFileGeneration, replicationCheckpoint, indexShard.getNodeId() @@ -502,6 +502,10 @@ void uploadMetadata(Collection localFilesPostRefresh, CatalogSnaps } } + private String fromFileMetadata(FileMetadata fileMetadata) { + return indexShard.isOptimizedIndex() ? fileMetadata.serialize() : fileMetadata.file(); + } + boolean isLowPriorityUpload() { return isLocalOrSnapshotRecoveryOrSeeding(); } @@ -526,9 +530,12 @@ private boolean skipUpload(FileMetadata fileMetadata) { } private String getChecksumOfLocalFile(FileMetadata fileMetadata) throws IOException { + if (fileMetadata.dataFormat().equals("lucene")) { + return getChecksumOfLocalFile(fileMetadata.file()); + } if (!localSegmentChecksumMap.containsKey(fileMetadata.file())) { try{ - String checksum = Long.toString(compositeStoreDirectory.calculateChecksum(fileMetadata)); + String checksum = Long.toString(((CompositeStoreDirectory) storeDirectory).calculateChecksum(fileMetadata)); localSegmentChecksumMap.put(fileMetadata.file(), checksum); logger.debug("Calculated checksum for file: {}, format: {}, checksum: {}", fileMetadata.file(), fileMetadata.dataFormat(), checksum); @@ -542,6 +549,16 @@ private String getChecksumOfLocalFile(FileMetadata fileMetadata) throws IOExcept return localSegmentChecksumMap.get(fileMetadata.file()); } + private String getChecksumOfLocalFile(String file) throws IOException { + if (!localSegmentChecksumMap.containsKey(file)) { + try (IndexInput indexInput = storeDirectory.openInput(file, IOContext.READONCE)) { + String checksum = Long.toString(CodecUtil.retrieveChecksum(indexInput)); + localSegmentChecksumMap.put(file, checksum); + } + } + return localSegmentChecksumMap.get(file); + } + /** * Updates the last refresh time and refresh seq no which is seen by remote store. */ @@ -561,10 +578,10 @@ private void updateRemoteRefreshTimeAndSeqNo(long refreshTimeMs, long refreshClo */ private Map updateLocalSizeMapAndTracker(Collection localFilesPostRefresh) { Map fileSizeMap = new HashMap<>(); - for (FileMetadata fileMetadata : localFilesPostRefresh) { try { - long fileSize = compositeStoreDirectory.fileLength(fileMetadata); + String stringForFileLength = fileMetadata.dataFormat().equals("lucene") ? 
fileMetadata.file() : fileMetadata.serialize(); + long fileSize = storeDirectory.fileLength(stringForFileLength); fileSizeMap.put(fileMetadata, fileSize); } catch (IOException e) { logger.warn("Failed to get file length for file: {}, format: {}", diff --git a/server/src/main/java/org/opensearch/index/shard/RemoteStoreUploaderService.java b/server/src/main/java/org/opensearch/index/shard/RemoteStoreUploaderService.java index 6af789d89aa43..1faa4600047c3 100644 --- a/server/src/main/java/org/opensearch/index/shard/RemoteStoreUploaderService.java +++ b/server/src/main/java/org/opensearch/index/shard/RemoteStoreUploaderService.java @@ -12,7 +12,6 @@ import org.apache.logging.log4j.message.ParameterizedMessage; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FilterDirectory; import org.apache.lucene.store.IOContext; import org.opensearch.action.support.GroupedActionListener; import org.opensearch.common.logging.Loggers; @@ -20,7 +19,6 @@ import org.opensearch.core.action.ActionListener; import org.opensearch.index.engine.exec.FileMetadata; import org.opensearch.index.store.SegmentUploadFailedException; -import org.opensearch.index.store.CompositeStoreDirectory; import org.opensearch.index.store.RemoteSegmentStoreDirectory; import java.util.Collection; @@ -37,22 +35,16 @@ public class RemoteStoreUploaderService implements RemoteStoreUploader { private final Logger logger; private final IndexShard indexShard; - private final CompositeStoreDirectory storeDirectory; + private final Directory storeDirectory; private final RemoteSegmentStoreDirectory remoteDirectory; + private final boolean isOptimizedIndex; - // Todo: Remove - public RemoteStoreUploaderService(IndexShard indexShard, Directory storeDirectory, RemoteSegmentStoreDirectory remoteDirectory) { - logger = Loggers.getLogger(getClass(), indexShard.shardId()); - this.indexShard = indexShard; - this.storeDirectory = null; - this.remoteDirectory = remoteDirectory; - } - - public RemoteStoreUploaderService(IndexShard indexShard, CompositeStoreDirectory storeDirectory, RemoteSegmentStoreDirectory remoteDirectory) { + public RemoteStoreUploaderService(IndexShard indexShard, Directory storeDirectory, RemoteSegmentStoreDirectory remoteDirectory, boolean isOptimizedIndex) { logger = Loggers.getLogger(getClass(), indexShard.shardId()); this.indexShard = indexShard; this.storeDirectory = storeDirectory; this.remoteDirectory = remoteDirectory; + this.isOptimizedIndex = isOptimizedIndex; } @Override @@ -87,8 +79,6 @@ public void uploadSegments( ActionListener> mappedListener = ActionListener.map(listener, resp -> null); GroupedActionListener batchUploadListener = new GroupedActionListener<>(mappedListener, fileMetadataCollection.size()); - CompositeStoreDirectory directory = storeDirectory; - for (FileMetadata fileMetadata : fileMetadataCollection) { String fileName = fileMetadata.file(); // Initializing listener here to ensure that the stats increment operations are thread-safe @@ -123,7 +113,13 @@ public void uploadSegments( batchUploadListener.onFailure(ex); }); statsListener.beforeUpload(fileMetadata); - remoteDirectory.copyFrom(storeDirectory, fileMetadata.serialize(), IOContext.DEFAULT, aggregatedListener, isLowPriorityUpload); + remoteDirectory.copyFrom( + storeDirectory, + isOptimizedIndex ? 
fileMetadata.serialize() : fileMetadata.file(), + IOContext.DEFAULT, + aggregatedListener, + isLowPriorityUpload + ); } } } diff --git a/server/src/main/java/org/opensearch/index/shard/ShardPath.java b/server/src/main/java/org/opensearch/index/shard/ShardPath.java index 911bfec94e190..46b0997cb56e6 100644 --- a/server/src/main/java/org/opensearch/index/shard/ShardPath.java +++ b/server/src/main/java/org/opensearch/index/shard/ShardPath.java @@ -61,6 +61,7 @@ public final class ShardPath { public static final String INDEX_FOLDER_NAME = "index"; public static final String TRANSLOG_FOLDER_NAME = "translog"; + public static final String METADATA_FOLDER_NAME = "metadata"; private final Path path; private final ShardId shardId; diff --git a/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java b/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java index 8f99efe502858..8fc64b38dc860 100644 --- a/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java +++ b/server/src/main/java/org/opensearch/index/shard/StoreRecovery.java @@ -422,7 +422,9 @@ void recoverFromSnapshotAndRemoteStore( remoteStoreRepository, indexUUID, shardId, - shallowCopyShardMetadata.getRemoteStorePathStrategy() + shallowCopyShardMetadata.getRemoteStorePathStrategy(), + null, + RemoteStoreUtils.isServerSideEncryptionEnabledIndex(indexShard.indexSettings.getIndexMetadata()) ); RemoteSegmentMetadata remoteSegmentMetadata = sourceRemoteDirectory.initializeToSpecificCommit( primaryTerm, @@ -503,7 +505,9 @@ void recoverShallowSnapshotV2( remoteSegmentStoreRepository, prevIndexMetadata.getIndexUUID(), shardId, - remoteStorePathStrategy + remoteStorePathStrategy, + null, + RemoteStoreUtils.isServerSideEncryptionEnabledIndex(prevIndexMetadata) ); RemoteSegmentMetadata remoteSegmentMetadata = sourceRemoteDirectory.initializeToSpecificTimestamp( recoverySource.pinnedTimestamp() @@ -523,7 +527,8 @@ void recoverShallowSnapshotV2( new ShardId(prevIndexMetadata.getIndex(), shardId.id()), remoteStorePathStrategy, RemoteStoreUtils.determineTranslogMetadataEnabled(prevIndexMetadata), - recoverySource.pinnedTimestamp() + recoverySource.pinnedTimestamp(), + RemoteStoreUtils.isServerSideEncryptionEnabledIndex(indexShard.indexSettings.getIndexMetadata()) ); assert indexShard.shardRouting.primary() : "only primary shards can recover from store"; diff --git a/server/src/main/java/org/opensearch/index/store/ByteSizeCachingDirectory.java b/server/src/main/java/org/opensearch/index/store/ByteSizeCachingDirectory.java index 343f2858fdc74..e50746f2c103a 100644 --- a/server/src/main/java/org/opensearch/index/store/ByteSizeCachingDirectory.java +++ b/server/src/main/java/org/opensearch/index/store/ByteSizeCachingDirectory.java @@ -52,7 +52,7 @@ * * @opensearch.internal */ -final class ByteSizeCachingDirectory extends FilterDirectory { +public final class ByteSizeCachingDirectory extends FilterDirectory { /** * Internal caching size and modulo count diff --git a/server/src/main/java/org/opensearch/index/store/CompositeRemoteSegmentStoreDirectory.java b/server/src/main/java/org/opensearch/index/store/CompositeRemoteSegmentStoreDirectory.java index dcce16213f97e..04523f1ab1dcc 100644 --- a/server/src/main/java/org/opensearch/index/store/CompositeRemoteSegmentStoreDirectory.java +++ b/server/src/main/java/org/opensearch/index/store/CompositeRemoteSegmentStoreDirectory.java @@ -15,10 +15,8 @@ import org.apache.lucene.index.SegmentCommitInfo; import org.apache.lucene.index.SegmentInfo; import 
org.apache.lucene.index.SegmentInfos; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.*; +import org.apache.lucene.util.Version; import org.opensearch.common.Nullable; import org.opensearch.common.UUIDs; import org.opensearch.common.annotation.InternalApi; @@ -29,6 +27,7 @@ import org.opensearch.core.index.shard.ShardId; import org.opensearch.index.engine.exec.FileMetadata; import org.opensearch.index.engine.exec.coord.CatalogSnapshot; +import org.opensearch.index.engine.exec.coord.CompositeEngineCatalogSnapshot; import org.opensearch.index.remote.RemoteStoreUtils; import org.opensearch.index.store.lockmanager.FileLockInfo; import org.opensearch.index.store.lockmanager.RemoteStoreLockManager; @@ -44,14 +43,8 @@ import java.io.IOException; import java.io.InputStream; import java.nio.file.NoSuchFileException; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; +import java.nio.file.Path; +import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; @@ -66,12 +59,6 @@ @PublicApi(since = "2.3.0") public final class CompositeRemoteSegmentStoreDirectory extends RemoteSegmentStoreDirectory { - /** - * Each segment file is uploaded with unique suffix. - * For example, _0.cfe in local filesystem will be uploaded to remote segment store as _0.cfe__gX7bNIIBrs0AUNsR2yEG - */ - public static final String SEGMENT_NAME_UUID_SEPARATOR = "__"; - /** * compositeRemoteDirectory is used to store segment files with format-specific routing * Always present - never null @@ -157,7 +144,7 @@ public CompositeRemoteSegmentStoreDirectory( ShardId shardId, @Nullable Map pendingDownloadMergedSegments ) throws IOException { - super(null, remoteMetadataDirectory, mdLockManager, threadPool, shardId); + super(compositeRemoteDirectory, remoteMetadataDirectory, mdLockManager, threadPool, shardId); this.compositeRemoteDirectory = compositeRemoteDirectory; this.remoteMetadataDirectory = remoteMetadataDirectory; this.mdLockManager = mdLockManager; @@ -188,27 +175,6 @@ public RemoteSegmentMetadata init() throws IOException { return remoteSegmentMetadata; } - /** - * Read the latest metadata file to get the list of segments uploaded to the remote segment store. - * Delegates to CompositeRemoteDirectory when available for better format-aware metadata handling. - */ - public RemoteSegmentMetadata readLatestMetadataFile() throws IOException { - if (compositeRemoteDirectory != null) { - logger.debug("Reading latest metadata file from CompositeRemoteDirectory for better format-aware handling"); - return compositeRemoteDirectory.readLatestMetadataFile(); - } else { - logger.info("No CompositeRemoteDirectory found"); - return null; - } - } - - private RemoteSegmentMetadata readMetadataFile(String metadataFilename) throws IOException { - try (InputStream inputStream = remoteMetadataDirectory.getBlobStream(metadataFilename)) { - byte[] metadataBytes = inputStream.readAllBytes(); - return metadataStreamWrapper.readStream(new ByteArrayIndexInput(metadataFilename, metadataBytes)); - } - } - /** * Initializes the cache to a specific commit which keeps track of all the segment files uploaded to the remote segment store. 
*/ @@ -475,8 +441,7 @@ public void copyFrom(Directory from, String src, String dest, IOContext context) } public boolean containsFile(String localFilename, String checksum) { - return segmentsUploadedToRemoteStore.containsKey(localFilename) - && segmentsUploadedToRemoteStore.get(localFilename).getChecksum().equals(checksum); + return containsFile(new FileMetadata(localFilename), checksum); } public boolean containsFile(FileMetadata fileMetadata, String checksum) { @@ -493,8 +458,14 @@ public String getExistingRemoteFilename(FileMetadata localFileMetadata) { return null; } - private String getNewRemoteSegmentFilename(String localFilename) { - return localFilename + SEGMENT_NAME_UUID_SEPARATOR + UUIDs.base64UUID(); + @Override + protected String getNewRemoteSegmentFilename(String localFilename) { + String[] fileNameAndExtension = extractFileExtension(localFilename); + return fileNameAndExtension[0] + SEGMENT_NAME_UUID_SEPARATOR + UUIDs.base64UUID() + "." + fileNameAndExtension[1]; + } + + private static String[] extractFileExtension(String localFilename) { + return localFilename.split("\\."); } public Map getSegmentsUploadedToRemoteStore() { @@ -533,7 +504,8 @@ private void uploadMetadataInternal(Collection fileMetadataCollect translogGeneration, metadataUploadCounter.incrementAndGet(), RemoteSegmentMetadata.CURRENT_VERSION, nodeId); - FileMetadata fileMetadata = new FileMetadata("TempMetadata", metadataFilename); + // Use "metadata" format instead of "TempMetadata" - temp metadata files use the same directory as metadata files + FileMetadata fileMetadata = new FileMetadata("metadata", metadataFilename); try { try (IndexOutput indexOutput = storeDirectory.createOutput(fileMetadata, IOContext.DEFAULT)) { @@ -557,53 +529,39 @@ private void uploadMetadataInternal(Collection fileMetadataCollect } } - // Serialize CatalogSnapshot using StreamOutput - byte[] catalogSnapshotByteArray; - try (org.opensearch.common.io.stream.BytesStreamOutput streamOutput = - new org.opensearch.common.io.stream.BytesStreamOutput()) { - catalogSnapshot.writeTo(streamOutput); - catalogSnapshotByteArray = streamOutput.bytes().toBytesRef().bytes; - } + SegmentInfos segmentInfosSnapshot = new SegmentInfos(Version.LATEST.major); + Map userData = catalogSnapshot.getUserData(); + userData.put(CompositeEngineCatalogSnapshot.CATALOG_SNAPSHOT_KEY, catalogSnapshot.serializeToString()); + segmentInfosSnapshot.setUserData(userData, false); + segmentInfosSnapshot.setNextWriteGeneration(replicationCheckpoint.getSegmentsGen()); + ByteBuffersDataOutput byteBuffersIndexOutput = new ByteBuffersDataOutput(); + segmentInfosSnapshot.write( + new ByteBuffersIndexOutput(byteBuffersIndexOutput, "Snapshot of SegmentInfos", "SegmentInfos") + ); + byte[] segmentInfoSnapshotByteArray = byteBuffersIndexOutput.toArrayCopy(); metadataStreamWrapper.writeStream(indexOutput, new RemoteSegmentMetadata( RemoteSegmentMetadata.fromMapOfStringsV2(uploadedSegments), - catalogSnapshotByteArray, replicationCheckpoint)); + segmentInfoSnapshotByteArray, replicationCheckpoint)); } storeDirectory.sync(Collections.singleton(fileMetadata.serialize())); - compositeRemoteDirectory.copyFrom(storeDirectory, fileMetadata, metadataFilename, IOContext.DEFAULT); + remoteMetadataDirectory.copyFrom(storeDirectory, metadataFilename, metadataFilename, IOContext.DEFAULT); } finally { tryAndDeleteLocalFile(fileMetadata, storeDirectory); } } } - public void deleteStaleSegments(int lastNMetadataFilesToKeep) throws IOException { - if (lastNMetadataFilesToKeep == -1) { - 
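With the getNewRemoteSegmentFilename override above, the random suffix now sits before the file extension instead of being appended to the whole name (the removed constant's javadoc showed the old form, e.g. _0.cfe__gX7bNIIBrs0AUNsR2yEG). A small sketch of the new shape, assuming, as the split("\\.") call does, that the local name contains exactly one dot:

import java.util.UUID;

public final class RemoteSegmentNameSketch {
    static final String SEGMENT_NAME_UUID_SEPARATOR = "__";

    // Mirrors the overridden getNewRemoteSegmentFilename: <name>__<suffix>.<extension>.
    // A name without an extension (e.g. "segments_5") would not fit this split and would
    // need separate handling.
    static String newRemoteSegmentFilename(String localFilename) {
        String[] parts = localFilename.split("\\.");
        String suffix = UUID.randomUUID().toString(); // stands in for UUIDs.base64UUID()
        return parts[0] + SEGMENT_NAME_UUID_SEPARATOR + suffix + "." + parts[1];
    }

    public static void main(String[] args) {
        System.out.println(newRemoteSegmentFilename("_0.cfe")); // e.g. _0__<random-suffix>.cfe
    }
}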
logger.info("Stale segment deletion is disabled if cluster.remote_store.index.segment_metadata.retention.max_count is set to -1"); - return; - } - - List sortedMetadataFileList = remoteMetadataDirectory.listFilesByPrefixInLexicographicOrder( - MetadataFilenameUtils.METADATA_PREFIX, Integer.MAX_VALUE); - - if (sortedMetadataFileList.size() <= lastNMetadataFilesToKeep) { - logger.debug("Number of commits in remote segment store={}, lastNMetadataFilesToKeep={}", - sortedMetadataFileList.size(), lastNMetadataFilesToKeep); - return; - } - - // Implementation continues... (keeping existing logic but using compositeRemoteDirectory directly) - Set deletedSegmentFiles = new HashSet<>(); - // ... stale segment deletion logic using compositeRemoteDirectory.deleteFile() directly - - logger.debug("deletedSegmentFiles={}", deletedSegmentFiles); - } - public void deleteStaleSegmentsAsync(int lastNMetadataFilesToKeep) { deleteStaleSegmentsAsync(lastNMetadataFilesToKeep, ActionListener.wrap(r -> {}, e -> {})); } + @Override + protected void removeFileFromSegmentsUploadedToRemoteStore(String file) { + segmentsUploadedToRemoteStore.remove(new FileMetadata(file)); + } + public void deleteStaleSegmentsAsync(int lastNMetadataFilesToKeep, ActionListener listener) { if (canDeleteStaleCommits.compareAndSet(true, false)) { try { diff --git a/server/src/main/java/org/opensearch/index/store/CompositeRemoteSegmentStoreDirectoryFactory.java b/server/src/main/java/org/opensearch/index/store/CompositeRemoteSegmentStoreDirectoryFactory.java deleted file mode 100644 index 9afe574024042..0000000000000 --- a/server/src/main/java/org/opensearch/index/store/CompositeRemoteSegmentStoreDirectoryFactory.java +++ /dev/null @@ -1,161 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.index.store; - -import org.apache.logging.log4j.LogManager; -import org.apache.lucene.store.Directory; -import org.opensearch.common.blobstore.BlobPath; -import org.opensearch.core.index.shard.ShardId; -import org.opensearch.index.IndexSettings; -import org.opensearch.index.remote.RemoteStorePathStrategy; -import org.opensearch.index.shard.ShardPath; -import org.opensearch.index.store.lockmanager.RemoteStoreLockManager; -import org.opensearch.index.store.lockmanager.RemoteStoreLockManagerFactory; -import org.opensearch.index.store.remote.CompositeRemoteDirectory; -import org.opensearch.plugins.IndexStorePlugin; -import org.opensearch.plugins.PluginsService; -import org.opensearch.repositories.RepositoriesService; -import org.opensearch.repositories.Repository; -import org.opensearch.repositories.RepositoryMissingException; -import org.opensearch.repositories.blobstore.BlobStoreRepository; -import org.opensearch.threadpool.ThreadPool; - -import java.io.IOException; -import java.util.Map; -import java.util.Objects; -import java.util.concurrent.ConcurrentHashMap; -import java.util.function.Supplier; - -import static org.opensearch.index.remote.RemoteStoreEnums.DataCategory.SEGMENTS; -import static org.opensearch.index.remote.RemoteStoreEnums.DataType.DATA; -import static org.opensearch.index.remote.RemoteStoreEnums.DataType.METADATA; - -/** - * Factory for composite remote segment store directory. 
- * - * @opensearch.internal - */ -public class CompositeRemoteSegmentStoreDirectoryFactory implements IndexStorePlugin.DirectoryFactory { - private final Supplier repositoriesService; - private final String segmentsPathFixedPrefix; - private final ThreadPool threadPool; - private final PluginsService pluginsService; - - public CompositeRemoteSegmentStoreDirectoryFactory( - Supplier repositoriesService, - ThreadPool threadPool, - String segmentsPathFixedPrefix - ) { - this(repositoriesService, threadPool, segmentsPathFixedPrefix, null); - } - - public CompositeRemoteSegmentStoreDirectoryFactory( - Supplier repositoriesService, - ThreadPool threadPool, - String segmentsPathFixedPrefix, - PluginsService pluginsService - ) { - this.repositoriesService = repositoriesService; - this.segmentsPathFixedPrefix = segmentsPathFixedPrefix; - this.threadPool = threadPool; - this.pluginsService = pluginsService; - } - - @Override - public Directory newDirectory(IndexSettings indexSettings, ShardPath path) throws IOException { - String repositoryName = indexSettings.getRemoteStoreRepository(); - String indexUUID = indexSettings.getIndex().getUUID(); - return newDirectory(repositoryName, indexUUID, path.getShardId(), indexSettings.getRemoteStorePathStrategy()); - } - - public Directory newDirectory(String repositoryName, String indexUUID, ShardId shardId, RemoteStorePathStrategy pathStrategy) - throws IOException { - return newDirectory(repositoryName, indexUUID, shardId, pathStrategy, null); - } - - public Directory newDirectory( - String repositoryName, - String indexUUID, - ShardId shardId, - RemoteStorePathStrategy pathStrategy, - String indexFixedPrefix - ) throws IOException { - assert Objects.nonNull(pathStrategy); - try (Repository repository = repositoriesService.get().repository(repositoryName)) { - - assert repository instanceof BlobStoreRepository : "repository should be instance of BlobStoreRepository"; - BlobStoreRepository blobStoreRepository = ((BlobStoreRepository) repository); - BlobPath repositoryBasePath = blobStoreRepository.basePath(); - String shardIdStr = String.valueOf(shardId.id()); - Map pendingDownloadMergedSegments = new ConcurrentHashMap<>(); - - RemoteStorePathStrategy.ShardDataPathInput dataPathInput = RemoteStorePathStrategy.ShardDataPathInput.builder() - .basePath(repositoryBasePath) - .indexUUID(indexUUID) - .shardId(shardIdStr) - .dataCategory(SEGMENTS) - .dataType(DATA) - .fixedPrefix(segmentsPathFixedPrefix) - .indexFixedPrefix(indexFixedPrefix) - .build(); - - BlobPath dataPath = pathStrategy.generatePath(dataPathInput); - - CompositeRemoteDirectory compositeDataDirectory = new CompositeRemoteDirectory( - blobStoreRepository.blobStore(), - dataPath, - blobStoreRepository::maybeRateLimitRemoteUploadTransfers, - blobStoreRepository::maybeRateLimitLowPriorityRemoteUploadTransfers, - blobStoreRepository::maybeRateLimitRemoteDownloadTransfers, - blobStoreRepository::maybeRateLimitLowPriorityDownloadTransfers, - pendingDownloadMergedSegments, - LogManager.getLogger("index.store.remote.composite." 
+ shardId), - pluginsService - ); - - RemoteStorePathStrategy.ShardDataPathInput mdPathInput = RemoteStorePathStrategy.ShardDataPathInput.builder() - .basePath(repositoryBasePath) - .indexUUID(indexUUID) - .shardId(shardIdStr) - .dataCategory(SEGMENTS) - .dataType(METADATA) - .fixedPrefix(segmentsPathFixedPrefix) - .indexFixedPrefix(indexFixedPrefix) - .build(); - - BlobPath mdPath = pathStrategy.generatePath(mdPathInput); - RemoteDirectory metadataDirectory = new RemoteDirectory(blobStoreRepository.blobStore().blobContainer(mdPath)); - - RemoteStoreLockManager mdLockManager = RemoteStoreLockManagerFactory.newLockManager( - repositoriesService.get(), - repositoryName, - indexUUID, - shardIdStr, - pathStrategy, - segmentsPathFixedPrefix, - indexFixedPrefix - ); - - return new CompositeRemoteSegmentStoreDirectory( - compositeDataDirectory, - metadataDirectory, - mdLockManager, - threadPool, - shardId, - pendingDownloadMergedSegments - ); - } catch (RepositoryMissingException e) { - throw new IllegalArgumentException("Repository should be created before creating index with remote_store enabled setting", e); - } - } - - public Supplier getRepositoriesService() { - return this.repositoriesService; - } -} diff --git a/server/src/main/java/org/opensearch/index/store/CompositeStoreDirectory.java b/server/src/main/java/org/opensearch/index/store/CompositeStoreDirectory.java index b04daa41208dc..3b74e5a4a2ee8 100644 --- a/server/src/main/java/org/opensearch/index/store/CompositeStoreDirectory.java +++ b/server/src/main/java/org/opensearch/index/store/CompositeStoreDirectory.java @@ -11,10 +11,11 @@ import org.apache.logging.log4j.Logger; import org.apache.lucene.store.*; import org.opensearch.common.annotation.PublicApi; +import org.opensearch.common.logging.Loggers; import org.opensearch.common.util.io.IOUtils; +import org.opensearch.core.index.shard.ShardId; import org.opensearch.index.IndexSettings; import org.opensearch.index.engine.exec.FileMetadata; -import org.opensearch.index.engine.exec.coord.Any; import org.opensearch.index.shard.ShardPath; import org.opensearch.plugins.DataSourcePlugin; import org.opensearch.plugins.PluginsService; @@ -32,6 +33,9 @@ import java.util.Set; import java.util.stream.Collectors; +import static org.opensearch.index.shard.ShardPath.INDEX_FOLDER_NAME; +import static org.opensearch.index.shard.ShardPath.METADATA_FOLDER_NAME; + /** * Composite directory that coordinates multiple format-specific directories. * Routes file operations to appropriate format directories based on file type. 
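A minimal sketch of the format-based routing idea described in that javadoc, under stated assumptions: SketchFormatDirectory and the write signature below are hypothetical simplifications of the real FormatStoreDirectory/FileMetadata types in this change, and only the delegate lookup plus the unknown-format failure path are illustrated.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

// Hypothetical, simplified stand-in for the FormatStoreDirectory delegates used in this change.
interface SketchFormatDirectory {
    void writeFile(String name, byte[] data) throws IOException;
}

class FormatRoutingSketch {
    // Mirrors the delegatesMap idea: one directory per registered data format.
    private final Map<String, SketchFormatDirectory> delegatesMap = new HashMap<>();

    void register(String dataFormat, SketchFormatDirectory dir) {
        delegatesMap.put(dataFormat, dir);
    }

    void write(String dataFormat, String fileName, byte[] data) throws IOException {
        SketchFormatDirectory delegate = delegatesMap.get(dataFormat);
        if (delegate == null) {
            // Unknown formats are surfaced with the list of available formats, not silently ignored.
            throw new IOException("No directory registered for format '" + dataFormat
                + "', available formats: " + delegatesMap.keySet());
        }
        delegate.writeFile(fileName, data);
    }
}
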
@@ -42,27 +46,26 @@ * @opensearch.api */ @PublicApi(since = "3.0.0") -public class CompositeStoreDirectory extends Directory { +public class CompositeStoreDirectory extends Store.StoreDirectory { - private Any dataFormat; - private final Path directoryPath; public final List> delegates = new ArrayList<>(); public final HashMap> delegatesMap = new HashMap<>(); private final Logger logger; private final DirectoryFileTransferTracker directoryFileTransferTracker; - private final ShardPath shardPath; /** * Simplified constructor for auto-discovery (like CompositeIndexingExecutionEngine) */ - public CompositeStoreDirectory(IndexSettings indexSettings, PluginsService pluginsService, ShardPath shardPath, Logger logger) { - this.shardPath = shardPath; + public CompositeStoreDirectory(IndexSettings indexSettings, PluginsService pluginsService, ShardId shardId, ShardPath shardPath, Logger logger) { + super(null, Loggers.getLogger("index.store.deletes", shardId)); this.logger = logger; this.directoryFileTransferTracker = new DirectoryFileTransferTracker(); - this.directoryPath = shardPath.getDataPath(); try { + FormatStoreDirectory metadataDirectory = createMetadataDirectory(shardPath); + delegatesMap.put("metadata", metadataDirectory); + pluginsService.filterPlugins(DataSourcePlugin.class).forEach(plugin -> { try { FormatStoreDirectory formatDir = plugin.createFormatStoreDirectory(indexSettings, shardPath); @@ -82,8 +85,18 @@ public CompositeStoreDirectory(IndexSettings indexSettings, PluginsService plugi } } + /** + * Creates a metadata directory that points to the base Lucene directory where segments_N files are stored. + * This directory is at {@code /lucene/} and always exists regardless of active data formats. + */ + private FormatStoreDirectory createMetadataDirectory(ShardPath shardPath) throws IOException { + // Create FSDirectory pointing to /lucene/ where segments_N files live + Path luceneIndexPath = shardPath.resolveIndex(); // Returns /lucene/ + Directory luceneDirectory = FSDirectory.open(luceneIndexPath); + return new LuceneStoreDirectory(luceneIndexPath, luceneDirectory); + } + public void initialize() throws IOException { - // Initialize all delegates for (FormatStoreDirectory delegate : delegates) { delegate.initialize(); } @@ -109,11 +122,6 @@ public FormatStoreDirectory getDirectoryForFormat(String dataFormatName) { FormatStoreDirectory directory = delegatesMap.get(dataFormatName); if (directory == null) { - - if(dataFormatName.equalsIgnoreCase("TempMetadata") && !delegates.isEmpty()) - { - return delegates.getFirst(); - } List availableFormats = new ArrayList<>(delegatesMap.keySet()); logger.error("Format routing failed: requested format '{}' not found. Available formats: {}. " + diff --git a/server/src/main/java/org/opensearch/index/store/CompositeStoreDirectoryFactory.java b/server/src/main/java/org/opensearch/index/store/CompositeStoreDirectoryFactory.java index ad062573a8aa1..cd4aa65d7883c 100644 --- a/server/src/main/java/org/opensearch/index/store/CompositeStoreDirectoryFactory.java +++ b/server/src/main/java/org/opensearch/index/store/CompositeStoreDirectoryFactory.java @@ -9,6 +9,7 @@ package org.opensearch.index.store; import org.opensearch.common.annotation.ExperimentalApi; +import org.opensearch.core.index.shard.ShardId; import org.opensearch.index.IndexSettings; import org.opensearch.index.shard.ShardPath; import org.opensearch.plugins.PluginsService; @@ -19,7 +20,7 @@ * Factory interface for creating CompositeStoreDirectory instances. 
* This interface follows the existing IndexStorePlugin pattern to provide * a centralized way to create composite directories with format discovery. - * + * * @opensearch.experimental */ @ExperimentalApi @@ -28,21 +29,23 @@ public interface CompositeStoreDirectoryFactory { /** * Creates a new CompositeStoreDirectory per shard with automatic format discovery. - * + *

* The factory will: * - Use PluginsService to discover available DataFormat plugins * - Create format-specific directories for each discovered format * - Provide fallback behavior if no plugins are found * - Handle errors gracefully with proper logging - * - * @param indexSettings the shard's index settings containing configuration - * @param shardPath the path the shard is using for file storage + * + * @param indexSettings the shard's index settings containing configuration + * @param shardId + * @param shardPath the path the shard is using for file storage * @param pluginsService service for discovering DataFormat plugins and creating format directories * @return a new CompositeStoreDirectory instance supporting all discovered formats * @throws IOException if directory creation fails or resources cannot be allocated */ CompositeStoreDirectory newCompositeStoreDirectory( IndexSettings indexSettings, + ShardId shardId, ShardPath shardPath, PluginsService pluginsService ) throws IOException; diff --git a/server/src/main/java/org/opensearch/index/store/DefaultCompositeStoreDirectoryFactory.java b/server/src/main/java/org/opensearch/index/store/DefaultCompositeStoreDirectoryFactory.java index 8812d3ef491d6..21714fc65c76f 100644 --- a/server/src/main/java/org/opensearch/index/store/DefaultCompositeStoreDirectoryFactory.java +++ b/server/src/main/java/org/opensearch/index/store/DefaultCompositeStoreDirectoryFactory.java @@ -11,9 +11,9 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.opensearch.common.annotation.ExperimentalApi; +import org.opensearch.core.index.shard.ShardId; import org.opensearch.index.IndexSettings; import org.opensearch.index.engine.exec.DataFormat; -import org.opensearch.index.engine.exec.coord.Any; import org.opensearch.index.shard.ShardPath; import org.opensearch.plugins.PluginsService; @@ -21,8 +21,6 @@ import java.util.Arrays; import java.util.List; -import static org.opensearch.index.translog.transfer.TranslogTransferMetadata.logger; - /** * Default implementation of CompositeStoreDirectoryFactory that provides * plugin-based format discovery and fallback behavior. @@ -45,8 +43,9 @@ public class DefaultCompositeStoreDirectoryFactory implements CompositeStoreDire /** * Creates a new CompositeStoreDirectory with plugin-based format discovery. 
* - * @param indexSettings the shard's index settings - * @param shardPath the path the shard is using + * @param indexSettings the shard's index settings + * @param shardId + * @param shardPath the path the shard is using * @param pluginsService service for discovering DataFormat plugins * @return a new CompositeStoreDirectory instance * @throws IOException if directory creation fails @@ -54,7 +53,7 @@ public class DefaultCompositeStoreDirectoryFactory implements CompositeStoreDire @Override public CompositeStoreDirectory newCompositeStoreDirectory( IndexSettings indexSettings, - ShardPath shardPath, + ShardId shardId, ShardPath shardPath, PluginsService pluginsService ) throws IOException { @@ -67,6 +66,7 @@ public CompositeStoreDirectory newCompositeStoreDirectory( CompositeStoreDirectory compositeDirectory = new CompositeStoreDirectory( indexSettings, pluginsService, + shardId, shardPath, logger ); diff --git a/server/src/main/java/org/opensearch/index/store/GenericStoreDirectory.java b/server/src/main/java/org/opensearch/index/store/GenericStoreDirectory.java index b109ace1e2c76..8e497d362511d 100644 --- a/server/src/main/java/org/opensearch/index/store/GenericStoreDirectory.java +++ b/server/src/main/java/org/opensearch/index/store/GenericStoreDirectory.java @@ -10,9 +10,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.store.*; import org.opensearch.index.engine.exec.DataFormat; import org.opensearch.index.engine.exec.FileMetadata; import org.opensearch.index.shard.ShardPath; @@ -30,6 +28,7 @@ import java.nio.file.StandardOpenOption; import java.util.Collection; import java.util.stream.StreamSupport; +import java.util.zip.CRC32; /** * Generic FormatStoreDirectory implementation for non-Lucene formats. 
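The hunks below move fileLength and checksum calculation from raw java.nio streams onto Lucene IndexInput. As a self-contained sketch of that chunked CRC32-over-IndexInput loop (ByteBuffersDirectory and the demo file name are illustrative choices, not part of the PR), the technique looks roughly like this:

import java.io.IOException;
import java.util.zip.CRC32;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;

class Crc32OverIndexInputSketch {
    // Chunked CRC32 over an IndexInput, bounded by the input's reported length.
    static long crc32(IndexInput in) throws IOException {
        CRC32 crc = new CRC32();
        byte[] buffer = new byte[8192];
        long remaining = in.length();
        while (remaining > 0) {
            int toRead = (int) Math.min(buffer.length, remaining);
            in.readBytes(buffer, 0, toRead);
            crc.update(buffer, 0, toRead);
            remaining -= toRead;
        }
        return crc.getValue();
    }

    public static void main(String[] args) throws IOException {
        try (Directory dir = new ByteBuffersDirectory()) {
            try (IndexOutput out = dir.createOutput("demo.bin", IOContext.DEFAULT)) {
                out.writeBytes(new byte[] { 1, 2, 3, 4 }, 4);
            }
            try (IndexInput in = dir.openInput("demo.bin", IOContext.DEFAULT)) {
                System.out.println("crc32=" + crc32(in));
            }
        }
    }
}

Going through the directory abstraction instead of Files.newInputStream keeps the checksum path consistent with however the format directory actually materializes its files.
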
@@ -117,15 +116,14 @@ public void deleteFile(String name) throws IOException { @Override public long fileLength(String name) throws IOException { - Path filePath = directoryPath.resolve(name); - try { - return Files.size(filePath); + try (IndexInput input = openIndexInput(name, IOContext.READONCE)) { + return input.length(); } catch (IOException e) { throw new MultiFormatStoreException( "Failed to get file length: " + name, dataFormat, "fileLength", - filePath, + directoryPath.resolve(name), e ); } @@ -206,33 +204,35 @@ public void rename(String source, String dest) throws IOException { @Override public long calculateChecksum(String fileName) throws IOException { - Path filePath = directoryPath.resolve(fileName); - try (InputStream inputStream = Files.newInputStream(filePath, StandardOpenOption.READ)) { - return calculateGenericChecksum(inputStream); + try (IndexInput indexInput = openIndexInput(fileName, IOContext.READONCE)) { + return calculateGenericChecksum(indexInput); } catch (IOException e) { throw new MultiFormatStoreException( "Failed to calculate checksum for file: " + fileName, dataFormat, "calculateChecksum", - filePath, + directoryPath.resolve(fileName), e ); } } /** - * Calculates a generic CRC32 checksum for the given input stream - * @param inputStream the input stream to calculate checksum for + * Calculates a generic CRC32 checksum for the given index input + * @param indexInput the input stream to calculate checksum for * @return the checksum as a string representation * @throws IOException if reading from the stream fails */ - private long calculateGenericChecksum(InputStream inputStream) throws IOException { - java.util.zip.CRC32 crc32 = new java.util.zip.CRC32(); + private long calculateGenericChecksum(IndexInput indexInput) throws IOException { + CRC32 crc32 = new CRC32(); byte[] buffer = new byte[8192]; - int bytesRead; + long remaining = indexInput.length(); - while ((bytesRead = inputStream.read(buffer)) != -1) { - crc32.update(buffer, 0, bytesRead); + while (remaining > 0) { + int toRead = (int) Math.min(buffer.length, remaining); + indexInput.readBytes(buffer, 0, toRead); + crc32.update(buffer, 0, toRead); + remaining -= toRead; } return crc32.getValue(); @@ -257,8 +257,8 @@ public String calculateUploadChecksum(String fileName) throws IOException { long startTime = System.nanoTime(); - try (InputStream inputStream = Files.newInputStream(filePath)) { - long checksum = calculateGenericChecksum(inputStream); + try (IndexInput indexInput = openIndexInput(fileName, IOContext.READONCE)) { + long checksum = calculateGenericChecksum(indexInput); String checksumString = Long.toString(checksum); long calculationDurationMs = (System.nanoTime() - startTime) / 1_000_000; @@ -306,7 +306,7 @@ public IndexInput openIndexInput(String name, IOContext context) throws IOExcept long fileSize = channel.size(); // Create FileChannel-based IndexInput - return new GenericFileChannelIndexInput(name, channel, fileSize, context); + return new NIOFSIndexInput(name, channel, context); } catch (IOException e) { logger.error("Failed to create IndexInput for generic format: file={}, format={}, filePath={}, error={}", @@ -322,6 +322,130 @@ public IndexInput openIndexInput(String name, IOContext context) throws IOExcept } } + /** Reads bytes with {@link FileChannel#read(ByteBuffer, long)} */ + static final class NIOFSIndexInput extends BufferedIndexInput { + /** The maximum chunk size for reads of 16384 bytes. 
*/ + private static final int CHUNK_SIZE = 16384; + + /** the file channel we will read from */ + protected final FileChannel channel; + + /** is this instance a clone and hence does not own the file to close it */ + boolean isClone = false; + + /** start offset: non-zero in the slice case */ + protected final long off; + + /** end offset (start+length) */ + protected final long end; + + public NIOFSIndexInput(String resourceDesc, FileChannel fc, IOContext context) + throws IOException { + super(resourceDesc, context); + this.channel = fc; + this.off = 0L; + this.end = fc.size(); + } + + public NIOFSIndexInput( + String resourceDesc, FileChannel fc, long off, long length, int bufferSize) { + super(resourceDesc, bufferSize); + this.channel = fc; + this.off = off; + this.end = off + length; + this.isClone = true; + } + + @Override + public void close() throws IOException { + if (!isClone) { + channel.close(); + } + } + + @Override + public NIOFSIndexInput clone() { + NIOFSIndexInput clone = (NIOFSIndexInput) super.clone(); + clone.isClone = true; + return clone; + } + + @Override + public IndexInput slice(String sliceDescription, long offset, long length) throws IOException { + if ((length | offset) < 0 || length > this.length() - offset) { + throw new IllegalArgumentException( + "slice() " + + sliceDescription + + " out of bounds: offset=" + + offset + + ",length=" + + length + + ",fileLength=" + + this.length() + + ": " + + this); + } + return new NIOFSIndexInput( + getFullSliceDescription(sliceDescription), + channel, + off + offset, + length, + getBufferSize()); + } + + @Override + public final long length() { + return end - off; + } + + @Override + protected void readInternal(ByteBuffer b) throws IOException { + long pos = getFilePointer() + off; + + if (pos + b.remaining() > end) { + throw new EOFException("read past EOF: " + this); + } + + try { + int readLength = b.remaining(); + while (readLength > 0) { + final int toRead = Math.min(CHUNK_SIZE, readLength); + b.limit(b.position() + toRead); + assert b.remaining() == toRead; + final int i = channel.read(b, pos); + if (i < 0) { + // be defensive here, even though we checked before hand, something could have changed + throw new EOFException( + "read past EOF: " + + this + + " buffer: " + + b + + " chunkLen: " + + toRead + + " end: " + + end); + } + assert i > 0 + : "FileChannel.read with non zero-length bb.remaining() must always read at least " + + "one byte (FileChannel is in blocking mode, see spec of ReadableByteChannel)"; + pos += i; + readLength -= i; + } + assert readLength == 0; + } catch (IOException ioe) { + throw new IOException(ioe.getMessage() + ": " + this, ioe); + } + } + + @Override + protected void seekInternal(long pos) throws IOException { + if (pos > length()) { + throw new EOFException( + "read past EOF: pos=" + pos + " vs length=" + length() + ": " + this); + } + } + } + /** * FileChannel-based IndexInput implementation that provides full Lucene compatibility. * This implementation mirrors NIOFSDirectory's internal IndexInput behavior. 
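For reference, the positional-read pattern the NIOFSIndexInput copy above relies on can be exercised in isolation with plain JDK APIs. This standalone sketch (the temporary file and the 16 KB chunk size are illustrative assumptions, not taken from the PR) shows the same FileChannel.read(ByteBuffer, long) loop with the defensive EOF check:

import java.io.EOFException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

class PositionalReadSketch {
    private static final int CHUNK_SIZE = 16384;

    // Fill the buffer starting at pos, reading at most CHUNK_SIZE bytes per FileChannel.read call.
    static void readFully(FileChannel channel, ByteBuffer b, long pos, long end) throws IOException {
        if (pos + b.remaining() > end) {
            throw new EOFException("read past EOF at pos=" + pos);
        }
        int readLength = b.remaining();
        while (readLength > 0) {
            int toRead = Math.min(CHUNK_SIZE, readLength);
            b.limit(b.position() + toRead);
            int i = channel.read(b, pos);
            if (i < 0) {
                // Defensive: the file may have shrunk after the length check above.
                throw new EOFException("read past EOF, channel returned -1 at pos=" + pos);
            }
            pos += i;
            readLength -= i;
        }
    }

    public static void main(String[] args) throws IOException {
        Path tmp = Files.createTempFile("positional-read", ".bin");
        Files.write(tmp, new byte[] { 10, 20, 30, 40, 50 });
        try (FileChannel channel = FileChannel.open(tmp, StandardOpenOption.READ)) {
            ByteBuffer buf = ByteBuffer.allocate(3);
            readFully(channel, buf, 1, channel.size());
            System.out.println("bytes read: " + buf.position()); // 3
        } finally {
            Files.deleteIfExists(tmp);
        }
    }
}
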
diff --git a/server/src/main/java/org/opensearch/index/store/RemoteDirectory.java b/server/src/main/java/org/opensearch/index/store/RemoteDirectory.java index 25ee020de8562..db7792933df3a 100644 --- a/server/src/main/java/org/opensearch/index/store/RemoteDirectory.java +++ b/server/src/main/java/org/opensearch/index/store/RemoteDirectory.java @@ -184,6 +184,10 @@ public void deleteFile(String name) throws IOException { blobContainer.deleteBlobsIgnoringIfNotExists(Collections.singletonList(name)); } + public void deleteFile(UploadedSegmentMetadata uploadedSegmentMetadata) throws IOException { + deleteFile(uploadedSegmentMetadata.getUploadedFilename()); + } + /** * Creates and returns a new instance of {@link RemoteIndexOutput} which will be used to copy files to the remote * store. diff --git a/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java b/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java index 452020eef835d..c8d9ba4ca4023 100644 --- a/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java +++ b/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java @@ -23,7 +23,6 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.util.Version; import org.opensearch.common.Nullable; import org.opensearch.common.UUIDs; import org.opensearch.common.annotation.InternalApi; @@ -61,7 +60,6 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicBoolean; @@ -176,14 +174,12 @@ public RemoteSegmentStoreDirectory( * @throws IOException if there were any failures in reading the metadata file */ public RemoteSegmentMetadata init() throws IOException { - logger.debug("Start initialisation of remote segment metadata"); RemoteSegmentMetadata remoteSegmentMetadata = readLatestMetadataFile(); if (remoteSegmentMetadata != null) { this.segmentsUploadedToRemoteStore = new ConcurrentHashMap<>(remoteSegmentMetadata.getMetadata()); } else { this.segmentsUploadedToRemoteStore = new ConcurrentHashMap<>(); } - logger.debug("Initialisation of remote segment metadata completed"); return remoteSegmentMetadata; } @@ -278,7 +274,7 @@ public RemoteSegmentMetadata readLatestMetadataFile() throws IOException { return remoteSegmentMetadata; } - private RemoteSegmentMetadata readMetadataFile(String metadataFilename) throws IOException { + protected RemoteSegmentMetadata readMetadataFile(String metadataFilename) throws IOException { try (InputStream inputStream = remoteMetadataDirectory.getBlobStream(metadataFilename)) { byte[] metadataBytes = inputStream.readAllBytes(); return metadataStreamWrapper.readStream(new ByteArrayIndexInput(metadataFilename, metadataBytes)); @@ -525,7 +521,7 @@ String getMetadataFileForCommit(long primaryTerm, long generation) throws IOExce private void postUpload(Directory from, String src, String remoteFilename, String checksum) throws IOException { UploadedSegmentMetadata segmentMetadata = new UploadedSegmentMetadata(src, remoteFilename, checksum, from.fileLength(src)); - segmentsUploadedToRemoteStore.put(src, segmentMetadata); + segmentsUploadedToRemoteStore.put(new FileMetadata(src).serialize(), segmentMetadata); } /** @@ -600,12 +596,13 @@ public void uploadMetadata( Map segmentToLuceneVersion = 
getSegmentToLuceneVersion(segmentFiles, segmentInfosSnapshot); Map uploadedSegments = new HashMap<>(); for (String file : segmentFiles) { - if (segmentsUploadedToRemoteStore.containsKey(file)) { - UploadedSegmentMetadata metadata = segmentsUploadedToRemoteStore.get(file); + String normalizedFile = new FileMetadata(file).serialize(); + if (segmentsUploadedToRemoteStore.containsKey(normalizedFile)) { + UploadedSegmentMetadata metadata = segmentsUploadedToRemoteStore.get(normalizedFile); metadata.setWrittenByMajor(segmentToLuceneVersion.get(metadata.getOriginalFilename())); - uploadedSegments.put(file, metadata.toString()); + uploadedSegments.put(normalizedFile, metadata.toString()); } else { - throw new NoSuchFileException(file); + throw new NoSuchFileException(normalizedFile); } } @@ -620,7 +617,7 @@ public void uploadMetadata( new RemoteSegmentMetadata( RemoteSegmentMetadata.fromMapOfStrings(uploadedSegments).entrySet().stream().collect( Collectors.toMap( - entry -> new FileMetadata(entry.getKey() + FileMetadata.DELIMITER + "lucene"), + entry -> new FileMetadata(entry.getKey()), // Keys are already serialized, don't add :::lucene again Map.Entry::getValue ) ), @@ -701,7 +698,7 @@ public String getExistingRemoteFilename(String localFilename) { return null; } - private String getNewRemoteSegmentFilename(String localFilename) { + protected String getNewRemoteSegmentFilename(String localFilename) { return localFilename + SEGMENT_NAME_UUID_SEPARATOR + UUIDs.base64UUID(); } @@ -856,20 +853,17 @@ public void deleteStaleSegments(int lastNMetadataFilesToKeep) throws IOException Set deletedSegmentFiles = new HashSet<>(); for (String metadataFile : metadataFilesToBeDeleted) { Map staleSegmentFilesMetadataMap = readMetadataFile(metadataFile).getMetadata(); - Set staleSegmentRemoteFilenames = staleSegmentFilesMetadataMap.values() - .stream() - .map(metadata -> metadata.getUploadedFilename()) - .collect(Collectors.toSet()); AtomicBoolean deletionSuccessful = new AtomicBoolean(true); - staleSegmentRemoteFilenames.stream() - .filter(file -> activeSegmentRemoteFilenames.contains(file) == false) - .filter(file -> deletedSegmentFiles.contains(file) == false) - .forEach(file -> { + staleSegmentFilesMetadataMap.entrySet().stream() + .filter(e -> activeSegmentRemoteFilenames.contains(e.getValue().getUploadedFilename()) == false) + .filter(e -> deletedSegmentFiles.contains(e.getValue().getUploadedFilename()) == false) + .forEach(entry -> { + String file = entry.getValue().getUploadedFilename(); try { - remoteDataDirectory.deleteFile(file); + remoteDataDirectory.deleteFile(entry.getValue()); deletedSegmentFiles.add(file); - if (!activeSegmentFilesMetadataMap.containsKey(getLocalSegmentFilename(file))) { - segmentsUploadedToRemoteStore.remove(getLocalSegmentFilename(file)); + if (!activeSegmentFilesMetadataMap.containsKey(entry.getKey())) { + removeFileFromSegmentsUploadedToRemoteStore(file); } } catch (NoSuchFileException e) { logger.info("Segment file {} corresponding to metadata file {} does not exist in remote", file, metadataFile); @@ -890,6 +884,10 @@ public void deleteStaleSegments(int lastNMetadataFilesToKeep) throws IOException logger.debug("deletedSegmentFiles={}", deletedSegmentFiles); } + protected void removeFileFromSegmentsUploadedToRemoteStore(String file) { + segmentsUploadedToRemoteStore.remove(getLocalSegmentFilename(file)); + } + public void deleteStaleSegmentsAsync(int lastNMetadataFilesToKeep) { deleteStaleSegmentsAsync(lastNMetadataFilesToKeep, ActionListener.wrap(r -> {}, e -> {})); } diff 
--git a/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryFactory.java b/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryFactory.java index 35aba694729cb..df66f2977be37 100644 --- a/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryFactory.java +++ b/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryFactory.java @@ -8,16 +8,21 @@ package org.opensearch.index.store; +import org.apache.logging.log4j.LogManager; import org.apache.lucene.store.Directory; import org.opensearch.common.annotation.PublicApi; import org.opensearch.common.blobstore.BlobPath; import org.opensearch.core.index.shard.ShardId; import org.opensearch.index.IndexSettings; +import org.opensearch.index.engine.exec.FileMetadata; import org.opensearch.index.remote.RemoteStorePathStrategy; +import org.opensearch.index.remote.RemoteStoreUtils; import org.opensearch.index.shard.ShardPath; import org.opensearch.index.store.lockmanager.RemoteStoreLockManager; import org.opensearch.index.store.lockmanager.RemoteStoreLockManagerFactory; +import org.opensearch.index.store.remote.CompositeRemoteDirectory; import org.opensearch.plugins.IndexStorePlugin; +import org.opensearch.plugins.PluginsService; import org.opensearch.repositories.RepositoriesService; import org.opensearch.repositories.Repository; import org.opensearch.repositories.RepositoryMissingException; @@ -43,29 +48,58 @@ public class RemoteSegmentStoreDirectoryFactory implements IndexStorePlugin.DirectoryFactory { private final Supplier repositoriesService; private final String segmentsPathFixedPrefix; - private final ThreadPool threadPool; + private final PluginsService pluginsService; public RemoteSegmentStoreDirectoryFactory( Supplier repositoriesService, ThreadPool threadPool, String segmentsPathFixedPrefix + ) { + this(repositoriesService, threadPool, segmentsPathFixedPrefix, null); + } + + public RemoteSegmentStoreDirectoryFactory( + Supplier repositoriesService, + ThreadPool threadPool, + String segmentsPathFixedPrefix, + PluginsService pluginsService ) { this.repositoriesService = repositoriesService; this.segmentsPathFixedPrefix = segmentsPathFixedPrefix; this.threadPool = threadPool; + this.pluginsService = pluginsService; } @Override public Directory newDirectory(IndexSettings indexSettings, ShardPath path) throws IOException { String repositoryName = indexSettings.getRemoteStoreRepository(); String indexUUID = indexSettings.getIndex().getUUID(); - return newDirectory(repositoryName, indexUUID, path.getShardId(), indexSettings.getRemoteStorePathStrategy()); + + // Check if this is an optimized index to determine directory type + if (indexSettings.isOptimizedIndex()) { + return newCompositeDirectory( + repositoryName, + indexUUID, + path.getShardId(), + indexSettings.getRemoteStorePathStrategy(), + RemoteStoreUtils.isServerSideEncryptionEnabledIndex(indexSettings.getIndexMetadata()) + ); + } else { + return newDirectory( + repositoryName, + indexUUID, + path.getShardId(), + indexSettings.getRemoteStorePathStrategy(), + null, + RemoteStoreUtils.isServerSideEncryptionEnabledIndex(indexSettings.getIndexMetadata()) + ); + } } public Directory newDirectory(String repositoryName, String indexUUID, ShardId shardId, RemoteStorePathStrategy pathStrategy) throws IOException { - return newDirectory(repositoryName, indexUUID, shardId, pathStrategy, null); + return newDirectory(repositoryName, indexUUID, shardId, pathStrategy, null, false); } public Directory newDirectory( @@ 
-75,9 +109,21 @@ public Directory newDirectory( RemoteStorePathStrategy pathStrategy, String indexFixedPrefix ) throws IOException { - assert Objects.nonNull(pathStrategy); - try (Repository repository = repositoriesService.get().repository(repositoryName)) { + return newDirectory(repositoryName, indexUUID, shardId, pathStrategy, indexFixedPrefix, false); + } + public Directory newDirectory( + String repositoryName, + String indexUUID, + ShardId shardId, + RemoteStorePathStrategy pathStrategy, + String indexFixedPrefix, + boolean isServerSideEncryptionEnabled + ) throws IOException { + assert Objects.nonNull(pathStrategy); + // We should not be calling close on the repository here. + Repository repository = repositoriesService.get().repository(repositoryName); + try { assert repository instanceof BlobStoreRepository : "repository should be instance of BlobStoreRepository"; BlobStoreRepository blobStoreRepository = ((BlobStoreRepository) repository); BlobPath repositoryBasePath = blobStoreRepository.basePath(); @@ -93,10 +139,11 @@ public Directory newDirectory( .fixedPrefix(segmentsPathFixedPrefix) .indexFixedPrefix(indexFixedPrefix) .build(); + // Derive the path for data directory of SEGMENTS BlobPath dataPath = pathStrategy.generatePath(dataPathInput); RemoteDirectory dataDirectory = new RemoteDirectory( - blobStoreRepository.blobStore().blobContainer(dataPath), + blobStoreRepository.blobStore(isServerSideEncryptionEnabled).blobContainer(dataPath), blobStoreRepository::maybeRateLimitRemoteUploadTransfers, blobStoreRepository::maybeRateLimitLowPriorityRemoteUploadTransfers, blobStoreRepository::maybeRateLimitRemoteDownloadTransfers, @@ -115,7 +162,9 @@ public Directory newDirectory( .build(); // Derive the path for metadata directory of SEGMENTS BlobPath mdPath = pathStrategy.generatePath(mdPathInput); - RemoteDirectory metadataDirectory = new RemoteDirectory(blobStoreRepository.blobStore().blobContainer(mdPath)); + RemoteDirectory metadataDirectory = new RemoteDirectory( + blobStoreRepository.blobStore(isServerSideEncryptionEnabled).blobContainer(mdPath) + ); // The path for lock is derived within the RemoteStoreLockManagerFactory RemoteStoreLockManager mdLockManager = RemoteStoreLockManagerFactory.newLockManager( @@ -145,4 +194,97 @@ public Supplier getRepositoriesService() { return this.repositoriesService; } + /** + * Creates a CompositeRemoteSegmentStoreDirectory for optimized indices. + * This method is called when indexSettings.isOptimizedIndex() returns true.
+ */ + private Directory newCompositeDirectory( + String repositoryName, + String indexUUID, + ShardId shardId, + RemoteStorePathStrategy pathStrategy, + boolean isServerSideEncryptionEnabled + ) throws IOException { + return newCompositeDirectory(repositoryName, indexUUID, shardId, pathStrategy, null, isServerSideEncryptionEnabled); + } + + private Directory newCompositeDirectory( + String repositoryName, + String indexUUID, + ShardId shardId, + RemoteStorePathStrategy pathStrategy, + String indexFixedPrefix, + boolean isServerSideEncryptionEnabled + ) throws IOException { + assert Objects.nonNull(pathStrategy); + try (Repository repository = repositoriesService.get().repository(repositoryName)) { + + assert repository instanceof BlobStoreRepository : "repository should be instance of BlobStoreRepository"; + BlobStoreRepository blobStoreRepository = ((BlobStoreRepository) repository); + BlobPath repositoryBasePath = blobStoreRepository.basePath(); + String shardIdStr = String.valueOf(shardId.id()); + Map pendingDownloadMergedSegments = new ConcurrentHashMap<>(); + + RemoteStorePathStrategy.ShardDataPathInput dataPathInput = RemoteStorePathStrategy.ShardDataPathInput.builder() + .basePath(repositoryBasePath) + .indexUUID(indexUUID) + .shardId(shardIdStr) + .dataCategory(SEGMENTS) + .dataType(DATA) + .fixedPrefix(segmentsPathFixedPrefix) + .indexFixedPrefix(indexFixedPrefix) + .build(); + + BlobPath dataPath = pathStrategy.generatePath(dataPathInput); + + CompositeRemoteDirectory compositeDataDirectory = new CompositeRemoteDirectory( + blobStoreRepository.blobStore(isServerSideEncryptionEnabled), + dataPath, + blobStoreRepository::maybeRateLimitRemoteUploadTransfers, + blobStoreRepository::maybeRateLimitLowPriorityRemoteUploadTransfers, + blobStoreRepository::maybeRateLimitRemoteDownloadTransfers, + blobStoreRepository::maybeRateLimitLowPriorityDownloadTransfers, + pendingDownloadMergedSegments, + LogManager.getLogger("index.store.remote.composite." 
+ shardId), + pluginsService + ); + + RemoteStorePathStrategy.ShardDataPathInput mdPathInput = RemoteStorePathStrategy.ShardDataPathInput.builder() + .basePath(repositoryBasePath) + .indexUUID(indexUUID) + .shardId(shardIdStr) + .dataCategory(SEGMENTS) + .dataType(METADATA) + .fixedPrefix(segmentsPathFixedPrefix) + .indexFixedPrefix(indexFixedPrefix) + .build(); + + BlobPath mdPath = pathStrategy.generatePath(mdPathInput); + RemoteDirectory metadataDirectory = new RemoteDirectory( + blobStoreRepository.blobStore(isServerSideEncryptionEnabled).blobContainer(mdPath) + ); + + RemoteStoreLockManager mdLockManager = RemoteStoreLockManagerFactory.newLockManager( + repositoriesService.get(), + repositoryName, + indexUUID, + shardIdStr, + pathStrategy, + segmentsPathFixedPrefix, + indexFixedPrefix + ); + + return new CompositeRemoteSegmentStoreDirectory( + compositeDataDirectory, + metadataDirectory, + mdLockManager, + threadPool, + shardId, + pendingDownloadMergedSegments + ); + } catch (RepositoryMissingException e) { + throw new IllegalArgumentException("Repository should be created before creating index with remote_store enabled setting", e); + } + } + } diff --git a/server/src/main/java/org/opensearch/index/store/RemoteStoreFileDownloader.java b/server/src/main/java/org/opensearch/index/store/RemoteStoreFileDownloader.java index 69b7671262fdc..da6ddfff80581 100644 --- a/server/src/main/java/org/opensearch/index/store/RemoteStoreFileDownloader.java +++ b/server/src/main/java/org/opensearch/index/store/RemoteStoreFileDownloader.java @@ -21,7 +21,7 @@ import org.opensearch.core.index.shard.ShardId; import org.opensearch.index.engine.exec.FileMetadata; import org.opensearch.indices.recovery.RecoverySettings; -import org.opensearch.indices.replication.CompositeStoreDirectoryStatsWrapper; +import org.opensearch.indices.replication.SegmentReplicationSource; import org.opensearch.threadpool.ThreadPool; import java.io.IOException; @@ -43,11 +43,13 @@ public final class RemoteStoreFileDownloader { private final Logger logger; private final ThreadPool threadPool; private final RecoverySettings recoverySettings; + private final boolean isOptimizedIndex; - public RemoteStoreFileDownloader(ShardId shardId, ThreadPool threadPool, RecoverySettings recoverySettings) { + public RemoteStoreFileDownloader(ShardId shardId, ThreadPool threadPool, RecoverySettings recoverySettings, boolean isOptimizedIndex) { this.logger = Loggers.getLogger(RemoteStoreFileDownloader.class, shardId); this.threadPool = threadPool; this.recoverySettings = recoverySettings; + this.isOptimizedIndex = isOptimizedIndex; } /** @@ -79,7 +81,7 @@ public void downloadAsync( public void downloadAsync( CancellableThreads cancellableThreads, RemoteSegmentStoreDirectory source, - CompositeStoreDirectoryStatsWrapper destination, + SegmentReplicationSource.ReplicationStatsDirectoryWrapper destination, List toDownloadFileMetadata, ActionListener listener ) { @@ -158,7 +160,7 @@ private void downloadInternal( private void downloadInternalFormatAware( CancellableThreads cancellableThreads, RemoteSegmentStoreDirectory source, - CompositeStoreDirectoryStatsWrapper destination, + SegmentReplicationSource.ReplicationStatsDirectoryWrapper destination, List toDownloadFileMetadata, Runnable onFileCompletion, ActionListener listener @@ -197,11 +199,20 @@ private void copyOneFile( logger.trace("Downloading file {}", file); try { cancellableThreads.executeIO(() -> { - destination.copyFrom(source, file, file, IOContext.DEFAULT); - logger.trace("Downloaded file 
{} of size {}", file, destination.fileLength(file)); + String localFileName; + if (isOptimizedIndex) { + // Optimized indices use FileMetadata for proper routing + localFileName = file; + } else { + // Non-optimized indices: extract plain filename, stripping format suffix + FileMetadata fm = new FileMetadata(file); + localFileName = fm.file(); + } + destination.copyFrom(source, file, localFileName, IOContext.DEFAULT); + logger.trace("Downloaded file {} as {} of size {}", file, localFileName, destination.fileLength(localFileName)); onFileCompletion.run(); if (secondDestination != null) { - secondDestination.copyFrom(destination, file, file, IOContext.DEFAULT); + secondDestination.copyFrom(destination, localFileName, localFileName, IOContext.DEFAULT); } }); } catch (Exception e) { @@ -222,7 +233,7 @@ private void copyOneFile( private void copyOneFileFormatAware( CancellableThreads cancellableThreads, RemoteSegmentStoreDirectory source, - CompositeStoreDirectoryStatsWrapper destination, + SegmentReplicationSource.ReplicationStatsDirectoryWrapper destination, Queue queue, Runnable onFileCompletion, ActionListener listener @@ -236,11 +247,11 @@ private void copyOneFileFormatAware( logger.trace("Downloading format-aware file {} with format {}", fileMetadata.file(), fileMetadata.dataFormat()); try { cancellableThreads.executeIO(() -> { +// String fileName = fileMetadata.serialize(); // Use format-aware copy - CompositeStoreDirectoryStatsWrapper will route based on format - destination.copyFrom(fileMetadata, source, IOContext.DEFAULT); - logger.trace("Downloaded format-aware file {} of format {} of size {}", - fileMetadata.file(), fileMetadata.dataFormat(), - destination.getDelegate().fileLength(fileMetadata)); + destination.copyFrom(source, fileMetadata.serialize(), fileMetadata.file(), IOContext.DEFAULT); + logger.trace("Downloaded format-aware file {} of format {}", + fileMetadata.file(), fileMetadata.dataFormat()); onFileCompletion.run(); // TODO: @kamal, Add second destination support for format-aware operations if needed diff --git a/server/src/main/java/org/opensearch/index/store/Store.java b/server/src/main/java/org/opensearch/index/store/Store.java index 9e2cbbd177f3f..a08b3d5250936 100644 --- a/server/src/main/java/org/opensearch/index/store/Store.java +++ b/server/src/main/java/org/opensearch/index/store/Store.java @@ -74,7 +74,6 @@ import org.opensearch.common.lucene.store.InputStreamIndexInput; import org.opensearch.common.settings.Setting; import org.opensearch.common.settings.Setting.Property; -import org.opensearch.common.settings.Settings; import org.opensearch.common.unit.TimeValue; import org.opensearch.common.util.concurrent.AbstractRefCounted; import org.opensearch.common.util.concurrent.RefCounted; @@ -91,18 +90,15 @@ import org.opensearch.env.ShardLockObtainFailedException; import org.opensearch.index.IndexSettings; import org.opensearch.index.engine.CombinedDeletionPolicy; -import org.opensearch.index.engine.DataFormatPlugin; import org.opensearch.index.engine.Engine; import org.opensearch.index.engine.exec.FileMetadata; -import org.opensearch.index.engine.exec.coord.Any; import org.opensearch.index.engine.exec.coord.CatalogSnapshot; +import org.opensearch.index.engine.exec.coord.CompositeEngineCatalogSnapshot; import org.opensearch.index.seqno.SequenceNumbers; import org.opensearch.index.shard.AbstractIndexShardComponent; import org.opensearch.index.shard.IndexShard; import org.opensearch.index.shard.ShardPath; import org.opensearch.index.translog.Translog; -import 
org.opensearch.plugins.PluginsService; -import org.opensearch.index.engine.exec.DataFormat; import java.io.Closeable; import java.io.EOFException; @@ -189,6 +185,7 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref private final ShardPath shardPath; private final boolean isParentFieldEnabledVersion; private final boolean isIndexSortEnabled; + private final IndexSettings indexSettings; // used to ref count files when a new Reader is opened for PIT/Scroll queries // prevents segment files deletion until the PIT/Scroll expires or is discarded @@ -223,6 +220,8 @@ public Store( ) { super(shardId, indexSettings); + this.indexSettings = indexSettings; + ShardPath actualShardPath = shardPath != null ? shardPath : createTempShardPath(shardId); final TimeValue refreshInterval = indexSettings.getValue(INDEX_STORE_STATS_REFRESH_INTERVAL_SETTING); @@ -904,7 +903,7 @@ public void cleanupAndVerify(String reason, MetadataSnapshot sourceMetadata) thr * @param segmentsGen segment generation number * @throws IOException Exception while reading store and building segment infos */ - public SegmentInfos buildSegmentInfos(byte[] infosBytes, long segmentsGen) throws IOException { + public SegmentInfos buildSegmentInfosFromSerializedCatalogSnapshot(byte[] infosBytes, long segmentsGen) throws IOException { try (final ChecksumIndexInput input = toIndexInput(infosBytes)) { return convertCatalogSnapshotToSegmentInfos(infosBytes, segmentsGen); } catch (Exception e) { @@ -914,13 +913,29 @@ public SegmentInfos buildSegmentInfos(byte[] infosBytes, long segmentsGen) throw } } + /** + * Segment replication method + *

+ * This method takes the segment info bytes to build SegmentInfos. It inc'refs files pointed by passed in SegmentInfos + * bytes to ensure they are not deleted. + * + * @param infosBytes bytes[] of SegmentInfos supposed to be sent over by primary excluding segment_N file + * @param segmentsGen segment generation number + * @throws IOException Exception while reading store and building segment infos + */ + public SegmentInfos buildSegmentInfos(byte[] infosBytes, long segmentsGen) throws IOException { + try (final ChecksumIndexInput input = toIndexInput(infosBytes)) { + return SegmentInfos.readCommit(directory, input, segmentsGen); + } + } + private SegmentInfos convertCatalogSnapshotToSegmentInfos(byte[] catalogSnapshotBytes, long segmentsGen) throws IOException { logger.debug("Converting CatalogSnapshot to SegmentInfos for generation: {}", segmentsGen); // Step 1: Deserialize CatalogSnapshot CatalogSnapshot catalogSnapshot; try (BytesStreamInput input = new BytesStreamInput(catalogSnapshotBytes)) { - catalogSnapshot = new CatalogSnapshot(input); + catalogSnapshot = new CompositeEngineCatalogSnapshot(input); } catch (Exception e) { throw new IOException("Failed to deserialize CatalogSnapshot bytes", e); } @@ -1062,7 +1077,7 @@ public DirectoryFileTransferTracker getDirectoryFileTransferTracker() { * * @opensearch.internal */ - static final class StoreDirectory extends FilterDirectory { + public static class StoreDirectory extends FilterDirectory { private final Logger deletesLogger; public final DirectoryFileTransferTracker directoryFileTransferTracker; @@ -1079,7 +1094,7 @@ long estimateSize() throws IOException { } @Override - public void close() { + public void close() throws IOException { assert false : "Nobody should close this directory except of the Store itself"; } diff --git a/server/src/main/java/org/opensearch/index/store/lockmanager/RemoteStoreLockManagerFactory.java b/server/src/main/java/org/opensearch/index/store/lockmanager/RemoteStoreLockManagerFactory.java index b1742695b6748..08926c09fa033 100644 --- a/server/src/main/java/org/opensearch/index/store/lockmanager/RemoteStoreLockManagerFactory.java +++ b/server/src/main/java/org/opensearch/index/store/lockmanager/RemoteStoreLockManagerFactory.java @@ -67,7 +67,8 @@ public static RemoteStoreMetadataLockManager newLockManager( String segmentsPathFixedPrefix, String indexFixedPrefix ) { - try (Repository repository = repositoriesService.repository(repositoryName)) { + Repository repository = repositoriesService.repository(repositoryName); + try { assert repository instanceof BlobStoreRepository : "repository should be instance of BlobStoreRepository"; BlobPath repositoryBasePath = ((BlobStoreRepository) repository).basePath(); diff --git a/server/src/main/java/org/opensearch/index/store/remote/CompositeRemoteDirectory.java b/server/src/main/java/org/opensearch/index/store/remote/CompositeRemoteDirectory.java index 1a854c0360805..f9f0de0573212 100644 --- a/server/src/main/java/org/opensearch/index/store/remote/CompositeRemoteDirectory.java +++ b/server/src/main/java/org/opensearch/index/store/remote/CompositeRemoteDirectory.java @@ -34,9 +34,7 @@ import org.opensearch.index.engine.exec.DataFormat; import org.opensearch.index.engine.exec.FileMetadata; import org.opensearch.index.engine.exec.coord.Any; -import org.opensearch.index.store.CompositeStoreDirectory; -import org.opensearch.index.store.RemoteIndexInput; -import org.opensearch.index.store.RemoteIndexOutput; +import org.opensearch.index.store.*; import 
org.opensearch.index.store.remote.metadata.RemoteSegmentMetadata; import org.opensearch.index.store.remote.metadata.RemoteSegmentMetadataHandlerFactory; import org.opensearch.common.io.VersionedCodecStreamWrapper; @@ -47,11 +45,14 @@ import java.io.IOException; import java.io.InputStream; import java.nio.file.NoSuchFileException; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.concurrent.ConcurrentHashMap; import java.util.function.UnaryOperator; +import java.util.stream.Collectors; + /** * CompositeRemoteDirectory with direct BlobContainer access per format. @@ -65,7 +66,7 @@ * @opensearch.api */ @PublicApi(since = "3.0.0") -public class CompositeRemoteDirectory implements Closeable { +public class CompositeRemoteDirectory extends RemoteDirectory { /** * Metadata stream wrapper for reading/writing RemoteSegmentMetadata @@ -88,7 +89,6 @@ public class CompositeRemoteDirectory implements Closeable { final Map pendingDownloadMergedSegments; private final Map formatBlobContainers; - private final BlobContainer metadataBlobContainer; private final BlobStore blobStore; private final BlobPath baseBlobPath; private final Logger logger; @@ -107,6 +107,19 @@ public CompositeRemoteDirectory( Logger logger, PluginsService pluginsService ) { + super( + blobStore.blobContainer(baseBlobPath), + uploadRateLimiter, + lowPriorityUploadRateLimiter, + downloadRateLimiter, + lowPriorityDownloadRateLimiter, + pendingDownloadMergedSegments.entrySet().stream().collect( + Collectors.toMap( + e -> e.getKey().serialize(), + Map.Entry::getValue + ) + ) + ); this.formatBlobContainers = new ConcurrentHashMap<>(); this.blobStore = blobStore; this.baseBlobPath = baseBlobPath; @@ -116,9 +129,6 @@ public CompositeRemoteDirectory( this.pendingDownloadMergedSegments = pendingDownloadMergedSegments; this.logger = logger; - BlobPath metadataBlobPath = Objects.requireNonNull(baseBlobPath.parent()).add("metadata"); - this.metadataBlobContainer = blobStore.blobContainer(metadataBlobPath); - try { pluginsService.filterPlugins(DataSourcePlugin.class).forEach( plugin -> { @@ -304,6 +314,13 @@ private long calculateChecksumOfChecksum(CompositeStoreDirectory from, FileMetad return from.calculateChecksum(fileMetadata); } + @Override + public void deleteFile(UploadedSegmentMetadata uploadedSegmentMetadata) throws IOException { + FileMetadata fileMetadata = new FileMetadata(uploadedSegmentMetadata.getDataFormat(), uploadedSegmentMetadata.getUploadedFilename()); + BlobContainer blobContainer = getBlobContainer(fileMetadata.dataFormat()); + blobContainer.deleteBlobsIgnoringIfNotExists(Collections.singletonList(fileMetadata.file())); + } + /** /** @@ -353,9 +370,6 @@ public RemoteIndexOutput createOutput(String remoteFileName, String df, IOContex logger.debug("File {} already exists, using existing container", remoteFileName); return new RemoteIndexOutput(remoteFileName, blobContainer); } - else if(df !=null && df.equals("TempMetadata")) { - return new RemoteIndexOutput(remoteFileName, metadataBlobContainer); - } throw new IOException( String.format("Failed to create output for file %s in format %s", remoteFileName, df) @@ -409,52 +423,6 @@ public void delete() throws IOException { logger.debug("Deleted all format containers from CompositeRemoteDirectory"); } - - /** - * Read the latest metadata file from the metadata blob container. 
- * This method provides compatibility with RemoteSegmentStoreDirectory.readLatestMetadataFile() - */ - public RemoteSegmentMetadata readLatestMetadataFile() throws IOException { - try { - List metadataFiles = metadataBlobContainer.listBlobsByPrefixInSortedOrder( - "metadata", 10, BlobContainer.BlobNameSortOrder.LEXICOGRAPHIC); - - if (metadataFiles.isEmpty()) { - logger.debug("No metadata files found in composite remote directory"); - return null; - } - - // Get the latest (first in reverse lexicographic order) - String latestMetadataFile = metadataFiles.get(0).name(); - logger.debug("Reading latest metadata file: {}", latestMetadataFile); - return readMetadataFile(latestMetadataFile); - } catch (Exception e) { - logger.error("Failed to read latest metadata file from composite directory", e); - throw new IOException("Failed to read latest metadata file", e); - } - } - - /** - * Read a specific metadata file by name from the metadata blob container. - * This method provides compatibility with RemoteSegmentStoreDirectory.readMetadataFile() - */ - public RemoteSegmentMetadata readMetadataFile(String metadataFileName) throws IOException { - try (InputStream inputStream = metadataBlobContainer.readBlob(metadataFileName)) { - byte[] metadataBytes = inputStream.readAllBytes(); - - // Use our own metadata stream wrapper - return metadataStreamWrapper.readStream( - new ByteArrayIndexInput(metadataFileName, metadataBytes) - ); - } catch (NoSuchFileException e) { - logger.debug("Metadata file not found: {}", metadataFileName); - return null; - } catch (Exception e) { - logger.error("Failed to read metadata file: {}", metadataFileName, e); - throw new IOException("Failed to read metadata file: " + metadataFileName, e); - } - } - @Override public void close() throws IOException { formatBlobContainers.clear(); diff --git a/server/src/main/java/org/opensearch/index/translog/RemoteBlobStoreInternalTranslogFactory.java b/server/src/main/java/org/opensearch/index/translog/RemoteBlobStoreInternalTranslogFactory.java index 1f2b2c48b471a..501dbe2962d29 100644 --- a/server/src/main/java/org/opensearch/index/translog/RemoteBlobStoreInternalTranslogFactory.java +++ b/server/src/main/java/org/opensearch/index/translog/RemoteBlobStoreInternalTranslogFactory.java @@ -37,12 +37,15 @@ public class RemoteBlobStoreInternalTranslogFactory implements TranslogFactory { private final RemoteStoreSettings remoteStoreSettings; + private final boolean isServerSideEncryptionEnabled; + public RemoteBlobStoreInternalTranslogFactory( Supplier repositoriesServiceSupplier, ThreadPool threadPool, String repositoryName, RemoteTranslogTransferTracker remoteTranslogTransferTracker, - RemoteStoreSettings remoteStoreSettings + RemoteStoreSettings remoteStoreSettings, + boolean isServerSideEncryptionEnabled ) { Repository repository; try { @@ -54,6 +57,7 @@ public RemoteBlobStoreInternalTranslogFactory( this.threadPool = threadPool; this.remoteTranslogTransferTracker = remoteTranslogTransferTracker; this.remoteStoreSettings = remoteStoreSettings; + this.isServerSideEncryptionEnabled = isServerSideEncryptionEnabled; } @Override @@ -107,7 +111,8 @@ public Translog newTranslog( startedPrimarySupplier, remoteTranslogTransferTracker, remoteStoreSettings, - translogOperationHelper + translogOperationHelper, + isServerSideEncryptionEnabled ); } else { return new RemoteFsTranslog( @@ -123,7 +128,8 @@ public Translog newTranslog( remoteTranslogTransferTracker, remoteStoreSettings, translogOperationHelper, - null + null, + 
isServerSideEncryptionEnabled ); } } diff --git a/server/src/main/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslog.java b/server/src/main/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslog.java index 7fd915ba2c297..1832d1e7d035a 100644 --- a/server/src/main/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslog.java +++ b/server/src/main/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslog.java @@ -76,7 +76,8 @@ public RemoteFsTimestampAwareTranslog( BooleanSupplier startedPrimarySupplier, RemoteTranslogTransferTracker remoteTranslogTransferTracker, RemoteStoreSettings remoteStoreSettings, - TranslogOperationHelper translogOperationHelper + TranslogOperationHelper translogOperationHelper, + boolean isServerSideEncryptionEnabled ) throws IOException { super( config, @@ -91,7 +92,8 @@ public RemoteFsTimestampAwareTranslog( remoteTranslogTransferTracker, remoteStoreSettings, translogOperationHelper, - null + null, + isServerSideEncryptionEnabled ); logger = Loggers.getLogger(getClass(), shardId); this.metadataFilePinnedTimestampMap = new HashMap<>(); diff --git a/server/src/main/java/org/opensearch/index/translog/RemoteFsTranslog.java b/server/src/main/java/org/opensearch/index/translog/RemoteFsTranslog.java index bbe8b739e2da4..76072609963f1 100644 --- a/server/src/main/java/org/opensearch/index/translog/RemoteFsTranslog.java +++ b/server/src/main/java/org/opensearch/index/translog/RemoteFsTranslog.java @@ -39,7 +39,9 @@ import java.nio.file.Files; import java.nio.file.NoSuchFileException; import java.nio.file.Path; +import java.util.ArrayList; import java.util.HashSet; +import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Objects; @@ -95,6 +97,7 @@ public class RemoteFsTranslog extends Translog { private final Semaphore syncPermit = new Semaphore(SYNC_PERMIT); protected final AtomicBoolean pauseSync = new AtomicBoolean(false); private final boolean isTranslogMetadataEnabled; + private final boolean isServerSideEncryptionEnabled; public RemoteFsTranslog( TranslogConfig config, @@ -109,7 +112,8 @@ public RemoteFsTranslog( RemoteTranslogTransferTracker remoteTranslogTransferTracker, RemoteStoreSettings remoteStoreSettings, TranslogOperationHelper translogOperationHelper, - ChannelFactory channelFactory + ChannelFactory channelFactory, + boolean isServerSideEncryptionEnabled ) throws IOException { super( config, @@ -126,6 +130,7 @@ public RemoteFsTranslog( this.remoteTranslogTransferTracker = remoteTranslogTransferTracker; fileTransferTracker = new FileTransferTracker(shardId, remoteTranslogTransferTracker); isTranslogMetadataEnabled = indexSettings().isTranslogMetadataEnabled(); + this.isServerSideEncryptionEnabled = isServerSideEncryptionEnabled; this.translogTransferManager = buildTranslogTransferManager( blobStoreRepository, threadPool, @@ -134,7 +139,8 @@ public RemoteFsTranslog( remoteTranslogTransferTracker, indexSettings().getRemoteStorePathStrategy(), remoteStoreSettings, - isTranslogMetadataEnabled + isTranslogMetadataEnabled, + isServerSideEncryptionEnabled ); try { if (config.downloadRemoteTranslogOnInit()) { @@ -193,7 +199,8 @@ public static void download( Logger logger, boolean seedRemote, boolean isTranslogMetadataEnabled, - long timestamp + long timestamp, + boolean isServerSideEncryptionEnabled ) throws IOException { assert repository instanceof BlobStoreRepository : String.format( Locale.ROOT, @@ -213,7 +220,8 @@ public static void download( remoteTranslogTransferTracker, pathStrategy, 
remoteStoreSettings, - isTranslogMetadataEnabled + isTranslogMetadataEnabled, + isServerSideEncryptionEnabled ); RemoteFsTranslog.download(translogTransferManager, location, logger, seedRemote, timestamp); logger.trace(remoteTranslogTransferTracker.toString()); @@ -325,7 +333,8 @@ public static TranslogTransferManager buildTranslogTransferManager( RemoteTranslogTransferTracker tracker, RemoteStorePathStrategy pathStrategy, RemoteStoreSettings remoteStoreSettings, - boolean isTranslogMetadataEnabled + boolean isTranslogMetadataEnabled, + boolean isServerSideEncryptionEnabled ) { assert Objects.nonNull(pathStrategy); String indexUUID = shardId.getIndex().getUUID(); @@ -348,7 +357,10 @@ public static TranslogTransferManager buildTranslogTransferManager( .fixedPrefix(remoteStoreSettings.getTranslogPathFixedPrefix()) .build(); BlobPath mdPath = pathStrategy.generatePath(mdPathInput); - BlobStoreTransferService transferService = new BlobStoreTransferService(blobStoreRepository.blobStore(), threadPool); + BlobStoreTransferService transferService = new BlobStoreTransferService( + blobStoreRepository.blobStore(isServerSideEncryptionEnabled), + threadPool + ); return new TranslogTransferManager( shardId, transferService, @@ -655,7 +667,8 @@ public static void cleanup( ThreadPool threadPool, RemoteStorePathStrategy pathStrategy, RemoteStoreSettings remoteStoreSettings, - boolean isTranslogMetadataEnabled + boolean isTranslogMetadataEnabled, + boolean isServerSideEncryptionEnabled ) throws IOException { assert repository instanceof BlobStoreRepository : "repository should be instance of BlobStoreRepository"; BlobStoreRepository blobStoreRepository = (BlobStoreRepository) repository; @@ -671,7 +684,8 @@ public static void cleanup( remoteTranslogTransferTracker, pathStrategy, remoteStoreSettings, - isTranslogMetadataEnabled + isTranslogMetadataEnabled, + isServerSideEncryptionEnabled ); // clean up all remote translog files translogTransferManager.deleteTranslogFiles(); diff --git a/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java b/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java index 389d98adcc4eb..f852f2c458d4e 100644 --- a/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java +++ b/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java @@ -292,6 +292,7 @@ public boolean downloadTranslog(String primaryTerm, String generation, Path loca } else { // Download translog.tlog file with object metadata from remote to local FS Map metadata = downloadToFS(translogFilename, location, primaryTerm, true); + try { assert metadata != null && !metadata.isEmpty() && metadata.containsKey(CHECKPOINT_FILE_DATA_KEY); recoverCkpFileUsingMetadata(metadata, location, generation, translogFilename); diff --git a/server/src/main/java/org/opensearch/indices/IndicesService.java b/server/src/main/java/org/opensearch/indices/IndicesService.java index ad10465f70af7..0426e54e4cbc6 100644 --- a/server/src/main/java/org/opensearch/indices/IndicesService.java +++ b/server/src/main/java/org/opensearch/indices/IndicesService.java @@ -131,6 +131,7 @@ import org.opensearch.index.recovery.RecoveryStats; import org.opensearch.index.refresh.RefreshStats; import org.opensearch.index.remote.RemoteStoreStatsTrackerFactory; +import org.opensearch.index.remote.RemoteStoreUtils; import org.opensearch.index.search.stats.SearchStats; import org.opensearch.index.seqno.RetentionLeaseStats; import 
org.opensearch.index.seqno.RetentionLeaseSyncer; @@ -713,7 +714,8 @@ private static BiFunction getTrans threadPool, indexSettings.getRemoteStoreTranslogRepository(), remoteStoreStatsTrackerFactory.getRemoteTranslogTransferTracker(shardRouting.shardId()), - remoteStoreSettings + remoteStoreSettings, + RemoteStoreUtils.isServerSideEncryptionEnabledIndex(indexSettings.getIndexMetadata()) ); } else if (RemoteStoreNodeAttribute.isTranslogRepoConfigured(settings) && shardRouting.primary()) { return new RemoteBlobStoreInternalTranslogFactory( @@ -721,7 +723,8 @@ private static BiFunction getTrans threadPool, RemoteStoreNodeAttribute.getRemoteStoreTranslogRepo(indexSettings.getNodeSettings()), remoteStoreStatsTrackerFactory.getRemoteTranslogTransferTracker(shardRouting.shardId()), - remoteStoreSettings + remoteStoreSettings, + RemoteStoreUtils.isServerSideEncryptionEnabledIndex(indexSettings.getIndexMetadata()) ); } return new InternalTranslogFactory(); diff --git a/server/src/main/java/org/opensearch/indices/RemoteStoreSettings.java b/server/src/main/java/org/opensearch/indices/RemoteStoreSettings.java index 1f09af234ae30..44647f020e085 100644 --- a/server/src/main/java/org/opensearch/indices/RemoteStoreSettings.java +++ b/server/src/main/java/org/opensearch/indices/RemoteStoreSettings.java @@ -184,6 +184,16 @@ public class RemoteStoreSettings { Property.Final ); + /** + * Controls the ServerSideEncryption Settings. + */ + public static final Setting CLUSTER_SERVER_SIDE_ENCRYPTION_ENABLED = Setting.boolSetting( + "cluster.remote_store.server_side_encryption", + true, + Setting.Property.NodeScope, + Setting.Property.Dynamic + ); + private volatile TimeValue clusterRemoteTranslogBufferInterval; private volatile int minRemoteSegmentMetadataFiles; private volatile TimeValue clusterRemoteTranslogTransferTimeout; @@ -191,6 +201,7 @@ public class RemoteStoreSettings { private volatile RemoteStoreEnums.PathType pathType; private volatile RemoteStoreEnums.PathHashAlgorithm pathHashAlgorithm; private volatile int maxRemoteTranslogReaders; + private volatile boolean isClusterServerSideEncryptionRepoEnabled; private volatile boolean isTranslogMetadataEnabled; private static volatile boolean isPinnedTimestampsEnabled; private static volatile TimeValue pinnedTimestampsSchedulerInterval; @@ -235,6 +246,9 @@ public RemoteStoreSettings(Settings settings, ClusterSettings clusterSettings) { this::setClusterRemoteSegmentTransferTimeout ); + isClusterServerSideEncryptionRepoEnabled = CLUSTER_SERVER_SIDE_ENCRYPTION_ENABLED.get(settings); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_SERVER_SIDE_ENCRYPTION_ENABLED, this::setClusterServerSideEncryptionEnabled); + pinnedTimestampsSchedulerInterval = CLUSTER_REMOTE_STORE_PINNED_TIMESTAMP_SCHEDULER_INTERVAL.get(settings); pinnedTimestampsLookbackInterval = CLUSTER_REMOTE_STORE_PINNED_TIMESTAMP_LOOKBACK_INTERVAL.get(settings); isPinnedTimestampsEnabled = CLUSTER_REMOTE_STORE_PINNED_TIMESTAMP_ENABLED.get(settings); @@ -309,6 +323,14 @@ private void setMaxRemoteTranslogReaders(int maxRemoteTranslogReaders) { this.maxRemoteTranslogReaders = maxRemoteTranslogReaders; } + public boolean isClusterServerSideEncryptionEnabled() { + return isClusterServerSideEncryptionRepoEnabled; + } + + private void setClusterServerSideEncryptionEnabled(boolean clusterServerSideEncryptionEnabled) { + isClusterServerSideEncryptionRepoEnabled = clusterServerSideEncryptionEnabled; + } + public static TimeValue getPinnedTimestampsSchedulerInterval() { return pinnedTimestampsSchedulerInterval; } 
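The RemoteStoreSettings hunk above introduces the dynamic cluster.remote_store.server_side_encryption node setting and its isClusterServerSideEncryptionEnabled() accessor. As a rough illustrative sketch only (not part of this change), the snippet below shows how the flag would be read, mirroring the unit tests added further down in this diff; it assumes the new setting is registered with ClusterSettings.BUILT_IN_CLUSTER_SETTINGS, which is not visible in this hunk.

    import org.opensearch.common.settings.ClusterSettings;
    import org.opensearch.common.settings.Settings;
    import org.opensearch.indices.RemoteStoreSettings;

    // Node-level settings with server-side encryption set explicitly (the setting defaults to true).
    Settings nodeSettings = Settings.builder()
        .put(RemoteStoreSettings.CLUSTER_SERVER_SIDE_ENCRYPTION_ENABLED.getKey(), true)
        .build();
    ClusterSettings clusterSettings = new ClusterSettings(nodeSettings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
    RemoteStoreSettings remoteStoreSettings = new RemoteStoreSettings(nodeSettings, clusterSettings);

    // Read the dynamic flag; index creation consults this (together with the repository's
    // isSeverSideEncryptionEnabled()) and the resulting per-index flag later selects the
    // server-side encrypted BlobStore via BlobStoreRepository#blobStore(boolean).
    boolean sseEnabled = remoteStoreSettings.isClusterServerSideEncryptionEnabled();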
diff --git a/server/src/main/java/org/opensearch/indices/replication/CompositeStoreDirectoryStatsWrapper.java b/server/src/main/java/org/opensearch/indices/replication/CompositeStoreDirectoryStatsWrapper.java index 717cf4970663d..745fc8a35ac51 100644 --- a/server/src/main/java/org/opensearch/indices/replication/CompositeStoreDirectoryStatsWrapper.java +++ b/server/src/main/java/org/opensearch/indices/replication/CompositeStoreDirectoryStatsWrapper.java @@ -8,6 +8,7 @@ package org.opensearch.indices.replication; +import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.opensearch.index.engine.exec.FileMetadata; import org.opensearch.index.store.CompositeRemoteSegmentStoreDirectory; @@ -26,15 +27,21 @@ * * @opensearch.internal */ -public final class CompositeStoreDirectoryStatsWrapper { +public final class CompositeStoreDirectoryStatsWrapper extends SegmentReplicationSource.ReplicationStatsDirectoryWrapper { private final CompositeStoreDirectory delegate; private final BiConsumer fileProgressTracker; public CompositeStoreDirectoryStatsWrapper(CompositeStoreDirectory delegate, BiConsumer fileProgressTracker) { + super(delegate, fileProgressTracker); this.delegate = delegate; this.fileProgressTracker = fileProgressTracker; } + @Override + public void copyFrom(Directory from, String src, String dest, IOContext context) throws IOException { + copyFrom(new FileMetadata(src), (RemoteSegmentStoreDirectory) from, context); + } + /** * Copies a file from source directory with format-agnostic progress tracking. * This method is format-aware and uses callback-based progress tracking instead of FilterDirectory. @@ -76,7 +83,7 @@ public void copyFrom(CompositeRemoteSegmentStoreDirectory from, String src, Stri /** * Gets the underlying CompositeStoreDirectory for direct access when needed. */ - public CompositeStoreDirectory getDelegate() { + public CompositeStoreDirectory getCompositeStoreDirectory() { return delegate; } diff --git a/server/src/main/java/org/opensearch/indices/replication/RemoteStoreReplicationSource.java b/server/src/main/java/org/opensearch/indices/replication/RemoteStoreReplicationSource.java index 7627b27e46a81..59090b3207227 100644 --- a/server/src/main/java/org/opensearch/indices/replication/RemoteStoreReplicationSource.java +++ b/server/src/main/java/org/opensearch/indices/replication/RemoteStoreReplicationSource.java @@ -30,10 +30,7 @@ import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint; import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @@ -139,8 +136,12 @@ public void getSegmentFiles( } logger.debug("Downloading format-aware segment files from remote store {}", filesToFetch); if (remoteMetadataExists()) { - final CompositeStoreDirectory storeDirectory = indexShard.store().compositeStoreDirectory(); - final List directoryFiles = List.of(storeDirectory.listFileMetadata()); + final Directory storeDirectory = indexShard.isOptimizedIndex() + ? indexShard.store().compositeStoreDirectory() + : indexShard.store().directory(); + final List directoryFiles = Arrays.stream(storeDirectory.listAll()).map( + file -> indexShard.isOptimizedIndex() ? 
new FileMetadata(file) : new FileMetadata("lucene", file) + ).collect(Collectors.toList()); final List toDownloadFileMetadata = new ArrayList<>(); @@ -155,6 +156,7 @@ public void getSegmentFiles( if (directoryFiles.contains(fileMetadata)) { logger.info("ReplicationCheckpoint: {}, filesToFetch: {}", checkpoint.getSegmentInfosVersion(), filesToFetch); logger.info(directoryFiles); + continue; } assert directoryFiles.contains(fileMetadata) == false : "Local store already contains the file " + fileMetadata; @@ -164,7 +166,16 @@ public void getSegmentFiles( } // Use CompositeStoreDirectory with format-aware progress tracking - final CompositeStoreDirectoryStatsWrapper statsWrapper = new CompositeStoreDirectoryStatsWrapper(storeDirectory, fileProgressTracker); + final ReplicationStatsDirectoryWrapper statsWrapper = indexShard.isOptimizedIndex() + ? new CompositeStoreDirectoryStatsWrapper((CompositeStoreDirectory) storeDirectory, fileProgressTracker) + : new ReplicationStatsDirectoryWrapper(storeDirectory, fileProgressTracker); + + // After the for loop that builds toDownloadFileMetadata + if (toDownloadFileMetadata.isEmpty()) { + logger.debug("All files already exist locally, skipping download"); + listener.onResponse(new GetSegmentFilesResponse(filesToFetch)); + return; + } indexShard.getFileDownloader() .downloadAsync( diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationSource.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationSource.java index 1519720d08bf9..0fea6f0bd36eb 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationSource.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationSource.java @@ -93,7 +93,7 @@ default void cancel() {} * * @opensearch.internal */ - final class ReplicationStatsDirectoryWrapper extends FilterDirectory { + class ReplicationStatsDirectoryWrapper extends FilterDirectory { private final BiConsumer fileProgressTracker; ReplicationStatsDirectoryWrapper(Directory in, BiConsumer fileProgressTracker) { diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java index 6878f9eefb98f..0e9fe6124bf2e 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java @@ -20,6 +20,8 @@ import org.opensearch.common.util.CancellableThreads; import org.opensearch.core.common.io.stream.BytesStreamInput; import org.opensearch.index.engine.exec.coord.CatalogSnapshot; +import org.opensearch.index.engine.exec.coord.CompositeEngineCatalogSnapshot; +import org.opensearch.index.engine.exec.coord.SegmentInfosCatalogSnapshot; import org.opensearch.index.shard.IndexShard; import org.opensearch.index.store.Store; import org.opensearch.index.store.StoreFileMetadata; @@ -27,7 +29,6 @@ import org.opensearch.indices.replication.common.ReplicationFailedException; import org.opensearch.indices.replication.common.ReplicationListener; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.util.List; import java.util.function.BiConsumer; @@ -40,6 +41,8 @@ public class SegmentReplicationTarget extends AbstractSegmentReplicationTarget { public final static String REPLICATION_PREFIX = "replication."; + private final IndexShard indexShard; + public SegmentReplicationTarget( IndexShard indexShard, 
ReplicationCheckpoint checkpoint, @@ -47,6 +50,7 @@ public SegmentReplicationTarget( ReplicationListener listener ) { super("replication_target", indexShard, checkpoint, source, listener); + this.indexShard = indexShard; } @Override @@ -91,7 +95,16 @@ protected void finalizeReplication(CheckpointInfoResponse checkpointInfoResponse store = store(); store.incRef(); multiFileWriter.renameAllTempFiles(); - final CatalogSnapshot catalogSnapshot = deserializeCatalogSnapshot(checkpointInfoResponse.getInfosBytes()); + CatalogSnapshot catalogSnapshot = null; + final SegmentInfos infos = store.buildSegmentInfos( + checkpointInfoResponse.getInfosBytes(), + checkpointInfoResponse.getCheckpoint().getSegmentsGen() + ); + if (!indexShard.isOptimizedIndex()) { + catalogSnapshot = new SegmentInfosCatalogSnapshot(infos); + } else { + catalogSnapshot = CompositeEngineCatalogSnapshot.deserializeFromString(infos.getUserData().get(CompositeEngineCatalogSnapshot.CATALOG_SNAPSHOT_KEY)); + } indexShard.finalizeReplication(catalogSnapshot, checkpointInfoResponse.getCheckpoint()); } catch (CorruptIndexException | IndexFormatTooNewException | IndexFormatTooOldException ex) { // this is a fatal exception at this stage. @@ -142,7 +155,7 @@ public SegmentReplicationTarget retryCopy() { */ private CatalogSnapshot deserializeCatalogSnapshot(byte[] infoBytes) throws IOException { try (BytesStreamInput in = new BytesStreamInput(infoBytes)) { - return new CatalogSnapshot(in); + return new CompositeEngineCatalogSnapshot(in); } } } diff --git a/server/src/main/java/org/opensearch/indices/replication/checkpoint/RemoteStorePublishMergedSegmentAction.java b/server/src/main/java/org/opensearch/indices/replication/checkpoint/RemoteStorePublishMergedSegmentAction.java index f11c8ebfcc754..baa358030a39d 100644 --- a/server/src/main/java/org/opensearch/indices/replication/checkpoint/RemoteStorePublishMergedSegmentAction.java +++ b/server/src/main/java/org/opensearch/indices/replication/checkpoint/RemoteStorePublishMergedSegmentAction.java @@ -208,6 +208,6 @@ public void onFailure(FileMetadata file) { } private RemoteStoreUploader getRemoteStoreUploaderService(IndexShard indexShard) { - return new RemoteStoreUploaderService(indexShard, indexShard.store().compositeStoreDirectory(), indexShard.getRemoteDirectory()); + return new RemoteStoreUploaderService(indexShard, indexShard.store().directory(), indexShard.getRemoteDirectory(), indexShard.isOptimizedIndex()); } } diff --git a/server/src/main/java/org/opensearch/node/Node.java b/server/src/main/java/org/opensearch/node/Node.java index 165dfa7da0fdc..4083484648bca 100644 --- a/server/src/main/java/org/opensearch/node/Node.java +++ b/server/src/main/java/org/opensearch/node/Node.java @@ -171,7 +171,6 @@ import org.opensearch.index.remote.RemoteStoreStatsTrackerFactory; import org.opensearch.index.store.DefaultCompositeDirectoryFactory; import org.opensearch.index.store.IndexStoreListener; -import org.opensearch.index.store.CompositeRemoteSegmentStoreDirectoryFactory; import org.opensearch.index.store.RemoteSegmentStoreDirectoryFactory; import org.opensearch.index.store.remote.filecache.FileCache; import org.opensearch.index.store.remote.filecache.FileCacheCleaner; @@ -970,7 +969,7 @@ protected Node(final Environment initialEnvironment, Collection clas final CompositeIndexSettings compositeIndexSettings = new CompositeIndexSettings(settings, settingsModule.getClusterSettings()); - final IndexStorePlugin.DirectoryFactory remoteDirectoryFactory = new 
CompositeRemoteSegmentStoreDirectoryFactory( + final IndexStorePlugin.DirectoryFactory remoteDirectoryFactory = new RemoteSegmentStoreDirectoryFactory( repositoriesServiceReference::get, threadPool, remoteStoreSettings.getSegmentsPathFixedPrefix(), diff --git a/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreProvider.java b/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreProvider.java new file mode 100644 index 0000000000000..210ef8058d0f8 --- /dev/null +++ b/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreProvider.java @@ -0,0 +1,99 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.repositories.blobstore; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.cluster.metadata.RepositoryMetadata; +import org.opensearch.common.SetOnce; +import org.opensearch.common.blobstore.BlobStore; +import org.opensearch.common.blobstore.EncryptedBlobStore; +import org.opensearch.common.lifecycle.Lifecycle; +import org.opensearch.repositories.RepositoryException; + +/** + * Provider for the BlobStore instances used by a BlobStoreRepository, including the server-side encrypted variant + * + * @opensearch.internal + */ +public class BlobStoreProvider { + private static final Logger logger = LogManager.getLogger(BlobStoreProvider.class); + protected final Lifecycle lifecycle; + protected final RepositoryMetadata metadata; + protected final Object lock; + protected final BlobStoreRepository repository; + private final SetOnce<BlobStore> blobStore = new SetOnce<>(); + private final SetOnce<BlobStore> serverSideEncryptedBlobStore = new SetOnce<>(); + + public BlobStoreProvider(BlobStoreRepository repository, RepositoryMetadata metadata, Lifecycle lifecycle, Object lock) { + this.lifecycle = lifecycle; + this.metadata = metadata; + this.lock = lock; + this.repository = repository; + } + + protected BlobStore blobStore(boolean serverSideEncryptionEnabled) { + if (serverSideEncryptionEnabled) { + return createBlobStore(serverSideEncryptedBlobStore, true); + } + return createBlobStore(blobStore, false); + } + + protected BlobStore createBlobStore(SetOnce<BlobStore> blobStore, boolean serverSideEncryption) { + // assertSnapshotOrGenericThread(); + BlobStore store = blobStore.get(); + logger.debug("blob store fetched = " + store); + if (store == null) { + synchronized (lock) { + store = blobStore.get(); + if (store == null) { + store = initBlobStore(); + if (!serverSideEncryption && metadata.cryptoMetadata() != null) { + store = new EncryptedBlobStore(store, metadata.cryptoMetadata()); + } + blobStore.set(store); + } + } + } + return store; + } + + public BlobStore getBlobStore(boolean serverSideEncryptionEnabled) { + if (serverSideEncryptionEnabled) { + return serverSideEncryptedBlobStore.get(); + } + return blobStore.get(); + } + + protected BlobStore initBlobStore() { + if (lifecycle.started() == false) { + throw new RepositoryException(metadata.name(), "repository is not in started state: " + lifecycle.state()); + } + try { + return repository.createBlobStore(); + } catch (RepositoryException e) { + throw e; + } catch (Exception e) { + throw new RepositoryException(metadata.name(), "cannot create blob store", e); + } + } + + public void close() { + try { + if (blobStore.get() != null) { + blobStore.get().close(); + } + if (serverSideEncryptedBlobStore.get() != null) { + serverSideEncryptedBlobStore.get().close(); + } + } 
catch (Exception t) { + logger.warn("cannot close blob store", t); + } + } +} diff --git a/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java b/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java index 1b6aa3df2bc8a..fa2c9c247439a 100644 --- a/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java +++ b/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java @@ -74,7 +74,6 @@ import org.opensearch.common.blobstore.BlobPath; import org.opensearch.common.blobstore.BlobStore; import org.opensearch.common.blobstore.DeleteResult; -import org.opensearch.common.blobstore.EncryptedBlobStore; import org.opensearch.common.blobstore.fs.FsBlobContainer; import org.opensearch.common.blobstore.transfer.stream.OffsetRangeInputStream; import org.opensearch.common.blobstore.transfer.stream.RateLimitingOffsetRangeInputStream; @@ -567,7 +566,7 @@ protected static long calculateMaxWithinIntLimit(long defaultThresholdOfHeap, lo private final SetOnce<BlobContainer> snapshotShardPathBlobContainer = new SetOnce<>(); - private final SetOnce<BlobStore> blobStore = new SetOnce<>(); + private final SetOnce<BlobStoreProvider> blobStoreProvider = new SetOnce<>(); protected final ClusterService clusterService; @@ -683,19 +682,20 @@ protected void doStop() {} @Override protected void doClose() { - BlobStore store; + BlobStoreProvider provider = null; // to close blobStore if blobStore initialization is started during close synchronized (lock) { - store = blobStore.get(); - } - if (store != null) { - try { - closed = true; - store.close(); - } catch (Exception t) { - logger.warn("cannot close blob store", t); + provider = blobStoreProvider.get(); + if (provider != null) { + try { + provider.close(); + closed = true; + } catch (Exception t) { + logger.warn("cannot close blob store", t); + } } } + } @Override @@ -983,7 +983,15 @@ public SetOnce<BlobContainer> getSnapshotShardPathBlobContainer() { // for test purposes only protected BlobStore getBlobStore() { - return blobStore.get(); + return getBlobStore(false); + } + + BlobStore getBlobStore(boolean isServerSideEncryptionEnabled) { + BlobStoreProvider provider = blobStoreProvider.get(); + if (provider != null) { + return provider.getBlobStore(isServerSideEncryptionEnabled); + } + return null; } boolean getPrefixModeVerification() { @@ -1052,29 +1060,35 @@ protected BlobContainer snapshotShardPathBlobContainer() { * Public for testing. */ public BlobStore blobStore() { - BlobStore store = blobStore.get(); - if (store == null) { + return blobStore(false); + } + + /** + * Variant of blobStore() that lets repositories which support it return a server-side encrypted blob store. + * @param serverSideEncryptionEnabled whether the server-side encrypted blob store should be returned + * @return the BlobStore for the repository + */ + public BlobStore blobStore(boolean serverSideEncryptionEnabled) { + BlobStoreProvider provider = this.blobStoreProvider.get(); + if (provider == null) { synchronized (lock) { - store = blobStore.get(); - if (store == null) { - if (lifecycle.started() == false) { - throw new RepositoryException(metadata.name(), "repository is not in started state"); - } - try { - store = createBlobStore(); - if (metadata.cryptoMetadata() != null) { - store = new EncryptedBlobStore(store, metadata.cryptoMetadata()); - } - } catch (RepositoryException e) { - throw e; - } catch (Exception e) { - throw new RepositoryException(metadata.name(), "cannot create blob store", e); - } - blobStore.set(store); + provider = this.blobStoreProvider.get(); + if (provider == null) { + provider = new BlobStoreProvider(this, metadata, lifecycle, lock); + this.blobStoreProvider.set(provider); } } } - return store; + return provider.blobStore(serverSideEncryptionEnabled); + } + + /** + * Indicates whether this repository supports server-side encryption; repository implementations that do should override this. + * @return true if server-side encryption is enabled for this repository type, false otherwise + */ + public boolean isSeverSideEncryptionEnabled() { + return false; } /** @@ -1122,13 +1136,40 @@ public Compressor getCompressor() { @Override public RepositoryStats stats() { - final BlobStore store = blobStore.get(); - if (store == null) { + BlobStore store = getBlobStore(false); + BlobStore serverSideEncryptedStore = getBlobStore(true); + + if (store == null && serverSideEncryptedStore == null) { return RepositoryStats.EMPTY_STATS; - } else if (store.extendedStats() != null && store.extendedStats().isEmpty() == false) { + } + + RepositoryStats extendedStoreStats = getExtendedStats(store); + RepositoryStats extendedSseStoreStats = getExtendedStats(serverSideEncryptedStore); + + if (extendedStoreStats != null && extendedSseStoreStats != null) { + return extendedStoreStats.merge(extendedSseStoreStats); + } else if (extendedStoreStats != null) { + return extendedStoreStats; + } else if (extendedSseStoreStats != null) { + return extendedSseStoreStats; + } + + RepositoryStats storeStats = store != null ? new RepositoryStats(store.stats()) : null; + RepositoryStats sseStoreStats = serverSideEncryptedStore != null ? 
new RepositoryStats(serverSideEncryptedStore.stats()) : null; + + if (storeStats != null && sseStoreStats != null) { + return storeStats.merge(sseStoreStats); + } else if (storeStats == null) { + return sseStoreStats; + } + return storeStats; + } + + private RepositoryStats getExtendedStats(BlobStore store) { + if (store != null && store.extendedStats() != null && store.extendedStats().isEmpty() == false) { return new RepositoryStats(store.extendedStats(), true); } - return new RepositoryStats(store.stats()); + return null; } public void deleteSnapshotsInternal( @@ -2394,7 +2435,8 @@ private void remoteTranslogCleanupAsync( remoteTranslogTransferTracker, remoteStorePathStrategy, remoteStoreSettings, - indexMetadataEnabled + indexMetadataEnabled, + false ); try { RemoteFsTimestampAwareTranslog.cleanupOfDeletedIndex(translogTransferManager, forceClean); @@ -4518,7 +4560,7 @@ public void verify(String seed, DiscoveryNode localNode) { @Override public String toString() { - return "BlobStoreRepository[" + "[" + metadata.name() + "], [" + blobStore.get() + ']' + ']'; + return "BlobStoreRepository[" + "[" + metadata.name() + "], [" + blobStoreProvider.get() + ']' + ']'; } /** diff --git a/server/src/main/java/org/opensearch/snapshots/RestoreService.java b/server/src/main/java/org/opensearch/snapshots/RestoreService.java index 3a333abe5b59f..ef4d0df2e9636 100644 --- a/server/src/main/java/org/opensearch/snapshots/RestoreService.java +++ b/server/src/main/java/org/opensearch/snapshots/RestoreService.java @@ -469,7 +469,7 @@ public ClusterState execute(ClusterState currentState) { .put(snapshotIndexMetadata.getSettings()) .put(IndexMetadata.SETTING_INDEX_UUID, UUIDs.randomBase64UUID()) ); - createIndexService.addRemoteStoreCustomMetadata(indexMdBuilder, false); + createIndexService.addRemoteStoreCustomMetadata(indexMdBuilder, false, currentState); shardLimitValidator.validateShardLimit( renamedIndexName, snapshotIndexMetadata.getSettings(), diff --git a/server/src/test/java/org/opensearch/cluster/metadata/MetadataCreateIndexServiceTests.java b/server/src/test/java/org/opensearch/cluster/metadata/MetadataCreateIndexServiceTests.java index 785ba35afaf4d..dca505d1104b8 100644 --- a/server/src/test/java/org/opensearch/cluster/metadata/MetadataCreateIndexServiceTests.java +++ b/server/src/test/java/org/opensearch/cluster/metadata/MetadataCreateIndexServiceTests.java @@ -132,6 +132,8 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import org.mockito.Mockito; + import static java.util.Collections.emptyList; import static java.util.Collections.emptyMap; import static java.util.Collections.singleton; @@ -174,6 +176,7 @@ import static org.opensearch.node.remotestore.RemoteStoreNodeAttribute.REMOTE_STORE_CLUSTER_STATE_REPOSITORY_NAME_ATTRIBUTE_KEY; import static org.opensearch.node.remotestore.RemoteStoreNodeAttribute.REMOTE_STORE_SEGMENT_REPOSITORY_NAME_ATTRIBUTE_KEY; import static org.opensearch.node.remotestore.RemoteStoreNodeAttribute.REMOTE_STORE_TRANSLOG_REPOSITORY_NAME_ATTRIBUTE_KEY; +import static org.opensearch.node.remotestore.RemoteStoreNodeAttribute.getRemoteStoreSegmentRepo; import static org.opensearch.node.remotestore.RemoteStoreNodeAttribute.getRemoteStoreTranslogRepo; import static org.opensearch.node.remotestore.RemoteStoreNodeService.MIGRATION_DIRECTION_SETTING; import static org.opensearch.node.remotestore.RemoteStoreNodeService.REMOTE_STORE_COMPATIBILITY_MODE_SETTING; @@ -1821,7 +1824,12 @@ private IndexMetadata testRemoteCustomData(boolean remoteStoreEnabled, 
PathType .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1) .build(); - IndexMetadata indexMetadata = metadataCreateIndexService.buildAndValidateTemporaryIndexMetadata(indexSettings, request, 0); + IndexMetadata indexMetadata = metadataCreateIndexService.buildAndValidateTemporaryIndexMetadata( + indexSettings, + request, + 0, + clusterService.state() + ); threadPool.shutdown(); return indexMetadata; } @@ -1861,7 +1869,8 @@ public void testNumberOfRoutingShardsShowsInIndexSettings() { IndexMetadata indexMetadata = checkerService.buildAndValidateTemporaryIndexMetadata( indexSettings, request, - routingNumberOfShards + routingNumberOfShards, + clusterService.state() ); assertEquals(INDEX_NUMBER_OF_ROUTING_SHARDS_SETTING.get(indexMetadata.getSettings()).intValue(), routingNumberOfShards); })); @@ -2615,6 +2624,308 @@ public void testIndexTotalPrimaryShardsPerNodeSettingValidationWithoutRemoteStor ); } + public void testAddRemoteStoreCustomMetadata() { + Settings clusterSettingsSetting = Settings.builder() + .put(RemoteStoreSettings.CLUSTER_SERVER_SIDE_ENCRYPTION_ENABLED.getKey(), true) + .put(REMOTE_STORE_COMPATIBILITY_MODE_SETTING.getKey(), RemoteStoreNodeService.CompatibilityMode.STRICT) + .build(); + clusterSettings = new ClusterSettings(clusterSettingsSetting, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + + Settings settings = Settings.builder() + .put("node.attr.remote_store.segment.repository", "my-segment-repo-1") + .put("node.attr.remote_store.translog.repository", "my-translog-repo-1") + .build(); + + BlobStoreRepository repositoryMock = mock(BlobStoreRepository.class); + when(repositoryMock.blobStore()).thenReturn(mock(BlobStore.class)); + when(repositoryMock.isSeverSideEncryptionEnabled()).thenReturn(true); + + BlobStore blobStoreMock = mock(BlobStore.class); + when(repositoryMock.blobStore()).thenReturn(blobStoreMock); + when(blobStoreMock.isBlobMetadataEnabled()).thenReturn(randomBoolean()); + + when(repositoriesServiceSupplier.get()).thenReturn(repositoriesService); + when(repositoriesService.repository(getRemoteStoreTranslogRepo(settings))).thenReturn(repositoryMock); + when(repositoriesService.repository(getRemoteStoreSegmentRepo(settings))).thenReturn(repositoryMock); + when(repositoriesService.repository(Mockito.any())).thenReturn(repositoryMock); + + Map attributes = getNodeAttributes(); + DiscoveryNode remoteNode = new DiscoveryNode( + UUIDs.base64UUID(), + buildNewFakeTransportAddress(), + attributes, + DiscoveryNodeRole.BUILT_IN_ROLES, + Version.CURRENT + ); + ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(DiscoveryNodes.builder().add(remoteNode).build()) + .build(); + ClusterService clusterService = mock(ClusterService.class); + when(clusterService.state()).thenReturn(clusterState); + + Mockito.when(clusterService.getClusterSettings()).thenReturn(clusterSettings); + MetadataCreateIndexService checkerService = new MetadataCreateIndexService( + settings, + clusterService, + indicesServices, + null, + null, + createTestShardLimitService(randomIntBetween(1, 1000), false, clusterService), + null, + null, + null, + null, + new SystemIndices(Collections.emptyMap()), + false, + new AwarenessReplicaBalance(Settings.EMPTY, clusterService.getClusterSettings()), + DefaultRemoteStoreSettings.INSTANCE, + repositoriesServiceSupplier + ); + + Settings indexSettings = Settings.builder() + .put(SETTING_VERSION_CREATED, Version.CURRENT) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1) + .build(); + + 
IndexMetadata.Builder imdBuilder = IndexMetadata.builder("test").settings(indexSettings); + checkerService.addRemoteStoreCustomMetadata(imdBuilder, true, clusterState); + + assertNotNull(imdBuilder.build().getCustomData()); + Map remoteCustomData = imdBuilder.build().getCustomData().get(IndexMetadata.REMOTE_STORE_CUSTOM_KEY); + assertNotNull(remoteCustomData); + assertTrue(Boolean.valueOf(remoteCustomData.get(IndexMetadata.REMOTE_STORE_SSE_ENABLED_INDEX_KEY))); + } + + public void testAddRemoteStoreCustomMetadata_WhenSSEDisabled() { + Settings clusterSettingsSetting = Settings.builder() + .put(RemoteStoreSettings.CLUSTER_SERVER_SIDE_ENCRYPTION_ENABLED.getKey(), true) + .put(REMOTE_STORE_COMPATIBILITY_MODE_SETTING.getKey(), RemoteStoreNodeService.CompatibilityMode.STRICT) + .build(); + clusterSettings = new ClusterSettings(clusterSettingsSetting, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + + Settings settings = Settings.builder() + .put("node.attr.remote_store.segment.repository", "my-segment-repo-1") + .put("node.attr.remote_store.translog.repository", "my-translog-repo-1") + .build(); + + BlobStoreRepository repositoryMock = mock(BlobStoreRepository.class); + when(repositoryMock.blobStore()).thenReturn(mock(BlobStore.class)); + when(repositoryMock.isSeverSideEncryptionEnabled()).thenReturn(false); + + BlobStore blobStoreMock = mock(BlobStore.class); + when(repositoryMock.blobStore()).thenReturn(blobStoreMock); + when(blobStoreMock.isBlobMetadataEnabled()).thenReturn(randomBoolean()); + + when(repositoriesServiceSupplier.get()).thenReturn(repositoriesService); + when(repositoriesService.repository(getRemoteStoreTranslogRepo(settings))).thenReturn(repositoryMock); + when(repositoriesService.repository(getRemoteStoreSegmentRepo(settings))).thenReturn(repositoryMock); + when(repositoriesService.repository(Mockito.any())).thenReturn(repositoryMock); + + Map attributes = getNodeAttributes(); + DiscoveryNode remoteNode = new DiscoveryNode( + UUIDs.base64UUID(), + buildNewFakeTransportAddress(), + attributes, + DiscoveryNodeRole.BUILT_IN_ROLES, + Version.CURRENT + ); + ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(DiscoveryNodes.builder().add(remoteNode).build()) + .build(); + ClusterService clusterService = mock(ClusterService.class); + when(clusterService.state()).thenReturn(clusterState); + + Mockito.when(clusterService.getClusterSettings()).thenReturn(clusterSettings); + MetadataCreateIndexService checkerService = new MetadataCreateIndexService( + settings, + clusterService, + indicesServices, + null, + null, + createTestShardLimitService(randomIntBetween(1, 1000), false, clusterService), + null, + null, + null, + null, + new SystemIndices(Collections.emptyMap()), + false, + new AwarenessReplicaBalance(Settings.EMPTY, clusterService.getClusterSettings()), + DefaultRemoteStoreSettings.INSTANCE, + repositoriesServiceSupplier + ); + + Settings indexSettings = Settings.builder() + .put(SETTING_VERSION_CREATED, Version.CURRENT) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1) + .build(); + + IndexMetadata.Builder imdBuilder = IndexMetadata.builder("test").settings(indexSettings); + checkerService.addRemoteStoreCustomMetadata(imdBuilder, true, clusterState); + + assertNotNull(imdBuilder.build().getCustomData()); + Map remoteCustomData = imdBuilder.build().getCustomData().get(IndexMetadata.REMOTE_STORE_CUSTOM_KEY); + assertNotNull(remoteCustomData); + 
assertNull(remoteCustomData.get(IndexMetadata.REMOTE_STORE_SSE_ENABLED_INDEX_KEY)); + } + + public void testAddRemoteStoreCustomMetadata_ForSnapshotRestore() { + Settings clusterSettingsSetting = Settings.builder() + .put(RemoteStoreSettings.CLUSTER_SERVER_SIDE_ENCRYPTION_ENABLED.getKey(), true) + .put(REMOTE_STORE_COMPATIBILITY_MODE_SETTING.getKey(), RemoteStoreNodeService.CompatibilityMode.STRICT) + .build(); + clusterSettings = new ClusterSettings(clusterSettingsSetting, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + + Settings settings = Settings.builder() + .put("node.attr.remote_store.segment.repository", "my-segment-repo-1") + .put("node.attr.remote_store.translog.repository", "my-translog-repo-1") + .build(); + + BlobStoreRepository repositoryMock = mock(BlobStoreRepository.class); + when(repositoryMock.blobStore()).thenReturn(mock(BlobStore.class)); + + BlobStore blobStoreMock = mock(BlobStore.class); + when(repositoryMock.blobStore()).thenReturn(blobStoreMock); + when(blobStoreMock.isBlobMetadataEnabled()).thenReturn(randomBoolean()); + + when(repositoriesServiceSupplier.get()).thenReturn(repositoriesService); + when(repositoriesService.repository(Mockito.any())).thenReturn(repositoryMock); + + Map attributes = getNodeAttributes(); + DiscoveryNode remoteNode = new DiscoveryNode( + UUIDs.base64UUID(), + buildNewFakeTransportAddress(), + attributes, + DiscoveryNodeRole.BUILT_IN_ROLES, + Version.CURRENT + ); + ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(DiscoveryNodes.builder().add(remoteNode).build()) + .build(); + ClusterService clusterService = mock(ClusterService.class); + when(clusterService.state()).thenReturn(clusterState); + + Mockito.when(clusterService.getClusterSettings()).thenReturn(clusterSettings); + MetadataCreateIndexService checkerService = new MetadataCreateIndexService( + settings, + clusterService, + indicesServices, + null, + null, + createTestShardLimitService(randomIntBetween(1, 1000), false, clusterService), + null, + null, + null, + null, + new SystemIndices(Collections.emptyMap()), + false, + new AwarenessReplicaBalance(Settings.EMPTY, clusterService.getClusterSettings()), + DefaultRemoteStoreSettings.INSTANCE, + repositoriesServiceSupplier + ); + + Settings indexSettings = Settings.builder() + .put(SETTING_VERSION_CREATED, Version.CURRENT) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1) + .build(); + + Map remoteCustomData = new HashMap<>(); + remoteCustomData.put(IndexMetadata.REMOTE_STORE_SSE_ENABLED_INDEX_KEY, "true"); + IndexMetadata.Builder imdBuilder = IndexMetadata.builder("test").settings(indexSettings); + imdBuilder.putCustom(IndexMetadata.REMOTE_STORE_CUSTOM_KEY, remoteCustomData); + checkerService.addRemoteStoreCustomMetadata(imdBuilder, false, clusterState); + + assertNotNull(imdBuilder.build().getCustomData()); + Map finalCustomData = imdBuilder.build().getCustomData().get(IndexMetadata.REMOTE_STORE_CUSTOM_KEY); + assertNotNull(finalCustomData); + assertEquals("true", finalCustomData.get(IndexMetadata.REMOTE_STORE_SSE_ENABLED_INDEX_KEY)); + } + + public void testAddRemoteStoreCustomMetadata_ForSnapshotRestore_WhenSSE_False() { + Settings clusterSettingsSetting = Settings.builder() + .put(RemoteStoreSettings.CLUSTER_SERVER_SIDE_ENCRYPTION_ENABLED.getKey(), true) + .put(REMOTE_STORE_COMPATIBILITY_MODE_SETTING.getKey(), RemoteStoreNodeService.CompatibilityMode.STRICT) + .build(); + clusterSettings = new ClusterSettings(clusterSettingsSetting, 
ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + + Settings settings = Settings.builder() + .put("node.attr.remote_store.segment.repository", "my-segment-repo-1") + .put("node.attr.remote_store.translog.repository", "my-translog-repo-1") + .build(); + + BlobStoreRepository repositoryMock = mock(BlobStoreRepository.class); + when(repositoryMock.blobStore()).thenReturn(mock(BlobStore.class)); + + BlobStore blobStoreMock = mock(BlobStore.class); + when(repositoryMock.blobStore()).thenReturn(blobStoreMock); + when(blobStoreMock.isBlobMetadataEnabled()).thenReturn(randomBoolean()); + + when(repositoriesServiceSupplier.get()).thenReturn(repositoriesService); + when(repositoriesService.repository(Mockito.any())).thenReturn(repositoryMock); + + Map attributes = getNodeAttributes(); + DiscoveryNode remoteNode = new DiscoveryNode( + UUIDs.base64UUID(), + buildNewFakeTransportAddress(), + attributes, + DiscoveryNodeRole.BUILT_IN_ROLES, + Version.CURRENT + ); + ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(DiscoveryNodes.builder().add(remoteNode).build()) + .build(); + ClusterService clusterService = mock(ClusterService.class); + when(clusterService.state()).thenReturn(clusterState); + + Mockito.when(clusterService.getClusterSettings()).thenReturn(clusterSettings); + MetadataCreateIndexService checkerService = new MetadataCreateIndexService( + settings, + clusterService, + indicesServices, + null, + null, + createTestShardLimitService(randomIntBetween(1, 1000), false, clusterService), + null, + null, + null, + null, + new SystemIndices(Collections.emptyMap()), + false, + new AwarenessReplicaBalance(Settings.EMPTY, clusterService.getClusterSettings()), + DefaultRemoteStoreSettings.INSTANCE, + repositoriesServiceSupplier + ); + + Settings indexSettings = Settings.builder() + .put(SETTING_VERSION_CREATED, Version.CURRENT) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1) + .build(); + + Map remoteCustomData = new HashMap<>(); + remoteCustomData.put(IndexMetadata.REMOTE_STORE_SSE_ENABLED_INDEX_KEY, "false"); + IndexMetadata.Builder imdBuilder = IndexMetadata.builder("test").settings(indexSettings); + imdBuilder.putCustom(IndexMetadata.REMOTE_STORE_CUSTOM_KEY, remoteCustomData); + checkerService.addRemoteStoreCustomMetadata(imdBuilder, false, clusterState); + + assertNotNull(imdBuilder.build().getCustomData()); + Map finalCustomData = imdBuilder.build().getCustomData().get(IndexMetadata.REMOTE_STORE_CUSTOM_KEY); + assertNotNull(finalCustomData); + assertEquals("false", finalCustomData.get(IndexMetadata.REMOTE_STORE_SSE_ENABLED_INDEX_KEY)); + } + + private static Map getNodeAttributes() { + String segmentRepositoryName = "my-segment-repo-1"; + Map attributes = new HashMap<>(); + + attributes.put(REMOTE_STORE_CLUSTER_STATE_REPOSITORY_NAME_ATTRIBUTE_KEY, "my-cluster-rep-1"); + attributes.put(REMOTE_STORE_SEGMENT_REPOSITORY_NAME_ATTRIBUTE_KEY, segmentRepositoryName); + attributes.put(REMOTE_STORE_TRANSLOG_REPOSITORY_NAME_ATTRIBUTE_KEY, "my-translog-repo-1"); + return attributes; + } + public void testIndexTotalPrimaryShardsPerNodeSettingValidationWithDefaultValue() { // Test case with default value (-1) without remote store (should succeed) Settings settings = Settings.builder().build(); diff --git a/server/src/test/java/org/opensearch/index/IndexModuleTests.java b/server/src/test/java/org/opensearch/index/IndexModuleTests.java index 3352534b2a770..f54d5d710afad 100644 --- 
a/server/src/test/java/org/opensearch/index/IndexModuleTests.java +++ b/server/src/test/java/org/opensearch/index/IndexModuleTests.java @@ -87,6 +87,7 @@ import org.opensearch.index.fielddata.IndexFieldDataCache; import org.opensearch.index.mapper.ParsedDocument; import org.opensearch.index.mapper.Uid; +import org.opensearch.index.remote.RemoteStoreUtils; import org.opensearch.index.remote.RemoteTranslogTransferTracker; import org.opensearch.index.shard.IndexEventListener; import org.opensearch.index.shard.IndexingOperationListener; @@ -239,7 +240,8 @@ private IndexService newIndexService(IndexModule module) throws IOException { threadPool, indexSettings.getRemoteStoreTranslogRepository(), new RemoteTranslogTransferTracker(shardRouting.shardId(), 10), - DefaultRemoteStoreSettings.INSTANCE + DefaultRemoteStoreSettings.INSTANCE, + RemoteStoreUtils.isServerSideEncryptionEnabledIndex(indexSettings.getIndexMetadata()) ); } return new InternalTranslogFactory(); diff --git a/server/src/test/java/org/opensearch/index/engine/exec/coord/IndexFileDeleterTests.java b/server/src/test/java/org/opensearch/index/engine/exec/coord/IndexFileDeleterTests.java index 3366d5839e928..d8a6fccdb63e2 100644 --- a/server/src/test/java/org/opensearch/index/engine/exec/coord/IndexFileDeleterTests.java +++ b/server/src/test/java/org/opensearch/index/engine/exec/coord/IndexFileDeleterTests.java @@ -40,8 +40,8 @@ public class IndexFileDeleterTests extends OpenSearchTestCase { private IndexFileDeleter indexFileDeleter; private CompositeEngine mockEngine; private ShardPath shardPath; - private CatalogSnapshot catalogSnapshot; - private Map catalogSnapshotMap; + private CompositeEngineCatalogSnapshot catalogSnapshot; + private Map catalogSnapshotMap; private AtomicLong catalogSnapshotId; private AtomicLong lastCommittedSnapshotId; private Set deletedFiles; @@ -82,8 +82,8 @@ public void testMultipleDataFormats() { assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").size()); assertEquals(1, indexFileDeleter.getFileRefCounts().get("lucene").size()); - assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get("dir1/file1.parquet").get()); - assertEquals(1, indexFileDeleter.getFileRefCounts().get("lucene").get("dir2/file1.lucene").get()); + assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get(getAbsolutePath("dir1", "file1.parquet")).get()); + assertEquals(1, indexFileDeleter.getFileRefCounts().get("lucene").get(getAbsolutePath("dir2", "file1.lucene")).get()); } public void testRefreshCreatesNewSnapshotAndAddsReferences() { @@ -91,7 +91,7 @@ public void testRefreshCreatesNewSnapshotAndAddsReferences() { simulateRefresh(Map.of("parquet", createWriterFileSet("dir1", "file1.parquet"))); assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").size()); - assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get("dir1/file1.parquet").get()); + assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get(getAbsolutePath("dir1", "file1.parquet")).get()); } public void testMultipleSnapshotsWithOverlappingFiles() { @@ -103,10 +103,10 @@ public void testMultipleSnapshotsWithOverlappingFiles() { // After first refresh refCounts: file1(1), file2 (1) // After second refresh refcounts: file1(0, delete should be called), file2(1), file3(1) - assertNull(indexFileDeleter.getFileRefCounts().get("parquet").get("dir1/file1.parquet")); - assertTrue(deletedFiles.contains("dir1/file1.parquet")); - assertEquals(1, 
indexFileDeleter.getFileRefCounts().get("parquet").get("dir1/file2.parquet").get()); - assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get("dir1/file3.parquet").get()); + assertNull(indexFileDeleter.getFileRefCounts().get("parquet").get(getAbsolutePath("dir1", "file1.parquet"))); + assertTrue(deletedFiles.contains(getAbsolutePath("dir1", "file1.parquet"))); + assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get(getAbsolutePath("dir1", "file2.parquet")).get()); + assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get(getAbsolutePath("dir1", "file3.parquet")).get()); } public void testFileDeletionDuringSearch() throws IOException { @@ -119,17 +119,17 @@ public void testFileDeletionDuringSearch() throws IOException { simulateRefresh(Map.of("parquet", createWriterFileSet("dir1", "file2.parquet", "file3.parquet"))); // since we have a active search request, files from previous snapshot won't be deleted - assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get("dir1/file1.parquet").get()); - assertEquals(2, indexFileDeleter.getFileRefCounts().get("parquet").get("dir1/file2.parquet").get()); - assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get("dir1/file3.parquet").get()); + assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get(getAbsolutePath("dir1", "file1.parquet")).get()); + assertEquals(2, indexFileDeleter.getFileRefCounts().get("parquet").get(getAbsolutePath("dir1", "file2.parquet")).get()); + assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get(getAbsolutePath("dir1", "file3.parquet")).get()); searchContext.close(); // After search is closed, files from previous snapshot should be deleted - assertNull(indexFileDeleter.getFileRefCounts().get("parquet").get("dir1/file1.parquet")); - assertTrue(deletedFiles.contains("dir1/file1.parquet")); - assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get("dir1/file2.parquet").get()); - assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get("dir1/file3.parquet").get()); + assertNull(indexFileDeleter.getFileRefCounts().get("parquet").get(getAbsolutePath("dir1", "file1.parquet"))); + assertTrue(deletedFiles.contains(getAbsolutePath("dir1", "file1.parquet"))); + assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get(getAbsolutePath("dir1", "file2.parquet")).get()); + assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get(getAbsolutePath("dir1", "file3.parquet")).get()); } public void testDeletionsWthFlush() { @@ -141,20 +141,20 @@ public void testDeletionsWthFlush() { simulateRefresh(Map.of("parquet", createWriterFileSet("dir1", "file2.parquet", "file3.parquet"))); // Since file1 is part of last commited data(flushed) it will not be deleted even if it is not part of current snapshot - assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get("dir1/file1.parquet").get()); - assertFalse(deletedFiles.contains("dir1/file1.parquet")); + assertEquals(1, indexFileDeleter.getFileRefCounts().get("parquet").get(getAbsolutePath("dir1", "file1.parquet")).get()); + assertFalse(deletedFiles.contains(getAbsolutePath("dir1", "file1.parquet"))); simulateFlush(); // After flush, file1 should be deleted since it is now no more part of last commited data and neither current snapshot as well - assertNull(indexFileDeleter.getFileRefCounts().get("parquet").get("dir1/file1.parquet")); - assertTrue(deletedFiles.contains("dir1/file1.parquet")); + 
assertNull(indexFileDeleter.getFileRefCounts().get("parquet").get(getAbsolutePath("dir1", "file1.parquet"))); + assertTrue(deletedFiles.contains(getAbsolutePath("dir1", "file1.parquet"))); } private void simulateRefresh(Map> files) { // Create RefreshResult with segments RefreshResult refreshResult = new RefreshResult(); - CatalogSnapshot.Segment segment = new CatalogSnapshot.Segment(catalogSnapshotId.get() + 1); + Segment segment = new Segment(catalogSnapshotId.get() + 1); files.forEach((formatName, fileSets) -> { fileSets.forEach(fileSet -> { @@ -164,11 +164,11 @@ private void simulateRefresh(Map> files) { refreshResult.setRefreshedSegments(List.of(segment)); - CatalogSnapshot prevSnap = catalogSnapshot; + CompositeEngineCatalogSnapshot prevSnap = catalogSnapshot; // Create new snapshot long id = catalogSnapshotId.incrementAndGet(); - catalogSnapshot = new CatalogSnapshot(id, id, List.of(segment), catalogSnapshotMap, () -> indexFileDeleter); + catalogSnapshot = new CompositeEngineCatalogSnapshot(id, id, List.of(segment), catalogSnapshotMap, () -> indexFileDeleter); catalogSnapshotMap.put(id, catalogSnapshot); // Release previous snapshot if exists @@ -208,4 +208,13 @@ private List createWriterFileSet(String directory, String... file return Collections.singletonList(builder.build()); } + + /** + * Helper method to compute absolute path for test assertions. + * This matches the behavior in IndexFileDeleter.segregateFilesByFormat() + * which uses directory.resolve(file).toAbsolutePath().normalize().toString() + */ + private String getAbsolutePath(String directory, String file) { + return Path.of(directory).resolve(file).toAbsolutePath().normalize().toString(); + } } diff --git a/server/src/test/java/org/opensearch/index/remote/RemoteStoreCustomMetadataResolverTests.java b/server/src/test/java/org/opensearch/index/remote/RemoteStoreCustomMetadataResolverTests.java index 53aceca5c3222..120780f5654cb 100644 --- a/server/src/test/java/org/opensearch/index/remote/RemoteStoreCustomMetadataResolverTests.java +++ b/server/src/test/java/org/opensearch/index/remote/RemoteStoreCustomMetadataResolverTests.java @@ -16,12 +16,16 @@ import org.opensearch.index.remote.RemoteStoreEnums.PathType; import org.opensearch.indices.RemoteStoreSettings; import org.opensearch.repositories.RepositoriesService; +import org.opensearch.repositories.RepositoryMissingException; import org.opensearch.repositories.blobstore.BlobStoreRepository; import org.opensearch.test.OpenSearchTestCase; +import org.mockito.Mockito; + import static org.opensearch.indices.RemoteStoreSettings.CLUSTER_REMOTE_STORE_PATH_HASH_ALGORITHM_SETTING; import static org.opensearch.indices.RemoteStoreSettings.CLUSTER_REMOTE_STORE_PATH_TYPE_SETTING; import static org.opensearch.indices.RemoteStoreSettings.CLUSTER_REMOTE_STORE_TRANSLOG_METADATA; +import static org.opensearch.indices.RemoteStoreSettings.CLUSTER_SERVER_SIDE_ENCRYPTION_ENABLED; import static org.opensearch.node.remotestore.RemoteStoreNodeAttribute.getRemoteStoreTranslogRepo; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -269,6 +273,56 @@ public void testSegmentsPathFixedPathSetting() { .build() ); assertEquals(randomPrefix, remoteStoreSettings.getSegmentsPathFixedPrefix()); + } + + public void testIsRemoteStoreRepoServerSideEncryptionEnabled() { + Settings settings = Settings.builder().put(CLUSTER_SERVER_SIDE_ENCRYPTION_ENABLED.getKey(), true).build(); + ClusterSettings clusterSettings = new ClusterSettings(settings, 
ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + RemoteStoreSettings remoteStoreSettings = new RemoteStoreSettings(settings, clusterSettings); + + BlobStoreRepository repositoryMock = mock(BlobStoreRepository.class); + when(repositoryMock.isSeverSideEncryptionEnabled()).thenReturn(Boolean.TRUE); + when(repositoriesService.repository(Mockito.any())).thenReturn(repositoryMock); + + RemoteStoreCustomMetadataResolver resolver = new RemoteStoreCustomMetadataResolver( + remoteStoreSettings, + () -> Version.V_3_3_0, + () -> repositoriesService, + settings + ); + assertTrue(resolver.isRemoteStoreRepoServerSideEncryptionEnabled()); + } + + public void testIsRemoteStoreRepoServerSideEncryptionDisabled() { + Settings settings = Settings.builder().put(CLUSTER_SERVER_SIDE_ENCRYPTION_ENABLED.getKey(), true).build(); + ClusterSettings clusterSettings = new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + RemoteStoreSettings remoteStoreSettings = new RemoteStoreSettings(settings, clusterSettings); + BlobStoreRepository repositoryMock = mock(BlobStoreRepository.class); + when(repositoryMock.isSeverSideEncryptionEnabled()).thenReturn(Boolean.FALSE); + when(repositoriesService.repository(Mockito.any())).thenReturn(repositoryMock); + + RemoteStoreCustomMetadataResolver resolver = new RemoteStoreCustomMetadataResolver( + remoteStoreSettings, + () -> Version.V_3_3_0, + () -> repositoriesService, + settings + ); + assertFalse(resolver.isRemoteStoreRepoServerSideEncryptionEnabled()); + } + + public void testIsRemoteStoreRepoServerSideEncryptionWithOldVersion() { + Settings settings = Settings.builder().put(CLUSTER_SERVER_SIDE_ENCRYPTION_ENABLED.getKey(), true).build(); + ClusterSettings clusterSettings = new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + RemoteStoreSettings remoteStoreSettings = new RemoteStoreSettings(settings, clusterSettings); + when(repositoriesService.repository(Mockito.any())).thenThrow(new RepositoryMissingException("Repository missing")); + + RemoteStoreCustomMetadataResolver resolver = new RemoteStoreCustomMetadataResolver( + remoteStoreSettings, + () -> Version.V_3_1_0, + () -> repositoriesService, + settings + ); + expectThrows(IllegalArgumentException.class, resolver::isRemoteStoreRepoServerSideEncryptionEnabled); } } diff --git a/server/src/test/java/org/opensearch/index/remote/RemoteStoreUtilsTests.java b/server/src/test/java/org/opensearch/index/remote/RemoteStoreUtilsTests.java index 2e3d2e6d385c2..5edb01b615d32 100644 --- a/server/src/test/java/org/opensearch/index/remote/RemoteStoreUtilsTests.java +++ b/server/src/test/java/org/opensearch/index/remote/RemoteStoreUtilsTests.java @@ -52,6 +52,8 @@ import java.util.UUID; import java.util.stream.Collectors; +import org.mockito.Mockito; + import static org.opensearch.cluster.metadata.IndexMetadata.REMOTE_STORE_CUSTOM_KEY; import static org.opensearch.index.remote.RemoteMigrationIndexMetadataUpdaterTests.createIndexMetadataWithDocrepSettings; import static org.opensearch.index.remote.RemoteStoreUtils.URL_BASE64_CHARSET; @@ -1212,4 +1214,33 @@ public void testGetPinnedTimestampLockedFilesWithCache() { assertEquals(0, implicitLockedFiles.size()); assertEquals(0, metadataFilePinnedTimestampCache.size()); } + + public void testIsServerSideEncryptionEnabledIndex_when_enabled() { + IndexMetadata indexMetadata = Mockito.mock(IndexMetadata.class); + Map metadata = new HashMap<>(); + metadata.put(IndexMetadata.REMOTE_STORE_SSE_ENABLED_INDEX_KEY, "true"); + 
Mockito.when(indexMetadata.getCustomData(IndexMetadata.REMOTE_STORE_CUSTOM_KEY)).thenReturn(metadata); + assertTrue(RemoteStoreUtils.isServerSideEncryptionEnabledIndex(indexMetadata)); + } + + public void testIsServerSideEncryptionEnabledIndex_when_disabled() { + IndexMetadata indexMetadata = Mockito.mock(IndexMetadata.class); + Map metadata = new HashMap<>(); + metadata.put(IndexMetadata.REMOTE_STORE_SSE_ENABLED_INDEX_KEY, "false"); + Mockito.when(indexMetadata.getCustomData(IndexMetadata.REMOTE_STORE_CUSTOM_KEY)).thenReturn(metadata); + assertFalse(RemoteStoreUtils.isServerSideEncryptionEnabledIndex(indexMetadata)); + } + + public void testIsServerSideEncryptionEnabledIndex_when_No_Custom_key() { + IndexMetadata indexMetadata = Mockito.mock(IndexMetadata.class); + Map metadata = new HashMap<>(); + Mockito.when(indexMetadata.getCustomData(IndexMetadata.REMOTE_STORE_CUSTOM_KEY)).thenReturn(metadata); + assertFalse(RemoteStoreUtils.isServerSideEncryptionEnabledIndex(indexMetadata)); + } + + public void testIsServerSideEncryptionEnabledIndex_when_Custom_key_is_null() { + IndexMetadata indexMetadata = Mockito.mock(IndexMetadata.class); + Mockito.when(indexMetadata.getCustomData(IndexMetadata.REMOTE_STORE_CUSTOM_KEY)).thenReturn(null); + assertFalse(RemoteStoreUtils.isServerSideEncryptionEnabledIndex(indexMetadata)); + } } diff --git a/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardCorruptionTests.java b/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardCorruptionTests.java index d88bd6e8e6f79..aa216d2a9edd1 100644 --- a/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardCorruptionTests.java +++ b/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardCorruptionTests.java @@ -52,11 +52,10 @@ public void testLocalDirectoryContains() throws IOException { CorruptionUtils.corruptAt(shardPath.resolve(file), raf, (int) (raf.size() - 8)); } } - org.opensearch.index.engine.exec.FileMetadata fileMetadata = new org.opensearch.index.engine.exec.FileMetadata("lucene", file); if (corrupted == false) { - assertTrue(indexShard.localDirectoryContains(localDirectory, fileMetadata, checksum)); + assertTrue(indexShard.localDirectoryContainsFile(localDirectory, file, checksum)); } else { - assertFalse(indexShard.localDirectoryContains(localDirectory, fileMetadata, checksum)); + assertFalse(indexShard.localDirectoryContainsFile(localDirectory, file, checksum)); assertFalse(Files.exists(shardPath.resolve(file))); } } diff --git a/server/src/test/java/org/opensearch/index/shard/RemoteStoreUploaderServiceTests.java b/server/src/test/java/org/opensearch/index/shard/RemoteStoreUploaderServiceTests.java index 44ed1273f7261..47ab9280b1b34 100644 --- a/server/src/test/java/org/opensearch/index/shard/RemoteStoreUploaderServiceTests.java +++ b/server/src/test/java/org/opensearch/index/shard/RemoteStoreUploaderServiceTests.java @@ -93,7 +93,7 @@ public void setUp() throws Exception { when(mockUploadListenerFunction.apply(any())).thenReturn(mockUploadListener); - uploaderService = new RemoteStoreUploaderService(mockIndexShard, mockStoreDirectory, mockRemoteDirectory); + uploaderService = new RemoteStoreUploaderService(mockIndexShard, mockStoreDirectory, mockRemoteDirectory, false); } /** @@ -187,8 +187,8 @@ public void testUploadSegmentsSuccessWithHighPriorityUpload() throws Exception { RemoteStoreUploaderService testUploaderService = new RemoteStoreUploaderService( freshMockShard, mockCompositeStoreDirectory, - remoteSegmentStoreDirectory - ); + 
remoteSegmentStoreDirectory, + false); doAnswer(invocation -> { ActionListener callback = invocation.getArgument(3); @@ -250,8 +250,8 @@ public void testUploadSegmentsSuccessWithLowPriorityUpload() throws Exception { RemoteStoreUploaderService testUploaderService = new RemoteStoreUploaderService( freshMockShard, mockCompositeStoreDirectory, - remoteSegmentStoreDirectory - ); + remoteSegmentStoreDirectory, + false); doAnswer(invocation -> { ActionListener callback = invocation.getArgument(3); @@ -314,8 +314,8 @@ public void testUploadSegmentsWithCompositeDirectory() throws Exception { RemoteStoreUploaderService testUploaderService = new RemoteStoreUploaderService( freshMockShard, mockCompositeStoreDirectory, - remoteSegmentStoreDirectory - ); + remoteSegmentStoreDirectory, + false); // Setup the real RemoteSegmentStoreDirectory to handle copyFrom calls doAnswer(invocation -> { @@ -377,8 +377,8 @@ public void testUploadSegmentsWithCorruptIndexException() throws Exception { RemoteStoreUploaderService testUploaderService = new RemoteStoreUploaderService( freshMockShard, mockCompositeStoreDirectory, - remoteSegmentStoreDirectory - ); + remoteSegmentStoreDirectory, + false); CorruptIndexException corruptException = new CorruptIndexException("Index corrupted", "test"); CountDownLatch latch = new CountDownLatch(1); @@ -441,8 +441,8 @@ public void testUploadSegmentsWithGenericException() throws Exception { RemoteStoreUploaderService testUploaderService = new RemoteStoreUploaderService( freshMockShard, mockCompositeStoreDirectory, - remoteSegmentStoreDirectory - ); + remoteSegmentStoreDirectory, + false); RuntimeException genericException = new RuntimeException("Generic error"); CountDownLatch latch = new CountDownLatch(1); diff --git a/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryFactoryTests.java b/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryFactoryTests.java index 866100415fdef..62a1e65089ade 100644 --- a/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryFactoryTests.java +++ b/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryFactoryTests.java @@ -71,6 +71,7 @@ public void testNewDirectory() throws IOException { BlobStoreRepository repository = mock(BlobStoreRepository.class); BlobStore blobStore = mock(BlobStore.class); BlobContainer blobContainer = mock(BlobContainer.class); + when(repository.blobStore(false)).thenReturn(blobStore); when(repository.blobStore()).thenReturn(blobStore); when(repository.basePath()).thenReturn(new BlobPath().add("base_path")); when(blobStore.blobContainer(any())).thenReturn(blobContainer); @@ -117,7 +118,7 @@ public void testNewDirectoryRepositoryDoesNotExist() { when(repositoriesService.repository("remote_store_repository")).thenThrow(new RepositoryMissingException("Missing")); - assertThrows(IllegalArgumentException.class, () -> remoteSegmentStoreDirectoryFactory.newDirectory(indexSettings, shardPath)); + assertThrows(RepositoryMissingException.class, () -> remoteSegmentStoreDirectoryFactory.newDirectory(indexSettings, shardPath)); } } diff --git a/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java b/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java index d1aa8bb3184ca..13bc489bcb89b 100644 --- a/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java +++ b/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java @@ -61,6 
+61,7 @@ import org.mockito.Mockito; +import static org.mockito.ArgumentMatchers.anyString; import static org.opensearch.index.store.RemoteSegmentStoreDirectory.METADATA_FILES_TO_FETCH; import static org.opensearch.index.store.MetadataFilenameUtils.SEPARATOR; import static org.opensearch.test.RemoteStoreTestUtils.createMetadataFileBytes; @@ -212,7 +213,7 @@ public void testDeleteFileException() throws IOException { populateMetadata(); remoteSegmentStoreDirectory.init(); - doThrow(new IOException("Error")).when(remoteDataDirectory).deleteFile(any()); + doThrow(new IOException("Error")).when(remoteDataDirectory).deleteFile(anyString()); assertThrows(IOException.class, () -> remoteSegmentStoreDirectory.deleteFile("_0.si")); } @@ -1014,7 +1015,7 @@ public void testDeleteStaleCommitsNoDeletesDueToLocks() throws Exception { remoteSegmentStoreDirectory.deleteStaleSegmentsAsync(1); assertBusy(() -> assertThat(remoteSegmentStoreDirectory.canDeleteStaleCommits.get(), is(true))); - verify(remoteMetadataDirectory, times(0)).deleteFile(any()); + verify(remoteMetadataDirectory, times(0)).deleteFile(anyString()); } public void testDeleteStaleCommitsExceptionWhileFetchingLocks() throws Exception { @@ -1027,7 +1028,7 @@ public void testDeleteStaleCommitsExceptionWhileFetchingLocks() throws Exception // We are passing lastNMetadataFilesToKeep=2 here so that oldest 1 metadata file will be deleted remoteSegmentStoreDirectory.deleteStaleSegmentsAsync(1); - verify(remoteMetadataDirectory, times(0)).deleteFile(any()); + verify(remoteMetadataDirectory, times(0)).deleteFile(anyString()); } public void testDeleteStaleCommitsDeleteDedup() throws Exception { diff --git a/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryWithPinnedTimestampTests.java b/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryWithPinnedTimestampTests.java index bdbf09f5636ab..ec59014d3d3dd 100644 --- a/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryWithPinnedTimestampTests.java +++ b/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryWithPinnedTimestampTests.java @@ -31,6 +31,7 @@ import org.mockito.Mockito; +import static org.mockito.ArgumentMatchers.anyString; import static org.opensearch.indices.RemoteStoreSettings.CLUSTER_REMOTE_STORE_PINNED_TIMESTAMP_ENABLED; import static org.hamcrest.CoreMatchers.is; import static org.mockito.ArgumentMatchers.any; @@ -175,8 +176,8 @@ public void testDeleteStaleCommitsNoPinnedTimestampMdFilesLatest() throws Except remoteSegmentStoreDirectory.deleteStaleSegmentsAsync(2); assertBusy(() -> assertThat(remoteSegmentStoreDirectory.canDeleteStaleCommits.get(), is(true))); - verify(remoteDataDirectory, times(0)).deleteFile(any()); - verify(remoteMetadataDirectory, times(0)).deleteFile(any()); + verify(remoteDataDirectory, times(0)).deleteFile(anyString()); + verify(remoteMetadataDirectory, times(0)).deleteFile(anyString()); } public void testDeleteStaleCommitsPinnedTimestampMdFile() throws Exception { diff --git a/server/src/test/java/org/opensearch/index/store/RemoteStoreFileDownloaderTests.java b/server/src/test/java/org/opensearch/index/store/RemoteStoreFileDownloaderTests.java index 6d8b3fe4d69fb..b242c4a23b2e8 100644 --- a/server/src/test/java/org/opensearch/index/store/RemoteStoreFileDownloaderTests.java +++ b/server/src/test/java/org/opensearch/index/store/RemoteStoreFileDownloaderTests.java @@ -71,9 +71,10 @@ public void setup() throws IOException { files.put(filename, content); } fileDownloader = new 
RemoteStoreFileDownloader( - ShardId.fromString("[RemoteStoreFileDownloaderTests][0]"), - threadPool, - recoverySettings + ShardId.fromString("[RemoteStoreFileDownloaderTests][0]"), + threadPool, + recoverySettings, + false ); } diff --git a/server/src/test/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslogTests.java b/server/src/test/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslogTests.java index 6c89cf2adf988..5ab26084dcfd2 100644 --- a/server/src/test/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslogTests.java +++ b/server/src/test/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslogTests.java @@ -137,7 +137,8 @@ public void setUp() throws Exception { protected RemoteFsTranslog createTranslogInstance( TranslogConfig translogConfig, String translogUUID, - TranslogDeletionPolicy deletionPolicy + TranslogDeletionPolicy deletionPolicy, + boolean isServerSideEncryptionEnabled ) throws IOException { return new RemoteFsTimestampAwareTranslog( translogConfig, @@ -151,7 +152,8 @@ protected RemoteFsTranslog createTranslogInstance( primaryMode::get, new RemoteTranslogTransferTracker(shardId, 10), DefaultRemoteStoreSettings.INSTANCE, - TranslogOperationHelper.DEFAULT + TranslogOperationHelper.DEFAULT, + isServerSideEncryptionEnabled ); } @@ -622,7 +624,8 @@ public void testExtraGenToKeep() throws Exception { new RemoteTranslogTransferTracker(shardId, 10), DefaultRemoteStoreSettings.INSTANCE, TranslogOperationHelper.DEFAULT, - channelFactory + channelFactory, + false ) ) { addToTranslogAndListAndUpload(translog, ops, new Translog.Index("1", 0, primaryTerm.get(), new byte[] { 1 })); diff --git a/server/src/test/java/org/opensearch/index/translog/RemoteFsTranslogTests.java b/server/src/test/java/org/opensearch/index/translog/RemoteFsTranslogTests.java index edcdca3f7b3de..7b20c7e22f2f4 100644 --- a/server/src/test/java/org/opensearch/index/translog/RemoteFsTranslogTests.java +++ b/server/src/test/java/org/opensearch/index/translog/RemoteFsTranslogTests.java @@ -171,23 +171,30 @@ public void tearDown() throws Exception { protected RemoteFsTranslog create(Path path) throws IOException { final String translogUUID = Translog.createEmptyTranslog(path, SequenceNumbers.NO_OPS_PERFORMED, shardId, primaryTerm.get()); - return create(path, createRepository(), translogUUID, 0); + return create(path, createRepository(), translogUUID, 0, false); } - private RemoteFsTranslog create(Path path, BlobStoreRepository repository, String translogUUID, int extraGenToKeep) throws IOException { + private RemoteFsTranslog create( + Path path, + BlobStoreRepository repository, + String translogUUID, + int extraGenToKeep, + boolean isServerSideEncryptionEnabled + ) throws IOException { this.repository = repository; globalCheckpoint = new AtomicLong(SequenceNumbers.NO_OPS_PERFORMED); final TranslogConfig translogConfig = getTranslogConfig(path, extraGenToKeep); final TranslogDeletionPolicy deletionPolicy = createTranslogDeletionPolicy(translogConfig.getIndexSettings()); threadPool = new TestThreadPool(getClass().getName()); blobStoreTransferService = new BlobStoreTransferService(repository.blobStore(), threadPool); - return createTranslogInstance(translogConfig, translogUUID, deletionPolicy); + return createTranslogInstance(translogConfig, translogUUID, deletionPolicy, isServerSideEncryptionEnabled); } protected RemoteFsTranslog createTranslogInstance( TranslogConfig translogConfig, String translogUUID, - TranslogDeletionPolicy deletionPolicy + TranslogDeletionPolicy 
deletionPolicy, + boolean isServerSideEncryptionEnabled ) throws IOException { return new RemoteFsTranslog( translogConfig, @@ -202,12 +209,14 @@ protected RemoteFsTranslog createTranslogInstance( new RemoteTranslogTransferTracker(shardId, 10), DefaultRemoteStoreSettings.INSTANCE, TranslogOperationHelper.DEFAULT, - null + null, + isServerSideEncryptionEnabled ); } - private RemoteFsTranslog create(Path path, BlobStoreRepository repository, String translogUUID) throws IOException { - return create(path, repository, translogUUID, 0); + private RemoteFsTranslog create(Path path, BlobStoreRepository repository, String translogUUID, boolean isServerSideEncryptionEnabled) + throws IOException { + return create(path, repository, translogUUID, 0, isServerSideEncryptionEnabled); } private TranslogConfig getTranslogConfig(final Path path) { @@ -477,7 +486,8 @@ public void testExtraGenToKeep() throws Exception { new RemoteTranslogTransferTracker(shardId, 10), DefaultRemoteStoreSettings.INSTANCE, TranslogOperationHelper.DEFAULT, - null + null, + false ) ) { addToTranslogAndListAndUpload(translog, ops, new Translog.Index("1", 0, primaryTerm.get(), new byte[] { 1 })); @@ -554,7 +564,7 @@ public void testReadLocationDownload() throws IOException { } // Creating RemoteFsTranslog with the same location - RemoteFsTranslog newTranslog = create(translogDir, repository, translogUUID); + RemoteFsTranslog newTranslog = create(translogDir, repository, translogUUID, false); i = 0; for (Translog.Operation op : ops) { assertEquals(op, newTranslog.readOperation(locs.get(i++))); @@ -825,7 +835,7 @@ public void testMetadataFileDeletion() throws Exception { long newPrimaryTerm = primaryTerm.incrementAndGet(); // Creating RemoteFsTranslog with the same location - Translog newTranslog = create(translogDir, repository, translogUUID); + Translog newTranslog = create(translogDir, repository, translogUUID, false); int newPrimaryTermDocs = randomIntBetween(5, 10); for (int i = totalDocs + 1; i <= totalDocs + newPrimaryTermDocs; i++) { addToTranslogAndListAndUpload(newTranslog, ops, new Translog.Index(String.valueOf(i), i, primaryTerm.get(), new byte[] { 1 })); @@ -1523,7 +1533,8 @@ public void testTranslogWriterCanFlushInAddOrReadCall() throws IOException { new RemoteTranslogTransferTracker(shardId, 10), DefaultRemoteStoreSettings.INSTANCE, TranslogOperationHelper.DEFAULT, - channelFactory + channelFactory, + false ) ) { TranslogWriter writer = translog.getCurrent(); @@ -1630,7 +1641,8 @@ public void force(boolean metaData) throws IOException { new RemoteTranslogTransferTracker(shardId, 10), DefaultRemoteStoreSettings.INSTANCE, TranslogOperationHelper.DEFAULT, - channelFactory + channelFactory, + false ) ) { TranslogWriter writer = translog.getCurrent(); diff --git a/server/src/test/java/org/opensearch/repositories/blobstore/BlobStoreProviderTests.java b/server/src/test/java/org/opensearch/repositories/blobstore/BlobStoreProviderTests.java new file mode 100644 index 0000000000000..4a77fafb94deb --- /dev/null +++ b/server/src/test/java/org/opensearch/repositories/blobstore/BlobStoreProviderTests.java @@ -0,0 +1,125 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.repositories.blobstore; + +import org.opensearch.cluster.metadata.RepositoryMetadata; +import org.opensearch.common.blobstore.BlobStore; +import org.opensearch.common.lifecycle.Lifecycle; +import org.opensearch.repositories.RepositoryException; +import org.opensearch.test.OpenSearchTestCase; +import org.junit.Before; + +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +/** + * Test class for {@link BlobStoreProvider}. + */ +public class BlobStoreProviderTests extends OpenSearchTestCase { + @Mock + private BlobStoreRepository mockRepository; + + @Mock + private RepositoryMetadata mockMetadata; + + @Mock + private Lifecycle mockLifecycle; + + @Mock + private BlobStore mockBlobStore; + + @Mock + private BlobStore mockServerSideEncryptionBlobStore; + + private Object lock; + private BlobStoreProvider provider; + + @Before + public void setUp() throws Exception { + super.setUp(); + MockitoAnnotations.openMocks(this); + lock = new Object(); + when(mockMetadata.name()).thenReturn("test-repository"); + provider = new BlobStoreProvider(mockRepository, mockMetadata, mockLifecycle, lock); + } + + public void testGetBlobStore() throws Exception { + // Setup: mock createBlobStore() to return the plain (non-encrypted) blob store + // Note: Since SetOnce is used internally, we need to first call blobStore() to initialize it + when(mockLifecycle.started()).thenReturn(true); + when(mockRepository.createBlobStore()).thenReturn(mockBlobStore); + + // Initialize the plain (non-encrypted) blob store + provider.blobStore(false); + + // Test + BlobStore result = provider.getBlobStore(false); + + // Verify + assertEquals(mockBlobStore, result); + } + + public void testGetBlobStoreWithServerSideEncryption() throws Exception { + // Setup: mock createBlobStore() to return the server-side-encrypted blob store + // Note: Since SetOnce is used internally, we need to first call blobStore() to initialize it + when(mockLifecycle.started()).thenReturn(true); + when(mockRepository.createBlobStore()).thenReturn(mockServerSideEncryptionBlobStore); + provider.blobStore(true); + + BlobStore result = provider.getBlobStore(true); + + // Verify + assertEquals(mockServerSideEncryptionBlobStore, result); + } + + public void testBlobStoreWithClientSideEncryptionFirstTime() throws Exception { + // Setup + when(mockLifecycle.started()).thenReturn(true); + when(mockRepository.createBlobStore()).thenReturn(mockBlobStore); + + // Test + BlobStore result = provider.blobStore(false); + + // Verify + assertEquals(mockBlobStore, result); + verify(mockRepository).createBlobStore(); + } + + public void testBlobStoreWithClientSideEncryptionEnabledSubsequentCalls() throws Exception { + // Setup + when(mockLifecycle.started()).thenReturn(true); + when(mockRepository.createBlobStore()).thenReturn(mockBlobStore); + + // First call + BlobStore firstResult = provider.blobStore(false); + + // Second call + BlobStore secondResult = provider.blobStore(false); + + // Verify + assertEquals(mockBlobStore, firstResult); + assertEquals(mockBlobStore, secondResult); + assertSame(firstResult, secondResult); + // Verify createBlobStore is called only once + verify(mockRepository, times(1)).createBlobStore(); + } + + public void testInitBlobStoreWhenLifecycleNotStarted() { + // Setup + when(mockLifecycle.started()).thenReturn(false); +
when(mockLifecycle.state()).thenReturn(Lifecycle.State.STOPPED); + + // Test - should throw RepositoryException + expectThrows(RepositoryException.class, () -> provider.initBlobStore()); + } +} diff --git a/server/src/test/java/org/opensearch/repositories/blobstore/BlobStoreRepositoryTests.java b/server/src/test/java/org/opensearch/repositories/blobstore/BlobStoreRepositoryTests.java index 7ca9c16f6da5d..135875c768ed2 100644 --- a/server/src/test/java/org/opensearch/repositories/blobstore/BlobStoreRepositoryTests.java +++ b/server/src/test/java/org/opensearch/repositories/blobstore/BlobStoreRepositoryTests.java @@ -42,6 +42,7 @@ import org.opensearch.common.UUIDs; import org.opensearch.common.blobstore.BlobContainer; import org.opensearch.common.blobstore.BlobMetadata; +import org.opensearch.common.blobstore.BlobStore; import org.opensearch.common.blobstore.DeleteResult; import org.opensearch.common.settings.Setting; import org.opensearch.common.settings.Settings; @@ -91,6 +92,8 @@ import java.util.function.Function; import java.util.stream.Collectors; +import org.mockito.Mockito; + import static org.opensearch.repositories.RepositoryDataTests.generateRandomRepoData; import static org.opensearch.repositories.blobstore.BlobStoreRepository.calculateMaxWithinIntLimit; import static org.hamcrest.Matchers.equalTo; @@ -598,6 +601,127 @@ public void testGetStats() { repository.close(); } + public void testGetStats_When_Sse_Enabled_WithExtended_Stats() { + BlobStoreRepository repository = setupRepo(); + BlobStoreRepository repoSpy = Mockito.spy(repository); + + BlobStore blobStore = getMockedBlobStoreWithStats(10L, 20L, true); + BlobStore sseBlobStore = getMockedBlobStoreWithStats(5L, 10L, true); + + Mockito.doReturn(blobStore).when(repoSpy).getBlobStore(false); + Mockito.doReturn(sseBlobStore).when(repoSpy).getBlobStore(true); + + RepositoryStats stats = repoSpy.stats(); + assertNotNull(stats); + assertTrue(stats.detailed); + Map mergedStats = stats.extendedStats.get(BlobStore.Metric.REQUEST_SUCCESS); + + assertEquals(15, mergedStats.get("GET").longValue()); + assertEquals(30, mergedStats.get("PUT").longValue()); + + repository.close(); + } + + public void testGetStats_When_Sse_Only_Enabled_WithExtended_Stats() { + BlobStoreRepository repository = setupRepo(); + BlobStoreRepository repoSpy = Mockito.spy(repository); + + BlobStore sseBlobStore = getMockedBlobStoreWithStats(5L, 10L, true); + Mockito.doReturn(sseBlobStore).when(repoSpy).getBlobStore(true); + + RepositoryStats stats = repoSpy.stats(); + assertNotNull(stats); + assertTrue(stats.detailed); + Map mergedStats = stats.extendedStats.get(BlobStore.Metric.REQUEST_SUCCESS); + + assertEquals(5, mergedStats.get("GET").longValue()); + assertEquals(10, mergedStats.get("PUT").longValue()); + + repository.close(); + } + + public void testGetStats_When_Sse_not_Enabled_WithExtended_Stats() { + BlobStoreRepository repository = setupRepo(); + BlobStoreRepository repoSpy = Mockito.spy(repository); + + BlobStore blobStore = getMockedBlobStoreWithStats(10L, 20L, true); + Mockito.doReturn(blobStore).when(repoSpy).getBlobStore(false); + + RepositoryStats stats = repoSpy.stats(); + assertNotNull(stats); + assertTrue(stats.detailed); + Map mergedStats = stats.extendedStats.get(BlobStore.Metric.REQUEST_SUCCESS); + + assertEquals(10, mergedStats.get("GET").longValue()); + assertEquals(20, mergedStats.get("PUT").longValue()); + + repository.close(); + } + + public void testGetStats_When_Sse_Enabled() { + BlobStoreRepository repository = setupRepo(); + 
BlobStoreRepository repoSpy = Mockito.spy(repository); + + BlobStore blobStore = getMockedBlobStoreWithStats(10L, 20L, false); + BlobStore sseBlobStore = getMockedBlobStoreWithStats(5L, 10L, false); + + Mockito.doReturn(blobStore).when(repoSpy).getBlobStore(false); + Mockito.doReturn(sseBlobStore).when(repoSpy).getBlobStore(true); + + RepositoryStats stats = repoSpy.stats(); + assertNotNull(stats); + assertFalse(stats.detailed); + + assertEquals(45, stats.requestCounts.get("requests_count").longValue()); + repository.close(); + } + + public void testGetStats_When_Sse_Disabled() { + BlobStoreRepository repository = setupRepo(); + BlobStoreRepository repoSpy = Mockito.spy(repository); + + BlobStore blobStore = getMockedBlobStoreWithStats(10L, 20L, false); + + Mockito.doReturn(blobStore).when(repoSpy).getBlobStore(false); + + RepositoryStats stats = repoSpy.stats(); + assertNotNull(stats); + assertFalse(stats.detailed); + + assertEquals(30, stats.requestCounts.get("requests_count").longValue()); + repository.close(); + } + + public void testGetStats_When_Sse_Only_Enabled() { + BlobStoreRepository repository = setupRepo(); + BlobStoreRepository repoSpy = Mockito.spy(repository); + + BlobStore sseBlobStore = getMockedBlobStoreWithStats(5L, 10L, false); + Mockito.doReturn(sseBlobStore).when(repoSpy).getBlobStore(true); + + RepositoryStats stats = repoSpy.stats(); + assertNotNull(stats); + assertFalse(stats.detailed); + + assertEquals(15, stats.requestCounts.get("requests_count").longValue()); + repository.close(); + } + + private BlobStore getMockedBlobStoreWithStats(long getCount, long putCount, boolean extendedStats) { + BlobStore blobStore = Mockito.mock(BlobStore.class); + HashMap blobStoreStatsMap = new HashMap<>(); + if (extendedStats) { + blobStoreStatsMap.put("GET", getCount); + blobStoreStatsMap.put("PUT", putCount); + Map> blobStoreMetricMap = Map.of(BlobStore.Metric.REQUEST_SUCCESS, blobStoreStatsMap); + Mockito.when(blobStore.extendedStats()).thenReturn(blobStoreMetricMap); + } else { + blobStoreStatsMap.put("requests_count", getCount + putCount); + Mockito.when(blobStore.stats()).thenReturn(blobStoreStatsMap); + } + return blobStore; + } + public void testGetSnapshotThrottleTimeInNanos() { BlobStoreRepository repository = setupRepo(); long throttleTime = repository.getSnapshotThrottleTimeInNanos(); diff --git a/test/framework/src/main/java/org/opensearch/index/engine/EngineTestCase.java b/test/framework/src/main/java/org/opensearch/index/engine/EngineTestCase.java index d39994d4fdc58..b432747e539aa 100644 --- a/test/framework/src/main/java/org/opensearch/index/engine/EngineTestCase.java +++ b/test/framework/src/main/java/org/opensearch/index/engine/EngineTestCase.java @@ -1629,7 +1629,7 @@ public static Translog getTranslog(Engine engine) { /** * Exposes a translog associated with the given engine for testing purpose. 
*/ - public static Translog getTranslog(CompositeEngine engine) { + public static Translog getTranslog(Indexer engine) { // assert engine instanceof InternalEngine || engine instanceof NRTReplicationEngine || engine // : "only InternalEngines or NRTReplicationEngines have translogs, got: " + engine.getClass(); engine.ensureOpen(); diff --git a/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java b/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java index 80d77efd77dac..057d7d728e65e 100644 --- a/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java +++ b/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java @@ -104,6 +104,7 @@ import org.opensearch.index.mapper.MapperService; import org.opensearch.index.mapper.SourceToParse; import org.opensearch.index.remote.RemoteStoreStatsTrackerFactory; +import org.opensearch.index.remote.RemoteStoreUtils; import org.opensearch.index.remote.RemoteTranslogTransferTracker; import org.opensearch.index.replication.TestReplicationSource; import org.opensearch.index.seqno.ReplicationTracker; @@ -694,7 +695,8 @@ protected IndexShard newShard( threadPool, settings.getRemoteStoreTranslogRepository(), new RemoteTranslogTransferTracker(shardRouting.shardId(), 20), - DefaultRemoteStoreSettings.INSTANCE + DefaultRemoteStoreSettings.INSTANCE, + RemoteStoreUtils.isServerSideEncryptionEnabledIndex(settings.getIndexMetadata()) ); } return new InternalTranslogFactory(); @@ -1500,7 +1502,7 @@ public static Indexer getIndexer(IndexShard indexShard) { } public static Translog getTranslog(IndexShard shard) { - return EngineTestCase.getTranslog((CompositeEngine) getIndexer(shard)); + return EngineTestCase.getTranslog(getIndexer(shard)); } public static ReplicationTracker getReplicationTracker(IndexShard indexShard) {
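The BlobStoreProviderTests added above exercise a provider that hands out either a plain or a server-side-encrypted blob store depending on a boolean flag, creating each store lazily and caching it so createBlobStore() runs at most once per mode. The sketch below is a minimal, hypothetical illustration of that caching pattern only; the class name LazyDualStoreHolder, its Supplier-based factory, and its field names are assumptions made for illustration and are not the actual BlobStoreProvider API from this change.

// Hypothetical sketch (assumed names, not the real BlobStoreProvider):
// two lazily created stores, selected by a server-side-encryption flag,
// each built at most once and then reused.
import java.util.function.Supplier;

final class LazyDualStoreHolder<T> {
    private final Supplier<T> factory;   // e.g. repository::createBlobStore in the tests above (assumption)
    private final Object lock = new Object();
    private volatile T plainStore;       // store used when SSE is disabled
    private volatile T sseStore;         // store used when SSE is enabled

    LazyDualStoreHolder(Supplier<T> factory) {
        this.factory = factory;
    }

    // Returns the store for the requested mode, creating and caching it on first use.
    T get(boolean serverSideEncryption) {
        T existing = serverSideEncryption ? sseStore : plainStore;
        if (existing != null) {
            return existing;
        }
        synchronized (lock) {
            existing = serverSideEncryption ? sseStore : plainStore;
            if (existing == null) {
                existing = factory.get();
                if (serverSideEncryption) {
                    sseStore = existing;
                } else {
                    plainStore = existing;
                }
            }
            return existing;
        }
    }
}

Under these assumptions, repeated calls such as get(false) return the same instance, which mirrors the assertSame check and the single verified createBlobStore() invocation in testBlobStoreWithClientSideEncryptionEnabledSubsequentCalls.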