diff --git a/CHANGELOG.md b/CHANGELOG.md
index f11f358fd3c4a..2026884cd2e54 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -48,6 +48,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Add sub aggregation support for histogram aggregation using skiplist ([19438](https://github.com/opensearch-project/OpenSearch/pull/19438))
 - Optimization in String Terms Aggregation query for Large Bucket Counts([#18732](https://github.com/opensearch-project/OpenSearch/pull/18732))
 - New cluster setting search.query.max_query_string_length ([#19491](https://github.com/opensearch-project/OpenSearch/pull/19491))
+- Configurable hash size for segment file metadata ([#19499](https://github.com/opensearch-project/OpenSearch/pull/19499))

 ### Changed
 - Refactor `if-else` chains to use `Java 17 pattern matching switch expressions`(([#18965](https://github.com/opensearch-project/OpenSearch/pull/18965))
diff --git a/modules/store-subdirectory/src/main/java/org/opensearch/plugin/store/subdirectory/SubdirectoryAwareStore.java b/modules/store-subdirectory/src/main/java/org/opensearch/plugin/store/subdirectory/SubdirectoryAwareStore.java
index 351fb4f0c7fe5..930d757697e70 100644
--- a/modules/store-subdirectory/src/main/java/org/opensearch/plugin/store/subdirectory/SubdirectoryAwareStore.java
+++ b/modules/store-subdirectory/src/main/java/org/opensearch/plugin/store/subdirectory/SubdirectoryAwareStore.java
@@ -20,6 +20,8 @@
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
 import org.opensearch.common.lucene.Lucene;
+import org.opensearch.core.common.unit.ByteSizeUnit;
+import org.opensearch.core.common.unit.ByteSizeValue;
 import org.opensearch.core.index.shard.ShardId;
 import org.opensearch.env.ShardLock;
 import org.opensearch.index.IndexSettings;
@@ -82,16 +84,25 @@ public SubdirectoryAwareStore(
     @Override
     public MetadataSnapshot getMetadata(IndexCommit commit) throws IOException {
         long totalNumDocs = 0;
+        // Use 10MB default for subdirectory stores if not configured
+        final ByteSizeValue hashSize = indexSettings.getSettings()
+            .getAsBytesSize(Store.INDEX_STORE_METADATA_HASH_SIZE_SETTING.getKey(), new ByteSizeValue(10, ByteSizeUnit.MB));

         // Load regular segment files metadata
         final SegmentInfos segmentCommitInfos = Lucene.readSegmentInfos(commit);
-        MetadataSnapshot.LoadedMetadata regularMetadata = MetadataSnapshot.loadMetadata(segmentCommitInfos, super.directory(), logger);
+        MetadataSnapshot.LoadedMetadata regularMetadata = MetadataSnapshot.loadMetadata(
+            segmentCommitInfos,
+            super.directory(),
+            logger,
+            false,
+            hashSize
+        );
         Map<String, StoreFileMetadata> builder = new HashMap<>(regularMetadata.fileMetadata);
         Map<String, String> commitUserDataBuilder = new HashMap<>(regularMetadata.userData);
         totalNumDocs += regularMetadata.numDocs;

         // Load subdirectory files metadata from segments_N files in subdirectories
-        totalNumDocs += this.loadSubdirectoryMetadataFromSegments(commit, builder);
+        totalNumDocs += this.loadSubdirectoryMetadataFromSegments(commit, builder, hashSize);

         return new MetadataSnapshot(Collections.unmodifiableMap(builder), Collections.unmodifiableMap(commitUserDataBuilder), totalNumDocs);
     }
@@ -102,7 +113,8 @@ public MetadataSnapshot getMetadata(IndexCommit commit) throws IOException {
      *
      * @return the total number of documents in all subdirectory segments
      */
-    private long loadSubdirectoryMetadataFromSegments(IndexCommit commit, Map<String, StoreFileMetadata> builder) throws IOException {
+    private long loadSubdirectoryMetadataFromSegments(IndexCommit commit, Map<String, StoreFileMetadata> builder, ByteSizeValue hashSize)
+        throws IOException {
         // Find all segments_N files in subdirectories from the commit
         Set<String> subdirectorySegmentFiles = new HashSet<>();
         for (String fileName : commit.getFileNames()) {
@@ -114,7 +126,7 @@ private long loadSubdirectoryMetadataFromSegments(IndexCommit commit, Map<String, StoreFileMetadata> builder)
-    private long loadMetadataFromSubdirectorySegmentsFile(String segmentsFilePath, Map<String, StoreFileMetadata> builder)
-        throws IOException {
+    private long loadMetadataFromSubdirectorySegmentsFile(
+        String segmentsFilePath,
+        Map<String, StoreFileMetadata> builder,
+        ByteSizeValue hashSize
+    ) throws IOException {
         // Parse the directory path from the segments file path
         // e.g., "subdir/path/segments_1" -> "subdir/path"
         Path filePath = Path.of(segmentsFilePath);
@@ -143,7 +158,7 @@ private long loadMetadataFromSubdirectorySegmentsFile(String segmentsFilePath, M
             SegmentInfos segmentInfos = SegmentInfos.readCommit(subdirectory, segmentsFileName);

             // Use the same pattern as Store.loadMetadata to extract file metadata
-            loadMetadataFromSegmentInfos(segmentInfos, subdirectory, builder, parent);
+            loadMetadataFromSegmentInfos(segmentInfos, subdirectory, builder, parent, hashSize);

             // Return the number of documents in this segments file
             return Lucene.getNumDocs(segmentInfos);
@@ -157,14 +172,16 @@ private static void loadMetadataFromSegmentInfos(
         SegmentInfos segmentInfos,
         Directory directory,
         Map<String, StoreFileMetadata> builder,
-        Path pathPrefix
+        Path pathPrefix,
+        ByteSizeValue hashSize
     ) throws IOException {
-        // Reuse the existing Store.loadMetadata method
+        // Reuse the existing Store.loadMetadata method with hashSize
         Store.MetadataSnapshot.LoadedMetadata loadedMetadata = Store.MetadataSnapshot.loadMetadata(
             segmentInfos,
             directory,
             SubdirectoryAwareStore.logger,
-            false
+            false,
+            hashSize
         );

         // Add all files with proper relative path prefix
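Note on the hunks above: SubdirectoryAwareStore resolves the limit through Settings#getAsBytesSize, so an index with no override falls back to 10MB here rather than the 1MB default registered on the setting itself. Below is a minimal sketch of that resolution behaviour, using only APIs that appear in this diff; the class name, the standalone main method, and its printed output are illustrative, not part of the change.

import org.opensearch.common.settings.Settings;
import org.opensearch.core.common.unit.ByteSizeUnit;
import org.opensearch.core.common.unit.ByteSizeValue;
import org.opensearch.index.store.Store;

public final class HashSizeResolutionSketch {

    // Mirrors the fallback in SubdirectoryAwareStore#getMetadata: use the index setting
    // when present, otherwise default to 10MB for subdirectory stores.
    static ByteSizeValue resolveHashSize(Settings indexSettings) {
        return indexSettings.getAsBytesSize(
            Store.INDEX_STORE_METADATA_HASH_SIZE_SETTING.getKey(),
            new ByteSizeValue(10, ByteSizeUnit.MB)
        );
    }

    public static void main(String[] args) {
        // Unset: falls back to 10MB (10485760 bytes).
        System.out.println(resolveHashSize(Settings.EMPTY).getBytes());

        // Configured to 5MB: parsed to 5242880 bytes.
        Settings configured = Settings.builder()
            .put(Store.INDEX_STORE_METADATA_HASH_SIZE_SETTING.getKey(), "5mb")
            .build();
        System.out.println(resolveHashSize(configured).getBytes());
    }
}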
diff --git a/modules/store-subdirectory/src/test/java/org/opensearch/plugin/store/subdirectory/SubdirectoryStorePluginTests.java b/modules/store-subdirectory/src/test/java/org/opensearch/plugin/store/subdirectory/SubdirectoryStorePluginTests.java
index 51fe5dfa019fd..8aad06a198053 100644
--- a/modules/store-subdirectory/src/test/java/org/opensearch/plugin/store/subdirectory/SubdirectoryStorePluginTests.java
+++ b/modules/store-subdirectory/src/test/java/org/opensearch/plugin/store/subdirectory/SubdirectoryStorePluginTests.java
@@ -18,6 +18,7 @@
 import org.opensearch.common.settings.Settings;
 import org.opensearch.common.unit.TimeValue;
 import org.opensearch.common.util.io.IOUtils;
+import org.opensearch.core.common.unit.ByteSizeValue;
 import org.opensearch.core.index.shard.ShardId;
 import org.opensearch.index.shard.ShardPath;
 import org.opensearch.index.store.Store;
@@ -134,4 +135,51 @@ public int numNonExtraFiles(Store store) throws IOException {
         }
         return numNonExtra;
     }
+
+    public void testSubdirectoryStoreConfigurableHashSize() throws IOException {
+        final ShardId shardId = new ShardId("index", "_na_", 1);
+        Settings settings = Settings.builder()
+            .put(Store.INDEX_STORE_METADATA_HASH_SIZE_SETTING.getKey(), "5MB")
+            .put(IndexMetadata.SETTING_VERSION_CREATED, org.opensearch.Version.CURRENT)
+            .build();
+        Path path = createTempDir().resolve("indices").resolve(shardId.getIndex().getUUID()).resolve(String.valueOf(shardId.id()));
+
+        try (
+            SubdirectoryAwareStore store = new SubdirectoryAwareStore(
+                shardId,
+                IndexSettingsModule.newIndexSettings("index", settings),
+                SubdirectoryStorePluginTests.newFSDirectory(path.resolve("index")),
+                new DummyShardLock(shardId),
+                Store.OnClose.EMPTY,
+                new ShardPath(false, path, path, shardId)
+            )
+        ) {
+            // Verify subdirectory store uses configured hash size - 5MB = 5 * 1024 * 1024 = 5242880 bytes
+            ByteSizeValue hashSize = IndexSettingsModule.newIndexSettings("index", settings)
+                .getValue(Store.INDEX_STORE_METADATA_HASH_SIZE_SETTING);
+            assertEquals(5242880L, hashSize.getBytes());
+        }
+    }
+
+    public void testSubdirectoryStoreDefaultHashSize() throws IOException {
+        final ShardId shardId = new ShardId("index", "_na_", 1);
+        Settings settings = Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, org.opensearch.Version.CURRENT).build();
+        Path path = createTempDir().resolve("indices").resolve(shardId.getIndex().getUUID()).resolve(String.valueOf(shardId.id()));
+
+        try (
+            SubdirectoryAwareStore store = new SubdirectoryAwareStore(
+                shardId,
+                IndexSettingsModule.newIndexSettings("index", settings),
+                SubdirectoryStorePluginTests.newFSDirectory(path.resolve("index")),
+                new DummyShardLock(shardId),
+                Store.OnClose.EMPTY,
+                new ShardPath(false, path, path, shardId)
+            )
+        ) {
+            // Verify subdirectory store uses 1MB default when not configured (standard Store default)
+            ByteSizeValue hashSize = IndexSettingsModule.newIndexSettings("index", settings)
+                .getValue(Store.INDEX_STORE_METADATA_HASH_SIZE_SETTING);
+            assertEquals(1048576L, hashSize.getBytes()); // 1MB = 1024 * 1024 = 1048576 bytes
+        }
+    }
 }
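For reference, the 1MB value asserted in testSubdirectoryStoreDefaultHashSize comes from the setting's own default declared in Store (see the server-side hunk further below). A small sketch of reading that default directly; the wrapper class and main method are illustrative only.

import org.opensearch.common.settings.Settings;
import org.opensearch.core.common.unit.ByteSizeValue;
import org.opensearch.index.store.Store;

public final class HashSizeDefaultSketch {

    public static void main(String[] args) {
        // With no index-level override, the registered setting yields its 1MB default,
        // i.e. 1024 * 1024 = 1048576 bytes, matching the assertion above.
        ByteSizeValue defaultHashSize = Store.INDEX_STORE_METADATA_HASH_SIZE_SETTING.get(Settings.EMPTY);
        System.out.println(defaultHashSize.getBytes()); // 1048576
    }
}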
diff --git a/server/src/main/java/org/opensearch/common/settings/IndexScopedSettings.java b/server/src/main/java/org/opensearch/common/settings/IndexScopedSettings.java
index 8a5eafef4a10a..5bde35ba10a2f 100644
--- a/server/src/main/java/org/opensearch/common/settings/IndexScopedSettings.java
+++ b/server/src/main/java/org/opensearch/common/settings/IndexScopedSettings.java
@@ -190,6 +190,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
             FieldMapper.IGNORE_MALFORMED_SETTING,
             FieldMapper.COERCE_SETTING,
             Store.INDEX_STORE_STATS_REFRESH_INTERVAL_SETTING,
+            Store.INDEX_STORE_METADATA_HASH_SIZE_SETTING,
             MapperService.INDEX_MAPPER_DYNAMIC_SETTING,
             MapperService.INDEX_MAPPING_NESTED_FIELDS_LIMIT_SETTING,
             MapperService.INDEX_MAPPING_NESTED_DOCS_LIMIT_SETTING,
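Registering the setting in IndexScopedSettings is what makes it a recognized, dynamically updatable index-scoped setting. A hedged sketch of supplying it when building index settings follows; the surrounding class and the number_of_shards line are illustrative and not part of this change.

import org.opensearch.common.settings.Settings;
import org.opensearch.index.store.Store;

public final class MetadataHashSizeIndexSettingSketch {

    // Builds index-level settings that override the metadata hash size to 2MB.
    // Because the setting is registered above and declared with Property.Dynamic,
    // it can also be changed later through the index update-settings path.
    public static Settings withTwoMegabyteHashLimit() {
        return Settings.builder()
            .put("index.number_of_shards", 1) // unrelated, shown only for context
            .put(Store.INDEX_STORE_METADATA_HASH_SIZE_SETTING.getKey(), "2mb")
            .build();
    }
}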
diff --git a/server/src/main/java/org/opensearch/index/store/Store.java b/server/src/main/java/org/opensearch/index/store/Store.java
index 0173d2faa46ca..41f51ba6170d5 100644
--- a/server/src/main/java/org/opensearch/index/store/Store.java
+++ b/server/src/main/java/org/opensearch/index/store/Store.java
@@ -83,6 +83,8 @@
 import org.opensearch.core.common.io.stream.StreamInput;
 import org.opensearch.core.common.io.stream.StreamOutput;
 import org.opensearch.core.common.io.stream.Writeable;
+import org.opensearch.core.common.unit.ByteSizeUnit;
+import org.opensearch.core.common.unit.ByteSizeValue;
 import org.opensearch.core.index.shard.ShardId;
 import org.opensearch.env.NodeEnvironment;
 import org.opensearch.env.ShardLock;
@@ -171,6 +173,13 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref
         Property.IndexScope
     );

+    public static final Setting<ByteSizeValue> INDEX_STORE_METADATA_HASH_SIZE_SETTING = Setting.byteSizeSetting(
+        "index.store.metadata_hash.size",
+        new ByteSizeValue(1, ByteSizeUnit.MB),
+        Property.IndexScope,
+        Property.Dynamic
+    );
+
     private final AtomicBoolean isClosed = new AtomicBoolean(false);
     private final StoreDirectory directory;
     private final ReentrantReadWriteLock metadataLock = new ReentrantReadWriteLock();
@@ -361,7 +370,8 @@ public Map<String, StoreFileMetadata> getSegmentMetadataMap(SegmentInfos segment
         assert indexSettings.isSegRepEnabledOrRemoteNode();
         failIfCorrupted();
         try {
-            return loadMetadata(segmentInfos, directory, logger, true).fileMetadata;
+            final ByteSizeValue hashSize = indexSettings.getValue(INDEX_STORE_METADATA_HASH_SIZE_SETTING);
+            return loadMetadata(segmentInfos, directory, logger, true, hashSize).fileMetadata;
         } catch (NoSuchFileException | CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
             markStoreCorrupted(ex);
             throw ex;
@@ -1132,6 +1142,16 @@ public static LoadedMetadata loadMetadata(SegmentInfos segmentInfos, Directory d
     public static LoadedMetadata loadMetadata(SegmentInfos segmentInfos, Directory directory, Logger logger, boolean ignoreSegmentsFile)
         throws IOException {
+        return loadMetadata(segmentInfos, directory, logger, ignoreSegmentsFile, new ByteSizeValue(1, ByteSizeUnit.MB));
+    }
+
+    public static LoadedMetadata loadMetadata(
+        SegmentInfos segmentInfos,
+        Directory directory,
+        Logger logger,
+        boolean ignoreSegmentsFile,
+        ByteSizeValue hashSize
+    ) throws IOException {
         long numDocs = Lucene.getNumDocs(segmentInfos);
         Map<String, String> commitUserDataBuilder = new HashMap<>();
         commitUserDataBuilder.putAll(segmentInfos.getUserData());
@@ -1159,7 +1179,8 @@ public static LoadedMetadata loadMetadata(SegmentInfos segmentInfos, Directory d
                         builder,
                         logger,
                         version,
-                        SEGMENT_INFO_EXTENSION.equals(IndexFileNames.getExtension(file))
+                        SEGMENT_INFO_EXTENSION.equals(IndexFileNames.getExtension(file)),
+                        hashSize
                     );
                 }
             }
@@ -1168,7 +1189,7 @@ public static LoadedMetadata loadMetadata(SegmentInfos segmentInfos, Directory d
         }
         if (ignoreSegmentsFile == false) {
             final String segmentsFile = segmentInfos.getSegmentsFileName();
-            checksumFromLuceneFile(directory, segmentsFile, builder, logger, maxVersion, true);
+            checksumFromLuceneFile(directory, segmentsFile, builder, logger, maxVersion, true, hashSize);
         }
         return new LoadedMetadata(unmodifiableMap(builder), unmodifiableMap(commitUserDataBuilder), numDocs);
     }
@@ -1179,7 +1200,8 @@ private static void checksumFromLuceneFile(
         Map<String, StoreFileMetadata> builder,
         Logger logger,
         Version version,
-        boolean readFileAsHash
+        boolean readFileAsHash,
+        ByteSizeValue hashSize
     ) throws IOException {
         final String checksum;
         final BytesRefBuilder fileHash = new BytesRefBuilder();
@@ -1199,12 +1221,20 @@ private static void checksumFromLuceneFile(
                     in
                 );
             }
-            if (readFileAsHash) {
+            if (readFileAsHash && length <= hashSize.getBytes()) {
                 // additional safety we checksum the entire file we read the hash for...
                 final VerifyingIndexInput verifyingIndexInput = new VerifyingIndexInput(in);
-                hashFile(fileHash, new InputStreamIndexInput(verifyingIndexInput, length), length);
+                hashFile(fileHash, new InputStreamIndexInput(verifyingIndexInput, length), length, hashSize);
                 checksum = digestToString(verifyingIndexInput.verify());
             } else {
+                if (readFileAsHash && length > hashSize.getBytes()) {
+                    logger.warn(
+                        "Skipping hash computation for file [{}] with size [{}] bytes as it exceeds the configured limit of [{}] bytes",
+                        file,
+                        length,
+                        hashSize.getBytes()
+                    );
+                }
                 checksum = digestToString(CodecUtil.retrieveChecksum(in));
             }

@@ -1217,10 +1247,10 @@ private static void checksumFromLuceneFile(
     }

     /**
-     * Computes a strong hash value for small files. Note that this method should only be used for files < 1MB
+     * Computes a strong hash value for files up to the specified size limit.
      */
-    public static void hashFile(BytesRefBuilder fileHash, InputStream in, long size) throws IOException {
-        final int len = (int) Math.min(1024 * 1024, size); // for safety we limit this to 1MB
+    public static void hashFile(BytesRefBuilder fileHash, InputStream in, long size, ByteSizeValue maxSize) throws IOException {
+        final int len = (int) Math.min(maxSize.getBytes(), size);
         fileHash.grow(len);
         fileHash.setLength(len);
         final int readBytes = in.readNBytes(fileHash.bytes(), 0, len);
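To make the new control flow in checksumFromLuceneFile easier to scan: a file is strongly hashed only when a hash was requested and its length fits within the configured limit; otherwise only the Lucene checksum is retrieved and a warning is logged. A self-contained sketch of just that decision in plain Java; all names here are illustrative.

public final class HashDecisionSketch {

    // Mirrors the guard added above: hash the whole file only when a hash was requested
    // and the file is no larger than the configured limit; oversized files fall back to
    // checksum-only metadata (the production code also logs a warning in that case).
    static boolean shouldHashWholeFile(boolean readFileAsHash, long fileLength, long hashLimitBytes) {
        return readFileAsHash && fileLength <= hashLimitBytes;
    }

    public static void main(String[] args) {
        long oneMb = 1024L * 1024L;
        System.out.println(shouldHashWholeFile(true, 512L * 1024L, oneMb));  // true: fits the limit
        System.out.println(shouldHashWholeFile(true, 2L * oneMb, oneMb));    // false: skipped, warning logged
        System.out.println(shouldHashWholeFile(false, 512L * 1024L, oneMb)); // false: hash never requested
    }
}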
diff --git a/server/src/test/java/org/opensearch/index/store/StoreTests.java b/server/src/test/java/org/opensearch/index/store/StoreTests.java
index 7957a3b0d45e0..6f747c324e053 100644
--- a/server/src/test/java/org/opensearch/index/store/StoreTests.java
+++ b/server/src/test/java/org/opensearch/index/store/StoreTests.java
@@ -63,6 +63,7 @@
 import org.apache.lucene.tests.store.BaseDirectoryWrapper;
 import org.apache.lucene.tests.util.TestUtil;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.Version;
 import org.opensearch.ExceptionsHelper;
 import org.opensearch.cluster.metadata.IndexMetadata;
@@ -74,6 +75,8 @@
 import org.opensearch.common.util.io.IOUtils;
 import org.opensearch.core.common.io.stream.InputStreamStreamInput;
 import org.opensearch.core.common.io.stream.OutputStreamStreamOutput;
+import org.opensearch.core.common.unit.ByteSizeUnit;
+import org.opensearch.core.common.unit.ByteSizeValue;
 import org.opensearch.core.index.Index;
 import org.opensearch.core.index.shard.ShardId;
 import org.opensearch.env.ShardLock;
@@ -1345,4 +1348,43 @@ private IndexWriter indexRandomDocs(Store store) throws IOException {
         writer.addDocument(doc);
         return writer;
     }
+
+    public void testConfigurableHashSize() throws IOException {
+        final ShardId shardId = new ShardId("index", "_na_", 1);
+        Settings settings = Settings.builder()
+            .put(Store.INDEX_STORE_METADATA_HASH_SIZE_SETTING.getKey(), "2MB")
+            .put(IndexMetadata.SETTING_VERSION_CREATED, org.opensearch.Version.CURRENT)
+            .build();
+        IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", settings);
+
+        try (Directory directory = newDirectory()) {
+            try (Store store = new Store(shardId, indexSettings, directory, new DummyShardLock(shardId))) {
+                // Verify setting is properly configured - 2MB = 2 * 1024 * 1024 = 2097152 bytes
+                assertEquals(2097152L, indexSettings.getValue(Store.INDEX_STORE_METADATA_HASH_SIZE_SETTING).getBytes());
+            }
+        }
+    }
+
+    public void testHashFileSizeLimit() throws IOException {
+        // Test that hashFile respects the size limit
+        byte[] data = new byte[2048]; // 2KB of data
+        random().nextBytes(data);
+
+        BytesRefBuilder fileHash = new BytesRefBuilder();
+        ByteArrayInputStream inputStream = new ByteArrayInputStream(data);
+
+        // Test with 1KB limit - should only hash first 1KB
+        Store.MetadataSnapshot.hashFile(fileHash, inputStream, data.length, new ByteSizeValue(1, ByteSizeUnit.KB));
+
+        assertEquals(1024, fileHash.length()); // Should be limited to 1KB
+
+        // Reset for next test
+        fileHash = new BytesRefBuilder();
+        inputStream = new ByteArrayInputStream(data);
+
+        // Test with 4KB limit - should hash all 2KB
+        Store.MetadataSnapshot.hashFile(fileHash, inputStream, data.length, new ByteSizeValue(4, ByteSizeUnit.KB));
+
+        assertEquals(2048, fileHash.length()); // Should hash all data
+    }
 }
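Finally, a short usage sketch of the new four-argument Store.MetadataSnapshot.hashFile overload exercised by testHashFileSizeLimit above; the standalone main method and the buffer size are illustrative.

import java.io.ByteArrayInputStream;
import java.io.IOException;

import org.apache.lucene.util.BytesRefBuilder;
import org.opensearch.core.common.unit.ByteSizeUnit;
import org.opensearch.core.common.unit.ByteSizeValue;
import org.opensearch.index.store.Store;

public final class HashFileUsageSketch {

    public static void main(String[] args) throws IOException {
        byte[] data = new byte[4096]; // 4KB payload

        BytesRefBuilder fileHash = new BytesRefBuilder();
        // With a 1KB limit only the first 1024 bytes are copied into the hash buffer,
        // mirroring the assertions in testHashFileSizeLimit.
        Store.MetadataSnapshot.hashFile(fileHash, new ByteArrayInputStream(data), data.length, new ByteSizeValue(1, ByteSizeUnit.KB));
        System.out.println(fileHash.length()); // 1024
    }
}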