Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
429c14a
Integrate stored fields format bloom filter with synthetic _id
fcofdez Nov 24, 2025
38984b6
Update docs/changelog/138515.yaml
fcofdez Nov 24, 2025
24fc7e8
Wrong version
fcofdez Nov 24, 2025
60b1eb7
Merge branch 'new-codec' of github.com:fcofdez/elasticsearch into new…
fcofdez Nov 24, 2025
7f6d6f3
Use same name for codecs
fcofdez Nov 24, 2025
a72a66a
Get rid of StorageMode
fcofdez Nov 24, 2025
9d0a1f8
Some renaming
fcofdez Nov 24, 2025
799fb3a
Remove setting
fcofdez Nov 25, 2025
089f6f5
Merge remote-tracking branch 'origin/main' into new-codec
fcofdez Nov 25, 2025
d0d94ef
Improve use syntheticId check
fcofdez Nov 25, 2025
e01019b
Add comment about read order
fcofdez Nov 25, 2025
e1cbce6
Rename to delegate
fcofdez Nov 25, 2025
680edf1
Initial capacity
fcofdez Nov 25, 2025
3e632b8
Add clarifying comment
fcofdez Nov 25, 2025
279cff3
Add TODO
fcofdez Nov 25, 2025
acc82c1
Add javadoc and rename
fcofdez Nov 25, 2025
cfec7f9
Improve javadoc
fcofdez Nov 25, 2025
92a6daa
Clarify delegates
fcofdez Nov 25, 2025
ad711c3
Merge remote-tracking branch 'origin/main' into new-codec
fcofdez Nov 25, 2025
8826d6d
Merge remote-tracking branch 'origin/main' into new-codec
fcofdez Dec 1, 2025
18e19f5
Review comments
fcofdez Dec 1, 2025
86908c1
Merge remote-tracking branch 'origin/main' into new-codec
fcofdez Dec 2, 2025
e7efa2b
Simplify codecs
fcofdez Dec 2, 2025
c07b92a
Merge remote-tracking branch 'origin/main' into new-codec
fcofdez Dec 3, 2025
1ff2ccd
Remove isFilterAvailable
fcofdez Dec 4, 2025
4a92842
Merge remote-tracking branch 'origin/main' into new-codec
fcofdez Dec 4, 2025
52dd5e0
Merge remote-tracking branch 'origin/main' into new-codec
fcofdez Dec 10, 2025
3afbc55
Fix nits
fcofdez Dec 10, 2025
3f89769
whitespace
fcofdez Dec 10, 2025
810f5f4
Merge remote-tracking branch 'origin/main' into new-codec
fcofdez Dec 10, 2025
83cdf7c
Merge remote-tracking branch 'origin/main' into new-codec
fcofdez Dec 10, 2025
186b4b0
Merge remote-tracking branch 'origin/main' into new-codec
fcofdez Dec 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/138515.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 138515
summary: Integrate stored fields format bloom filter with synthetic `_id`
area: Codec
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.index.IndexMode;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.codec.CodecService;
import org.elasticsearch.index.engine.EngineConfig;
import org.elasticsearch.index.mapper.IdFieldMapper;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.plugins.Plugin;
Expand Down Expand Up @@ -54,6 +56,7 @@
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.nullValue;

/**
* Test suite for time series indices that use synthetic ids for documents.
Expand Down Expand Up @@ -101,6 +104,40 @@ public void testInvalidIndexMode() {
);
}

/**
 * Verifies that enabling {@link IndexSettings#USE_SYNTHETIC_ID} is rejected with an
 * {@link IllegalArgumentException} unless {@code index.codec} is left at [default],
 * since the synthetic-id stored-fields/bloom-filter codec is only wired into the
 * default codec.
 */
public void testInvalidCodec() {
    assumeTrue("Test should only run with feature flag", IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG);
    final var indexName = randomIdentifier();
    internalCluster().startDataOnlyNode();
    // Pick any non-default codec; all of them must be rejected in combination with
    // synthetic ids. Fixed: BEST_COMPRESSION_CODEC was previously listed twice while
    // LEGACY_BEST_COMPRESSION_CODEC was missing, so that codec was never exercised.
    var randomNonDefaultCodec = randomFrom(
        CodecService.BEST_COMPRESSION_CODEC,
        CodecService.LEGACY_DEFAULT_CODEC,
        CodecService.LEGACY_BEST_COMPRESSION_CODEC,
        CodecService.LUCENE_DEFAULT_CODEC
    );

    var exception = expectThrows(
        IllegalArgumentException.class,
        () -> createIndex(
            indexName,
            indexSettings(1, 0).put(IndexSettings.MODE.getKey(), IndexMode.TIME_SERIES)
                .put("index.routing_path", "hostname")
                .put(IndexSettings.USE_SYNTHETIC_ID.getKey(), true)
                .put(EngineConfig.INDEX_CODEC_SETTING.getKey(), randomNonDefaultCodec)
                .build()
        )
    );
    // The message must match the one built by the USE_SYNTHETIC_ID validator in IndexSettings.
    assertThat(
        exception.getMessage(),
        containsString(
            "The setting ["
                + IndexSettings.USE_SYNTHETIC_ID.getKey()
                + "] is only permitted when [index.codec] is set to [default]. Current mode: ["
                + randomNonDefaultCodec
                + "]."
        )
    );
}

public void testSyntheticId() throws Exception {
assumeTrue("Test should only run with feature flag", IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG);
final var dataStreamName = randomIdentifier();
Expand Down Expand Up @@ -260,12 +297,19 @@ enum Operation {

flush(dataStreamName);

// TODO: Restart the node or relocate the shard randomly

// Check that synthetic _id field have no postings on disk
var indices = new HashSet<>(docs.values());
for (var index : indices) {
var diskUsage = diskUsage(index);
var diskUsageIdField = AnalyzeIndexDiskUsageTestUtils.getPerFieldDiskUsage(diskUsage, IdFieldMapper.NAME);
assertThat("_id field should not have postings on disk", diskUsageIdField.getInvertedIndexBytes(), equalTo(0L));
// When _ids are only used to populate the bloom filter,
// IndexDiskUsageStats won't account for anything, since
// the bloom filter isn't exposed through the Reader API and
// the analyzer expects to get documents with fields to do the
// disk usage accounting.
assertThat(diskUsageIdField, nullValue());
}
}

Expand Down Expand Up @@ -376,7 +420,12 @@ public void testGetFromTranslogBySyntheticId() throws Exception {
for (var index : indices) {
var diskUsage = diskUsage(index);
var diskUsageIdField = AnalyzeIndexDiskUsageTestUtils.getPerFieldDiskUsage(diskUsage, IdFieldMapper.NAME);
assertThat("_id field should not have postings on disk", diskUsageIdField.getInvertedIndexBytes(), equalTo(0L));
// When _ids are only used to populate the bloom filter,
// IndexDiskUsageStats won't account for anything, since
// the bloom filter isn't exposed through the Reader API and
// the analyzer expects to get documents with fields to do the
// disk usage accounting.
assertThat(diskUsageIdField, nullValue());
}

assertHitCount(client().prepareSearch(dataStreamName).setSize(0), 10L);
Expand Down
4 changes: 3 additions & 1 deletion server/src/main/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@
exports org.elasticsearch.index.codec;
exports org.elasticsearch.index.codec.tsdb;
exports org.elasticsearch.index.codec.bloomfilter;
exports org.elasticsearch.index.codec.storedfields;
exports org.elasticsearch.index.codec.zstd;
exports org.elasticsearch.index.engine;
exports org.elasticsearch.index.fielddata;
Expand Down Expand Up @@ -481,7 +482,8 @@
org.elasticsearch.index.codec.Elasticsearch816Codec,
org.elasticsearch.index.codec.Elasticsearch900Codec,
org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec,
org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec;
org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec,
org.elasticsearch.index.codec.ES93TSDBDefaultCompressionLucene103Codec;

provides org.apache.logging.log4j.core.util.ContextDataProvider with org.elasticsearch.common.logging.DynamicContextDataProvider;

Expand Down
18 changes: 17 additions & 1 deletion server/src/main/java/org/elasticsearch/index/IndexSettings.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.elasticsearch.common.util.FeatureFlag;
import org.elasticsearch.core.Booleans;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.index.codec.CodecService;
import org.elasticsearch.index.mapper.IgnoredSourceFieldMapper;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.SeqNoFieldMapper;
Expand All @@ -49,6 +50,7 @@

import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_INDEX_VERSION_CREATED;
import static org.elasticsearch.cluster.routing.allocation.ExistingShardsAllocator.EXISTING_SHARDS_ALLOCATOR_SETTING;
import static org.elasticsearch.index.engine.EngineConfig.INDEX_CODEC_SETTING;
import static org.elasticsearch.index.mapper.MapperService.INDEX_MAPPING_DEPTH_LIMIT_SETTING;
import static org.elasticsearch.index.mapper.MapperService.INDEX_MAPPING_DIMENSION_FIELDS_LIMIT_SETTING;
import static org.elasticsearch.index.mapper.MapperService.INDEX_MAPPING_FIELD_NAME_LENGTH_LIMIT_SETTING;
Expand Down Expand Up @@ -715,12 +717,26 @@ public void validate(Boolean enabled, Map<Setting<?>, Object> settings) {
)
);
}

var codecName = (String) settings.get(INDEX_CODEC_SETTING);
if (codecName.equals(CodecService.DEFAULT_CODEC) == false) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"The setting [%s] is only permitted when [%s] is set to [%s]. Current mode: [%s].",
USE_SYNTHETIC_ID.getKey(),
INDEX_CODEC_SETTING.getKey(),
CodecService.DEFAULT_CODEC,
codecName
)
);
}
}
}

@Override
public Iterator<Setting<?>> settings() {
List<Setting<?>> list = List.of(MODE);
List<Setting<?>> list = List.of(MODE, INDEX_CODEC_SETTING);
return list.iterator();
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ private static Version parseUnchecked(String version) {
public static final IndexVersion SKIPPER_DEFAULTS_ONLY_ON_TSDB = def(9_052_0_00, Version.LUCENE_10_3_2);
public static final IndexVersion DISK_BBQ_LICENSE_ENFORCEMENT = def(9_053_0_00, Version.LUCENE_10_3_2);
public static final IndexVersion STORE_IGNORED_KEYWORDS_IN_BINARY_DOC_VALUES = def(9_054_0_00, Version.LUCENE_10_3_2);
public static final IndexVersion TIME_SERIES_USE_STORED_FIELDS_BLOOM_FILTER_FOR_ID = def(9_055_0_00, Version.LUCENE_10_3_2);

/*
* STOP! READ THIS FIRST! No, really,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.FeatureFlag;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.index.IndexMode;
import org.elasticsearch.index.codec.tsdb.TSDBSyntheticIdCodec;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.codec.zstd.Zstd814StoredFieldsFormat;
import org.elasticsearch.index.mapper.MapperService;

Expand Down Expand Up @@ -48,8 +47,17 @@ public class CodecService implements CodecProvider {
public CodecService(@Nullable MapperService mapperService, BigArrays bigArrays) {
final var codecs = new HashMap<String, Codec>();

Codec legacyBestSpeedCodec = new LegacyPerFieldMapperCodec(Lucene103Codec.Mode.BEST_SPEED, mapperService, bigArrays);
if (ZSTD_STORED_FIELDS_FEATURE_FLAG) {
boolean useSyntheticId = mapperService != null
&& mapperService.getIndexSettings().useTimeSeriesSyntheticId()
&& mapperService.getIndexSettings()
.getIndexVersionCreated()
.onOrAfter(IndexVersions.TIME_SERIES_USE_STORED_FIELDS_BLOOM_FILTER_FOR_ID);

var legacyBestSpeedCodec = new LegacyPerFieldMapperCodec(Lucene103Codec.Mode.BEST_SPEED, mapperService, bigArrays);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe just having one useSyntheticId if statement with an else clause is clearer than having useSyntheticId checks in several places?

if (useSyntheticId) {
// Use the default Lucene compression when the synthetic id is used even if the ZSTD feature flag is enabled
codecs.put(DEFAULT_CODEC, new ES93TSDBDefaultCompressionLucene103Codec(legacyBestSpeedCodec, bigArrays));
} else if (ZSTD_STORED_FIELDS_FEATURE_FLAG) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should remove this feature flag. I don't think we will use zstd for the default codec in the near-term future.
But I don't think this affects this PR.

codecs.put(DEFAULT_CODEC, new PerFieldMapperCodec(Zstd814StoredFieldsFormat.Mode.BEST_SPEED, mapperService, bigArrays));
} else {
codecs.put(DEFAULT_CODEC, legacyBestSpeedCodec);
Expand All @@ -67,8 +75,6 @@ public CodecService(@Nullable MapperService mapperService, BigArrays bigArrays)
for (String codec : Codec.availableCodecs()) {
codecs.put(codec, Codec.forName(codec));
}
final boolean useTsdbSyntheticId = mapperService != null && mapperService.getIndexSettings().useTimeSeriesSyntheticId();
assert useTsdbSyntheticId == false || mapperService.getIndexSettings().getMode() == IndexMode.TIME_SERIES;

this.codecs = codecs.entrySet().stream().collect(Collectors.toUnmodifiableMap(Map.Entry::getKey, e -> {
Codec codec;
Expand All @@ -77,9 +83,6 @@ public CodecService(@Nullable MapperService mapperService, BigArrays bigArrays)
} else {
codec = new DeduplicateFieldInfosCodec(e.getValue().getName(), e.getValue());
}
if (useTsdbSyntheticId && codec instanceof TSDBSyntheticIdCodec == false) {
codec = new TSDBSyntheticIdCodec(codec.getName(), codec);
}
return codec;
}));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.index.codec;

import org.apache.lucene.codecs.lucene103.Lucene103Codec;
import org.elasticsearch.common.util.BigArrays;

/**
 * Codec for time-series indices that use synthetic {@code _id}s, combining the synthetic-id
 * handling and bloom-filter-backed stored fields format of {@code TSDBCodecWithSyntheticId}
 * with the default Lucene compression by delegating to a plain {@link Lucene103Codec}.
 */
public class ES93TSDBDefaultCompressionLucene103Codec extends TSDBCodecWithSyntheticId {
    /** Public no-arg constructor, needed for SPI loading at read-time. */
    // NOTE(review): bigArrays is null on the SPI read path — presumably the bloom filter only
    // needs BigArrays at write time; confirm TSDBCodecWithSyntheticId tolerates null here.
    public ES93TSDBDefaultCompressionLucene103Codec() {
        this(new Lucene103Codec(), null);
    }

    /**
     * @param delegate  the Lucene codec providing all formats not overridden by the parent class
     * @param bigArrays arrays used to back the bloom filter when writing stored fields
     */
    ES93TSDBDefaultCompressionLucene103Codec(Lucene103Codec delegate, BigArrays bigArrays) {
        // The codec name is registered via SPI and recorded in segment metadata; it must stay stable.
        super("ES93TSDBDefaultCompressionLucene103Codec", delegate, bigArrays);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.index.codec;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.index.codec.bloomfilter.ES93BloomFilterStoredFieldsFormat;
import org.elasticsearch.index.codec.storedfields.TSDBStoredFieldsFormat;
import org.elasticsearch.index.codec.tsdb.TSDBSyntheticIdCodec;
import org.elasticsearch.index.mapper.IdFieldMapper;

/**
* Abstract base class for ES codecs used with time-series ({@code TIME_SERIES}) indices
* that employ synthetic document IDs for storage optimization.
*
* <p>This class configures the codec to use the following formats:
* <ul>
* <li>
* Use {@link TSDBSyntheticIdCodec} as the underlying codec for synthesizing the `_id` field from
* the values of other fields of the document (ex: _tsid, @timestamp, etc.) so that no inverted index
* or stored fields are required for the `_id`. As such, looking up documents by `_id` might be very
* slow and that's why it is used along with a Bloom filter.
* </li>
* <li>
* Apply {@link TSDBStoredFieldsFormat} with bloom filter optimization for efficient ID lookups
* </li>
* </ul>
*
* <p>Synthetic IDs in TSDB indices are generated from the document's dimensions and timestamp,
* replacing the standard {@code _id} field to reduce storage overhead.
*
* @see TSDBSyntheticIdCodec
* @see TSDBStoredFieldsFormat
*/
abstract class TSDBCodecWithSyntheticId extends FilterCodec {
private final TSDBStoredFieldsFormat storedFieldsFormat;

TSDBCodecWithSyntheticId(String name, Codec delegate, BigArrays bigArrays) {
super(name, new TSDBSyntheticIdCodec(delegate));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can merge TSDBSyntheticIdCodec and TSDBCodecWithSyntheticId together in a follow up.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I just saw your #138515 (comment) 👍

Copy link
Contributor Author

@fcofdez fcofdez Nov 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm planning to incorporate the code from TSDBSyntheticIdCodec into this class in a follow-up PR. But I wanted to keep the change size under control.

this.storedFieldsFormat = new TSDBStoredFieldsFormat(
delegate.storedFieldsFormat(),
new ES93BloomFilterStoredFieldsFormat(
bigArrays,
ES93BloomFilterStoredFieldsFormat.DEFAULT_BLOOM_FILTER_SIZE,
IdFieldMapper.NAME
)
);
}

@Override
public StoredFieldsFormat storedFieldsFormat() {
return storedFieldsFormat;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.index.codec.bloomfilter;

import org.apache.lucene.util.BytesRef;

import java.io.Closeable;
import java.io.IOException;

public interface BloomFilter extends Closeable {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mainly a style thing, but I think this would be nicer as a function interface with a no-op static implementation. Something like:

static BloomFilter NO_FILTER = (field, term) -> true;

And then rather than checking for isFilterAvailable() you check for == BloomFilter.NO_FILTER.

Also I don't think it needs to be Closeable any more?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for looking into this @romseygeek, I've removed the isFilterAvailable method as it is indeed redundant (see 1ff2ccd). Regarding the Closeable, we need it for DelegatingBloomFilterFieldsProducer to close the underlying bloom filter file once it's done with it.

BloomFilter NO_FILTER = new BloomFilter() {
@Override
public void close() throws IOException {

}

@Override
public boolean mayContainTerm(String field, BytesRef term) throws IOException {
return true;
}
};

/**
* Tests whether the given term may exist in the specified field.
*
* @param field the field name to check
* @param term the term to test for membership
* @return true if term may be present, false if definitely absent
*/
boolean mayContainTerm(String field, BytesRef term) throws IOException;
}
Loading