Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,24 @@ public TokenStream create(TokenStream tokenStream) {
return synonyms.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonyms, false);
}

@Override
public TokenFilterFactory getSynonymFilter() {
    // While a synonym filter's rules are being parsed, any synonym filters that come
    // earlier in the chain must be neutralised. If they stayed active, the analyzer
    // used during the building phase would already expand synonyms, so each filter's
    // rules would be re-expanded by every preceding filter. With chained filters
    // (synonym_A -> synonym_B -> synonym_C) that compounds every SynonymMap, drives
    // quadratic work in SynonymGraphFilter.bufferOutputTokens(), and can exhaust the
    // heap (OutOfMemoryError) during analyzer reload.
    //
    // Returning the identity filter here mirrors SynonymTokenFilterFactory and keeps
    // rule parsing free of previously-registered synonym expansion — critical for
    // users who chain many synonym sets.
    return IDENTITY_FILTER;
}

@Override
public AnalysisMode getAnalysisMode() {
return analysisMode;
Expand All @@ -77,5 +95,4 @@ public String getResourceName() {
}
};
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,11 @@
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.BiConsumer;

Expand Down Expand Up @@ -333,6 +335,109 @@ public void testChainedSynonymFilters() throws IOException {
);
}

public void testChainedSynonymGraphFilters() throws IOException {
    // Analyzer chaining three synonym_graph filters behind a lowercase filter.
    Settings chainSettings = Settings.builder()
        .put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
        .put("path.home", createTempDir().toString())
        .put("index.analysis.filter.synonyms1.type", "synonym_graph")
        .putList("index.analysis.filter.synonyms1.synonyms", "foo, bar")
        .put("index.analysis.filter.synonyms2.type", "synonym_graph")
        .putList("index.analysis.filter.synonyms2.synonyms", "baz, qux")
        .put("index.analysis.filter.synonyms3.type", "synonym_graph")
        .putList("index.analysis.filter.synonyms3.synonyms", "hello, world")
        .put("index.analysis.analyzer.syn.tokenizer", "standard")
        .putList("index.analysis.analyzer.syn.filter", "lowercase", "synonyms1", "synonyms2", "synonyms3")
        .build();
    IndexSettings chainIndexSettings = IndexSettingsModule.newIndexSettings("index", chainSettings);
    indexAnalyzers = createTestAnalysis(chainIndexSettings, chainSettings, new CommonAnalysisPlugin()).indexAnalyzers;

    // A single matching word: synonym_graph stacks the synonym and the original
    // token at the same position (synonym first, then original).
    BaseTokenStreamTestCase.assertAnalyzesTo(
        indexAnalyzers.get("syn"),
        "foo",
        new String[] { "bar", "foo" },
        new int[] { 0, 0 },   // start offsets
        new int[] { 3, 3 },   // end offsets
        new int[] { 1, 0 }    // position increments
    );

    // Multi-word input: each of the three chained filters must fire on its own word,
    // producing a stacked pair per input token.
    BaseTokenStreamTestCase.assertAnalyzesTo(
        indexAnalyzers.get("syn"),
        "foo baz hello",
        new String[] { "bar", "foo", "qux", "baz", "world", "hello" },
        new int[] { 0, 0, 4, 4, 8, 8 },      // start offsets
        new int[] { 3, 3, 7, 7, 13, 13 },    // end offsets
        new int[] { 1, 0, 1, 0, 1, 0 }       // position increments: each pair shares a position
    );
}

public void testManyChainedSynonymGraphFilters() throws IOException {
Settings.Builder settingsBuilder = Settings.builder()
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
.put("path.home", createTempDir().toString());

String[] vocab = randomArray(50_000, 100_000, String[]::new, () -> randomAlphanumericOfLength(20));
int synonymsPerFilter = 10_000;
int synonymSets = 100;
List<String> filterNames = new ArrayList<>();
filterNames.add("lowercase");

for (int i = 1; i <= synonymSets; i++) {
String filterName = "synonyms_" + i;
StringBuilder sb = new StringBuilder();

for (int j = 0; j < synonymsPerFilter; j++) {
if (j > 0) {
sb.append("\n");
}
for (int k = 0; k < between(1, 3); k++) {
if (k > 0) {
sb.append(", ");
}
for (int l = 0; l < between(1, 3); l++) {
if (l > 0) {
sb.append(" ");
}
sb.append(randomFrom(vocab));
}
}

sb.append(" => ");
sb.append("syn").append(i * (j + 1)); // Shared ID appears in ALL filters
}

filterNames.add(filterName);
settingsBuilder.put("index.analysis.filter." + filterName + ".type", "synonym_graph")
.put("index.analysis.filter." + filterName + ".lenient", true)
.putList("index.analysis.filter." + filterName + ".synonyms", sb.toString());
}

settingsBuilder.put("index.analysis.analyzer.many_syn.tokenizer", "standard")
.putList("index.analysis.analyzer.many_syn.filter", filterNames);

Settings settings = settingsBuilder.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);

long startTime = System.currentTimeMillis();

// This would OOM without the SynonymGraphTokenFilterFactory::getSynonymFilter() fix (filters built sequentially)
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;

// Verify the analyzer was built successfully and can analyze text
// With cross-referencing synonyms, the exact output is complex, so just verify it works
Analyzer analyzer = indexAnalyzers.get("many_syn");
assertNotNull("Analyzer should be created", analyzer);

for (int i = 0; i < 1000; i++) {
// Test that it can analyze without throwing exceptions
TokenStream ts = analyzer.tokenStream("test", randomFrom(vocab));
ts.reset();
assertTrue("Should produce at least one token", ts.incrementToken());
ts.close();
}
Copy link
Contributor

@markjhoy markjhoy Dec 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to help the test GC along, you may want to place:

indexAnalyzers = null;

at the end of the test. (context: playing with the test, I had it loop through the entire test 20 times, and it OOM'd for me... adding this inside the loop, it passed). Not saying an OOM will happen during a CI test run with just the one time through in the test, but it can't hurt just in case.

}

public void testShingleFilters() {

Settings settings = Settings.builder()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
---
"Test chained synonym_graph filters":
  # Create an index whose analyzer chains three synonym_graph filters after lowercase.
  - do:
      indices.create:
        index: test_chained_synonyms
        body:
          settings:
            analysis:
              filter:
                synonyms_a:
                  type: synonym_graph
                  synonyms: [ "foo, bar" ]
                synonyms_b:
                  type: synonym_graph
                  synonyms: [ "baz, qux" ]
                synonyms_c:
                  type: synonym_graph
                  synonyms: [ "hello, world" ]
              analyzer:
                chained_syn:
                  tokenizer: standard
                  filter: [ lowercase, synonyms_a, synonyms_b, synonyms_c ]
          mappings:
            properties:
              text:
                type: text
                analyzer: chained_syn

  - do:
      index:
        index: test_chained_synonyms
        id: "1"
        body:
          text: "foo baz hello"
        refresh: true

  # Test that all three chained synonym filters work correctly:
  # the document indexed as "foo baz hello" must match the synonym terms.
  - do:
      search:
        index: test_chained_synonyms
        body:
          query:
            match:
              text: "bar qux world"
  - match: { hits.total.value: 1 }
  - match: { hits.hits.0._id: "1" }

  # Verify analyzer behavior - synonym_graph produces synonym first, then original,
  # both stacked at the same position.
  - do:
      indices.analyze:
        index: test_chained_synonyms
        body:
          text: "foo"
          analyzer: chained_syn
  - length: { tokens: 2 }
  - match: { tokens.0.position: 0 }
  - match: { tokens.1.position: 0 }

  # Test with multi-word query to verify all chained filters work together
  - do:
      indices.analyze:
        index: test_chained_synonyms
        body:
          text: "foo baz hello"
          analyzer: chained_syn
  # Each word that matches a synonym expands to 2 tokens: foo→(foo,bar), baz→(baz,qux), hello→(hello,world)
  - length: { tokens: 6 }
  # Verify positions: each synonym pair at same position
  - match: { tokens.0.position: 0 }
  - match: { tokens.1.position: 0 }
  - match: { tokens.2.position: 1 }
  - match: { tokens.3.position: 1 }
  - match: { tokens.4.position: 2 }
  - match: { tokens.5.position: 2 }
  # Verify token content - synonym_graph produces synonym first, then original
  - match: { tokens.0.token: "bar" }
  - match: { tokens.1.token: "foo" }
  - match: { tokens.2.token: "qux" }
  - match: { tokens.3.token: "baz" }
  - match: { tokens.4.token: "world" }
  - match: { tokens.5.token: "hello" }

  # Test that analyzer reload doesn't cause issues (critical for the fix):
  # closing and reopening the index rebuilds the chained synonym filters.
  - do:
      indices.close:
        index: test_chained_synonyms

  - do:
      indices.open:
        index: test_chained_synonyms

  # Verify it still works after reload
  - do:
      search:
        index: test_chained_synonyms
        body:
          query:
            match:
              text: "bar qux world"
  - match: { hits.total.value: 1 }
  - match: { hits.hits.0._id: "1" }