diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymGraphTokenFilterFactory.java index d6d4e644b06b2..2a6d30511feda 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymGraphTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SynonymGraphTokenFilterFactory.java @@ -66,6 +66,24 @@ public TokenStream create(TokenStream tokenStream) { return synonyms.fst == null ? tokenStream : new SynonymGraphFilter(tokenStream, synonyms, false); } + @Override + public TokenFilterFactory getSynonymFilter() { + // When building a synonym filter, we must prevent previous synonym filters in the chain + // from being active, as this would cause recursive synonym expansion during the building phase. + // + // Without this fix, when chaining multiple synonym filters (e.g., synonym_A → synonym_B → synonym_C), + // building synonym_C would use an analyzer containing active synonym_A and synonym_B filters. + // This causes: + // 1. Recursive synonym expansion when parsing synonym rules (e.g., synonyms are expanded via previous filters) + // 2. Each SynonymMap inflates since it applies all previous synonym rules again + // 3. Triggering O(n²) operations in SynonymGraphFilter.bufferOutputTokens() + // 4. Massive memory allocation during analyzer reload → OutOfMemoryError + // + // This matches the behavior of SynonymTokenFilterFactory and prevents OOM with chained + // synonym filters (critical for users with many chained synonym sets). 
+ return IDENTITY_FILTER; + } + @Override public AnalysisMode getAnalysisMode() { return analysisMode; @@ -77,5 +95,4 @@ public String getResourceName() { } }; } - } diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java index af57b8270ff02..766e51068d790 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/SynonymsAnalysisTests.java @@ -34,9 +34,11 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; +import java.util.List; import java.util.Set; import java.util.function.BiConsumer; @@ -333,6 +335,109 @@ public void testChainedSynonymFilters() throws IOException { ); } + public void testChainedSynonymGraphFilters() throws IOException { + Settings settings = Settings.builder() + .put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current()) + .put("path.home", createTempDir().toString()) + .put("index.analysis.filter.synonyms1.type", "synonym_graph") + .putList("index.analysis.filter.synonyms1.synonyms", "foo, bar") + .put("index.analysis.filter.synonyms2.type", "synonym_graph") + .putList("index.analysis.filter.synonyms2.synonyms", "baz, qux") + .put("index.analysis.filter.synonyms3.type", "synonym_graph") + .putList("index.analysis.filter.synonyms3.synonyms", "hello, world") + .put("index.analysis.analyzer.syn.tokenizer", "standard") + .putList("index.analysis.analyzer.syn.filter", "lowercase", "synonyms1", "synonyms2", "synonyms3") + .build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + indexAnalyzers = createTestAnalysis(idxSettings, settings, new 
CommonAnalysisPlugin()).indexAnalyzers; + + // Test single word - synonym_graph produces both original and synonym at same position + BaseTokenStreamTestCase.assertAnalyzesTo( + indexAnalyzers.get("syn"), + "foo", + new String[] { "bar", "foo" }, + new int[] { 0, 0 }, // start offsets + new int[] { 3, 3 }, // end offsets + new int[] { 1, 0 } // position increments + ); + + // Test multi-word query with all three filters active + BaseTokenStreamTestCase.assertAnalyzesTo( + indexAnalyzers.get("syn"), + "foo baz hello", + new String[] { "bar", "foo", "qux", "baz", "world", "hello" }, + new int[] { 0, 0, 4, 4, 8, 8 }, // start offsets + new int[] { 3, 3, 7, 7, 13, 13 }, // end offsets + new int[] { 1, 0, 1, 0, 1, 0 } // position increments: each synonym pair at same position + ); + } + + public void testManyChainedSynonymGraphFilters() throws IOException { + Settings.Builder settingsBuilder = Settings.builder() + .put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current()) + .put("path.home", createTempDir().toString()); + + String[] vocab = randomArray(50_000, 100_000, String[]::new, () -> randomAlphanumericOfLength(20)); + int synonymsPerFilter = 10_000; + int synonymSets = 100; + List<String> filterNames = new ArrayList<>(); + filterNames.add("lowercase"); + + for (int i = 1; i <= synonymSets; i++) { + String filterName = "synonyms_" + i; + StringBuilder sb = new StringBuilder(); + + for (int j = 0; j < synonymsPerFilter; j++) { + if (j > 0) { + sb.append("\n"); + } + for (int k = 0; k < between(1, 3); k++) { + if (k > 0) { + sb.append(", "); + } + for (int l = 0; l < between(1, 3); l++) { + if (l > 0) { + sb.append(" "); + } + sb.append(randomFrom(vocab)); + } + } + + sb.append(" => "); + sb.append("syn").append(i * (j + 1)); // IDs overlap across filters (filter i emits multiples of i), so rules chain between synonym sets + } + + filterNames.add(filterName); + settingsBuilder.put("index.analysis.filter." + filterName + ".type", "synonym_graph") + .put("index.analysis.filter." 
+ filterName + ".lenient", true) + .putList("index.analysis.filter." + filterName + ".synonyms", sb.toString()); + } + + settingsBuilder.put("index.analysis.analyzer.many_syn.tokenizer", "standard") + .putList("index.analysis.analyzer.many_syn.filter", filterNames); + + Settings settings = settingsBuilder.build(); + IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + + long startTime = System.currentTimeMillis(); + + // This would OOM without the SynonymGraphTokenFilterFactory::getSynonymFilter() fix (filters built sequentially) + indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers; + + // Verify the analyzer was built successfully and can analyze text + // With cross-referencing synonyms, the exact output is complex, so just verify it works + Analyzer analyzer = indexAnalyzers.get("many_syn"); + assertNotNull("Analyzer should be created", analyzer); + + for (int i = 0; i < 1000; i++) { + // Test that it can analyze without throwing exceptions + TokenStream ts = analyzer.tokenStream("test", randomFrom(vocab)); + ts.reset(); + assertTrue("Should produce at least one token", ts.incrementToken()); + ts.close(); + } + } + public void testShingleFilters() { Settings settings = Settings.builder() diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/50_chained_synonym_filters.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/50_chained_synonym_filters.yml new file mode 100644 index 0000000000000..5603c933bef4b --- /dev/null +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/50_chained_synonym_filters.yml @@ -0,0 +1,101 @@ +--- +"Test chained synonym_graph filters": + - do: + indices.create: + index: test_chained_synonyms + body: + settings: + analysis: + filter: + synonyms_a: + type: synonym_graph + synonyms: [ "foo, bar" ] + synonyms_b: + type: synonym_graph + 
synonyms: [ "baz, qux" ] + synonyms_c: + type: synonym_graph + synonyms: [ "hello, world" ] + analyzer: + chained_syn: + tokenizer: standard + filter: [ lowercase, synonyms_a, synonyms_b, synonyms_c ] + mappings: + properties: + text: + type: text + analyzer: chained_syn + + - do: + index: + index: test_chained_synonyms + id: "1" + body: + text: "foo baz hello" + refresh: true + + # Test that all three chained synonym filters work correctly + - do: + search: + index: test_chained_synonyms + body: + query: + match: + text: "bar qux world" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + # Verify analyzer behavior - synonym_graph produces synonym first, then original + - do: + indices.analyze: + index: test_chained_synonyms + body: + text: "foo" + analyzer: chained_syn + - length: { tokens: 2 } + - match: { tokens.0.position: 0 } + - match: { tokens.1.position: 0 } + + # Test with multi-word query to verify all chained filters work together + - do: + indices.analyze: + index: test_chained_synonyms + body: + text: "foo baz hello" + analyzer: chained_syn + # Each word that matches a synonym expands to 2 tokens: foo→(foo,bar), baz→(baz,qux), hello→(hello,world) + - length: { tokens: 6 } + # Verify positions: each synonym pair at same position + - match: { tokens.0.position: 0 } + - match: { tokens.1.position: 0 } + - match: { tokens.2.position: 1 } + - match: { tokens.3.position: 1 } + - match: { tokens.4.position: 2 } + - match: { tokens.5.position: 2 } + # Verify token content - synonym_graph produces synonym first, then original + - match: { tokens.0.token: "bar" } + - match: { tokens.1.token: "foo" } + - match: { tokens.2.token: "qux" } + - match: { tokens.3.token: "baz" } + - match: { tokens.4.token: "world" } + - match: { tokens.5.token: "hello" } + + # Test that analyzer reload doesn't cause issues (critical for the fix) + - do: + indices.close: + index: test_chained_synonyms + + - do: + indices.open: + index: test_chained_synonyms + + # 
Verify it still works after reload + - do: + search: + index: test_chained_synonyms + body: + query: + match: + text: "bar qux world" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" }