-
Notifications
You must be signed in to change notification settings - Fork 0
[main] Fix an OOM error when creating to many chained synonym graph token filter. (#140026) #11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: copilot_only-issues-20260113-qodo-grep-copilot_base_main_fix_an_oom_error_when_creating_to_many_chained_synonym_graph_token_filter_140026_pr68
Are you sure you want to change the base?
Changes from all commits
930ce6e
3538330
e907faa
8aaa34f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,9 +34,11 @@ | |
| import java.io.InputStream; | ||
| import java.nio.file.Files; | ||
| import java.nio.file.Path; | ||
| import java.util.ArrayList; | ||
| import java.util.Arrays; | ||
| import java.util.Collections; | ||
| import java.util.HashSet; | ||
| import java.util.List; | ||
| import java.util.Set; | ||
| import java.util.function.BiConsumer; | ||
|
|
||
|
|
@@ -333,6 +335,107 @@ public void testChainedSynonymFilters() throws IOException { | |
| ); | ||
| } | ||
|
|
||
| public void testChainedSynonymGraphFilters() throws IOException { | ||
| Settings settings = Settings.builder() | ||
| .put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current()) | ||
| .put("path.home", createTempDir().toString()) | ||
| .put("index.analysis.filter.synonyms1.type", "synonym_graph") | ||
| .putList("index.analysis.filter.synonyms1.synonyms", "foo, bar") | ||
| .put("index.analysis.filter.synonyms2.type", "synonym_graph") | ||
| .putList("index.analysis.filter.synonyms2.synonyms", "baz, qux") | ||
| .put("index.analysis.filter.synonyms3.type", "synonym_graph") | ||
| .putList("index.analysis.filter.synonyms3.synonyms", "hello, world") | ||
| .put("index.analysis.analyzer.syn.tokenizer", "standard") | ||
| .putList("index.analysis.analyzer.syn.filter", "lowercase", "synonyms1", "synonyms2", "synonyms3") | ||
| .build(); | ||
| IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); | ||
| indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers; | ||
|
|
||
| // Test single word - synonym_graph produces both original and synonym at same position | ||
| BaseTokenStreamTestCase.assertAnalyzesTo( | ||
| indexAnalyzers.get("syn"), | ||
| "foo", | ||
| new String[] { "foo", "bar" }, | ||
| new int[] { 0, 0 }, // start offsets | ||
| new int[] { 3, 3 }, // end offsets | ||
| new int[] { 1, 0 } // position increments | ||
| ); | ||
|
|
||
| // Test multi-word query with all three filters active | ||
| BaseTokenStreamTestCase.assertAnalyzesTo( | ||
| indexAnalyzers.get("syn"), | ||
| "foo baz hello", | ||
| new String[] { "bar", "foo", "qux", "baz", "world", "hello" }, | ||
| new int[] { 0, 0, 4, 4, 8, 8 }, // start offsets | ||
| new int[] { 3, 3, 7, 7, 13, 13 }, // end offsets | ||
| new int[] { 1, 0, 1, 0, 1, 0 } // position increments: each synonym pair at same position | ||
| ); | ||
| } | ||
|
|
||
| public void testManyChainedSynonymGraphFilters() throws IOException { | ||
| Settings.Builder settingsBuilder = Settings.builder() | ||
| .put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current()) | ||
| .put("path.home", createTempDir().toString()); | ||
|
|
||
| String[] vocab = randomArray(50_000, 100_000, String[]::new, () -> randomAlphanumericOfLength(20)); | ||
| int synonymsPerFilter = 10_000; | ||
| int synonymSets = 10; | ||
| List<String> filterNames = new ArrayList<>(); | ||
| filterNames.add("lowercase"); | ||
|
|
||
| for (int i = 1; i <= synonymSets; i++) { | ||
| String filterName = "synonyms_" + i; | ||
| StringBuilder sb = new StringBuilder(); | ||
|
|
||
| for (int j = 0; j < synonymsPerFilter; j++) { | ||
| if (j > 0) { | ||
| sb.append("\n"); | ||
| } | ||
| for (int k = 0; k < between(1, 3); k++) { | ||
| if (k > 0) { | ||
| sb.append(", "); | ||
| } | ||
| for (int l = 0; l < between(1, 3); l++) { | ||
| if (l > 0) { | ||
| sb.append(" "); | ||
| } | ||
| sb.append(randomFrom(vocab)); | ||
| } | ||
| } | ||
|
|
||
| sb.append(" => "); | ||
| sb.append("syn").append(i * (j + 1)); // Shared ID appears in ALL filters | ||
|
||
| } | ||
|
|
||
| filterNames.add(filterName); | ||
| settingsBuilder.put("index.analysis.filter." + filterName + ".type", "synonym_graph") | ||
| .put("index.analysis.filter." + filterName + ".lenient", true) | ||
| .putList("index.analysis.filter." + filterName + ".synonyms", sb.toString()); | ||
| } | ||
|
|
||
| settingsBuilder.put("index.analysis.analyzer.many_syn.tokenizer", "standard") | ||
| .putList("index.analysis.analyzer.many_syn.filter", filterNames); | ||
|
|
||
| Settings settings = settingsBuilder.build(); | ||
| IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); | ||
|
|
||
| // This would OOM without the SynonymGraphTokenFilterFactory::getSynonymFilter() fix (filters built sequentially) | ||
| indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers; | ||
|
|
||
| // Verify the analyzer was built successfully and can analyze text | ||
| // With cross-referencing synonyms, the exact output is complex, so just verify it works | ||
| Analyzer analyzer = indexAnalyzers.get("many_syn"); | ||
| assertNotNull("Analyzer should be created", analyzer); | ||
|
|
||
| for (int i = 0; i < 1000; i++) { | ||
| // Test that it can analyze without throwing exceptions | ||
| TokenStream ts = analyzer.tokenStream("test", randomFrom(vocab)); | ||
| ts.reset(); | ||
| assertTrue("Should produce at least one token", ts.incrementToken()); | ||
| ts.close(); | ||
| } | ||
| } | ||
|
|
||
| public void testShingleFilters() { | ||
|
|
||
| Settings settings = Settings.builder() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,101 @@ | ||
| --- | ||
| "Test chained synonym_graph filters": | ||
| - do: | ||
| indices.create: | ||
| index: test_chained_synonyms | ||
| body: | ||
| settings: | ||
| analysis: | ||
| filter: | ||
| synonyms_a: | ||
| type: synonym_graph | ||
| synonyms: [ "foo, bar" ] | ||
| synonyms_b: | ||
| type: synonym_graph | ||
| synonyms: [ "baz, qux" ] | ||
| synonyms_c: | ||
| type: synonym_graph | ||
| synonyms: [ "hello, world" ] | ||
| analyzer: | ||
| chained_syn: | ||
| tokenizer: standard | ||
| filter: [ lowercase, synonyms_a, synonyms_b, synonyms_c ] | ||
| mappings: | ||
| properties: | ||
| text: | ||
| type: text | ||
| analyzer: chained_syn | ||
|
|
||
| - do: | ||
| index: | ||
| index: test_chained_synonyms | ||
| id: "1" | ||
| body: | ||
| text: "foo baz hello" | ||
| refresh: true | ||
|
|
||
| # Test that all three chained synonym filters work correctly | ||
| - do: | ||
| search: | ||
| index: test_chained_synonyms | ||
| body: | ||
| query: | ||
| match: | ||
| text: "bar qux world" | ||
| - match: { hits.total.value: 1 } | ||
| - match: { hits.hits.0._id: "1" } | ||
|
|
||
| # Verify analyzer behavior - synonym_graph produces synonym first, then original | ||
| - do: | ||
| indices.analyze: | ||
| index: test_chained_synonyms | ||
| body: | ||
| text: "foo" | ||
| analyzer: chained_syn | ||
| - length: { tokens: 2 } | ||
| - match: { tokens.0.position: 0 } | ||
| - match: { tokens.1.position: 0 } | ||
|
|
||
| # Test with multi-word query to verify all chained filters work together | ||
| - do: | ||
| indices.analyze: | ||
| index: test_chained_synonyms | ||
| body: | ||
| text: "foo baz hello" | ||
| analyzer: chained_syn | ||
| # Each word that matches a synonym expands to 2 tokens: foo→(foo,bar), baz→(baz,qux), hello→(hello,world) | ||
| - length: { tokens: 6 } | ||
| # Verify positions: each synonym pair at same position | ||
| - match: { tokens.0.position: 0 } | ||
| - match: { tokens.1.position: 0 } | ||
| - match: { tokens.2.position: 1 } | ||
| - match: { tokens.3.position: 1 } | ||
| - match: { tokens.4.position: 2 } | ||
| - match: { tokens.5.position: 2 } | ||
| # Verify token content - synonym_graph produces synonym first, then original | ||
| - match: { tokens.0.token: "foo" } | ||
| - match: { tokens.1.token: "bar" } | ||
| - match: { tokens.2.token: "qux" } | ||
| - match: { tokens.3.token: "baz" } | ||
| - match: { tokens.4.token: "world" } | ||
| - match: { tokens.5.token: "hello" } | ||
|
|
||
| # Test that analyzer reload doesn't cause issues (critical for the fix) | ||
| - do: | ||
| indices.close: | ||
| index: test_chained_synonyms | ||
|
|
||
| - do: | ||
| indices.open: | ||
| index: test_chained_synonyms | ||
|
|
||
| # Verify it still works after reload | ||
| - do: | ||
| search: | ||
| index: test_chained_synonyms | ||
| body: | ||
| query: | ||
| match: | ||
| text: "bar qux world" | ||
| - match: { hits.total.value: 1 } | ||
| - match: { hits.hits.0._id: "1" } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The expected token order appears to contradict the comment on line 375 which states 'synonym_graph produces synonym first, then original'. The test expects 'bar' before 'foo', but this ordering should be explicitly verified and documented as the actual behavior, especially since it differs from the comment pattern in the YAML test file (lines 76-81) which shows original tokens first in position.