@@ -72,7 +72,18 @@ public TokenFilterFactory getChainAwareTokenFilterFactory(
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters
) {
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
return getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters, name -> null);
}

@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(
TokenizerFactory tokenizer,
List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters,
Function<String, Analyzer> analyzersBuiltSoFar
) {
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters, analyzersBuiltSoFar);
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
final String name = name();
return new TokenFilterFactory() {
@@ -114,7 +114,18 @@ public TokenFilterFactory getChainAwareTokenFilterFactory(
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters
) {
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
return getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters, name -> null);
}

@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(
TokenizerFactory tokenizer,
List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters,
Function<String, Analyzer> analyzersBuiltSoFar
) {
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters, analyzersBuiltSoFar);
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
final String name = name();
return new TokenFilterFactory() {
@@ -147,10 +158,15 @@ Analyzer buildSynonymAnalyzer(
TokenizerFactory tokenizer,
List<CharFilterFactory> charFilters,
List<TokenFilterFactory> tokenFilters,
Function<String, TokenFilterFactory> allFilters
Function<String, TokenFilterFactory> allFilters,
Function<String, Analyzer> analyzersBuiltSoFar
) {
if (synonymAnalyzerName != null) {
Analyzer customSynonymAnalyzer;
// first, check analyzers already built from the index settings
Analyzer customSynonymAnalyzer = analyzersBuiltSoFar.apply(synonymAnalyzerName);
if (customSynonymAnalyzer != null) {
return customSynonymAnalyzer;
}
try {
customSynonymAnalyzer = analysisRegistry.getAnalyzer(synonymAnalyzerName);
} catch (IOException e) {
@@ -49,11 +49,9 @@

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Collections;

import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.hasToString;

public class EdgeNGramTokenizerTests extends OpenSearchTokenStreamTestCase {

private IndexAnalyzers buildAnalyzers(Version version, String tokenizer) throws IOException {
@@ -99,7 +97,14 @@ public void testPreConfiguredTokenizer() throws IOException {
IllegalArgumentException.class,
() -> buildAnalyzers(VersionUtils.randomVersionBetween(random(), Version.V_3_0_0, Version.CURRENT), "edgeNGram")
);
assertThat(e, hasToString(containsString("The [edgeNGram] tokenizer name was deprecated pre 1.0.")));

// any suppressed per-analyzer failure should carry the deprecation message
boolean found = Arrays.stream(e.getSuppressed())
.map(org.opensearch.ExceptionsHelper::unwrapCause)
.map(Throwable::getMessage)
.anyMatch(message -> message.contains("The [edgeNGram] tokenizer name was deprecated pre 1.0."));
assertTrue("expected deprecation message in suppressed causes", found);
}
}

@@ -120,7 +120,7 @@ public void testSynonymWordDeleteByAnalyzer() throws IOException {
fail("fail! due to synonym word deleted by analyzer");
} catch (Exception e) {
assertThat(e, instanceOf(IllegalArgumentException.class));
assertThat(e.getMessage(), startsWith("Failed to build synonyms"));
assertThat(e.getMessage(), startsWith("Failed to build analyzers: [synonymAnalyzerWithStopSynonymBeforeSynonym]"));
}
}

@@ -141,7 +141,7 @@ public void testExpandSynonymWordDeleteByAnalyzer() throws IOException {
fail("fail! due to synonym word deleted by analyzer");
} catch (Exception e) {
assertThat(e, instanceOf(IllegalArgumentException.class));
assertThat(e.getMessage(), startsWith("Failed to build synonyms"));
assertThat(e.getMessage(), startsWith("Failed to build analyzers: [synonymAnalyzerExpandWithStopBeforeSynonym]"));
}
}

@@ -269,7 +269,7 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry);
Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null);
Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null, null);

try (TokenStream ts = analyzer.tokenStream("field", "text")) {
assertThat(ts, instanceOf(KeywordTokenizer.class));
@@ -350,7 +350,7 @@ public void testDisallowedTokenFilters() throws IOException {
IllegalArgumentException e = expectThrows(
IllegalArgumentException.class,
"Expected IllegalArgumentException for factory " + factory,
() -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null)
() -> stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null, null)
);

assertEquals(factory, "Token filter [" + factory + "] cannot be used to parse synonyms", e.getMessage());
@@ -443,4 +443,48 @@ public void testSynonymAnalyzerWithWordDelimiter() throws IOException {
assertTokenStreamContents(ts, new String[] { "notebook" }, new int[] { 0 }, new int[] { 6 });
}
}

/**
* Test the core dependency-resolution issue from GitHub #18037:
* synonym_graph with a custom synonym_analyzer should work even when
* the main analyzer contains word_delimiter_graph, which would normally
* cause a "cannot be used to parse synonyms" error.
* (A short usage sketch follows the test body.)
*/
public void testSynonymAnalyzerDependencyResolution() throws IOException {
Settings settings = Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)

// Main analyzer with word_delimiter_graph; order=2
.put("index.analysis.analyzer.main_analyzer.type", "custom")
.put("index.analysis.analyzer.main_analyzer.order", "2")
.put("index.analysis.analyzer.main_analyzer.tokenizer", "standard")
.putList("index.analysis.analyzer.main_analyzer.filter", "lowercase", "test_word_delimiter", "test_synonyms")

// Problematic filter for synonym parsing
.put("index.analysis.filter.test_word_delimiter.type", "word_delimiter_graph")
.put("index.analysis.filter.test_word_delimiter.generate_word_parts", true)

// Custom analyzer dependency; order=1 so it is built before main_analyzer (order=2)
.put("index.analysis.analyzer.simple_synonym_analyzer.type", "custom")
.put("index.analysis.analyzer.simple_synonym_analyzer.order", "1")
.put("index.analysis.analyzer.simple_synonym_analyzer.tokenizer", "standard")

// Synonym filter that depends on custom analyzer
.put("index.analysis.filter.test_synonyms.type", "synonym_graph")
.putList("index.analysis.filter.test_synonyms.synonyms", "laptop,notebook")
.put("index.analysis.filter.test_synonyms.synonym_analyzer", "simple_synonym_analyzer")
.build();

IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test_index", settings);

// Should succeed with the fix (would fail before due to registration order)
IndexAnalyzers analyzers = new AnalysisModule(
TestEnvironment.newEnvironment(settings),
Collections.singletonList(new CommonAnalysisModulePlugin())
).getAnalysisRegistry().build(idxSettings);

assertNotNull("main_analyzer should be created", analyzers.get("main_analyzer"));
assertNotNull("simple_synonym_analyzer should be created", analyzers.get("simple_synonym_analyzer"));
}
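
For illustration only, a minimal usage sketch that is not part of this change: it reuses the analyzers variable built in the test above, assumes Lucene's TokenStream/CharTermAttribute APIs plus java.util.List/ArrayList are imported, and an enclosing method that declares throws IOException. It exercises the synonym expansion end to end.

// Sketch: run "laptop" through main_analyzer and collect the emitted terms.
// Requires org.apache.lucene.analysis.TokenStream and
// org.apache.lucene.analysis.tokenattributes.CharTermAttribute.
try (TokenStream ts = analyzers.get("main_analyzer").tokenStream("field", "laptop")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    List<String> tokens = new ArrayList<>();
    ts.reset();
    while (ts.incrementToken()) {
        tokens.add(term.toString());
    }
    ts.end();
    // With the equivalent-synonym rule "laptop,notebook" and default expansion,
    // both terms are expected at the same position.
    assertTrue(tokens.contains("laptop"));
    assertTrue(tokens.contains("notebook"));
}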
}
@@ -54,6 +54,7 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@@ -305,7 +306,7 @@ public NamedAnalyzer buildCustomAnalyzer(
} catch (IOException e) {
throw new UncheckedIOException(e);
}
});
}, null);
tokenFilterFactories.add(tff);
}

@@ -362,7 +363,30 @@ public Map<String, CharFilterFactory> buildCharFilterFactories(IndexSettings ind

private Map<String, AnalyzerProvider<?>> buildAnalyzerFactories(IndexSettings indexSettings) throws IOException {
final Map<String, Settings> analyzersSettings = indexSettings.getSettings().getGroups("index.analysis.analyzer");
return buildMapping(Component.ANALYZER, indexSettings, analyzersSettings, analyzers, prebuiltAnalysis.analyzerProviderFactories);

// Some analyzers depend on others, so the dependencies must be built first.
// Sort by the optional 'order' setting (default 100); ties break on analyzer name (see the sketch after this method).
List<Map.Entry<String, Settings>> sortedEntries = analyzersSettings.entrySet().stream().sorted((a, b) -> {
int orderA = a.getValue().getAsInt("order", 100);
int orderB = b.getValue().getAsInt("order", 100);
if (orderA != orderB) {
return Integer.compare(orderA, orderB);
}
return a.getKey().compareTo(b.getKey());
}).collect(Collectors.toList());

Map<String, Settings> sortedAnalyzersSettings = new LinkedHashMap<>();
for (Map.Entry<String, Settings> entry : sortedEntries) {
sortedAnalyzersSettings.put(entry.getKey(), entry.getValue());
}

return buildMapping(
Component.ANALYZER,
indexSettings,
sortedAnalyzersSettings,
analyzers,
prebuiltAnalysis.analyzerProviderFactories
);
}
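
As a side note, and not part of the diff, a standalone sketch of how the sorting above orders a few hypothetical analyzer definitions; the analyzer names and order values are made up, the defaulting mirrors the getAsInt("order", 100) call in the code, and java.util.Comparator, java.util.stream.Collectors, and org.opensearch.common.settings.Settings imports are assumed.

// Sketch: analyzers with a lower 'order' are built first; entries without the
// setting fall back to 100 and then sort by name.
Map<String, Settings> analyzerSettingsByName = new HashMap<>();
analyzerSettingsByName.put("main_analyzer", Settings.builder().put("order", 2).build());
analyzerSettingsByName.put("simple_synonym_analyzer", Settings.builder().put("order", 1).build());
analyzerSettingsByName.put("unordered_analyzer", Settings.EMPTY); // no 'order' -> defaults to 100

List<String> buildOrder = analyzerSettingsByName.entrySet()
    .stream()
    .sorted(
        Comparator.comparingInt((Map.Entry<String, Settings> e) -> e.getValue().getAsInt("order", 100))
            .thenComparing(Map.Entry::getKey)
    )
    .map(Map.Entry::getKey)
    .collect(Collectors.toList());
// buildOrder -> [simple_synonym_analyzer, main_analyzer, unordered_analyzer]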

private Map<String, AnalyzerProvider<?>> buildNormalizerFactories(IndexSettings indexSettings) throws IOException {
@@ -486,7 +510,7 @@ private <T> Map<String, T> buildMapping(
Map<String, ? extends AnalysisModule.AnalysisProvider<T>> defaultInstance
) throws IOException {
Settings defaultSettings = Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, settings.getIndexVersionCreated()).build();
Map<String, T> factories = new HashMap<>();
Map<String, T> factories = new LinkedHashMap<>(); // keep insertion order
for (Map.Entry<String, Settings> entry : settingsMap.entrySet()) {
String name = entry.getKey();
Settings currentSettings = entry.getValue();
@@ -637,21 +661,27 @@ public IndexAnalyzers build(
Map<String, NamedAnalyzer> analyzers = new HashMap<>();
Map<String, NamedAnalyzer> normalizers = new HashMap<>();
Map<String, NamedAnalyzer> whitespaceNormalizers = new HashMap<>();
Map<String, Exception> buildErrors = new LinkedHashMap<>();
Map<String, Analyzer> analyzersBuiltSoFar = new HashMap<>();
for (Map.Entry<String, AnalyzerProvider<?>> entry : analyzerProviders.entrySet()) {
analyzers.merge(
entry.getKey(),
produceAnalyzer(
try {
NamedAnalyzer namedAnalyzer = produceAnalyzer(
entry.getKey(),
entry.getValue(),
tokenFilterFactoryFactories,
charFilterFactoryFactories,
tokenizerFactoryFactories
),
(k, v) -> {
tokenizerFactoryFactories,
analyzersBuiltSoFar
);
analyzers.merge(entry.getKey(), namedAnalyzer, (k, v) -> {
throw new IllegalStateException("already registered analyzer with name: " + entry.getKey());
}
);
});
analyzersBuiltSoFar.put(entry.getKey(), namedAnalyzer);
} catch (Exception e) {
buildErrors.put(entry.getKey(), e);
}
}

for (Map.Entry<String, AnalyzerProvider<?>> entry : normalizerProviders.entrySet()) {
processNormalizerFactory(
entry.getKey(),
@@ -707,6 +737,14 @@ public IndexAnalyzers build(
throw new IllegalArgumentException("analyzer name must not start with '_'. got \"" + analyzer.getKey() + "\"");
}
}

if (!buildErrors.isEmpty()) {
IllegalArgumentException aggregated = new IllegalArgumentException("Failed to build analyzers: " + buildErrors.keySet());
buildErrors.forEach(
(name, ex) -> aggregated.addSuppressed(new IllegalArgumentException("[" + name + "] " + ex.getMessage(), ex))
);
throw aggregated;
}
return new IndexAnalyzers(analyzers, normalizers, whitespaceNormalizers);
}
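
A minimal caller-side sketch, not part of the diff, of what the aggregated failure looks like: it assumes an analysisRegistry and idxSettings in scope, an enclosing method that declares throws IOException, and hypothetical analyzer names; the message formats are the ones constructed just above.

// Sketch: every failed analyzer is reported at once instead of only the first failure.
try {
    analysisRegistry.build(idxSettings);
} catch (IllegalArgumentException e) {
    // e.getMessage() -> e.g. "Failed to build analyzers: [broken_analyzer_a, broken_analyzer_b]"
    for (Throwable suppressed : e.getSuppressed()) {
        // each suppressed cause reads "[<analyzer name>] <original message>" and wraps the original exception
        System.err.println(suppressed.getMessage());
    }
}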

@@ -716,6 +754,17 @@ private static NamedAnalyzer produceAnalyzer(
Map<String, TokenFilterFactory> tokenFilters,
Map<String, CharFilterFactory> charFilters,
Map<String, TokenizerFactory> tokenizers
) {
return produceAnalyzer(name, analyzerFactory, tokenFilters, charFilters, tokenizers, Collections.emptyMap());
}

private static NamedAnalyzer produceAnalyzer(
String name,
AnalyzerProvider<?> analyzerFactory,
Map<String, TokenFilterFactory> tokenFilters,
Map<String, CharFilterFactory> charFilters,
Map<String, TokenizerFactory> tokenizers,
Map<String, Analyzer> analyzersBuiltSoFar
) {
/*
* Lucene defaults positionIncrementGap to 0 in all analyzers but
@@ -725,7 +774,7 @@ private static NamedAnalyzer produceAnalyzer(
*/
int overridePositionIncrementGap = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP;
if (analyzerFactory instanceof CustomAnalyzerProvider) {
((CustomAnalyzerProvider) analyzerFactory).build(tokenizers, charFilters, tokenFilters);
((CustomAnalyzerProvider) analyzerFactory).build(tokenizers, charFilters, tokenFilters, analyzersBuiltSoFar);
/*
* Custom analyzers already default to the correct, version
* dependent positionIncrementGap and the user is able to
@@ -32,9 +32,11 @@

package org.opensearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.opensearch.common.settings.Settings;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

@@ -69,6 +71,17 @@ static AnalyzerComponents createComponents(
final Map<String, TokenizerFactory> tokenizers,
final Map<String, CharFilterFactory> charFilters,
final Map<String, TokenFilterFactory> tokenFilters
) {
return createComponents(name, analyzerSettings, tokenizers, charFilters, tokenFilters, Collections.emptyMap());
}

static AnalyzerComponents createComponents(
String name,
Settings analyzerSettings,
final Map<String, TokenizerFactory> tokenizers,
final Map<String, CharFilterFactory> charFilters,
final Map<String, TokenFilterFactory> tokenFilters,
final Map<String, Analyzer> analyzersBuiltSoFar
) {
String tokenizerName = analyzerSettings.get("tokenizer");
if (tokenizerName == null) {
@@ -103,7 +116,13 @@ static AnalyzerComponents createComponents(
"Custom Analyzer [" + name + "] failed to find filter under name " + "[" + tokenFilterName + "]"
);
}
tokenFilter = tokenFilter.getChainAwareTokenFilterFactory(tokenizer, charFiltersList, tokenFilterList, tokenFilters::get);
tokenFilter = tokenFilter.getChainAwareTokenFilterFactory(
tokenizer,
charFiltersList,
tokenFilterList,
tokenFilters::get,
analyzersBuiltSoFar::get
);
tokenFilterList.add(tokenFilter);
}

@@ -61,9 +61,10 @@ public CustomAnalyzerProvider(IndexSettings indexSettings, String name, Settings
void build(
final Map<String, TokenizerFactory> tokenizers,
final Map<String, CharFilterFactory> charFilters,
final Map<String, TokenFilterFactory> tokenFilters
final Map<String, TokenFilterFactory> tokenFilters,
final Map<String, Analyzer> analyzersBuiltSoFar
) {
customAnalyzer = create(name(), analyzerSettings, tokenizers, charFilters, tokenFilters);
customAnalyzer = create(name(), analyzerSettings, tokenizers, charFilters, tokenFilters, analyzersBuiltSoFar);
}

/**
@@ -75,12 +76,20 @@ private static Analyzer create(
Settings analyzerSettings,
Map<String, TokenizerFactory> tokenizers,
Map<String, CharFilterFactory> charFilters,
Map<String, TokenFilterFactory> tokenFilters
Map<String, TokenFilterFactory> tokenFilters,
Map<String, Analyzer> analyzersBuiltSoFar
) {
int positionIncrementGap = TextFieldMapper.Defaults.POSITION_INCREMENT_GAP;
positionIncrementGap = analyzerSettings.getAsInt("position_increment_gap", positionIncrementGap);
int offsetGap = analyzerSettings.getAsInt("offset_gap", -1);
AnalyzerComponents components = createComponents(name, analyzerSettings, tokenizers, charFilters, tokenFilters);
AnalyzerComponents components = createComponents(
name,
analyzerSettings,
tokenizers,
charFilters,
tokenFilters,
analyzersBuiltSoFar
);
if (components.analysisMode().equals(AnalysisMode.SEARCH_TIME)) {
return new ReloadableCustomAnalyzer(components, positionIncrementGap, offsetGap);
} else {