Skip to content

Commit b82017c

Browse files
authored
Fix daitch_mokotoff phonetic filter to use the dedicated Lucene filter (#28225)
This commit changes the phonetic filter factory to use a DaitchMokotoffSoundexFilter instead of a PhoneticFilter with a daitch_mokotoff encoder when daitch_mokotoff is selected. The latter does not handle branching when computing the soundex and fails to encode multiple variations when possible. Closes #28211
1 parent 0a92e43 commit b82017c

File tree

2 files changed

+19
-1
lines changed

2 files changed

+19
-1
lines changed

plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/PhoneticTokenFilterFactory.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import org.apache.commons.codec.language.bm.RuleType;
3434
import org.apache.lucene.analysis.TokenStream;
3535
import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;
36+
import org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilter;
3637
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
3738
import org.apache.lucene.analysis.phonetic.PhoneticFilter;
3839
import org.elasticsearch.common.settings.Settings;
@@ -53,13 +54,15 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
5354
private List<String> languageset;
5455
private NameType nametype;
5556
private RuleType ruletype;
57+
private boolean isDaitchMokotoff;
5658

5759
public PhoneticTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
5860
super(indexSettings, name, settings);
5961
this.languageset = null;
6062
this.nametype = null;
6163
this.ruletype = null;
6264
this.maxcodelength = 0;
65+
this.isDaitchMokotoff = false;
6366
this.replace = settings.getAsBoolean("replace", true);
6467
// weird, encoder is null at last step in SimplePhoneticAnalysisTests, so we set it to metaphone as default
6568
String encodername = settings.get("encoder", "metaphone");
@@ -106,7 +109,8 @@ public PhoneticTokenFilterFactory(IndexSettings indexSettings, Environment envir
106109
} else if ("nysiis".equalsIgnoreCase(encodername)) {
107110
this.encoder = new Nysiis();
108111
} else if ("daitch_mokotoff".equalsIgnoreCase(encodername)) {
109-
this.encoder = new DaitchMokotoffSoundex();
112+
this.encoder = null;
113+
this.isDaitchMokotoff = true;
110114
} else {
111115
throw new IllegalArgumentException("unknown encoder [" + encodername + "] for phonetic token filter");
112116
}
@@ -115,6 +119,9 @@ public PhoneticTokenFilterFactory(IndexSettings indexSettings, Environment envir
115119
@Override
116120
public TokenStream create(TokenStream tokenStream) {
117121
if (encoder == null) {
122+
if (isDaitchMokotoff) {
123+
return new DaitchMokotoffSoundexFilter(tokenStream, !replace);
124+
}
118125
if (ruletype != null && nametype != null) {
119126
LanguageSet langset = null;
120127
if (languageset != null && languageset.size() > 0) {

plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/SimplePhoneticAnalysisTests.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
2323
import org.apache.lucene.analysis.Tokenizer;
2424
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
25+
import org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilter;
2526
import org.elasticsearch.Version;
2627
import org.elasticsearch.cluster.metadata.IndexMetaData;
2728
import org.elasticsearch.common.settings.Settings;
@@ -72,4 +73,14 @@ public void testPhoneticTokenFilterBeiderMorseWithLanguage() throws IOException
7273
"rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" };
7374
BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
7475
}
76+
77+
public void testPhoneticTokenFilterDaitchMotokoff() throws IOException {
78+
TokenFilterFactory filterFactory = analysis.tokenFilter.get("daitch_mokotoff");
79+
Tokenizer tokenizer = new WhitespaceTokenizer();
80+
tokenizer.setReader(new StringReader("chauptman"));
81+
String[] expected = new String[] { "473660", "573660" };
82+
assertThat(filterFactory.create(tokenizer), instanceOf(DaitchMokotoffSoundexFilter.class));
83+
BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
84+
}
85+
7586
}

0 commit comments

Comments
 (0)