Fix daitch_mokotoff phonetic filter to use the dedicated Lucene filter (#28225)
This commit changes the phonetic filter factory to use a DaitchMokotoffSoundexFilter instead of a PhoneticFilter with a daitch_mokotoff encoder when daitch_mokotoff is selected. The latter does not handle branching when computing the soundex, so it fails to emit all of the possible encodings when a token has more than one. Closes #28211
This commit is contained in:
parent
0a92e43f62
commit
b82017cbfe
|
@ -33,6 +33,7 @@ import org.apache.commons.codec.language.bm.PhoneticEngine;
|
||||||
import org.apache.commons.codec.language.bm.RuleType;
|
import org.apache.commons.codec.language.bm.RuleType;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;
|
import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;
|
||||||
|
import org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilter;
|
||||||
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
|
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
|
||||||
import org.apache.lucene.analysis.phonetic.PhoneticFilter;
|
import org.apache.lucene.analysis.phonetic.PhoneticFilter;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
@ -53,6 +54,7 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
private List<String> languageset;
|
private List<String> languageset;
|
||||||
private NameType nametype;
|
private NameType nametype;
|
||||||
private RuleType ruletype;
|
private RuleType ruletype;
|
||||||
|
private boolean isDaitchMokotoff;
|
||||||
|
|
||||||
public PhoneticTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
public PhoneticTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||||
super(indexSettings, name, settings);
|
super(indexSettings, name, settings);
|
||||||
|
@ -60,6 +62,7 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
this.nametype = null;
|
this.nametype = null;
|
||||||
this.ruletype = null;
|
this.ruletype = null;
|
||||||
this.maxcodelength = 0;
|
this.maxcodelength = 0;
|
||||||
|
this.isDaitchMokotoff = false;
|
||||||
this.replace = settings.getAsBoolean("replace", true);
|
this.replace = settings.getAsBoolean("replace", true);
|
||||||
// weird, encoder is null at last step in SimplePhoneticAnalysisTests, so we set it to metaphone as default
|
// weird, encoder is null at last step in SimplePhoneticAnalysisTests, so we set it to metaphone as default
|
||||||
String encodername = settings.get("encoder", "metaphone");
|
String encodername = settings.get("encoder", "metaphone");
|
||||||
|
@ -106,7 +109,8 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
} else if ("nysiis".equalsIgnoreCase(encodername)) {
|
} else if ("nysiis".equalsIgnoreCase(encodername)) {
|
||||||
this.encoder = new Nysiis();
|
this.encoder = new Nysiis();
|
||||||
} else if ("daitch_mokotoff".equalsIgnoreCase(encodername)) {
|
} else if ("daitch_mokotoff".equalsIgnoreCase(encodername)) {
|
||||||
this.encoder = new DaitchMokotoffSoundex();
|
this.encoder = null;
|
||||||
|
this.isDaitchMokotoff = true;
|
||||||
} else {
|
} else {
|
||||||
throw new IllegalArgumentException("unknown encoder [" + encodername + "] for phonetic token filter");
|
throw new IllegalArgumentException("unknown encoder [" + encodername + "] for phonetic token filter");
|
||||||
}
|
}
|
||||||
|
@ -115,6 +119,9 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
if (encoder == null) {
|
if (encoder == null) {
|
||||||
|
if (isDaitchMokotoff) {
|
||||||
|
return new DaitchMokotoffSoundexFilter(tokenStream, !replace);
|
||||||
|
}
|
||||||
if (ruletype != null && nametype != null) {
|
if (ruletype != null && nametype != null) {
|
||||||
LanguageSet langset = null;
|
LanguageSet langset = null;
|
||||||
if (languageset != null && languageset.size() > 0) {
|
if (languageset != null && languageset.size() > 0) {
|
||||||
|
|
|
@ -22,6 +22,7 @@ package org.elasticsearch.index.analysis;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilter;
|
||||||
import org.elasticsearch.Version;
|
import org.elasticsearch.Version;
|
||||||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
@ -72,4 +73,14 @@ public class SimplePhoneticAnalysisTests extends ESTestCase {
|
||||||
"rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" };
|
"rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" };
|
||||||
BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
|
BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testPhoneticTokenFilterDaitchMotokoff() throws IOException {
|
||||||
|
TokenFilterFactory filterFactory = analysis.tokenFilter.get("daitch_mokotoff");
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||||
|
tokenizer.setReader(new StringReader("chauptman"));
|
||||||
|
String[] expected = new String[] { "473660", "573660" };
|
||||||
|
assertThat(filterFactory.create(tokenizer), instanceOf(DaitchMokotoffSoundexFilter.class));
|
||||||
|
BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue