Fix daitch_mokotoff phonetic filter to use the dedicated Lucene filter (#28225)

This commit changes the phonetic filter factory to use a DaitchMokotoffSoundexFilter
instead of a PhoneticFilter with a daitch_mokotoff encoder when daitch_mokotoff is selected.
The latter does not handle branching when computing the soundex and fails to encode multiple
variations when possible.

Closes #28211
This commit is contained in:
Jim Ferenczi 2018-01-15 19:35:54 +01:00 committed by GitHub
parent 0a92e43f62
commit b82017cbfe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 19 additions and 1 deletions

View File

@ -33,6 +33,7 @@ import org.apache.commons.codec.language.bm.PhoneticEngine;
import org.apache.commons.codec.language.bm.RuleType; import org.apache.commons.codec.language.bm.RuleType;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.phonetic.BeiderMorseFilter; import org.apache.lucene.analysis.phonetic.BeiderMorseFilter;
import org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilter;
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter; import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
import org.apache.lucene.analysis.phonetic.PhoneticFilter; import org.apache.lucene.analysis.phonetic.PhoneticFilter;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
@ -53,6 +54,7 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
private List<String> languageset; private List<String> languageset;
private NameType nametype; private NameType nametype;
private RuleType ruletype; private RuleType ruletype;
private boolean isDaitchMokotoff;
public PhoneticTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { public PhoneticTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings); super(indexSettings, name, settings);
@ -60,6 +62,7 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
this.nametype = null; this.nametype = null;
this.ruletype = null; this.ruletype = null;
this.maxcodelength = 0; this.maxcodelength = 0;
this.isDaitchMokotoff = false;
this.replace = settings.getAsBoolean("replace", true); this.replace = settings.getAsBoolean("replace", true);
// weird, encoder is null at last step in SimplePhoneticAnalysisTests, so we set it to metaphone as default // weird, encoder is null at last step in SimplePhoneticAnalysisTests, so we set it to metaphone as default
String encodername = settings.get("encoder", "metaphone"); String encodername = settings.get("encoder", "metaphone");
@ -106,7 +109,8 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
} else if ("nysiis".equalsIgnoreCase(encodername)) { } else if ("nysiis".equalsIgnoreCase(encodername)) {
this.encoder = new Nysiis(); this.encoder = new Nysiis();
} else if ("daitch_mokotoff".equalsIgnoreCase(encodername)) { } else if ("daitch_mokotoff".equalsIgnoreCase(encodername)) {
this.encoder = new DaitchMokotoffSoundex(); this.encoder = null;
this.isDaitchMokotoff = true;
} else { } else {
throw new IllegalArgumentException("unknown encoder [" + encodername + "] for phonetic token filter"); throw new IllegalArgumentException("unknown encoder [" + encodername + "] for phonetic token filter");
} }
@ -115,6 +119,9 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
@Override @Override
public TokenStream create(TokenStream tokenStream) { public TokenStream create(TokenStream tokenStream) {
if (encoder == null) { if (encoder == null) {
if (isDaitchMokotoff) {
return new DaitchMokotoffSoundexFilter(tokenStream, !replace);
}
if (ruletype != null && nametype != null) { if (ruletype != null && nametype != null) {
LanguageSet langset = null; LanguageSet langset = null;
if (languageset != null && languageset.size() > 0) { if (languageset != null && languageset.size() > 0) {

View File

@ -22,6 +22,7 @@ package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilter;
import org.elasticsearch.Version; import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
@ -72,4 +73,14 @@ public class SimplePhoneticAnalysisTests extends ESTestCase {
"rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" }; "rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" };
BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected); BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
} }
/**
 * Checks the filter registered under the name "daitch_mokotoff": the stream it creates must be a
 * {@link DaitchMokotoffSoundexFilter}, and the single input token "chauptman" must yield the two
 * codes "473660" and "573660".
 */
public void testPhoneticTokenFilterDaitchMotokoff() throws IOException {
    final TokenFilterFactory daitchMokotoffFactory = analysis.tokenFilter.get("daitch_mokotoff");
    final Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("chauptman"));

    // The factory has to hand back the dedicated Lucene filter type.
    assertThat(daitchMokotoffFactory.create(source), instanceOf(DaitchMokotoffSoundexFilter.class));

    // One input token, two expected codes, in this order.
    final String[] expectedCodes = { "473660", "573660" };
    BaseTokenStreamTestCase.assertTokenStreamContents(daitchMokotoffFactory.create(source), expectedCodes);
}
} }