Fix beidermorse phonetic token filter for unspecified `languageset` (#27112)

Currently, when we create a BeiderMorseFilter with an unspecified `languageset`,
the filter does not guess the language, although guessing the language should be
the default behaviour in that case. This change fixes this and adds a simple test
for the cases with and without a provided `languageset` setting.

Closes #26771
This commit is contained in:
Christoph Büscher 2017-10-27 10:07:36 +02:00 committed by GitHub
parent 6625ecfff4
commit 9253ea8aec
3 changed files with 44 additions and 9 deletions

View File

@ -19,9 +19,6 @@
package org.elasticsearch.index.analysis;
import java.util.HashSet;
import java.util.List;
import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.Caverphone1;
import org.apache.commons.codec.language.Caverphone2;
@ -45,6 +42,9 @@ import org.elasticsearch.index.analysis.phonetic.HaasePhonetik;
import org.elasticsearch.index.analysis.phonetic.KoelnerPhonetik;
import org.elasticsearch.index.analysis.phonetic.Nysiis;
import java.util.HashSet;
import java.util.List;
public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
private final Encoder encoder;
@ -116,11 +116,11 @@ public class PhoneticTokenFilterFactory extends AbstractTokenFilterFactory {
public TokenStream create(TokenStream tokenStream) {
if (encoder == null) {
if (ruletype != null && nametype != null) {
if (languageset != null) {
final LanguageSet languages = LanguageSet.from(new HashSet<>(languageset));
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), languages);
LanguageSet langset = null;
if (languageset != null && languageset.size() > 0) {
langset = LanguageSet.from(new HashSet<>(languageset));
}
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true));
return new BeiderMorseFilter(tokenStream, new PhoneticEngine(nametype, ruletype, true), langset);
}
if (maxcodelength > 0) {
return new DoubleMetaphoneFilter(tokenStream, maxcodelength, !replace);

View File

@ -19,6 +19,9 @@
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
@ -26,19 +29,47 @@ import org.elasticsearch.index.Index;
import org.elasticsearch.plugin.analysis.AnalysisPhoneticPlugin;
import org.elasticsearch.test.ESTestCase;
import org.hamcrest.MatcherAssert;
import org.junit.Before;
import java.io.IOException;
import java.io.StringReader;
import static org.hamcrest.Matchers.instanceOf;
public class SimplePhoneticAnalysisTests extends ESTestCase {
public void testPhoneticTokenFilterFactory() throws IOException {
private TestAnalysis analysis;
/**
 * Builds the analysis fixture used by the tests in this class: reads the settings from the
 * {@code phonetic-1.yml} classpath resource, adds the current version as the index-created
 * version and creates the test analysis with the {@link AnalysisPhoneticPlugin} registered.
 *
 * @throws IOException declared by the method; presumably raised when the settings resource cannot be read
 */
@Before
public void setup() throws IOException {
String yaml = "/org/elasticsearch/index/analysis/phonetic-1.yml";
Settings settings = Settings.builder().loadFromStream(yaml, getClass().getResourceAsStream(yaml), false)
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.build();
// NOTE(review): this local shadows the `analysis` field and is not read again in this method; it looks
// like the pre-change line of the diff shown next to its replacement below - confirm before relying on it.
TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisPhoneticPlugin());
// Store the fixture in the field so that the individual test methods can look up their token filters.
this.analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisPhoneticPlugin());
}
/**
 * Checks that the token filter looked up under the name {@code "phonetic"} is backed by a
 * {@link PhoneticTokenFilterFactory}.
 */
public void testPhoneticTokenFilterFactory() throws IOException {
    // Look the factory up in the shared analysis fixture and assert on its concrete type in one step.
    MatcherAssert.assertThat(analysis.tokenFilter.get("phonetic"), instanceOf(PhoneticTokenFilterFactory.class));
}
/**
 * Feeds the single token {@code "ABADIAS"} through the {@code "beidermorsefilter"} token filter and
 * checks the exact sequence of tokens it emits. In the visible test settings this filter is
 * configured without a {@code languageset} entry.
 */
public void testPhoneticTokenFilterBeiderMorseNoLanguage() throws IOException {
    // Whitespace tokenization leaves the one-word input as a single token.
    Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("ABADIAS"));

    TokenFilterFactory beiderMorse = analysis.tokenFilter.get("beidermorsefilter");

    // Expected output tokens, in the order the filter is expected to produce them.
    String[] encodings = {
        "abYdias", "abYdios", "abadia", "abadiaS", "abadias", "abadio", "abadioS", "abadios", "abodia",
        "abodiaS", "abodias", "abodio", "abodioS", "abodios", "avadias", "avadios", "avodias", "avodios",
        "obadia", "obadiaS", "obadias", "obadio", "obadioS", "obadios", "obodia", "obodiaS", "obodias",
        "obodioS"
    };
    BaseTokenStreamTestCase.assertTokenStreamContents(beiderMorse.create(source), encodings);
}
/**
 * Feeds the single token {@code "Rimbault"} through the {@code "beidermorsefilterfrench"} token filter
 * and checks the exact sequence of tokens it emits. In the visible test settings this filter is
 * configured with {@code languageset : [ "french" ]}.
 */
public void testPhoneticTokenFilterBeiderMorseWithLanguage() throws IOException {
    // Whitespace tokenization leaves the one-word input as a single token.
    Tokenizer source = new WhitespaceTokenizer();
    source.setReader(new StringReader("Rimbault"));

    TokenFilterFactory frenchBeiderMorse = analysis.tokenFilter.get("beidermorsefilterfrench");

    // Expected output tokens, in the order the filter is expected to produce them.
    String[] encodings = {
        "rimbD", "rimbDlt", "rimba", "rimbalt", "rimbo", "rimbolt", "rimbu", "rimbult",
        "rmbD", "rmbDlt", "rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult"
    };
    BaseTokenStreamTestCase.assertTokenStreamContents(frenchBeiderMorse.create(source), encodings);
}
}

View File

@ -19,6 +19,10 @@ index:
beidermorsefilter:
type: phonetic
encoder: beidermorse
beidermorsefilterfrench:
type: phonetic
encoder: beidermorse
languageset : [ "french" ]
koelnerphonetikfilter:
type: phonetic
encoder: koelnerphonetik